From 70f9ace2f16309a0d5963c1784912d9547f493b3 Mon Sep 17 00:00:00 2001 From: Tom Bannink Date: Fri, 6 May 2016 11:35:24 +0200 Subject: [PATCH 01/17] Add debug module that parses the Epiphany ELF file for symbol information. --- Makefile | 3 +- include/host_bsp_private.h | 26 +++++++ src/host_bsp.c | 21 +++++- src/host_bsp_debug.c | 149 +++++++++++++++++++++++++++++++++++++ 4 files changed, 196 insertions(+), 3 deletions(-) create mode 100644 src/host_bsp_debug.c diff --git a/Makefile b/Makefile index 420f3c6..f52f72e 100644 --- a/Makefile +++ b/Makefile @@ -41,7 +41,8 @@ HOST_SRCS = \ host_bsp_memory.c \ host_bsp_buffer.c \ host_bsp_mp.c \ - host_bsp_utility.c + host_bsp_utility.c \ + host_bsp_debug.c #First include directory is only for cross-compiling INCLUDES = -I/usr/include/esdk \ diff --git a/include/host_bsp_private.h b/include/host_bsp_private.h index 988bed3..5fed59e 100644 --- a/include/host_bsp_private.h +++ b/include/host_bsp_private.h @@ -31,6 +31,18 @@ see the files COPYING and COPYING.LESSER. 
If not, see #define MAX_N_STREAMS 1000 +#ifdef DEBUG +typedef struct { + int index; + unsigned int value; // must be unsigned, addresses use the last bit + size_t size; + int type; // STT_FUNC, STT_OBJECT, etc + int bind; // STB_GLOBAL, STB_LOCAL + int section; // SHN_ABS, SHN_COMMON, SHN_UNDEF, or section index + char name[64]; +} Symbol; +#endif + /* * Global BSP state */ @@ -80,6 +92,11 @@ typedef struct { // Buffer ebsp_stream_descriptor buffered_streams[NPROCS][MAX_N_STREAMS]; +#ifdef DEBUG + Symbol* e_symbols; + int num_symbols; +#endif + } bsp_state_t; extern bsp_state_t state; @@ -141,3 +158,12 @@ void _update_remote_timer(); void _microsleep(int microseconds); void _get_p_coords(int pid, int* row, int* col); void init_application_path(); + +/* + * host_bsp_debug + */ +#ifdef DEBUG +void _read_elf(const char* filename); +Symbol* _get_symbol_by_addr(void* addr); +Symbol* _get_symbol_by_name(const char* symbol); +#endif diff --git a/src/host_bsp.c b/src/host_bsp.c index 44a9287..e9c32bc 100644 --- a/src/host_bsp.c +++ b/src/host_bsp.c @@ -25,6 +25,7 @@ see the files COPYING and COPYING.LESSER. 
If not, see #include #include #include +#include #include #define __USE_XOPEN2K @@ -52,6 +53,10 @@ int bsp_init(const char* _e_name, int argc, char** argv) { return 0; } +#ifdef DEBUG + _read_elf(state.e_fullpath); +#endif + // Initialize the Epiphany system for the working with the host application if (e_init(NULL) != E_OK) { fprintf(stderr, "ERROR: Could not initialize HAL data structures.\n"); @@ -325,9 +330,15 @@ int ebsp_spmd() { e_read(&state.dev, prow, pcol, E_REG_PC, &pc[i], sizeof(uint32_t)); } - printf(" PC for every core:"); + printf("Current instruction for every core:"); for (int i = 0; i < state.nprocs_used; i++) { - printf(" %p", (void*)pc[i]); + if ((i % 4) == 0) + printf("\n\t"); + Symbol* sym = _get_symbol_by_addr((void*)pc[i]); + if (sym) + printf(" %s+%p", sym->name, (void*)(pc[i] - sym->value)); + else + printf(" %p", (void*)pc[i]); } printf("\n"); @@ -396,6 +407,12 @@ int bsp_end() { return 0; } +#ifdef DEBUG + if (state.e_symbols) + free(state.e_symbols); + state.e_symbols = 0; +#endif + if (bsp_initialized >= 2) e_free(&state.emem); diff --git a/src/host_bsp_debug.c b/src/host_bsp_debug.c new file mode 100644 index 0000000..93f2c57 --- /dev/null +++ b/src/host_bsp_debug.c @@ -0,0 +1,149 @@ +/* +This file is part of the Epiphany BSP library. + +Copyright (C) 2014-2015 Buurlage Wits +Support e-mail: + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License (LGPL) +as published by the Free Software Foundation, either version 3 of the +License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License +and the GNU Lesser General Public License along with this program, +see the files COPYING and COPYING.LESSER. If not, see +. +*/ +#ifdef DEBUG +#include "host_bsp_private.h" + +#include +#include +#include +#include + +void _read_elf(const char* filename); +void _parse_elf(char* buffer, size_t fsize); +void _parse_symbols(char* buffer, size_t fsize, Elf32_Shdr* shdr, + size_t symtab_index); +Symbol* _get_symbol_by_addr(void* addr); +Symbol* _get_symbol_by_name(const char* symbol); + +void _read_elf(const char* filename) { + state.e_symbols = 0; + state.num_symbols = 0; + + FILE* file = fopen(filename, "r"); + + if (!file) { + fprintf(stderr, "ERROR: Could not open %s\n", filename); + return; + } + + size_t fsize; + fseek(file, 0L, SEEK_END); + fsize = ftell(file); + fseek(file, 0L, SEEK_SET); + + char* buffer = malloc(fsize); + size_t read = fread(buffer, 1, fsize, file); + + if (read < fsize) + fprintf(stderr, "ERROR: Could not read full file %s\n", filename); + else + _parse_elf(buffer, fsize); + + free(buffer); + fclose(file); + + return; +} + +#define EM_ADAPTEVA_EPIPHANY 0x1223 /* Adapteva's Epiphany architecture. 
*/ +int is_epiphany_exec_elf(Elf32_Ehdr* ehdr) { + return ehdr && memcmp(ehdr->e_ident, ELFMAG, SELFMAG) == 0 && + ehdr->e_ident[EI_CLASS] == ELFCLASS32 && ehdr->e_type == ET_EXEC && + ehdr->e_version == EV_CURRENT && + ehdr->e_machine == EM_ADAPTEVA_EPIPHANY; +} + +void _parse_elf(char* buffer, size_t fsize) { + Elf32_Ehdr* ehdr = (Elf32_Ehdr*)buffer; + Elf32_Shdr* shdr; + + if (fsize < sizeof(Elf32_Ehdr) || !is_epiphany_exec_elf(ehdr) || + (ehdr->e_shoff + ehdr->e_shnum * sizeof(Elf32_Shdr) > fsize)) { + fprintf(stderr, "ERROR: File is not an Epiphany executable."); + return; + } + + shdr = (Elf32_Shdr*)&buffer[ehdr->e_shoff]; + for (size_t i = 0; i < ehdr->e_shnum; i++) { + if (shdr[i].sh_type == SHT_SYMTAB) + _parse_symbols(buffer, fsize, shdr, i); + } + return; +} + +void _parse_symbols(char* buffer, size_t fsize, Elf32_Shdr* shdr, + size_t symtab_index) { + Elf32_Shdr* symtab = &shdr[symtab_index]; + + size_t count = symtab->sh_size / symtab->sh_entsize; + + const char* symstr = &buffer[shdr[symtab->sh_link].sh_offset]; + + Elf32_Sym* symbol = (Elf32_Sym*)&buffer[symtab->sh_offset]; + + // First count the number of symbols that we want to save + state.num_symbols = 0; + for (size_t i = 0; i < count; i++) { + if (ELF32_ST_BIND(symbol[i].st_info) == STB_GLOBAL && + symbol[i].st_shndx != SHN_ABS) { + state.num_symbols++; + } + } + + // Now save them in the array + state.e_symbols = (Symbol*)malloc(state.num_symbols * sizeof(Symbol)); + size_t j = 0; + for (size_t i = 0; i < count; i++) { + if (ELF32_ST_BIND(symbol[i].st_info) != STB_GLOBAL || + symbol[i].st_shndx == SHN_ABS) + continue; + Symbol* sym = &state.e_symbols[j++]; + sym->index = i; + sym->value = symbol[i].st_value; + sym->size = symbol[i].st_size; + sym->type = ELF32_ST_TYPE(symbol[i].st_info); + sym->bind = ELF32_ST_BIND(symbol[i].st_info); + sym->section = symbol[i].st_shndx; + memcpy(sym->name, &symstr[symbol[i].st_name], sizeof(sym->name)); + sym->name[sizeof(sym->name) - 1] = 0; + } +} + 
+Symbol* _get_symbol_by_addr(void* addr) { + for (size_t i = 0; i < state.num_symbols; i++) { + if (state.e_symbols[i].value <= ((unsigned int)addr) && + ((unsigned int)addr) < state.e_symbols[i].value + state.e_symbols[i].size) { + return &state.e_symbols[i]; + } + } + return 0; +} + +Symbol* _get_symbol_by_name(const char* symbol) { + for (size_t i = 0; i < state.num_symbols; i++) { + if (!strncmp(state.e_symbols[i].name, symbol, sizeof(state.e_symbols[i].name))) { + return &state.e_symbols[i]; + } + } + return 0; +} +#endif From 0ea6f81f52234263ffbc44c3c970812e9584f4c4 Mon Sep 17 00:00:00 2001 From: Tom Bannink Date: Sun, 15 May 2016 01:06:57 +0200 Subject: [PATCH 02/17] Make BSP var list distributed. Remove bsp_var_list, bsp_var_counter from ebsp_combuf. Remove var_pushed from coredata. Add bsp_var_list to coredata. Implement bsp_push_reg with new system. Implement bsp_pop_reg with new system. Additionally, multiple variables can be pushed within one sync and variables no longer have to be pushed and popped in order. 
--- CHANGELOG.md | 11 +++++++++++ include/e_bsp_private.h | 6 +++--- include/ebsp_common.h | 4 +--- src/e_bsp.c | 10 +++------- src/e_bsp_drma.c | 38 +++++++++++++++++------------------- test/bsp_hpput/e_bsp_hpput.c | 5 ++++- 6 files changed, 40 insertions(+), 34 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 729af3e..81bb49a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,16 @@ # Changelog +## 1.0.0-beta.3 + +### Added +- BSP variable list is stored distributed over all cores instead of in external memory +- Implement `bsp_pop_reg` + +### Fixed + +### Removed + + ## 1.0.0-beta.2 - 2016-04-21 ### Added diff --git a/include/e_bsp_private.h b/include/e_bsp_private.h index 6abe4e9..7c3199a 100644 --- a/include/e_bsp_private.h +++ b/include/e_bsp_private.h @@ -49,6 +49,9 @@ typedef struct { // time_passed is epiphany cpu time (so not walltime) in seconds float time_passed; + // BSP variable list + void* bsp_var_list[MAX_BSP_VARS]; + // counter for ebsp_combuf::data_requests[pid] uint32_t request_counter; @@ -63,9 +66,6 @@ typedef struct { volatile e_barrier_t sync_barrier[NPROCS]; volatile e_barrier_t* sync_barrier_tgt[NPROCS]; - // if this core has done a bsp_push_reg - int8_t var_pushed; - // Mutex is used for message_queue (send) and data_payloads (put) e_mutex_t payload_mutex; diff --git a/include/ebsp_common.h b/include/ebsp_common.h index ad2f010..9bcd5d0 100644 --- a/include/ebsp_common.h +++ b/include/ebsp_common.h @@ -31,7 +31,7 @@ see the files COPYING and COPYING.LESSER. 
If not, see // An address takes 4 bytes, and MAX_BSP_VARS is the maximum // amount of variables that can be registered so in total we need // NCORES * MAX_BSP_VARS * 4 bytes to save all this data -#define MAX_BSP_VARS 64 +#define MAX_BSP_VARS 20 // The maximum amount of buffered put/get operations each // core is allowed to do per sync step @@ -127,8 +127,6 @@ typedef struct { // int out_buffer_size[_NPROCS]; // Epiphany <--> Epiphany - void* bsp_var_list[MAX_BSP_VARS][NPROCS]; - uint32_t bsp_var_counter; ebsp_data_request data_requests[NPROCS][MAX_DATA_REQUESTS]; ebsp_message_queue message_queue[2]; ebsp_payload_buffer data_payloads; // used for put/get/send diff --git a/src/e_bsp.c b/src/e_bsp.c index 2e4c236..e3c069c 100644 --- a/src/e_bsp.c +++ b/src/e_bsp.c @@ -41,7 +41,6 @@ void EXT_MEM_TEXT bsp_begin() { coredata.pid = col + cols * row; coredata.nprocs = combuf->nprocs; coredata.request_counter = 0; - coredata.var_pushed = 0; coredata.tagsize = combuf->tagsize; coredata.tagsize_next = coredata.tagsize; coredata.read_queue_index = 0; @@ -54,6 +53,9 @@ void EXT_MEM_TEXT bsp_begin() { e_get_global_address(row, col, (void*)E_REG_DMA1STATUS); coredata.local_nstreams = combuf->n_streams[coredata.pid]; + for (int i = 0; i < MAX_BSP_VARS; i++) + coredata.bsp_var_list[i] = 0; + for (int s = 0; s < coredata.nprocs; s++) coredata.coreids[s] = (uint16_t)e_coreid_from_coords(s / cols, s % cols); @@ -180,12 +182,6 @@ void bsp_sync() { // xor seems to produce the shortest assembly coredata.read_queue_index ^= 1; - if (coredata.var_pushed) { - coredata.var_pushed = 0; - if (coredata.pid == 0) - combuf->bsp_var_counter++; - } - coredata.tagsize = coredata.tagsize_next; coredata.message_index = 0; diff --git a/src/e_bsp_drma.c b/src/e_bsp_drma.c index 7b28e4e..2de5793 100644 --- a/src/e_bsp_drma.c +++ b/src/e_bsp_drma.c @@ -23,18 +23,12 @@ see the files COPYING and COPYING.LESSER. 
If not, see #include "e_bsp_private.h" #include -const char err_pushreg_multiple[] EXT_MEM_RO = - "BSP ERROR: multiple bsp_push_reg calls within one sync"; - const char err_pushreg_overflow[] EXT_MEM_RO = "BSP ERROR: Trying to push more than MAX_BSP_VARS vars"; const char err_var_not_found[] EXT_MEM_RO = "BSP ERROR: could not find bsp var %p"; -const char err_pop_reg_not_implemented[] EXT_MEM_RO = - "BSP ERROR: Function bsp_pop_reg not implemented"; - const char err_get_overflow[] EXT_MEM_RO = "BSP ERROR: too many bsp_get requests per sync"; @@ -51,9 +45,14 @@ void* _get_remote_addr(int pid, const void* addr, int offset) { // Find the slot for our local pid // And return the entry for the remote pid including the epiphany mapping for (int slot = 0; slot < MAX_BSP_VARS; ++slot) { - if (combuf->bsp_var_list[slot][coredata.pid] == addr) { - // Address as registered by other core and as seen by other core - unsigned uptr = (unsigned)combuf->bsp_var_list[slot][pid] + offset; + if (coredata.bsp_var_list[slot] == addr) { + // Get the remote copy of the BSP var list + unsigned remote_var_list = (unsigned)&(coredata.bsp_var_list[slot]); + remote_var_list |= ((uint32_t)coredata.coreids[pid]) << 20; + // Read its value + unsigned remote_ptr = *(unsigned*)remote_var_list; + // Add the offset + unsigned uptr = remote_ptr + offset; // If it was global, then it is directly valid from here // If it was local, add the remote coreid in the highest 12 bits @@ -68,20 +67,19 @@ void* _get_remote_addr(int pid, const void* addr, int offset) { } void EXT_MEM_TEXT bsp_push_reg(const void* variable, const int nbytes) { - if (coredata.var_pushed) - return ebsp_message(err_pushreg_multiple); - - if (combuf->bsp_var_counter == MAX_BSP_VARS) - return ebsp_message(err_pushreg_overflow); - - combuf->bsp_var_list[combuf->bsp_var_counter][coredata.pid] = - (void*)variable; - - coredata.var_pushed = 1; + for (size_t i = 0; i < MAX_BSP_VARS; i++) { + if (coredata.bsp_var_list[i] == 0) { + 
coredata.bsp_var_list[i] = (void*)variable; + return; + } + } + return ebsp_message(err_pushreg_overflow); } void EXT_MEM_TEXT bsp_pop_reg(const void* variable) { - ebsp_message(err_pop_reg_not_implemented); + for (size_t i = 0; i < MAX_BSP_VARS; i++) + if (coredata.bsp_var_list[i] == variable) + coredata.bsp_var_list[i] = 0; return; } diff --git a/test/bsp_hpput/e_bsp_hpput.c b/test/bsp_hpput/e_bsp_hpput.c index b696489..08dab98 100644 --- a/test/bsp_hpput/e_bsp_hpput.c +++ b/test/bsp_hpput/e_bsp_hpput.c @@ -38,7 +38,10 @@ int main() { // Only core 2 will do both registrations in the same sync if (bsp_pid() == 2) bsp_sync(); - // expect: ($02: BSP ERROR: multiple bsp_push_reg calls within one sync) + + // This used to be a unit test, but in the new version it is allowed to + // register multiple variables within one sync + // noexpect: ($02: BSP ERROR: multiple bsp_push_reg calls within one sync) if (bsp_pid() == 1) { bsp_hpput(0, &var, &var, 0, sizeof(int)); From 081a425f6190085f040aa497b47b3c8b86e2eedf Mon Sep 17 00:00:00 2001 From: Tom Bannink Date: Sat, 14 May 2016 23:29:27 +0000 Subject: [PATCH 03/17] Cleanup bsp_begin code Remove the use of % and / in bsp_begin. Both of these functions take up about 700 Bytes in local memory! Remove some data initialization from bsp_begin since `coredata` is in the .bss section so it will be zero initialized anyways. Remove TRAP instruction from bsp_end, so that main can exit properly and call global deconstructors. 
--- CHANGELOG.md | 3 +++ src/e_bsp.c | 21 +++++++-------------- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 81bb49a..3274b94 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,9 @@ - Implement `bsp_pop_reg` ### Fixed +- `bsp_begin` no longer uses divide and modulus operator which take up large amounts of memory +- `bsp_begin` no longer initializes coredata to zero since this is already done in the loader +- `bsp_end` no longer executes TRAP so that `main` can finish properly ### Removed diff --git a/src/e_bsp.c b/src/e_bsp.c index e3c069c..66e8c30 100644 --- a/src/e_bsp.c +++ b/src/e_bsp.c @@ -36,29 +36,24 @@ void EXT_MEM_TEXT bsp_begin() { int row = e_group_config.core_row; int col = e_group_config.core_col; int cols = e_group_config.group_cols; + int rows = e_group_config.group_rows; - // Initialize local data + // Since coredata is in the .bss section it will automatically be filled + // with zeroes so no need to do that here. 
Only fill the nonzero elements coredata.pid = col + cols * row; coredata.nprocs = combuf->nprocs; - coredata.request_counter = 0; coredata.tagsize = combuf->tagsize; coredata.tagsize_next = coredata.tagsize; - coredata.read_queue_index = 0; - coredata.message_index = 0; - coredata.cur_dma_desc = NULL; - coredata.last_dma_desc = NULL; coredata.dma1config = e_get_global_address(row, col, (void*)E_REG_DMA1CONFIG); coredata.dma1status = e_get_global_address(row, col, (void*)E_REG_DMA1STATUS); coredata.local_nstreams = combuf->n_streams[coredata.pid]; - for (int i = 0; i < MAX_BSP_VARS; i++) - coredata.bsp_var_list[i] = 0; - - for (int s = 0; s < coredata.nprocs; s++) - coredata.coreids[s] = - (uint16_t)e_coreid_from_coords(s / cols, s % cols); + int s = 0; + for (int i = 0; i < rows; i++) + for (int j = 0; j < cols; j++) + coredata.coreids[s++] = (uint16_t)e_coreid_from_coords(i, j); // Initialize the barrier and mutexes e_barrier_init(coredata.sync_barrier, coredata.sync_barrier_tgt); @@ -134,8 +129,6 @@ void EXT_MEM_TEXT bsp_begin() { void bsp_end() { _write_syncstate(STATE_FINISH); - // Finish execution - __asm__("trap 3"); } int bsp_nprocs() { return coredata.nprocs; } From ccff822df972e6e5340237397707257554ac30c3 Mon Sep 17 00:00:00 2001 From: Tom Bannink Date: Sun, 15 May 2016 09:08:19 +0200 Subject: [PATCH 04/17] Fix size of syncstate writes by host Syncstate is 1 byte, but host wrote 4 bytes. This could potentially overwrite other things, but due to 4-byte alignment in `coredata` this was not the case. 
--- src/host_bsp_memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/host_bsp_memory.c b/src/host_bsp_memory.c index c37a9c3..510268d 100644 --- a/src/host_bsp_memory.c +++ b/src/host_bsp_memory.c @@ -67,7 +67,7 @@ int ebsp_read(int pid, off_t src, void* dst, int size) { } int _write_core_syncstate(int pid, int syncstate) { - return ebsp_write(pid, &syncstate, (off_t)state.combuf.syncstate_ptr, 4); + return ebsp_write(pid, &syncstate, (off_t)state.combuf.syncstate_ptr, 1); } int _write_extmem(void* src, off_t offset, int size) { From 7be26ec643d5fe688a817ef66814802a9324c768 Mon Sep 17 00:00:00 2001 From: Tom Bannink Date: Sun, 15 May 2016 13:40:11 +0200 Subject: [PATCH 05/17] Add `bsp_pop_reg` documentation --- include/e_bsp.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/include/e_bsp.h b/include/e_bsp.h index b755243..ee64b6d 100644 --- a/include/e_bsp.h +++ b/include/e_bsp.h @@ -202,9 +202,7 @@ void bsp_push_reg(const void* variable, const int nbytes); * previously registered with bsp_push_reg() * * The operation takes effect after the next call to bsp_sync(). - * @remarks In the current implementation, this function does - * nothing. In a future update this function will free up variable - * spots for new registrations. + * The order in which the variables are popped does not matter. 
 */ void bsp_pop_reg(const void* variable); From c2a3333dd8b49ce167e245c7fc625b7d1903a427 Mon Sep 17 00:00:00 2001 From: Tom Bannink Date: Tue, 7 Jun 2016 18:29:56 +0200 Subject: [PATCH 06/17] Add new stream API combining up and down streams --- include/e_bsp.h | 123 +++++++++++++++++++++++++ include/host_bsp.h | 25 ++++++ src/e_bsp.c | 3 + src/e_bsp_buffer.c | 204 +++++++++++++++++++++++++++++++++++++++++- src/host_bsp_buffer.c | 68 +++++++++++++- 5 files changed, 420 insertions(+), 3 deletions(-) diff --git a/include/e_bsp.h b/include/e_bsp.h index ee64b6d..76ba8b7 100644 --- a/include/e_bsp.h +++ b/include/e_bsp.h @@ -546,6 +546,129 @@ void ebsp_close_down_stream(unsigned stream_id); */ void ebsp_set_up_chunk_size(unsigned stream_id, int nbytes); +// +// New streaming API starting here +// + +/** + * Open a stream that was created using `ebsp_stream_create` on the host. + * + * @param stream_id The identifier of the stream, for this processor. + * @return Maximal token size. + * + * The i'th stream designated for the calling core will have `stream_id` i. + * + * @remarks This function has to be called *before* performing any other + * operation on the stream. + * @remarks A call to the function should always match a single call to + * `ebsp_stream_close`. + */ +int ebsp_stream_open(int stream_id); + +/** + * Wait for pending transfers to complete and close a stream. + * + * @param stream_id The identifier of the stream + * + * Cleans up the stream, and frees any buffers that may have been used by the + * stream. + */ +void ebsp_stream_close(int stream_id); + +/** + * Move the cursor in the stream, to change the next token to be obtained. + * + * @param stream_id The identifier of the stream + * @param delta_tokens The number of tokens to skip if `delta_tokens > 0`, + * or to go back if `delta_tokens < 0`. + * + * If `delta_tokens` is out of bounds, then the cursor will be moved to + * the start or end of the stream respectively. 
+ * `ebsp_stream_seek(i, INT_MIN)` will set the cursor to the start + * `ebsp_stream_seek(i, INT_MAX)` will set the cursor to the end of the stream + * + * Note that if `ebsp_stream_move_down` is used with `preload` enabled + * (meaning the last call to that function had `preload` enabled), + * then the preloaded token will not be changed, so the first call to + * `ebsp_stream_move_down` after this will still yield a token from the + * previous position. + * If `preload` was not enabled then the next call to `ebsp_stream_move_down` + * will yield a token from the new position. + * + * @remarks This function provides a mechanism through which chunks can be + * obtained multiple times. It gives you random access in the memory in + * the data stream. + * @remarks This function has `O(delta_tokens)` complexity. + */ +void ebsp_stream_seek(int stream_id, int delta_tokens); + +/** + * Obtain the next token from a stream. + * + * @param stream_id The identifier of the stream + * @param buffer Receives a pointer to a local copy of the next token. + * @param preload If this parameter is nonzero then the BSP system will + * preload the next token asynchronously (double buffering). + * @return Number of bytes of the obtained chunk. If stream has + * finished or an error has occurred this function will return `0`. + * + * @remarks Behaviour is undefined if the stream was not opened using + * `ebsp_stream_open`. + * @remarks Memory is transferred using the `DMA1` engine. + * @remarks When using double buffering, the BSP system will allocate memory + * for the next chunk, and will start writing to it using the DMA engine + * while the current chunk is processed. This requires more (local) memory, + * but can greatly increase the overall speed. + */ +int ebsp_stream_move_down(int stream_id, void** buffer, int preload); + +/** + * Write a local token up to a stream. 
+ * + * @param stream_id The identifier of the stream + * @param data The data to be sent up the stream + * @param data_size The size of the data to be sent, i.e. the size of the token. + * Behaviour is undefined if it is not a multiple of 8. + * If it is not a multiple of 8 bytes then transfers will be slow. + * @param wait_for_completion If nonzero this function blocks until + * the data is completely written to the stream. + * @return Number of bytes written. Zero if an error has occurred. + * + * The function *always* waits for the previous token to have finished. + * + * If `wait_for_completion` is nonzero, this function will wait until + * the data is transferred. This corresponds to single buffering. + * + * Alternatively, double buffering can be used as follows. + * Set `wait_for_completion` to zero and continue constructing the next token + * in a different buffer. Usage example: + * \code{.c} + * int* buf1 = ebsp_malloc(100 * sizeof(int)); + * int* buf2 = ebsp_malloc(100 * sizeof(int)); + * int* curbuf = buf1; + * int* otherbuf = buf2; + * + * ebsp_stream_open(0); // open stream 0 + * while (...) { + * // Fill curbuf + * for (int i = 0; i < 100; i++) + * curbuf[i] = 5; + * + * // Send up + * ebsp_stream_move_up(0, curbuf, 100 * sizeof(int), 0); + * // Use other buffer + * swap(curbuf, otherbuf); + * } + * ebsp_free(buf1); + * ebsp_free(buf2); + * \endcode + * + * @remarks Behaviour is undefined if the stream was not opened using + * `ebsp_stream_open`. + * @remarks Memory is transferred using the `DMA1` engine. + */ +int ebsp_stream_move_up(int stream_id, const void* data, int data_size, int wait_for_completion); + /** * Allocate external memory. 
* @param nbytes The size of the memory block diff --git a/include/host_bsp.h b/include/host_bsp.h index d151b1d..0aa1d29 100644 --- a/include/host_bsp.h +++ b/include/host_bsp.h @@ -309,3 +309,28 @@ void ebsp_create_down_stream(const void* src, int dst_core_id, int nbytes, * This function outputs an error if `chunksize` is less than 16. */ void* ebsp_create_up_stream(int dst_core_id, int max_nbytes, int chunksize); + +/** + * Creates a generic stream for streaming data to or from an Epiphany core. + * + * @param processor_id The processor identifier of the core. + * @param stream_size The total number of bytes of data in the stream. + * @param token_size The size in bytes of a single token. Must be at least 16. + * @param initial_data (Optional) The data which should be streamed to an + * Epiphany core. + * @return A pointer to a section of external memory storing the tokens. + * + * If `initial_data` is non-zero, it is copied to the stream (`stream_size` + * bytes). + * If `initial_data` is zero, an empty stream of size `stream_size` is created. + * In this case, `stream_size` should be the maximum number of bytes that will + * be sent up from the Epiphany cores to the host. + * + * This function outputs an error if `token_size` is less than 16. + * + * @remarks If `initial_data` is nonzero, the data is copied so that after the + * call it can safely be freed or overwritten by the user. 
+ */ +void* ebsp_stream_create(int processor_id, int stream_size, int token_size, + const void* initial_data); + diff --git a/src/e_bsp.c b/src/e_bsp.c index 66e8c30..f1a8aea 100644 --- a/src/e_bsp.c +++ b/src/e_bsp.c @@ -92,6 +92,9 @@ void EXT_MEM_TEXT bsp_begin() { _init_local_malloc(); // Copy stream descriptors to local memory + // TODO: do this only when the stream is opened + // and send them back when closed so that streams + // can change owner unsigned int nbytes = combuf->n_streams[coredata.pid] * sizeof(ebsp_stream_descriptor); coredata.local_streams = ebsp_malloc(nbytes); diff --git a/src/e_bsp_buffer.c b/src/e_bsp_buffer.c index 61e9f37..2c2866b 100644 --- a/src/e_bsp_buffer.c +++ b/src/e_bsp_buffer.c @@ -21,7 +21,7 @@ see the files COPYING and COPYING.LESSER. If not, see */ #include "e_bsp_private.h" -#include +#include const char err_no_such_stream[] EXT_MEM_RO = "BSP ERROR: stream does not exist"; @@ -161,7 +161,7 @@ int ebsp_move_chunk_up(void** address, unsigned stream_id, int prealloc) { stream->next_buffer = tmp; stream->cursor += chunk_size; // move pointer in extmem - } else // no prealloc + } else // no prealloc { if (stream->next_buffer != NULL) { ebsp_free(stream->next_buffer); @@ -406,3 +406,203 @@ void ebsp_move_down_cursor(int stream_id, int jump_n_chunks) { } } } + +// +// New streaming API starting here +// + +// When stream headers are interleaved, they are saved as: +// +// 00000000, nextsize, data, +// prevsize, nextsize, data, +// ... +// prevsize, nextsize, data, +// prevsize, 00000000 +// +// So a header consists of two integers (8 byte total). +// The two sizes do NOT include these headers. +// They are only the size of the data inbetween. +// The local copies of the data include these 8 bytes. 
+ +int ebsp_stream_open(int stream_id) { + if (stream_id >= coredata.local_nstreams) { + ebsp_message(err_no_such_stream); + return 0; + } + ebsp_stream_descriptor* stream = &coredata.local_streams[stream_id]; + + // Go to start + stream->cursor = stream->extmem_addr; + + return stream->max_chunksize; +} + +void ebsp_stream_close(int stream_id) { + if (stream_id >= coredata.local_nstreams) { + ebsp_message(err_no_such_stream); + return; + } + ebsp_stream_descriptor* stream = &coredata.local_streams[stream_id]; + + // Wait for any data transfer to finish before closing + ebsp_dma_wait(&stream->e_dma_desc); + + if (stream->current_buffer != NULL) { + ebsp_free(stream->current_buffer); + stream->current_buffer = NULL; + } + if (stream->next_buffer != NULL) { + ebsp_free(stream->next_buffer); + stream->next_buffer = NULL; + } +} + +void ebsp_stream_seek(int stream_id, int delta_tokens) { + if (stream_id >= coredata.local_nstreams) { + ebsp_message(err_no_such_stream); + return; + } + ebsp_stream_descriptor* stream = &coredata.local_streams[stream_id]; + + if (delta_tokens >= 0) { // forward + while (delta_tokens--) { + // read 2nd int (next size) in header + int chunk_size = *(int*)(stream->cursor + sizeof(int)); + if (chunk_size == 0) + return; + stream->cursor += 2 * sizeof(int) + chunk_size; + } + } else { // backward + if (delta_tokens == INT_MIN) { + stream->cursor = stream->extmem_addr; + } + + while (delta_tokens++) { + // read 1st int (prev size) in header + int chunk_size = *(int*)(stream->cursor); + if (chunk_size == 0) + return; + stream->cursor -= 2 * sizeof(int) + chunk_size; + } + } +} + +int ebsp_stream_move_down(int stream_id, void** buffer, int preload) { + if (stream_id >= coredata.local_nstreams) { + ebsp_message(err_no_such_stream); + return 0; + } + + ebsp_stream_descriptor* stream = &coredata.local_streams[stream_id]; + + if (stream->current_buffer == NULL) { + stream->current_buffer = + ebsp_malloc(stream->max_chunksize + 2 * sizeof(int)); + if 
(stream->current_buffer == NULL) { + ebsp_message(err_out_of_memory); + return 0; + } + } + + // At this point in the code: + // current_buffer contains data from previous token, + // which has been given to the user last time (zero at first time). + // This means current_buffer can be overwritten now + // The new token is: + // - locally available already in next_buffer (preload) + // - not here yet (no preload) + + if (stream->next_buffer == NULL) { + // Data not here yet (did not preload last time) + // Overwrite current buffer. + _ebsp_write_chunk(stream, stream->current_buffer); + } else { + // Data is locally available already in next_buffer (preload). + // Swap buffers. + void* tmp = stream->current_buffer; + stream->current_buffer = stream->next_buffer; + stream->next_buffer = tmp; + } + + // Wait for the dma we just started (if) + // Or the one from previous preload (else) + ebsp_dma_wait(&(stream->e_dma_desc)); + + // At this point in the code: + // current_buffer contains data from the current token, + // which we should now give to the user. 
+ + // *buffer must point after the header + (*buffer) = (void*)((unsigned)stream->current_buffer + 2 * sizeof(int)); + + int* header = (int*)(stream->current_buffer); + int current_chunk_size = header[1]; + + // Check for end-of-stream + if (current_chunk_size == 0) { + (*buffer) = NULL; + return 0; + } + + if (preload) { + if (stream->next_buffer == NULL) { + // no next buffer available, malloc it + stream->next_buffer = + ebsp_malloc(stream->max_chunksize + 2 * sizeof(int)); + if (stream->next_buffer == NULL) { + ebsp_message(err_out_of_memory); + return 0; + } + } + _ebsp_write_chunk(stream, stream->next_buffer); + } else { + // free malloced next buffer + if (stream->next_buffer != NULL) { + ebsp_free(stream->next_buffer); + stream->next_buffer = NULL; + } + } + + // At this point: next_buffer should point to data of NEXT token + + return current_chunk_size; +} + +int ebsp_stream_move_up(int stream_id, const void* data, int data_size, + int wait_for_completion) { + if (stream_id >= coredata.local_nstreams) { + ebsp_message(err_no_such_stream); + return 0; + } + + ebsp_stream_descriptor* stream = &coredata.local_streams[stream_id]; + + ebsp_dma_handle* desc = &stream->e_dma_desc; + + // Wait for any previous transfer to finish (either down or up) + ebsp_dma_wait(desc); + + // Round data_size up to a multiple of 8 + // If this is not done, integer access to the headers will crash + data_size = ((data_size + 8 - 1) / 8) * 8; + + // First write both the header before and after this token. 
+ int* header1 = (int*)(stream->cursor); + int* header2 = (int*)(stream->cursor + 2 * sizeof(int) + data_size); + // header1[0] is filled at the previous iteration + header1[1] = data_size; + header2[0] = data_size; + header2[1] = 0; // terminating 0 + + stream->cursor += 2 * sizeof(int); + + // Now write the data to extmem (async) + ebsp_dma_push(desc, (void*)(stream->cursor), data, data_size); // start dma + stream->cursor += data_size; // move pointer in extmem + + if (wait_for_completion) + ebsp_dma_wait(desc); + + return data_size; +} + diff --git a/src/host_bsp_buffer.c b/src/host_bsp_buffer.c index 197d621..98e6233 100644 --- a/src/host_bsp_buffer.c +++ b/src/host_bsp_buffer.c @@ -25,8 +25,8 @@ see the files COPYING and COPYING.LESSER. If not, see #include #include -extern bsp_state_t state; +extern bsp_state_t state; #define MINIMUM_CHUNK_SIZE (4 * sizeof(int)) void ebsp_create_down_stream(const void* src, int dst_core_id, int nbytes, @@ -121,6 +121,72 @@ void* ebsp_create_up_stream(int src_core_id, int nbytes, int max_chunksize) { return extmem_out_buffer; } +void* ebsp_stream_create(int processor_id, int stream_size, int token_size, + const void* initial_data) { + if (token_size < MINIMUM_CHUNK_SIZE) { + printf("ERROR: minimum token size is %i bytes\n", MINIMUM_CHUNK_SIZE); + return 0; + } + + // Amount of tokens, rounded up + int ntokens = (stream_size + token_size - 1) / token_size; + + int nbytes_including_headers = + stream_size + ntokens * 2 * sizeof(int) + + 2 * sizeof(int); // the +2*sizeof(int) is the terminating header + // headers consist of 2 ints: prev size and next size + + // 1) malloc in extmem + void* extmem_buffer = ebsp_ext_malloc(nbytes_including_headers); + if (extmem_buffer == 0) { + printf("ERROR: not enough memory in extmem for ebsp_stream_create\n"); + return 0; + } + + // 2) copy the data to extmem, inserting headers + unsigned dst_cursor = (unsigned)extmem_buffer; + unsigned src_cursor = (unsigned)initial_data; + + if 
(initial_data) { + int current_chunksize = token_size; + int last_chunksize = 0; + for (int nbytes_left = stream_size; nbytes_left > 0; + nbytes_left -= token_size) { + if (nbytes_left < token_size) + current_chunksize = nbytes_left; + + (*(int*)dst_cursor) = last_chunksize; // write prev header + dst_cursor += sizeof(int); + (*(int*)dst_cursor) = current_chunksize; // write next header + dst_cursor += sizeof(int); + + memcpy((void*)dst_cursor, (void*)src_cursor, current_chunksize); + + dst_cursor += current_chunksize; + src_cursor += current_chunksize; + + last_chunksize = current_chunksize; + } + // Write a terminating header + (*(int*)dst_cursor) = current_chunksize; // write terminating header (prev) + dst_cursor += sizeof(int); + (*(int*)dst_cursor) = 0; // write terminating header (next) + dst_cursor += sizeof(int); + } else { + // Write a terminating header, or upstreams will crash + (*(int*)dst_cursor) = 0; // prevsize + dst_cursor += sizeof(int); + (*(int*)dst_cursor) = 0; // nextsize + dst_cursor += sizeof(int); + } + + // 3) add stream to state + _ebsp_add_stream(processor_id, extmem_buffer, nbytes_including_headers, + token_size, 0); + + return extmem_buffer; +} + // add ebsp_stream_descriptor to state.buffered_streams, update state.n_streams void _ebsp_add_stream(int core_id, void* extmem_buffer, int nbytes, int max_chunksize, int is_down_stream) { From 87178803a4c189d7c20774a2142dcf25b66f02fd Mon Sep 17 00:00:00 2001 From: Tom Bannink Date: Tue, 7 Jun 2016 18:54:49 +0200 Subject: [PATCH 07/17] Add unit test for new stream API --- src/e_bsp_buffer.c | 8 ++--- test/bsp_streams/e_bsp_streams.c | 56 +++++++++++++++++++++++++++++ test/bsp_streams/host_bsp_streams.c | 29 +++++++++++++++ 3 files changed, 89 insertions(+), 4 deletions(-) diff --git a/src/e_bsp_buffer.c b/src/e_bsp_buffer.c index 2c2866b..ee5e306 100644 --- a/src/e_bsp_buffer.c +++ b/src/e_bsp_buffer.c @@ -504,6 +504,9 @@ int ebsp_stream_move_down(int stream_id, void** buffer, int preload) { } 
} + // Wait for any previous transfer to finish (either down or up) + ebsp_dma_wait(&(stream->e_dma_desc)); + // At this point in the code: // current_buffer contains data from previous token, // which has been given to the user last time (zero at first time). @@ -516,6 +519,7 @@ int ebsp_stream_move_down(int stream_id, void** buffer, int preload) { // Data not here yet (did not preload last time) // Overwrite current buffer. _ebsp_write_chunk(stream, stream->current_buffer); + ebsp_dma_wait(&(stream->e_dma_desc)); } else { // Data is locally available already in next_buffer (preload). // Swap buffers. @@ -524,10 +528,6 @@ int ebsp_stream_move_down(int stream_id, void** buffer, int preload) { stream->next_buffer = tmp; } - // Wait for the dma we just started (if) - // Or the one from previous preload (else) - ebsp_dma_wait(&(stream->e_dma_desc)); - // At this point in the code: // current_buffer contains data from the current token, // which we should now give to the user. diff --git a/test/bsp_streams/e_bsp_streams.c b/test/bsp_streams/e_bsp_streams.c index 2bec5dd..fb4db89 100644 --- a/test/bsp_streams/e_bsp_streams.c +++ b/test/bsp_streams/e_bsp_streams.c @@ -21,6 +21,7 @@ see the files COPYING and COPYING.LESSER. 
If not, see */ #include +#include #include "../common.h" int main() { @@ -28,6 +29,8 @@ int main() { int s = bsp_pid(); + // Old streaming API + int* upstream = 0; int* upstreamDouble = 0; int chunk_size = ebsp_open_up_stream((void**)&upstream, 0); @@ -92,6 +95,59 @@ int main() { ebsp_close_up_stream(5); // expect: ($00: BSP ERROR: stream does not exist) + // New streaming API + int tokensize = ebsp_stream_open(5); + int tokensize2 = ebsp_stream_open(6); + + if (tokensize != tokensize2) + ebsp_message("Invalid token size at ebsp_stream_open"); + + // Double buffered upstream + int* up1 = ebsp_malloc(tokensize); + int* up2 = ebsp_malloc(tokensize); + + // First stream down from 6 and copy it into 5 + for (;;) { + int* buffer; + int size = ebsp_stream_move_down(6, (void**)&buffer, 1); + if (size == 0) + break; + + for (int j = 0; j < tokensize / sizeof(int); ++j) + up1[j] = buffer[j]; + + ebsp_stream_move_up(5, up1, size, 0); + // swap buffers + int* tmp = up1; + up1 = up2; + up2 = tmp; + } + + // Now stream down from 5, double the values, and copy it into 6 + ebsp_stream_seek(5, INT_MIN); // go back to start + ebsp_stream_seek(6, INT_MIN); // go back to start + for (;;) { + int* buffer; + int size = ebsp_stream_move_down(5, (void**)&buffer, 1); + if (size == 0) + break; + + for (int j = 0; j < tokensize / sizeof(int); ++j) + up1[j] = 2 * buffer[j]; + + ebsp_stream_move_up(6, up1, size, 0); + // swap buffers + int* tmp = up1; + up1 = up2; + up2 = tmp; + } + + ebsp_stream_close(5); + ebsp_stream_close(6); + + ebsp_free(up1); + ebsp_free(up2); + bsp_end(); return 0; diff --git a/test/bsp_streams/host_bsp_streams.c b/test/bsp_streams/host_bsp_streams.c index 9f6fd02..2496d0a 100644 --- a/test/bsp_streams/host_bsp_streams.c +++ b/test/bsp_streams/host_bsp_streams.c @@ -45,6 +45,7 @@ int main(int argc, char** argv) { c++; } + // Old streaming API for (int s = 0; s < bsp_nprocs(); ++s) { upstreams[s] = (int*)ebsp_create_up_stream(s, chunks * chunk_size, chunk_size); @@ 
-56,8 +57,21 @@ int main(int argc, char** argv) { chunk_size); } + // New streaming API + // Create two streams for each core. An empty one and a filled one. + int** streams1 = malloc(sizeof(int*) * bsp_nprocs()); + int** streams2 = malloc(sizeof(int*) * bsp_nprocs()); + + for (int s = 0; s < bsp_nprocs(); ++s) { + streams1[s] = ebsp_stream_create(s, chunks * chunk_size, chunk_size, 0); + streams2[s] = + ebsp_stream_create(s, chunks * chunk_size, chunk_size, downdata); + } + ebsp_spmd(); + // results of old API + for (int i = 0; i < chunk_size * chunks / sizeof(int); ++i) { printf("%i ", upstreams[5][i]); } @@ -69,6 +83,21 @@ int main(int argc, char** argv) { } // expect: (30 28 26 24 22 20 18 16 14 12 10 8 6 4 2 0 ) + // results of new API + + for (int i = 0; i < chunk_size * chunks / sizeof(int); ++i) { + printf("%i ", streams1[5][i]); + } + printf("\n"); + // expect: (0 1 2 3 11 10 9 8 8 9 10 11 3 2 1 0 ) + + // Check the data in the DOWN stream. It should have been used + // as an upstream as well. 
+ for (int i = 0; i < chunk_size * chunks / sizeof(int); ++i) { + printf("%i ", streams2[5][i]); + } + // expect: (30 28 26 24 22 20 18 16 14 12 10 8 6 4 2 0 ) + // finalize bsp_end(); From 67cbc5b41a0a6e728b094bcf2b5cb4cbbc09a221 Mon Sep 17 00:00:00 2001 From: Tom Bannink Date: Wed, 18 Jan 2017 13:45:14 +0100 Subject: [PATCH 08/17] Add new stream API to headers --- include/e_bsp.h | 194 ++++++++----------------------------- include/ebsp_common.h | 7 +- include/host_bsp.h | 48 +++------ include/host_bsp_private.h | 2 +- src/e_bsp.c | 10 -- src/e_bsp_buffer.c | 4 +- src/host_bsp_buffer.c | 126 +++--------------------- 7 files changed, 69 insertions(+), 322 deletions(-) diff --git a/include/e_bsp.h b/include/e_bsp.h index 76ba8b7..6d666eb 100644 --- a/include/e_bsp.h +++ b/include/e_bsp.h @@ -417,182 +417,63 @@ int bsp_hpmove(void** tag_ptr_buf, void** payload_ptr_buf); void ebsp_send_up(const void* tag, const void* payload, int nbytes); /** - * Obtain the next chunk of data from a stream. + * Open a stream that was created using `bsp_stream_create` on the host. * - * @param address A pointer to a value that is overwritten with the local - * memory location of the data chunk - * @param stream_id The identifier of the stream - * @param prealloc If this parameter is equal to `1` then the BSP system will - * use double buffering, if it is `0` then single buffering is used. - * @return Number of bytes of the obtained chunk. If stream has - * finished or an error has occurred this function will return `0`. - * - * @remarks Memory is transferred using the `DMA1` engine. - * @remarks When using double buffering, the BSP system will allocate memory - * for the next chunk, and will start writing to it using the DMA engine - * while the current chunk is processed. This requires more (local) memory, - * but can greatly increase the overall speed. - */ -int ebsp_move_chunk_down(void** address, unsigned stream_id, int prealloc); - -/** - * Move a chunk of data up from a stream. 
- - * @param address A pointer to a value that is overwritten with the local - * memory location where the next chunk should be written to. - * @param stream_id The identifier of the stream - * @param prealloc If this parameter is equal to `1` then the BSP system will - * use double buffering, if it is `0` then single buffering is used. - * @return Number of bytes allocated for the next chunk of this stream. if - * stream has finished or an error has occurred this function will return `0`. - * - * @remarks Memory is transferred using the `DMA1` engine. - * @remarks When using the double buffering mode, `*address` will contain the - * location of a new chunk of memory, such that the BSP program can continue - * while the current chunk is being copied using the DMA engine. This requires - * more local memory, but can improve the performance of the program. - */ -int ebsp_move_chunk_up(void** address, unsigned stream_id, int prealloc); - -/** - * Move the cursor pointing to the next chunk to be obtained from a stream. - * - * @param stream_id The identifier of the stream - * @param jump_n_chunks The number of chunks to skip if `jump_n_chunks > 0`, - * or to go back if `jump_n_chunks < 0`. - * - * @remarks Internally a stream is a collection of data, along with specific - * information for a stream. For example, a stream holds a pointer to the - * next chunk that should be written to a core. Using this function you can - * change which chunk should be the `next` chunk. This allows you to obtain - * chunks multiple times, or to skip chunks completely. - * @remarks This function provides a mechanism through which chunks can be - * obtained multiple times. It gives you random access in the memory in - * the data stream. - * @remarks This function has `O(jump_n_chunks)` complexity. - */ -void ebsp_move_down_cursor(int stream_id, int jump_n_chunks); - -/** - * Resets the cursor pointing to the next chunk to be obtained from a stream. 
- * - * @param stream_id The identifier of the stream - * - * After calling this function the *next chunk* that is obtained from this - * stream is equal to the first chunk. - * - * @remarks This function has `O(1)` complexity. - */ -void ebsp_reset_down_cursor(int stream_id); - -/** - * Open an up stream. + * @param stream Pointer to an existing `ebsp_stream` struct to hold the stream + * data. This struct can be allocated on the stack by the user. + * @param stream_id The index of the stream. + * @return Nonzero if successful. * - * @param address Pointer to a variable that will be overwritten with the - * location where the data should be written for the first chunk that will - * be sent up. - * @param stream_id The identifier of the stream - * @return Number of bytes that can be written to the first chunk to be sent up. + * The first stream created by the host will have `stream_id` 0. * - * @remarks This function has to be called *before* performing any other operation - * on the stream. - * @remarks A call to the function should always match a single call to - * `ebsp_close_up_stream`. - */ -int ebsp_open_up_stream(void** address, unsigned stream_id); - -/** - * Close an up stream. - * - * @param stream_id The identifier of the stream - * - * Cleans up the stream, and frees any buffers that may have been used by the - * stream. - */ -void ebsp_close_up_stream(unsigned stream_id); - -/** - * Open a down stream. - * - * @param address Pointer to a variable that will be overwritten with the - * location where the data should be written for the first chunk that will - * be sent up. - * @param stream_id The identifier of the stream - * @return The size of the first chunk of this stream in bytes. - * - * @remarks This function has to be called *before* performing any other operation - * on the stream. - * @remarks A call to the function should always match a single call to - * `ebsp_close_down_stream`.
- */ -int ebsp_open_down_stream(void** address, unsigned stream_id); - -/** - * Close a down stream. - * - * @param stream_id The identifier of the stream - * - * Cleans up the stream, and frees any buffers that may have been used by the - * stream. - */ -void ebsp_close_down_stream(unsigned stream_id); - -/** - * Set the number of bytes of the current chunk in the up stream. - * - * @param stream_id The identifier of the stream - * @param nbytes The number of bytes that should be moved from the current - * chunk of data in the next call to `ebsp_move_chunk_up`. - */ -void ebsp_set_up_chunk_size(unsigned stream_id, int nbytes); - -// -// New streaming API starting here -// - -/** - * Open a stream that was created using `ebsp_stream_create` on the host. - * - * @param stream_id The identifier of the stream, for this processor. - * @return Maximal token size. - * - * The i'th stream designated for the calling core will have `stream_id` i. + * Usage example: + * \code{.c} + * ebsp_stream mystream; + * if( bsp_stream_open(&mystream, 3) ) { + * // Get some data + * void* buffer = 0; + * bsp_stream_move_down(&mystream, &buffer, 0); + * // The data is now in buffer + * // Finally, close the stream + * bsp_stream_close(&mystream); + * } + * \endcode * * @remarks This function has to be called *before* performing any other * operation on the stream. * @remarks A call to the function should always match a single call to - * `ebsp_close_up_stream`. + * `bsp_stream_close`. */ -int ebsp_stream_open(int stream_id); +int bsp_stream_open(ebsp_stream* stream, int stream_id); /** * Wait for pending transfers to complete and close a stream. * - * @param stream_id The identifier of the stream + * @param stream The handle of the stream * * Cleans up the stream, and frees any buffers that may have been used by the * stream.
*/ -void ebsp_stream_close(int stream_id); +void bsp_stream_close(ebsp_stream* stream); /** * Move the cursor in the stream, to change the next token to be obtained. * - * @param stream_id The identifier of the stream + * @param stream The handle of the stream * @param delta_tokens The number of tokens to skip if `delta_tokens > 0`, * or to go back if `delta_tokens < 0`. * * If `delta_tokens` is out of bounds, then the cursor will be moved to * the start or end of the stream respectively. - * `ebsp_stream_seek(i, INT_MIN)` will set the cursor to the start - * `ebsp_stream_seek(i, INT_MAX)` will set the cursor to the end of the stream + * `bsp_stream_seek(i, INT_MIN)` will set the cursor to the start + * `bsp_stream_seek(i, INT_MAX)` will set the cursor to the end of the stream * - * Note that if `ebsp_stream_move_down` is used with `preload` enabled + * Note that if `bsp_stream_move_down` is used with `preload` enabled * (meaning the last call to that function had `preload` enabled), * then the preloaded token will not be changed, so the first call to - * `ebsp_stream_move_down` after this will still yield a token from the + * `bsp_stream_move_down` after this will still yield a token from the * previous position. - * If `preload` was not enabled then the next call to `ebsp_stream_move_down` + * If `preload` was not enabled then the next call to `bsp_stream_move_down` * will yield a token from the new position. * * @remarks This function provides a mechanism through which chunks can be @@ -600,12 +481,12 @@ void ebsp_stream_close(int stream_id); * the data stream. * @remarks This function has `O(delta_tokens)` complexity. */ -void ebsp_stream_seek(int stream_id, int delta_tokens); +void bsp_stream_seek(ebsp_stream* stream, int delta_tokens); /** * Obtain the next token from a stream. * - * @param stream_id The identifier of the stream + * @param stream The handle of the stream * @param buffer Receives a pointer to a local copy of the next token. 
* @param preload If this parameter is nonzero then the BSP system will * preload the next token asynchroneously (double buffering). @@ -613,19 +494,19 @@ void ebsp_stream_seek(int stream_id, int delta_tokens); * finished or an error has occurred this function will return `0`. * * @remarks Behaviour is undefined if the stream was not opened using - * `ebsp_stream_open`. + * `bsp_stream_open`. * @remarks Memory is transferred using the `DMA1` engine. * @remarks When using double buffering, the BSP system will allocate memory * for the next chunk, and will start writing to it using the DMA engine * while the current chunk is processed. This requires more (local) memory, * but can greatly increase the overall speed. */ -int ebsp_stream_move_down(int stream_id, void** buffer, int preload); +int bsp_stream_move_down(ebsp_stream* stream, void** buffer, int preload); /** * Write a local token up to a stream. * - * @param stream_id The identifier of the stream + * @param stream The handle of the stream * @param data The data to be sent up the stream * @param data_size The size of the data to be sent, i.e. the size of the token. * Behaviour is undefined if it is not a multiple of 8. @@ -648,14 +529,15 @@ int ebsp_stream_move_down(int stream_id, void** buffer, int preload); * int* curbuf = buf1; * int* otherbuf = buf2; * - * ebsp_stream_open(0); // open stream 0 + * ebsp_stream s; + * bsp_stream_open(&s, 0); // open stream 0 * while (...) { * // Fill curbuf * for (int i = 0; i < 100; i++) * curbuf[i] = 5; * * // Send up - * ebsp_stream_move_up(0, curbuf, 100 * sizeof(int), 0); + * bsp_stream_move_up(&s, curbuf, 100 * sizeof(int), 0); * // Use other buffer * swap(curbuf, otherbuf); * } @@ -664,10 +546,10 @@ int ebsp_stream_move_down(int stream_id, void** buffer, int preload); * \endcode * * @remarks Behaviour is undefined if the stream was not opened using - * `ebsp_stream_open`. + * `bsp_stream_open`. * @remarks Memory is transferred using the `DMA1` engine. 
*/ -int ebsp_stream_move_up(int stream_id, const void* data, int data_size, int wait_for_completion); +int bsp_stream_move_up(ebsp_stream* stream, const void* data, int data_size, int wait_for_completion); /** * Allocate external memory. diff --git a/include/ebsp_common.h b/include/ebsp_common.h index 9bcd5d0..5c4ff2b 100644 --- a/include/ebsp_common.h +++ b/include/ebsp_common.h @@ -100,10 +100,9 @@ typedef struct { int nbytes; // size of the stream including headers int max_chunksize; // size of required buffer in e_core memory ebsp_dma_handle e_dma_desc; // descriptor of dma, used as dma_id as well + int32_t pid; // Processor currently owning the stream or -1 if none void* current_buffer; // pointer (in e_core_mem) to current chunk void* next_buffer; // pointer (in e_core_mem) to next chunk - int is_down_stream; // is 1 if it is a down-stream, 0 if it is an up-stream - int _padding; // make sure struct is 8 byte aligned when packed in arrays } __attribute__((aligned(8))) ebsp_stream_descriptor; // ebsp_combuf is a struct for epiphany <-> ARM communication @@ -121,8 +120,8 @@ typedef struct { float remotetimer; int32_t nprocs; int32_t tagsize; // Only for initial and final messages - int n_streams[NPROCS]; - void* extmem_streams[NPROCS]; + int32_t nstreams; + ebsp_stream_descriptor* streams; // void* extmem_current_out_chunk[_NPROCS]; // int out_buffer_size[_NPROCS]; diff --git a/include/host_bsp.h b/include/host_bsp.h index 0aa1d29..b62cc93 100644 --- a/include/host_bsp.h +++ b/include/host_bsp.h @@ -278,59 +278,35 @@ void ebsp_move(void* payload, int buffer_size); */ int ebsp_hpmove(void** tag_ptr_buf, void** payload_ptr_buf); -/** - * Creates a down stream - * - * @param src The data which should be streamed down to an Epiphany core. - * @param dst_core_id The processor identifier of the receiving core. - * @param nbytes The total number of bytes of the data to be streamed down. - * @param chunksize The size in bytes of a single chunk. Must be at least 16. 
- * - * This function outputs an error if `chunksize` is less than 16. - * - * @remarks The data is copied from `src`, such that the data `src` can be - * safely freed or overwritten after this call. - */ - -void ebsp_create_down_stream(const void* src, int dst_core_id, int nbytes, - int chunksize); - -/** - * Creates an up stream - * - * @param dst_core_id The processor identifier of the sending core. - * @param max_nbytes The maximum number of bytes that will be sent up using - * this up stream. - * @param chunksize The maximum number of bytes of a single chunk that can be - * sent up through this stream. Must be at least 16. - * @return A pointer to a section of external memory storing the chunks - * sent up by the sending core. - * - * This function outputs an error if `chunksize` is less than 16. - */ -void* ebsp_create_up_stream(int dst_core_id, int max_nbytes, int chunksize); - /** * Creates a generic stream for streaming data to or from an Epiphany core. * - * @param processor_id The processor identifier of the core. * @param stream_size The total number of bytes of data in the stream. * @param token_size The size in bytes of a single token. Must be at least 16. * @param initial_data (Optional) The data which should be streamed to an * Epiphany core. * @return A pointer to a section of external memory storing the tokens. * - * If `initial_data` is non-zero, it is copied to the stream (`stream_size` + * The function returns NULL on failure. + * + * If `initial_data` is nonzero, it is copied to the stream (`stream_size` * bytes). * If `initial_data` is zero, an empty stream of size `stream_size` is created. * In this case, `stream_size` should be the maximum number of bytes that will * be sent up from the Epiphany cores to the host. * - * This function outputs an error if `token_size` is less than 16. + * This function prints an error if `token_size` is less than 16. 
+ * + * The format of the data pointed to by the return value is as follows: + * Before every token, there are two integers that specify the size + * of the preceding token and the size of the token itself. + * + * If you want to use the returned pointer directly you have to manually take + * care of this data format. * * @remarks If `initial_data` is nonzero, the data is copied so that after the * call it can safely be freed or overwritten by the user. */ -void* ebsp_stream_create(int processor_id, int stream_size, int token_size, +void* bsp_stream_create(int stream_size, int token_size, const void* initial_data); diff --git a/include/host_bsp_private.h b/include/host_bsp_private.h index 5fed59e..e6bd434 100644 --- a/include/host_bsp_private.h +++ b/include/host_bsp_private.h @@ -90,7 +90,7 @@ typedef struct { struct timespec ts_start, ts_end; // Buffer - ebsp_stream_descriptor buffered_streams[NPROCS][MAX_N_STREAMS]; + ebsp_stream_descriptor buffered_streams[MAX_N_STREAMS]; #ifdef DEBUG Symbol* e_symbols; diff --git a/src/e_bsp.c b/src/e_bsp.c index f1a8aea..03fd28f 100644 --- a/src/e_bsp.c +++ b/src/e_bsp.c @@ -91,16 +91,6 @@ void EXT_MEM_TEXT bsp_begin() { _init_local_malloc(); - // Copy stream descriptors to local memory - // TODO: do this only when the stream is opened - // and send them back when closed so that streams - // can change owner - unsigned int nbytes = - combuf->n_streams[coredata.pid] * sizeof(ebsp_stream_descriptor); - coredata.local_streams = ebsp_malloc(nbytes); - ebsp_memcpy(coredata.local_streams, combuf->extmem_streams[coredata.pid], - nbytes); - // Send &syncstate to ARM if (coredata.pid == 0) combuf->syncstate_ptr = (int8_t*)&coredata.syncstate; diff --git a/src/e_bsp_buffer.c b/src/e_bsp_buffer.c index ee5e306..bf42ae7 100644 --- a/src/e_bsp_buffer.c +++ b/src/e_bsp_buffer.c @@ -424,8 +424,8 @@ void ebsp_move_down_cursor(int stream_id, int jump_n_chunks) { // They are only the size of the data inbetween. 
// The local copies of the data include these 8 bytes. -int ebsp_stream_open(int stream_id) { - if (stream_id >= coredata.local_nstreams) { +int ebsp_stream_open(int stream_id, ebsp_stream* stream) { + if (stream_id >= combuf->nstreams) { ebsp_message(err_no_such_stream); return 0; } diff --git a/src/host_bsp_buffer.c b/src/host_bsp_buffer.c index 98e6233..3d409b8 100644 --- a/src/host_bsp_buffer.c +++ b/src/host_bsp_buffer.c @@ -29,104 +29,16 @@ see the files COPYING and COPYING.LESSER. If not, see extern bsp_state_t state; #define MINIMUM_CHUNK_SIZE (4 * sizeof(int)) -void ebsp_create_down_stream(const void* src, int dst_core_id, int nbytes, - int max_chunksize) { - if (max_chunksize < MINIMUM_CHUNK_SIZE) { - printf("ERROR: minimum chunk size is %i bytes\n", MINIMUM_CHUNK_SIZE); - return; - } - - int nchunks = (nbytes + max_chunksize - 1) / - max_chunksize; // nbytes/chunksize rounded up - - int nbytes_including_headers = - nbytes + nchunks * 2 * sizeof(int) + - 2 * sizeof(int); // the +2*sizeof(int) is the terminating header - // headers consist of 2 ints: prev size and next size - - // 1) malloc in extmem - void* extmem_in_buffer = ebsp_ext_malloc(nbytes_including_headers); - if (extmem_in_buffer == 0) { - printf( - "ERROR: not enough memory in extmem for ebsp_send_buffered_raw\n"); - return; - } - - // 2) copy the data to extmem, inserting headers - unsigned dst_cursor = (unsigned)extmem_in_buffer; - unsigned src_cursor = (unsigned)src; - - int current_chunksize = max_chunksize; - int last_chunksize = 0; - for (int nbytes_left = nbytes; nbytes_left > 0; - nbytes_left -= max_chunksize) { - if (nbytes_left < max_chunksize) - current_chunksize = nbytes_left; - - (*(int*)dst_cursor) = last_chunksize; // write prev header - dst_cursor += sizeof(int); - (*(int*)dst_cursor) = current_chunksize; // write next header - dst_cursor += sizeof(int); - - memcpy((void*)dst_cursor, (void*)src_cursor, current_chunksize); - - dst_cursor += current_chunksize; - src_cursor += 
current_chunksize; - - last_chunksize = current_chunksize; - } - - (*(int*)dst_cursor) = current_chunksize; // write terminating header (prev) - dst_cursor += sizeof(int); - (*(int*)dst_cursor) = 0; // write terminating header (next) - dst_cursor += sizeof(int); - - // 3) add stream to state - _ebsp_add_stream(dst_core_id, extmem_in_buffer, nbytes_including_headers, - max_chunksize, 1); -} - -void ebsp_create_down_stream_raw(const void* src, int dst_core_id, int nbytes, - int max_chunksize) { - // 1) malloc in extmem - void* extmem_in_buffer = ebsp_ext_malloc(nbytes); - if (extmem_in_buffer == 0) { - printf( - "ERROR: not enough memory in extmem for ebsp_send_buffered_raw\n"); - return; - } - // 2) copy the data there directly - memcpy(extmem_in_buffer, src, nbytes); - - // 3) add stream to state - _ebsp_add_stream(dst_core_id, extmem_in_buffer, nbytes, max_chunksize, 1); -} - -void* ebsp_create_up_stream(int src_core_id, int nbytes, int max_chunksize) { - if (max_chunksize < MINIMUM_CHUNK_SIZE) { - printf("ERROR: minimum chunk size is %i bytes\n", MINIMUM_CHUNK_SIZE); - return NULL; - } - - // 1) malloc in extmem - void* extmem_out_buffer = ebsp_ext_malloc(nbytes); - if (extmem_out_buffer == 0) { - printf("ERROR: not enough memory in extmem for ebsp_get_buffered\n"); - return NULL; - } - - // 2) add stream to state - _ebsp_add_stream(src_core_id, extmem_out_buffer, nbytes, max_chunksize, 0); - - return extmem_out_buffer; -} - -void* ebsp_stream_create(int processor_id, int stream_size, int token_size, +void* ebsp_stream_create(int stream_size, int token_size, const void* initial_data) { if (token_size < MINIMUM_CHUNK_SIZE) { printf("ERROR: minimum token size is %i bytes\n", MINIMUM_CHUNK_SIZE); return 0; } + if (state.combuf.nstreams == MAX_N_STREAMS) { + printf("ERROR: Reached limit of %d streams.\n", MAX_N_STREAMS); + return 0; + } // Amount of tokens, rounded up int ntokens = (stream_size + token_size - 1) / token_size; @@ -180,32 +92,20 @@ void* 
ebsp_stream_create(int processor_id, int stream_size, int token_size, dst_cursor += sizeof(int); } - // 3) add stream to state - _ebsp_add_stream(processor_id, extmem_buffer, nbytes_including_headers, - token_size, 0); - - return extmem_buffer; -} - -// add ebsp_stream_descriptor to state.buffered_streams, update state.n_streams -void _ebsp_add_stream(int core_id, void* extmem_buffer, int nbytes, - int max_chunksize, int is_down_stream) { - if (state.combuf.n_streams[core_id] == MAX_N_STREAMS) { - printf("ERROR: state.combuf.n_streams >= MAX_N_STREAMS\n"); - return; - } - + // 3) add stream to combuf ebsp_stream_descriptor x; x.extmem_addr = _arm_to_e_pointer(extmem_buffer); x.cursor = x.extmem_addr; - x.nbytes = nbytes; - x.max_chunksize = max_chunksize; + x.nbytes = nbytes_including_headers; + x.max_chunksize = token_size; + x.pid = -1; memset(&x.e_dma_desc, 0, sizeof(ebsp_dma_handle)); x.current_buffer = NULL; x.next_buffer = NULL; - x.is_down_stream = is_down_stream; - state.buffered_streams[core_id][state.combuf.n_streams[core_id]] = x; - state.combuf.n_streams[core_id]++; + state.buffered_streams[state.combuf.nstreams] = x; + state.combuf.nstreams++; + + return extmem_buffer; } From 9f035c83d2e0655c3525192508d21c5c6d092485 Mon Sep 17 00:00:00 2001 From: Tom Bannink Date: Wed, 18 Jan 2017 14:22:12 +0100 Subject: [PATCH 09/17] Update implementation of new streaming API --- include/e_bsp.h | 5 +- include/e_bsp_datatypes.h | 12 ++ include/e_bsp_private.h | 3 + include/host_bsp.h | 10 + src/e_bsp_buffer.c | 416 ++++---------------------------------- src/host_bsp_buffer.c | 2 +- 6 files changed, 67 insertions(+), 381 deletions(-) diff --git a/include/e_bsp.h b/include/e_bsp.h index 6d666eb..d0c70dd 100644 --- a/include/e_bsp.h +++ b/include/e_bsp.h @@ -449,7 +449,10 @@ int bsp_stream_open(ebsp_stream* stream, int stream_id); /** * Wait for pending transfers to complete and close a stream. 
* - * @param stream The handle of the stream + * @param stream The handle of the stream, opened by `bsp_stream_open`. + * + * Behaviour is undefined if `stream` is not a handle opened by + * `bsp_stream_open`. * * Cleans up the stream, and frees any buffers that may have been used by the * stream. diff --git a/include/e_bsp_datatypes.h b/include/e_bsp_datatypes.h index 3af9f0a..bfaac6f 100644 --- a/include/e_bsp_datatypes.h +++ b/include/e_bsp_datatypes.h @@ -31,3 +31,15 @@ typedef struct { void* dst_addr; } __attribute__((aligned(8))) ebsp_dma_handle; +typedef struct { + ebsp_dma_handle e_dma_desc; // descriptor of dma, used as dma_id as well + void* cursor; // current position of the stream in extmem + int32_t id; // stream_id of the stream + void* extmem_start; // extmem data in e_core address space + void* extmem_end; // end of allocated region + void* current_buffer; // pointer (in e_core_mem) to current chunk + void* next_buffer; // pointer (in e_core_mem) to next chunk + uint32_t max_chunksize; // maximum size of a token excluding the 8 byte header +} __attribute__((aligned(8))) ebsp_stream; + + diff --git a/include/e_bsp_private.h b/include/e_bsp_private.h index 7c3199a..1889b72 100644 --- a/include/e_bsp_private.h +++ b/include/e_bsp_private.h @@ -72,6 +72,9 @@ typedef struct { // Mutex for ebsp_message e_mutex_t ebsp_message_mutex; + // Mutex for opening a stream + e_mutex_t stream_mutex; + // Mutex for ebsp_ext_malloc (internal malloc does not have mutex) e_mutex_t malloc_mutex; diff --git a/include/host_bsp.h b/include/host_bsp.h index b62cc93..5a9c37c 100644 --- a/include/host_bsp.h +++ b/include/host_bsp.h @@ -301,6 +301,16 @@ int ebsp_hpmove(void** tag_ptr_buf, void** payload_ptr_buf); * Before every token, there are two integers that specify the size * of the preceding token and the size of the token itself. * + * 00000000, nextsize, data, + * prevsize, nextsize, data, + * ...
+ * prevsize, nextsize, data, + * prevsize, 00000000 + * + * So a header consists of two integers (8 byte total). + * The two sizes do NOT include these headers. + * They are only the size of the data inbetween. + * * If you want to use the returned pointer directly you have to manually take * care of this data format. * diff --git a/src/e_bsp_buffer.c b/src/e_bsp_buffer.c index bf42ae7..d64cf51 100644 --- a/src/e_bsp_buffer.c +++ b/src/e_bsp_buffer.c @@ -43,153 +43,10 @@ const char err_create_opened[] EXT_MEM_RO = const char err_out_of_memory[] EXT_MEM_RO = "BSP ERROR: could not allocate enough memory for stream"; -void ebsp_set_up_chunk_size(unsigned stream_id, int nbytes) { - ebsp_stream_descriptor* out_stream = &coredata.local_streams[stream_id]; +const char err_stream_in_use[] EXT_MEM_RO = + "BSP ERROR: stream with id %d is in use"; - int* header = (int*)out_stream->current_buffer; - // update the *next* value to the new numer of bytes - header[1] = nbytes; -} - -int ebsp_open_up_stream(void** address, unsigned stream_id) { - if (stream_id >= coredata.local_nstreams) { - ebsp_message(err_no_such_stream); - return 0; - } - - ebsp_stream_descriptor* stream = &coredata.local_streams[stream_id]; - - if (stream->is_down_stream) { - ebsp_message(err_mixed_up_down); - return 0; - } - - if (stream->current_buffer != NULL) { - ebsp_message(err_create_opened); - return 0; - } - - stream->current_buffer = ebsp_malloc(stream->max_chunksize + sizeof(int)); - if (stream->current_buffer == NULL) { - ebsp_message(err_out_of_memory); - return 0; - } - - (*address) = (void*)((unsigned)stream->current_buffer + sizeof(int)); - - // Set the size to max_chunksize - int* header = (int*)stream->current_buffer; - header[0] = stream->max_chunksize; - - stream->cursor = stream->extmem_addr; - - return stream->max_chunksize; -} - -void ebsp_close_up_stream(unsigned stream_id) { - if (stream_id >= coredata.local_nstreams) { - ebsp_message(err_no_such_stream); - return; - } - - 
ebsp_stream_descriptor* out_stream = &coredata.local_streams[stream_id]; - - if (out_stream->is_down_stream) { - ebsp_message(err_mixed_up_down); - return; - } - - if (out_stream->current_buffer == NULL) { - ebsp_message(err_close_closed); - return; - } - - // wait for data transfer to finish before closing - ebsp_dma_handle* desc = (ebsp_dma_handle*)&out_stream->e_dma_desc; - ebsp_dma_wait(desc); - - ebsp_free(out_stream->current_buffer); - out_stream->current_buffer = NULL; - - if (out_stream->next_buffer != NULL) { - ebsp_free(out_stream->next_buffer); - out_stream->next_buffer = NULL; - } -} - -int ebsp_move_chunk_up(void** address, unsigned stream_id, int prealloc) { - if (stream_id >= coredata.local_nstreams) { - ebsp_message(err_no_such_stream); - return 0; - } - - ebsp_stream_descriptor* stream = &coredata.local_streams[stream_id]; - - if (stream->is_down_stream) { - ebsp_message(err_mixed_up_down); - return 0; - } - - ebsp_dma_handle* desc = (ebsp_dma_handle*)&stream->e_dma_desc; - - // if we prealloced last time, we have to wait until dma is finished - if (stream->next_buffer != NULL) { - ebsp_dma_wait(desc); - } - - if (prealloc) { - if (stream->next_buffer == NULL) { - stream->next_buffer = - ebsp_malloc(stream->max_chunksize + sizeof(int)); - if (stream->next_buffer == NULL) { - ebsp_message(err_out_of_memory); - return 0; - } - } - - // read int header from current_buffer (next size) - int chunk_size = ((int*)stream->current_buffer)[0]; - - void* src = (void*)((unsigned)stream->current_buffer + sizeof(int)); - void* dst = stream->cursor; - - ebsp_dma_push(desc, dst, src, chunk_size); // start dma - // ebsp_dma_start(); - - void* tmp = stream->current_buffer; // swap buffers - stream->current_buffer = stream->next_buffer; - stream->next_buffer = tmp; - - stream->cursor += chunk_size; // move pointer in extmem - } else // no prealloc - { - if (stream->next_buffer != NULL) { - ebsp_free(stream->next_buffer); - stream->next_buffer = NULL; - } - - // read 
int header from current_buffer (next size) - int chunk_size = ((int*)stream->current_buffer)[0]; - - void* src = (void*)((unsigned)stream->current_buffer + sizeof(int)); - void* dst = stream->cursor; - - ebsp_dma_push(desc, dst, src, chunk_size); // start dma - // ebsp_dma_start(); - ebsp_dma_wait(desc); - - stream->cursor += chunk_size; // move pointer in extmem - } - - (*address) = (void*)((unsigned)stream->current_buffer + sizeof(int)); - - // Set the out_size to max_chunksize - *((int*)(stream->current_buffer)) = stream->max_chunksize; - - return stream->max_chunksize; -} - -void _ebsp_write_chunk(ebsp_stream_descriptor* stream, void* target) { +void _ebsp_write_chunk(ebsp_stream* stream, void* target) { // read 2nd int in header from ext (next size) int chunk_size = *(int*)(stream->cursor + sizeof(int)); ebsp_dma_handle* desc = (ebsp_dma_handle*)&(stream->e_dma_desc); @@ -212,205 +69,6 @@ void _ebsp_write_chunk(ebsp_stream_descriptor* stream, void* target) { } } -int ebsp_open_down_stream(void** address, unsigned stream_id) { - if (stream_id >= coredata.local_nstreams) { - ebsp_message(err_no_such_stream); - return 0; - } - - ebsp_stream_descriptor* stream = &coredata.local_streams[stream_id]; - - if (!stream->is_down_stream) { - ebsp_message(err_mixed_up_down); - return 0; - } - if (stream->current_buffer != NULL || stream->next_buffer != NULL) { - ebsp_message(err_open_opened); - return 0; - } - - stream->cursor = stream->extmem_addr; - - // this will be the current buffer when move_chunk_down gets called for - // the first time - stream->next_buffer = ebsp_malloc(stream->max_chunksize + 2 * sizeof(int)); - if (stream->next_buffer == NULL) { - ebsp_message(err_out_of_memory); - return 0; - } - - _ebsp_write_chunk(stream, stream->next_buffer); - - *address = (void*)((unsigned)stream->next_buffer + 2 * sizeof(int)); - - return stream->max_chunksize; -} - -void ebsp_close_down_stream(unsigned stream_id) { - if (stream_id >= coredata.local_nstreams) { - 
ebsp_message(err_no_such_stream); - return; - } - - ebsp_stream_descriptor* in_stream = &coredata.local_streams[stream_id]; - - if (!(in_stream->is_down_stream)) { - ebsp_message(err_mixed_up_down); - return; - } - - if (in_stream->current_buffer == NULL) { - ebsp_message(err_close_closed); - return; - } - - ebsp_dma_handle* desc = (ebsp_dma_handle*)&in_stream->e_dma_desc; - ebsp_dma_wait(desc); - - ebsp_free(in_stream->current_buffer); - in_stream->current_buffer = NULL; - - if (in_stream->next_buffer != NULL) { - ebsp_free(in_stream->next_buffer); - in_stream->next_buffer = NULL; - } -} - -int ebsp_move_chunk_down(void** address, unsigned stream_id, int prealloc) { - if (stream_id >= coredata.local_nstreams) { - ebsp_message(err_no_such_stream); - return 0; - } - - ebsp_stream_descriptor* stream = &coredata.local_streams[stream_id]; - - ebsp_dma_handle* desc = (ebsp_dma_handle*)(&(stream->e_dma_desc)); - - // if(stream->current_buffer == NULL) - // stream->current_buffer = - // ebsp_malloc(stream->max_chunksize + 2*sizeof(int)); - - // Here: current_buffer contains data from previous chunk - // this can be null the first time ebsp_move_chunk_down is called - - if (!(stream->is_down_stream)) { - ebsp_message(err_mixed_up_down); - return 0; - } - - if (stream->next_buffer == NULL) // did not prealloc last time - { - // overwrite current buffer - _ebsp_write_chunk(stream, stream->current_buffer); - } else // did prealloc last time - { - void* tmp = stream->current_buffer; - stream->current_buffer = stream->next_buffer; - stream->next_buffer = tmp; - } - - // either wait for dma_push from last prealloc (else) - // or the one we just started (if) - ebsp_dma_wait(desc); - - // Here: current_buffer contains data from THIS chunk - - // *address must point after the counter header - (*address) = (void*)((unsigned)stream->current_buffer + 2 * sizeof(int)); - - // the counter header - int current_chunk_size = - *((int*)((unsigned)stream->current_buffer + sizeof(int))); - - 
if (current_chunk_size == 0) // stream has ended - { - (*address) = NULL; - return 0; - } - - if (prealloc) { - if (stream->next_buffer == NULL) { - // no next buffer available, malloc it - stream->next_buffer = - ebsp_malloc(stream->max_chunksize + 2 * sizeof(int)); - if (stream->next_buffer == NULL) { - ebsp_message(err_out_of_memory); - return 0; - } - } - _ebsp_write_chunk(stream, stream->next_buffer); - } else { - // free malloced next buffer - if (stream->next_buffer != NULL) { - ebsp_free(stream->next_buffer); - stream->next_buffer = NULL; - } - } - - // Here: next_buffer should (possibly) point to data of NEXT - // chunk (begin written to) or be zero - - return current_chunk_size; -} - -void ebsp_reset_down_cursor(int stream_id) { - if (stream_id >= coredata.local_nstreams) { - ebsp_message(err_no_such_stream); - return; - } - - ebsp_stream_descriptor* in_stream = &coredata.local_streams[stream_id]; - - size_t chunk_size = -1; - - // break when previous block has size 0 (begin of stream) - while (chunk_size != 0) { - // read 1st int in (prev size) header from ext - chunk_size = *(int*)(in_stream->cursor); - in_stream->cursor = (void*)(((unsigned)(in_stream->cursor)) - - 2 * sizeof(int) - chunk_size); - } -} - -void ebsp_move_down_cursor(int stream_id, int jump_n_chunks) { - if (stream_id >= coredata.local_nstreams) { - ebsp_message(err_no_such_stream); - return; - } - - ebsp_stream_descriptor* in_stream = &coredata.local_streams[stream_id]; - - if (jump_n_chunks > 0) // jump forward - { - while (jump_n_chunks--) { - // read 2nd int in (next size) header from ext - size_t chunk_size = *(int*)(in_stream->cursor + sizeof(int)); - if (chunk_size == 0) { - ebsp_message(err_jump_out_of_bounds); - return; - } - in_stream->cursor = (void*)(((unsigned)(in_stream->cursor)) + - 2 * sizeof(int) + chunk_size); - } - } else // jump backward - { - while (jump_n_chunks++) { - // read 1st int in (prev size) header from ext - int chunk_size = *(int*)(in_stream->cursor); - if 
(chunk_size == 0) { - ebsp_message(err_jump_out_of_bounds); - return; - } - in_stream->cursor = (void*)(((unsigned)(in_stream->cursor)) - - 2 * sizeof(int) - chunk_size); - } - } -} - -// -// New streaming API starting here -// - // When stream headers are interleaved, they are saved as: // // 00000000, nextsize, data, @@ -424,26 +82,42 @@ void ebsp_move_down_cursor(int stream_id, int jump_n_chunks) { // They are only the size of the data inbetween. // The local copies of the data include these 8 bytes. -int ebsp_stream_open(int stream_id, ebsp_stream* stream) { +int bsp_stream_open(ebsp_stream* stream, int stream_id) { if (stream_id >= combuf->nstreams) { ebsp_message(err_no_such_stream); return 0; } - ebsp_stream_descriptor* stream = &coredata.local_streams[stream_id]; + ebsp_stream_descriptor* s = &(combuf->streams[stream_id]); + + int mypid = coredata.pid; + + e_mutex_lock(0, 0, &coredata.stream_mutex); + if (s->pid == -1) { + s->pid = mypid; + mypid = -1; + } + e_mutex_unlock(0, 0, &coredata.stream_mutex); + + if (mypid != -1) { + ebsp_message(err_stream_in_use, stream_id); + return 0; + } + + // Fill stream struct + stream->id = stream_id; + stream->extmem_start = s->extmem_addr; + stream->extmem_end = stream->extmem_start + s->nbytes; + stream->current_buffer = NULL; + stream->next_buffer = NULL; + stream->max_chunksize = s->max_chunksize; // Go to start - stream->cursor = stream->extmem_addr; + stream->cursor = stream->extmem_start; return stream->max_chunksize; } -void ebsp_stream_close(int stream_id) { - if (stream_id >= coredata.local_nstreams) { - ebsp_message(err_no_such_stream); - return; - } - ebsp_stream_descriptor* stream = &coredata.local_streams[stream_id]; - +void bsp_stream_close(ebsp_stream* stream) { // Wait for any data transfer to finish before closing ebsp_dma_wait(&stream->e_dma_desc); @@ -455,15 +129,13 @@ void ebsp_stream_close(int stream_id) { ebsp_free(stream->next_buffer); stream->next_buffer = NULL; } -} -void ebsp_stream_seek(int 
stream_id, int delta_tokens) { - if (stream_id >= coredata.local_nstreams) { - ebsp_message(err_no_such_stream); - return; - } - ebsp_stream_descriptor* stream = &coredata.local_streams[stream_id]; + // Should not have to lock mutex for this atomic write + combuf->streams[stream->id].pid = -1; + stream->id = -1; +} +void bsp_stream_seek(ebsp_stream* stream, int delta_tokens) { if (delta_tokens >= 0) { // forward while (delta_tokens--) { // read 2nd int (next size) in header @@ -474,7 +146,7 @@ void ebsp_stream_seek(int stream_id, int delta_tokens) { } } else { // backward if (delta_tokens == INT_MIN) { - stream->cursor = stream->extmem_addr; + stream->cursor = stream->extmem_start; } while (delta_tokens++) { @@ -487,14 +159,7 @@ void ebsp_stream_seek(int stream_id, int delta_tokens) { } } -int ebsp_stream_move_down(int stream_id, void** buffer, int preload) { - if (stream_id >= coredata.local_nstreams) { - ebsp_message(err_no_such_stream); - return 0; - } - - ebsp_stream_descriptor* stream = &coredata.local_streams[stream_id]; - +int bsp_stream_move_down(ebsp_stream* stream, void** buffer, int preload) { if (stream->current_buffer == NULL) { stream->current_buffer = ebsp_malloc(stream->max_chunksize + 2 * sizeof(int)); @@ -568,15 +233,8 @@ int ebsp_stream_move_down(int stream_id, void** buffer, int preload) { return current_chunk_size; } -int ebsp_stream_move_up(int stream_id, const void* data, int data_size, +int bsp_stream_move_up(ebsp_stream* stream, const void* data, int data_size, int wait_for_completion) { - if (stream_id >= coredata.local_nstreams) { - ebsp_message(err_no_such_stream); - return 0; - } - - ebsp_stream_descriptor* stream = &coredata.local_streams[stream_id]; - ebsp_dma_handle* desc = &stream->e_dma_desc; // Wait for any previous transfer to finish (either down or up) diff --git a/src/host_bsp_buffer.c b/src/host_bsp_buffer.c index 3d409b8..41ea3e2 100644 --- a/src/host_bsp_buffer.c +++ b/src/host_bsp_buffer.c @@ -29,7 +29,7 @@ see the files 
COPYING and COPYING.LESSER. If not, see extern bsp_state_t state; #define MINIMUM_CHUNK_SIZE (4 * sizeof(int)) -void* ebsp_stream_create(int stream_size, int token_size, +void* bsp_stream_create(int stream_size, int token_size, const void* initial_data) { if (token_size < MINIMUM_CHUNK_SIZE) { printf("ERROR: minimum token size is %i bytes\n", MINIMUM_CHUNK_SIZE); From ba74322bdcf7799e008f92d4f5c51c70c2ee8b58 Mon Sep 17 00:00:00 2001 From: Tom Bannink Date: Wed, 18 Jan 2017 14:54:39 +0100 Subject: [PATCH 10/17] Add deprecated stream functionality back --- Makefile | 2 + include/e_bsp.h | 1 + include/e_bsp_datatypes.h | 4 +- include/e_bsp_deprecated.h | 154 +++++++++++ include/ebsp_common.h | 8 +- include/host_bsp.h | 1 + include/host_bsp_deprecated.h | 55 ++++ include/host_bsp_private.h | 6 +- src/e_bsp_buffer.c | 31 +-- src/e_bsp_buffer_deprecated.c | 409 ++++++++++++++++++++++++++++ src/host_bsp.c | 11 +- src/host_bsp_buffer.c | 2 +- src/host_bsp_buffer_deprecated.c | 145 ++++++++++ test/bsp_streams/e_bsp_streams.c | 23 +- test/bsp_streams/host_bsp_streams.c | 4 +- 15 files changed, 811 insertions(+), 45 deletions(-) create mode 100644 include/e_bsp_deprecated.h create mode 100644 include/host_bsp_deprecated.h create mode 100644 src/e_bsp_buffer_deprecated.c create mode 100644 src/host_bsp_buffer_deprecated.c diff --git a/Makefile b/Makefile index f52f72e..bbd728e 100644 --- a/Makefile +++ b/Makefile @@ -21,6 +21,7 @@ E_SRCS = \ e_bsp_mp.c \ e_bsp_memory.c\ e_bsp_buffer.c \ + e_bsp_buffer_deprecated.c \ e_bsp_dma.c E_ASM_SRCS = \ @@ -40,6 +41,7 @@ HOST_SRCS = \ host_bsp.c \ host_bsp_memory.c \ host_bsp_buffer.c \ + host_bsp_buffer_deprecated.c \ host_bsp_mp.c \ host_bsp_utility.c \ host_bsp_debug.c diff --git a/include/e_bsp.h b/include/e_bsp.h index d0c70dd..d0df7b3 100644 --- a/include/e_bsp.h +++ b/include/e_bsp.h @@ -42,6 +42,7 @@ see the files COPYING and COPYING.LESSER. 
If not, see #include #include "e_bsp_datatypes.h" +#include "e_bsp_deprecated.h" /** * Denotes the start of a BSP program. diff --git a/include/e_bsp_datatypes.h b/include/e_bsp_datatypes.h index bfaac6f..cf40699 100644 --- a/include/e_bsp_datatypes.h +++ b/include/e_bsp_datatypes.h @@ -34,12 +34,12 @@ typedef struct { typedef struct { ebsp_dma_handle e_dma_desc; // descriptor of dma, used as dma_id as well void* cursor; // current position of the stream in extmem - int32_t id; // stream_id of the stream + int id; // stream_id of the stream void* extmem_start; // extmem data in e_core address space void* extmem_end; // end of allocated region void* current_buffer; // pointer (in e_core_mem) to current chunk void* next_buffer; // pointer (in e_core_mem) to next chunk - uint32_t max_chunksize; // maximum size of a token exluding 8 byte header + unsigned max_chunksize; // maximum size of a token exluding 8 byte header } __attribute__((aligned(8))) ebsp_stream; diff --git a/include/e_bsp_deprecated.h b/include/e_bsp_deprecated.h new file mode 100644 index 0000000..24f237e --- /dev/null +++ b/include/e_bsp_deprecated.h @@ -0,0 +1,154 @@ +/* +This file is part of the Epiphany BSP library. + +Copyright (C) 2014-2015 Buurlage Wits +Support e-mail: + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License (LGPL) +as published by the Free Software Foundation, either version 3 of the +License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +and the GNU Lesser General Public License along with this program, +see the files COPYING and COPYING.LESSER. If not, see +. 
+*/ + +#pragma once + +/** + * Obtain the next chunk of data from a stream. + * + * @param address A pointer to a value that is overwritten with the local + * memory location of the data chunk + * @param stream_id The identifier of the stream + * @param prealloc If this parameter is equal to `1` then the BSP system will + * use double buffering, if it is `0` then single buffering is used. + * @return Number of bytes of the obtained chunk. If stream has + * finished or an error has occurred this function will return `0`. + * + * @remarks Memory is transferred using the `DMA1` engine. + * @remarks When using double buffering, the BSP system will allocate memory + * for the next chunk, and will start writing to it using the DMA engine + * while the current chunk is processed. This requires more (local) memory, + * but can greatly increase the overall speed. + */ +int ebsp_move_chunk_down(void** address, unsigned stream_id, int prealloc); + +/** + * Move a chunk of data up from a stream. + + * @param address A pointer to a value that is overwritten with the local + * memory location where the next chunk should be written to. + * @param stream_id The identifier of the stream + * @param prealloc If this parameter is equal to `1` then the BSP system will + * use double buffering, if it is `0` then single buffering is used. + * @return Number of bytes allocated for the next chunk of this stream. if + * stream has finished or an error has occurred this function will return `0`. + * + * @remarks Memory is transferred using the `DMA1` engine. + * @remarks When using the double buffering mode, `*address` will contain the + * location of a new chunk of memory, such that the BSP program can continue + * while the current chunk is being copied using the DMA engine. This requires + * more local memory, but can improve the performance of the program. 
+ */ +int ebsp_move_chunk_up(void** address, unsigned stream_id, int prealloc); + +/** + * Move the cursor pointing to the next chunk to be obtained from a stream. + * + * @param stream_id The identifier of the stream + * @param jump_n_chunks The number of chunks to skip if `jump_n_chunks > 0`, + * or to go back if `jump_n_chunks < 0`. + * + * @remarks Internally a stream is a collection of data, along with specific + * information for a stream. For example, a stream holds a pointer to the + * next chunk that should be written to a core. Using this function you can + * change which chunk should be the `next` chunk. This allows you to obtain + * chunks multiple times, or to skip chunks completely. + * @remarks This function provides a mechanism through which chunks can be + * obtained multiple times. It gives you random access in the memory in + * the data stream. + * @remarks This function has `O(jump_n_chunks)` complexity. + */ +void ebsp_move_down_cursor(int stream_id, int jump_n_chunks); + +/** + * Resets the cursor pointing to the next chunk to be obtained from a stream. + * + * @param stream_id The identifier of the stream + * + * After calling this function the *next chunk* that is obtained from this + * stream is equal to the first chunk. + * + * @remarks This function has `O(1)` complexity. + */ +void ebsp_reset_down_cursor(int stream_id); + +/** + * Open an up stream. + * + * @param address Pointer to a variable that will be overwritten with the + * location where the data should be written for the first chunk that will + * be sent up. + * @param stream_id The identifier of the stream + * @return Number of bytes that can be written to the first chunk to be sent up. + * + * @remarks This function has to be called *before* performing any other operation + * on the stream. + * @remarks A call to the function should always match a single call to + * `ebsp_close_up_stream`. 
+ */ +int ebsp_open_up_stream(void** address, unsigned stream_id); + +/** + * Close an up stream. + * + * @param stream_id The identifier of the stream + * + * Cleans up the stream, and frees any buffers that may have been used by the + * stream. + */ +void ebsp_close_up_stream(unsigned stream_id); + +/** + * Open a down stream. + * + * @param address Pointer to a variable that will be overwritten with the + * location where the data should be written for the first chunk that will + * be sent up. + * @param stream_id The identifier of the stream + * @return The size of the first chunk of this stream in bytes. + * + * @remarks This function has to be called *before* performing any other operation + * on the stream. + * @remarks A call to the function should always match a single call to + * `ebsp_close_down_stream`. + */ +int ebsp_open_down_stream(void** address, unsigned stream_id); + +/** + * Close a down stream. + * + * @param stream_id The identifier of the stream + * + * Cleans up the stream, and frees any buffers that may have been used by the + * stream. + */ +void ebsp_close_down_stream(unsigned stream_id); + +/** + * Set the number of bytes of the current chunk in the up stream. + * + * @param stream_id The identifier of the stream + * @param nbytes The number of bytes that should be moved from the current + * chunk of data in the next call to `ebsp_move_chunk_up`. 
+ */ +void ebsp_set_up_chunk_size(unsigned stream_id, int nbytes); + diff --git a/include/ebsp_common.h b/include/ebsp_common.h index 5c4ff2b..79b92af 100644 --- a/include/ebsp_common.h +++ b/include/ebsp_common.h @@ -103,6 +103,8 @@ typedef struct { int32_t pid; // Processor currently owning the stream or -1 if none void* current_buffer; // pointer (in e_core_mem) to current chunk void* next_buffer; // pointer (in e_core_mem) to next chunk + int is_down_stream; // is 1 if it is a down-stream, 0 if it is an up-stream + int _padding; // make sure struct is 8 byte aligned when packed in arrays } __attribute__((aligned(8))) ebsp_stream_descriptor; // ebsp_combuf is a struct for epiphany <-> ARM communication @@ -120,10 +122,12 @@ typedef struct { float remotetimer; int32_t nprocs; int32_t tagsize; // Only for initial and final messages + // Deprecated streams + int n_streams[NPROCS]; + void* extmem_streams[NPROCS]; + // New streams int32_t nstreams; ebsp_stream_descriptor* streams; - // void* extmem_current_out_chunk[_NPROCS]; - // int out_buffer_size[_NPROCS]; // Epiphany <--> Epiphany ebsp_data_request data_requests[NPROCS][MAX_DATA_REQUESTS]; diff --git a/include/host_bsp.h b/include/host_bsp.h index 5a9c37c..f9a5ad9 100644 --- a/include/host_bsp.h +++ b/include/host_bsp.h @@ -56,6 +56,7 @@ see the files COPYING and COPYING.LESSER. If not, see #pragma once #include +#include "host_bsp_deprecated.h" /** * Write data to the Epiphany processor. diff --git a/include/host_bsp_deprecated.h b/include/host_bsp_deprecated.h new file mode 100644 index 0000000..0a4c520 --- /dev/null +++ b/include/host_bsp_deprecated.h @@ -0,0 +1,55 @@ +/* +This file is part of the Epiphany BSP library. 
+ +Copyright (C) 2014-2015 Buurlage Wits +Support e-mail: + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License (LGPL) +as published by the Free Software Foundation, either version 3 of the +License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +and the GNU Lesser General Public License along with this program, +see the files COPYING and COPYING.LESSER. If not, see +. +*/ + +#pragma once + +/** + * Creates a down stream + * + * @param src The data which should be streamed down to an Epiphany core. + * @param dst_core_id The processor identifier of the receiving core. + * @param nbytes The total number of bytes of the data to be streamed down. + * @param chunksize The size in bytes of a single chunk. Must be at least 16. + * + * This function outputs an error if `chunksize` is less than 16. + * + * @remarks The data is copied from `src`, such that the data `src` can be + * safely freed or overwritten after this call. + */ + +void ebsp_create_down_stream(const void* src, int dst_core_id, int nbytes, + int chunksize); + +/** + * Creates an up stream + * + * @param dst_core_id The processor identifier of the sending core. + * @param max_nbytes The maximum number of bytes that will be sent up using + * this up stream. + * @param chunksize The maximum number of bytes of a single chunk that can be + * sent up through this stream. Must be at least 16. + * @return A pointer to a section of external memory storing the chunks + * sent up by the sending core. + * + * This function outputs an error if `chunksize` is less than 16. 
+ */ +void* ebsp_create_up_stream(int dst_core_id, int max_nbytes, int chunksize); diff --git a/include/host_bsp_private.h b/include/host_bsp_private.h index e6bd434..74af35c 100644 --- a/include/host_bsp_private.h +++ b/include/host_bsp_private.h @@ -89,8 +89,10 @@ typedef struct { // Timer storage struct timespec ts_start, ts_end; - // Buffer - ebsp_stream_descriptor buffered_streams[MAX_N_STREAMS]; + // Buffer. First is deprecated, second is new version + ebsp_stream_descriptor buffered_streams[NPROCS][MAX_N_STREAMS]; + ebsp_stream_descriptor shared_streams[MAX_N_STREAMS]; + #ifdef DEBUG Symbol* e_symbols; diff --git a/src/e_bsp_buffer.c b/src/e_bsp_buffer.c index d64cf51..07cecbd 100644 --- a/src/e_bsp_buffer.c +++ b/src/e_bsp_buffer.c @@ -23,30 +23,15 @@ see the files COPYING and COPYING.LESSER. If not, see #include "e_bsp_private.h" #include -const char err_no_such_stream[] EXT_MEM_RO = "BSP ERROR: stream does not exist"; +const char err_no_such_stream2[] EXT_MEM_RO = "BSP ERROR: stream does not exist"; -const char err_mixed_up_down[] EXT_MEM_RO = - "BSP ERROR: mixed up and down streams"; - -const char err_close_closed[] EXT_MEM_RO = - "BSP ERROR: tried to close closed stream"; - -const char err_open_opened[] EXT_MEM_RO = - "BSP ERROR: tried to open opened stream"; - -const char err_jump_out_of_bounds[] EXT_MEM_RO = - "BSP ERROR: tried jumping past bounds of stream"; - -const char err_create_opened[] EXT_MEM_RO = - "BSP ERROR: tried creating opened stream"; - -const char err_out_of_memory[] EXT_MEM_RO = +const char err_out_of_memory2[] EXT_MEM_RO = "BSP ERROR: could not allocate enough memory for stream"; const char err_stream_in_use[] EXT_MEM_RO = "BSP ERROR: stream with id %d is in use"; -void _ebsp_write_chunk(ebsp_stream* stream, void* target) { +void _ebsp_read_chunk(ebsp_stream* stream, void* target) { // read 2nd int in header from ext (next size) int chunk_size = *(int*)(stream->cursor + sizeof(int)); ebsp_dma_handle* desc = 
(ebsp_dma_handle*)&(stream->e_dma_desc); @@ -84,7 +69,7 @@ void _ebsp_write_chunk(ebsp_stream* stream, void* target) { int bsp_stream_open(ebsp_stream* stream, int stream_id) { if (stream_id >= combuf->nstreams) { - ebsp_message(err_no_such_stream); + ebsp_message(err_no_such_stream2); return 0; } ebsp_stream_descriptor* s = &(combuf->streams[stream_id]); @@ -164,7 +149,7 @@ int bsp_stream_move_down(ebsp_stream* stream, void** buffer, int preload) { stream->current_buffer = ebsp_malloc(stream->max_chunksize + 2 * sizeof(int)); if (stream->current_buffer == NULL) { - ebsp_message(err_out_of_memory); + ebsp_message(err_out_of_memory2); return 0; } } @@ -183,7 +168,7 @@ int bsp_stream_move_down(ebsp_stream* stream, void** buffer, int preload) { if (stream->next_buffer == NULL) { // Data not here yet (did not preload last time) // Overwrite current buffer. - _ebsp_write_chunk(stream, stream->current_buffer); + _ebsp_read_chunk(stream, stream->current_buffer); ebsp_dma_wait(&(stream->e_dma_desc)); } else { // Data is locally available already in next_buffer (preload). @@ -215,11 +200,11 @@ int bsp_stream_move_down(ebsp_stream* stream, void** buffer, int preload) { stream->next_buffer = ebsp_malloc(stream->max_chunksize + 2 * sizeof(int)); if (stream->next_buffer == NULL) { - ebsp_message(err_out_of_memory); + ebsp_message(err_out_of_memory2); return 0; } } - _ebsp_write_chunk(stream, stream->next_buffer); + _ebsp_read_chunk(stream, stream->next_buffer); } else { // free malloced next buffer if (stream->next_buffer != NULL) { diff --git a/src/e_bsp_buffer_deprecated.c b/src/e_bsp_buffer_deprecated.c new file mode 100644 index 0000000..8aa2776 --- /dev/null +++ b/src/e_bsp_buffer_deprecated.c @@ -0,0 +1,409 @@ +/* +This file is part of the Epiphany BSP library. 
+ +Copyright (C) 2014-2015 Buurlage Wits +Support e-mail: + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License (LGPL) +as published by the Free Software Foundation, either version 3 of the +License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +and the GNU Lesser General Public License along with this program, +see the files COPYING and COPYING.LESSER. If not, see +. +*/ + +#include "e_bsp_private.h" +#include + +const char err_no_such_stream[] EXT_MEM_RO = "BSP ERROR: stream does not exist"; + +const char err_mixed_up_down[] EXT_MEM_RO = + "BSP ERROR: mixed up and down streams"; + +const char err_close_closed[] EXT_MEM_RO = + "BSP ERROR: tried to close closed stream"; + +const char err_open_opened[] EXT_MEM_RO = + "BSP ERROR: tried to open opened stream"; + +const char err_jump_out_of_bounds[] EXT_MEM_RO = + "BSP ERROR: tried jumping past bounds of stream"; + +const char err_create_opened[] EXT_MEM_RO = + "BSP ERROR: tried creating opened stream"; + +const char err_out_of_memory[] EXT_MEM_RO = + "BSP ERROR: could not allocate enough memory for stream"; + +void ebsp_set_up_chunk_size(unsigned stream_id, int nbytes) { + ebsp_stream_descriptor* out_stream = &coredata.local_streams[stream_id]; + + int* header = (int*)out_stream->current_buffer; + // update the *next* value to the new numer of bytes + header[1] = nbytes; +} + +int ebsp_open_up_stream(void** address, unsigned stream_id) { + if (stream_id >= coredata.local_nstreams) { + ebsp_message(err_no_such_stream); + return 0; + } + + ebsp_stream_descriptor* stream = &coredata.local_streams[stream_id]; + + if (stream->is_down_stream) { + 
ebsp_message(err_mixed_up_down); + return 0; + } + + if (stream->current_buffer != NULL) { + ebsp_message(err_create_opened); + return 0; + } + + stream->current_buffer = ebsp_malloc(stream->max_chunksize + sizeof(int)); + if (stream->current_buffer == NULL) { + ebsp_message(err_out_of_memory); + return 0; + } + + (*address) = (void*)((unsigned)stream->current_buffer + sizeof(int)); + + // Set the size to max_chunksize + int* header = (int*)stream->current_buffer; + header[0] = stream->max_chunksize; + + stream->cursor = stream->extmem_addr; + + return stream->max_chunksize; +} + +void ebsp_close_up_stream(unsigned stream_id) { + if (stream_id >= coredata.local_nstreams) { + ebsp_message(err_no_such_stream); + return; + } + + ebsp_stream_descriptor* out_stream = &coredata.local_streams[stream_id]; + + if (out_stream->is_down_stream) { + ebsp_message(err_mixed_up_down); + return; + } + + if (out_stream->current_buffer == NULL) { + ebsp_message(err_close_closed); + return; + } + + // wait for data transfer to finish before closing + ebsp_dma_handle* desc = (ebsp_dma_handle*)&out_stream->e_dma_desc; + ebsp_dma_wait(desc); + + ebsp_free(out_stream->current_buffer); + out_stream->current_buffer = NULL; + + if (out_stream->next_buffer != NULL) { + ebsp_free(out_stream->next_buffer); + out_stream->next_buffer = NULL; + } +} + +int ebsp_move_chunk_up(void** address, unsigned stream_id, int prealloc) { + if (stream_id >= coredata.local_nstreams) { + ebsp_message(err_no_such_stream); + return 0; + } + + ebsp_stream_descriptor* stream = &coredata.local_streams[stream_id]; + + if (stream->is_down_stream) { + ebsp_message(err_mixed_up_down); + return 0; + } + + ebsp_dma_handle* desc = (ebsp_dma_handle*)&stream->e_dma_desc; + + // if we prealloced last time, we have to wait until dma is finished + if (stream->next_buffer != NULL) { + ebsp_dma_wait(desc); + } + + if (prealloc) { + if (stream->next_buffer == NULL) { + stream->next_buffer = + ebsp_malloc(stream->max_chunksize + 
sizeof(int)); + if (stream->next_buffer == NULL) { + ebsp_message(err_out_of_memory); + return 0; + } + } + + // read int header from current_buffer (next size) + int chunk_size = ((int*)stream->current_buffer)[0]; + + void* src = (void*)((unsigned)stream->current_buffer + sizeof(int)); + void* dst = stream->cursor; + + ebsp_dma_push(desc, dst, src, chunk_size); // start dma + // ebsp_dma_start(); + + void* tmp = stream->current_buffer; // swap buffers + stream->current_buffer = stream->next_buffer; + stream->next_buffer = tmp; + + stream->cursor += chunk_size; // move pointer in extmem + } else // no prealloc + { + if (stream->next_buffer != NULL) { + ebsp_free(stream->next_buffer); + stream->next_buffer = NULL; + } + + // read int header from current_buffer (next size) + int chunk_size = ((int*)stream->current_buffer)[0]; + + void* src = (void*)((unsigned)stream->current_buffer + sizeof(int)); + void* dst = stream->cursor; + + ebsp_dma_push(desc, dst, src, chunk_size); // start dma + // ebsp_dma_start(); + ebsp_dma_wait(desc); + + stream->cursor += chunk_size; // move pointer in extmem + } + + (*address) = (void*)((unsigned)stream->current_buffer + sizeof(int)); + + // Set the out_size to max_chunksize + *((int*)(stream->current_buffer)) = stream->max_chunksize; + + return stream->max_chunksize; +} + +void _ebsp_write_chunk(ebsp_stream_descriptor* stream, void* target) { + // read 2nd int in header from ext (next size) + int chunk_size = *(int*)(stream->cursor + sizeof(int)); + ebsp_dma_handle* desc = (ebsp_dma_handle*)&(stream->e_dma_desc); + + if (chunk_size != 0) // stream has not ended + { + void* dst = target; + void* src = stream->cursor; + + // write to current + ebsp_dma_push(desc, dst, src, chunk_size + 2 * sizeof(int)); + // ebsp_dma_start(); + + // jump over header+chunk + stream->cursor = (void*)(((unsigned)(stream->cursor)) + + 2 * sizeof(int) + chunk_size); + } else { + // set next size to 0 + *((int*)(target + sizeof(int))) = 0; + } +} + +int 
ebsp_open_down_stream(void** address, unsigned stream_id) { + if (stream_id >= coredata.local_nstreams) { + ebsp_message(err_no_such_stream); + return 0; + } + + ebsp_stream_descriptor* stream = &coredata.local_streams[stream_id]; + + if (!stream->is_down_stream) { + ebsp_message(err_mixed_up_down); + return 0; + } + if (stream->current_buffer != NULL || stream->next_buffer != NULL) { + ebsp_message(err_open_opened); + return 0; + } + + stream->cursor = stream->extmem_addr; + + // this will be the current buffer when move_chunk_down gets called for + // the first time + stream->next_buffer = ebsp_malloc(stream->max_chunksize + 2 * sizeof(int)); + if (stream->next_buffer == NULL) { + ebsp_message(err_out_of_memory); + return 0; + } + + _ebsp_write_chunk(stream, stream->next_buffer); + + *address = (void*)((unsigned)stream->next_buffer + 2 * sizeof(int)); + + return stream->max_chunksize; +} + +void ebsp_close_down_stream(unsigned stream_id) { + if (stream_id >= coredata.local_nstreams) { + ebsp_message(err_no_such_stream); + return; + } + + ebsp_stream_descriptor* in_stream = &coredata.local_streams[stream_id]; + + if (!(in_stream->is_down_stream)) { + ebsp_message(err_mixed_up_down); + return; + } + + if (in_stream->current_buffer == NULL) { + ebsp_message(err_close_closed); + return; + } + + ebsp_dma_handle* desc = (ebsp_dma_handle*)&in_stream->e_dma_desc; + ebsp_dma_wait(desc); + + ebsp_free(in_stream->current_buffer); + in_stream->current_buffer = NULL; + + if (in_stream->next_buffer != NULL) { + ebsp_free(in_stream->next_buffer); + in_stream->next_buffer = NULL; + } +} + +int ebsp_move_chunk_down(void** address, unsigned stream_id, int prealloc) { + if (stream_id >= coredata.local_nstreams) { + ebsp_message(err_no_such_stream); + return 0; + } + + ebsp_stream_descriptor* stream = &coredata.local_streams[stream_id]; + + ebsp_dma_handle* desc = (ebsp_dma_handle*)(&(stream->e_dma_desc)); + + // if(stream->current_buffer == NULL) + // stream->current_buffer = + // 
ebsp_malloc(stream->max_chunksize + 2*sizeof(int)); + + // Here: current_buffer contains data from previous chunk + // this can be null the first time ebsp_move_chunk_down is called + + if (!(stream->is_down_stream)) { + ebsp_message(err_mixed_up_down); + return 0; + } + + if (stream->next_buffer == NULL) // did not prealloc last time + { + // overwrite current buffer + _ebsp_write_chunk(stream, stream->current_buffer); + } else // did prealloc last time + { + void* tmp = stream->current_buffer; + stream->current_buffer = stream->next_buffer; + stream->next_buffer = tmp; + } + + // either wait for dma_push from last prealloc (else) + // or the one we just started (if) + ebsp_dma_wait(desc); + + // Here: current_buffer contains data from THIS chunk + + // *address must point after the counter header + (*address) = (void*)((unsigned)stream->current_buffer + 2 * sizeof(int)); + + // the counter header + int current_chunk_size = + *((int*)((unsigned)stream->current_buffer + sizeof(int))); + + if (current_chunk_size == 0) // stream has ended + { + (*address) = NULL; + return 0; + } + + if (prealloc) { + if (stream->next_buffer == NULL) { + // no next buffer available, malloc it + stream->next_buffer = + ebsp_malloc(stream->max_chunksize + 2 * sizeof(int)); + if (stream->next_buffer == NULL) { + ebsp_message(err_out_of_memory); + return 0; + } + } + _ebsp_write_chunk(stream, stream->next_buffer); + } else { + // free malloced next buffer + if (stream->next_buffer != NULL) { + ebsp_free(stream->next_buffer); + stream->next_buffer = NULL; + } + } + + // Here: next_buffer should (possibly) point to data of NEXT + // chunk (begin written to) or be zero + + return current_chunk_size; +} + +void ebsp_reset_down_cursor(int stream_id) { + if (stream_id >= coredata.local_nstreams) { + ebsp_message(err_no_such_stream); + return; + } + + ebsp_stream_descriptor* in_stream = &coredata.local_streams[stream_id]; + + size_t chunk_size = -1; + + // break when previous block has size 0 
(begin of stream) + while (chunk_size != 0) { + // read 1st int in (prev size) header from ext + chunk_size = *(int*)(in_stream->cursor); + in_stream->cursor = (void*)(((unsigned)(in_stream->cursor)) - + 2 * sizeof(int) - chunk_size); + } +} + +void ebsp_move_down_cursor(int stream_id, int jump_n_chunks) { + if (stream_id >= coredata.local_nstreams) { + ebsp_message(err_no_such_stream); + return; + } + + ebsp_stream_descriptor* in_stream = &coredata.local_streams[stream_id]; + + if (jump_n_chunks > 0) // jump forward + { + while (jump_n_chunks--) { + // read 2nd int in (next size) header from ext + size_t chunk_size = *(int*)(in_stream->cursor + sizeof(int)); + if (chunk_size == 0) { + ebsp_message(err_jump_out_of_bounds); + return; + } + in_stream->cursor = (void*)(((unsigned)(in_stream->cursor)) + + 2 * sizeof(int) + chunk_size); + } + } else // jump backward + { + while (jump_n_chunks++) { + // read 1st int in (prev size) header from ext + int chunk_size = *(int*)(in_stream->cursor); + if (chunk_size == 0) { + ebsp_message(err_jump_out_of_bounds); + return; + } + in_stream->cursor = (void*)(((unsigned)(in_stream->cursor)) - + 2 * sizeof(int) - chunk_size); + } + } +} + diff --git a/src/host_bsp.c b/src/host_bsp.c index e9c32bc..1fe2b7c 100644 --- a/src/host_bsp.c +++ b/src/host_bsp.c @@ -161,14 +161,21 @@ int ebsp_spmd() { } // Write stream structs to combuf + extmem + + // Depcrecated streams: for (int p = 0; p < NPROCS; p++) { int nbytes = state.combuf.n_streams[p] * sizeof(ebsp_stream_descriptor); void* stream_descriptors = ebsp_ext_malloc(nbytes); memcpy(stream_descriptors, state.buffered_streams[p], nbytes); state.combuf.extmem_streams[p] = _arm_to_e_pointer(stream_descriptors); + } - // TODO void* extmem_current_out_chunk[NPROCS]; - // TODO int out_buffer_size[NPROCS]; + // New streams: + { + int nbytes = state.combuf.nstreams * sizeof(ebsp_stream_descriptor); + void* stream_descriptors = ebsp_ext_malloc(nbytes); + memcpy(stream_descriptors, 
state.shared_streams, nbytes); + state.combuf.streams = _arm_to_e_pointer(stream_descriptors); } // Write communication buffer containing nprocs, diff --git a/src/host_bsp_buffer.c b/src/host_bsp_buffer.c index 41ea3e2..705a9bd 100644 --- a/src/host_bsp_buffer.c +++ b/src/host_bsp_buffer.c @@ -104,7 +104,7 @@ void* bsp_stream_create(int stream_size, int token_size, x.current_buffer = NULL; x.next_buffer = NULL; - state.buffered_streams[state.combuf.nstreams] = x; + state.shared_streams[state.combuf.nstreams] = x; state.combuf.nstreams++; return extmem_buffer; diff --git a/src/host_bsp_buffer_deprecated.c b/src/host_bsp_buffer_deprecated.c new file mode 100644 index 0000000..804c1dc --- /dev/null +++ b/src/host_bsp_buffer_deprecated.c @@ -0,0 +1,145 @@ +/* +This file is part of the Epiphany BSP library. + +Copyright (C) 2014-2015 Buurlage Wits +Support e-mail: + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License (LGPL) +as published by the Free Software Foundation, either version 3 of the +License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +and the GNU Lesser General Public License along with this program, +see the files COPYING and COPYING.LESSER. If not, see +. 
+*/ + +#include "host_bsp_private.h" + +#include +#include + + +extern bsp_state_t state; +#define MINIMUM_CHUNK_SIZE (4 * sizeof(int)) + +void ebsp_create_down_stream(const void* src, int dst_core_id, int nbytes, + int max_chunksize) { + if (max_chunksize < MINIMUM_CHUNK_SIZE) { + printf("ERROR: minimum chunk size is %i bytes\n", MINIMUM_CHUNK_SIZE); + return; + } + + int nchunks = (nbytes + max_chunksize - 1) / + max_chunksize; // nbytes/chunksize rounded up + + int nbytes_including_headers = + nbytes + nchunks * 2 * sizeof(int) + + 2 * sizeof(int); // the +2*sizeof(int) is the terminating header + // headers consist of 2 ints: prev size and next size + + // 1) malloc in extmem + void* extmem_in_buffer = ebsp_ext_malloc(nbytes_including_headers); + if (extmem_in_buffer == 0) { + printf( + "ERROR: not enough memory in extmem for ebsp_send_buffered_raw\n"); + return; + } + + // 2) copy the data to extmem, inserting headers + unsigned dst_cursor = (unsigned)extmem_in_buffer; + unsigned src_cursor = (unsigned)src; + + int current_chunksize = max_chunksize; + int last_chunksize = 0; + for (int nbytes_left = nbytes; nbytes_left > 0; + nbytes_left -= max_chunksize) { + if (nbytes_left < max_chunksize) + current_chunksize = nbytes_left; + + (*(int*)dst_cursor) = last_chunksize; // write prev header + dst_cursor += sizeof(int); + (*(int*)dst_cursor) = current_chunksize; // write next header + dst_cursor += sizeof(int); + + memcpy((void*)dst_cursor, (void*)src_cursor, current_chunksize); + + dst_cursor += current_chunksize; + src_cursor += current_chunksize; + + last_chunksize = current_chunksize; + } + + (*(int*)dst_cursor) = current_chunksize; // write terminating header (prev) + dst_cursor += sizeof(int); + (*(int*)dst_cursor) = 0; // write terminating header (next) + dst_cursor += sizeof(int); + + // 3) add stream to state + _ebsp_add_stream(dst_core_id, extmem_in_buffer, nbytes_including_headers, + max_chunksize, 1); +} + +void ebsp_create_down_stream_raw(const void* 
src, int dst_core_id, int nbytes, + int max_chunksize) { + // 1) malloc in extmem + void* extmem_in_buffer = ebsp_ext_malloc(nbytes); + if (extmem_in_buffer == 0) { + printf( + "ERROR: not enough memory in extmem for ebsp_send_buffered_raw\n"); + return; + } + // 2) copy the data there directly + memcpy(extmem_in_buffer, src, nbytes); + + // 3) add stream to state + _ebsp_add_stream(dst_core_id, extmem_in_buffer, nbytes, max_chunksize, 1); +} + +void* ebsp_create_up_stream(int src_core_id, int nbytes, int max_chunksize) { + if (max_chunksize < MINIMUM_CHUNK_SIZE) { + printf("ERROR: minimum chunk size is %i bytes\n", MINIMUM_CHUNK_SIZE); + return NULL; + } + + // 1) malloc in extmem + void* extmem_out_buffer = ebsp_ext_malloc(nbytes); + if (extmem_out_buffer == 0) { + printf("ERROR: not enough memory in extmem for ebsp_get_buffered\n"); + return NULL; + } + + // 2) add stream to state + _ebsp_add_stream(src_core_id, extmem_out_buffer, nbytes, max_chunksize, 0); + + return extmem_out_buffer; +} + +// add ebsp_stream_descriptor to state.buffered_streams, update state.n_streams +void _ebsp_add_stream(int core_id, void* extmem_buffer, int nbytes, + int max_chunksize, int is_down_stream) { + if (state.combuf.n_streams[core_id] == MAX_N_STREAMS) { + printf("ERROR: state.combuf.n_streams >= MAX_N_STREAMS\n"); + return; + } + + ebsp_stream_descriptor x; + + x.extmem_addr = _arm_to_e_pointer(extmem_buffer); + x.cursor = x.extmem_addr; + x.nbytes = nbytes; + x.max_chunksize = max_chunksize; + memset(&x.e_dma_desc, 0, sizeof(ebsp_dma_handle)); + x.current_buffer = NULL; + x.next_buffer = NULL; + x.is_down_stream = is_down_stream; + + state.buffered_streams[core_id][state.combuf.n_streams[core_id]] = x; + state.combuf.n_streams[core_id]++; +} diff --git a/test/bsp_streams/e_bsp_streams.c b/test/bsp_streams/e_bsp_streams.c index fb4db89..45cb773 100644 --- a/test/bsp_streams/e_bsp_streams.c +++ b/test/bsp_streams/e_bsp_streams.c @@ -96,11 +96,12 @@ int main() { // expect: ($00: 
BSP ERROR: stream does not exist) // New streaming API - int tokensize = ebsp_stream_open(5); - int tokensize2 = ebsp_stream_open(6); + ebsp_stream s1, s2; + int tokensize = bsp_stream_open(&s1, 2 * s + 0); + int tokensize2 = bsp_stream_open(&s2, 2 * s + 1); if (tokensize != tokensize2) - ebsp_message("Invalid token size at ebsp_stream_open"); + ebsp_message("Invalid token size at bsp_stream_open"); // Double buffered upstream int* up1 = ebsp_malloc(tokensize); @@ -109,14 +110,14 @@ int main() { // First stream down from 6 and copy it into 5 for (;;) { int* buffer; - int size = ebsp_stream_move_down(6, (void**)&buffer, 1); + int size = bsp_stream_move_down(&s2, (void**)&buffer, 1); if (size == 0) break; for (int j = 0; j < tokensize / sizeof(int); ++j) up1[j] = buffer[j]; - ebsp_stream_move_up(5, up1, size, 0); + bsp_stream_move_up(&s1, up1, size, 0); // swap buffers int* tmp = up1; up1 = up2; @@ -124,26 +125,26 @@ int main() { } // Now stream down from 5, double the values, and copy it into 6 - ebsp_stream_seek(5, INT_MIN); // go back to start - ebsp_stream_seek(6, INT_MIN); // go back to start + bsp_stream_seek(&s1, INT_MIN); // go back to start + bsp_stream_seek(&s2, INT_MIN); // go back to start for (;;) { int* buffer; - int size = ebsp_stream_move_down(5, (void**)&buffer, 1); + int size = bsp_stream_move_down(&s1, (void**)&buffer, 1); if (size == 0) break; for (int j = 0; j < tokensize / sizeof(int); ++j) up1[j] = 2 * buffer[j]; - ebsp_stream_move_up(6, up1, size, 0); + bsp_stream_move_up(&s2, up1, size, 0); // swap buffers int* tmp = up1; up1 = up2; up2 = tmp; } - ebsp_stream_close(5); - ebsp_stream_close(6); + bsp_stream_close(&s1); + bsp_stream_close(&s2); ebsp_free(up1); ebsp_free(up2); diff --git a/test/bsp_streams/host_bsp_streams.c b/test/bsp_streams/host_bsp_streams.c index 2496d0a..daa1bd9 100644 --- a/test/bsp_streams/host_bsp_streams.c +++ b/test/bsp_streams/host_bsp_streams.c @@ -63,9 +63,9 @@ int main(int argc, char** argv) { int** streams2 = 
malloc(sizeof(int*) * bsp_nprocs()); for (int s = 0; s < bsp_nprocs(); ++s) { - streams1[s] = ebsp_stream_create(s, chunks * chunk_size, chunk_size, 0); + streams1[s] = bsp_stream_create(chunks * chunk_size, chunk_size, 0); streams2[s] = - ebsp_stream_create(s, chunks * chunk_size, chunk_size, downdata); + bsp_stream_create(chunks * chunk_size, chunk_size, downdata); } ebsp_spmd(); From a3893b9f10c52b08b6d31253655548d6334d346b Mon Sep 17 00:00:00 2001 From: Tom Bannink Date: Sat, 15 Oct 2016 18:18:53 +0000 Subject: [PATCH 11/17] Fix streaming unit test for new API --- src/e_bsp.c | 10 ++++++++++ test/bsp_streams/e_bsp_streams.c | 2 +- test/bsp_streams/host_bsp_streams.c | 25 +++++++++++++++++++------ 3 files changed, 30 insertions(+), 7 deletions(-) diff --git a/src/e_bsp.c b/src/e_bsp.c index 03fd28f..f1a8aea 100644 --- a/src/e_bsp.c +++ b/src/e_bsp.c @@ -91,6 +91,16 @@ void EXT_MEM_TEXT bsp_begin() { _init_local_malloc(); + // Copy stream descriptors to local memory + // TODO: do this only when the stream is opened + // and send them back when closed so that streams + // can change owner + unsigned int nbytes = + combuf->n_streams[coredata.pid] * sizeof(ebsp_stream_descriptor); + coredata.local_streams = ebsp_malloc(nbytes); + ebsp_memcpy(coredata.local_streams, combuf->extmem_streams[coredata.pid], + nbytes); + // Send &syncstate to ARM if (coredata.pid == 0) combuf->syncstate_ptr = (int8_t*)&coredata.syncstate; diff --git a/test/bsp_streams/e_bsp_streams.c b/test/bsp_streams/e_bsp_streams.c index 45cb773..ae461d7 100644 --- a/test/bsp_streams/e_bsp_streams.c +++ b/test/bsp_streams/e_bsp_streams.c @@ -94,7 +94,7 @@ int main() { if (s == 0) ebsp_close_up_stream(5); // expect: ($00: BSP ERROR: stream does not exist) - + // New streaming API ebsp_stream s1, s2; int tokensize = bsp_stream_open(&s1, 2 * s + 0); diff --git a/test/bsp_streams/host_bsp_streams.c b/test/bsp_streams/host_bsp_streams.c index daa1bd9..3235b88 100644 --- a/test/bsp_streams/host_bsp_streams.c 
+++ b/test/bsp_streams/host_bsp_streams.c @@ -81,21 +81,34 @@ int main(int argc, char** argv) { for (int i = 0; i < chunk_size * chunks / sizeof(int); ++i) { printf("%i ", upstreamsDouble[5][i]); } + printf("\n"); // expect: (30 28 26 24 22 20 18 16 14 12 10 8 6 4 2 0 ) // results of new API - - for (int i = 0; i < chunk_size * chunks / sizeof(int); ++i) { - printf("%i ", streams1[5][i]); + int* ptr = streams1[5]; + for (int c = 0; c < chunks; c++) { + // Skip headers!!! + ptr++; + ptr++; + for (int i = 0; i < chunk_size / sizeof(int); ++i) { + printf("%i ", *ptr++); + } } printf("\n"); - // expect: (0 1 2 3 11 10 9 8 8 9 10 11 3 2 1 0 ) + // expect: (15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 ) // Check the data in the DOWN stream. It should have been used // as an upstream as well. - for (int i = 0; i < chunk_size * chunks / sizeof(int); ++i) { - printf("%i ", streams2[5][i]); + ptr = streams2[5]; + for (int c = 0; c < chunks; c++) { + // Skip headers!!! + ptr++; + ptr++; + for (int i = 0; i < chunk_size / sizeof(int); ++i) { + printf("%i ", *ptr++); + } } + printf("\n"); // expect: (30 28 26 24 22 20 18 16 14 12 10 8 6 4 2 0 ) // finalize From 9244d4afef16a5aff4c51b7f182c413bc56c8550 Mon Sep 17 00:00:00 2001 From: Tom Bannink Date: Sat, 15 Oct 2016 18:29:08 +0000 Subject: [PATCH 12/17] Add unit test for opening another core's stream --- test/bsp_streams/e_bsp_streams.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/test/bsp_streams/e_bsp_streams.c b/test/bsp_streams/e_bsp_streams.c index ae461d7..c302ad6 100644 --- a/test/bsp_streams/e_bsp_streams.c +++ b/test/bsp_streams/e_bsp_streams.c @@ -103,6 +103,27 @@ int main() { if (tokensize != tokensize2) ebsp_message("Invalid token size at bsp_stream_open"); + // Switch stream 0 (core 0) and stream 2 (core 1) + // Also test the in-use error message + + ebsp_barrier(); + if (s == 0) { + bsp_stream_close(&s1); + bsp_stream_open(&s1, 2); + // expect: ($00: BSP ERROR: stream with id 2 is in use) + } 
+ + // Close stream 2 on core 1, then open on core 0 + ebsp_barrier(); + if (s == 1) + bsp_stream_close(&s1); + ebsp_barrier(); + if (s == 0) + bsp_stream_open(&s1, 2); // NOW it should be succesful + if (s == 1) + bsp_stream_open(&s1, 0); // Core 1 can now open stream 0 + ebsp_barrier(); + // Double buffered upstream int* up1 = ebsp_malloc(tokensize); int* up2 = ebsp_malloc(tokensize); From 3e949c7a5f0c889de6509103daee83f417f0bfc7 Mon Sep 17 00:00:00 2001 From: Tom Bannink Date: Wed, 18 Jan 2017 16:33:12 +0100 Subject: [PATCH 13/17] Add extra error checks in streaming functions --- include/e_bsp.h | 9 +-- src/e_bsp_buffer.c | 82 +++++++++++++++++++++------- src/e_bsp_buffer_deprecated.c | 33 +++++++---- src/e_bsp_memory.c | 8 +++ src/extmem_malloc_implementation.cpp | 37 +++++++++++++ src/host_bsp_buffer.c | 2 +- 6 files changed, 136 insertions(+), 35 deletions(-) diff --git a/include/e_bsp.h b/include/e_bsp.h index d0df7b3..4095b40 100644 --- a/include/e_bsp.h +++ b/include/e_bsp.h @@ -474,10 +474,8 @@ void bsp_stream_close(ebsp_stream* stream); * * Note that if `bsp_stream_move_down` is used with `preload` enabled * (meaning the last call to that function had `preload` enabled), - * then the preloaded token will not be changed, so the first call to - * `bsp_stream_move_down` after this will still yield a token from the - * previous position. - * If `preload` was not enabled then the next call to `bsp_stream_move_down` + * then calling `ebsp_stream_seek` will discard any token that was + * preloaded in memory, so the first call to `ebsp_stream_move_down` after this * will yield a token from the new position. * * @remarks This function provides a mechanism through which chunks can be @@ -496,6 +494,9 @@ void bsp_stream_seek(ebsp_stream* stream, int delta_tokens); * preload the next token asynchroneously (double buffering). * @return Number of bytes of the obtained chunk. If stream has * finished or an error has occurred this function will return `0`. 
+ * + * When calling this function, the token that was obtained at the previous + * call will be overwritten. * * @remarks Behaviour is undefined if the stream was not opened using * `bsp_stream_open`. diff --git a/src/e_bsp_buffer.c b/src/e_bsp_buffer.c index 07cecbd..2c1d1ec 100644 --- a/src/e_bsp_buffer.c +++ b/src/e_bsp_buffer.c @@ -31,27 +31,44 @@ const char err_out_of_memory2[] EXT_MEM_RO = const char err_stream_in_use[] EXT_MEM_RO = "BSP ERROR: stream with id %d is in use"; +const char err_stream_full[] EXT_MEM_RO = + "BSP ERROR: Stream %d has %u space left, token of size %u can not be moved up."; + +const char err_up_size_warning[] EXT_MEM_RO = + "BSP WARNING: Moving token of size %d up to stream %d with max token size %d"; + +const char err_token_size[] EXT_MEM_RO = + "BSP ERROR: Stream contained token larger (%d) than maximum token size (%d) for stream. (truncated)"; + void _ebsp_read_chunk(ebsp_stream* stream, void* target) { - // read 2nd int in header from ext (next size) + // read header from ext + int prev_size = *(int*)(stream->cursor); int chunk_size = *(int*)(stream->cursor + sizeof(int)); - ebsp_dma_handle* desc = (ebsp_dma_handle*)&(stream->e_dma_desc); if (chunk_size != 0) // stream has not ended { - void* dst = target; - void* src = stream->cursor; - - // write to current - ebsp_dma_push(desc, dst, src, chunk_size + 2 * sizeof(int)); - // ebsp_dma_start(); + void* dst = target + 2 * sizeof(int); + void* src = stream->cursor + 2 * sizeof(int); // jump over header+chunk stream->cursor = (void*)(((unsigned)(stream->cursor)) + 2 * sizeof(int) + chunk_size); - } else { - // set next size to 0 - *((int*)(target + sizeof(int))) = 0; + + // If token is too large, truncate it. 
+ // However DO jump the correct distance with cursor + if (chunk_size > stream->max_chunksize) { + ebsp_message(err_token_size, chunk_size, stream->max_chunksize); + chunk_size = stream->max_chunksize; + } + + ebsp_dma_push(&stream->e_dma_desc, dst, src, chunk_size); } + + // copy it to local + // we do NOT do this with the DMA because of the + // possible trunction done above + *(int*)(target) = prev_size; + *(int*)(target + sizeof(int)) = chunk_size; } // When stream headers are interleaved, they are saved as: @@ -126,25 +143,36 @@ void bsp_stream_seek(ebsp_stream* stream, int delta_tokens) { // read 2nd int (next size) in header int chunk_size = *(int*)(stream->cursor + sizeof(int)); if (chunk_size == 0) - return; + break; stream->cursor += 2 * sizeof(int) + chunk_size; } } else { // backward if (delta_tokens == INT_MIN) { stream->cursor = stream->extmem_start; + } else { + while (delta_tokens++) { + // read 1st int (prev size) in header + int chunk_size = *(int*)(stream->cursor); + if (chunk_size == 0) + break; + stream->cursor -= 2 * sizeof(int) + chunk_size; + } } + } - while (delta_tokens++) { - // read 1st int (prev size) in header - int chunk_size = *(int*)(stream->cursor); - if (chunk_size == 0) - return; - stream->cursor -= 2 * sizeof(int) + chunk_size; - } + // If there was anything preloaded, discard it + if (stream->next_buffer != NULL) { + // Wait for a possible write to it + ebsp_dma_wait(&stream->e_dma_desc); + // Free it + ebsp_free(stream->next_buffer); + stream->next_buffer = NULL; } } int bsp_stream_move_down(ebsp_stream* stream, void** buffer, int preload) { + *buffer = NULL; + if (stream->current_buffer == NULL) { stream->current_buffer = ebsp_malloc(stream->max_chunksize + 2 * sizeof(int)); @@ -229,6 +257,22 @@ int bsp_stream_move_up(ebsp_stream* stream, const void* data, int data_size, // If this is not done, integer access to the headers will crash data_size = ((data_size + 8 - 1) / 8) * 8; + if (data_size > stream->max_chunksize) { + 
ebsp_message(err_up_size_warning, data_size, stream->id, + stream->max_chunksize); + } + + // Check if there is enough space in the stream, + // including terminating header + // Be carefull to use unsigned here, since these addresses + // are over the INT_MAX boundary. + unsigned space_required = (unsigned)data_size + 4 * sizeof(int); + unsigned space_left = (unsigned)stream->extmem_end - (unsigned)stream->cursor; + if (space_left < space_required) { + ebsp_message(err_stream_full, stream->id, space_left, space_required); + return 0; + } + // First write both the header before and after this token. int* header1 = (int*)(stream->cursor); int* header2 = (int*)(stream->cursor + 2 * sizeof(int) + data_size); diff --git a/src/e_bsp_buffer_deprecated.c b/src/e_bsp_buffer_deprecated.c index 8aa2776..ed0b89e 100644 --- a/src/e_bsp_buffer_deprecated.c +++ b/src/e_bsp_buffer_deprecated.c @@ -43,6 +43,9 @@ const char err_create_opened[] EXT_MEM_RO = const char err_out_of_memory[] EXT_MEM_RO = "BSP ERROR: could not allocate enough memory for stream"; +const char err_token_size2[] EXT_MEM_RO = + "BSP ERROR: Stream contained token larger (%d) than maximum token size (%d) for stream. 
(truncated)"; + void ebsp_set_up_chunk_size(unsigned stream_id, int nbytes) { ebsp_stream_descriptor* out_stream = &coredata.local_streams[stream_id]; @@ -190,26 +193,34 @@ int ebsp_move_chunk_up(void** address, unsigned stream_id, int prealloc) { } void _ebsp_write_chunk(ebsp_stream_descriptor* stream, void* target) { - // read 2nd int in header from ext (next size) + // read header from ext + int prev_size = *(int*)(stream->cursor); int chunk_size = *(int*)(stream->cursor + sizeof(int)); - ebsp_dma_handle* desc = (ebsp_dma_handle*)&(stream->e_dma_desc); if (chunk_size != 0) // stream has not ended { - void* dst = target; - void* src = stream->cursor; - - // write to current - ebsp_dma_push(desc, dst, src, chunk_size + 2 * sizeof(int)); - // ebsp_dma_start(); + void* dst = target + 2 * sizeof(int); + void* src = stream->cursor + 2 * sizeof(int); // jump over header+chunk stream->cursor = (void*)(((unsigned)(stream->cursor)) + 2 * sizeof(int) + chunk_size); - } else { - // set next size to 0 - *((int*)(target + sizeof(int))) = 0; + + // If token is too large, truncate it. 
+ // However DO jump the correct distance with cursor + if (chunk_size > stream->max_chunksize) { + ebsp_message(err_token_size2, chunk_size, stream->max_chunksize); + chunk_size = stream->max_chunksize; + } + + ebsp_dma_push(&stream->e_dma_desc, dst, src, chunk_size); } + + // copy it to local + // we do NOT do this with the DMA because of the + // possible trunction done above + *(int*)(target) = prev_size; + *(int*)(target + sizeof(int)) = chunk_size; } int ebsp_open_down_stream(void** address, unsigned stream_id) { diff --git a/src/e_bsp_memory.c b/src/e_bsp_memory.c index d6c16be..466c534 100644 --- a/src/e_bsp_memory.c +++ b/src/e_bsp_memory.c @@ -79,6 +79,14 @@ void EXT_MEM_TEXT ebsp_free(void* ptr) { } } +// For debug purposes +void EXT_MEM_TEXT print_malloc_info() { + uint32_t used, free; + _get_malloc_info(coredata.local_malloc_base, &used, &free); + ebsp_message("MALLOC STATE: %u Bytes used. %u Bytes free.", + (unsigned int)used, (unsigned int)free); +} + void ebsp_memcpy(void* dest, const void* source, size_t nbytes) { unsigned bits = (unsigned)dest | (unsigned)source; if ((bits & 0x7) == 0) { diff --git a/src/extmem_malloc_implementation.cpp b/src/extmem_malloc_implementation.cpp index eb27d42..80affe0 100644 --- a/src/extmem_malloc_implementation.cpp +++ b/src/extmem_malloc_implementation.cpp @@ -111,6 +111,11 @@ void* MALLOC_FUNCTION_PREFIX _malloc(void* base, uint32_t nbytes) { chunks_left -= 32; continue; } + } else if (mask == -1) { + // All 32 bits (chunks) are in use + // So start at least AFTER this one + start_mask = i + 1; + start_bit = 0; } else { // Mask is not empty. 
We will need to parse all individual bits for (uint32_t j = 0; j < 32; ++j) { @@ -192,3 +197,35 @@ void MALLOC_FUNCTION_PREFIX _init_malloc_state(void* base, uint32_t size) { while (total_bitmask_ints--) *ptr++ = 0; } + +// For debug purposes +void MALLOC_FUNCTION_PREFIX +_get_malloc_info(void* base, uint32_t* used, uint32_t* free) { + uint32_t total_bitmask_ints = get_bitmask_count(base); + uint32_t* bitmasks = get_bitmasks(base); + + uint32_t bits_in_use = 0; + uint32_t bits_free = 0; + for (uint32_t i = 0; i < total_bitmask_ints; ++i) { + uint32_t mask = bitmasks[i]; + if (mask == 0) { + bits_free += 32; + continue; + } else if (mask == -1) { + bits_in_use += 32; + continue; + } else { + // Mask is not empty. We will need to parse all individual bits + for (uint32_t j = 0; j < 32; ++j) { + if (mask & 1) { + bits_in_use++; + } else { + bits_free++; + } + mask >>= 1; + } + } + } + *used = bits_in_use * CHUNK_SIZE; + *free = bits_free * CHUNK_SIZE; +} diff --git a/src/host_bsp_buffer.c b/src/host_bsp_buffer.c index 705a9bd..9c50c5c 100644 --- a/src/host_bsp_buffer.c +++ b/src/host_bsp_buffer.c @@ -85,7 +85,7 @@ void* bsp_stream_create(int stream_size, int token_size, (*(int*)dst_cursor) = 0; // write terminating header (next) dst_cursor += sizeof(int); } else { - // Write a terminating header, or upstreams will crash + // Write a single terminating header, or upstreams will crash (*(int*)dst_cursor) = 0; // prevsize dst_cursor += sizeof(int); (*(int*)dst_cursor) = 0; // nextsize From 0b793b8cd493d27763feab2f6bd04b4ae73b450c Mon Sep 17 00:00:00 2001 From: Jan-Willem Buurlage Date: Wed, 18 Jan 2017 14:39:56 +0100 Subject: [PATCH 14/17] Update streaming documentation --- docs/api_reference.rst | 74 ++++++------------------- docs/conf.py | 2 +- docs/streaming.rst | 120 +++++++++++++++++------------------------ include/e_bsp.h | 2 +- 4 files changed, 68 insertions(+), 130 deletions(-) diff --git a/docs/api_reference.rst b/docs/api_reference.rst index 3419a80..8743b0b 
100644 --- a/docs/api_reference.rst +++ b/docs/api_reference.rst @@ -80,16 +80,10 @@ ebsp_hpmove .. doxygenfunction:: ebsp_hpmove :project: ebsp_host -ebsp_create_down_stream -^^^^^^^^^^^^^^^^^^^^^^^ +bsp_stream_create +^^^^^^^^^^^^^^^^^ -.. doxygenfunction:: ebsp_create_down_stream - :project: ebsp_host - -ebsp_create_up_stream -^^^^^^^^^^^^^^^^^^^^^ - -.. doxygenfunction:: ebsp_create_up_stream +.. doxygenfunction:: bsp_stream_create :project: ebsp_host ebsp_write @@ -251,68 +245,32 @@ bsp_hpmove .. doxygenfunction:: bsp_hpmove :project: ebsp_e -ebsp_send_up -^^^^^^^^^^^^ +bsp_stream_open +^^^^^^^^^^^^^^^ -.. doxygenfunction:: ebsp_send_up +.. doxygenfunction:: bsp_stream_open :project: ebsp_e -ebsp_move_chunk_down -^^^^^^^^^^^^^^^^^^^^ +bsp_stream_close +^^^^^^^^^^^^^^^^ -.. doxygenfunction:: ebsp_move_chunk_down +.. doxygenfunction:: bsp_stream_close :project: ebsp_e -ebsp_move_chunk_up +bsp_stream_move_up ^^^^^^^^^^^^^^^^^^ -.. doxygenfunction:: ebsp_move_chunk_up +.. doxygenfunction:: bsp_stream_move_up :project: ebsp_e -ebsp_move_down_cursor -^^^^^^^^^^^^^^^^^^^^^ - -.. doxygenfunction:: ebsp_move_down_cursor - :project: ebsp_e - -ebsp_reset_down_cursor -^^^^^^^^^^^^^^^^^^^^^^ - -.. doxygenfunction:: ebsp_reset_down_cursor - :project: ebsp_e - -ebsp_open_up_stream -^^^^^^^^^^^^^^^^^^^ - -.. doxygenfunction:: ebsp_open_up_stream - :project: ebsp_e - -ebsp_open_down_stream -^^^^^^^^^^^^^^^^^^^^^ - -.. doxygenfunction:: ebsp_open_down_stream - :project: ebsp_e - -ebsp_close_up_stream +bsp_stream_move_down ^^^^^^^^^^^^^^^^^^^^ -.. doxygenfunction:: ebsp_close_up_stream - :project: ebsp_e - -ebsp_close_down_stream -^^^^^^^^^^^^^^^^^^^^^^ - -.. doxygenfunction:: ebsp_close_down_stream - :project: ebsp_e - -ebsp_set_up_chunk_size -^^^^^^^^^^^^^^^^^^^^^^ - -.. doxygenfunction:: ebsp_set_up_chunk_size +.. doxygenfunction:: bsp_stream_move_down :project: ebsp_e -bsp_abort -^^^^^^^^^ +bsp_stream_seek +^^^^^^^^^^^^^^^ -.. doxygenfunction:: bsp_abort +.. 
doxygenfunction:: bsp_stream_seek :project: ebsp_e diff --git a/docs/conf.py b/docs/conf.py index e2207b7..b5a142e 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -70,7 +70,7 @@ # General information about the project. project = 'Epiphany BSP' -copyright = '2015, Coduin' +copyright = '2015-2017, Coduin' author = 'Coduin' # The version info for the project you're documenting, acts as replacement for diff --git a/docs/streaming.rst b/docs/streaming.rst index 572d8a8..732f181 100644 --- a/docs/streaming.rst +++ b/docs/streaming.rst @@ -9,88 +9,83 @@ Streaming When dealing with problems that involve a lot of data such as images or large matrices, it is often the case that the data for the problem does not fit on the combined local memory of the Epiphany processor. In order to work with the data we must then use the larger (but much slower) external memory, which slows the programs down tremendously. -For these situations we provide a *streaming* mechanism. When writing your program to use streams, it will work on smaller chunks of the problem at any given time -- such that the data currently being treated is always local to the core. The EBSP library prepares the next chunk to work on while the previous chunk is being processed such that there is minimal downtime because the Epiphany cores are waiting for the slow external memory. +For these situations we provide a *streaming* mechanism. When writing your program to use streams, it will work on smaller tokens of the problem at any given time -- such that the data currently being treated is always local to the core. The EBSP library prepares the next token to work on while the previous token is being processed such that there is minimal downtime because the Epiphany cores are waiting for the slow external memory. Making and using down streams ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -There are two types of streams, *up* and *down* streams. 
A *down* stream contains data to be processed by an Epiphany core, while an *up* stream contains results from computations performed by the Epiphany core. Every stream (both up and down) has a *target processor*, *total size* and a *chunk size*. The target processor is simply the processor id of the core that should receive the content of the stream. The total size is the total number of bytes of the entire set of data. This set of data then gets partitioned into chunks consisting of the number of bytes set by the chunk size. This size need not be constant (i.e. it may vary over a single stream), but for our discussion here we will assume that it is constant. +A stream contains data to be processed by an Epiphany core, and can also be used to obtain results from computations performed by the Epiphany core. Every stream has a *total size* and a *token size*. The total size is the total number of bytes of the entire set of data. This set of data then gets partitioned into tokens consisting of the number of bytes set by the token size. This size need not be constant (i.e. it may vary over a single stream), but for our discussion here we will assume that it is constant. -A stream is created before the call to ``ebsp_spmd`` on the host processor. The host prepares the data to be processed by the Epiphany cores, and the EBSP library then performs the necessary work needed for each core to receives its chunk. Note that this data is copied efficiently to the external memory upon creation of the stream, so that the user data should be stored in the ordinary RAM, e.g. allocated by a call to ``malloc``. A stream is created as follows:: +A stream is created before the call to ``ebsp_spmd`` on the host processor. The host prepares the data to be processed by the Epiphany cores, and the EBSP library then performs the necessary work needed for each core to receive its token. 
Note that this data is copied efficiently to the external memory upon creation of the stream, so that the user data should be stored in the ordinary RAM, e.g. allocated by a call to ``malloc``. A stream is created as follows:: - // on the host + // (on the host) int count = 256; - int count_in_chunk = 32; + int count_in_token = 32; float* data = malloc(count * sizeof(float)); // ... fill data - for (int s = 0; s < bsp_nprocs(); s++) { - ebsp_create_down_stream(&data, s, count * sizeof(float), - count_in_chunk * sizeof(float)); - } - -This will create ``bsp_nprocs()`` identical streams containing user data, one for each core. These streams are chopped up in ``256/32 = 8`` chunks. If you want to use these streams in the kernel you need to *open* them and *move chunks* from a stream to the local memory. Every stream you create on the host gets is identified by the order in which they are created. For example, the stream we created above will obtain the id ``0`` on every core. A second stream (regardless of whether it is up or down) will be identified with ``1``, etc. *These identifiers are shared between up and down streams, but not between cores*. Opening a stream is done by using this identifier:: + bsp_stream_create(count * sizeof(float), count_in_token * sizeof(float), data); - // in the kernel - float* address = NULL; - ebsp_open_down_stream(&(void*)address, // a pointer to the address store - 0); // the stream identifier +This will create a stream containing user data. This stream is chopped up in ``256/32 = 8`` tokens. If you want to use this stream in the kernel of a core you need to *open* it and *move tokens* from a stream to the local memory. Every stream you create on the host is identified by the order in which they are created, starting from index ``0``. For example, the stream we created above will obtain the id ``0``. A second stream (regardless of whether it is up or down) will be identified with ``1``, etc. 
*These identifiers are shared between cores*. Opening a stream is done by using this identifier, for example, to open a stream with identifier ``3``:: -After this call, address will contain the location in the local memory of the first chunk, but the data is not necessarily there yet (it might still be copying). To ensure that the data has been received we *move* a chunk:: + bsp_stream mystream; + if(bsp_stream_open(&mystream, 3)) { + // ... + } - int double_buffer = 1; - ebsp_move_chunk_down(&(void*)address, 0, double_buffer); +After this call, the stream will start copying data to the core, but the data is not necessarily there yet (it might still be copying). To access this data we *move* a token:: -The first two arguments are identical to those of ``ebsp_open_down_stream``. The ``double_buffer`` argument gives you the option to start writing the next chunk to local memory (using the DMA engine), while you process the current chunk that just moved down. This can be done simultaneously to your computations, but will take up twice as much memory. It depends on the specific situation whether double_buffered mode should be turned on or off. Subsequent blocks are obtained using repeated calls to ``ebsp_move_chunk_down``. + // Get some data + void* buffer = NULL; + bsp_stream_move_down(&mystream, &buffer, 0); + // The data is now in buffer -If you want to use a chunk multiple times at different stages of your algorithm, you need to be able to instruct EBSP to change which chunk you want to obtain. Internally the EBSP system has a *cursor* for each stream which points to the next chunk that should be obtained. You can modify this cursor using the following two functions:: +The first argument is the stream object that was filled using ``bsp_stream_open``. The second argument is a pointer to a pointer that will be set to the data location. 
The final ``double_buffer`` argument gives you the option to start writing the next token to local memory (using the DMA engine), while you process the current token that you just moved down. This can be done simultaneously to your computations, but will take up twice as much memory. It depends on the specific situation whether double buffered mode should be turned on or off. Subsequent blocks are obtained using repeated calls to ``bsp_stream_move_down``. - // reset the cursor of the first stream to its first chunk - ebsp_reset_down_cursor(0); +If you want to use a token multiple times at different stages of your algorithm, you need to be able to instruct EBSP to change which token you want to obtain. Internally the EBSP system has a *cursor* for each stream which points to the next token that should be obtained. You can modify this cursor using the following function:: - // move the cursor of the first stream forward by 5 chunks - ebsp_move_down_cursor(0, 5); + // move the cursor of the stream forward by 5 tokens + bsp_stream_seek(&mystream, 5); - // move the cursor of the first stream back by 3 chunks - ebsp_move_down_cursor(0, -3); + // move the cursor of the stream back by 3 tokens + bsp_stream_seek(&mystream, -3); -Note that this gives you random access inside your streams. Therefore our streaming approach should actually be called *pseudo-streaming*, because formally streaming algorithms only process chunks in a stream a constant number of times. However on the Epiphany we can provide random-access in our streams, leading to different semantics such as moving the cursor. +When you exceed the bounds of the stream, the cursor will be set to the final or first token respectively. Note that this gives you random access inside your streams. Therefore our streaming approach should actually be called *pseudo-streaming*, because formally streaming algorithms only process tokens in a stream a constant number of times. 
However on the Epiphany we can provide random-access in our streams, opening the door to different semantics such as moving the cursor. Moving results back up ^^^^^^^^^^^^^^^^^^^^^^ -Up streams work very similar to down streams, however no data has to be supplied by the host since it is generated by the Epiphany. We construct an up stream in the following way:: - - // on the host - // .. create up stream (see above) - void* upstream_data = malloc(sizeof(void*) * bsp_nprocs()); - for (int s = 0; s < bsp_nprocs(); s++) { - upstream_data[s] = ebsp_create_up_stream( - s, chunks * chunksize, chunks); - } +A stream can also be used to move results back up, for example:: -The array ``upstream_data`` holds pointers to the generated data by each processor. In the kernel you can *open* these streams similarly to down streams:: + int* buffer1 = ebsp_malloc(100 * sizeof(int)); + int* buffer2 = ebsp_malloc(100 * sizeof(int)); + int* curbuffer = buffer1; + int* otherbuffer = buffer2; - // in the kernel - float* up_address = NULL; - ebsp_open_up_stream(&(void*)up_address, // a pointer to the address store - 1); // the stream identifier + bsp_stream s; + bsp_stream_open(&s, 0); // open stream 0 + while (...) { + // Fill curbuffer + for (int i = 0; i < 100; i++) + curbuffer[i] = 5; -Note that this stream has the identifier ``1`` on each core. The up_address now points to a portion of *local memory* that you can fill with data from the kernel. To move a chunk of results up we use:: - - int double_buffer = 1; - ebsp_move_chunk_up(&(void*)up_address, 1, double_buffer); + // Send up + bsp_stream_move_up(&s, curbuffer, 100 * sizeof(int), 0); + // Use other buffer + swap(curbuffer, otherbuffer); + } + ebsp_free(buffer1); + ebsp_free(buffer2); -If we use a double buffer, then after this call ``up_address`` will point to a new portion of memory, such that you can continue your operations while the previous chunk is being copied up. 
Again, this uses more local memory, but does allow you to continue processing the next chunk. +Here, we have two buffers containing data. While filling one of the buffers with data, we move the other buffer up. We do this using the ``bsp_stream_move_up`` function which has as arguments respectively: the stream handle, the data to send up, the size of the data to send up, and a flag that indicates whether we want to *wait for completion*. In this case, we do not wait, but use two buffers to perform computations and to send data up to the host simultaneously. Closing streams ^^^^^^^^^^^^^^^ The EBSP stream system allocates buffers for you on the cores. When you are done with a stream you should tell the EBSP system by calling:: - ebsp_close_down_stream(0); - ebsp_close_up_stream(0); + bsp_stream_close(&mystream); -which will free the buffers for other use. +which will free the buffers for other use, and allow other cores to use the streams. Interface ------------------ @@ -98,38 +93,23 @@ Interface Host ^^^^ -.. doxygenfunction:: ebsp_create_down_stream - :project: ebsp_host - -.. doxygenfunction:: ebsp_create_up_stream +.. doxygenfunction:: bsp_stream_create :project: ebsp_host Epiphany ^^^^^^^^ -.. doxygenfunction:: ebsp_open_down_stream - :project: ebsp_e - -.. doxygenfunction:: ebsp_open_up_stream - :project: ebsp_e - -.. doxygenfunction:: ebsp_close_down_stream - :project: ebsp_e - -.. doxygenfunction:: ebsp_close_up_stream - :project: ebsp_e - -.. doxygenfunction:: ebsp_move_chunk_up +.. doxygenfunction:: bsp_stream_open :project: ebsp_e -.. doxygenfunction:: ebsp_move_chunk_down +.. doxygenfunction:: bsp_stream_close :project: ebsp_e -.. doxygenfunction:: ebsp_move_down_cursor +.. doxygenfunction:: bsp_stream_move_up :project: ebsp_e -.. doxygenfunction:: ebsp_reset_down_cursor +.. doxygenfunction:: bsp_stream_move_down :project: ebsp_e -.. doxygenfunction:: ebsp_set_up_chunk_size +.. 
doxygenfunction:: bsp_stream_seek :project: ebsp_e diff --git a/include/e_bsp.h b/include/e_bsp.h index 4095b40..506f2e1 100644 --- a/include/e_bsp.h +++ b/include/e_bsp.h @@ -438,7 +438,7 @@ void ebsp_send_up(const void* tag, const void* payload, int nbytes); * // Finally, close the stream * bsp_stream_close(&mystream);` * } - * \endcose + * \endcode * * @remarks This function has to be called *before* performing any other * operation on the stream. From 861be52a35732c763369925c944653ececbf09f4 Mon Sep 17 00:00:00 2001 From: Jan-Willem Buurlage Date: Wed, 18 Jan 2017 15:32:41 +0100 Subject: [PATCH 15/17] Update changelog --- CHANGELOG.md | 3 ++- README.md | 7 ++----- docs/conf.py | 2 +- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3274b94..d527cba 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,10 +1,11 @@ # Changelog -## 1.0.0-beta.3 +## 1.0.0 ### Added - BSP variable list is stored distributed over all cores instead of in external memory - Implement `bsp_pop_reg` +- New streaming API ### Fixed - `bsp_begin` no longer uses divide and modulus operator which take up large amounts of memory diff --git a/README.md b/README.md index 25dfd78..282c6a5 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ In particular this library has been implemented and tested on the [Parallella]( int main(int argc, char **argv) { - bsp_init("ecore_program.srec", argc, argv); + bsp_init("ecore_program.elf", argc, argv); bsp_begin(16); ebsp_spmd(); bsp_end(); @@ -92,7 +92,7 @@ HOST_LIB_NAMES = -lhost-bsp -le-hal -le-loader E_LIB_NAMES = -le-bsp -le-lib -all: bin bin/host_program bin/ecore_program.srec +all: bin bin/host_program bin/ecore_program.elf bin: @mkdir -p bin @@ -105,9 +105,6 @@ bin/ecore_program.elf: src/ecore_code.c @echo "CC $<" @e-gcc $(CFLAGS) -T ${ELDF} $(INCLUDES) -o $@ $< $(E_LIBS) $(E_LIB_NAMES) -bin/%.srec: bin/%.elf - @e-objcopy --srec-forceS3 --output-target srec $< $@ - clean: rm -r bin ``` diff --git 
a/docs/conf.py b/docs/conf.py index b5a142e..ea2c11c 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -80,7 +80,7 @@ # The short X.Y version. version = '1.0' # The full version, including alpha/beta/rc tags. -release = '1.0-beta' +release = '1.0' # google analytics ID googleanalytics_id = 'UA-59249373-1' From 478e728168ef0e7a1254da009d9e42b1e7d065e0 Mon Sep 17 00:00:00 2001 From: Jan-Willem Buurlage Date: Wed, 18 Jan 2017 15:45:45 +0100 Subject: [PATCH 16/17] Add note about opening streams on multiple cores --- docs/streaming.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/streaming.rst b/docs/streaming.rst index 732f181..ff717bb 100644 --- a/docs/streaming.rst +++ b/docs/streaming.rst @@ -32,7 +32,7 @@ This will create a stream containing user data. This stream is chopped up in ``2 // ... } -After this call, the stream will start copying data to the core, but the data is not necessarily there yet (it might still be copying). To access this data we *move* a token:: +After this call, the stream will start copying data to the core, but the data is not necessarily there yet (it might still be copying). A stream can only be opened by *a single core at a time*. To access this data we *move* a token:: // Get some data void* buffer = NULL; From 5b40a47e2287b3c2aab43466f3c7f7558f744646 Mon Sep 17 00:00:00 2001 From: Jan-Willem Buurlage Date: Wed, 18 Jan 2017 16:40:52 +0100 Subject: [PATCH 17/17] Add release date for 1.0.0 --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d527cba..517445f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,6 @@ # Changelog -## 1.0.0 +## 1.0.0 - 2017-01-18 ### Added - BSP variable list is stored distributed over all cores instead of in external memory