diff options
Diffstat (limited to 'MLP')
-rw-r--r-- | MLP/Makefile | 44 | ||||
-rw-r--r-- | MLP/baselines/cpu/Makefile | 4 | ||||
-rw-r--r-- | MLP/baselines/cpu/README | 9 | ||||
-rw-r--r-- | MLP/baselines/cpu/mlp_openmp.c | 161 | ||||
-rw-r--r-- | MLP/baselines/gpu/Makefile | 5 | ||||
-rw-r--r-- | MLP/baselines/gpu/README | 9 | ||||
-rw-r--r-- | MLP/baselines/gpu/mlp.cu | 208 | ||||
-rw-r--r-- | MLP/dpu/task.c | 171 | ||||
-rw-r--r-- | MLP/host/app.c | 343 | ||||
-rwxr-xr-x | MLP/support/common.h | 45 | ||||
-rw-r--r-- | MLP/support/params.h | 56 | ||||
-rwxr-xr-x | MLP/support/timer.h | 62 |
12 files changed, 1117 insertions, 0 deletions
diff --git a/MLP/Makefile b/MLP/Makefile new file mode 100644 index 0000000..944b3ca --- /dev/null +++ b/MLP/Makefile @@ -0,0 +1,44 @@ +DPU_DIR := dpu +HOST_DIR := host +BUILDDIR ?= bin +NR_TASKLETS ?= 16 +BL ?= 10 +NR_DPUS ?= 1 + +define conf_filename + ${BUILDDIR}/.NR_DPUS_$(1)_NR_TASKLETS_$(2)_BL_$(3).conf +endef +CONF := $(call conf_filename,${NR_DPUS},${NR_TASKLETS},${BL}) + +HOST_TARGET := ${BUILDDIR}/mlp_host +DPU_TARGET := ${BUILDDIR}/mlp_dpu + +COMMON_INCLUDES := support +HOST_SOURCES := $(wildcard ${HOST_DIR}/*.c) +DPU_SOURCES := $(wildcard ${DPU_DIR}/*.c) + +.PHONY: all clean test + +__dirs := $(shell mkdir -p ${BUILDDIR}) + +COMMON_FLAGS := -Wall -Wextra -g -I${COMMON_INCLUDES} +HOST_FLAGS := ${COMMON_FLAGS} -std=c11 -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DBL=${BL} +DPU_FLAGS := ${COMMON_FLAGS} -O2 -DNR_TASKLETS=${NR_TASKLETS} -DBL=${BL} + +all: ${HOST_TARGET} ${DPU_TARGET} + +${CONF}: + $(RM) $(call conf_filename,*,*) + touch ${CONF} + +${HOST_TARGET}: ${HOST_SOURCES} ${COMMON_INCLUDES} ${CONF} + $(CC) -o $@ ${HOST_SOURCES} ${HOST_FLAGS} + +${DPU_TARGET}: ${DPU_SOURCES} ${COMMON_INCLUDES} ${CONF} + dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ ${DPU_SOURCES} + +clean: + $(RM) -r $(BUILDDIR) + +test: all + ./${HOST_TARGET} -m 1024 -n 1024 diff --git a/MLP/baselines/cpu/Makefile b/MLP/baselines/cpu/Makefile new file mode 100644 index 0000000..581897e --- /dev/null +++ b/MLP/baselines/cpu/Makefile @@ -0,0 +1,4 @@ +all: + gcc mlp_openmp.c -o mlp_openmp -fopenmp -std=c99 +run: + ./mlp_openmp diff --git a/MLP/baselines/cpu/README b/MLP/baselines/cpu/README new file mode 100644 index 0000000..b928195 --- /dev/null +++ b/MLP/baselines/cpu/README @@ -0,0 +1,9 @@ +Multilayer Perceptron (MLP) + +Compilation instructions + + make + +Execution instructions + + ./mlp_openmp diff --git a/MLP/baselines/cpu/mlp_openmp.c b/MLP/baselines/cpu/mlp_openmp.c new file mode 100644 index 0000000..ef478c1 --- /dev/null +++ b/MLP/baselines/cpu/mlp_openmp.c @@ -0,0 +1,161 @@ +/** +* @file app.c +* @brief Template for a Host Application Source File. +* +*/ +#include <stdio.h> +#include <stdlib.h> +#include <stdbool.h> +#include <string.h> +#include <unistd.h> +#include <getopt.h> +#include <assert.h> +#include <stdint.h> +#include "../../support/timer.h" +#include "../../support/common.h" + +T** A; +T* B; +T* C; + +// Create input arrays +static void init_data(T** A, T* B, unsigned int m_size, unsigned int n_size){ + for (unsigned int l = 0; l < NUM_LAYERS; l++) + for (unsigned int i = 0; i < m_size * n_size; i++){ + if(i % 100 < 98){ + A[l][i] = 0; + }else{ + A[l][i] = (l+i) % 2; + } + } + for (unsigned int i = 0; i < n_size; i++){ + if(i % 50 < 48){ + B[i] = 0; + } + else{ + B[i] = i % 2; + } + } +} + +// Compute output in the host +static void mlp_host(T* C, T** A, T* B, unsigned int m_size, unsigned int n_size) { + for (unsigned int nl = 0; nl < NUM_LAYERS; nl++){ + for (unsigned int m = 0; m < m_size; m++){ + C[m] = 0; + } + #pragma omp parallel for + for (unsigned int m = 0; m < m_size; m++){ + for (unsigned int n = 0; n < n_size; n++){ + C[m] += A[nl][m * n_size + n] * B[n]; + } + C[m] = max(0, C[m]); + } + for (unsigned int n = 0; n < n_size; n++){ + B[n] = C[n]; + } + } +} + +static uint64_t mlp_host_sum(uint64_t n_size, uint64_t m_size) { + uint64_t sum = 0; + for (uint64_t m = 0; m < n_size; m++){ + sum += B[m]; + } + return sum; +} + +// Params --------------------------------------------------------------------- +typedef struct Params { + char* dpu_type; + int nr_of_ranks; + int input_size_n; + int input_size_m; + int n_warmup; + int n_reps; +}Params; + +void usage() { + fprintf(stderr, + "\nUsage: ./program [options]" + "\n" + "\nGeneral options:" + "\n -h help" + "\n -d <D> DPU type (default=fsim)" + "\n -r <R> # of ranks (default=2)" + "\n" + "\nBenchmark-specific options:" + "\n -i <I> input size (default=8M elements)" + "\n"); + } + + struct Params input_params(int argc, char **argv) { + struct Params p; + p.dpu_type = "fsim"; + p.nr_of_ranks = 1; + p.input_size_n = 1 << 9; + p.input_size_m = 1 << 9; + p.n_warmup = 2; + p.n_reps = 3; + + int opt; + while((opt = getopt(argc, argv, "hd:r:i:")) >= 0) { + switch(opt) { + case 'h': + usage(); + exit(0); + break; + case 'd': p.dpu_type = optarg; break; + case 'r': p.nr_of_ranks = atoi(optarg); break; + case 'n': p.input_size_n = atoi(optarg); break; + case 'm': p.input_size_m = atoi(optarg); break; + default: + fprintf(stderr, "\nUnrecognized option!\n"); + usage(); + exit(0); + } + } + assert(p.nr_of_ranks > 0 && "Invalid # of ranks!"); + + return p; + } + + /** + * @brief Main of the Host Application. + */ + int main(int argc, char **argv) { + + struct Params p = input_params(argc, argv); + uint64_t n_size = 8192; + uint64_t m_size = 20480; + + Timer timer; + A = malloc(NUM_LAYERS * sizeof(T*)); + for(int l = 0; l < NUM_LAYERS; l++) + A[l] = malloc(n_size*m_size*sizeof(unsigned int)); + B = malloc(m_size*sizeof(unsigned int)); + C = malloc(m_size*sizeof(unsigned int)); + + // Create an input file with arbitrary data. + init_data(A, B, m_size, n_size); + + start(&timer, 0, 1); + mlp_host(C, A, B, n_size, m_size); + stop(&timer, 0); + + uint32_t sum = mlp_host_sum(n_size, m_size); + + printf("Kernel "); + print(&timer, 0, 1); + printf("\n"); + + printf("SUM = %d \n", sum); + + for(int l = 0; l < NUM_LAYERS; l++) + free(A[l]); + free(A); + free(B); + free(C); + + return 0; +} diff --git a/MLP/baselines/gpu/Makefile b/MLP/baselines/gpu/Makefile new file mode 100644 index 0000000..69ee49c --- /dev/null +++ b/MLP/baselines/gpu/Makefile @@ -0,0 +1,5 @@ +all: + /usr/local/cuda/bin/nvcc mlp.cu -I/usr/local/cuda/include -lm -o mlp + +clean: + rm mlp diff --git a/MLP/baselines/gpu/README b/MLP/baselines/gpu/README new file mode 100644 index 0000000..253c8e3 --- /dev/null +++ b/MLP/baselines/gpu/README @@ -0,0 +1,9 @@ +Multilayer Perceptron (MLP) + +Compilation instructions + + make + +Execution instructions + + ./mlp diff --git a/MLP/baselines/gpu/mlp.cu b/MLP/baselines/gpu/mlp.cu new file mode 100644 index 0000000..c912d10 --- /dev/null +++ b/MLP/baselines/gpu/mlp.cu @@ -0,0 +1,208 @@ +#include <stdio.h> +#include <stdlib.h> +#include <sys/time.h> +#include <cuda.h> +#include "../../support/common.h" + +#define THREAD 128 + +__global__ void gemv(int m, int n, T *adim, T *b, T *d_ans); + +void cgemv(int m, int n, T *adim, T *b, T *d_ans); + +double gettime() +{ + struct timeval tv; + gettimeofday(&tv, NULL); + return tv.tv_sec + (double)tv.tv_usec*1.0e-6; +} + +int main(int argc, char **argv) +{ + /* for CPU */ + int i, j; + T **bdim; + T *c, *ans, *h_ans, *h_c; + int n = 8192; + int m = 20480; + + bdim = (T**) malloc(NUM_LAYERS * sizeof(T*)); + for(int l = 0; l < NUM_LAYERS; l++) + bdim[l] = (T*)malloc(sizeof(T)*m*n); + c = (T*)malloc(sizeof(T) *n); + h_c = (T*)malloc(sizeof(T) *n); + ans = (T*)malloc(sizeof(T) *m); + h_ans = (T*)malloc(sizeof(T) *m); + + /* for GPU */ + T *d_bdim; + T *d_c, *d_ans; + cudaMalloc((void **)&d_bdim, sizeof(T)*m*n); + cudaMalloc((void **)&d_c, sizeof(T)*n); + cudaMalloc((void **)&d_ans, sizeof(T)*m); + + for(i = 0; i < n; i++) + { + if(i % 50 < 48) + { + c[i] = 0; + h_c[i] = 0; + } + else + { + c[i] = i % 2; + h_c[i] = i % 2; + } + } + for(int l = 0; l < NUM_LAYERS; l++) + for(i = 0; i < n; i++) + { + for(j = 0; j < m; j++){ + if(j % 100 < 98) + { + + bdim[l][i*m+j] = 0; + } + else + { + + bdim[l][i*m+j] = (l + i) % 2; + } + } + } + + for(j = 0; j < m; j++){ + ans[j] = 0; + h_ans[j] = 0; + } + // Computation on the host for verification + T* vector = c; + T* output = ans; + T* matrix; + int mm = m; + int nn = n; + for(int l = 0; l < NUM_LAYERS; l++){ + matrix = bdim[l]; + cgemv(mm, nn, matrix, vector, output); + vector = output; + h_ans = output; + mm = n; nn = m; + } + + // Event creation + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + float time1 = 0; + float time2 = 0; + cudaMemcpy(d_ans, h_ans, sizeof(T)*m, cudaMemcpyHostToDevice); + cudaMemcpy(d_c, h_c, sizeof(T)*n, cudaMemcpyHostToDevice); + + vector = d_c; + output = d_ans; + mm = m; + nn = n; + for(int l = 0; l < NUM_LAYERS; l++){ + cudaMemcpy(d_bdim, bdim[l], sizeof(T)*m*n, cudaMemcpyHostToDevice); + matrix = d_bdim; + // Start timer + cudaEventRecord( start, 0 ); + gemv<<<mm, THREAD>>>(mm, nn, matrix, vector, output); + // End timer + cudaEventRecord( stop, 0 ); + cudaEventSynchronize( stop ); + cudaEventElapsedTime( &time2, start, stop ); + time1 += time2; + vector = output; + d_ans = output; + mm = n; nn = m; + } + + cudaMemcpy(h_ans, d_ans, sizeof(T)*m, cudaMemcpyDeviceToHost); + cudaMemcpy(h_c, d_c, sizeof(T)*n, cudaMemcpyDeviceToHost); + + for(i = 0; i < m; i++) + { + if(ans[i] != h_ans[i]) + printf("ERROR in Ans %d -> %d -- %d\n", i, ans[i], h_ans[i]); + } + + for(i = 0; i < n; i++) + { + if(c[i] != h_c[i]) + printf("ERROR in C %d -> %d -- %d\n", i, c[i], h_c[i]); + } + printf("Execution time = %f ms\n", time1); + + + for(int l = 0; l < NUM_LAYERS; l++) + free(bdim[l]); + + + free(bdim); + free(c); + free(ans); + free(h_c); + cudaFree(d_bdim); + cudaFree(d_c); + cudaFree(d_ans); + cudaEventDestroy(start); + cudaEventDestroy(stop); + + return 0; +} + +__global__ void gemv(int m, int n, T* adim, T* b, T* d_ans) +{ + int i; + int div = n/THREAD; + __shared__ T tmp[THREAD]; + + tmp[threadIdx.x] = 0.0; + + for(i = 0; i < div; i++){ + tmp[threadIdx.x] += adim[blockIdx.x*n+i*THREAD+threadIdx.x] * b[i * THREAD + threadIdx.x]; + } + if(threadIdx.x < m%THREAD) + tmp[threadIdx.x] += adim[blockIdx.x*n+THREAD*div+threadIdx.x] * b[THREAD * div + threadIdx.x]; + + __syncthreads(); + + for(i = THREAD / 2; i > 31; i = i / 2) + { + if(threadIdx.x < i) + tmp[threadIdx.x] += tmp[threadIdx.x + i]; + __syncthreads(); + } + + if(threadIdx.x < 16) + { + tmp[threadIdx.x] += tmp[threadIdx.x + 16]; + __syncthreads(); + tmp[threadIdx.x] += tmp[threadIdx.x + 8]; + __syncthreads(); + tmp[threadIdx.x] += tmp[threadIdx.x + 4]; + __syncthreads(); + tmp[threadIdx.x] += tmp[threadIdx.x + 2]; + __syncthreads(); + tmp[threadIdx.x] += tmp[threadIdx.x + 1]; + __syncthreads(); + } + + + if(threadIdx.x == 0) + d_ans[blockIdx.x] = max(0, tmp[0]); + +} + +void cgemv(int m, int n, T *adim, T *b, T *d_ans) +{ + int i, j; + + for(i = 0; i < m; i++){ + for(j = 0; j < n; j++) + d_ans[i] += adim[i*n+j] * b[j]; + d_ans[i] = max(0, d_ans[i]); + } + +} diff --git a/MLP/dpu/task.c b/MLP/dpu/task.c new file mode 100644 index 0000000..de3e554 --- /dev/null +++ b/MLP/dpu/task.c @@ -0,0 +1,171 @@ +/* + * Matrix vector multiplication with multiple tasklet + * + */ +#include <stdint.h> +#include <stdio.h> +#include <defs.h> +#include <mram.h> +#include <alloc.h> +#include <barrier.h> +#include <seqread.h> + +#include "../support/common.h" + +__host dpu_arguments_t DPU_INPUT_ARGUMENTS; + +// GEMV +static void gemv(T *bufferC, T *bufferA, T *bufferB, int pos) { + for (unsigned int i = 0; i < BLOCK_SIZE / sizeof(T); i++) { + bufferC[pos] += bufferA[i] * bufferB[i]; + } + return; +} + +// Barrier +BARRIER_INIT(my_barrier, NR_TASKLETS); + +// main +int main() { + unsigned int tasklet_id = me(); +#if PRINT + printf("tasklet_id = %u\n", tasklet_id); +#endif + if (tasklet_id == 0){ // Initialize once the cycle counter + mem_reset(); // Reset the heap + } + // Barrier + barrier_wait(&my_barrier); + + int32_t n_size = DPU_INPUT_ARGUMENTS.n_size; + int32_t n_size_pad = DPU_INPUT_ARGUMENTS.n_size_pad; + uint32_t nr_rows = DPU_INPUT_ARGUMENTS.nr_rows; + uint32_t max_rows = DPU_INPUT_ARGUMENTS.max_rows; + + + unsigned int nrows = nr_rows; + unsigned int rows_per_tasklet; + unsigned int start_row; + unsigned int chunks = nrows / (NR_TASKLETS + NR_TASKLETS); + unsigned int dbl_chunks = chunks + chunks; + rows_per_tasklet = dbl_chunks; + unsigned int rest_rows = nrows % (NR_TASKLETS + NR_TASKLETS); + + if ((tasklet_id + tasklet_id) < rest_rows) + rows_per_tasklet += 2; + if (rest_rows > 0) { + if ((tasklet_id + tasklet_id) >= rest_rows) { + unsigned int hlf_rest_rows = rest_rows >> 1; + if ((rest_rows & 1) == 1) + start_row = (hlf_rest_rows + 1) * (dbl_chunks + 2) + (tasklet_id - 1 - hlf_rest_rows) * dbl_chunks; + else + start_row = (hlf_rest_rows) * (dbl_chunks + 2) + (tasklet_id - hlf_rest_rows) * dbl_chunks; + } else + start_row = tasklet_id * (dbl_chunks + 2); + } else { + start_row = tasklet_id * (dbl_chunks); + } + + // Address of the current row in MRAM + uint32_t mram_base_addr_A = (uint32_t) (DPU_MRAM_HEAP_POINTER + start_row * n_size * sizeof(T)); + uint32_t mram_base_addr_B = (uint32_t) (DPU_MRAM_HEAP_POINTER + max_rows * n_size_pad * sizeof(T)); + uint32_t mram_base_addr_C = (uint32_t) (DPU_MRAM_HEAP_POINTER + max_rows * n_size_pad * sizeof(T) + n_size_pad * sizeof(T) + start_row * sizeof(T)); + uint32_t mram_temp_addr_A = mram_base_addr_A; + uint32_t mram_temp_addr_B = mram_base_addr_B; + + // Inititalize a local cache to store the MRAM block + T *cache_A = (T *) mem_alloc(BLOCK_SIZE + 8); + T *cache_A_aux = (T *) mem_alloc(8); + T *cache_B = (T *) mem_alloc(BLOCK_SIZE); + T *cache_C = (T *) mem_alloc(8); + + int offset = 0; + + // Iterate over nr_rows + for (unsigned int i = start_row; i < start_row + rows_per_tasklet; i += 2) { + + mram_temp_addr_A = (uint32_t) (DPU_MRAM_HEAP_POINTER + i * n_size * sizeof(T)); + mram_temp_addr_B = mram_base_addr_B; + + cache_C[0] = 0; + cache_C[1] = 0; + for(unsigned int pos = 0; pos < 2 && i + pos < nr_rows; pos++){ + int n = 0, j; + for (n = 0; n < (int32_t) (n_size - (BLOCK_SIZE/sizeof(T))); n += (BLOCK_SIZE / sizeof(T))) + { + + mram_read((__mram_ptr void const*) (mram_temp_addr_A), cache_A, BLOCK_SIZE); + mram_read((__mram_ptr void const*) (mram_temp_addr_B), cache_B, BLOCK_SIZE); + + if(offset) + { + + for(unsigned int off = 0; off < (BLOCK_SIZE / sizeof(T)) - 1; off++) + { + cache_A[off] = cache_A[off + 1]; + } + + mram_read((__mram_ptr void const*) (mram_temp_addr_A + BLOCK_SIZE), cache_A_aux, 8); + + cache_A[BLOCK_SIZE / sizeof(T) - 1] = cache_A_aux[0]; + } + + // Compute GEMV + gemv(cache_C, cache_A, cache_B, pos); + + // Update memory addresses + mram_temp_addr_A += BLOCK_SIZE; + mram_temp_addr_B += BLOCK_SIZE; + } + + mram_read((__mram_ptr void const*) (mram_temp_addr_A), cache_A, BLOCK_SIZE); + + + if(offset) + { + for(unsigned int off = 0; off < (BLOCK_SIZE / sizeof(T)) -1; off++) + { + + cache_A[off] = cache_A[off + 1]; + } + + mram_read((__mram_ptr void const*) (mram_temp_addr_A + BLOCK_SIZE ), cache_A_aux, 8); + + cache_A[BLOCK_SIZE / sizeof(T) - 1] = cache_A_aux[0]; + } + + + mram_read((__mram_ptr void const*) (mram_temp_addr_B), cache_B, BLOCK_SIZE); + + for (j = 0; j < (int) (n_size - n); j++) { + // Compute GEMV + if(j >= (int)(BLOCK_SIZE / sizeof(T))){ + printf("error\n"); + break; + } + cache_C[pos] += cache_A[j] * cache_B[j]; + } + + + mram_temp_addr_A += (BLOCK_SIZE - ((BLOCK_SIZE / sizeof(T)) - (n_size - n)) * sizeof(T)); + mram_temp_addr_B = mram_base_addr_B; + + if(mram_temp_addr_A % 8 != 0) + { + offset = 1; + } + else + { + offset = 0; + } + } + // Write cache to current MRAM block + mram_write(cache_C, (__mram_ptr void *) (mram_base_addr_C), 8); + + // Update memory address + mram_base_addr_C += 2 * sizeof(T); + + } + + return 0; +} diff --git a/MLP/host/app.c b/MLP/host/app.c new file mode 100644 index 0000000..952cb3f --- /dev/null +++ b/MLP/host/app.c @@ -0,0 +1,343 @@ +/** + * app.c + * MLP Host Application Source File + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <stdbool.h> +#include <string.h> +#include <dpu.h> +#include <dpu_log.h> +#include <unistd.h> +#include <getopt.h> +#include <assert.h> + +#if ENERGY +#include <dpu_probe.h> +#endif + +#include "../support/common.h" +#include "../support/timer.h" +#include "../support/params.h" + +// Define the DPU Binary path as DPU_BINARY here +#ifndef DPU_BINARY +#define DPU_BINARY "./bin/mlp_dpu" +#endif + +static T** A; +static T* B; +static T* B_host; +static T* B_tmp; +static T* C; +static T* C_dpu; + +// Create input arrays +static void init_data(T** A, T* B, T* B_host, unsigned int m_size, unsigned int n_size) { + for (unsigned int l = 0; l < NUM_LAYERS; l++) + for (unsigned int i = 0; i < m_size * n_size; i++){ + if(i % 100 < 98){ + A[l][i] = 0; + }else{ + A[l][i] = (l+i) % 2; + } + } + for (unsigned int i = 0; i < n_size; i++){ + if(i % 50 < 48){ + B[i] = 0; + } + else{ + B[i] = i % 2; + } + B_host[i] = B[i]; + } +} + +// Compute output in the host +static void mlp_host(T* C, T** A, T* B, unsigned int m_size, unsigned int n_size) { + + for (unsigned int nl = 0; nl < NUM_LAYERS; nl++){ + for (unsigned int m = 0; m < m_size; m++){ + C[m] = 0; + } + for (unsigned int m = 0; m < m_size; m++){ + for (unsigned int n = 0; n < n_size; n++){ + C[m] += A[nl][m * n_size + n] * B[n]; + } + C[m] = max(0, C[m]); + } + for (unsigned int n = 0; n < n_size; n++){ + B[n] = C[n]; + } + } +} + +// Main of the Host Application +int main(int argc, char **argv) { + + struct Params p = input_params(argc, argv); + + struct dpu_set_t dpu_set, dpu; + uint32_t nr_of_dpus; + + // Allocate DPUs and load binary + DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set)); + DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL)); + DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus)); + +#if ENERGY + struct dpu_probe_t probe; + DPU_ASSERT(dpu_probe_init("energy_probe", &probe)); +#endif + + unsigned int i, l; + unsigned int m_size = p.m_size; + unsigned int n_size = p.n_size; + + // Initialize help data + dpu_info = (struct dpu_info_t *) malloc(nr_of_dpus * sizeof(struct dpu_info_t)); + dpu_arguments_t *input_args = (dpu_arguments_t *) malloc(nr_of_dpus * sizeof(dpu_arguments_t)); + uint32_t max_rows_per_dpu = 0; + uint32_t n_size_pad = n_size; + if(n_size % 2 == 1){ + n_size_pad++; + } + + // Timer + Timer timer; + i = 0; + DPU_FOREACH(dpu_set, dpu, i) { + uint32_t rows_per_dpu; + uint32_t prev_rows_dpu = 0; + uint32_t chunks = m_size / nr_of_dpus; + rows_per_dpu = chunks; + uint32_t rest_rows = m_size % nr_of_dpus; + if (i < rest_rows) + rows_per_dpu++; + if (rest_rows > 0) { + if (i >= rest_rows) + prev_rows_dpu = rest_rows * (chunks + 1) + (i - rest_rows) * chunks; + else + prev_rows_dpu = i * (chunks + 1); + } else { + prev_rows_dpu = i * chunks; + } + + // Keep max rows for parallel transfers + uint32_t rows_per_dpu_pad = rows_per_dpu; + if (rows_per_dpu_pad % 2 == 1) // 4-byte elements + rows_per_dpu_pad++; + if (rows_per_dpu_pad > max_rows_per_dpu) + max_rows_per_dpu = rows_per_dpu_pad; + + dpu_info[i].rows_per_dpu = rows_per_dpu; + dpu_info[i].rows_per_dpu_pad = rows_per_dpu_pad; + dpu_info[i].prev_rows_dpu = prev_rows_dpu; + + // Copy input arguments to DPU + input_args[i].n_size = n_size; + input_args[i].n_size_pad = n_size_pad; + input_args[i].nr_rows = rows_per_dpu; + } + + A = (T**)malloc(NUM_LAYERS * sizeof(T*)); + for(l = 0; l < NUM_LAYERS; l++) + A[l] = (T*)malloc( max_rows_per_dpu * nr_of_dpus * n_size_pad * sizeof(T)); + + + B = (T*)malloc(n_size * sizeof(T)); + B_host = (T*)malloc(n_size * sizeof(T)); + C = (T*)malloc(m_size * sizeof(T)); + C_dpu = malloc(max_rows_per_dpu * nr_of_dpus * sizeof(T)); + B_tmp = malloc(max_rows_per_dpu * nr_of_dpus * sizeof(T)); + + init_data(A, B, B_host, m_size, n_size); + + // Compute output on CPU (performance comparison and verification purposes) + start(&timer, 0, 0); + mlp_host(C, A, B_host, m_size, n_size); + stop(&timer, 0); + + for (unsigned int rep = 0; rep < p.n_warmup + p.n_reps; rep++) { + if (rep >= p.n_warmup) + start(&timer, 1, rep - p.n_warmup); + // Input arguments + i = 0; + // Copy input arguments to DPU + DPU_FOREACH(dpu_set, dpu, i) { + input_args[i].max_rows = max_rows_per_dpu; + DPU_ASSERT(dpu_prepare_xfer(dpu, input_args + i)); + } + DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, "DPU_INPUT_ARGUMENTS", 0, sizeof(dpu_arguments_t), DPU_XFER_DEFAULT)); + + + // Copy input array and vector + i = 0; + DPU_FOREACH(dpu_set, dpu, i) { + DPU_ASSERT(dpu_prepare_xfer(dpu, A[0] + dpu_info[i].prev_rows_dpu * n_size)); + } + DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, 0, max_rows_per_dpu * n_size_pad * sizeof(T), DPU_XFER_DEFAULT)); + i = 0; + DPU_FOREACH(dpu_set, dpu, i) { + DPU_ASSERT(dpu_prepare_xfer(dpu, B)); + } + DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, max_rows_per_dpu * n_size_pad * sizeof(T) , n_size_pad * sizeof(T), DPU_XFER_DEFAULT)); + if (rep >= p.n_warmup) + stop(&timer, 1); + + // Run kernel on DPUs + if (rep >= p.n_warmup) + { + start(&timer, 2, rep - p.n_warmup); +#if ENERGY + DPU_ASSERT(dpu_probe_start(&probe)); +#endif + } + + DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS)); + + if (rep >= p.n_warmup) + { + stop(&timer, 2); +#if ENERGY + DPU_ASSERT(dpu_probe_stop(&probe)); +#endif + } + + for(int lay = 1; lay < NUM_LAYERS; lay++){ + if (rep >= p.n_warmup) + start(&timer, 4, rep - p.n_warmup); + i = 0; + + // Copy C_dpu + DPU_FOREACH(dpu_set, dpu, i) { + DPU_ASSERT(dpu_prepare_xfer(dpu, C_dpu + i * max_rows_per_dpu)); + } + DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_FROM_DPU, DPU_MRAM_HEAP_POINTER_NAME, max_rows_per_dpu * n_size_pad * sizeof(T) + n_size_pad * sizeof(T), max_rows_per_dpu * sizeof(T), DPU_XFER_DEFAULT)); + + // B = C + unsigned int n, j; + i = 0; + for (n = 0; n < nr_of_dpus; n++) { + for (j = 0; j < dpu_info[n].rows_per_dpu; j++) { + B_tmp[i] = C_dpu[n * max_rows_per_dpu + j]; + i++; + } + } + i = 0; + DPU_FOREACH(dpu_set, dpu, i) { + DPU_ASSERT(dpu_prepare_xfer(dpu, B_tmp)); + } + DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, max_rows_per_dpu * n_size_pad * sizeof(T) , n_size_pad * sizeof(T), DPU_XFER_DEFAULT)); + + // Copy next matrix of weights + i = 0; + DPU_FOREACH(dpu_set, dpu, i) { + DPU_ASSERT(dpu_prepare_xfer(dpu, A[lay] + dpu_info[i].prev_rows_dpu * n_size)); + } + DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, 0, max_rows_per_dpu * n_size_pad * sizeof(T), DPU_XFER_DEFAULT)); + + if(rep >= p.n_warmup) + stop(&timer, 4); + + if (rep >= p.n_warmup) + { + start(&timer, 2, rep - p.n_warmup); +#if ENERGY + DPU_ASSERT(dpu_probe_start(&probe)); +#endif + } + + DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS)); + + if (rep >= p.n_warmup) + { + stop(&timer, 2); +#if ENERGY + DPU_ASSERT(dpu_probe_stop(&probe)); +#endif + } + } + +#if PRINT + // Display DPU Logs + DPU_FOREACH(dpu_set, dpu) { + DPU_ASSERT(dpulog_read_for_dpu(dpu.dpu, stdout)); + } +#endif + + // Retrieve results + if (rep >= p.n_warmup) + start(&timer, 3, rep - p.n_warmup); + i = 0; + DPU_FOREACH(dpu_set, dpu, i) { + DPU_ASSERT(dpu_prepare_xfer(dpu, C_dpu + i * max_rows_per_dpu)); + } + DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_FROM_DPU, DPU_MRAM_HEAP_POINTER_NAME, max_rows_per_dpu * n_size_pad * sizeof(T) + n_size_pad * sizeof(T), max_rows_per_dpu * sizeof(T), DPU_XFER_DEFAULT)); + if(rep >= p.n_warmup) + stop(&timer, 3); + } + +#if ENERGY + double acc_energy, avg_energy, acc_time, avg_time; + DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_ACCUMULATE, &acc_energy)); + DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &avg_energy)); + DPU_ASSERT(dpu_probe_get(&probe, DPU_TIME, DPU_ACCUMULATE, &acc_time)); + DPU_ASSERT(dpu_probe_get(&probe, DPU_TIME, DPU_AVERAGE, &avg_time)); +#endif + + // Print timing results + printf("CPU Version Time (ms): "); + print(&timer, 0, 1); + printf("CPU-DPU Time (ms): "); + print(&timer, 1, p.n_reps); + printf("DPU Kernel Time (ms): "); + print(&timer, 2, p.n_reps); + printf("Inter-DPU Time (ms): "); + print(&timer, 4, p.n_reps); + printf("DPU-CPU Time (ms): "); + print(&timer, 3, p.n_reps); + +#if ENERGY + printf("Energy (J): %f J\t", avg_energy); +#endif + printf("\n\n"); + + // Check output + bool status = true; + unsigned int n, j; + i = 0; + for (n = 0; n < nr_of_dpus; n++) { + for (j = 0; j < dpu_info[n].rows_per_dpu; j++) { + if(C[i] != C_dpu[n * max_rows_per_dpu + j]) { + status = false; +#if PRINT + printf("%d: %d -- %d\n", i, C[i], C_dpu[n * max_rows_per_dpu + j]); +#endif + } + i++; + } + } + if (status) { + printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET "] Outputs are equal\n"); + } else { + printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET "] Outputs differ!\n"); + } + + // Deallocation + for(i = 0; i < NUM_LAYERS; i++) + free(A[i]); + free(A); + free(B); + free(C); + free(C_dpu); + DPU_ASSERT(dpu_free(dpu_set)); + +#if ENERGY + DPU_ASSERT(dpu_probe_deinit(&probe)); +#endif + + return status ? 0 : -1; +} diff --git a/MLP/support/common.h b/MLP/support/common.h new file mode 100755 index 0000000..53b2f1c --- /dev/null +++ b/MLP/support/common.h @@ -0,0 +1,45 @@ +#ifndef _COMMON_H_ +#define _COMMON_H_ + +// Structures used by both the host and the dpu to communicate information +typedef struct { + uint32_t n_size; + uint32_t n_size_pad; + uint32_t nr_rows; + uint32_t max_rows; +} dpu_arguments_t; + +// Specific information for each DPU +struct dpu_info_t { + uint32_t rows_per_dpu; + uint32_t rows_per_dpu_pad; + uint32_t prev_rows_dpu; +}; +struct dpu_info_t *dpu_info; + +#define NUM_LAYERS 3 +#define max(x, y) (x > y ? x : y) +#define min(x, y) (x < y ? x : y) + +// Transfer size between MRAM and WRAM +#ifdef BL +#define BLOCK_SIZE_LOG2 BL +#define BLOCK_SIZE (1 << BLOCK_SIZE_LOG2) +#else +#define BLOCK_SIZE_LOG2 8 +#define BLOCK_SIZE (1 << BLOCK_SIZE_LOG2) +#define BL BLOCK_SIZE_LOG2 +#endif + +// Data type +#define T int32_t + +#ifndef ENERGY +#define ENERGY 0 +#endif +#define PRINT 0 + +#define ANSI_COLOR_RED "\x1b[31m" +#define ANSI_COLOR_GREEN "\x1b[32m" +#define ANSI_COLOR_RESET "\x1b[0m" +#endif diff --git a/MLP/support/params.h b/MLP/support/params.h new file mode 100644 index 0000000..f9e790e --- /dev/null +++ b/MLP/support/params.h @@ -0,0 +1,56 @@ +#ifndef _PARAMS_H_ +#define _PARAMS_H_ + +#include "common.h" + +typedef struct Params { + unsigned int m_size; + unsigned int n_size; + unsigned int n_warmup; + unsigned int n_reps; +}Params; + +static void usage() { + fprintf(stderr, + "\nUsage: ./program [options]" + "\n" + "\nGeneral options:" + "\n -h help" + "\n -w <W> # of untimed warmup iterations (default=1)" + "\n -e <E> # of timed repetition iterations (default=3)" + "\n" + "\nBenchmark-specific options:" + "\n -m <I> m_size (default=2048 elements)" + "\n -n <I> n_size (default=2048 elements)" + "\n"); +} + +struct Params input_params(int argc, char **argv) { + struct Params p; + p.m_size = 163840; + p.n_size = 4096; + p.n_warmup = 1; + p.n_reps = 3; + + int opt; + while((opt = getopt(argc, argv, "hm:n:w:e:")) >= 0) { + switch(opt) { + case 'h': + usage(); + exit(0); + break; + case 'm': p.m_size = atoi(optarg); break; + case 'n': p.n_size = atoi(optarg); break; + case 'w': p.n_warmup = atoi(optarg); break; + case 'e': p.n_reps = atoi(optarg); break; + default: + fprintf(stderr, "\nUnrecognized option!\n"); + usage(); + exit(0); + } + } + assert(NR_DPUS > 0 && "Invalid # of dpus!"); + + return p; +} +#endif diff --git a/MLP/support/timer.h b/MLP/support/timer.h new file mode 100755 index 0000000..886380a --- /dev/null +++ b/MLP/support/timer.h @@ -0,0 +1,62 @@ +/*
+ * Copyright (c) 2016 University of Cordoba and University of Illinois
+ * All rights reserved.
+ *
+ * Developed by: IMPACT Research Group
+ * University of Cordoba and University of Illinois
+ * http://impact.crhc.illinois.edu/
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * with the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * > Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimers.
+ * > Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimers in the
+ * documentation and/or other materials provided with the distribution.
+ * > Neither the names of IMPACT Research Group, University of Cordoba,
+ * University of Illinois nor the names of its contributors may be used
+ * to endorse or promote products derived from this Software without
+ * specific prior written permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
+ * THE SOFTWARE.
+ *
+ */
+
+#include <sys/time.h>
+
+typedef struct Timer{
+
+ struct timeval startTime[5];
+ struct timeval stopTime[5];
+ double time[5];
+
+}Timer;
+
+void start(Timer *timer, int i, int rep) {
+ if(rep == 0) {
+ timer->time[i] = 0.0;
+ }
+ gettimeofday(&timer->startTime[i], NULL);
+}
+
+void stop(Timer *timer, int i) {
+ gettimeofday(&timer->stopTime[i], NULL);
+ timer->time[i] += (timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 +
+ (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec);
+ //printf("Time (ms): %f\t",((timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 +
+ // (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec)) / 1000);
+
+}
+
+void print(Timer *timer, int i, int REP) { printf("%f\t", timer->time[i] / (1000 * REP)); }
|