summaryrefslogtreecommitdiff
path: root/SpMV
diff options
context:
space:
mode:
Diffstat (limited to 'SpMV')
-rw-r--r--SpMV/Makefile50
-rw-r--r--SpMV/baselines/cpu/Makefile10
-rw-r--r--SpMV/baselines/cpu/app.c95
-rwxr-xr-xSpMV/baselines/cpu/run-perf.sh6
-rwxr-xr-xSpMV/benchmark-scripts/ccmcc25-sim.sh33
-rwxr-xr-xSpMV/benchmark-scripts/ccmcc25.sh40
-rw-r--r--SpMV/dpu/task.c278
-rw-r--r--SpMV/host/app.c537
-rw-r--r--SpMV/host/mram-management.h48
-rw-r--r--SpMV/include/common.h24
-rw-r--r--SpMV/include/dfatool_host.ah31
-rw-r--r--SpMV/include/matrix.h138
-rw-r--r--SpMV/include/params.h51
-rw-r--r--SpMV/include/timer.h54
-rw-r--r--SpMV/include/utils.h (renamed from SpMV/support/utils.h)1
-rwxr-xr-xSpMV/run-paper-strong-full.sh29
-rwxr-xr-xSpMV/run-paper-strong-rank.sh23
-rwxr-xr-xSpMV/run-paper-weak.sh27
-rw-r--r--SpMV/support/common.h25
-rw-r--r--SpMV/support/matrix.h119
-rw-r--r--SpMV/support/params.h46
-rw-r--r--SpMV/support/timer.h27
22 files changed, 946 insertions, 746 deletions
diff --git a/SpMV/Makefile b/SpMV/Makefile
index 0e7a70c..c2d9d50 100644
--- a/SpMV/Makefile
+++ b/SpMV/Makefile
@@ -1,21 +1,31 @@
NR_TASKLETS ?= 16
NR_DPUS ?= 1
-COMMON_INCLUDES := support
-HOST_SOURCES := $(wildcard host/*.c)
-DPU_SOURCES := $(wildcard dpu/*.c)
-CPU_BASE_SOURCES := $(wildcard baselines/cpu/*.c)
-GPU_BASE_SOURCES := $(wildcard baselines/gpu/*.cu)
-
-COMMON_FLAGS := -Wall -Wextra -g -I${COMMON_INCLUDES}
-HOST_FLAGS := ${COMMON_FLAGS} -std=c11 -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS}
+aspectc ?= 0
+aspectc_timing ?= 0
+dfatool_timing ?= 1
+
+HOST_CC := ${CC}
+
+COMMON_FLAGS := -Wall -Wextra -g -Iinclude
+HOST_FLAGS := ${COMMON_FLAGS} -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DASPECTC=${aspectc} -DDFATOOL_TIMING=${dfatool_timing}
DPU_FLAGS := ${COMMON_FLAGS} -O2 -DNR_TASKLETS=${NR_TASKLETS}
-CPU_BASE_FLAGS := -O3 -fopenmp
-GPU_BASE_FLAGS := -O3
+
+ifeq (${aspectc_timing}, 1)
+ ASPECTC_HOST_FLAGS += -ainclude/dfatool_host_dpu.ah -ainclude/dfatool_host.ah
+endif
+
+ASPECTC_HOST_FLAGS ?= -a0
+
+ifeq (${aspectc}, 1)
+ HOST_CC = ag++ -r repo.acp -v 0 ${ASPECTC_HOST_FLAGS} --c_compiler ${UPMEM_HOME}/bin/clang++ -p . --Xcompiler
+else
+ HOST_FLAGS += -std=c11
+endif
QUIET = @
-ifdef verbose
+ifeq (${verbose}, 1)
QUIET =
endif
@@ -24,19 +34,13 @@ all: bin/host_code bin/dpu_code
bin:
${QUIET}mkdir -p bin
-gpu: bin/gpu_baseline
-
-bin/host_code: ${HOST_SOURCES} ${COMMON_INCLUDES} bin
- ${QUIET}${CC} -o $@ ${HOST_SOURCES} ${HOST_FLAGS}
-
-bin/dpu_code: ${DPU_SOURCES} ${COMMON_INCLUDES} bin
- ${QUIET}dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ ${DPU_SOURCES}
-
-bin/cpu_baseline: ${CPU_BASE_SOURCES}
- ${QUIET}${CC} -o $@ ${CPU_BASE_SOURCES} ${CPU_BASE_FLAGS}
+bin/host_code: host/app.c include bin
+ ${QUIET}cp ../include/dfatool_host_dpu.ah include
+ ${QUIET}${HOST_CC} -o $@ host/app.c ${HOST_FLAGS}
+ ${QUIET}rm -f include/dfatool_host_dpu.ah
-bin/gpu_baseline: ${GPU_BASE_SOURCES}
- ${QUIET}nvcc -o $@ ${GPU_BASE_SOURCES} ${GPU_BASE_FLAGS}
+bin/dpu_code: dpu/task.c include bin
+ ${QUIET}dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ dpu/task.c
clean:
${QUIET}rm -rf bin
diff --git a/SpMV/baselines/cpu/Makefile b/SpMV/baselines/cpu/Makefile
index 5b2367b..a24b764 100644
--- a/SpMV/baselines/cpu/Makefile
+++ b/SpMV/baselines/cpu/Makefile
@@ -1,7 +1,15 @@
+native ?= 1
+
+CFLAGS =
+
+ifeq (${native}, 1)
+ CFLAGS += -march=native
+endif
+
all: spmv
spmv: app.c
- gcc -Wall -Wextra -pedantic -march=native -O2 -o spmv -fopenmp app.c
+ gcc -Wall -Wextra -pedantic ${CFLAGS} -O3 -o spmv -fopenmp app.c
spmv_O0: app.c
gcc -o spmv_O0 -fopenmp app.c
diff --git a/SpMV/baselines/cpu/app.c b/SpMV/baselines/cpu/app.c
index 8d360ee..e33761f 100644
--- a/SpMV/baselines/cpu/app.c
+++ b/SpMV/baselines/cpu/app.c
@@ -13,60 +13,63 @@
#include "../../support/timer.h"
#include "../../support/utils.h"
-int main(int argc, char** argv) {
+int main(int argc, char **argv)
+{
- // Process parameters
- struct Params p = input_params(argc, argv);
+ // Process parameters
+ struct Params p = input_params(argc, argv);
- // Initialize SpMV data structures
- PRINT_INFO(p.verbosity >= 1, "Reading matrix %s", p.fileName);
- struct COOMatrix cooMatrix = readCOOMatrix(p.fileName);
- PRINT_INFO(p.verbosity >= 1, " %u rows, %u columns, %u nonzeros", cooMatrix.numRows, cooMatrix.numCols, cooMatrix.numNonzeros);
- struct CSRMatrix csrMatrix = coo2csr(cooMatrix);
- float* inVector = malloc(csrMatrix.numCols*sizeof(float));
- float* outVector = malloc(csrMatrix.numRows*sizeof(float));
- initVector(inVector, csrMatrix.numCols);
+ // Initialize SpMV data structures
+ PRINT_INFO(p.verbosity >= 1, "Reading matrix %s", p.fileName);
+ struct COOMatrix cooMatrix = readCOOMatrix(p.fileName);
+ PRINT_INFO(p.verbosity >= 1, " %u rows, %u columns, %u nonzeros",
+ cooMatrix.numRows, cooMatrix.numCols, cooMatrix.numNonzeros);
+ struct CSRMatrix csrMatrix = coo2csr(cooMatrix);
+ float *inVector = malloc(csrMatrix.numCols * sizeof(float));
+ float *outVector = malloc(csrMatrix.numRows * sizeof(float));
+ initVector(inVector, csrMatrix.numCols);
- // Calculating result on CPU
- PRINT_INFO(p.verbosity >= 1, "Calculating result on CPU");
- //omp_set_num_threads(4);
- Timer timer;
- startTimer(&timer);
- #pragma omp parallel for
- for(uint32_t rowIdx = 0; rowIdx < csrMatrix.numRows; ++rowIdx) {
- float sum = 0.0f;
- for(uint32_t i = csrMatrix.rowPtrs[rowIdx]; i < csrMatrix.rowPtrs[rowIdx + 1]; ++i) {
- uint32_t colIdx = csrMatrix.nonzeros[i].col;
- float value = csrMatrix.nonzeros[i].value;
- sum += inVector[colIdx]*value;
- }
- outVector[rowIdx] = sum;
- }
- stopTimer(&timer);
+ // Calculating result on CPU
+ PRINT_INFO(p.verbosity >= 1, "Calculating result on CPU");
+ //omp_set_num_threads(4);
+ Timer timer;
+ startTimer(&timer);
+#pragma omp parallel for
+ for (uint32_t rowIdx = 0; rowIdx < csrMatrix.numRows; ++rowIdx) {
+ float sum = 0.0f;
+ for (uint32_t i = csrMatrix.rowPtrs[rowIdx];
+ i < csrMatrix.rowPtrs[rowIdx + 1]; ++i) {
+ uint32_t colIdx = csrMatrix.nonzeros[i].col;
+ float value = csrMatrix.nonzeros[i].value;
+ sum += inVector[colIdx] * value;
+ }
+ outVector[rowIdx] = sum;
+ }
+ stopTimer(&timer);
-
- unsigned int nr_threads = 0;
+ unsigned int nr_threads = 0;
#pragma omp parallel
#pragma omp atomic
- nr_threads++;
-
+ nr_threads++;
- // coomatrix / csrmatrix use uint32_t indexes and float values
- printf("[::] SpMV CPU | n_threads=%u e_type=float n_elements=%u |"
- " throughput_MBps=%f throughput_MOpps=%f timer0_us=%f\n",
- nr_threads, csrMatrix.numNonzeros,
- csrMatrix.numNonzeros * sizeof(float) / (getElapsedTime(timer)*1e6),
- csrMatrix.numNonzeros / (getElapsedTime(timer)*1e6),
- getElapsedTime(timer)*1e6);
- //if(p.verbosity == 0) PRINT("%f", getElapsedTime(timer)*1e3);
- PRINT_INFO(p.verbosity >= 1, " Elapsed time: %f ms", getElapsedTime(timer)*1e3);
+ // coomatrix / csrmatrix use uint32_t indexes and float values
+ printf("[::] SpMV CPU | n_threads=%u e_type=float n_elements=%u |"
+ " throughput_MBps=%f throughput_MOpps=%f timer0_us=%f\n",
+ nr_threads, csrMatrix.numNonzeros,
+ csrMatrix.numNonzeros * sizeof(float) / (getElapsedTime(timer) *
+ 1e6),
+ csrMatrix.numNonzeros / (getElapsedTime(timer) * 1e6),
+ getElapsedTime(timer) * 1e6);
+ //if(p.verbosity == 0) PRINT("%f", getElapsedTime(timer)*1e3);
+ PRINT_INFO(p.verbosity >= 1, " Elapsed time: %f ms",
+ getElapsedTime(timer) * 1e3);
- // Deallocate data structures
- freeCOOMatrix(cooMatrix);
- freeCSRMatrix(csrMatrix);
- free(inVector);
- free(outVector);
+ // Deallocate data structures
+ freeCOOMatrix(cooMatrix);
+ freeCSRMatrix(csrMatrix);
+ free(inVector);
+ free(outVector);
- return 0;
+ return 0;
}
diff --git a/SpMV/baselines/cpu/run-perf.sh b/SpMV/baselines/cpu/run-perf.sh
new file mode 100755
index 0000000..714498d
--- /dev/null
+++ b/SpMV/baselines/cpu/run-perf.sh
@@ -0,0 +1,6 @@
+#!/bin/zsh
+
+make -B
+
+OMP_NUM_THREADS=1 perf stat record -o t1.perf -e ${(j:,:):-$(grep -v '^#' ../../../perf-events.txt | cut -d ' ' -f 1)} make run
+OMP_NUM_THREADS=4 perf stat record -o t4.perf -e ${(j:,:):-$(grep -v '^#' ../../../perf-events.txt | cut -d ' ' -f 1)} make run
diff --git a/SpMV/benchmark-scripts/ccmcc25-sim.sh b/SpMV/benchmark-scripts/ccmcc25-sim.sh
new file mode 100755
index 0000000..9d1af4e
--- /dev/null
+++ b/SpMV/benchmark-scripts/ccmcc25-sim.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+mkdir -p log/$(hostname)
+
+run_benchmark_nmc() {
+ local "$@"
+ set -e
+ make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} \
+ aspectc=1 aspectc_timing=1 dfatool_timing=0
+ bin/host_code -v 0 -f data/${data} 2>&1
+}
+
+export -f run_benchmark_nmc
+
+cd data/generate
+for i in 4 8 16; do
+ ./replicate ../bcsstk30.mtx $i ../bcsstk30.${i}.mtx
+done
+cd ../..
+
+fn=log/$(hostname)/ccmcc25-sdk${sdk}-sim
+
+source ~/lib/local/upmem/upmem-2025.1.0-Linux-x86_64/upmem_env.sh simulator
+
+echo "prim-benchmarks SpMV $(git describe --all --long) $(git rev-parse HEAD) $(date -R)" >> ${fn}.txt
+
+parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \
+ run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 data={data} \
+ ::: data bcsstk30.mtx bcsstk30.4.mtx bcsstk30.8.mtx bcsstk30.16.mtx \
+ ::: nr_dpus 1 2 4 8 16 32 48 64 \
+>> ${fn}.txt
+
+rm -f data/bcsstk30.*.mtx
diff --git a/SpMV/benchmark-scripts/ccmcc25.sh b/SpMV/benchmark-scripts/ccmcc25.sh
new file mode 100755
index 0000000..176ea99
--- /dev/null
+++ b/SpMV/benchmark-scripts/ccmcc25.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+
+mkdir -p log/$(hostname)
+
+run_benchmark_nmc() {
+ local "$@"
+ set -e
+ sudo limit_ranks_to_numa_node ${numa_rank}
+ make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} \
+ aspectc=1 aspectc_timing=1 dfatool_timing=0
+ bin/host_code -v 0 -f data/${data} 2>&1
+}
+
+export -f run_benchmark_nmc
+
+cd data/generate
+for i in 8 32 64; do
+ ./replicate ../bcsstk30.mtx $i ../bcsstk30.${i}.mtx
+done
+cd ../..
+
+for sdk in 2023.2.0 2024.1.0 2024.2.0 2025.1.0; do
+
+ fn=log/$(hostname)/ccmcc25-sdk${sdk}
+
+ source /opt/upmem/upmem-${sdk}-Linux-x86_64/upmem_env.sh
+
+ echo "prim-benchmarks SpMV $(git describe --all --long) $(git rev-parse HEAD) $(date -R)" >> ${fn}.txt
+
+ parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \
+ run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 data={data} numa_rank={numa_rank} \
+ ::: i $(seq 0 10) \
+ ::: data bcsstk30.mtx bcsstk30.8.mtx bcsstk30.32.mtx bcsstk30.64.mtx \
+ ::: numa_rank any \
+ ::: nr_dpus 64 128 256 512 768 1024 1536 2048 2304 \
+ >> ${fn}.txt
+
+done
+
+rm -f data/bcsstk30.*.mtx
diff --git a/SpMV/dpu/task.c b/SpMV/dpu/task.c
index 589b6f4..305a645 100644
--- a/SpMV/dpu/task.c
+++ b/SpMV/dpu/task.c
@@ -11,7 +11,7 @@
#include <perfcounter.h>
#include <seqread.h>
-#include "../support/common.h"
+#include "common.h"
#define PRINT_ERROR(fmt, ...) printf("\033[0;31mERROR:\033[0m "fmt"\n", ##__VA_ARGS__)
@@ -20,120 +20,164 @@
BARRIER_INIT(my_barrier, NR_TASKLETS);
// main
-int main() {
-
- if(me() == 0) {
- mem_reset(); // Reset the heap
- }
- // Barrier
- barrier_wait(&my_barrier);
-
- // Load parameters
- uint32_t params_m = (uint32_t) DPU_MRAM_HEAP_POINTER;
- struct DPUParams* params_w = (struct DPUParams*) mem_alloc(ROUND_UP_TO_MULTIPLE_OF_8(sizeof(struct DPUParams)));
- mram_read((__mram_ptr void const*)params_m, params_w, ROUND_UP_TO_MULTIPLE_OF_8(sizeof(struct DPUParams)));
- uint32_t numRows = params_w->dpuNumRows;
-
- // Sanity check
- if(me() == 0) {
- if(numRows%2 != 0) {
- // The number of rows assigned to the DPU must be a multiple of two to ensure that writes to the output vector are aligned to 8 bytes
- PRINT_ERROR("The number of rows is not a multiple of two!");
- }
- }
-
- // Identify tasklet's rows
- uint32_t numRowsPerTasklet = ROUND_UP_TO_MULTIPLE_OF_2((numRows - 1)/NR_TASKLETS + 1); // Multiple of two to ensure that access to rowPtrs and outVector is 8-byte aligned
- uint32_t taskletRowsStart = me()*numRowsPerTasklet;
- uint32_t taskletNumRows;
- if(taskletRowsStart > numRows) {
- taskletNumRows = 0;
- } else if(taskletRowsStart + numRowsPerTasklet > numRows) {
- taskletNumRows = numRows - taskletRowsStart;
- } else {
- taskletNumRows = numRowsPerTasklet;
- }
-
- // Only process tasklets with nonzero number of rows
- if(taskletNumRows > 0) {
-
- // Extract parameters
- uint32_t rowPtrsOffset = params_w->dpuRowPtrsOffset;
- uint32_t rowPtrs_m = ((uint32_t)DPU_MRAM_HEAP_POINTER) + params_w->dpuRowPtrs_m;
- uint32_t nonzeros_m = ((uint32_t)DPU_MRAM_HEAP_POINTER) + params_w->dpuNonzeros_m;
- uint32_t inVector_m = ((uint32_t)DPU_MRAM_HEAP_POINTER) + params_w->dpuInVector_m;
- uint32_t outVector_m = ((uint32_t)DPU_MRAM_HEAP_POINTER) + params_w->dpuOutVector_m;
-
- // Initialize row pointer sequential reader
- uint32_t taskletRowPtrs_m = rowPtrs_m + taskletRowsStart*sizeof(uint32_t);
- seqreader_t rowPtrReader;
- uint32_t* taskletRowPtrs_w = seqread_init(seqread_alloc(), (__mram_ptr void*)taskletRowPtrs_m, &rowPtrReader);
- uint32_t firstRowPtr = *taskletRowPtrs_w;
-
- // Initialize nonzeros sequential reader
- uint32_t taskletNonzerosStart = firstRowPtr - rowPtrsOffset;
- uint32_t taskletNonzeros_m = nonzeros_m + taskletNonzerosStart*sizeof(struct Nonzero); // 8-byte aligned because Nonzero is 8 bytes
- seqreader_t nonzerosReader;
- struct Nonzero* taskletNonzeros_w = seqread_init(seqread_alloc(), (__mram_ptr void*)taskletNonzeros_m, &nonzerosReader);
-
- // Initialize input vector cache
- uint32_t inVectorTileSize = 64;
- float* inVectorTile_w = mem_alloc(inVectorTileSize*sizeof(float));
- mram_read((__mram_ptr void const*)inVector_m, inVectorTile_w, 256);
- uint32_t currInVectorTileIdx = 0;
-
- // Initialize output vector cache
- uint32_t taskletOutVector_m = outVector_m + taskletRowsStart*sizeof(float);
- uint32_t outVectorTileSize = 64;
- float* outVectorTile_w = mem_alloc(outVectorTileSize*sizeof(float));
-
- // SpMV
- uint32_t nextRowPtr = firstRowPtr;
- for(uint32_t row = 0; row < taskletNumRows; ++row) {
-
- // Find row nonzeros
- taskletRowPtrs_w = seqread_get(taskletRowPtrs_w, sizeof(uint32_t), &rowPtrReader);
- uint32_t rowPtr = nextRowPtr;
- nextRowPtr = *taskletRowPtrs_w;
- uint32_t taskletNNZ = nextRowPtr - rowPtr;
-
- // Multiply row with vector
- float outValue = 0.0f;
- for(uint32_t nzIdx = 0; nzIdx < taskletNNZ; ++nzIdx) {
-
- // Get matrix value
- float matValue = taskletNonzeros_w->value;
-
- // Get input vector value
- uint32_t col = taskletNonzeros_w->col;
- uint32_t inVectorTileIdx = col/inVectorTileSize;
- uint32_t inVectorTileOffset = col%inVectorTileSize;
- if(inVectorTileIdx != currInVectorTileIdx) {
- mram_read((__mram_ptr void const*)(inVector_m + inVectorTileIdx*inVectorTileSize*sizeof(float)), inVectorTile_w, 256);
- currInVectorTileIdx = inVectorTileIdx;
- }
- float inValue = inVectorTile_w[inVectorTileOffset];
-
- // Multiply and add
- outValue += matValue*inValue;
-
- // Read next nonzero
- taskletNonzeros_w = seqread_get(taskletNonzeros_w, sizeof(struct Nonzero), &nonzerosReader); // Last read will be out of bounds and unused
-
- }
-
- // Store output
- uint32_t outVectorTileIdx = row/outVectorTileSize;
- uint32_t outVectorTileOffset = row%outVectorTileSize;
- outVectorTile_w[outVectorTileOffset] = outValue;
- if(outVectorTileOffset == outVectorTileSize - 1) { // Last element in tile
- mram_write(outVectorTile_w, (__mram_ptr void*)(taskletOutVector_m + outVectorTileIdx*outVectorTileSize*sizeof(float)), 256);
- } else if(row == taskletNumRows - 1) { // Last row for tasklet
- mram_write(outVectorTile_w, (__mram_ptr void*)(taskletOutVector_m + outVectorTileIdx*outVectorTileSize*sizeof(float)), (taskletNumRows%outVectorTileSize)*sizeof(float));
- }
-
- }
- }
-
- return 0;
+int main()
+{
+
+ if (me() == 0) {
+ mem_reset(); // Reset the heap
+ }
+ // Barrier
+ barrier_wait(&my_barrier);
+
+ // Load parameters
+ uint32_t params_m = (uint32_t) DPU_MRAM_HEAP_POINTER;
+ struct DPUParams *params_w =
+ (struct DPUParams *)
+ mem_alloc(ROUND_UP_TO_MULTIPLE_OF_8(sizeof(struct DPUParams)));
+ mram_read((__mram_ptr void const *)params_m, params_w,
+ ROUND_UP_TO_MULTIPLE_OF_8(sizeof(struct DPUParams)));
+ uint32_t numRows = params_w->dpuNumRows;
+
+ // Sanity check
+ if (me() == 0) {
+ if (numRows % 2 != 0) {
+ // The number of rows assigned to the DPU must be a multiple of two to ensure that writes to the output vector are aligned to 8 bytes
+ PRINT_ERROR
+ ("The number of rows is not a multiple of two!");
+ }
+ }
+ // Identify tasklet's rows
+ uint32_t numRowsPerTasklet = ROUND_UP_TO_MULTIPLE_OF_2((numRows - 1) / NR_TASKLETS + 1); // Multiple of two to ensure that access to rowPtrs and outVector is 8-byte aligned
+ uint32_t taskletRowsStart = me() * numRowsPerTasklet;
+ uint32_t taskletNumRows;
+ if (taskletRowsStart > numRows) {
+ taskletNumRows = 0;
+ } else if (taskletRowsStart + numRowsPerTasklet > numRows) {
+ taskletNumRows = numRows - taskletRowsStart;
+ } else {
+ taskletNumRows = numRowsPerTasklet;
+ }
+
+ // Only process tasklets with nonzero number of rows
+ if (taskletNumRows > 0) {
+
+ // Extract parameters
+ uint32_t rowPtrsOffset = params_w->dpuRowPtrsOffset;
+ uint32_t rowPtrs_m =
+ ((uint32_t) DPU_MRAM_HEAP_POINTER) + params_w->dpuRowPtrs_m;
+ uint32_t nonzeros_m =
+ ((uint32_t) DPU_MRAM_HEAP_POINTER) +
+ params_w->dpuNonzeros_m;
+ uint32_t inVector_m =
+ ((uint32_t) DPU_MRAM_HEAP_POINTER) +
+ params_w->dpuInVector_m;
+ uint32_t outVector_m =
+ ((uint32_t) DPU_MRAM_HEAP_POINTER) +
+ params_w->dpuOutVector_m;
+
+ // Initialize row pointer sequential reader
+ uint32_t taskletRowPtrs_m =
+ rowPtrs_m + taskletRowsStart * sizeof(uint32_t);
+ seqreader_t rowPtrReader;
+ uint32_t *taskletRowPtrs_w =
+ seqread_init(seqread_alloc(),
+ (__mram_ptr void *)taskletRowPtrs_m,
+ &rowPtrReader);
+ uint32_t firstRowPtr = *taskletRowPtrs_w;
+
+ // Initialize nonzeros sequential reader
+ uint32_t taskletNonzerosStart = firstRowPtr - rowPtrsOffset;
+ uint32_t taskletNonzeros_m = nonzeros_m + taskletNonzerosStart * sizeof(struct Nonzero); // 8-byte aligned because Nonzero is 8 bytes
+ seqreader_t nonzerosReader;
+ struct Nonzero *taskletNonzeros_w =
+ seqread_init(seqread_alloc(),
+ (__mram_ptr void *)taskletNonzeros_m,
+ &nonzerosReader);
+
+ // Initialize input vector cache
+ uint32_t inVectorTileSize = 64;
+ float *inVectorTile_w =
+ mem_alloc(inVectorTileSize * sizeof(float));
+ mram_read((__mram_ptr void const *)inVector_m, inVectorTile_w,
+ 256);
+ uint32_t currInVectorTileIdx = 0;
+
+ // Initialize output vector cache
+ uint32_t taskletOutVector_m =
+ outVector_m + taskletRowsStart * sizeof(float);
+ uint32_t outVectorTileSize = 64;
+ float *outVectorTile_w =
+ mem_alloc(outVectorTileSize * sizeof(float));
+
+ // SpMV
+ uint32_t nextRowPtr = firstRowPtr;
+ for (uint32_t row = 0; row < taskletNumRows; ++row) {
+
+ // Find row nonzeros
+ taskletRowPtrs_w =
+ seqread_get(taskletRowPtrs_w, sizeof(uint32_t),
+ &rowPtrReader);
+ uint32_t rowPtr = nextRowPtr;
+ nextRowPtr = *taskletRowPtrs_w;
+ uint32_t taskletNNZ = nextRowPtr - rowPtr;
+
+ // Multiply row with vector
+ float outValue = 0.0f;
+ for (uint32_t nzIdx = 0; nzIdx < taskletNNZ; ++nzIdx) {
+
+ // Get matrix value
+ float matValue = taskletNonzeros_w->value;
+
+ // Get input vector value
+ uint32_t col = taskletNonzeros_w->col;
+ uint32_t inVectorTileIdx =
+ col / inVectorTileSize;
+ uint32_t inVectorTileOffset =
+ col % inVectorTileSize;
+ if (inVectorTileIdx != currInVectorTileIdx) {
+ mram_read((__mram_ptr void const
+ *)(inVector_m +
+ inVectorTileIdx *
+ inVectorTileSize *
+ sizeof(float)),
+ inVectorTile_w, 256);
+ currInVectorTileIdx = inVectorTileIdx;
+ }
+ float inValue =
+ inVectorTile_w[inVectorTileOffset];
+
+ // Multiply and add
+ outValue += matValue * inValue;
+
+ // Read next nonzero
+ taskletNonzeros_w = seqread_get(taskletNonzeros_w, sizeof(struct Nonzero), &nonzerosReader); // Last read will be out of bounds and unused
+
+ }
+
+ // Store output
+ uint32_t outVectorTileIdx = row / outVectorTileSize;
+ uint32_t outVectorTileOffset = row % outVectorTileSize;
+ outVectorTile_w[outVectorTileOffset] = outValue;
+ if (outVectorTileOffset == outVectorTileSize - 1) { // Last element in tile
+ mram_write(outVectorTile_w,
+ (__mram_ptr void
+ *)(taskletOutVector_m +
+ outVectorTileIdx *
+ outVectorTileSize *
+ sizeof(float)), 256);
+ } else if (row == taskletNumRows - 1) { // Last row for tasklet
+ mram_write(outVectorTile_w,
+ (__mram_ptr void
+ *)(taskletOutVector_m +
+ outVectorTileIdx *
+ outVectorTileSize *
+ sizeof(float)),
+ (taskletNumRows %
+ outVectorTileSize) * sizeof(float));
+ }
+
+ }
+ }
+
+ return 0;
}
diff --git a/SpMV/host/app.c b/SpMV/host/app.c
index fe9c751..6cf2861 100644
--- a/SpMV/host/app.c
+++ b/SpMV/host/app.c
@@ -3,9 +3,24 @@
* SpMV Host Application Source File
*
*/
+#if ASPECTC
+extern "C" {
+#endif
+
#include <dpu.h>
#include <dpu_log.h>
+#ifndef ENERGY
+#define ENERGY 0
+#endif
+#if ENERGY
+#include <dpu_probe.h>
+#endif
+
+#if ASPECTC
+}
+#endif
+
#include <assert.h>
#include <getopt.h>
#include <stdio.h>
@@ -14,247 +29,301 @@
#include <unistd.h>
#include "mram-management.h"
-#include "../support/common.h"
-#include "../support/matrix.h"
-#include "../support/params.h"
-#include "../support/timer.h"
-#include "../support/utils.h"
+#include "common.h"
+#include "matrix.h"
+#include "params.h"
+#include "timer.h"
+#include "utils.h"
#define DPU_BINARY "./bin/dpu_code"
#define XSTR(x) STR(x)
#define STR(x) #x
-#ifndef ENERGY
-#define ENERGY 0
-#endif
+// Main of the Host Application
+int main(int argc, char **argv)
+{
+
+ // Process parameters
+ struct Params p = input_params(argc, argv);
+
+ // Timing and profiling
+ Timer timer;
+ double allocTime = 0.0f, loadTime = 0.0f, writeTime = 0.0f, dpuTime =
+ 0.0f, readTime = 0.0f, freeTime = 0.0f;
#if ENERGY
-#include <dpu_probe.h>
+ struct dpu_probe_t probe;
+ DPU_ASSERT(dpu_probe_init("energy_probe", &probe));
#endif
-// Main of the Host Application
-int main(int argc, char** argv) {
-
- // Process parameters
- struct Params p = input_params(argc, argv);
-
- // Timing and profiling
- Timer timer;
- double allocTime = 0.0f, loadTime = 0.0f, writeTime = 0.0f, dpuTime = 0.0f, readTime = 0.0f, freeTime = 0.0f;
- #if ENERGY
- struct dpu_probe_t probe;
- DPU_ASSERT(dpu_probe_init("energy_probe", &probe));
- #endif
-
- // Allocate DPUs and load binary
- struct dpu_set_t dpu_set, dpu;
- uint32_t numDPUs, numRanks;
-
- startTimer(&timer);
- DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
- stopTimer(&timer);
- allocTime += getElapsedTime(timer);
-
- startTimer(&timer);
- DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
- stopTimer(&timer);
- loadTime += getElapsedTime(timer);
-
- DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &numDPUs));
- DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &numRanks));
- assert(numDPUs == NR_DPUS);
- PRINT_INFO(p.verbosity >= 1, "Allocated %d DPU(s)", numDPUs);
-
- // Initialize SpMV data structures
- PRINT_INFO(p.verbosity >= 1, "Reading matrix %s", p.fileName);
- struct COOMatrix cooMatrix = readCOOMatrix(p.fileName);
- PRINT_INFO(p.verbosity >= 1, " %u rows, %u columns, %u nonzeros", cooMatrix.numRows, cooMatrix.numCols, cooMatrix.numNonzeros);
- struct CSRMatrix csrMatrix = coo2csr(cooMatrix);
- uint32_t numRows = csrMatrix.numRows;
- uint32_t numCols = csrMatrix.numCols;
- uint32_t* rowPtrs = csrMatrix.rowPtrs;
- struct Nonzero* nonzeros = csrMatrix.nonzeros;
- float* inVector = malloc(ROUND_UP_TO_MULTIPLE_OF_8(numCols*sizeof(float)));
- initVector(inVector, numCols);
- float* outVector = malloc(ROUND_UP_TO_MULTIPLE_OF_8(numRows*sizeof(float)));
-
- // Partition data structure across DPUs
- uint32_t numRowsPerDPU = ROUND_UP_TO_MULTIPLE_OF_2((numRows - 1)/numDPUs + 1);
- PRINT_INFO(p.verbosity >= 1, "Assigning %u rows per DPU", numRowsPerDPU);
- struct DPUParams dpuParams[numDPUs];
- unsigned int dpuIdx = 0;
- PRINT_INFO(p.verbosity == 1, "Copying data to DPUs");
- DPU_FOREACH (dpu_set, dpu) {
-
- // Allocate parameters
- struct mram_heap_allocator_t allocator;
- init_allocator(&allocator);
- uint32_t dpuParams_m = mram_heap_alloc(&allocator, sizeof(struct DPUParams));
-
- // Find DPU's rows
- uint32_t dpuStartRowIdx = dpuIdx*numRowsPerDPU;
- uint32_t dpuNumRows;
- if(dpuStartRowIdx > numRows) {
- dpuNumRows = 0;
- } else if(dpuStartRowIdx + numRowsPerDPU > numRows) {
- dpuNumRows = numRows - dpuStartRowIdx;
- } else {
- dpuNumRows = numRowsPerDPU;
- }
- dpuParams[dpuIdx].dpuNumRows = dpuNumRows;
- PRINT_INFO(p.verbosity >= 2, " DPU %u:", dpuIdx);
- PRINT_INFO(p.verbosity >= 2, " Receives %u rows", dpuNumRows);
-
- // Partition nonzeros and copy data
- if(dpuNumRows > 0) {
-
- // Find DPU's CSR matrix partition
- uint32_t* dpuRowPtrs_h = &rowPtrs[dpuStartRowIdx];
- uint32_t dpuRowPtrsOffset = dpuRowPtrs_h[0];
- struct Nonzero* dpuNonzeros_h = &nonzeros[dpuRowPtrsOffset];
- uint32_t dpuNumNonzeros = dpuRowPtrs_h[dpuNumRows] - dpuRowPtrsOffset;
-
- // Allocate MRAM
- uint32_t dpuRowPtrs_m = mram_heap_alloc(&allocator, (dpuNumRows + 1)*sizeof(uint32_t));
- uint32_t dpuNonzeros_m = mram_heap_alloc(&allocator, dpuNumNonzeros*sizeof(struct Nonzero));
- uint32_t dpuInVector_m = mram_heap_alloc(&allocator, numCols*sizeof(float));
- uint32_t dpuOutVector_m = mram_heap_alloc(&allocator, dpuNumRows*sizeof(float));
- assert((dpuNumRows*sizeof(float))%8 == 0 && "Output sub-vector must be a multiple of 8 bytes!");
- PRINT_INFO(p.verbosity >= 2, " Total memory allocated is %d bytes", allocator.totalAllocated);
-
- // Set up DPU parameters
- dpuParams[dpuIdx].dpuRowPtrsOffset = dpuRowPtrsOffset;
- dpuParams[dpuIdx].dpuRowPtrs_m = dpuRowPtrs_m;
- dpuParams[dpuIdx].dpuNonzeros_m = dpuNonzeros_m;
- dpuParams[dpuIdx].dpuInVector_m = dpuInVector_m;
- dpuParams[dpuIdx].dpuOutVector_m = dpuOutVector_m;
-
- // Send data to DPU
- PRINT_INFO(p.verbosity >= 2, " Copying data to DPU");
- startTimer(&timer);
- copyToDPU(dpu, (uint8_t*)dpuRowPtrs_h, dpuRowPtrs_m, (dpuNumRows + 1)*sizeof(uint32_t));
- copyToDPU(dpu, (uint8_t*)dpuNonzeros_h, dpuNonzeros_m, dpuNumNonzeros*sizeof(struct Nonzero));
- copyToDPU(dpu, (uint8_t*)inVector, dpuInVector_m, numCols*sizeof(float));
- stopTimer(&timer);
- writeTime += getElapsedTime(timer);
-
- }
-
- // Send parameters to DPU
- PRINT_INFO(p.verbosity >= 2, " Copying parameters to DPU");
- startTimer(&timer);
- copyToDPU(dpu, (uint8_t*)&dpuParams[dpuIdx], dpuParams_m, sizeof(struct DPUParams));
- stopTimer(&timer);
- writeTime += getElapsedTime(timer);
-
- ++dpuIdx;
-
- }
- PRINT_INFO(p.verbosity >= 1, " CPU-DPU Time: %f ms", writeTime*1e3);
-
- // Run all DPUs
- PRINT_INFO(p.verbosity >= 1, "Booting DPUs");
- startTimer(&timer);
- #if ENERGY
- DPU_ASSERT(dpu_probe_start(&probe));
- #endif
- DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS));
- #if ENERGY
- DPU_ASSERT(dpu_probe_stop(&probe));
- double energy;
- DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &energy));
- PRINT_INFO(p.verbosity >= 1, " DPU Energy: %f J", energy);
- #endif
- stopTimer(&timer);
- dpuTime += getElapsedTime(timer);
- PRINT_INFO(p.verbosity >= 1, " DPU Time: %f ms", dpuTime*1e3);
-
- // Copy back result
- PRINT_INFO(p.verbosity >= 1, "Copying back the result");
- startTimer(&timer);
- dpuIdx = 0;
- DPU_FOREACH (dpu_set, dpu) {
- unsigned int dpuNumRows = dpuParams[dpuIdx].dpuNumRows;
- if(dpuNumRows > 0) {
- uint32_t dpuStartRowIdx = dpuIdx*numRowsPerDPU;
- copyFromDPU(dpu, dpuParams[dpuIdx].dpuOutVector_m, (uint8_t*)(outVector + dpuStartRowIdx), dpuNumRows*sizeof(float));
- }
- ++dpuIdx;
- }
- stopTimer(&timer);
- readTime += getElapsedTime(timer);
- PRINT_INFO(p.verbosity >= 1, " DPU-CPU Time: %f ms", readTime*1e3);
-
- // Calculating result on CPU
- PRINT_INFO(p.verbosity >= 1, "Calculating result on CPU");
- float* outVectorReference = malloc(numRows*sizeof(float));
- for(uint32_t rowIdx = 0; rowIdx < numRows; ++rowIdx) {
- float sum = 0.0f;
- for(uint32_t i = rowPtrs[rowIdx]; i < rowPtrs[rowIdx + 1]; ++i) {
- uint32_t colIdx = nonzeros[i].col;
- float value = nonzeros[i].value;
- sum += inVector[colIdx]*value;
- }
- outVectorReference[rowIdx] = sum;
- }
-
- // Verify the result
- PRINT_INFO(p.verbosity >= 1, "Verifying the result");
- int status = 1;
- for(uint32_t rowIdx = 0; rowIdx < numRows; ++rowIdx) {
- float diff = (outVectorReference[rowIdx] - outVector[rowIdx])/outVectorReference[rowIdx];
- const float tolerance = 0.00001;
- if(diff > tolerance || diff < -tolerance) {
- status = 0;
- PRINT_ERROR("Mismatch at index %u (CPU result = %f, DPU result = %f)", rowIdx, outVectorReference[rowIdx], outVector[rowIdx]);
- }
- }
-
- startTimer(&timer);
- DPU_ASSERT(dpu_free(dpu_set));
- stopTimer(&timer);
- freeTime += getElapsedTime(timer);
-
- if (status) {
- printf("[::] SpMV UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s n_elements=%d ",
- numDPUs, numRanks, NR_TASKLETS, "float", csrMatrix.numNonzeros);
- printf("| latency_alloc_us=%f latency_load_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f",
- allocTime, loadTime, writeTime, dpuTime, readTime, freeTime);
- printf(" throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f",
- // coomatrix / csrmatrix use uint32_t indexes and float values, so all 32bit
- csrMatrix.numNonzeros * sizeof(float) / (dpuTime * 1e6),
- csrMatrix.numNonzeros * sizeof(float) / ((allocTime + loadTime + writeTime + dpuTime + readTime + freeTime) * 1e6));
- printf(" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f",
- csrMatrix.numNonzeros * sizeof(float) / ((writeTime + dpuTime + readTime) * 1e6),
- csrMatrix.numNonzeros * sizeof(float) / ((loadTime + writeTime + dpuTime + readTime) * 1e6),
- csrMatrix.numNonzeros * sizeof(float) / ((allocTime + loadTime + writeTime + dpuTime + readTime) * 1e6));
- printf(" throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f",
- // coomatrix / csrmatrix use uint32_t indexes and float values, so all 32bit
- csrMatrix.numNonzeros / (dpuTime * 1e6),
- csrMatrix.numNonzeros / ((allocTime + loadTime + writeTime + dpuTime + readTime + freeTime) * 1e6));
- printf(" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f",
- csrMatrix.numNonzeros / ((writeTime + dpuTime + readTime) * 1e6),
- csrMatrix.numNonzeros / ((loadTime + writeTime + dpuTime + readTime) * 1e6),
- csrMatrix.numNonzeros / ((allocTime + loadTime + writeTime + dpuTime + readTime) * 1e6));
- }
-
- // Display DPU Logs
- if(p.verbosity >= 2) {
- PRINT_INFO(p.verbosity >= 2, "Displaying DPU Logs:");
- dpuIdx = 0;
- DPU_FOREACH (dpu_set, dpu) {
- PRINT("DPU %u:", dpuIdx);
- DPU_ASSERT(dpu_log_read(dpu, stdout));
- ++dpuIdx;
- }
- }
-
- // Deallocate data structures
- freeCOOMatrix(cooMatrix);
- freeCSRMatrix(csrMatrix);
- free(inVector);
- free(outVector);
- free(outVectorReference);
-
- return 0;
+ // Allocate DPUs and load binary
+ struct dpu_set_t dpu_set, dpu;
+ uint32_t numDPUs, numRanks;
+
+ startTimer(&timer);
+ DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
+ stopTimer(&timer);
+ allocTime += getElapsedTime(timer);
+
+ startTimer(&timer);
+ DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
+ stopTimer(&timer);
+ loadTime += getElapsedTime(timer);
+
+ DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &numDPUs));
+ DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &numRanks));
+ assert(numDPUs == NR_DPUS);
+ PRINT_INFO(p.verbosity >= 1, "Allocated %d DPU(s)", numDPUs);
+
+ // Initialize SpMV data structures
+ PRINT_INFO(p.verbosity >= 1, "Reading matrix %s", p.fileName);
+ struct COOMatrix cooMatrix = readCOOMatrix(p.fileName);
+ PRINT_INFO(p.verbosity >= 1, " %u rows, %u columns, %u nonzeros",
+ cooMatrix.numRows, cooMatrix.numCols, cooMatrix.numNonzeros);
+ struct CSRMatrix csrMatrix = coo2csr(cooMatrix);
+ uint32_t numRows = csrMatrix.numRows;
+ uint32_t numCols = csrMatrix.numCols;
+ uint32_t *rowPtrs = csrMatrix.rowPtrs;
+ struct Nonzero *nonzeros = csrMatrix.nonzeros;
+ float *inVector =
+ (float*)malloc(ROUND_UP_TO_MULTIPLE_OF_8(numCols * sizeof(float)));
+ initVector(inVector, numCols);
+ float *outVector =
+ (float*)malloc(ROUND_UP_TO_MULTIPLE_OF_8(numRows * sizeof(float)));
+
+ // Partition data structure across DPUs
+ uint32_t numRowsPerDPU =
+ ROUND_UP_TO_MULTIPLE_OF_2((numRows - 1) / numDPUs + 1);
+ PRINT_INFO(p.verbosity >= 1, "Assigning %u rows per DPU",
+ numRowsPerDPU);
+ struct DPUParams dpuParams[numDPUs];
+ unsigned int dpuIdx = 0;
+ PRINT_INFO(p.verbosity == 1, "Copying data to DPUs");
+ DPU_FOREACH(dpu_set, dpu) {
+
+ // Allocate parameters
+ struct mram_heap_allocator_t allocator;
+ init_allocator(&allocator);
+ uint32_t dpuParams_m =
+ mram_heap_alloc(&allocator, sizeof(struct DPUParams));
+
+ // Find DPU's rows
+ uint32_t dpuStartRowIdx = dpuIdx * numRowsPerDPU;
+ uint32_t dpuNumRows;
+ if (dpuStartRowIdx > numRows) {
+ dpuNumRows = 0;
+ } else if (dpuStartRowIdx + numRowsPerDPU > numRows) {
+ dpuNumRows = numRows - dpuStartRowIdx;
+ } else {
+ dpuNumRows = numRowsPerDPU;
+ }
+ dpuParams[dpuIdx].dpuNumRows = dpuNumRows;
+ PRINT_INFO(p.verbosity >= 2, " DPU %u:", dpuIdx);
+ PRINT_INFO(p.verbosity >= 2, " Receives %u rows",
+ dpuNumRows);
+
+ // Partition nonzeros and copy data
+ if (dpuNumRows > 0) {
+
+ // Find DPU's CSR matrix partition
+ uint32_t *dpuRowPtrs_h = &rowPtrs[dpuStartRowIdx];
+ uint32_t dpuRowPtrsOffset = dpuRowPtrs_h[0];
+ struct Nonzero *dpuNonzeros_h =
+ &nonzeros[dpuRowPtrsOffset];
+ uint32_t dpuNumNonzeros =
+ dpuRowPtrs_h[dpuNumRows] - dpuRowPtrsOffset;
+
+ // Allocate MRAM
+ uint32_t dpuRowPtrs_m =
+ mram_heap_alloc(&allocator,
+ (dpuNumRows +
+ 1) * sizeof(uint32_t));
+ uint32_t dpuNonzeros_m =
+ mram_heap_alloc(&allocator,
+ dpuNumNonzeros *
+ sizeof(struct Nonzero));
+ uint32_t dpuInVector_m =
+ mram_heap_alloc(&allocator,
+ numCols * sizeof(float));
+ uint32_t dpuOutVector_m =
+ mram_heap_alloc(&allocator,
+ dpuNumRows * sizeof(float));
+ assert((dpuNumRows * sizeof(float)) % 8 == 0
+ &&
+ "Output sub-vector must be a multiple of 8 bytes!");
+ PRINT_INFO(p.verbosity >= 2,
+ " Total memory allocated is %d bytes",
+ allocator.totalAllocated);
+
+ // Set up DPU parameters
+ dpuParams[dpuIdx].dpuRowPtrsOffset = dpuRowPtrsOffset;
+ dpuParams[dpuIdx].dpuRowPtrs_m = dpuRowPtrs_m;
+ dpuParams[dpuIdx].dpuNonzeros_m = dpuNonzeros_m;
+ dpuParams[dpuIdx].dpuInVector_m = dpuInVector_m;
+ dpuParams[dpuIdx].dpuOutVector_m = dpuOutVector_m;
+
+ // Send data to DPU
+ PRINT_INFO(p.verbosity >= 2,
+ " Copying data to DPU");
+ startTimer(&timer);
+ DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME,
+ dpuRowPtrs_m, (uint8_t *) dpuRowPtrs_h,
+ ROUND_UP_TO_MULTIPLE_OF_8((dpuNumRows + 1) * sizeof(uint32_t))));
+ DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME,
+ dpuNonzeros_m, (uint8_t *) dpuNonzeros_h,
+ ROUND_UP_TO_MULTIPLE_OF_8(dpuNumNonzeros * sizeof(struct Nonzero))));
+ DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME,
+ dpuInVector_m, (uint8_t *) inVector,
+ ROUND_UP_TO_MULTIPLE_OF_8(numCols * sizeof(float))));
+ stopTimer(&timer);
+ writeTime += getElapsedTime(timer);
+ }
+ // Send parameters to DPU
+ PRINT_INFO(p.verbosity >= 2,
+ " Copying parameters to DPU");
+ startTimer(&timer);
+ DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME,
+ dpuParams_m, (uint8_t *) & dpuParams[dpuIdx],
+ ROUND_UP_TO_MULTIPLE_OF_8(sizeof(struct DPUParams))));
+ stopTimer(&timer);
+ writeTime += getElapsedTime(timer);
+
+ ++dpuIdx;
+
+ }
+ PRINT_INFO(p.verbosity >= 1, " CPU-DPU Time: %f ms",
+ writeTime * 1e3);
+
+ // Run all DPUs
+ PRINT_INFO(p.verbosity >= 1, "Booting DPUs");
+ startTimer(&timer);
+#if ENERGY
+ DPU_ASSERT(dpu_probe_start(&probe));
+#endif
+ DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS));
+#if ENERGY
+ DPU_ASSERT(dpu_probe_stop(&probe));
+ double energy;
+ DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &energy));
+ PRINT_INFO(p.verbosity >= 1, " DPU Energy: %f J", energy);
+#endif
+ stopTimer(&timer);
+ dpuTime += getElapsedTime(timer);
+ PRINT_INFO(p.verbosity >= 1, " DPU Time: %f ms", dpuTime * 1e3);
+
+ // Copy back result
+ PRINT_INFO(p.verbosity >= 1, "Copying back the result");
+ startTimer(&timer);
+ dpuIdx = 0;
+
+ DPU_FOREACH(dpu_set, dpu) {
+ unsigned int dpuNumRows = dpuParams[dpuIdx].dpuNumRows;
+ if (dpuNumRows > 0) {
+ uint32_t dpuStartRowIdx = dpuIdx * numRowsPerDPU;
+ DPU_ASSERT(dpu_copy_from(dpu, DPU_MRAM_HEAP_POINTER_NAME,
+ dpuParams[dpuIdx].dpuOutVector_m,
+ (uint8_t *) (outVector + dpuStartRowIdx),
+ ROUND_UP_TO_MULTIPLE_OF_8(dpuNumRows * sizeof(float))));
+ }
+ ++dpuIdx;
+ }
+ stopTimer(&timer);
+ readTime += getElapsedTime(timer);
+ PRINT_INFO(p.verbosity >= 1, " DPU-CPU Time: %f ms", readTime * 1e3);
+
+ // Calculating result on CPU
+ PRINT_INFO(p.verbosity >= 1, "Calculating result on CPU");
+ float *outVectorReference = (float*)malloc(numRows * sizeof(float));
+ for (uint32_t rowIdx = 0; rowIdx < numRows; ++rowIdx) {
+ float sum = 0.0f;
+ for (uint32_t i = rowPtrs[rowIdx]; i < rowPtrs[rowIdx + 1]; ++i) {
+ uint32_t colIdx = nonzeros[i].col;
+ float value = nonzeros[i].value;
+ sum += inVector[colIdx] * value;
+ }
+ outVectorReference[rowIdx] = sum;
+ }
+
+ // Verify the result
+ PRINT_INFO(p.verbosity >= 1, "Verifying the result");
+ int status = 1;
+ for (uint32_t rowIdx = 0; rowIdx < numRows; ++rowIdx) {
+ float diff =
+ (outVectorReference[rowIdx] -
+ outVector[rowIdx]) / outVectorReference[rowIdx];
+ const float tolerance = 0.00001;
+ if (diff > tolerance || diff < -tolerance) {
+ status = 0;
+ PRINT_ERROR
+ ("Mismatch at index %u (CPU result = %f, DPU result = %f)",
+ rowIdx, outVectorReference[rowIdx],
+ outVector[rowIdx]);
+ }
+ }
+
+ startTimer(&timer);
+ DPU_ASSERT(dpu_free(dpu_set));
+ stopTimer(&timer);
+ freeTime += getElapsedTime(timer);
+
+ if (status) {
+ dfatool_printf
+ ("[::] SpMV UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s n_elements=%d ",
+ numDPUs, numRanks, NR_TASKLETS, "float",
+ csrMatrix.numNonzeros);
+ dfatool_printf
+ ("| latency_alloc_us=%f latency_load_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f",
+ allocTime, loadTime, writeTime, dpuTime, readTime,
+ freeTime);
+ dfatool_printf
+ (" throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f",
+ // coomatrix / csrmatrix use uint32_t indexes and float values, so all 32bit
+ csrMatrix.numNonzeros * sizeof(float) / (dpuTime * 1e6),
+ csrMatrix.numNonzeros * sizeof(float) /
+ ((allocTime + loadTime + writeTime + dpuTime + readTime +
+ freeTime) * 1e6));
+ dfatool_printf
+ (" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f",
+ csrMatrix.numNonzeros * sizeof(float) /
+ ((writeTime + dpuTime + readTime) * 1e6),
+ csrMatrix.numNonzeros * sizeof(float) /
+ ((loadTime + writeTime + dpuTime + readTime) * 1e6),
+ csrMatrix.numNonzeros * sizeof(float) /
+ ((allocTime + loadTime + writeTime + dpuTime +
+ readTime) * 1e6));
+ dfatool_printf
+ (" throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f",
+ // coomatrix / csrmatrix use uint32_t indexes and float values, so all 32bit
+ csrMatrix.numNonzeros / (dpuTime * 1e6),
+ csrMatrix.numNonzeros /
+ ((allocTime + loadTime + writeTime + dpuTime + readTime +
+ freeTime) * 1e6));
+ dfatool_printf
+ (" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f\n",
+ csrMatrix.numNonzeros / ((writeTime + dpuTime + readTime) *
+ 1e6),
+ csrMatrix.numNonzeros /
+ ((loadTime + writeTime + dpuTime + readTime) * 1e6),
+ csrMatrix.numNonzeros /
+ ((allocTime + loadTime + writeTime + dpuTime +
+ readTime) * 1e6));
+ }
+ // Display DPU Logs
+ if (p.verbosity >= 2) {
+ PRINT_INFO(p.verbosity >= 2, "Displaying DPU Logs:");
+ dpuIdx = 0;
+ DPU_FOREACH(dpu_set, dpu) {
+ PRINT("DPU %u:", dpuIdx);
+ DPU_ASSERT(dpu_log_read(dpu, stdout));
+ ++dpuIdx;
+ }
+ }
+ // Deallocate data structures
+ freeCOOMatrix(cooMatrix);
+ freeCSRMatrix(csrMatrix);
+ free(inVector);
+ free(outVector);
+ free(outVectorReference);
+
+ return 0;
}
diff --git a/SpMV/host/mram-management.h b/SpMV/host/mram-management.h
index 627dfde..a953d6a 100644
--- a/SpMV/host/mram-management.h
+++ b/SpMV/host/mram-management.h
@@ -1,37 +1,29 @@
+#pragma once
-#ifndef _MRAM_MANAGEMENT_H_
-#define _MRAM_MANAGEMENT_H_
+#include "common.h"
+#include "utils.h"
-#include "../support/common.h"
-#include "../support/utils.h"
-
-#define DPU_CAPACITY (64 << 20) // A DPU's capacity is 64 MiB
+#define DPU_CAPACITY (64 << 20) // A DPU's capacity is 64 MiB
struct mram_heap_allocator_t {
- uint32_t totalAllocated;
+ uint32_t totalAllocated;
};
-static void init_allocator(struct mram_heap_allocator_t* allocator) {
- allocator->totalAllocated = 0;
-}
-
-static uint32_t mram_heap_alloc(struct mram_heap_allocator_t* allocator, uint32_t size) {
- uint32_t ret = allocator->totalAllocated;
- allocator->totalAllocated += ROUND_UP_TO_MULTIPLE_OF_8(size);
- if(allocator->totalAllocated > DPU_CAPACITY) {
- PRINT_ERROR(" Total memory allocated is %d bytes which exceeds the DPU capacity (%d bytes)!", allocator->totalAllocated, DPU_CAPACITY);
- exit(0);
- }
- return ret;
+static void init_allocator(struct mram_heap_allocator_t *allocator)
+{
+ allocator->totalAllocated = 0;
}
-static void copyToDPU(struct dpu_set_t dpu, uint8_t* hostPtr, uint32_t mramIdx, uint32_t size) {
- DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME, mramIdx, hostPtr, ROUND_UP_TO_MULTIPLE_OF_8(size)));
+static uint32_t mram_heap_alloc(struct mram_heap_allocator_t *allocator,
+ uint32_t size)
+{
+ uint32_t ret = allocator->totalAllocated;
+ allocator->totalAllocated += ROUND_UP_TO_MULTIPLE_OF_8(size);
+ if (allocator->totalAllocated > DPU_CAPACITY) {
+ PRINT_ERROR
+ (" Total memory allocated is %d bytes which exceeds the DPU capacity (%d bytes)!",
+ allocator->totalAllocated, DPU_CAPACITY);
+ exit(0);
+ }
+ return ret;
}
-
-static void copyFromDPU(struct dpu_set_t dpu, uint32_t mramIdx, uint8_t* hostPtr, uint32_t size) {
- DPU_ASSERT(dpu_copy_from(dpu, DPU_MRAM_HEAP_POINTER_NAME, mramIdx, hostPtr, ROUND_UP_TO_MULTIPLE_OF_8(size)));
-}
-
-#endif
-
diff --git a/SpMV/include/common.h b/SpMV/include/common.h
new file mode 100644
index 0000000..6118814
--- /dev/null
+++ b/SpMV/include/common.h
@@ -0,0 +1,24 @@
+
+/* Common data structures between host and DPUs */
+
+#ifndef _COMMON_H_
+#define _COMMON_H_
+
+#define ROUND_UP_TO_MULTIPLE_OF_2(x) ((((x) + 1)/2)*2)
+#define ROUND_UP_TO_MULTIPLE_OF_8(x) ((((x) + 7)/8)*8)
+
+struct DPUParams {
+ uint32_t dpuNumRows; /* Number of rows assigned to the DPU */
+ uint32_t dpuRowPtrsOffset; /* Offset of the row pointers */
+ uint32_t dpuRowPtrs_m;
+ uint32_t dpuNonzeros_m;
+ uint32_t dpuInVector_m;
+ uint32_t dpuOutVector_m;
+};
+
+struct Nonzero {
+ uint32_t col;
+ float value;
+};
+
+#endif
diff --git a/SpMV/include/dfatool_host.ah b/SpMV/include/dfatool_host.ah
new file mode 100644
index 0000000..91d44bd
--- /dev/null
+++ b/SpMV/include/dfatool_host.ah
@@ -0,0 +1,31 @@
+#pragma once
+
+#include <sys/time.h>
+#include "dfatool_host_dpu.ah"
+
+aspect DfatoolHostTiming : public DfatoolHostDPUTiming {
+ unsigned long n_rows, n_cols, n_nonzero;
+ unsigned int element_size;
+
+ virtual int getKernel() { return 1; }
+
+ DfatoolHostTiming() {
+ element_size = sizeof(float);
+ }
+
+ advice call("% input_params(...)"): after() {
+ printf("[>>] SpMV | n_dpus=%u\n", NR_DPUS);
+ }
+
+ advice call("% readCOOMatrix(...)") : after() {
+ struct COOMatrix* c = tjp->result();
+ n_rows = c->numRows;
+ n_cols = c->numCols;
+ n_nonzero = c->numNonzeros;
+ printf("[--] SpMV | n_dpus=%u n_rows=%lu n_cols=%lu n_nonzero=%lu\n", NR_DPUS, n_rows, n_cols, n_nonzero);
+ }
+
+ advice execution("% main(...)") : after() {
+ printf("[<<] SpMV | n_dpus=%u n_rows=%lu n_cols=%lu n_nonzero=%lu\n", NR_DPUS, n_rows, n_cols, n_nonzero);
+ }
+};
diff --git a/SpMV/include/matrix.h b/SpMV/include/matrix.h
new file mode 100644
index 0000000..ce8745e
--- /dev/null
+++ b/SpMV/include/matrix.h
@@ -0,0 +1,138 @@
+
+#ifndef _MATRIX_H_
+#define _MATRIX_H_
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "common.h"
+#include "utils.h"
+
+struct COOMatrix {
+ uint32_t numRows;
+ uint32_t numCols;
+ uint32_t numNonzeros;
+ uint32_t *rowIdxs;
+ struct Nonzero *nonzeros;
+};
+
+struct CSRMatrix {
+ uint32_t numRows;
+ uint32_t numCols;
+ uint32_t numNonzeros;
+ uint32_t *rowPtrs;
+ struct Nonzero *nonzeros;
+};
+
+static struct COOMatrix readCOOMatrix(const char *fileName)
+{
+
+ struct COOMatrix cooMatrix;
+
+ // Initialize fields
+ FILE *fp = fopen(fileName, "r");
+ assert(fscanf(fp, "%u", &cooMatrix.numRows));
+ if (cooMatrix.numRows % 2 == 1) {
+ PRINT_WARNING
+ ("Reading matrix %s: number of rows must be even. Padding with an extra row.",
+ fileName);
+ cooMatrix.numRows++;
+ }
+ assert(fscanf(fp, "%u", &cooMatrix.numCols));
+ assert(fscanf(fp, "%u", &cooMatrix.numNonzeros));
+ cooMatrix.rowIdxs =
+ (uint32_t *)
+ malloc(ROUND_UP_TO_MULTIPLE_OF_8
+ (cooMatrix.numNonzeros * sizeof(uint32_t)));
+ cooMatrix.nonzeros =
+ (struct Nonzero *)
+ malloc(ROUND_UP_TO_MULTIPLE_OF_8
+ (cooMatrix.numNonzeros * sizeof(struct Nonzero)));
+
+ // Read the nonzeros
+ for (uint32_t i = 0; i < cooMatrix.numNonzeros; ++i) {
+ uint32_t rowIdx;
+ assert(fscanf(fp, "%u", &rowIdx));
+ cooMatrix.rowIdxs[i] = rowIdx - 1; // File format indexes begin at 1
+ uint32_t colIdx;
+ assert(fscanf(fp, "%u", &colIdx));
+ cooMatrix.nonzeros[i].col = colIdx - 1; // File format indexes begin at 1
+ cooMatrix.nonzeros[i].value = 1.0f;
+ }
+
+ return cooMatrix;
+
+}
+
+static void freeCOOMatrix(struct COOMatrix cooMatrix)
+{
+ free(cooMatrix.rowIdxs);
+ free(cooMatrix.nonzeros);
+}
+
+static struct CSRMatrix coo2csr(struct COOMatrix cooMatrix)
+{
+
+ struct CSRMatrix csrMatrix;
+
+ // Initialize fields
+ csrMatrix.numRows = cooMatrix.numRows;
+ csrMatrix.numCols = cooMatrix.numCols;
+ csrMatrix.numNonzeros = cooMatrix.numNonzeros;
+ csrMatrix.rowPtrs =
+ (uint32_t *)
+ malloc(ROUND_UP_TO_MULTIPLE_OF_8
+ ((csrMatrix.numRows + 1) * sizeof(uint32_t)));
+ csrMatrix.nonzeros =
+ (struct Nonzero *)
+ malloc(ROUND_UP_TO_MULTIPLE_OF_8
+ (csrMatrix.numNonzeros * sizeof(struct Nonzero)));
+
+ // Histogram rowIdxs
+ memset(csrMatrix.rowPtrs, 0,
+ (csrMatrix.numRows + 1) * sizeof(uint32_t));
+ for (uint32_t i = 0; i < cooMatrix.numNonzeros; ++i) {
+ uint32_t rowIdx = cooMatrix.rowIdxs[i];
+ csrMatrix.rowPtrs[rowIdx]++;
+ }
+
+ // Prefix sum rowPtrs
+ uint32_t sumBeforeNextRow = 0;
+ for (uint32_t rowIdx = 0; rowIdx < csrMatrix.numRows; ++rowIdx) {
+ uint32_t sumBeforeRow = sumBeforeNextRow;
+ sumBeforeNextRow += csrMatrix.rowPtrs[rowIdx];
+ csrMatrix.rowPtrs[rowIdx] = sumBeforeRow;
+ }
+ csrMatrix.rowPtrs[csrMatrix.numRows] = sumBeforeNextRow;
+
+ // Bin the nonzeros
+ for (uint32_t i = 0; i < cooMatrix.numNonzeros; ++i) {
+ uint32_t rowIdx = cooMatrix.rowIdxs[i];
+ uint32_t nnzIdx = csrMatrix.rowPtrs[rowIdx]++;
+ csrMatrix.nonzeros[nnzIdx] = cooMatrix.nonzeros[i];
+ }
+
+ // Restore rowPtrs
+ for (uint32_t rowIdx = csrMatrix.numRows - 1; rowIdx > 0; --rowIdx) {
+ csrMatrix.rowPtrs[rowIdx] = csrMatrix.rowPtrs[rowIdx - 1];
+ }
+ csrMatrix.rowPtrs[0] = 0;
+
+ return csrMatrix;
+
+}
+
+static void freeCSRMatrix(struct CSRMatrix csrMatrix)
+{
+ free(csrMatrix.rowPtrs);
+ free(csrMatrix.nonzeros);
+}
+
+static void initVector(float *vec, uint32_t size)
+{
+ for (uint32_t i = 0; i < size; ++i) {
+ vec[i] = 1.0f;
+ }
+}
+
+#endif
diff --git a/SpMV/include/params.h b/SpMV/include/params.h
new file mode 100644
index 0000000..bf60e79
--- /dev/null
+++ b/SpMV/include/params.h
@@ -0,0 +1,51 @@
+
+#ifndef _PARAMS_H_
+#define _PARAMS_H_
+
+#include "common.h"
+#include "utils.h"
+
+static void usage()
+{
+ PRINT("\nUsage: ./program [options]"
+ "\n"
+ "\nBenchmark-specific options:"
+ "\n -f <F> input matrix file name (default=data/bcsstk30.mtx)"
+ "\n"
+ "\nGeneral options:"
+ "\n -v <V> verbosity" "\n -h help" "\n\n");
+}
+
+typedef struct Params {
+ const char *fileName;
+ unsigned int verbosity;
+} Params;
+
+static struct Params input_params(int argc, char **argv)
+{
+ struct Params p;
+ p.fileName = "data/bcsstk30.mtx";
+ p.verbosity = 1;
+ int opt;
+ while ((opt = getopt(argc, argv, "f:v:h")) >= 0) {
+ switch (opt) {
+ case 'f':
+ p.fileName = optarg;
+ break;
+ case 'v':
+ p.verbosity = atoi(optarg);
+ break;
+ case 'h':
+ usage();
+ exit(0);
+ default:
+ PRINT_ERROR("Unrecognized option!");
+ usage();
+ exit(0);
+ }
+ }
+
+ return p;
+}
+
+#endif
diff --git a/SpMV/include/timer.h b/SpMV/include/timer.h
new file mode 100644
index 0000000..cb513cb
--- /dev/null
+++ b/SpMV/include/timer.h
@@ -0,0 +1,54 @@
+#pragma once
+
+#include <stdio.h>
+#include <sys/time.h>
+
+#if DFATOOL_TIMING
+
+#define dfatool_printf(fmt, ...) do { printf(fmt, __VA_ARGS__); } while (0)
+
+typedef struct Timer {
+ struct timeval startTime;
+ struct timeval endTime;
+} Timer;
+
+static void startTimer(Timer *timer)
+{
+ gettimeofday(&(timer->startTime), NULL);
+}
+
+static void stopTimer(Timer *timer)
+{
+ gettimeofday(&(timer->endTime), NULL);
+}
+
+static double getElapsedTime(Timer timer)
+{
+ return ((double)((timer.endTime.tv_sec - timer.startTime.tv_sec)
+ + (timer.endTime.tv_usec -
+ timer.startTime.tv_usec) / 1.0e6));
+}
+
+#else
+
+#define dfatool_printf(fmt, ...) do {} while (0)
+
+typedef int Timer;
+
+static void startTimer(Timer* timer)
+{
+ (void)timer;
+}
+
+static void stopTimer(Timer* timer)
+{
+ (void)timer;
+}
+
+static double getElapsedTime(Timer timer)
+{
+ (void)timer;
+ return 0.0;
+}
+
+#endif
diff --git a/SpMV/support/utils.h b/SpMV/include/utils.h
index ddb1e2c..ccd8fbd 100644
--- a/SpMV/support/utils.h
+++ b/SpMV/include/utils.h
@@ -8,4 +8,3 @@
#define PRINT(fmt, ...) printf(fmt "\n", ##__VA_ARGS__)
#endif
-
diff --git a/SpMV/run-paper-strong-full.sh b/SpMV/run-paper-strong-full.sh
deleted file mode 100755
index 09b7085..0000000
--- a/SpMV/run-paper-strong-full.sh
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/bin/bash
-
-set -e
-
-(
-
-echo "prim-benchmarks SpMV strong-full (dfatool edition)"
-echo "Started at $(date)"
-echo "Revision $(git describe --always)"
-
-cd data/generate
-./replicate ../bcsstk30.mtx 64 ../bcsstk30.mtx.64.mtx
-cd ../..
-
-# >2048 is not in upstream
-for nr_dpus in 2543 2304 256 512 1024 2048; do
- for nr_tasklets in 1 2 4 8 16; do
- echo
- if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} verbose=1; then
- # repetition is not part of upstream setup
- for i in `seq 1 50`; do
- timeout --foreground -k 1m 3m bin/host_code -v 0 -f data/bcsstk30.mtx.64.mtx || true
- done
- fi
- done
-done
-) | tee log-paper-strong-full.txt
-
-rm -f data/bcsstk30.mtx.64.mtx
diff --git a/SpMV/run-paper-strong-rank.sh b/SpMV/run-paper-strong-rank.sh
deleted file mode 100755
index c73a6a0..0000000
--- a/SpMV/run-paper-strong-rank.sh
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/bin/bash
-
-set -e
-
-(
-
-echo "prim-benchmarks SpMV strong-rank (dfatool edition)"
-echo "Started at $(date)"
-echo "Revision $(git describe --always)"
-
-# >64 are not part of upstream
-for nr_dpus in 128 1 4 16 64; do
- for nr_tasklets in 1 2 4 8 16; do
- echo
- if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} verbose=1; then
- # repetition is not part of upstream setup
- for i in `seq 1 50`; do
- timeout --foreground -k 1m 3m bin/host_code -v 0 || true
- done
- fi
- done
-done
-) | tee log-paper-strong-rank.txt
diff --git a/SpMV/run-paper-weak.sh b/SpMV/run-paper-weak.sh
deleted file mode 100755
index 74683cc..0000000
--- a/SpMV/run-paper-weak.sh
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/bin/bash
-
-set -e
-
-(
-
-echo "prim-benchmarks SpMV weak (dfatool edition)"
-echo "Started at $(date)"
-echo "Revision $(git describe --always)"
-
-for nr_dpus in 1 4 16 64; do
- cd data/generate
- make
- ./replicate ../bcsstk30.mtx ${nr_dpus} /tmp/bcsstk30.mtx.${nr_dpus}.mtx
- cd ../..
- for nr_tasklets in 1 2 4 8 16; do
- echo
- if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} verbose=1; then
- # repetition is not part of upstream setup
- for i in `seq 1 50`; do
- timeout --foreground -k 1m 3m bin/host_code -v 0 -f /tmp/bcsstk30.mtx.${nr_dpus}.mtx || true
- done
- fi
- done
- rm -f /tmp/bcsstk30.mtx.${nr_dpus}.mtx
-done |
-) tee log-paper-weak.txt
diff --git a/SpMV/support/common.h b/SpMV/support/common.h
deleted file mode 100644
index 58fede8..0000000
--- a/SpMV/support/common.h
+++ /dev/null
@@ -1,25 +0,0 @@
-
-/* Common data structures between host and DPUs */
-
-#ifndef _COMMON_H_
-#define _COMMON_H_
-
-#define ROUND_UP_TO_MULTIPLE_OF_2(x) ((((x) + 1)/2)*2)
-#define ROUND_UP_TO_MULTIPLE_OF_8(x) ((((x) + 7)/8)*8)
-
-struct DPUParams {
- uint32_t dpuNumRows; /* Number of rows assigned to the DPU */
- uint32_t dpuRowPtrsOffset; /* Offset of the row pointers */
- uint32_t dpuRowPtrs_m;
- uint32_t dpuNonzeros_m;
- uint32_t dpuInVector_m;
- uint32_t dpuOutVector_m;
-};
-
-struct Nonzero {
- uint32_t col;
- float value;
-};
-
-#endif
-
diff --git a/SpMV/support/matrix.h b/SpMV/support/matrix.h
deleted file mode 100644
index d25da1b..0000000
--- a/SpMV/support/matrix.h
+++ /dev/null
@@ -1,119 +0,0 @@
-
-#ifndef _MATRIX_H_
-#define _MATRIX_H_
-
-#include <assert.h>
-#include <stdio.h>
-
-#include "common.h"
-#include "utils.h"
-
-struct COOMatrix {
- uint32_t numRows;
- uint32_t numCols;
- uint32_t numNonzeros;
- uint32_t* rowIdxs;
- struct Nonzero* nonzeros;
-};
-
-struct CSRMatrix {
- uint32_t numRows;
- uint32_t numCols;
- uint32_t numNonzeros;
- uint32_t* rowPtrs;
- struct Nonzero* nonzeros;
-};
-
-static struct COOMatrix readCOOMatrix(const char* fileName) {
-
- struct COOMatrix cooMatrix;
-
- // Initialize fields
- FILE* fp = fopen(fileName, "r");
- assert(fscanf(fp, "%u", &cooMatrix.numRows));
- if(cooMatrix.numRows%2 == 1) {
- PRINT_WARNING("Reading matrix %s: number of rows must be even. Padding with an extra row.", fileName);
- cooMatrix.numRows++;
- }
- assert(fscanf(fp, "%u", &cooMatrix.numCols));
- assert(fscanf(fp, "%u", &cooMatrix.numNonzeros));
- cooMatrix.rowIdxs = (uint32_t*) malloc(ROUND_UP_TO_MULTIPLE_OF_8(cooMatrix.numNonzeros*sizeof(uint32_t)));
- cooMatrix.nonzeros = (struct Nonzero*) malloc(ROUND_UP_TO_MULTIPLE_OF_8(cooMatrix.numNonzeros*sizeof(struct Nonzero)));
-
- // Read the nonzeros
- for(uint32_t i = 0; i < cooMatrix.numNonzeros; ++i) {
- uint32_t rowIdx;
- assert(fscanf(fp, "%u", &rowIdx));
- cooMatrix.rowIdxs[i] = rowIdx - 1; // File format indexes begin at 1
- uint32_t colIdx;
- assert(fscanf(fp, "%u", &colIdx));
- cooMatrix.nonzeros[i].col = colIdx - 1; // File format indexes begin at 1
- cooMatrix.nonzeros[i].value = 1.0f;
- }
-
- return cooMatrix;
-
-}
-
-static void freeCOOMatrix(struct COOMatrix cooMatrix) {
- free(cooMatrix.rowIdxs);
- free(cooMatrix.nonzeros);
-}
-
-static struct CSRMatrix coo2csr(struct COOMatrix cooMatrix) {
-
- struct CSRMatrix csrMatrix;
-
- // Initialize fields
- csrMatrix.numRows = cooMatrix.numRows;
- csrMatrix.numCols = cooMatrix.numCols;
- csrMatrix.numNonzeros = cooMatrix.numNonzeros;
- csrMatrix.rowPtrs = (uint32_t*) malloc(ROUND_UP_TO_MULTIPLE_OF_8((csrMatrix.numRows + 1)*sizeof(uint32_t)));
- csrMatrix.nonzeros = (struct Nonzero*) malloc(ROUND_UP_TO_MULTIPLE_OF_8(csrMatrix.numNonzeros*sizeof(struct Nonzero)));
-
- // Histogram rowIdxs
- memset(csrMatrix.rowPtrs, 0, (csrMatrix.numRows + 1)*sizeof(uint32_t));
- for(uint32_t i = 0; i < cooMatrix.numNonzeros; ++i) {
- uint32_t rowIdx = cooMatrix.rowIdxs[i];
- csrMatrix.rowPtrs[rowIdx]++;
- }
-
- // Prefix sum rowPtrs
- uint32_t sumBeforeNextRow = 0;
- for(uint32_t rowIdx = 0; rowIdx < csrMatrix.numRows; ++rowIdx) {
- uint32_t sumBeforeRow = sumBeforeNextRow;
- sumBeforeNextRow += csrMatrix.rowPtrs[rowIdx];
- csrMatrix.rowPtrs[rowIdx] = sumBeforeRow;
- }
- csrMatrix.rowPtrs[csrMatrix.numRows] = sumBeforeNextRow;
-
- // Bin the nonzeros
- for(uint32_t i = 0; i < cooMatrix.numNonzeros; ++i) {
- uint32_t rowIdx = cooMatrix.rowIdxs[i];
- uint32_t nnzIdx = csrMatrix.rowPtrs[rowIdx]++;
- csrMatrix.nonzeros[nnzIdx] = cooMatrix.nonzeros[i];
- }
-
- // Restore rowPtrs
- for(uint32_t rowIdx = csrMatrix.numRows - 1; rowIdx > 0; --rowIdx) {
- csrMatrix.rowPtrs[rowIdx] = csrMatrix.rowPtrs[rowIdx - 1];
- }
- csrMatrix.rowPtrs[0] = 0;
-
- return csrMatrix;
-
-}
-
-static void freeCSRMatrix(struct CSRMatrix csrMatrix) {
- free(csrMatrix.rowPtrs);
- free(csrMatrix.nonzeros);
-}
-
-static void initVector(float* vec, uint32_t size) {
- for(uint32_t i = 0; i < size; ++i) {
- vec[i] = 1.0f;
- }
-}
-
-#endif
-
diff --git a/SpMV/support/params.h b/SpMV/support/params.h
deleted file mode 100644
index b4b696c..0000000
--- a/SpMV/support/params.h
+++ /dev/null
@@ -1,46 +0,0 @@
-
-#ifndef _PARAMS_H_
-#define _PARAMS_H_
-
-#include "common.h"
-#include "utils.h"
-
-static void usage() {
- PRINT( "\nUsage: ./program [options]"
- "\n"
- "\nBenchmark-specific options:"
- "\n -f <F> input matrix file name (default=data/bcsstk30.mtx)"
- "\n"
- "\nGeneral options:"
- "\n -v <V> verbosity"
- "\n -h help"
- "\n\n");
-}
-
-typedef struct Params {
- const char* fileName;
- unsigned int verbosity;
-} Params;
-
-static struct Params input_params(int argc, char **argv) {
- struct Params p;
- p.fileName = "data/bcsstk30.mtx";
- p.verbosity = 1;
- int opt;
- while((opt = getopt(argc, argv, "f:v:h")) >= 0) {
- switch(opt) {
- case 'f': p.fileName = optarg; break;
- case 'v': p.verbosity = atoi(optarg); break;
- case 'h': usage(); exit(0);
- default:
- PRINT_ERROR("Unrecognized option!");
- usage();
- exit(0);
- }
- }
-
- return p;
-}
-
-#endif
-
diff --git a/SpMV/support/timer.h b/SpMV/support/timer.h
deleted file mode 100644
index 66e9842..0000000
--- a/SpMV/support/timer.h
+++ /dev/null
@@ -1,27 +0,0 @@
-
-#ifndef _TIMER_H_
-#define _TIMER_H_
-
-#include <stdio.h>
-#include <sys/time.h>
-
-typedef struct Timer {
- struct timeval startTime;
- struct timeval endTime;
-} Timer;
-
-static void startTimer(Timer* timer) {
- gettimeofday(&(timer->startTime), NULL);
-}
-
-static void stopTimer(Timer* timer) {
- gettimeofday(&(timer->endTime), NULL);
-}
-
-static double getElapsedTime(Timer timer) {
- return ((double) ((timer.endTime.tv_sec - timer.startTime.tv_sec)
- + (timer.endTime.tv_usec - timer.startTime.tv_usec)/1.0e6));
-}
-
-#endif
-