summary | refs | log | tree | commit | diff
path: root/SpMV
diff options
context:
space:
mode:
author: Birte Kristina Friesel <birte.friesel@uos.de> 2025-05-23 16:28:17 +0200
committer: Birte Kristina Friesel <birte.friesel@uos.de> 2025-05-23 16:28:35 +0200
commit: fa6c70a44fc56cc50370e57c460dd61e8f127b51 (patch)
tree: 91269761966dccea80a2931542db5a3648f66e18 /SpMV
parent: 2e3a43c12df8115fc859248adb14b87e08becb77 (diff)
SpMV: Add AspectC++ support
Diffstat (limited to 'SpMV')
-rw-r--r-- SpMV/Makefile 50
-rwxr-xr-x SpMV/benchmark-scripts/ccmcc25-sim.sh 35
-rw-r--r-- SpMV/dpu/task.c 2
-rw-r--r-- SpMV/host/app.c 79
-rw-r--r-- SpMV/host/mram-management.h 26
-rw-r--r-- SpMV/include/common.h (renamed from SpMV/support/common.h) 0
-rw-r--r-- SpMV/include/dfatool_host.ah 31
-rw-r--r-- SpMV/include/matrix.h (renamed from SpMV/support/matrix.h) 0
-rw-r--r-- SpMV/include/params.h (renamed from SpMV/support/params.h) 0
-rw-r--r-- SpMV/include/timer.h (renamed from SpMV/support/timer.h) 30
-rw-r--r-- SpMV/include/utils.h (renamed from SpMV/support/utils.h) 0
11 files changed, 170 insertions, 83 deletions
diff --git a/SpMV/Makefile b/SpMV/Makefile
index 0e7a70c..c2d9d50 100644
--- a/SpMV/Makefile
+++ b/SpMV/Makefile
@@ -1,21 +1,31 @@
NR_TASKLETS ?= 16
NR_DPUS ?= 1
-COMMON_INCLUDES := support
-HOST_SOURCES := $(wildcard host/*.c)
-DPU_SOURCES := $(wildcard dpu/*.c)
-CPU_BASE_SOURCES := $(wildcard baselines/cpu/*.c)
-GPU_BASE_SOURCES := $(wildcard baselines/gpu/*.cu)
-
-COMMON_FLAGS := -Wall -Wextra -g -I${COMMON_INCLUDES}
-HOST_FLAGS := ${COMMON_FLAGS} -std=c11 -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS}
+aspectc ?= 0
+aspectc_timing ?= 0
+dfatool_timing ?= 1
+
+HOST_CC := ${CC}
+
+COMMON_FLAGS := -Wall -Wextra -g -Iinclude
+HOST_FLAGS := ${COMMON_FLAGS} -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DASPECTC=${aspectc} -DDFATOOL_TIMING=${dfatool_timing}
DPU_FLAGS := ${COMMON_FLAGS} -O2 -DNR_TASKLETS=${NR_TASKLETS}
-CPU_BASE_FLAGS := -O3 -fopenmp
-GPU_BASE_FLAGS := -O3
+
+ifeq (${aspectc_timing}, 1)
+ ASPECTC_HOST_FLAGS += -ainclude/dfatool_host_dpu.ah -ainclude/dfatool_host.ah
+endif
+
+ASPECTC_HOST_FLAGS ?= -a0
+
+ifeq (${aspectc}, 1)
+ HOST_CC = ag++ -r repo.acp -v 0 ${ASPECTC_HOST_FLAGS} --c_compiler ${UPMEM_HOME}/bin/clang++ -p . --Xcompiler
+else
+ HOST_FLAGS += -std=c11
+endif
QUIET = @
-ifdef verbose
+ifeq (${verbose}, 1)
QUIET =
endif
@@ -24,19 +34,13 @@ all: bin/host_code bin/dpu_code
bin:
${QUIET}mkdir -p bin
-gpu: bin/gpu_baseline
-
-bin/host_code: ${HOST_SOURCES} ${COMMON_INCLUDES} bin
- ${QUIET}${CC} -o $@ ${HOST_SOURCES} ${HOST_FLAGS}
-
-bin/dpu_code: ${DPU_SOURCES} ${COMMON_INCLUDES} bin
- ${QUIET}dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ ${DPU_SOURCES}
-
-bin/cpu_baseline: ${CPU_BASE_SOURCES}
- ${QUIET}${CC} -o $@ ${CPU_BASE_SOURCES} ${CPU_BASE_FLAGS}
+bin/host_code: host/app.c include bin
+ ${QUIET}cp ../include/dfatool_host_dpu.ah include
+ ${QUIET}${HOST_CC} -o $@ host/app.c ${HOST_FLAGS}
+ ${QUIET}rm -f include/dfatool_host_dpu.ah
-bin/gpu_baseline: ${GPU_BASE_SOURCES}
- ${QUIET}nvcc -o $@ ${GPU_BASE_SOURCES} ${GPU_BASE_FLAGS}
+bin/dpu_code: dpu/task.c include bin
+ ${QUIET}dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ dpu/task.c
clean:
${QUIET}rm -rf bin
diff --git a/SpMV/benchmark-scripts/ccmcc25-sim.sh b/SpMV/benchmark-scripts/ccmcc25-sim.sh
new file mode 100755
index 0000000..dcf6f9f
--- /dev/null
+++ b/SpMV/benchmark-scripts/ccmcc25-sim.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+
+mkdir -p log/$(hostname)
+
+run_benchmark_nmc() { # rebuild the binaries for one configuration, then run the host program on one matrix
+ local "$@" # turn the name=value arguments (nr_dpus, nr_tasklets, data) into local variables
+ set -e # abort at the first failing command, e.g. a failed build
+ make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} \
+ aspectc=1 aspectc_timing=1 dfatool_timing=0
+ bin/host_code -v 0 -f data/${data} 2>&1 # merge stderr into stdout so a single redirection captures both
+}
+
+export -f run_benchmark_nmc
+
+cd data/generate
+for i in 4 8 16; do
+ ./replicate ../bcsstk30.mtx $i ../bcsstk30.${i}.mtx
+done
+cd ../..
+
+
+fn=log/$(hostname)/ccmcc25-sdk${sdk}-sim
+
+source ~/lib/local/upmem/upmem-2025.1.0-Linux-x86_64/upmem_env.sh simulator
+
+echo "prim-benchmarks BFS $(git describe --all --long) $(git rev-parse HEAD) $(date -R)" >> ${fn}.txt
+
+# BFS does not support repeated kernel invocations → repeat it here
+parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \
+ run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 data={data} \
+ ::: data bcsstk30.mtx bcsstk30.4.mtx bcsstk30.8.mtx bcsstk30.16.mtx \
+ ::: nr_dpus 1 2 4 8 16 32 48 64 \
+>> ${fn}.txt
+
+rm -f data/bcsstk30.*.mtx
diff --git a/SpMV/dpu/task.c b/SpMV/dpu/task.c
index 501a62a..305a645 100644
--- a/SpMV/dpu/task.c
+++ b/SpMV/dpu/task.c
@@ -11,7 +11,7 @@
#include <perfcounter.h>
#include <seqread.h>
-#include "../support/common.h"
+#include "common.h"
#define PRINT_ERROR(fmt, ...) printf("\033[0;31mERROR:\033[0m "fmt"\n", ##__VA_ARGS__)
diff --git a/SpMV/host/app.c b/SpMV/host/app.c
index ffccb70..6cf2861 100644
--- a/SpMV/host/app.c
+++ b/SpMV/host/app.c
@@ -3,9 +3,24 @@
* SpMV Host Application Source File
*
*/
+#if ASPECTC
+extern "C" {
+#endif
+
#include <dpu.h>
#include <dpu_log.h>
+#ifndef ENERGY
+#define ENERGY 0
+#endif
+#if ENERGY
+#include <dpu_probe.h>
+#endif
+
+#if ASPECTC
+}
+#endif
+
#include <assert.h>
#include <getopt.h>
#include <stdio.h>
@@ -14,24 +29,17 @@
#include <unistd.h>
#include "mram-management.h"
-#include "../support/common.h"
-#include "../support/matrix.h"
-#include "../support/params.h"
-#include "../support/timer.h"
-#include "../support/utils.h"
+#include "common.h"
+#include "matrix.h"
+#include "params.h"
+#include "timer.h"
+#include "utils.h"
#define DPU_BINARY "./bin/dpu_code"
#define XSTR(x) STR(x)
#define STR(x) #x
-#ifndef ENERGY
-#define ENERGY 0
-#endif
-#if ENERGY
-#include <dpu_probe.h>
-#endif
-
// Main of the Host Application
int main(int argc, char **argv)
{
@@ -78,10 +86,10 @@ int main(int argc, char **argv)
uint32_t *rowPtrs = csrMatrix.rowPtrs;
struct Nonzero *nonzeros = csrMatrix.nonzeros;
float *inVector =
- malloc(ROUND_UP_TO_MULTIPLE_OF_8(numCols * sizeof(float)));
+ (float*)malloc(ROUND_UP_TO_MULTIPLE_OF_8(numCols * sizeof(float)));
initVector(inVector, numCols);
float *outVector =
- malloc(ROUND_UP_TO_MULTIPLE_OF_8(numRows * sizeof(float)));
+ (float*)malloc(ROUND_UP_TO_MULTIPLE_OF_8(numRows * sizeof(float)));
// Partition data structure across DPUs
uint32_t numRowsPerDPU =
@@ -158,22 +166,25 @@ int main(int argc, char **argv)
PRINT_INFO(p.verbosity >= 2,
" Copying data to DPU");
startTimer(&timer);
- copyToDPU(dpu, (uint8_t *) dpuRowPtrs_h, dpuRowPtrs_m,
- (dpuNumRows + 1) * sizeof(uint32_t));
- copyToDPU(dpu, (uint8_t *) dpuNonzeros_h, dpuNonzeros_m,
- dpuNumNonzeros * sizeof(struct Nonzero));
- copyToDPU(dpu, (uint8_t *) inVector, dpuInVector_m,
- numCols * sizeof(float));
+ DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME,
+ dpuRowPtrs_m, (uint8_t *) dpuRowPtrs_h,
+ ROUND_UP_TO_MULTIPLE_OF_8((dpuNumRows + 1) * sizeof(uint32_t))));
+ DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME,
+ dpuNonzeros_m, (uint8_t *) dpuNonzeros_h,
+ ROUND_UP_TO_MULTIPLE_OF_8(dpuNumNonzeros * sizeof(struct Nonzero))));
+ DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME,
+ dpuInVector_m, (uint8_t *) inVector,
+ ROUND_UP_TO_MULTIPLE_OF_8(numCols * sizeof(float))));
stopTimer(&timer);
writeTime += getElapsedTime(timer);
-
}
// Send parameters to DPU
PRINT_INFO(p.verbosity >= 2,
" Copying parameters to DPU");
startTimer(&timer);
- copyToDPU(dpu, (uint8_t *) & dpuParams[dpuIdx], dpuParams_m,
- sizeof(struct DPUParams));
+ DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME,
+ dpuParams_m, (uint8_t *) & dpuParams[dpuIdx],
+ ROUND_UP_TO_MULTIPLE_OF_8(sizeof(struct DPUParams))));
stopTimer(&timer);
writeTime += getElapsedTime(timer);
@@ -204,13 +215,15 @@ int main(int argc, char **argv)
PRINT_INFO(p.verbosity >= 1, "Copying back the result");
startTimer(&timer);
dpuIdx = 0;
+
DPU_FOREACH(dpu_set, dpu) {
unsigned int dpuNumRows = dpuParams[dpuIdx].dpuNumRows;
if (dpuNumRows > 0) {
uint32_t dpuStartRowIdx = dpuIdx * numRowsPerDPU;
- copyFromDPU(dpu, dpuParams[dpuIdx].dpuOutVector_m,
- (uint8_t *) (outVector + dpuStartRowIdx),
- dpuNumRows * sizeof(float));
+ DPU_ASSERT(dpu_copy_from(dpu, DPU_MRAM_HEAP_POINTER_NAME,
+ dpuParams[dpuIdx].dpuOutVector_m,
+ (uint8_t *) (outVector + dpuStartRowIdx),
+ ROUND_UP_TO_MULTIPLE_OF_8(dpuNumRows * sizeof(float))));
}
++dpuIdx;
}
@@ -220,7 +233,7 @@ int main(int argc, char **argv)
// Calculating result on CPU
PRINT_INFO(p.verbosity >= 1, "Calculating result on CPU");
- float *outVectorReference = malloc(numRows * sizeof(float));
+ float *outVectorReference = (float*)malloc(numRows * sizeof(float));
for (uint32_t rowIdx = 0; rowIdx < numRows; ++rowIdx) {
float sum = 0.0f;
for (uint32_t i = rowPtrs[rowIdx]; i < rowPtrs[rowIdx + 1]; ++i) {
@@ -254,22 +267,22 @@ int main(int argc, char **argv)
freeTime += getElapsedTime(timer);
if (status) {
- printf
+ dfatool_printf
("[::] SpMV UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s n_elements=%d ",
numDPUs, numRanks, NR_TASKLETS, "float",
csrMatrix.numNonzeros);
- printf
+ dfatool_printf
("| latency_alloc_us=%f latency_load_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f",
allocTime, loadTime, writeTime, dpuTime, readTime,
freeTime);
- printf
+ dfatool_printf
(" throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f",
// coomatrix / csrmatrix use uint32_t indexes and float values, so all 32bit
csrMatrix.numNonzeros * sizeof(float) / (dpuTime * 1e6),
csrMatrix.numNonzeros * sizeof(float) /
((allocTime + loadTime + writeTime + dpuTime + readTime +
freeTime) * 1e6));
- printf
+ dfatool_printf
(" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f",
csrMatrix.numNonzeros * sizeof(float) /
((writeTime + dpuTime + readTime) * 1e6),
@@ -278,14 +291,14 @@ int main(int argc, char **argv)
csrMatrix.numNonzeros * sizeof(float) /
((allocTime + loadTime + writeTime + dpuTime +
readTime) * 1e6));
- printf
+ dfatool_printf
(" throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f",
// coomatrix / csrmatrix use uint32_t indexes and float values, so all 32bit
csrMatrix.numNonzeros / (dpuTime * 1e6),
csrMatrix.numNonzeros /
((allocTime + loadTime + writeTime + dpuTime + readTime +
freeTime) * 1e6));
- printf
+ dfatool_printf
(" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f\n",
csrMatrix.numNonzeros / ((writeTime + dpuTime + readTime) *
1e6),
diff --git a/SpMV/host/mram-management.h b/SpMV/host/mram-management.h
index f2ee031..a953d6a 100644
--- a/SpMV/host/mram-management.h
+++ b/SpMV/host/mram-management.h
@@ -1,9 +1,7 @@
+#pragma once
-#ifndef _MRAM_MANAGEMENT_H_
-#define _MRAM_MANAGEMENT_H_
-
-#include "../support/common.h"
-#include "../support/utils.h"
+#include "common.h"
+#include "utils.h"
#define DPU_CAPACITY (64 << 20) // A DPU's capacity is 64 MiB
@@ -29,21 +27,3 @@ static uint32_t mram_heap_alloc(struct mram_heap_allocator_t *allocator,
}
return ret;
}
-
-static void copyToDPU(struct dpu_set_t dpu, uint8_t *hostPtr, uint32_t mramIdx,
- uint32_t size)
-{
- DPU_ASSERT(dpu_copy_to
- (dpu, DPU_MRAM_HEAP_POINTER_NAME, mramIdx, hostPtr,
- ROUND_UP_TO_MULTIPLE_OF_8(size)));
-}
-
-static void copyFromDPU(struct dpu_set_t dpu, uint32_t mramIdx,
- uint8_t *hostPtr, uint32_t size)
-{
- DPU_ASSERT(dpu_copy_from
- (dpu, DPU_MRAM_HEAP_POINTER_NAME, mramIdx, hostPtr,
- ROUND_UP_TO_MULTIPLE_OF_8(size)));
-}
-
-#endif
diff --git a/SpMV/support/common.h b/SpMV/include/common.h
index 6118814..6118814 100644
--- a/SpMV/support/common.h
+++ b/SpMV/include/common.h
diff --git a/SpMV/include/dfatool_host.ah b/SpMV/include/dfatool_host.ah
new file mode 100644
index 0000000..91d44bd
--- /dev/null
+++ b/SpMV/include/dfatool_host.ah
@@ -0,0 +1,31 @@
+#pragma once
+
+#include <sys/time.h>
+#include "dfatool_host_dpu.ah"
+
+aspect DfatoolHostTiming : public DfatoolHostDPUTiming { // SpMV-specific logging aspect; base aspect presumably declared in dfatool_host_dpu.ah — confirm
+ unsigned long n_rows, n_cols, n_nonzero; // matrix dimensions, filled in by the readCOOMatrix advice below
+ unsigned int element_size; // set to sizeof(float) by the constructor
+
+ virtual int getKernel() { return 1; } // NOTE(review): presumably overrides a hook of the base aspect — confirm in dfatool_host_dpu.ah
+
+ DfatoolHostTiming() {
+ element_size = sizeof(float);
+ }
+
+ advice call("% input_params(...)"): after() { // runs after each call to input_params
+ printf("[>>] SpMV | n_dpus=%u\n", NR_DPUS);
+ }
+
+ advice call("% readCOOMatrix(...)") : after() { // runs after each call to readCOOMatrix
+ struct COOMatrix* c = tjp->result(); // pointer to the value returned by the intercepted call
+ n_rows = c->numRows;
+ n_cols = c->numCols;
+ n_nonzero = c->numNonzeros;
+ printf("[--] SpMV | n_dpus=%u n_rows=%lu n_cols=%lu n_nonzero=%lu\n", NR_DPUS, n_rows, n_cols, n_nonzero);
+ }
+
+ advice execution("% main(...)") : after() { // runs after main has executed; reports the dimensions recorded above
+ printf("[<<] SpMV | n_dpus=%u n_rows=%lu n_cols=%lu n_nonzero=%lu\n", NR_DPUS, n_rows, n_cols, n_nonzero);
+ }
+};
diff --git a/SpMV/support/matrix.h b/SpMV/include/matrix.h
index ce8745e..ce8745e 100644
--- a/SpMV/support/matrix.h
+++ b/SpMV/include/matrix.h
diff --git a/SpMV/support/params.h b/SpMV/include/params.h
index bf60e79..bf60e79 100644
--- a/SpMV/support/params.h
+++ b/SpMV/include/params.h
diff --git a/SpMV/support/timer.h b/SpMV/include/timer.h
index 7367b11..cb513cb 100644
--- a/SpMV/support/timer.h
+++ b/SpMV/include/timer.h
@@ -1,10 +1,12 @@
-
-#ifndef _TIMER_H_
-#define _TIMER_H_
+#pragma once
#include <stdio.h>
#include <sys/time.h>
+#if DFATOOL_TIMING
+
+#define dfatool_printf(...) do { printf(__VA_ARGS__); } while (0) // forwards all arguments to printf; a format string alone is also valid
+
typedef struct Timer {
struct timeval startTime;
struct timeval endTime;
@@ -27,4 +29,26 @@ static double getElapsedTime(Timer timer)
timer.startTime.tv_usec) / 1.0e6));
}
+#else
+
+#define dfatool_printf(...) do {} while (0) // output disabled: arguments are discarded without being evaluated
+
+typedef int Timer; // placeholder type so that existing Timer declarations still compile
+
+static void startTimer(Timer* timer) // no-op variant used when DFATOOL_TIMING is 0
+{
+ (void)timer; // mark the parameter as intentionally unused
+}
+
+static void stopTimer(Timer* timer) // no-op variant used when DFATOOL_TIMING is 0
+{
+ (void)timer; // mark the parameter as intentionally unused
+}
+
+static double getElapsedTime(Timer timer) // always reports 0.0 when DFATOOL_TIMING is 0
+{
+ (void)timer; // mark the parameter as intentionally unused
+ return 0.0;
+}
+
#endif
diff --git a/SpMV/support/utils.h b/SpMV/include/utils.h
index ccd8fbd..ccd8fbd 100644
--- a/SpMV/support/utils.h
+++ b/SpMV/include/utils.h