blob: fa697bfce833fcabc07a608b93750ff1d155833d (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
|
#!/bin/bash
# Benchmark driver setup: creates the per-host log directories, loads the
# UPMEM SDK environment and defines the problem sizes used by the sweeps below.

# Resolve the host name once; all logs are kept in a per-host directory.
host=$(hostname)
mkdir -p "log/${host}" "baselines/cpu/log/${host}"

# Log file prefix. It is deliberately relative: the script later changes into
# baselines/cpu, where the same prefix resolves to baselines/cpu/log/<host>/...
fn="log/${host}/dimes-hetsim-nmc"

source /opt/upmem/upmem-2024.1.0-Linux-x86_64/upmem_env.sh

# "upstream" dataset:
#   the upstream DPU version uses 2048576 * uint64 ≈ 16 MiB (DPU max: 64 MiB)
#   and 2 queries.
input_size_upstream=2048576
num_queries_upstream=2

# "DPU" dataset used here: 2^22 * uint64 = 32 MiB and 1048576 queries.
input_size_dpu=$(( 2 ** 22 ))
num_queries_dpu=1048576

# NOTE(review): the original note here read "Make sure that num_queries >
# input_size!", yet both datasets above have num_queries < input_size.
# Confirm which relation the benchmark actually requires.
#######################################
# Build and run the NMC (DPU) benchmark for a single configuration.
# Exported so that the GNU parallel sweeps in this script can call it.
# Arguments: name=value pairs, each turned into a local variable:
#   nr_dpus      - forwarded to make as NR_DPUS
#   nr_tasklets  - forwarded to make as NR_TASKLETS
#   numa_rank    - argument for limit_ranks_to_numa_node
#   input_size   - forwarded to make as INPUT_SIZE
#   num_queries  - forwarded to make as PROBLEM_SIZE
# Outputs:   make output on stdout; stdout and stderr of bin/bs_host merged
#            on stdout.
# Returns:   non-zero as soon as any step fails (set -e) or a required
#            parameter is missing.
#######################################
run_benchmark_nmc() {
  local "$@"
  set -e

  # Fail fast on a missing parameter instead of silently building with an
  # empty NR_DPUS=/INPUT_SIZE=/... value.
  : "${nr_dpus:?run_benchmark_nmc: nr_dpus is required}"
  : "${nr_tasklets:?run_benchmark_nmc: nr_tasklets is required}"
  : "${numa_rank:?run_benchmark_nmc: numa_rank is required}"
  : "${input_size:?run_benchmark_nmc: input_size is required}"
  : "${num_queries:?run_benchmark_nmc: num_queries is required}"

  sudo limit_ranks_to_numa_node "${numa_rank}"

  # -B forces a full rebuild: the configuration is baked in at compile time.
  make -B \
    NR_DPUS="${nr_dpus}" NR_TASKLETS="${nr_tasklets}" \
    WITH_ALLOC_OVERHEAD=1 WITH_LOAD_OVERHEAD=1 WITH_FREE_OVERHEAD=1 \
    INPUT_SIZE="${input_size}" PROBLEM_SIZE="${num_queries}"

  bin/bs_host -w 0 -e 100 2>&1
}
export -f run_benchmark_nmc
#######################################
# Run the OpenMP CPU baseline (./bs_omp) for a single configuration.
# Exported so that the GNU parallel sweeps in this script can call it.
# Arguments: name=value pairs, each turned into a local variable:
#   nr_threads   - exported to bs_omp as OMP_NUM_THREADS
#   input_size   - 1st positional argument of bs_omp
#   num_queries  - 2nd positional argument of bs_omp
#   ram, cpu     - 3rd and 4th positional arguments of bs_omp
#   ram_local, cpu_memcpy
#                - optional 5th and 6th positional arguments; left out
#                  entirely when not supplied
#   i            - accepted but unused here (the sweeps pass a repetition
#                  index)
# Outputs:   stdout and stderr of bs_omp merged on stdout.
# Returns:   exit status of bs_omp; non-zero if a required parameter is
#            missing.
#######################################
run_benchmark_baseline() {
  local "$@"
  set -e

  # Fail fast on a missing parameter: an empty unquoted expansion would
  # otherwise shift the remaining positional arguments of bs_omp.
  : "${nr_threads:?run_benchmark_baseline: nr_threads is required}"
  : "${input_size:?run_benchmark_baseline: input_size is required}"
  : "${num_queries:?run_benchmark_baseline: num_queries is required}"
  : "${ram:?run_benchmark_baseline: ram is required}"
  : "${cpu:?run_benchmark_baseline: cpu is required}"

  # ${var:+...} passes the optional arguments only when they are set and
  # non-empty, matching the argument count of the non-memcpy build.
  OMP_NUM_THREADS="${nr_threads}" ./bs_omp \
    "${input_size}" "${num_queries}" "${ram}" "${cpu}" \
    ${ram_local:+"${ram_local}"} ${cpu_memcpy:+"${cpu_memcpy}"} 2>&1
}
export -f run_benchmark_baseline
# NMC (DPU) sweeps. Everything the jobs print on stdout is appended to
# ${fn}.txt; progress labels go to stderr. Each sweep keeps its own joblog so
# that, thanks to --resume, an interrupted run continues where it stopped.
(
  # Run one sweep of run_benchmark_nmc, one job at a time (-j1).
  # Arguments: $1 - sweep number, used in the joblog file name
  #            $2 - num_queries for every job of the sweep
  #            $3 - input_size for every job of the sweep
  #            $4... - GNU parallel input sources (::: name values...)
  nmc_sweep() {
    local step=$1 queries=$2 size=$3
    shift 3
    parallel -j1 --eta --joblog "${fn}.${step}.joblog" --resume --header : \
      run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 numa_rank={numa_rank} \
      num_queries="${queries}" input_size="${size}" \
      "$@"
  }

  # Parameter grids for the sweeps labelled "single-node" and "multi-node".
  single_node=(::: numa_rank 0 1 ::: nr_dpus 64 128 256 512 768 1024)
  multi_node=(::: numa_rank -1 ::: nr_dpus 1536 2048 2304)

  echo "NMC single-node upstream-ref (1/4)" >&2
  nmc_sweep 1 "${num_queries_upstream}" "${input_size_upstream}" "${single_node[@]}"

  echo "NMC multi-node upstream-ref (2/4)" >&2
  nmc_sweep 2 "${num_queries_upstream}" "${input_size_upstream}" "${multi_node[@]}"

  echo "NMC single-node DPU-ref (3/4)" >&2
  nmc_sweep 3 "${num_queries_dpu}" "${input_size_dpu}" "${single_node[@]}"

  echo "NMC multi-node DPU-ref (4/4)" >&2
  nmc_sweep 4 "${num_queries_dpu}" "${input_size_dpu}" "${multi_node[@]}"
) >> "${fn}.txt"
# CPU baseline sweeps. ${fn} is a relative path, so after this cd it refers to
# baselines/cpu/log/<host>/... . Abort if the directory is missing: otherwise
# make and the sweeps below would run in the wrong directory and write into
# the joblogs of the NMC sweeps.
cd baselines/cpu || exit 1
(
  # Build failures abort the remaining sweeps. Without this, every job would
  # fail, be recorded in the joblog, and then be skipped by --resume on the
  # next invocation.
  make -B numa=1 numa_memcpy=1 || exit 1

  # In the two memcpy sweeps, ":::+" links an input source to the preceding
  # one, so ram/cpu_memcpy always take the same value, as do ram_local/cpu.
  echo "CPU single-node upstream-ref with memcpy, copy node == input node (1/6)" >&2
  parallel -j1 --eta --joblog "${fn}.1.joblog" --resume --header : \
    run_benchmark_baseline i={i} nr_threads={nr_threads} ram={ram} cpu={cpu} \
    ram_local={ram_local} cpu_memcpy={cpu_memcpy} \
    num_queries="${num_queries_upstream}" input_size="${input_size_upstream}" \
    ::: i {1..20} \
    ::: ram 0 1 \
    :::+ cpu_memcpy 0 1 \
    ::: ram_local 0 1 \
    :::+ cpu 0 1 \
    ::: nr_threads 1 2 4 8 12 16

  echo "CPU single-node dpu-ref with memcpy, copy node == input node (2/6)" >&2
  parallel -j1 --eta --joblog "${fn}.2.joblog" --resume --header : \
    run_benchmark_baseline i={i} nr_threads={nr_threads} ram={ram} cpu={cpu} \
    ram_local={ram_local} cpu_memcpy={cpu_memcpy} \
    num_queries="${num_queries_dpu}" input_size="${input_size_dpu}" \
    ::: i {1..20} \
    ::: ram 0 1 \
    :::+ cpu_memcpy 0 1 \
    ::: ram_local 0 1 \
    :::+ cpu 0 1 \
    ::: nr_threads 1 2 4 8 12 16

  # Rebuild without numa_memcpy for the remaining sweeps; these no longer
  # pass ram_local/cpu_memcpy to the benchmark.
  make -B numa=1 || exit 1

  echo "CPU single-node upstream-ref (3/6)" >&2
  parallel -j1 --eta --joblog "${fn}.3.joblog" --resume --header : \
    run_benchmark_baseline i={i} nr_threads={nr_threads} ram={ram} cpu={cpu} \
    num_queries="${num_queries_upstream}" input_size="${input_size_upstream}" \
    ::: i {1..20} \
    ::: cpu 0 1 \
    ::: ram 0 1 \
    ::: nr_threads 1 2 4 8 12 16

  echo "CPU single-node DPU-ref (4/6)" >&2
  parallel -j1 --eta --joblog "${fn}.4.joblog" --resume --header : \
    run_benchmark_baseline i={i} nr_threads={nr_threads} ram={ram} cpu={cpu} \
    num_queries="${num_queries_dpu}" input_size="${input_size_dpu}" \
    ::: i {1..20} \
    ::: cpu 0 1 \
    ::: ram 0 1 \
    ::: nr_threads 1 2 4 8 12 16

  echo "CPU multi-node upstream-ref (5/6)" >&2
  parallel -j1 --eta --joblog "${fn}.5.joblog" --resume --header : \
    run_benchmark_baseline i={i} nr_threads={nr_threads} ram={ram} cpu={cpu} \
    num_queries="${num_queries_upstream}" input_size="${input_size_upstream}" \
    ::: i {1..20} \
    ::: cpu -1 \
    ::: ram 0 1 \
    ::: nr_threads 24 32

  echo "CPU multi-node DPU-ref (6/6)" >&2
  parallel -j1 --eta --joblog "${fn}.6.joblog" --resume --header : \
    run_benchmark_baseline i={i} nr_threads={nr_threads} ram={ram} cpu={cpu} \
    num_queries="${num_queries_dpu}" input_size="${input_size_dpu}" \
    ::: i {1..20} \
    ::: cpu -1 \
    ::: ram 0 1 \
    ::: nr_threads 24 32
) >> "${fn}.txt"
|