From 3de4b495fb176eba9a0eb517a4ce05903cb67acb Mon Sep 17 00:00:00 2001 From: Juan Gomez Luna Date: Wed, 16 Jun 2021 19:46:05 +0200 Subject: PrIM -- first commit --- TRNS/Makefile | 44 +++++ TRNS/baselines/cpu/Makefile | 49 ++++++ TRNS/baselines/cpu/README | 16 ++ TRNS/baselines/cpu/kernel.cpp | 131 ++++++++++++++ TRNS/baselines/cpu/kernel.h | 41 +++++ TRNS/baselines/cpu/main.cpp | 219 +++++++++++++++++++++++ TRNS/baselines/cpu/support/common.h | 53 ++++++ TRNS/baselines/cpu/support/setup.h | 50 ++++++ TRNS/baselines/cpu/support/timer.h | 63 +++++++ TRNS/baselines/cpu/support/verify.h | 72 ++++++++ TRNS/baselines/gpu/Makefile | 51 ++++++ TRNS/baselines/gpu/README | 16 ++ TRNS/baselines/gpu/kernel.cu | 170 ++++++++++++++++++ TRNS/baselines/gpu/kernel.h | 44 +++++ TRNS/baselines/gpu/main.cpp | 298 ++++++++++++++++++++++++++++++++ TRNS/baselines/gpu/support/common.h | 53 ++++++ TRNS/baselines/gpu/support/cuda-setup.h | 78 +++++++++ TRNS/baselines/gpu/support/timer.h | 73 ++++++++ TRNS/baselines/gpu/support/verify.h | 71 ++++++++ TRNS/dpu/task.c | 199 +++++++++++++++++++++ TRNS/host/app.c | 281 ++++++++++++++++++++++++++++++ TRNS/support/common.h | 40 +++++ TRNS/support/params.h | 68 ++++++++ TRNS/support/timer.h | 59 +++++++ 24 files changed, 2239 insertions(+) create mode 100644 TRNS/Makefile create mode 100644 TRNS/baselines/cpu/Makefile create mode 100644 TRNS/baselines/cpu/README create mode 100644 TRNS/baselines/cpu/kernel.cpp create mode 100644 TRNS/baselines/cpu/kernel.h create mode 100644 TRNS/baselines/cpu/main.cpp create mode 100644 TRNS/baselines/cpu/support/common.h create mode 100644 TRNS/baselines/cpu/support/setup.h create mode 100644 TRNS/baselines/cpu/support/timer.h create mode 100644 TRNS/baselines/cpu/support/verify.h create mode 100644 TRNS/baselines/gpu/Makefile create mode 100644 TRNS/baselines/gpu/README create mode 100644 TRNS/baselines/gpu/kernel.cu create mode 100644 TRNS/baselines/gpu/kernel.h create mode 100644 TRNS/baselines/gpu/main.cpp create mode 100644 TRNS/baselines/gpu/support/common.h create mode 100644 TRNS/baselines/gpu/support/cuda-setup.h create mode 100644 TRNS/baselines/gpu/support/timer.h create mode 100644 TRNS/baselines/gpu/support/verify.h create mode 100644 TRNS/dpu/task.c create mode 100644 TRNS/host/app.c create mode 100755 TRNS/support/common.h create mode 100644 TRNS/support/params.h create mode 100755 TRNS/support/timer.h (limited to 'TRNS') diff --git a/TRNS/Makefile b/TRNS/Makefile new file mode 100644 index 0000000..8a6fd64 --- /dev/null +++ b/TRNS/Makefile @@ -0,0 +1,44 @@ +DPU_DIR := dpu +HOST_DIR := host +BUILDDIR ?= bin +NR_DPUS ?= 1 +NR_TASKLETS ?= 16 +ENERGY ?= 0 + +define conf_filename + ${BUILDDIR}/.NR_DPUS_$(1)_NR_TASKLETS_$(2).conf +endef +CONF := $(call conf_filename,${NR_DPUS},${NR_TASKLETS}) + +HOST_TARGET := ${BUILDDIR}/host_code +DPU_TARGET := ${BUILDDIR}/dpu_code + +COMMON_INCLUDES := support +HOST_SOURCES := $(wildcard ${HOST_DIR}/*.c) +DPU_SOURCES := $(wildcard ${DPU_DIR}/*.c) + +.PHONY: all clean test + +__dirs := $(shell mkdir -p ${BUILDDIR}) + +COMMON_FLAGS := -Wall -Wextra -g -I${COMMON_INCLUDES} +HOST_FLAGS := ${COMMON_FLAGS} -std=c11 -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DENERGY=${ENERGY} +DPU_FLAGS := ${COMMON_FLAGS} -O2 -DNR_TASKLETS=${NR_TASKLETS} + +all: ${HOST_TARGET} ${DPU_TARGET} + +${CONF}: + $(RM) $(call conf_filename,*,*) + touch ${CONF} + +${HOST_TARGET}: ${HOST_SOURCES} ${COMMON_INCLUDES} ${CONF} + $(CC) -o $@ ${HOST_SOURCES} ${HOST_FLAGS} + +${DPU_TARGET}: ${DPU_SOURCES} ${COMMON_INCLUDES} ${CONF} + dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ ${DPU_SOURCES} + +clean: + $(RM) -r $(BUILDDIR) + +test: all + ./${HOST_TARGET} diff --git a/TRNS/baselines/cpu/Makefile b/TRNS/baselines/cpu/Makefile new file mode 100644 index 0000000..cb2e264 --- /dev/null +++ b/TRNS/baselines/cpu/Makefile @@ -0,0 +1,49 @@ +# +# Copyright (c) 2016 University of Cordoba and University of Illinois +# All rights reserved. +# +# Developed by: IMPACT Research Group +# University of Cordoba and University of Illinois +# http://impact.crhc.illinois.edu/ +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# with the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +# sell copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# > Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimers. +# > Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimers in the +# documentation and/or other materials provided with the distribution. +# > Neither the names of IMPACT Research Group, University of Cordoba, +# University of Illinois nor the names of its contributors may be used +# to endorse or promote products derived from this Software without +# specific prior written permission. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH +# THE SOFTWARE. +# + +CXX=g++ +CXX_FLAGS=-std=c++11 + +LIB=-L/usr/lib/ -lm -pthread + +DEP=kernel.cpp kernel.h main.cpp support/common.h support/setup.h support/timer.h +SRC=main.cpp kernel.cpp +EXE=trns + +all: + $(CXX) $(CXX_FLAGS) $(SRC) $(LIB) -o $(EXE) + +clean: + rm -f $(EXE) + diff --git a/TRNS/baselines/cpu/README b/TRNS/baselines/cpu/README new file mode 100644 index 0000000..eec9460 --- /dev/null +++ b/TRNS/baselines/cpu/README @@ -0,0 +1,16 @@ +In-place matrix transposition (TRNS) + +Compilation instructions + + make + +Execution instructions + + ./trns -w 0 -r 1 -m 16 -n 8 -o 4096 -p 2556 + +For more options + + ./trns -h + +Read more +J. Gomez-Luna et al., “In-place Matrix Transposition on GPUs,” IEEE TPDS, 2016. diff --git a/TRNS/baselines/cpu/kernel.cpp b/TRNS/baselines/cpu/kernel.cpp new file mode 100644 index 0000000..cd51953 --- /dev/null +++ b/TRNS/baselines/cpu/kernel.cpp @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2016 University of Cordoba and University of Illinois + * All rights reserved. + * + * Developed by: IMPACT Research Group + * University of Cordoba and University of Illinois + * http://impact.crhc.illinois.edu/ + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * with the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * > Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. + * > Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in the + * documentation and/or other materials provided with the distribution. + * > Neither the names of IMPACT Research Group, University of Cordoba, + * University of Illinois nor the names of its contributors may be used + * to endorse or promote products derived from this Software without + * specific prior written permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH + * THE SOFTWARE. + * + */ + +#include "kernel.h" +#include +#include +#include +#include + +// CPU threads----------------------------------------------------------------- +void run_cpu_threads_100(T *input, std::atomic_int *finished, std::atomic_int *head, int A, int B, int b, int threads) { +///////////////// Run CPU worker threads ///////////////////////////////// +#if PRINT + printf("Starting %d CPU threads\n", threads); +#endif + + std::vector cpu_threads; + for(int i = 0; i < threads; i++) { + + cpu_threads.push_back(std::thread([=]() { + + T data[b]; + T backup[b]; + int done; + int m = A * B - 1; + // Dynamic fetch + int gid = (head)->fetch_add(1); + + while(gid < m) { + int next_in_cycle = (gid * A) - m * (gid / B); + if(next_in_cycle == gid) { + // Dynamic fetch + gid = (head)->fetch_add(1); + continue; + } + for(int i = 0; i < b; i++) { + data[i] = input[gid * b + i]; + } + //make sure the read is not cached + done = (finished + gid)->load(); + for(; done == 0; next_in_cycle = (next_in_cycle * A) - m * (next_in_cycle / B)) { + for(int i = 0; i < b; i++) { + backup[i] = input[next_in_cycle * b + i]; + } + done = (finished + next_in_cycle)->exchange(1); + if(!done) { + for(int i = 0; i < b; i++) { + input[next_in_cycle * b + i] = data[i]; + } + } + for(int i = 0; i < b; i++) { + data[i] = backup[i]; + } + } + // Dynamic fetch + gid = (head)->fetch_add(1); + } + })); + } + + std::for_each(cpu_threads.begin(), cpu_threads.end(), [](std::thread &t) { t.join(); }); +} + + +// CPU threads----------------------------------------------------------------- +void run_cpu_threads_010(T *input, std::atomic_int* head, int a, int b, int tiles, int threads) { +///////////////// Run CPU worker threads ///////////////////////////////// +#if PRINT + printf("Starting %d CPU threads\n", threads); +#endif + + std::vector cpu_threads; + for(int i = 0; i < threads; i++) { + + cpu_threads.push_back(std::thread([=]() { + + T tile[a * b]; + int m = a * b - 1; + + // Dynamic fetch + int gid = (head)->fetch_add(1); + + while(gid < tiles) { + T* input_array = input + a * b * gid; + for (int j = 0; j < a * b; j++) { + int next = (j * a)-m*(j/b); + tile[next] = input_array[j]; + } + for (int j = 0; j < a * b; j++) { + input_array[j] = tile[j]; + } + // Dynamic fetch + gid = (head)->fetch_add(1); + } + })); + } + + std::for_each(cpu_threads.begin(), cpu_threads.end(), [](std::thread &t) { t.join(); }); +} diff --git a/TRNS/baselines/cpu/kernel.h b/TRNS/baselines/cpu/kernel.h new file mode 100644 index 0000000..b0eec1f --- /dev/null +++ b/TRNS/baselines/cpu/kernel.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2016 University of Cordoba and University of Illinois + * All rights reserved. + * + * Developed by: IMPACT Research Group + * University of Cordoba and University of Illinois + * http://impact.crhc.illinois.edu/ + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * with the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * > Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. + * > Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in the + * documentation and/or other materials provided with the distribution. + * > Neither the names of IMPACT Research Group, University of Cordoba, + * University of Illinois nor the names of its contributors may be used + * to endorse or promote products derived from this Software without + * specific prior written permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH + * THE SOFTWARE. + * + */ + +#include +#include +#include "support/common.h" + +void run_cpu_threads_100(T *input, std::atomic_int *finished, std::atomic_int *head, int A, int B, int b, int threads); +void run_cpu_threads_010(T *input, std::atomic_int *head, int m, int n, int tiles, int threads); diff --git a/TRNS/baselines/cpu/main.cpp b/TRNS/baselines/cpu/main.cpp new file mode 100644 index 0000000..2c3bdc3 --- /dev/null +++ b/TRNS/baselines/cpu/main.cpp @@ -0,0 +1,219 @@ +/* + * Copyright (c) 2016 University of Cordoba and University of Illinois + * All rights reserved. + * + * Developed by: IMPACT Research Group + * University of Cordoba and University of Illinois + * http://impact.crhc.illinois.edu/ + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * with the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * > Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. + * > Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in the + * documentation and/or other materials provided with the distribution. + * > Neither the names of IMPACT Research Group, University of Cordoba, + * University of Illinois nor the names of its contributors may be used + * to endorse or promote products derived from this Software without + * specific prior written permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH + * THE SOFTWARE. + * + */ + +#include "support/setup.h" +#include "kernel.h" +#include "support/common.h" +#include "support/timer.h" +#include "support/verify.h" + +#include +#include +#include +#include + +// Params --------------------------------------------------------------------- +struct Params { + + int n_threads; + int n_warmup; + int n_reps; + int M_; + int m; + int N_; + int n; + + Params(int argc, char **argv) { + n_threads = 4; + n_warmup = 5; + n_reps = 50; + M_ = 128; + m = 16; + N_ = 128; + n = 8; + int opt; + while((opt = getopt(argc, argv, "ht:w:r:m:n:o:p:")) >= 0) { + switch(opt) { + case 'h': + usage(); + exit(0); + break; + case 't': n_threads = atoi(optarg); break; + case 'w': n_warmup = atoi(optarg); break; + case 'r': n_reps = atoi(optarg); break; + case 'm': m = atoi(optarg); break; + case 'n': n = atoi(optarg); break; + case 'o': M_ = atoi(optarg); break; + case 'p': N_ = atoi(optarg); break; + default: + fprintf(stderr, "\nUnrecognized option!\n"); + usage(); + exit(0); + } + } + } + + void usage() { + fprintf(stderr, + "\nUsage: ./trns [options]" + "\n" + "\nGeneral options:" + "\n -h help" + "\n -t # of host threads (default=4)" + "\n -w # of untimed warmup iterations (default=5)" + "\n -r # of timed repetition iterations (default=50)" + "\n" + "\nData-partitioning-specific options:" + "\n TRNS only supports CPU-only or GPU-only execution" + "\n" + "\nBenchmark-specific options:" + "\n -m m (default=16 elements)" + "\n -n n (default=8 elements)" + "\n -o M_ (default=128 elements)" + "\n -p N_ (default=128 elements)" + "\n"); + } +}; + +// Input Data ----------------------------------------------------------------- +void read_input(T *x_vector, const Params &p) { + int in_size = p.M_ * p.m * p.N_ * p.n; + srand(5432); + for(int i = 0; i < in_size; i++) { + x_vector[i] = ((T)(rand() % 100) / 100); + } +} + +// Main ------------------------------------------------------------------------------------------ +int main(int argc, char **argv) { + + const Params p(argc, argv); + Timer timer; + + // Allocate + timer.start("Allocation"); + int M_ = p.M_; + int m = p.m; + int N_ = p.N_; + int n = p.n; + int in_size = M_ * m * N_ * n; + int finished_size = M_ * m * N_; + T * h_in_out = (T *)malloc(in_size * sizeof(T)); + std::atomic_int *h_finished = + (std::atomic_int *)malloc(sizeof(std::atomic_int) * finished_size); + std::atomic_int *h_head = (std::atomic_int *)malloc(N_ * sizeof(std::atomic_int)); + ALLOC_ERR(h_in_out, h_finished, h_head); + T *h_in_backup = (T *)malloc(in_size * sizeof(T)); + ALLOC_ERR(h_in_backup); + timer.stop("Allocation"); + timer.print("Allocation", 1); + + // Initialize + timer.start("Initialization"); + read_input(h_in_out, p); + memset((void *)h_finished, 0, sizeof(std::atomic_int) * finished_size); + for(int i = 0; i < N_; i++) + h_head[i].store(0); + timer.stop("Initialization"); + timer.print("Initialization", 1); + memcpy(h_in_backup, h_in_out, in_size * sizeof(T)); // Backup for reuse across iterations + + // Loop over main kernel + for(int rep = 0; rep < p.n_warmup + p.n_reps; rep++) { + + // Reset + memcpy(h_in_out, h_in_backup, in_size * sizeof(T)); + memset((void *)h_finished, 0, sizeof(std::atomic_int) * finished_size); + for(int i = 0; i < N_; i++) + h_head[i].store(0); + + // start timer + if(rep >= p.n_warmup) + timer.start("Step 1"); + // Launch CPU threads + std::thread main_thread_1(run_cpu_threads_100, h_in_out, h_finished, h_head, M_ * m, N_, n, p.n_threads); //M_ * m * N_); + main_thread_1.join(); + // end timer + if(rep >= p.n_warmup) + timer.stop("Step 1"); + + for(int i = 0; i < N_; i++) + h_head[i].store(0); + + // start timer + if(rep >= p.n_warmup) + timer.start("Step 2"); + // Launch CPU threads + std::thread main_thread_2(run_cpu_threads_010, h_in_out, h_head, m, n, M_ * N_, p.n_threads); + main_thread_2.join(); + // end timer + if(rep >= p.n_warmup) + timer.stop("Step 2"); + + memset((void *)h_finished, 0, sizeof(std::atomic_int) * finished_size); + for(int i = 0; i < N_; i++) + h_head[i].store(0); + + // start timer + if(rep >= p.n_warmup) + timer.start("Step 3"); + // Launch CPU threads + for(int i = 0; i < N_; i++){ + std::thread main_thread_3(run_cpu_threads_100, h_in_out + i * M_ * n * m, h_finished + i * M_ * n, h_head + i, M_, n, m, p.n_threads); //M_ * n); + main_thread_3.join(); + } + // end timer + if(rep >= p.n_warmup) + timer.stop("Step 3"); + } + timer.print("Step 1", p.n_reps); + timer.print("Step 2", p.n_reps); + timer.print("Step 3", p.n_reps); + + // Verify answer + //verify(h_in_out, h_in_backup, M_ * m, N_ * n, 1); + + // Free memory + timer.start("Deallocation"); + free(h_in_out); + free(h_finished); + free(h_head); + free(h_in_backup); + timer.stop("Deallocation"); + timer.print("Deallocation", 1); + + printf("Test Passed\n"); + return 0; +} diff --git a/TRNS/baselines/cpu/support/common.h b/TRNS/baselines/cpu/support/common.h new file mode 100644 index 0000000..f03900a --- /dev/null +++ b/TRNS/baselines/cpu/support/common.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2016 University of Cordoba and University of Illinois + * All rights reserved. + * + * Developed by: IMPACT Research Group + * University of Cordoba and University of Illinois + * http://impact.crhc.illinois.edu/ + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * with the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * > Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. + * > Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in the + * documentation and/or other materials provided with the distribution. + * > Neither the names of IMPACT Research Group, University of Cordoba, + * University of Illinois nor the names of its contributors may be used + * to endorse or promote products derived from this Software without + * specific prior written permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH + * THE SOFTWARE. + * + */ + +#ifndef _COMMON_H_ +#define _COMMON_H_ + +#ifndef DOUBLE_PRECISION +#define DOUBLE_PRECISION 1 +#endif + +#if DOUBLE_PRECISION +#define T double +#else +#define T float +#endif + +#define PRINT 0 + +#define divceil(n, m) (((n)-1) / (m) + 1) + +#endif diff --git a/TRNS/baselines/cpu/support/setup.h b/TRNS/baselines/cpu/support/setup.h new file mode 100644 index 0000000..a978152 --- /dev/null +++ b/TRNS/baselines/cpu/support/setup.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2016 University of Cordoba and University of Illinois + * All rights reserved. + * + * Developed by: IMPACT Research Group + * University of Cordoba and University of Illinois + * http://impact.crhc.illinois.edu/ + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * with the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * > Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. + * > Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in the + * documentation and/or other materials provided with the distribution. + * > Neither the names of IMPACT Research Group, University of Cordoba, + * University of Illinois nor the names of its contributors may be used + * to endorse or promote products derived from this Software without + * specific prior written permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH + * THE SOFTWARE. + * + */ + +#include + +// Allocation error checking +#define ERR_1(v1) \ + if(v1 == NULL) { \ + fprintf(stderr, "Allocation error at %s, %d\n", __FILE__, __LINE__); \ + exit(-1); \ + } +#define ERR_2(v1,v2) ERR_1(v1) ERR_1(v2) +#define ERR_3(v1,v2,v3) ERR_2(v1,v2) ERR_1(v3) +#define ERR_4(v1,v2,v3,v4) ERR_3(v1,v2,v3) ERR_1(v4) +#define ERR_5(v1,v2,v3,v4,v5) ERR_4(v1,v2,v3,v4) ERR_1(v5) +#define ERR_6(v1,v2,v3,v4,v5,v6) ERR_5(v1,v2,v3,v4,v5) ERR_1(v6) +#define GET_ERR_MACRO(_1,_2,_3,_4,_5,_6,NAME,...) NAME +#define ALLOC_ERR(...) GET_ERR_MACRO(__VA_ARGS__,ERR_6,ERR_5,ERR_4,ERR_3,ERR_2,ERR_1)(__VA_ARGS__) diff --git a/TRNS/baselines/cpu/support/timer.h b/TRNS/baselines/cpu/support/timer.h new file mode 100644 index 0000000..70ee386 --- /dev/null +++ b/TRNS/baselines/cpu/support/timer.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2016 University of Cordoba and University of Illinois + * All rights reserved. + * + * Developed by: IMPACT Research Group + * University of Cordoba and University of Illinois + * http://impact.crhc.illinois.edu/ + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * with the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * > Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. + * > Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in the + * documentation and/or other materials provided with the distribution. + * > Neither the names of IMPACT Research Group, University of Cordoba, + * University of Illinois nor the names of its contributors may be used + * to endorse or promote products derived from this Software without + * specific prior written permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH + * THE SOFTWARE. + * + */ + +#include +#include +#include +#include + +using namespace std; + +struct Timer { + + map startTime; + map stopTime; + map time; + + void start(string name) { + if(!time.count(name)) { + time[name] = 0.0; + } + gettimeofday(&startTime[name], NULL); + } + + void stop(string name) { + gettimeofday(&stopTime[name], NULL); + time[name] += (stopTime[name].tv_sec - startTime[name].tv_sec) * 1000000.0 + + (stopTime[name].tv_usec - startTime[name].tv_usec); + } + + void print(string name, int REP) { printf("%s Time (ms): %f\n", name.c_str(), time[name] / (1000 * REP)); } +}; diff --git a/TRNS/baselines/cpu/support/verify.h b/TRNS/baselines/cpu/support/verify.h new file mode 100644 index 0000000..ea02f0c --- /dev/null +++ b/TRNS/baselines/cpu/support/verify.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2016 University of Cordoba and University of Illinois + * All rights reserved. + * + * Developed by: IMPACT Research Group + * University of Cordoba and University of Illinois + * http://impact.crhc.illinois.edu/ + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * with the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * > Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. + * > Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in the + * documentation and/or other materials provided with the distribution. + * > Neither the names of IMPACT Research Group, University of Cordoba, + * University of Illinois nor the names of its contributors may be used + * to endorse or promote products derived from this Software without + * specific prior written permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH + * THE SOFTWARE. + * + */ + +#include "common.h" +#include + +inline int compare_output(T *output, T *ref, int dim) { + int i; + for(i = 0; i < dim; i++) { + T diff = fabs(ref[i] - output[i]); + if((diff - 0.0f) > 0.00001f && diff > 0.01 * fabs(ref[i])) { + printf("line: %d ref: %f actual: %f diff: %f\n", i, ref[i], output[i], diff); + exit(EXIT_FAILURE); + } + } + return 0; +} + +// Sequential transposition for comparison purposes +//[w][h/t][t] to [h/t][w][t] +static void trns_host(T* input, unsigned int A, unsigned int B, unsigned int b){ + T* output = (T*) malloc(sizeof(T) * A * B * b); + unsigned int next; + for (unsigned int j = 0; j < b; j++){ + for (unsigned int i = 0; i < A * B; i++){ + next = (i * A) - (A * B - 1) * (i / B); + output[next * b + j] = input[i*b+j]; + } + } + for (unsigned int k = 0; k < A * B * b; k++){ + input[k] = output[k]; + } + free(output); +} + +inline void verify(T *input2, T *input, int height, int width, int tile_size) { + trns_host(input, height, width, tile_size); + compare_output(input2, input, height * width); +} + diff --git a/TRNS/baselines/gpu/Makefile b/TRNS/baselines/gpu/Makefile new file mode 100644 index 0000000..12395b7 --- /dev/null +++ b/TRNS/baselines/gpu/Makefile @@ -0,0 +1,51 @@ +# +# Copyright (c) 2016 University of Cordoba and University of Illinois +# All rights reserved. +# +# Developed by: IMPACT Research Group +# University of Cordoba and University of Illinois +# http://impact.crhc.illinois.edu/ +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# with the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +# sell copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# > Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimers. +# > Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimers in the +# documentation and/or other materials provided with the distribution. +# > Neither the names of IMPACT Research Group, University of Cordoba, +# University of Illinois nor the names of its contributors may be used +# to endorse or promote products derived from this Software without +# specific prior written permission. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH +# THE SOFTWARE. +# + +CXX=/usr/local/cuda/bin/nvcc +CXX_FLAGS=-std=c++11 + +LIB=-L/usr/lib/ -L$/usr/local/cuda/lib64/ -lm + +INC=-I/usr/local/cuda/include/ + +DEP=kernel.h main.cpp kernel.cu support/common.h support/cuda-setup.h support/timer.h support/verify.h +SRC=main.cpp kernel.cu +EXE=trns + +all: + $(CXX) $(CXX_FLAGS) $(SRC) $(LIB) $(INC) -o $(EXE) + +clean: + rm -f $(EXE) + diff --git a/TRNS/baselines/gpu/README b/TRNS/baselines/gpu/README new file mode 100644 index 0000000..c36ee77 --- /dev/null +++ b/TRNS/baselines/gpu/README @@ -0,0 +1,16 @@ +In-place matrix transposition (TRNS) + +Compilation instructions + + make + +Execution instructions + + ./trns -w 0 -r 1 -m 16 -n 8 -o 4096 -p 2556 -i 64 + +For more options + + ./trns -h + +Read more +J. Gomez-Luna et al., “In-place Matrix Transposition on GPUs,” IEEE TPDS, 2016. diff --git a/TRNS/baselines/gpu/kernel.cu b/TRNS/baselines/gpu/kernel.cu new file mode 100644 index 0000000..f2251cd --- /dev/null +++ b/TRNS/baselines/gpu/kernel.cu @@ -0,0 +1,170 @@ +/* + * Copyright (c) 2016 University of Cordoba and University of Illinois + * All rights reserved. + * + * Developed by: IMPACT Research Group + * University of Cordoba and University of Illinois + * http://impact.crhc.illinois.edu/ + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * with the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * > Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. + * > Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in the + * documentation and/or other materials provided with the distribution. + * > Neither the names of IMPACT Research Group, University of Cordoba, + * University of Illinois nor the names of its contributors may be used + * to endorse or promote products derived from this Software without + * specific prior written permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH + * THE SOFTWARE. + * + */ + +#include "support/common.h" + +extern __shared__ int l_mem[]; + +// GPU kernel ------------------------------------------------------------------------------------------ +__global__ void PTTWAC_soa_asta(int A, int B, int b, T *input, int *finished, int *head) { + + int* done = l_mem; + int* gid_ = &done[1]; + + const int tid = threadIdx.x; + int m = A * B - 1; + + if(tid == 0) // Dynamic fetch + gid_[0] = atomicAdd(&head[0], 1); + __syncthreads(); + + while(gid_[0] < m) { + int next_in_cycle = (gid_[0] * A) - m * (gid_[0] / B); + if(next_in_cycle == gid_[0]) { + if(tid == 0) // Dynamic fetch + gid_[0] = atomicAdd(&head[0], 1); + __syncthreads(); + continue; + } + T data1, data2, data3, data4; + int i = tid; + if(i < b) + data1 = input[gid_[0] * b + i]; + i += blockDim.x; + if(i < b) + data2 = input[gid_[0] * b + i]; + i += blockDim.x; + if(i < b) + data3 = input[gid_[0] * b + i]; + i += blockDim.x; + if(i < b) + data4 = input[gid_[0] * b + i]; + + if(tid == 0) { + //make sure the read is not cached + done[0] = atomicAdd(&finished[gid_[0]], 0); + } + __syncthreads(); + + for(; done[0] == 0; next_in_cycle = (next_in_cycle * A) - m * (next_in_cycle / B)) { + T backup1, backup2, backup3, backup4; + i = tid; + if(i < b) + backup1 = input[next_in_cycle * b + i]; + i += blockDim.x; + if(i < b) + backup2 = input[next_in_cycle * b + i]; + i += blockDim.x; + if(i < b) + backup3 = input[next_in_cycle * b + i]; + i += blockDim.x; + if(i < b) + backup4 = input[next_in_cycle * b + i]; + + if(tid == 0) { + done[0] = atomicExch(&finished[next_in_cycle], (int)1); + } + __syncthreads(); + + if(!done[0]) { + i = tid; + if(i < b) + input[next_in_cycle * b + i] = data1; + i += blockDim.x; + if(i < b) + input[next_in_cycle * b + i] = data2; + i += blockDim.x; + if(i < b) + input[next_in_cycle * b + i] = data3; + i += blockDim.x; + if(i < b) + input[next_in_cycle * b + i] = data4; + } + i = tid; + if(i < b) + data1 = backup1; + i += blockDim.x; + if(i < b) + data2 = backup2; + i += blockDim.x; + if(i < b) + data3 = backup3; + i += blockDim.x; + if(i < b) + data4 = backup4; + } + + if(tid == 0) // Dynamic fetch + gid_[0] = atomicAdd(&head[0], 1); + __syncthreads(); + } +} + +cudaError_t call_PTTWAC_soa_asta(int blocks, int threads, int A, int B, int b, T *input, + int *finished, int *head, int l_mem_size){ + dim3 dimGrid(blocks); + dim3 dimBlock(threads); + PTTWAC_soa_asta<<>>(A, B, b, input, + finished, head); + cudaError_t err = cudaGetLastError(); + return err; +} + +__global__ void BS_marshal(T *input, int tile_size, int width) { + + T* tile = (T*)l_mem; + + int tidx = threadIdx.x; + int m = width*tile_size-1; + int bid = blockIdx.x; + + input += tile_size*width*bid; + for (int i = tidx; i < tile_size*width; i+=blockDim.x) { + int next = (i * tile_size)-m*(i/width); + tile[next] = input[i]; + } + __syncthreads(); + for (int i = tidx; i < tile_size*width; i+=blockDim.x) { + input[i] = tile[i]; + } +} + +cudaError_t call_BS_marshal(int blocks, int threads, int m, int n, T *input, int l_mem_size){ + dim3 dimGrid(blocks); + dim3 dimBlock(threads); + BS_marshal<<>>(input, m, n); + cudaError_t err = cudaGetLastError(); + return err; +} diff --git a/TRNS/baselines/gpu/kernel.h b/TRNS/baselines/gpu/kernel.h new file mode 100644 index 0000000..47ecc16 --- /dev/null +++ b/TRNS/baselines/gpu/kernel.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2016 University of Cordoba and University of Illinois + * All rights reserved. + * + * Developed by: IMPACT Research Group + * University of Cordoba and University of Illinois + * http://impact.crhc.illinois.edu/ + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * with the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * > Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. + * > Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in the + * documentation and/or other materials provided with the distribution. + * > Neither the names of IMPACT Research Group, University of Cordoba, + * University of Illinois nor the names of its contributors may be used + * to endorse or promote products derived from this Software without + * specific prior written permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH + * THE SOFTWARE. + * + */ + +#include "cuda_runtime.h" +#include +#include +#include "support/common.h" + +cudaError_t call_PTTWAC_soa_asta(int blocks, int threads, int A, int B, int b, T *input, + int *finished, int *head, int l_mem_size); + +cudaError_t call_BS_marshal(int blocks, int threads, int m, int n, T *input, int l_mem_size); diff --git a/TRNS/baselines/gpu/main.cpp b/TRNS/baselines/gpu/main.cpp new file mode 100644 index 0000000..fa4c1e5 --- /dev/null +++ b/TRNS/baselines/gpu/main.cpp @@ -0,0 +1,298 @@ +/* + * Copyright (c) 2016 University of Cordoba and University of Illinois + * All rights reserved. + * + * Developed by: IMPACT Research Group + * University of Cordoba and University of Illinois + * http://impact.crhc.illinois.edu/ + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * with the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * > Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. + * > Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in the + * documentation and/or other materials provided with the distribution. + * > Neither the names of IMPACT Research Group, University of Cordoba, + * University of Illinois nor the names of its contributors may be used + * to endorse or promote products derived from this Software without + * specific prior written permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH + * THE SOFTWARE. + * + */ + +#include "support/cuda-setup.h" +#include "kernel.h" +#include "support/common.h" +#include "support/timer.h" +#include "support/verify.h" + +#include +#include +#include +#include + +// Params --------------------------------------------------------------------- +struct Params { + + int device; + int n_gpu_threads; + int n_gpu_blocks; + int n_threads; + int n_warmup; + int n_reps; + int M_; + int m; + int N_; + int n; + + Params(int argc, char **argv) { + device = 0; + n_gpu_threads = 64; + n_gpu_blocks = 16; + n_warmup = 5; + n_reps = 50; + M_ = 128; + m = 16; + N_ = 128; + n = 8; + int opt; + while((opt = getopt(argc, argv, "hd:i:g:t:w:r:m:n:o:p:")) >= 0) { + switch(opt) { + case 'h': + usage(); + exit(0); + break; + case 'd': device = atoi(optarg); break; + case 'i': n_gpu_threads = atoi(optarg); break; + case 'g': n_gpu_blocks = atoi(optarg); break; + case 't': n_threads = atoi(optarg); break; + case 'w': n_warmup = atoi(optarg); break; + case 'r': n_reps = atoi(optarg); break; + case 'm': m = atoi(optarg); break; + case 'n': n = atoi(optarg); break; + case 'o': M_ = atoi(optarg); break; + case 'p': N_ = atoi(optarg); break; + default: + fprintf(stderr, "\nUnrecognized option!\n"); + usage(); + exit(0); + } + } + assert((n_gpu_threads > 0 && n_gpu_blocks > 0) + && "TRNS only runs on CPU-only or GPU-only: './trns -g 0' or './trns -t 0'"); + } + + void usage() { + fprintf(stderr, + "\nUsage: ./trns [options]" + "\n" + "\nGeneral options:" + "\n -h help" + "\n -d CUDA device ID (default=0)" + "\n -i # of device threads per block (default=64)" + "\n -g # of device blocks (default=16)" + "\n -w # of untimed warmup iterations (default=5)" + "\n -r # of timed repetition iterations (default=50)" + "\n" + "\nData-partitioning-specific options:" + "\n TRNS only supports CPU-only or GPU-only execution" + "\n" + "\nBenchmark-specific options:" + "\n -m m (default=16 elements)" + "\n -n n (default=8 elements)" + "\n -o M_ (default=128 elements)" + "\n -p N_ (default=128 elements)" + "\n"); + } +}; + +// Input Data ----------------------------------------------------------------- +void read_input(T *x_vector, const Params &p) { + int in_size = p.M_ * p.m * p.N_ * p.n; + srand(5432); + for(int i = 0; i < in_size; i++) { + x_vector[i] = ((T)(rand() % 100) / 100); + } +} + +// Main ------------------------------------------------------------------------------------------ +int main(int argc, char **argv) { + + const Params p(argc, argv); + CUDASetup setcuda(p.device); + Timer timer; + cudaError_t cudaStatus; + + // Allocate + timer.start("Allocation"); + int M_ = p.M_; + int m = p.m; + int N_ = p.N_; + int n = p.n; + int in_size = M_ * m * N_ * n; + int finished_size = M_ * m * N_; + T * h_in_out = (T *)malloc(in_size * sizeof(T)); + std::atomic_int *h_finished = + (std::atomic_int *)malloc(sizeof(std::atomic_int) * finished_size); + std::atomic_int *h_head = (std::atomic_int *)malloc(N_ * sizeof(std::atomic_int)); + ALLOC_ERR(h_in_out, h_finished, h_head); + T * d_in_out; + int * d_finished; + int * d_head; + if(p.n_gpu_blocks != 0) { + cudaStatus = cudaMalloc((void**)&d_in_out, in_size * sizeof(T)); + cudaStatus = cudaMalloc((void**)&d_finished, (p.n_gpu_blocks != 0) ? sizeof(int) * finished_size : 0); + cudaStatus = cudaMalloc((void**)&d_head, (p.n_gpu_blocks != 0) ? N_ * sizeof(int) : 0); + CUDA_ERR(); + } + T *h_in_backup = (T *)malloc(in_size * sizeof(T)); + ALLOC_ERR(h_in_backup); + cudaDeviceSynchronize(); + timer.stop("Allocation"); + timer.print("Allocation", 1); + + // Initialize + timer.start("Initialization"); + const int max_gpu_threads = setcuda.max_gpu_threads(); + read_input(h_in_out, p); + memset((void *)h_finished, 0, sizeof(std::atomic_int) * finished_size); + for(int i = 0; i < N_; i++) + h_head[i].store(0); + timer.stop("Initialization"); + timer.print("Initialization", 1); + memcpy(h_in_backup, h_in_out, in_size * sizeof(T)); // Backup for reuse across iterations + + // Copy to device + timer.start("Copy To Device"); + if(p.n_gpu_blocks != 0) { + cudaStatus = cudaMemcpy(d_in_out, h_in_backup, in_size * sizeof(T), cudaMemcpyHostToDevice); + cudaStatus = cudaMemcpy(d_finished, h_finished, sizeof(int) * finished_size, cudaMemcpyHostToDevice); + cudaStatus = cudaMemcpy(d_head, h_head, N_ * sizeof(int), cudaMemcpyHostToDevice); + CUDA_ERR(); + } + cudaDeviceSynchronize(); + timer.stop("Copy To Device"); + timer.print("Copy To Device", 1); + + // Loop over main kernel + for(int rep = 0; rep < p.n_warmup + p.n_reps; rep++) { + + // Reset + memcpy(h_in_out, h_in_backup, in_size * sizeof(T)); + memset((void *)h_finished, 0, sizeof(std::atomic_int) * finished_size); + for(int i = 0; i < N_; i++) + h_head[i].store(0); + cudaDeviceSynchronize(); + + // Launch GPU threads + if(p.n_gpu_blocks > 0) { + // Kernel launch + assert(p.n_gpu_threads <= max_gpu_threads && + "The thread block size is greater than the maximum thread block size that can be used on this device"); + + cudaStatus = cudaMemcpy(d_in_out, h_in_backup, in_size * sizeof(T), cudaMemcpyHostToDevice); + cudaStatus = cudaMemcpy(d_finished, h_finished, sizeof(int) * finished_size, cudaMemcpyHostToDevice); + cudaStatus = cudaMemcpy(d_head, h_head, N_ * sizeof(int), cudaMemcpyHostToDevice); + CUDA_ERR(); + + // start timer + if(rep >= p.n_warmup) + timer.start("Step 1"); + // Step 1 + cudaStatus = call_PTTWAC_soa_asta(M_ * m * N_, p.n_gpu_threads, M_ * m, N_, n, + d_in_out, (int*)d_finished, (int*)d_head, sizeof(int) + sizeof(int)); + CUDA_ERR(); + // end timer + if(rep >= p.n_warmup) + timer.stop("Step 1"); + + // start timer + if(rep >= p.n_warmup) + timer.start("Step 2"); + // Step 2 + cudaStatus = call_BS_marshal(M_ * N_, p.n_gpu_threads, m, n, d_in_out, m * n * sizeof(T)); + CUDA_ERR(); + // end timer + if(rep >= p.n_warmup) + timer.stop("Step 2"); + + cudaStatus = cudaMemcpy(d_finished, h_finished, sizeof(int) * finished_size, cudaMemcpyHostToDevice); + cudaStatus = cudaMemcpy(d_head, h_head, N_ * sizeof(int), cudaMemcpyHostToDevice); + CUDA_ERR(); + // start timer + if(rep >= p.n_warmup) + timer.start("Step 3"); + // Step 3 + for(int i = 0; i < N_; i++){ + cudaStatus = call_PTTWAC_soa_asta(M_ * n, p.n_gpu_threads, M_, n, m, + d_in_out + i * M_ * n * m, (int*)d_finished + i * M_ * n, (int*)d_head + i, sizeof(int) + sizeof(int)); + CUDA_ERR(); + } + // end timer + if(rep >= p.n_warmup) + timer.stop("Step 3"); + + } + + cudaDeviceSynchronize(); + + } + timer.print("Step 1", p.n_reps); + timer.print("Step 2", p.n_reps); + timer.print("Step 3", p.n_reps); + + // Copy back + timer.start("Copy Back and Merge"); + if(p.n_gpu_blocks != 0) { + cudaStatus = cudaMemcpy(h_in_out, d_in_out, in_size * sizeof(T), cudaMemcpyDeviceToHost); + CUDA_ERR(); + cudaDeviceSynchronize(); + } + timer.stop("Copy Back and Merge"); + timer.print("Copy Back and Merge", 1); + + // Verify answer + verify(h_in_out, h_in_backup, M_ * m, N_ * n, 1); + + // Free memory + timer.start("Deallocation"); + free(h_in_out); + free(h_finished); + free(h_head); + if(p.n_gpu_blocks != 0) { + cudaStatus = cudaFree(d_in_out); + cudaStatus = cudaFree(d_finished); + cudaStatus = cudaFree(d_head); + CUDA_ERR(); + } + free(h_in_backup); + cudaDeviceSynchronize(); + timer.stop("Deallocation"); + timer.print("Deallocation", 1); + + // Release timers + timer.release("Allocation"); + timer.release("Initialization"); + timer.release("Copy To Device"); + timer.release("Step 1"); + timer.release("Step 2"); + timer.release("Step 3"); + timer.release("Copy Back and Merge"); + timer.release("Deallocation"); + + printf("Test Passed\n"); + return 0; +} diff --git a/TRNS/baselines/gpu/support/common.h b/TRNS/baselines/gpu/support/common.h new file mode 100644 index 0000000..8a7f37f --- /dev/null +++ b/TRNS/baselines/gpu/support/common.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2016 University of Cordoba and University of Illinois + * All rights reserved. + * + * Developed by: IMPACT Research Group + * University of Cordoba and University of Illinois + * http://impact.crhc.illinois.edu/ + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * with the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * > Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. + * > Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in the + * documentation and/or other materials provided with the distribution. + * > Neither the names of IMPACT Research Group, University of Cordoba, + * University of Illinois nor the names of its contributors may be used + * to endorse or promote products derived from this Software without + * specific prior written permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH + * THE SOFTWARE. + * + */ + +#ifndef _COMMON_H_ +#define _COMMON_H_ + +#ifndef DOUBLE_PRECISION +#define DOUBLE_PRECISION 1 +#endif + +#if DOUBLE_PRECISION +#define T long int // double +#else +#define T int // float +#endif + +#define PRINT 0 + +#define divceil(n, m) (((n)-1) / (m) + 1) + +#endif diff --git a/TRNS/baselines/gpu/support/cuda-setup.h b/TRNS/baselines/gpu/support/cuda-setup.h new file mode 100644 index 0000000..7b7eefe --- /dev/null +++ b/TRNS/baselines/gpu/support/cuda-setup.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2016 University of Cordoba and University of Illinois + * All rights reserved. + * + * Developed by: IMPACT Research Group + * University of Cordoba and University of Illinois + * http://impact.crhc.illinois.edu/ + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * with the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * > Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. + * > Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in the + * documentation and/or other materials provided with the distribution. + * > Neither the names of IMPACT Research Group, University of Cordoba, + * University of Illinois nor the names of its contributors may be used + * to endorse or promote products derived from this Software without + * specific prior written permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH + * THE SOFTWARE. + * + */ + +#include +#include +#include + +// Allocation error checking +#define ERR_1(v1) \ + if(v1 == NULL) { \ + fprintf(stderr, "Allocation error at %s, %d\n", __FILE__, __LINE__); \ + exit(-1); \ + } +#define ERR_2(v1,v2) ERR_1(v1) ERR_1(v2) +#define ERR_3(v1,v2,v3) ERR_2(v1,v2) ERR_1(v3) +#define ERR_4(v1,v2,v3,v4) ERR_3(v1,v2,v3) ERR_1(v4) +#define ERR_5(v1,v2,v3,v4,v5) ERR_4(v1,v2,v3,v4) ERR_1(v5) +#define ERR_6(v1,v2,v3,v4,v5,v6) ERR_5(v1,v2,v3,v4,v5) ERR_1(v6) +#define GET_ERR_MACRO(_1,_2,_3,_4,_5,_6,NAME,...) NAME +#define ALLOC_ERR(...) GET_ERR_MACRO(__VA_ARGS__,ERR_6,ERR_5,ERR_4,ERR_3,ERR_2,ERR_1)(__VA_ARGS__) + +#define CUDA_ERR() \ + if(cudaStatus != cudaSuccess) { \ + fprintf(stderr, "CUDA error: %s\n at %s, %d\n", cudaGetErrorString(cudaStatus), __FILE__, __LINE__); \ + exit(-1); \ + } + +struct CUDASetup { + + cudaDeviceProp device_prop; + + CUDASetup(int device) { + cudaError_t cudaStatus; + cudaStatus = cudaSetDevice(device); + CUDA_ERR(); + + cudaStatus = cudaGetDeviceProperties(&device_prop, device); + CUDA_ERR(); + fprintf(stderr, "%s\t", device_prop.name); + + } + + int max_gpu_threads() { + return device_prop.maxThreadsPerBlock; + } +}; diff --git a/TRNS/baselines/gpu/support/timer.h b/TRNS/baselines/gpu/support/timer.h new file mode 100644 index 0000000..fceab04 --- /dev/null +++ b/TRNS/baselines/gpu/support/timer.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2016 University of Cordoba and University of Illinois + * All rights reserved. + * + * Developed by: IMPACT Research Group + * University of Cordoba and University of Illinois + * http://impact.crhc.illinois.edu/ + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * with the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * > Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. + * > Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in the + * documentation and/or other materials provided with the distribution. + * > Neither the names of IMPACT Research Group, University of Cordoba, + * University of Illinois nor the names of its contributors may be used + * to endorse or promote products derived from this Software without + * specific prior written permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH + * THE SOFTWARE. + * + */ + +#include +#include +#include +#include +#include + +using namespace std; + +struct Timer { + + map startTime; + map stopTime; + map time; + + void start(string name) { + if(!time.count(name)) { + cudaEventCreate(&startTime[name]); + cudaEventCreate(&stopTime[name]); + time[name] = 0.0; + } + cudaEventRecord(startTime[name], 0); + } + + void stop(string name) { + cudaEventRecord(stopTime[name],0); + cudaEventSynchronize(stopTime[name]); + float part_time = 0.0; + cudaEventElapsedTime(&part_time, startTime[name], stopTime[name]); + time[name] += part_time; + } + + void print(string name, unsigned int REP) { printf("%s Time (ms): %f\n", name.c_str(), time[name] / REP); } + + void release(string name){ + cudaEventDestroy(startTime[name]); + cudaEventDestroy(stopTime[name]); + } +}; diff --git a/TRNS/baselines/gpu/support/verify.h b/TRNS/baselines/gpu/support/verify.h new file mode 100644 index 0000000..c3ba224 --- /dev/null +++ b/TRNS/baselines/gpu/support/verify.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2016 University of Cordoba and University of Illinois + * All rights reserved. + * + * Developed by: IMPACT Research Group + * University of Cordoba and University of Illinois + * http://impact.crhc.illinois.edu/ + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * with the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * > Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. + * > Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in the + * documentation and/or other materials provided with the distribution. + * > Neither the names of IMPACT Research Group, University of Cordoba, + * University of Illinois nor the names of its contributors may be used + * to endorse or promote products derived from this Software without + * specific prior written permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH + * THE SOFTWARE. + * + */ + +#include "common.h" +#include + +inline int compare_output(T *output, T *ref, int dim) { + int i; + for(i = 0; i < dim; i++) { + T diff = fabs(ref[i] - output[i]); + if((diff - 0.0f) > 0.00001f && diff > 0.01 * fabs(ref[i])) { + printf("line: %d ref: %f actual: %f diff: %f\n", i, ref[i], output[i], diff); + exit(EXIT_FAILURE); + } + } + return 0; +} + +// Sequential transposition for comparison purposes +//[w][h/t][t] to [h/t][w][t] +static void trns_host(T* input, unsigned int A, unsigned int B, unsigned int b){ + T* output = (T*) malloc(sizeof(T) * A * B * b); + unsigned int next; + for (unsigned int j = 0; j < b; j++){ + for (unsigned int i = 0; i < A * B; i++){ + next = (i * A) - (A * B - 1) * (i / B); + output[next * b + j] = input[i*b+j]; + } + } + for (unsigned int k = 0; k < A * B * b; k++){ + input[k] = output[k]; + } + free(output); +} + +inline void verify(T *input2, T *input, int height, int width, int tile_size) { + trns_host(input, height, width, tile_size); + compare_output(input2, input, height * width); +} diff --git a/TRNS/dpu/task.c b/TRNS/dpu/task.c new file mode 100644 index 0000000..56d5059 --- /dev/null +++ b/TRNS/dpu/task.c @@ -0,0 +1,199 @@ +/* +* 3-step matrix transposition with multiple tasklets +* Acks: Stefano Ballarin (P&S PIM Fall 2020) +* +*/ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../support/common.h" + +__host dpu_arguments_t DPU_INPUT_ARGUMENTS; + +uint32_t curr_tile = 0; // protected by MUTEX +uint32_t get_tile(); +void read_tile_step2(uint32_t A, uint32_t offset, T* variable, uint32_t m, uint32_t n); +void write_tile_step2(uint32_t A, uint32_t offset, T* variable, uint32_t m, uint32_t n); +void read_tile_step3(uint32_t A, uint32_t offset, T* variable, uint32_t m); +void write_tile_step3(uint32_t A, uint32_t offset, T* variable, uint32_t m); +_Bool get_done(uint32_t done_array_step3, uint32_t address, T* read_done); +_Bool get_and_set_done(uint32_t done_array_step3, uint32_t address, T* read_done); + +// Barrier +BARRIER_INIT(my_barrier, NR_TASKLETS); + +// Mutexes +MUTEX_INIT(tile_mutex); +MUTEX_INIT(done_mutex); + +extern int main_kernel1(void); +extern int main_kernel2(void); + +int (*kernels[nr_kernels])(void) = {main_kernel1, main_kernel2}; + +int main(void) { + // Kernel + return kernels[DPU_INPUT_ARGUMENTS.kernel](); +} + +// Step 2: 0010 +int main_kernel1() { + unsigned int tasklet_id = me(); +#if PRINT + printf("tasklet_id = %u\n", tasklet_id); +#endif + if (tasklet_id == 0){ // Initialize once the cycle counter + mem_reset(); // Reset the heap + } + // Barrier + barrier_wait(&my_barrier); + + uint32_t A = (uint32_t)DPU_MRAM_HEAP_POINTER; // A in MRAM + uint32_t M_ = DPU_INPUT_ARGUMENTS.M_; + uint32_t m = DPU_INPUT_ARGUMENTS.m; + uint32_t n = DPU_INPUT_ARGUMENTS.n; + + T* data = (T*) mem_alloc(m * n * sizeof(T)); + T* backup = (T*) mem_alloc(m * n * sizeof(T)); + + for(unsigned int tile = tasklet_id; tile < M_; tile += NR_TASKLETS){ + read_tile_step2(A, tile * m * n, data, m, n); + for (unsigned int i = 0; i < m * n; i++){ + backup[(i * m) - (m * n - 1) * (i / n)] = data[i]; + } + write_tile_step2(A, tile * m * n, backup, m, n); + } + + return 0; +} + +// Step 3: 0100 +int main_kernel2() { + unsigned int tasklet_id = me(); +#if PRINT + printf("tasklet_id = %u\n", tasklet_id); +#endif + if (tasklet_id == 0){ // Initialize once the cycle counter + mem_reset(); // Reset the heap + } + // Barrier + barrier_wait(&my_barrier); + + uint32_t A = (uint32_t)DPU_MRAM_HEAP_POINTER; + uint32_t m = DPU_INPUT_ARGUMENTS.m; + uint32_t n = DPU_INPUT_ARGUMENTS.n; + uint32_t M_ = DPU_INPUT_ARGUMENTS.M_; + uint32_t done_array = (uint32_t)(DPU_MRAM_HEAP_POINTER + M_ * m * n * sizeof(T)); + + const uint32_t tile_max = M_ * n - 1; // Tile id upper bound + + T* data = (T*)mem_alloc(sizeof(T) * m); + T* backup = (T*)mem_alloc(sizeof(T) * m); + T* read_done = (T*)mem_alloc(sizeof(T)); + + uint32_t tile; + _Bool done; + + tile = get_tile(); + + while (tile < tile_max){ + uint32_t next_in_cycle = ((tile * M_) - tile_max * (tile / n)); + if (next_in_cycle == tile){ + tile = get_tile(); + continue; + } + read_tile_step3(A, tile * m, data, m); + + done = get_done(done_array, tile, read_done); + for(; done == 0; next_in_cycle = ((next_in_cycle * M_) - tile_max * (next_in_cycle / n))){ + read_tile_step3(A, next_in_cycle * m, backup, m); + + done = get_and_set_done(done_array, next_in_cycle, read_done); + + if(!done) { + write_tile_step3(A, next_in_cycle * m, data, m); + } + for(uint32_t i = 0; i < m; i++){ + data[i] = backup[i]; + } + } + tile = get_tile(); + } + + return 0; +} + +// Auxiliary functions +uint32_t get_tile(){ + mutex_lock(tile_mutex); + uint32_t value = curr_tile; + curr_tile++; + mutex_unlock(tile_mutex); + return value; +} + +void read_tile_step2(uint32_t A, uint32_t offset, T* variable, uint32_t m, uint32_t n){ + int rest = m * n; + int transfer; + while(rest > 0){ + if(rest * sizeof(T) > 2048){ + transfer = 2048 / sizeof(T); + } else { + transfer = rest; + } + mram_read((__mram_ptr void*)(A + (offset + m * n - rest) * sizeof(T)), variable + (m * n - rest) * sizeof(T), sizeof(T) * transfer); + rest -= transfer; + } +} + +void write_tile_step2(uint32_t A, uint32_t offset, T* variable, uint32_t m, uint32_t n){ + int rest = m * n; + int transfer; + while(rest > 0){ + if(rest * sizeof(T) > 2048){ + transfer = 2048 / sizeof(T); + } else { + transfer = rest; + } + mram_write(variable + (m * n - rest) * sizeof(T), (__mram_ptr void*)(A + (offset + m * n - rest) * sizeof(T)), sizeof(T) * transfer); + rest -= transfer; + } +} + +void read_tile_step3(uint32_t A, uint32_t offset, T* variable, uint32_t m){ + mram_read((__mram_ptr void*)(A + offset * sizeof(T)), variable, sizeof(T) * m); +} + +void write_tile_step3(uint32_t A, uint32_t offset, T* variable, uint32_t m){ + mram_write(variable, (__mram_ptr void*)(A + offset * sizeof(T)), sizeof(T) * m); +} + +_Bool get_done(uint32_t done_array_step3, uint32_t address, T* read_done){ + uint32_t result; + + mutex_lock(done_mutex); + mram_read((__mram_ptr void*)(done_array_step3 + address), read_done, sizeof(T)); + result = ((*read_done & (0x01 << (address % sizeof(T)))) != 0); + mutex_unlock(done_mutex); + + return (_Bool)result; +} + +_Bool get_and_set_done(uint32_t done_array_step3, uint32_t address, T* read_done){ + uint32_t result; + + mutex_lock(done_mutex); + mram_read((__mram_ptr void*)(done_array_step3 + address), read_done, sizeof(T)); + result = ((*read_done & (0x01 << (address % sizeof(T)))) != 0); + *read_done |= (0x01 << (address % sizeof(T))); + mram_write(read_done, (__mram_ptr void*)(done_array_step3 + address), sizeof(T)); + mutex_unlock(done_mutex); + + return (_Bool)result; +} diff --git a/TRNS/host/app.c b/TRNS/host/app.c new file mode 100644 index 0000000..1aa6392 --- /dev/null +++ b/TRNS/host/app.c @@ -0,0 +1,281 @@ +/** +* app.c +* TRNS Host Application Source File +* +*/ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../support/common.h" +#include "../support/timer.h" +#include "../support/params.h" + +// Define the DPU Binary path as DPU_BINARY here +#ifndef DPU_BINARY +#define DPU_BINARY "./bin/dpu_code" +#endif + +#if ENERGY +#include +#endif + +// Pointer declaration +static T* A_host; +static T* A_backup; +static T* A_result; + +// Create input arrays +static void read_input(T* A, unsigned int nr_elements) { + srand(0); + printf("nr_elements\t%u\t", nr_elements); + for (unsigned int i = 0; i < nr_elements; i++) { + A[i] = (T) (rand()); + } +} + +// Compute output in the host +static void trns_host(T* input, unsigned int A, unsigned int B, unsigned int b){ + T* output = (T*) malloc(sizeof(T) * A * B * b); + unsigned int next; + for (unsigned int j = 0; j < b; j++){ + for (unsigned int i = 0; i < A * B; i++){ + next = (i * A) - (A * B - 1) * (i / B); + output[next * b + j] = input[i*b+j]; + } + } + for (unsigned int k = 0; k < A * B * b; k++){ + input[k] = output[k]; + } + free(output); +} + +// Main of the Host Application +int main(int argc, char **argv) { + + struct Params p = input_params(argc, argv); + + struct dpu_set_t dpu_set, dpu; + uint32_t nr_of_dpus; + +#if ENERGY + struct dpu_probe_t probe; + DPU_ASSERT(dpu_probe_init("energy_probe", &probe)); +#endif + + unsigned int i = 0; + unsigned int N_ = p.N_; + const unsigned int n = p.n; + const unsigned int M_ = p.M_; + const unsigned int m = p.m; + N_ = p.exp == 0 ? N_ * NR_DPUS : N_; + + // Input/output allocation + A_host = malloc(M_ * m * N_ * n * sizeof(T)); + A_backup = malloc(M_ * m * N_ * n * sizeof(T)); + A_result = malloc(M_ * m * N_ * n * sizeof(T)); + T* done_host = malloc(M_ * n); // Host array to reset done array of step 3 + memset(done_host, 0, M_ * n); + + // Create an input file with arbitrary data + read_input(A_host, M_ * m * N_ * n); + memcpy(A_backup, A_host, M_ * m * N_ * n * sizeof(T)); + + // Timer declaration + Timer timer; + + printf("NR_TASKLETS\t%d\n", NR_TASKLETS); + printf("M_\t%u, m\t%u, N_\t%u, n\t%u\n", M_, m, N_, n); + + // Loop over main kernel + for(int rep = 0; rep < p.n_warmup + p.n_reps; rep++) { + + int timer_fix = 0; + // Compute output on CPU (performance comparison and verification purposes) + memcpy(A_host, A_backup, M_ * m * N_ * n * sizeof(T)); + if(rep >= p.n_warmup) + start(&timer, 0, rep - p.n_warmup + timer_fix); + trns_host(A_host, M_ * m, N_ * n, 1); + if(rep >= p.n_warmup) + stop(&timer, 0); + + unsigned int curr_dpu = 0; + unsigned int active_dpus; + unsigned int active_dpus_before = 0; + unsigned int first_round = 1; + + while(curr_dpu < N_){ + // Allocate DPUs and load binary + if((N_ - curr_dpu) > NR_DPUS){ + active_dpus = NR_DPUS; + } else { + active_dpus = (N_ - curr_dpu); + } + if((active_dpus_before != active_dpus) && (!(first_round))){ + DPU_ASSERT(dpu_free(dpu_set)); + DPU_ASSERT(dpu_alloc(active_dpus, NULL, &dpu_set)); + DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL)); + DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus)); + printf("Allocated %d DPU(s)\n", nr_of_dpus); + } else if (first_round){ + DPU_ASSERT(dpu_alloc(active_dpus, NULL, &dpu_set)); + DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL)); + DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus)); + printf("Allocated %d DPU(s)\n", nr_of_dpus); + } + + printf("Load input data (step 1)\n"); + if(rep >= p.n_warmup) + start(&timer, 1, rep - p.n_warmup + timer_fix); + // Load input matrix (step 1) + for(unsigned int j = 0; j < M_ * m; j++){ + unsigned int i = 0; + DPU_FOREACH(dpu_set, dpu) { + DPU_ASSERT(dpu_prepare_xfer(dpu, &A_backup[j * N_ * n + n * (i + curr_dpu)])); + i++; + } + DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, sizeof(T) * j * n, sizeof(T) * n, DPU_XFER_DEFAULT)); + } + if(rep >= p.n_warmup) + stop(&timer, 1); + // Reset done array (for step 3) + DPU_FOREACH(dpu_set, dpu) { + DPU_ASSERT(dpu_prepare_xfer(dpu, done_host)); + } + DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, M_ * m * n * sizeof(T), (M_ * n) / 8 == 0 ? 8 : M_ * n, DPU_XFER_DEFAULT)); + + unsigned int kernel = 0; + dpu_arguments_t input_arguments = {m, n, M_, kernel}; + DPU_FOREACH(dpu_set, dpu, i) { + DPU_ASSERT(dpu_prepare_xfer(dpu, &input_arguments)); + } + DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, "DPU_INPUT_ARGUMENTS", 0, sizeof(input_arguments), DPU_XFER_DEFAULT)); + printf("Run step 2 on DPU(s) \n"); + // Run DPU kernel + if(rep >= p.n_warmup){ + start(&timer, 2, rep - p.n_warmup + timer_fix); +#if ENERGY + DPU_ASSERT(dpu_probe_start(&probe)); +#endif + } + DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS)); + if(rep >= p.n_warmup){ + stop(&timer, 2); +#if ENERGY + DPU_ASSERT(dpu_probe_stop(&probe)); +#endif + } +#if PRINT + { + unsigned int each_dpu = 0; + printf("Display DPU Logs\n"); + DPU_FOREACH (dpu_set, dpu) { + printf("DPU#%d:\n", each_dpu); + DPU_ASSERT(dpulog_read_for_dpu(dpu.dpu, stdout)); + each_dpu++; + } + } +#endif + + kernel = 1; + dpu_arguments_t input_arguments2 = {m, n, M_, kernel}; + DPU_FOREACH(dpu_set, dpu, i) { + DPU_ASSERT(dpu_prepare_xfer(dpu, &input_arguments2)); + } + DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, "DPU_INPUT_ARGUMENTS", 0, sizeof(input_arguments2), DPU_XFER_DEFAULT)); + printf("Run step 3 on DPU(s) \n"); + // Run DPU kernel + if(rep >= p.n_warmup){ + start(&timer, 3, rep - p.n_warmup + timer_fix); +#if ENERGY + DPU_ASSERT(dpu_probe_start(&probe)); +#endif + } + DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS)); + if(rep >= p.n_warmup){ + stop(&timer, 3); +#if ENERGY + DPU_ASSERT(dpu_probe_stop(&probe)); +#endif + } +#if PRINT + { + unsigned int each_dpu = 0; + printf("Display DPU Logs\n"); + DPU_FOREACH (dpu_set, dpu) { + printf("DPU#%d:\n", each_dpu); + DPU_ASSERT(dpulog_read_for_dpu(dpu.dpu, stdout)); + each_dpu++; + } + } +#endif + + printf("Retrieve results\n"); + if(rep >= p.n_warmup) + start(&timer, 4, rep - p.n_warmup + timer_fix); + DPU_FOREACH(dpu_set, dpu) { + DPU_ASSERT(dpu_prepare_xfer(dpu, (T*)(&A_result[curr_dpu * m * n * M_]))); + curr_dpu++; + } + DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_FROM_DPU, DPU_MRAM_HEAP_POINTER_NAME, 0, sizeof(T) * m * n * M_, DPU_XFER_DEFAULT)); + if(rep >= p.n_warmup) + stop(&timer, 4); + + if(first_round){ + first_round = 0; + } + timer_fix++; + } + DPU_ASSERT(dpu_free(dpu_set)); + + } + + // Print timing results + printf("CPU "); + print(&timer, 0, p.n_reps); + printf("CPU-DPU (Step 1) "); + print(&timer, 1, p.n_reps); + printf("Step 2 "); + print(&timer, 2, p.n_reps); + printf("Step 3 "); + print(&timer, 3, p.n_reps); + printf("DPU-CPU "); + print(&timer, 4, p.n_reps); + + #if ENERGY + double energy; + DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &energy)); + printf("DPU Energy (J): %f\t", energy); + #endif + + // Check output + bool status = true; + for (i = 0; i < M_ * m * N_ * n; i++) { + if(A_host[i] != A_result[i]){ + status = false; +#if PRINT + printf("%d: %lu -- %lu\n", i, A_host[i], A_result[i]); +#endif + } + } + if (status) { + printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET "] Outputs are equal\n"); + } else { + printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET "] Outputs differ!\n"); + } + + // Deallocation + free(A_host); + free(A_backup); + free(A_result); + free(done_host); + + return status ? 0 : -1; +} diff --git a/TRNS/support/common.h b/TRNS/support/common.h new file mode 100755 index 0000000..2ba56c5 --- /dev/null +++ b/TRNS/support/common.h @@ -0,0 +1,40 @@ +#ifndef _COMMON_H_ +#define _COMMON_H_ + +// Transfer size between MRAM and WRAM +#ifdef BL +#define BLOCK_SIZE_LOG2 BL +#define BLOCK_SIZE (1 << BLOCK_SIZE_LOG2) +#else +#define BLOCK_SIZE_LOG2 8 +#define BLOCK_SIZE (1 << BLOCK_SIZE_LOG2) +#define BL BLOCK_SIZE_LOG2 +#endif + +// Data type +#define T int64_t + +// Structures used by both the host and the dpu to communicate information +typedef struct { + uint32_t m; + uint32_t n; + uint32_t M_; + enum kernels { + kernel1 = 0, + kernel2 = 1, + nr_kernels = 2, + } kernel; +} dpu_arguments_t; + +#ifndef ENERGY +#define ENERGY 0 +#endif +#define PRINT 0 + +#define ANSI_COLOR_RED "\x1b[31m" +#define ANSI_COLOR_GREEN "\x1b[32m" +#define ANSI_COLOR_RESET "\x1b[0m" + +#define divceil(n, m) (((n)-1) / (m) + 1) +#define roundup(n, m) ((n / m) * m + m) +#endif diff --git a/TRNS/support/params.h b/TRNS/support/params.h new file mode 100644 index 0000000..6b7e6f2 --- /dev/null +++ b/TRNS/support/params.h @@ -0,0 +1,68 @@ +#ifndef _PARAMS_H_ +#define _PARAMS_H_ + +#include "common.h" + +typedef struct Params { + unsigned int M_; + unsigned int m; + unsigned int N_; + unsigned int n; + int n_warmup; + int n_reps; + int exp; +}Params; + +static void usage() { + fprintf(stderr, + "\nUsage: ./program [options]" + "\n" + "\nGeneral options:" + "\n -h help" + "\n -w # of untimed warmup iterations (default=1)" + "\n -e # of timed repetition iterations (default=3)" + "\n -x Weak (0) or strong (1) scaling (default=0)" + "\n" + "\nBenchmark-specific options:" + "\n -m m (default=16 elements)" + "\n -n n (default=8 elements)" + "\n -o M_ (default=12288 elements)" + "\n -p N_ (default=1 elements)" + "\n"); +} + +struct Params input_params(int argc, char **argv) { + struct Params p; + p.M_ = 12288; + p.m = 16; + p.N_ = 1; + p.n = 8; + p.n_warmup = 1; + p.n_reps = 3; + p.exp = 0; + + int opt; + while((opt = getopt(argc, argv, "hw:e:x:m:n:o:p:")) >= 0) { + switch(opt) { + case 'h': + usage(); + exit(0); + break; + case 'w': p.n_warmup = atoi(optarg); break; + case 'e': p.n_reps = atoi(optarg); break; + case 'x': p.exp = atoi(optarg); break; + case 'm': p.m = atoi(optarg); break; + case 'n': p.n = atoi(optarg); break; + case 'o': p.M_ = atoi(optarg); break; + case 'p': p.N_ = atoi(optarg); break; + default: + fprintf(stderr, "\nUnrecognized option!\n"); + usage(); + exit(0); + } + } + assert(NR_DPUS > 0 && "Invalid # of dpus!"); + + return p; +} +#endif diff --git a/TRNS/support/timer.h b/TRNS/support/timer.h new file mode 100755 index 0000000..b53d95f --- /dev/null +++ b/TRNS/support/timer.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2016 University of Cordoba and University of Illinois + * All rights reserved. + * + * Developed by: IMPACT Research Group + * University of Cordoba and University of Illinois + * http://impact.crhc.illinois.edu/ + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * with the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * > Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. + * > Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in the + * documentation and/or other materials provided with the distribution. + * > Neither the names of IMPACT Research Group, University of Cordoba, + * University of Illinois nor the names of its contributors may be used + * to endorse or promote products derived from this Software without + * specific prior written permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH + * THE SOFTWARE. + * + */ + +#include + +typedef struct Timer{ + + struct timeval startTime[7]; + struct timeval stopTime[7]; + double time[7]; + +}Timer; + +void start(Timer *timer, int i, int rep) { + if(rep == 0) { + timer->time[i] = 0.0; + } + gettimeofday(&timer->startTime[i], NULL); +} + +void stop(Timer *timer, int i) { + gettimeofday(&timer->stopTime[i], NULL); + timer->time[i] += (timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 + + (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec); +} + +void print(Timer *timer, int i, int REP) { printf("Time (ms): %f\t", timer->time[i] / (1000 * REP)); } -- cgit v1.2.3