1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
|
#include <stdlib.h>
#include <stdio.h>
#include "../../support/timer.h"
#include "gemv_utils.h"
int main(int argc, char *argv[])
{
const size_t rows = 20480;
const size_t cols = 8192;
double **A, *b, *x;
b = (double*) malloc(sizeof(double)*rows);
x = (double*) malloc(sizeof(double)*cols);
allocate_dense(rows, cols, &A);
make_hilbert_mat(rows,cols, &A);
Timer timer;
for (int i = 0; i < 100; i++) {
#pragma omp parallel
{
#pragma omp for
for (size_t i = 0; i < cols; i++) {
x[i] = (double) i+1 ;
}
#pragma omp for
for (size_t i = 0; i < rows; i++) {
b[i] = (double) 0.0;
}
}
unsigned int nr_threads = 0;
#pragma omp parallel
#pragma omp atomic
nr_threads++;
start(&timer, 0, 0);
gemv(A, x, rows, cols, &b);
stop(&timer, 0);
printf("[::] n_threads=%d e_type=%s n_elements=%d "
"| throughput_cpu_omp_MBps=%f\n",
nr_threads, "double", rows * cols,
rows * cols * sizeof(double) / timer.time[0]);
printf("[::] n_threads=%d e_type=%s n_elements=%d "
"| throughput_cpu_omp_MOpps=%f\n",
nr_threads, "double", rows * cols,
rows * cols / timer.time[0]);
printf("[::] n_threads=%d e_type=%s n_elements=%d |",
nr_threads, "double", rows * cols);
printall(&timer, 0);
}
#if 0
print_vec(x, rows);
print_mat(A, rows, cols);
print_vec(b, rows);
#endif
printf("sum(x) = %f, sum(Ax) = %f\n", sum_vec(x,cols), sum_vec(b,rows));
return 0;
}
void gemv(double** A, double* x, size_t rows, size_t cols, double** b) {
#pragma omp parallel for
for (size_t i = 0; i < rows; i ++ )
for (size_t j = 0; j < cols; j ++ ) {
(*b)[i] = (*b)[i] + A[i][j]*x[j];
}
}
void make_hilbert_mat(size_t rows, size_t cols, double*** A) {
#pragma omp parallel for
for (size_t i = 0; i < rows; i++) {
for (size_t j = 0; j < cols; j++) {
(*A)[i][j] = 1.0/( (double) i + (double) j + 1.0);
}
}
}
double sum_vec(double* vec, size_t rows) {
double sum = 0.0;
#pragma omp parallel for reduction(+:sum)
for (int i = 0; i < rows; i++) sum = sum + vec[i];
return sum;
}
|