Florian Rappl, Department of Theoretical Physics, University of Regensburg
// Serial reference version: adds (i + add) for i = 0..99 and prints the total.
//
// argv[1] is parsed with atoi() as the integer offset `add`.
// Returns 0 on success, 1 when the offset argument is missing.
int main(int argc, char **argv) {
    // Without this guard argv[1] is a null pointer and atoi() would dereference it.
    if (argc < 2) {
        fprintf(stderr, "Usage: %s ADD\n", argc > 0 ? argv[0] : "program");
        return 1;
    }

    const int add = atoi(argv[1]);
    int sum = 0;

    // Kept as an explicit loop on purpose: this is the baseline the
    // parallel variants are compared against.
    for (int i = 0; i < 100; i++) {
        sum += i + add;
    }

    printf("Result = %d\n", sum);
    return 0;
}
// OpenMP version: adds (i + add) for i = 0..99 with the loop iterations
// distributed over the OpenMP thread team, then prints the total.
//
// argv[1] is parsed with atoi() as the integer offset `add`.
// Returns 0 on success, 1 when the offset argument is missing.
int main(int argc, char **argv) {
    // Without this guard argv[1] is a null pointer and atoi() would dereference it.
    if (argc < 2) {
        fprintf(stderr, "Usage: %s ADD\n", argc > 0 ? argv[0] : "program");
        return 1;
    }

    const int add = atoi(argv[1]);
    int sum = 0;

    // reduction(+: sum) gives every thread its own partial sum and combines
    // the partial sums into `sum` when the loop finishes, so the
    // accumulation below does not race.
#pragma omp parallel for reduction(+: sum)
    for (int i = 0; i < 100; i++) {
        sum += i + add;
    }

    printf("Result = %d\n", sum);
    return 0;
}
; Excerpt of the loop as compiled for the serial program (some instructions elided).
; Control jumps to the condition first, so the body only runs while the test holds.
jmp LOOP_HEAD
LOOP_BODY:
;some moves...
; Add %eax into the stack slot at -8(%rbp) (presumably `sum`, with %eax holding i + add).
addl %eax, -8(%rbp)
; Increment the stack slot at -12(%rbp) by one (presumably the loop counter `i`).
addl $1, -12(%rbp)
LOOP_HEAD:
; Re-enter the body while the slot at -12(%rbp) is <= 99.
cmpl $99, -12(%rbp)
jle LOOP_BODY
;again some moves
; After the loop: call printf (presumably to print the result).
call printf
; Excerpt of the caller side of the OpenMP program (some instructions elided).
; Put the address of OMP_SECTION into %edi, presumably as the function argument
; for the runtime call that follows (TODO confirm against the calling convention).
movl $OMP_SECTION, %edi
call GOMP_parallel_start
;load and move
; The calling thread also invokes OMP_SECTION itself, between the start and end calls.
call OMP_SECTION
call GOMP_parallel_end
;some moves
; After the parallel region: call printf (presumably to print the result).
call printf
; Excerpt of the outlined parallel-region body (some instructions elided).
OMP_SECTION:
LOOP_HEAD:
; Query the team size and this thread's index; presumably used below to work out
; which part of the iteration range this thread handles.
call omp_get_num_threads
call omp_get_thread_num
; moves and shifts
; Signed divide by %ebx, then multiply %edx by %ebx (presumably chunk-size / bounds arithmetic).
idivl %ebx
imull %ebx, %edx
; Compare %edx against 100 (presumably clamping against the loop's upper limit).
cmpl $100, %edx
; further computations
; Skip the loop entirely on "greater or equal" (presumably when this thread has no iterations).
jge LOOP_BARRIER
LOOP_BODY:
; some moves
; Add the slot at -20(%rbp) (presumably the thread's `i`) into %eax, then accumulate
; %eax into the slot at -24(%rbp) (presumably the thread-private partial sum).
addl -20(%rbp), %eax
addl %eax, -24(%rbp)
; Increment the counter slot and loop while it is below %edx (presumably the chunk's end).
addl $1, -20(%rbp)
cmpl %edx, -20(%rbp)
jl LOOP_BODY
LOOP_BARRIER:
; some moves again
; Atomically (lock prefix) add %eax into the memory %rdx points to; presumably this
; merges the thread's partial sum into the shared reduction variable.
lock addl %eax, (%rdx)
omp_set_nested, omp_get_nested
omp_set_schedule, omp_get_schedule
// Cilk Plus version: adds (i + add) for i = 0..99 inside a _Cilk_for loop,
// accumulating through a reducer_opadd<int>, then prints the total.
//
// argv[1] is parsed with atoi() as the integer offset `add`.
// Returns 0 on success, 1 when the offset argument is missing.
int main(int argc, char **argv) {
    // Without this guard argv[1] is a null pointer and atoi() would dereference it.
    if (argc < 2) {
        fprintf(stderr, "Usage: %s ADD\n", argc > 0 ? argv[0] : "program");
        return 1;
    }

    const int add = atoi(argv[1]);

    // Direct-initialised so this compiles whether or not the reducer's
    // constructor is explicit or the reducer type is copyable.
    reducer_opadd<int> sum(0);

    _Cilk_for(int i = 0; i < 100; i++) {
        sum += i + add;
    }

    // get_value() reads the accumulated total once the loop has finished.
    printf("Result = %d\n", sum.get_value());
    return 0;
}
cilk_spawn, cilk_sync and cilk_for
#pragma cilk_grainsize
set_worker_count()
// Cilk Plus array-notation example. NOTE(review): sections are read here as
// base[start:length] — confirm against the array-notation specification.
// Presumably a and b hold n elements each and c holds 2*n-1 (a product of two
// length-n coefficient sequences); none of the declarations are visible here.
c[0:2*n-1] = 0; // Zero the section of c that receives the result
// For each a[i], add a[i] times the n-element section of b into the
// n-element section of c that starts at index i.
for (size_t i=0; i<n; ++i)
c[i:n] += a[i]*b[0:n];
// TBB version: adds (i + add) for i = 0..99 using parallel_reduce over a
// blocked_range<int>, then prints the total.
//
// argv[1] is parsed with atoi() as the integer offset `add`.
// Returns 0 on success, 1 when the offset argument is missing.
int main(int argc, char **argv) {
    // Without this guard argv[1] is a null pointer and atoi() would dereference it.
    if (argc < 2) {
        fprintf(stderr, "Usage: %s ADD\n", argc > 0 ? argv[0] : "program");
        return 1;
    }

    const int add = atoi(argv[1]);

    const int sum = parallel_reduce(blocked_range<int>(0, 100), 0,
        // Range body: folds one sub-range into the running value `local`.
        // `add` has to be captured explicitly; an empty capture list cannot see it.
        [add](const blocked_range<int>& r, int local) -> int {
            for (int i = r.begin(); i != r.end(); ++i) {
                local += i + add;
            }
            // Return only after the whole sub-range is processed (and also
            // for an empty sub-range, where the loop body never runs).
            return local;
        },
        // Join step: combines the results of two sub-ranges.
        [](int x, int y) -> int {
            return x + y;
        });

    printf("Result = %d\n", sum);
    return 0;
}
concurrent_vector
compare_and_swap
There are 3 rules to follow when parallelizing large codes. Unfortunately, no one knows what these rules are.
W. Somerset Maugham, Gary Montry