Florian Rappl, Department of Theoretical Physics, University of Regensburg
/*
 * Sequential baseline: sums (i + add) for i in [0, 100) and prints the result.
 *
 * Fixes vs. the slide version:
 *  - main must return int (`void main` is ill-formed in C and C++),
 *  - guard against a missing command-line argument before touching argv[1].
 */
int main(int argc, char **argv) {
    if (argc < 2) {  // avoid dereferencing a missing argv[1]
        fprintf(stderr, "usage: %s <add>\n", argv[0]);
        return 1;
    }
    const int add = atoi(argv[1]);
    int sum = 0;
    for (int i = 0; i < 100; i++) {
        sum += i + add;
    }
    printf("Result = %d\n", sum);
    return 0;
}
/*
 * OpenMP version of the reduction example: the loop iterations are split
 * across a thread team; each thread accumulates a private copy of `sum`,
 * and the reduction(+) clause combines the private copies at the end.
 *
 * Fixes vs. the slide version:
 *  - `#pragma omp ...` was fused into the middle of the line; a pragma must
 *    occupy its own line immediately before the loop it applies to,
 *  - main must return int,
 *  - guard against a missing command-line argument.
 */
int main(int argc, char **argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s <add>\n", argv[0]);
        return 1;
    }
    int sum = 0;
    const int add = atoi(argv[1]);
    // reduction(+: sum) gives each thread a zero-initialized private sum
    // and adds them together after the loop — no data race on `sum`.
    #pragma omp parallel for reduction(+: sum)
    for (int i = 0; i < 100; i++) {
        sum += i + add;
    }
    printf("Result = %d\n", sum);
    return 0;
}
; Codegen of the sequential loop (x86-64, AT&T syntax).
; -12(%rbp) is the loop counter i (incremented by 1, compared against 99);
; -8(%rbp) is presumably `sum` — TODO confirm against the full listing.
        jmp LOOP_HEAD             ; test the condition before the first iteration
LOOP_BODY:
        ;some moves...
        addl %eax, -8(%rbp)       ; sum += value staged in %eax (i + add)
        addl $1, -12(%rbp)        ; i++
LOOP_HEAD:
        cmpl $99, -12(%rbp)       ; continue while i <= 99, i.e. i < 100
        jle LOOP_BODY
        ;again some moves
        call printf               ; print the final result
; Transformed main after OpenMP outlining: the loop body now lives in the
; function OMP_SECTION, and the GNU OpenMP runtime forks/joins a thread
; team around it.
        movl $OMP_SECTION, %edi   ; first argument: pointer to the outlined function
        call GOMP_parallel_start  ; spawn the team; workers start running OMP_SECTION
        ;load and move
        call OMP_SECTION          ; the master thread executes its share as well
        call GOMP_parallel_end    ; join the team (implicit barrier)
        ;some moves
        call printf               ; print the combined result
; Outlined parallel region: each thread derives its own iteration chunk
; from its thread id, sums it privately, then folds the private sum into
; the shared reduction variable with one atomic add.
OMP_SECTION:
LOOP_HEAD:
        call omp_get_num_threads  ; team size
        call omp_get_thread_num   ; this thread's id
        ; moves and shifts
        idivl %ebx                ; presumably chunk size = iterations / team size — TODO confirm
        imull %ebx, %edx          ; presumably chunk start = id * chunk size — TODO confirm
        cmpl $100, %edx           ; chunk start past the iteration space?
        ; further computations
        jge LOOP_BARRIER          ; then nothing to do; go straight to the reduction
LOOP_BODY:
        ; some moves
        addl -20(%rbp), %eax      ; stage i (+ add) in %eax
        addl %eax, -24(%rbp)      ; accumulate into the thread-private partial sum
        addl $1, -12(%rbp)        ; NOTE(review): slide shows addl $1, -20(%rbp) — see below
        cmpl %edx, -20(%rbp)      ; continue while local i < chunk end
        jl LOOP_BODY
LOOP_BARRIER:
        ; some moves again
        lock addl %eax, (%rdx)    ; atomically add the private sum to the shared result
omp_set_nested, omp_get_nestedomp_set_schedule, omp_get_schedulevoid main(int argc, char **argv) {reducer_opadd<int> sum = 0;int add = atoi(argv[1]);_Cilk_for(int i = 0; i < 100; i++) {sum += i + add;}printf("Result = %d\n", sum.get_value());}
// Cilk Plus keyword set (lower-case aliases of _Cilk_spawn etc.):
cilk_spawn, cilk_sync and cilk_for
// Loop chunking hint and runtime worker control:
#pragma cilk_grainsize
set_worker_count()
// Array-notation example — polynomial multiply; section syntax is
// array[start:length]. c holds the 2n-1 result coefficients.
c[0:2*n-1] = 0; //Initialize
for (size_t i=0; i<n; ++i)
    // rank-n section update: c[i..i+n-1] += a[i] * b[0..n-1]
    c[i:n] += a[i]*b[0:n];
// TBB version of the reduction example using parallel_reduce:
// the range [0, 100) is recursively split; the first lambda sums one
// subrange on top of the running value `local`, the second lambda joins
// two partial sums.
//
// Fixes vs. the slide version:
//  - main must return int,
//  - the subrange lambda used `add` but captured nothing — it must
//    capture `add` (here by value),
//  - `return local;` sat INSIDE the for loop, so every subrange
//    contributed only its first element; the return belongs after the loop.
int main(int argc, char **argv) {
    const int add = atoi(argv[1]);
    int sum = parallel_reduce(
        blocked_range<int>(0, 100), 0,
        [add](const blocked_range<int>& r, int local) -> int {
            for (int i = r.begin(); i != r.end(); i++) {
                local += i + add;
            }
            return local;  // return the subrange total, not the first term
        },
        [](int x, int y) -> int { return x + y; });  // join two partial sums
    printf("Result = %d\n", sum);
    return 0;
}
concurrent_vector, compare_and_swap

"There are 3 rules to follow when parallelizing large codes. Unfortunately, no one knows what these rules are."
— W. Somerset Maugham, Gary Montry