openacc.examples
- Date:
02-09-2023
NAME
OpenACC.EXAMPLES - Example OpenACC codes
DESCRIPTION
C
Example 1: pointer passing for non-global arrays
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
int n=10;
// This example shows usage of global arrays in calc_gl()
// and pointer passing for non-global arrays in calc().
float a_gl[100],b_gl[100],c_gl[100];
int calc_gl ( )
{
// Illustrate present clauses with conditional data sharing clauses,
// global arrays, parallel regions in functions.
//
// In this case, calls to calc_gl are within a data region, so
// arrays are present and there will be no copies at the region directives.
//
// If the function is called outside the acc_data region the compiler
// does a check for the arrays being present on the accelerator, finds
// them absent and does the necessary transfers according to the
// data_sharing clauses.
//
// When ipa inlines the functions the compiler optimizes the regions and
// no checks are done for the presence of the arrays - it can tell they
// have already been transferred to the accelerator.
// There is one set of transfers at the acc_data pragma
// and one at the end of the acc_data region.
//
// When ipa does not inline the routines the compiler does a check for
// the arrays being present on the accelerator at the acc_region directives,
// finds them present and does not do any transfers.
//
int i;
#pragma acc parallel present_or_copyin(a_gl, b_gl) present_or_copyout(c_gl)
{
#pragma acc loop
for (i=0; i<n; i++ ) {
c_gl[i] = a_gl[i] + b_gl[i] + c_gl[i];
}
}
return (0);
}
// Declare similar function referencing non-global arrays.
// Assumes pointers to the first array element. Arrays require
// indexing based on start and length.
int calc (long start, long length, float *a, float *b, float *c )
{
int i;
// Assume arrays must be copied to/from accelerator.
// Data is transferred to/from accelerator at start and at completion
// of data region
#pragma acc data copyin(a[start:length], b[start:length] ) copy( c[start:length] )
{
#pragma acc parallel
{
#pragma acc loop
for (i=0; i<length; i++ ) {
*(c+start+i) = *(a+start+i) + *(b+start+i) + *(c+start+i);
}
} /* End of loop region */
printf("\nInside data region used by calc()\n");
printf("a[1] b[1] c[1] = %f %f %f \n", *(a+start+1), *(b+start+1), *(c+start+1));
} /* End of data region */
printf("\nOutside of data region used by calc()\n");
printf("a[1] b[1] c[1] = %f %f %f \n", *(a+start+1), *(b+start+1), *(c+start+1));
return(0);
}
int main ()
{
int rv;
long m=n;
float *a,*b,*c;
long i, start, length;
/* allocate the pointers */
a = (float*)malloc(m*sizeof(float) );
b = (float*)malloc(m*sizeof(float) );
c = (float*)malloc(m*sizeof(float) );
// Initialize arrays
for (i = 0; i < m; i++ ) {
*(a+i) = a_gl[i] = i;
*(b+i) = b_gl[i] = 2*i;
*(c+i) = c_gl[i] = 0;
}
start = 0; length = m;
// Pass array pointers for non-global arrays
for (i = 0; i< length; i++) {
#pragma _CRI inline_never calc
rv = calc (start,length,a,b,c);
}
// Have the compiler move data once to be used across multiple function calls
// Data is transferred to/from accelertor at start and at completion
// of parallel region. Using global arrays.
#pragma acc data copy(c_gl) copyin(a_gl, b_gl )
{
for (i=0; i<length; i++ ) {
rv = calc_gl( );
}
printf("\nInside data region used by calc_gl\n");
printf("a_gl[1] b_gl[1] c_gl[1] = %f %f %f\n",a_gl[1],b_gl[1],c_gl[1] );
} /* end of data region */
printf("\nOutside data region used by calc_gl\n");
printf("a_gl[1] b_gl[1] c_gl[1] = %f %f %f\n",a_gl[1],b_gl[1],c_gl[1] );
return(0);
}
Example 2: shape pointer-to-pointer arrays
/*
* This example shows how to shape "pointer-to-pointer" arrays in
* C/C++. The shape syntax is identical to true multidimensional
* arrays. The compiler will automatically perform the necessary
* deep copy to transfer all "arrays-of-pointers" and
* "arrays-of-data". According to the OpenACC spec, it is
* illegal to modify an array-of-pointers within a data region.
*
* This feature may be used for multidimensional arrays with any
* number and combination of array and pointer dimensions, as long as
* the shape is "rectangular". That is, every pointer dimension must
* have a consistent shape across the entire array.
*/
#include <stdio.h>
#include <stdlib.h>
int main(int argc, char *argv[]) {
const int M = (argc > 1 ? atoi(argv[1]) : 8);
const int N = (argc > 2 ? atoi(argv[2]) : 64);
float **x;
x = (float**)calloc(M, sizeof(x));
for (int i = 0 ; i < M ; i++) {
x[i] = (float*)calloc(N, sizeof(x[0]));
}
#pragma acc parallel loop copy(x[0:M][0:N])
for (int i = 0 ; i < M ; i++) {
#pragma acc loop
for (int j = 0 ; j < N ; j++) {
x[i][j] = x[i][j] + 1;
}
}
int sum = 0.0;
for (int i = 0 ; i < M ; i++)
for (int j = 0 ; j < N ; j++)
sum += abs((int)x[i][j]);
printf("result = %d\n", sum);
if ( sum != M*N ) {
printf("FAIL\n");
return 1;
}
printf("PASS\n");
return 0;
}
Example 3: manual deep copy
/* The OpenACC data directives apply to contiguous regions of
* data that can be described by a base address and a size. The
* implementation can just copy each region and update the base
* pointer for use in accelerator regions. However, this strategy
* only works for "flat" data structures that only contain raw data,
* which is interpreted the same on both the host and accelerator.
* This strategy breaks down for "disjoint data structures", where a
* data structure contains pointers to other data structures, because
* host pointers are invalid on the accelerator and accelerator
* pointers are invalid on the host. Indeed, the OpenACC standard
* specifies that "if a struct or class member is a pointer type, the
* data addressed by that pointer are not implicitly copied." That
* is, OpenACC supports "shallow" copy.
*
* There are essentially two steps for copying disjoint data
* structures:
*
* (1) Copy each disjoint object independently
* (2) Fix pointers to restore relationships between objects
*
*/
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
/* Include this file for Cray OpenACC API */
#include <openacc.h>
struct A {
struct B *x; /* pointer to array of struct B */
int n; /* number of elements in x */
};
struct B {
double *y; /* pointer to array of double */
int n; /* number of elements in y */
};
/* This convenience macro allows us to fix a pointer in accelerator
memory, essentially performing a memcpy the size of a pointer to
write the appropriate address. */
#define SET_ACC_PTR(acc_ptr, acc_target) (acc_memcpy_to_device( &(acc_ptr), &(acc_target), sizeof(void*) ))
/* This routine performs a "deep" copy of an array of struct A
* objects. It allocates and copies each struct A object, the struct
* B objects pointed to by struct A objects, and the double arrays
* pointed to by struct B objects. Finally, this routine also fixes
* all of the pointers in device memory to point to the accelerator
* objects rather than the host objects.
*/
struct A* struct_A_copyin(struct A*z, const int n) {
/* copyin z[0:n] */
struct A*acc_z = (struct A*)acc_copyin( z, n*sizeof(struct A) );
for (int i = 0 ; i < n ; i++) {
/* copyin z[i].x[0:z[i].n] */
struct B*acc_x = (struct B*)acc_copyin( z[i].x, z[i].n*sizeof(struct B) );
/* fix acc pointer acc_z[i].x */
SET_ACC_PTR(acc_z[i].x, acc_x);
for (int j = 0 ; j < z[i].n ; j++) {
/* copyin z[i].x[j].y[0:z[i].x[j].n] */
double*acc_y = (double*)acc_copyin( z[i].x[j].y, z[i].x[j].n*sizeof(double) );
/* fix acc pointer acc_x[j].y */
SET_ACC_PTR(acc_x[j].y, acc_y);
}
}
return acc_z;
}
/* This routine performs a deep copy from acc to host, updating a
* host disjoint data structure to match its corresponding acc
* disjoint data structure. The host objects are overwritten
* entirely with acc objects, destroying the host pointers. However,
* then we look up the acc pointers with the acc_hostptr library
* function to find the corresponding host pointers, allowing us to
* fix up the pointers.
*/
void struct_A_update_host(struct A*z, const int n) {
/* update z[0:n] */
acc_update_self( (void*)z, n*sizeof(struct A) );
for (int i = 0 ; i < n ; i++) {
/* fix acc pointer acc_z[i].x */
z[i].x = (struct B*)(acc_hostptr((void*)z[i].x));
/* update z[i].x[0:z[i].n] */
acc_update_self( (void*)z[i].x, z[i].n*sizeof(struct B) );
for (int j = 0 ; j < z[i].n ; j++) {
/* fix acc pointer acc_x[j].y */
z[i].x[j].y = (double *)(acc_hostptr((void*)z[i].x[j].y));
/* update z[i].x[j].y[0:z[i].x[j].n] */
acc_update_self( (void*)z[i].x[j].y, z[i].x[j].n*sizeof(double) );
}
}
}
/* This routine performs a deep copy from host to accelerator,
* updating an accelerator disjoint data structure to match its
* corresponding host disjoint data structure. The accelerator
* objects are overwritten entirely with host objects, destroying the
* accelerator pointers. However, then we look up the host pointers
* with the acc_hostptr library function to find the
* corresponding accelerator pointers, allowing us to fix up the
* pointers.
*/
void struct_A_update_device(struct A*z, const int n) {
/* update z[0:n] */
acc_update_device( (void*)z, n*sizeof(struct A) );
struct A*acc_z = (struct A*)acc_deviceptr((void*)z);
for (int i = 0 ; i < n ; i++) {
/* update z[i].x[0:z[i].n] */
acc_update_device( (void*)z[i].x, z[i].n*sizeof(struct B) );
struct B*acc_x = (struct B*)acc_deviceptr( (void*)z[i].x );
/* fix acc pointer acc_z[i].x */
SET_ACC_PTR(acc_z[i].x, acc_x);
for (int j = 0 ; j < z[i].n ; j++) {
/* update z[i].x[j].y[0:z[i].x[j].n] */
acc_update_device( (void*)z[i].x[j].y, z[i].x[j].n*sizeof(double) );
double*acc_y = (double*)acc_deviceptr( (void*)z[i].x[j].y );
/* fix acc pointer acc_x[j].y */
SET_ACC_PTR(acc_x[j].y, acc_y);
}
}
}
/* This routine performs a deep free on the disjoint data structure,
* freeing all of the double arrays, struct B objects, and struct A
* objects.
*/
void struct_A_free(struct A*z, const int n) {
for (int i = 0 ; i < n ; i++) {
for (int j = 0 ; j < z[i].n ; j++) {
acc_delete( (void*)z[i].x[j].y, z[i].x[j].n*sizeof(double) );
}
acc_delete( (void*)z[i].x, z[i].n*sizeof(struct B) );
}
acc_delete( (void*)z, n*sizeof(struct A) );
}
/* This function iterates over a disjoint data structure and computes
* a sum reduction over the elements in the double arrays. This
* function executes on the host.
*/
double host_sum(const struct A*z, const int n) {
double sum = 0.0;
for (int i = 0 ; i < n ; i++) {
for (int j = 0 ; j < z[i].n ; j++) {
for (int k = 0 ; k < z[i].x[j].n ; k++) {
sum += z[i].x[j].y[k];
}
}
}
return sum;
}
/* This function iterates over a disjoint data structure and computes
* a sum reduction over the elements in the double arrays. This
* function executes on the accelerator, so it expects that the
* entire data structure has already been copied to the accelerator
* prior to invocation.
*/
double acc_sum(const struct A*z, const int n) {
double sum = 0.0;
/* The disjoint data structure is expected to be present */
#pragma acc parallel loop reduction(+:sum) present(z[0:n])
for (int i = 0 ; i < n ; i++) {
for (int j = 0 ; j < z[i].n ; j++) {
for (int k = 0 ; k < z[i].x[j].n ; k++) {
sum += z[i].x[j].y[k];
}
}
}
return sum;
}
/* Zero out all double elements in the host copy of the disjoint data
* structure.
*/
void host_zero(struct A*z, const int n) {
for (int i = 0 ; i < n ; i++) {
for (int j = 0 ; j < z[i].n ; j++) {
for (int k = 0 ; k < z[i].x[j].n ; k++) {
z[i].x[j].y[k] = 0.0;
}
}
}
}
/* Zero out all double elements in the accelerator copy of the
* disjoint data structure.
*/
void acc_zero(struct A*z, const int n) {
/* The disjoint data structure is expected to be present */
#pragma acc parallel loop present(z[0:n])
for (int i = 0 ; i < n ; i++) {
for (int j = 0 ; j < z[i].n ; j++) {
for (int k = 0 ; k < z[i].x[j].n ; k++) {
z[i].x[j].y[k] = 0.0;
}
}
}
}
int main() {
int n = 4;
int count = 0;
/* Allocate and initialize a disjoint data structure */
struct A *z;
/* The top-level is an array of struct A objects */
z = (struct A*)calloc(n, sizeof(struct A));
for (int i = 0 ; i < n ; i++) {
z[i].n = n;
/* Each struct A object points to an array of struct B objects */
z[i].x = (struct B*)calloc(n, sizeof(struct B));
for (int j = 0 ; j < z[i].n ; j++) {
z[i].x[j].n = n;
/* Each struct B object points to an array of doubles */
z[i].x[j].y = (double *)calloc(n, sizeof(double));
for (int k = 0 ; k < z[i].x[j].n ; k++) {
z[i].x[j].y[k] = ++count;
}
}
}
/* Compute and print expected sum */
printf("expected = %g\n", (double)count * ((double)count + 1.0) / 2.0);
/* Compute and print host sum */
printf("host_sum = %g\n", host_sum(z, n));
/* Deep-copy the disjoint data structure to the accelerator */
printf("deep copy from host to acc\n");
struct_A_copyin(z, n);
/* Compute and print accelerator sum */
printf("acc_sum = %g\n", acc_sum(z, n));
/* Zero out host data */
printf("zero host data\n");
host_zero(z, n);
/* Recompute and print host sum */
printf("host_sum = %g\n", host_sum(z, n));
/* Restore host data by copying from accelerator */
printf("deep copy from acc to host\n");
struct_A_update_host(z, n);
/* Recompute and print host sum */
printf("host_sum = %g\n", host_sum(z, n));
/* Zero out acc data */
printf("zero acc data\n");
acc_zero(z, n);
/* Recompute and print acc sum */
printf("acc_sum = %g\n", acc_sum(z, n));
/* Restore acc data by copping from host */
printf("deep copy from host to acc\n");
struct_A_update_device(z, n);
/* Recompute and print acc sum */
printf("acc_sum = %g\n", acc_sum(z, n));
/* Free accelerator memory */
printf("deep free\n");
struct_A_free(z, n);
/* Free host memory */
for (int i = 0 ; i < n ; i++) {
for (int j = 0 ; i < z[i].n ; j++) {
z[i].x[j].n = 0;
free(z[i].x[j].y);
}
z[i].n = 0;
free(z[i].x);
}
free(z);
return 0;
}
Example 4: #pragma acc routine
/*
* This example shows how to use the "#pragma acc routine" directive
* to compile entire functions for a device and make function calls in
* compute regions. Each acc routine specifies the type of
* worksharing used in that routine.
*/
#include <stdio.h>
#include <stdlib.h>
#pragma acc routine seq
float func4(float x) {
return x + 1;
}
#pragma acc routine vector
void func3(int N, float *x) {
#pragma acc loop vector
for (int i = 0 ; i < N ; i++)
x[i] = func4(x[i]);
}
#pragma acc routine worker
void func2(int N, float **x) {
#pragma acc loop worker
for (int i = 0 ; i < N ; i++)
func3(N, x[i]);
}
#pragma acc routine gang
void func1(int N, float ***x) {
#pragma acc loop gang
for (int i = 0 ; i < N ; i++)
func2(N, x[i]);
}
int main() {
const int N = 8;
float ***x;
x = (float***)calloc(N, sizeof(x));
for (int i = 0 ; i < N ; i++) {
x[i] = (float**)calloc(N, sizeof(x[0]));
for (int j = 0 ; j < N ; j++)
x[i][j] = (float*)calloc(N, sizeof(x[0][0]));
}
#pragma acc parallel copy(x[0:N][0:N][0:N])
{
func1(N, x);
}
int sum = 0.0;
for (int i = 0 ; i < N ; i++)
for (int j = 0 ; j < N ; j++)
for (int k = 0 ; k < N ; k++)
sum += abs((int)x[i][j][k]);
printf("result = %d\n", sum);
if ( sum != N*N*N ) {
printf("FAIL\n");
return 1;
}
printf("PASS\n");
return 0;
}
Example 5: wait for dependencies
/*
* Using wait to handle complex accelerator region
* dependencies.
*
* This example shows how asynchronous waits can be used
* to execute a region asynchronously that is dependent
* on calculations done on several async queues without
* requiring a wait on the host.
*/
#include <stdio.h>
#define SIZE 1000
int main( )
{
long a[SIZE];
long b[SIZE];
long c[SIZE];
long d[SIZE];
long e[SIZE];
int i;
#pragma acc data create(a,b,c,d)
{
/*
* Calculate a, b, c on different async queues so that
* the calculations can be done in parallel.
*/
#pragma acc parallel loop async(1)
for (i=0;i<SIZE;++i) {
a[i] = i;
}
#pragma acc parallel loop async(2)
for (i=0;i<SIZE;++i) {
b[i] = i*2;
}
#pragma acc parallel loop async(3)
for (i=0;i<SIZE;++i) {
c[i] = i*3;
}
/*
* Calculate d from a,b,c making sure to wait for b and
* c to complete. The calculation of a is guaranteed to
* complete before this can execute since it uses the
* same async id. Note since this is done with a valid
* async id, the parallel region can be enqueued but the
* host is not required to wait at any point yet.
*/
#pragma acc parallel loop async(1) wait(2,3)
for (i=0;i<SIZE;++i) {
d[i] = a[i]+b[i]+c[i];
}
/*
* Start transfering d back to the host, but don't wait for the
* transfer to complete.
*/
#pragma acc update self(d) async(1)
/*
* Perform some expensive calculation on the host. Since all the previous
* parallel and updates are asynchronous this can be done in parallel with
* all the previous accelerator operations.
*/
for (i=0;i<SIZE;++i) {
e[i] = i*i;
}
/*
* Make sure the update (and all previous accelerator operations) have completed
* before doing anything else, including ending the data region.
*/
#pragma acc wait(1)
}
printf( "d[10]=%ld e[10]=%ld\n",d[10],e[10]);
return 0;
}
Example 6: create identifiers for dependent operations
/*
* Using wait to create distinct identifiers for
* a series of dependent operations.
*
* This example shows how asynchronous waits can be used
* to execute a series of steps asynchronously that are dependent
* on the previous step but with a unique async identifier for
* each step. This allows waiting for a specific step in the
* process without waiting for the whole process.
*/
#include <stdio.h>
#define SIZE 1000
#define STEPS 10
int main( )
{
long a[SIZE];
long b[SIZE];
long c[SIZE];
int i,step;
for (i=0;i<SIZE;++i) {
c[i]= 0;
}
#pragma acc data create(a,b) copy(c)
{
/*
* Pipeline a series of dependent asynchronous operations on the device
*
* time 0: step 0 stage 1
* time 1: step 0 stage 2
* time 2: step 0 stage 3 step 1 stage 1
* time 3: step 0 stage 4 step 1 stage 2
* time 4: step 1 stage 3 step 2 stage 1
* time 5: step 1 stage 4 step 2 stage 2
*
*/
for (step=0;step<STEPS;++step) {
/*
* Must wait for stage 2 of the previous iteration to finish before
* we start modifying a on the host.
*/
#pragma acc wait(1)
/*
* Stage 1: some operation on the host
* Can overlap with stage 3 and 4 of previous iteration
*/
a[0] = step*step;
for (i=1;i<SIZE;++i) {
a[i] = i*a[i-1];
}
/*
* Must wait for stage 3 of the previous iteration to finish before
* we start modifying a on the device. Note this wait could
* be added to stage 2 directly.
*/
#pragma acc wait(2)
/*
* Stage 2: update the device copy of a
* Can overlap with stage 4 or previous iteration
*/
#pragma acc update device(a) async(1)
/*
* Must wait for stage 4 of the previous iteration to finish before
* we can start modifying b on the device. Note this wait could
* be added to stage 3 directively.
*/
#pragma acc wait(3)
/*
* Stage 3: use a on the device
*/
#pragma acc parallel loop async(2) wait(1)
for (i=0;i<SIZE;++i) {
b[i] = a[i]-step;
}
/*
* Step 4: a calculation that doesn't use a
*/
#pragma acc parallel loop async(3) wait(2)
for (i=0;i<SIZE;++i) {
c[i] += b[i];
}
}
/*
* Wait for c to be calculated before exiting the data region
*/
#pragma acc wait(3)
}
printf( "c[10]=%ld\n",c[10]);
return 0;
}
Fortran
Example 7: !$acc parallel loop
PROGRAM test_openacc
IMPLICIT NONE
INTEGER, PARAMETER :: M=1000
INTEGER :: a(M),b(M),c(M)
INTEGER :: j,total,expected
!!$ For simple cases, use parallel loop as a shortcut for
!!$ parallel and loop
!!$ Set a,b,c
!$acc parallel loop
DO j = 1,M
a(j) = j
b(j) = j
c(j) = j
ENDDO
!$acc end parallel loop
!!$ Set b, copy it to host
!$acc parallel copyout(b)
!$acc loop
DO j = 1,M
b(j) = 2*j
ENDDO
!$acc end loop
!$acc end parallel
!!$ Set c, copy it to host
!$acc parallel copyout(c)
!$acc loop
DO j = 1,M
c(j) = -j
ENDDO
!$acc end loop
!$acc end parallel
!!$ Compute a based on b and c, copy a to host
!$acc parallel copyout(a) copyin(b,c)
!$acc loop
DO j = 1,M
a(j) = b(j) + c(j)
ENDDO
!$acc end loop
!$acc end parallel
!!$ Compute a checksum
total = 0
!$acc parallel copyin(a)
!$acc loop reduction(+:total)
DO j = 1,M
total = total + a(j)
ENDDO
!$acc end loop
!$acc end parallel
!!$ The expected result
expected = M*(M+1)/2
PRINT *,"Result: ",total
PRINT *,"Verified: ",(total==expected)
END PROGRAM test_openacc
Example 8: !$acc cache directive
!!$ The example applies loop and blocking directives as recommended.
!!$ Cache directive must be in innermost loop.
!!$ Size of cache must be bounded.
!!$ In this example, blocking directives force a 16x64 tile.
!!$
!!$ Compile with option -rmd. Note cache messages appearing in *.lst file.
subroutine example( A, B, ni, nj, nk )
integer ni, nj, nk
integer A(ni, nj, nk)
integer B(0:ni+1, 0:nj+1, nk)
!$acc parallel copyout(A), copyin(B)
!$acc loop gang
do k=1,nk
!dir$ blockable( i, j )
!$acc loop worker
!dir$ blockingsize ( 16 )
do j=1,nj
!$acc loop vector
!dir$ blockingsize ( 64 )
do i=1,ni
!$acc cache( A(i,j,k), B(i-1:i+1,j-1:j+1,k) )
A(i,j,k) = B(i,j,k) - &
( B(i-1, j-1, k) &
+ B(i-1, j+1, k) &
+ B(i+1, j-1, k) &
+ B(i+1, j+1, k) ) / 5
end do ! i
end do ! j
end do ! k
!$acc end parallel
end subroutine example
Example 9: !$acc parallel loop directive
program main
integer(8), parameter :: n = 100
integer(8) :: i
real(8) :: A(n), B(n), C(n)
real(8) :: expected, actual
A=0 ; B=1 ; C=2 ;
expected = sum(B+C)
!$acc parallel loop copyout(A) copyin(B,C)
do i=1,n
A(i) = B(i) + C(i)
end do
actual = sum(A)
print *, 'expected = ', expected
print *, 'actual = ', actual
if ( actual .ne. expected ) then
print *, "FAIL"
call exit(-1)
else
print *, "PASS"
call exit(0)
end if
end program main
Example 10: using OPENACC_LIB
program main
! Using openacc C runtime library from Fortran code.
! This example is functionally equivalent to Example 7,
! but runtime library routines are used to
! perform the data transfers.
use openacc_lib
integer(8), parameter :: n = 100
integer(8) :: i
real(8) :: A(n), B(n), C(n)
real(8) :: expected, actual
A=0 ; B=1 ; C=2 ;
expected = sum(B+C)
! Use functions to allocate and transfer data in
call acc_create ( A )
! could also use 'all acc_create( A(1), 8*n )'
call acc_copyin ( B )
! could also use 'call acc_copyin( B(1), 8*n )'
call acc_copyin ( C )
! could also use 'call acc_copyin( C(1), 8*n)'
! Specify A,B,C as "present", since we already took care of
! allocating and copying
!$acc parallel loop present(A,B,C)
do i=1,n
A(i) = B(i) + C(i)
end do
! Use functions to transfer out and delete data
call acc_copyout ( A )
call acc_delete ( B )
call acc_delete ( C )
actual = sum(A)
print *, 'expected = ', expected
print *, 'actual = ', actual
if ( actual .ne. expected ) then
print *, "FAIL"
call exit(-1)
else
print *, "PASS"
call exit(0)
end if
end program main
Example 11: !#acc routine directive
! This example shows how to use the "!$acc routine" directive to
! compile entire functions for a device and make function calls in
! compute regions. Each acc routine specifies the type of worksharing
! used in that routine.
subroutine func4(x)
!$acc routine seq
real :: x
integer :: i
x = x + 1
end subroutine func4
subroutine func3(N, x)
!$acc routine vector
!$acc routine(func4) seq
real :: x(N)
integer :: N, i
!$acc loop vector
do i=1,N
call func4(x(i))
end do
end subroutine func3
subroutine func2(N, x)
!$acc routine worker
!$acc routine(func3) vector
real :: x(N,N)
integer :: N, i
!$acc loop worker
do i=1,N
call func3(N, x(:,i))
end do
end subroutine func2
subroutine func1(N, x)
!$acc routine gang
real :: x(N,N,N)
integer :: N, i
!$acc routine(func2) worker
!$acc loop gang
do i=1,N
call func2(N, x(:,:,i))
end do
end subroutine func1
program main
!$acc routine(func1) gang
integer, parameter :: N = 8
real :: x(N,N,N)
x = 0
!$acc parallel copy(x)
call func1(N, x)
!$acc end parallel
print *, 'result =', sum(abs(x))
if ( sum(abs(x)) .ne. N*N*N ) then
print *, 'FAIL'
call exit(1)
else
print *, 'PASS'
call exit(0)
end if
end program main