openacc.examples

Date:: 02-09-2023

NAME

OpenACC.EXAMPLES - Example OpenACC codes

DESCRIPTION

C

Example 1: pointer passing for non-global arrays

#include <stdlib.h>
#include <stdio.h>
#include <math.h>
// Number of array elements processed by calc_gl() and by the driver in main().
int n=10;

//  This example shows usage of global arrays in calc_gl()
//  and pointer passing for non-global arrays in calc().

// Global arrays read and written by calc_gl() and initialized in main().
float  a_gl[100],b_gl[100],c_gl[100];

int calc_gl ( )
{

//    Accumulates a_gl[i] + b_gl[i] into c_gl[i] for the first n elements
//    of the global arrays, inside an accelerator parallel region.
//    Always returns 0.
//
//    Illustrates present-or data clauses with global arrays and a
//    parallel region inside a function.
//
//    When calc_gl is called inside an enclosing data region that already
//    holds the arrays, they are found present and no copies are made at
//    the parallel directive.
//
//    When it is called outside any data region, the arrays are found
//    absent and are transferred according to the clauses below.  The loop
//    reads c_gl as well as writing it, so c_gl has to be copied in as
//    well as out: present_or_copy is used instead of present_or_copyout,
//    which would leave the device copy of c_gl uninitialized on entry.
//
//    When ipa inlines the function the compiler optimizes the regions and
//    no checks are done for the presence of the arrays - it can tell they
//    have already been transferred to the accelerator.  There is one set
//    of transfers at the start of the enclosing data region and one at
//    its end.
//
//    When ipa does not inline the function the compiler checks for the
//    arrays being present at the parallel directive, finds them present
//    and does not do any transfers.
//

        int i;

#pragma acc parallel present_or_copyin(a_gl, b_gl) present_or_copy(c_gl)
        {
#pragma acc loop
        for (i=0; i<n; i++ ) {
                c_gl[i] =  a_gl[i] + b_gl[i] + c_gl[i];
                }
        }
        return (0);
}

// Declare similar function referencing non-global arrays.
// Assumes pointers to the first array element. Arrays require
// indexing based on start and length.

int calc (long start, long length, float *a, float *b, float *c )
{
// Accumulates a[k] + b[k] into c[k] for k in [start, start+length) on the
// accelerator, then prints the second element of each section from the
// host, once inside and once after the data region.  Always returns 0.
//
//   start  - index of the first element of the section in a, b and c
//   length - number of elements in the section
//   a, b   - input arrays (copied to the accelerator)
//   c      - accumulator array (copied to and from the accelerator)

// Same type as length so the comparison and the indexing cannot truncate.
long i;

// Assume arrays must be copied to/from accelerator.
// Data is transferred to/from accelerator at start and at completion
// of data region

#pragma acc data copyin(a[start:length], b[start:length] ) copy( c[start:length] )
        {
#pragma acc parallel
                {
#pragma acc  loop
                for (i=0; i<length; i++ ) {
                        c[start+i] =  a[start+i] + b[start+i] + c[start+i];
                        }
                } /*  End of parallel region */
    printf("\nInside data region used by calc()\n");
    /* Element 1 of the section only exists when the section has at
       least two elements; skip the print rather than read past the end. */
    if (length > 1) {
        printf("a[1] b[1] c[1]  = %f %f %f \n", a[start+1], b[start+1], c[start+1]);
    }

        } /*  End of data region  */
printf("\nOutside of data region used by calc()\n");
if (length > 1) {
    printf("a[1] b[1] c[1]  = %f %f %f \n", a[start+1], b[start+1], c[start+1]);
}
return(0);
}


/* Driver for Example 1: runs calc() on heap arrays and calc_gl() on the
 * global arrays, printing element 1 of each array inside and outside the
 * data regions.  Returns 0 on success, 1 if host allocation fails. */
int main ()
{
int rv;
long m=n;
float  *a,*b,*c;
long i, start, length;

/* allocate the non-global arrays */
a = (float*)malloc(m*sizeof(float) );
b = (float*)malloc(m*sizeof(float) );
c = (float*)malloc(m*sizeof(float) );
if (a == NULL || b == NULL || c == NULL) {
        fprintf(stderr, "malloc failed\n");
        free(a);
        free(b);
        free(c);
        return(1);
}

//  Initialize the heap arrays and the global arrays with the same values
for (i = 0; i < m; i++ ) {
        *(a+i) = a_gl[i] = i;
        *(b+i) = b_gl[i] = 2*i;
        *(c+i) = c_gl[i] = 0;
}

start = 0; length = m;

// Pass array pointers for non-global arrays.
// Each call opens and closes its own data region inside calc().

for (i = 0; i< length; i++) {
#pragma _CRI inline_never calc
        rv =  calc (start,length,a,b,c);
}

// Have the compiler move data once to be used across multiple function calls.
// Data is transferred to/from accelerator at start and at completion
// of the data region.  Using global arrays.

#pragma acc data copy(c_gl) copyin(a_gl, b_gl )
{
        for (i=0; i<length; i++ ) {
                rv = calc_gl( );
        }
        printf("\nInside data region used by calc_gl\n");
        printf("a_gl[1] b_gl[1] c_gl[1] = %f %f %f\n",a_gl[1],b_gl[1],c_gl[1] );


}  /*  end of data region  */

printf("\nOutside data region used by calc_gl\n");
printf("a_gl[1] b_gl[1] c_gl[1] = %f %f %f\n",a_gl[1],b_gl[1],c_gl[1] );

/* calc() and calc_gl() always return 0; rv is kept only to receive it. */
(void)rv;

free(a);
free(b);
free(c);

return(0);

}

Example 2: shape pointer-to-pointer arrays

/*
 *  This example shows how to shape "pointer-to-pointer" arrays in
 *  C/C++.  The shape syntax is identical to true multidimensional
 *  arrays.  The compiler will automatically perform the necessary
 *  deep copy to transfer all "arrays-of-pointers" and
 *  "arrays-of-data".  According to the OpenACC spec, it is
 *  illegal to modify an array-of-pointers within a data region.
 *
 *  This feature may be used for multidimensional arrays with any
 *  number and combination of array and pointer dimensions, as long as
 *  the shape is "rectangular".  That is, every pointer dimension must
 *  have a consistent shape across the entire array.
 */

#include <stdio.h>
#include <stdlib.h>

/* Driver for Example 2: builds an M x N pointer-to-pointer array of zeros,
 * increments every element on the accelerator, and checks on the host that
 * the elements sum to M*N.
 *
 *   argv[1] - optional row count M (default 8)
 *   argv[2] - optional column count N (default 64)
 *
 * Returns 0 on PASS, 1 on FAIL, bad dimensions, or allocation failure. */
int main(int argc, char *argv[]) {
    const int M = (argc > 1 ? atoi(argv[1]) : 8);
    const int N = (argc > 2 ? atoi(argv[2]) : 64);

    /* A negative count would be converted to a huge size_t by calloc. */
    if (M <= 0 || N <= 0) {
        fprintf(stderr, "M and N must be positive\n");
        return 1;
    }

    /* sizeof(*x) is the row-pointer size; sizeof(**x) is the element size. */
    float **x = (float**)calloc(M, sizeof(*x));
    if (x == NULL) {
        fprintf(stderr, "allocation failed\n");
        return 1;
    }
    for (int i = 0 ; i < M ; i++) {
        x[i] = (float*)calloc(N, sizeof(**x));
        if (x[i] == NULL) {
            /* Exiting the process releases the earlier allocations. */
            fprintf(stderr, "allocation failed\n");
            return 1;
        }
    }

#pragma acc parallel loop copy(x[0:M][0:N])
    for (int i = 0 ; i < M ; i++) {
#pragma acc loop
        for (int j = 0 ; j < N ; j++) {
            x[i][j] = x[i][j] + 1;
        }
    }

    int sum = 0;
    for (int i = 0 ; i < M ; i++)
        for (int j = 0 ; j < N ; j++)
            sum += abs((int)x[i][j]);

    printf("result = %d\n", sum);

    const int status = (sum != M*N) ? 1 : 0;
    printf(status ? "FAIL\n" : "PASS\n");

    for (int i = 0 ; i < M ; i++) {
        free(x[i]);
    }
    free(x);

    return status;
}

Example 3: manual deep copy

/*  The OpenACC data directives apply to contiguous regions of
 *  data that can be described by a base address and a size.  The
 *  implementation can just copy each region and update the base
 *  pointer for use in accelerator regions.  However, this strategy
 *  only works for "flat" data structures that only contain raw data,
 *  which is interpreted the same on both the host and accelerator.
 *  This strategy breaks down for "disjoint data structures", where a
 *  data structure contains pointers to other data structures, because
 *  host pointers are invalid on the accelerator and accelerator
 *  pointers are invalid on the host.  Indeed, the OpenACC standard
 *  specifies that "if a struct or class member is a pointer type, the
 *  data addressed by that pointer are not implicitly copied."  That
 *  is, OpenACC supports "shallow" copy.
 *
 *  There are essentially two steps for copying disjoint data
 *  structures:
 *
 *    (1) Copy each disjoint object independently
 *    (2) Fix pointers to restore relationships between objects
 *
 */

#include <stdlib.h>
#include <stdio.h>
#include <assert.h>

/* Include this file for Cray OpenACC API */
#include <openacc.h>


/* Top level of the disjoint data structure: each struct A points to an
   array of struct B objects. */
struct A {
  struct B *x; /* pointer to array of struct B */
  int n;       /* number of elements in x */
};
/* Second level of the disjoint data structure: each struct B points to an
   array of double values. */
struct B {
  double *y; /* pointer to array of double */
  int n;     /* number of elements in y */
};

/* This convenience macro fixes a pointer that lives in accelerator memory.
   It performs a memcpy, the size of one pointer, from the host variable
   acc_target to the location of acc_ptr, so that acc_ptr ends up holding
   the address stored in acc_target.  Both arguments must be lvalues,
   because the macro takes their addresses. */
#define SET_ACC_PTR(acc_ptr, acc_target) (acc_memcpy_to_device( &(acc_ptr), &(acc_target), sizeof(void*) ))

/*  This routine performs a "deep" copy of an array of struct A
 *  objects.  It allocates and copies each struct A object, the struct
 *  B objects pointed to by struct A objects, and the double arrays
 *  pointed to by struct B objects.  Finally, this routine also fixes
 *  all of the pointers in device memory to point to the accelerator
 *  objects rather than the host objects.
 */
struct A* struct_A_copyin(struct A*z, const int n) {
  int ia;
  int ib;

  /* Level 1: copy in the array of struct A objects, z[0:n]. */
  struct A *dev_top = (struct A*)acc_copyin( z, n*sizeof(struct A) );

  for (ia = 0 ; ia < n ; ia++) {
      struct B *host_b = z[ia].x;
      const int count_b = z[ia].n;

      /* Level 2: copy in the struct B array owned by z[ia] ... */
      struct B *dev_b = (struct B*)acc_copyin( host_b, count_b*sizeof(struct B) );

      /* ... and make the accelerator copy of z[ia].x point at it. */
      SET_ACC_PTR(dev_top[ia].x, dev_b);

      for (ib = 0 ; ib < count_b ; ib++) {
             /* Level 3: copy in the double array owned by z[ia].x[ib] ... */
             double *dev_y = (double*)acc_copyin( host_b[ib].y, host_b[ib].n*sizeof(double) );

             /* ... and make the accelerator copy of its y member point at it. */
             SET_ACC_PTR(dev_b[ib].y, dev_y);
      }
  }

  /* The caller receives the pointer returned by the top-level copyin. */
  return dev_top;
}

/*  This routine performs a deep copy from acc to host, updating a
 *  host disjoint data structure to match its corresponding acc
 *  disjoint data structure.  The host objects are overwritten
 *  entirely with acc objects, destroying the host pointers.  However,
 *  then we look up the acc pointers with the acc_hostptr library
 *  function to find the corresponding host pointers, allowing us to
 *  fix up the pointers.
 */
void struct_A_update_host(struct A*z, const int n) {

  /* update z[0:n] */
  acc_update_self( (void*)z, n*sizeof(struct A) );
  for (int i = 0 ; i < n ; i++) {
      /* fix acc pointer acc_z[i].x */
      z[i].x = (struct B*)(acc_hostptr((void*)z[i].x));
      /* update z[i].x[0:z[i].n] */
      acc_update_self( (void*)z[i].x, z[i].n*sizeof(struct B) );
      for (int j = 0 ; j < z[i].n ; j++) {
             /* fix acc pointer acc_x[j].y */
             z[i].x[j].y = (double *)(acc_hostptr((void*)z[i].x[j].y));
             /* update z[i].x[j].y[0:z[i].x[j].n] */
             acc_update_self( (void*)z[i].x[j].y, z[i].x[j].n*sizeof(double) );
      }
  }
}

/*  This routine performs a deep copy from host to accelerator,
 *  updating an accelerator disjoint data structure to match its
 *  corresponding host disjoint data structure.  The accelerator
 *  objects are overwritten entirely with host objects, destroying the
 *  accelerator pointers.  However, then we look up the host pointers
 *  with the acc_deviceptr library function to find the
 *  corresponding accelerator pointers, allowing us to fix up the
 *  pointers.
 */
void struct_A_update_device(struct A*z, const int n) {
  int ia;
  int ib;
  struct A *dev_top;

  /* Push z[0:n] to the accelerator; its x members now hold host
     addresses and have to be repaired below. */
  acc_update_device( (void*)z, n*sizeof(struct A) );
  dev_top = (struct A*)acc_deviceptr((void*)z);

  for (ia = 0 ; ia < n ; ia++) {
      struct B *host_b = z[ia].x;
      struct B *dev_b;

      /* Push z[ia].x[0:z[ia].n], then look up where it lives on the
         accelerator. */
      acc_update_device( (void*)host_b, z[ia].n*sizeof(struct B) );
      dev_b = (struct B*)acc_deviceptr( (void*)host_b );

      /* Repair the accelerator copy of z[ia].x. */
      SET_ACC_PTR(dev_top[ia].x, dev_b);

      for (ib = 0 ; ib < z[ia].n ; ib++) {
             double *dev_y;

             /* Push the doubles, then look up their accelerator address. */
             acc_update_device( (void*)host_b[ib].y, host_b[ib].n*sizeof(double) );
             dev_y = (double*)acc_deviceptr( (void*)host_b[ib].y );

             /* Repair the accelerator copy of z[ia].x[ib].y. */
             SET_ACC_PTR(dev_b[ib].y, dev_y);
      }
  }
}


/*  This routine performs a deep free on the disjoint data structure,
 *  freeing all of the double arrays, struct B objects, and struct A
 *  objects.
 */
void struct_A_free(struct A*z, const int n) {
  for (int i = 0 ; i < n ; i++) {
      for (int j = 0 ; j < z[i].n ; j++) {
             acc_delete( (void*)z[i].x[j].y, z[i].x[j].n*sizeof(double) );
      }
      acc_delete( (void*)z[i].x, z[i].n*sizeof(struct B) );
  }
  acc_delete( (void*)z, n*sizeof(struct A) );
}

/*  This function iterates over a disjoint data structure and computes
 *  a sum reduction over the elements in the double arrays.  This
 *  function executes on the host.
 */
double host_sum(const struct A*z, const int n) {
  double sum = 0.0;
  for (int i = 0 ; i < n ; i++) {
      for (int j = 0 ; j < z[i].n ; j++) {
             for (int k = 0 ; k < z[i].x[j].n ; k++) {
               sum += z[i].x[j].y[k];
             }
      }
  }
  return sum;
}

/*  This function iterates over a disjoint data structure and computes
 *  a sum reduction over the elements in the double arrays.  This
 *  function executes on the accelerator, so it expects that the
 *  entire data structure has already been copied to the accelerator
 *  prior to invocation.
 */
double acc_sum(const struct A*z, const int n) {
  /* Accumulator for the sum reduction; returned to the caller. */
  double sum = 0.0;
  /* The disjoint data structure is expected to be present */
  /* Only z[0:n] is named in the present clause; the nested x and y arrays
     are reached through the pointers stored in the accelerator copy. */
#pragma acc parallel loop reduction(+:sum) present(z[0:n])
  for (int i = 0 ; i < n ; i++) {
      /* The inner loops carry no loop directive of their own. */
      for (int j = 0 ; j < z[i].n ; j++) {
             for (int k = 0 ; k < z[i].x[j].n ; k++) {
               sum += z[i].x[j].y[k];
             }
      }
  }
  return sum;
}

/*  Zero out all double elements in the host copy of the disjoint data
 *  structure.
 */
void host_zero(struct A*z, const int n) {
  for (int i = 0 ; i < n ; i++) {
      for (int j = 0 ; j < z[i].n ; j++) {
             for (int k = 0 ; k < z[i].x[j].n ; k++) {
               z[i].x[j].y[k] = 0.0;
             }
      }
  }
}

/*  Zero out all double elements in the accelerator copy of the
 *  disjoint data structure.
 */
void acc_zero(struct A*z, const int n) {
  /* The disjoint data structure is expected to be present */
  /* Only z[0:n] is named in the present clause; the nested x and y arrays
     are reached through the pointers stored in the accelerator copy. */
#pragma acc parallel loop present(z[0:n])
  for (int i = 0 ; i < n ; i++) {
      /* The inner loops carry no loop directive of their own. */
      for (int j = 0 ; j < z[i].n ; j++) {
             for (int k = 0 ; k < z[i].x[j].n ; k++) {
               z[i].x[j].y[k] = 0.0;
             }
      }
  }
}


/* Driver for Example 3: builds an n x n x n disjoint data structure filled
 * with 1..count, deep-copies it to the accelerator, and then alternately
 * zeroes one side and restores it from the other, printing the host and
 * accelerator sums after each step.  Returns 0. */
int main() {
  int n = 4;
  int count = 0;

  /* Allocate and initialize a disjoint data structure */
  struct A *z;
  /* The top-level is an array of struct A objects */
  z = (struct A*)calloc(n, sizeof(struct A));
  for (int i = 0 ; i < n ; i++) {
      z[i].n = n;
      /* Each struct A object points to an array of struct B objects */
      z[i].x = (struct B*)calloc(n, sizeof(struct B));
      for (int j = 0 ; j < z[i].n ; j++) {
             z[i].x[j].n = n;
             /* Each struct B object points to an array of doubles */
             z[i].x[j].y = (double *)calloc(n, sizeof(double));
             for (int k = 0 ; k < z[i].x[j].n ; k++) {
               z[i].x[j].y[k] = ++count;
             }
      }
  }

  /* Compute and print expected sum: 1 + 2 + ... + count */
  printf("expected = %g\n", (double)count * ((double)count + 1.0) / 2.0);

  /* Compute and print host sum */
  printf("host_sum = %g\n", host_sum(z, n));

  /* Deep-copy the disjoint data structure to the accelerator */
  printf("deep copy from host to acc\n");
  struct_A_copyin(z, n);

  /* Compute and print accelerator sum */
  printf("acc_sum = %g\n", acc_sum(z, n));

  /* Zero out host data */
  printf("zero host data\n");
  host_zero(z, n);

  /* Recompute and print host sum */
  printf("host_sum = %g\n", host_sum(z, n));

  /* Restore host data by copying from accelerator */
  printf("deep copy from acc to host\n");
  struct_A_update_host(z, n);

  /* Recompute and print host sum */
  printf("host_sum = %g\n", host_sum(z, n));
  /* Zero out acc data */
  printf("zero acc data\n");
  acc_zero(z, n);

  /* Recompute and print acc sum */
  printf("acc_sum = %g\n", acc_sum(z, n));

  /* Restore acc data by copying from host */
  printf("deep copy from host to acc\n");
  struct_A_update_device(z, n);

  /* Recompute and print acc sum */
  printf("acc_sum = %g\n", acc_sum(z, n));

  /* Free accelerator memory */
  printf("deep free\n");
  struct_A_free(z, n);

  /* Free host memory.  The inner loop must be bounded by j: testing i
     here would never terminate and would index past the end of z[i].x. */
  for (int i = 0 ; i < n ; i++) {
      for (int j = 0 ; j < z[i].n ; j++) {
             z[i].x[j].n = 0;
             free(z[i].x[j].y);
      }
      z[i].n = 0;
      free(z[i].x);
  }
  free(z);

  return 0;
}

Example 4: #pragma acc routine

/*
 * This example shows how to use the "#pragma acc routine" directive
 * to compile entire functions for a device and make function calls in
 * compute regions.  Each acc routine specifies the type of
 * worksharing used in that routine.
 */

#include <stdio.h>
#include <stdlib.h>

/* Sequential device routine: returns its argument plus one. */
#pragma acc routine seq
float func4(float x) {
    const float incremented = x + 1;
    return incremented;
}

/* Vector-level device routine: replaces each of the N elements of x with
 * func4 of that element. */
#pragma acc routine vector
void func3(int N, float *x) {
#pragma acc loop vector
    for (int col = 0 ; col < N ; ++col) {
        x[col] = func4(x[col]);
    }
}

/* Worker-level device routine: applies func3 to each of the N rows
 * pointed to by x. */
#pragma acc routine worker
void func2(int N, float **x) {
#pragma acc loop worker
     for (int row = 0 ; row < N ; ++row) {
         func3(N, x[row]);
     }
}

/* Gang-level device routine: applies func2 to each of the N planes
 * pointed to by x. */
#pragma acc routine gang
void func1(int N, float ***x) {
#pragma acc loop gang
     for (int plane = 0 ; plane < N ; ++plane) {
         func2(N, x[plane]);
     }
}

/* Driver for Example 4: builds an N x N x N pointer-to-pointer array of
 * zeros, increments every element through the nested acc routines, and
 * checks on the host that the elements sum to N*N*N.
 * Returns 0 on PASS, 1 on FAIL or allocation failure. */
int main() {
    const int N = 8;

    /* Each level is sized from the type it actually stores: plane
       pointers, row pointers, then float elements. */
    float ***x = (float***)calloc(N, sizeof(*x));
    if (x == NULL) {
        fprintf(stderr, "allocation failed\n");
        return 1;
    }
    for (int i = 0 ; i < N ; i++) {
        x[i] = (float**)calloc(N, sizeof(**x));
        if (x[i] == NULL) {
            /* Exiting the process releases the earlier allocations. */
            fprintf(stderr, "allocation failed\n");
            return 1;
        }
        for (int j = 0 ; j < N ; j++) {
            x[i][j] = (float*)calloc(N, sizeof(***x));
            if (x[i][j] == NULL) {
                fprintf(stderr, "allocation failed\n");
                return 1;
            }
        }
    }

#pragma acc parallel copy(x[0:N][0:N][0:N])
    {
        func1(N, x);
    }

    int sum = 0;
    for (int i = 0 ; i < N ; i++)
        for (int j = 0 ; j < N ; j++)
            for (int k = 0 ; k < N ; k++)
                sum += abs((int)x[i][j][k]);

    printf("result = %d\n", sum);

    const int status = (sum != N*N*N) ? 1 : 0;
    printf(status ? "FAIL\n" : "PASS\n");

    for (int i = 0 ; i < N ; i++) {
        for (int j = 0 ; j < N ; j++) {
            free(x[i][j]);
        }
        free(x[i]);
    }
    free(x);

    return status;
}

Example 5: wait for dependencies

/*
 *  Using wait to handle complex accelerator region
 *  dependencies.
 *
 *  This example shows how asynchronous waits can be used
 *  to execute a region asynchronously that is dependent
 *  on calculations done on several async queues without
 *  requiring a wait on the host.
 */

#include <stdio.h>

/* Number of elements in each of the arrays a..e. */
#define SIZE 1000
/* Driver for Example 5: fills a, b and c on three async queues, combines
 * them into d on queue 1 after waiting on queues 2 and 3, starts copying d
 * back, computes e on the host in the meantime, and prints d[10] and e[10]
 * once queue 1 has drained.  Returns 0. */
int main( )
{
  long a[SIZE];
  long b[SIZE];
  long c[SIZE];
  long d[SIZE];
  long e[SIZE];
  int i;

  /* a, b, c and d are created on the device only; nothing is copied at
     entry or exit of the data region.  e lives on the host only. */
#pragma acc data create(a,b,c,d)
  {
    /*
     *  Calculate a, b, c on different async queues so that
     *  the calculations can be done in parallel.
     */

#pragma acc parallel loop async(1)
    for (i=0;i<SIZE;++i) {
      a[i] = i;
    }
#pragma acc parallel loop async(2)
    for (i=0;i<SIZE;++i) {
      b[i] = i*2;
    }
#pragma acc parallel loop async(3)
    for (i=0;i<SIZE;++i) {
      c[i] = i*3;
    }

    /*
     *  Calculate d from a,b,c making sure to wait for b and
     *  c to complete.  The calculation of a is guaranteed to
     *  complete before this can execute since it uses the
     *  same async id.  Note since this is done with a valid
     *  async id, the parallel region can be enqueued but the
     *  host is not required to wait at any point yet.
     */
#pragma acc parallel loop async(1) wait(2,3)
    for (i=0;i<SIZE;++i) {
      d[i] = a[i]+b[i]+c[i];
    }

    /*
     *  Start transferring d back to the host, but don't wait for the
     *  transfer to complete.  It is queued on async id 1, behind the
     *  calculation of d.
     */
#pragma acc update self(d) async(1)

    /*
     *  Perform some expensive calculation on the host.  Since all the previous
     *  parallel and updates are asynchronous this can be done in parallel with
     *  all the previous accelerator operations.
     */
    for (i=0;i<SIZE;++i) {
      e[i] = i*i;
    }

    /*
     *  Make sure the update (and all previous accelerator operations) have completed
     *  before doing anything else, including ending the data region.
     */
#pragma acc wait(1)

  }

  printf( "d[10]=%ld e[10]=%ld\n",d[10],e[10]);

  return 0;
}

Example 6: create identifiers for dependent operations

/*
 *  Using wait to create distinct identifiers for
 *  a series of dependent operations.
 *
 *  This example shows how asynchronous waits can be used
 *  to execute a series of steps asynchronously that are dependent
 *  on the previous step but with a unique async identifier for
 *  each step.  This allows waiting for a specific step in the
 *  process without waiting for the whole process.
 */

#include <stdio.h>

/* Number of elements in each of the arrays a, b and c. */
#define SIZE 1000
/* Number of pipeline iterations executed by main(). */
#define STEPS 10

/* Driver for Example 6: runs STEPS iterations of a four-stage pipeline
 * (host fill of a, update of the device copy of a, device computation of
 * b from a, device accumulation of b into c), giving stages 2, 3 and 4
 * their own async ids 1, 2 and 3, and prints c[10] at the end.
 * Returns 0. */
int main( )
{
  long a[SIZE];
  long b[SIZE];
  long c[SIZE];
  int i,step;

  for (i=0;i<SIZE;++i) {
    c[i]= 0;
  }
  /* a and b are created on the device without any copy at region entry or
     exit; c is copied in at entry and back out at exit. */
#pragma acc data create(a,b) copy(c)
  {
    /*
     *  Pipeline a series of dependent asynchronous operations on the device
     *
     *  time 0:  step 0 stage 1
     *  time 1:  step 0 stage 2
     *  time 2:  step 0 stage 3 step 1 stage 1
     *  time 3:  step 0 stage 4 step 1 stage 2
     *  time 4:                 step 1 stage 3  step 2 stage 1
     *  time 5:                 step 1 stage 4  step 2 stage 2
     *
     */
    for (step=0;step<STEPS;++step) {

      /*
       *  Must wait for stage 2 of the previous iteration to finish before
       *  we start modifying a on the host.
       */
#pragma acc wait(1)
      /*
       *  Stage 1:  some operation on the host
       *  Can overlap with stage 3 and 4 of previous iteration
       *
       *  NOTE(review): a[i] equals step*step times i factorial, so for
       *  step >= 1 it exceeds the range of long well before i reaches
       *  SIZE-1, and signed overflow is undefined in C.  Confirm that the
       *  values are meant to be illustrative only.
       */
      a[0] = step*step;
      for (i=1;i<SIZE;++i) {
     a[i] = i*a[i-1];
      }

      /*
       *  Must wait for stage 3 of the previous iteration to finish before
       *  we start modifying a on the device.  Note this wait could
       *  be added to stage 2 directly.
       */
#pragma acc wait(2)
      /*
       *  Stage 2: update the device copy of a
       *  Can overlap with stage 4 of the previous iteration
       */
#pragma acc update device(a) async(1)


      /*
       *  Must wait for stage 4 of the previous iteration to finish before
       *  we can start modifying b on the device.  Note this wait could
       *  be added to stage 3 directly.
       */
#pragma acc wait(3)

      /*
       *  Stage 3: use a on the device
       *  Queued on async id 2 and ordered after stage 2 by wait(1).
       */
#pragma acc parallel loop async(2) wait(1)
      for (i=0;i<SIZE;++i) {
     b[i] = a[i]-step;
      }

      /*
       *  Stage 4: a calculation that doesn't use a
       *  Queued on async id 3 and ordered after stage 3 by wait(2).
       */
#pragma acc parallel loop async(3) wait(2)
      for (i=0;i<SIZE;++i) {
     c[i] += b[i];
      }
    }

    /*
     *  Wait for c to be calculated before exiting the data region
     */
#pragma acc wait(3)
  }

  printf( "c[10]=%ld\n",c[10]);

  return 0;
}

Fortran

Example 7: !$acc parallel loop

program test_openacc
        implicit none
        integer, parameter :: M = 1000
        integer :: a(M), b(M), c(M)
        integer :: idx, total, expected

! For simple cases the combined "parallel loop" directive is a shortcut
! for a parallel region that contains a single loop.
! Fill a, b and c.

!$acc parallel loop
        do idx = 1, M
                a(idx) = idx
                b(idx) = idx
                c(idx) = idx
        end do
!$acc end parallel loop

! Overwrite b and copy it back to the host.
!$acc parallel copyout(b)
!$acc loop
        do idx = 1, M
                b(idx) = 2*idx
        end do
!$acc end loop
!$acc end parallel

! Overwrite c and copy it back to the host.
!$acc parallel copyout(c)
!$acc loop
        do idx = 1, M
                c(idx) = -idx
        end do
!$acc end loop
!$acc end parallel

! Build a from b and c, then copy a back to the host.
!$acc parallel copyout(a) copyin(b,c)
!$acc loop
        do idx = 1, M
                a(idx) = b(idx) + c(idx)
        end do
!$acc end loop
!$acc end parallel

! Checksum: add up every element of a with a loop reduction.
        total = 0
!$acc parallel copyin(a)
!$acc loop reduction(+:total)
        do idx = 1, M
                total = total + a(idx)
        end do
!$acc end loop
!$acc end parallel


! a(idx) = 2*idx - idx = idx, so the checksum should be 1 + 2 + ... + M.
        expected = M*(M+1)/2
        print *,"Result: ",total
        print *,"Verified: ",(total==expected)

end program test_openacc

Example 8: !$acc cache directive

!!$ The example applies loop and blocking directives as recommended.
!!$ Cache directive must be in innermost loop.
!!$ Size of cache must be bounded.
!!$ In this example, blocking directives force a 16x64 tile.
!!$
!!$ Compile with option -rmd. Note cache messages appearing in *.lst file.

! Computes, for every interior point, A(i,j,k) = B(i,j,k) minus the sum of
! the four diagonal neighbours of B in plane k divided by 5 (integer
! division, so the quotient is truncated).
!
!   A          - result array, ni x nj x nk; every element is assigned
!   B          - input array with a one-element halo in the first two
!                dimensions
!   ni, nj, nk - extents of A
subroutine example( A, B, ni, nj, nk )
    implicit none
    integer, intent(in)  :: ni, nj, nk
    integer, intent(out) :: A(ni, nj, nk)
    integer, intent(in)  :: B(0:ni+1, 0:nj+1, nk)
    integer :: i, j, k

!$acc parallel copyout(A), copyin(B)

!$acc loop gang
  do k=1,nk

!dir$ blockable( i, j )
!$acc loop worker
!dir$ blockingsize ( 16 )
  do j=1,nj
!$acc loop vector
!dir$ blockingsize ( 64 )
  do i=1,ni
! The cache directive names the element of A that is written and the
! 3x3 neighbourhood of B that is read in this iteration.
!$acc cache( A(i,j,k), B(i-1:i+1,j-1:j+1,k) )

         A(i,j,k) = B(i,j,k) - &
                       ( B(i-1, j-1, k) &
                       + B(i-1, j+1, k) &
                       + B(i+1, j-1, k) &
                       + B(i+1, j+1, k) ) / 5
  end do ! i
  end do ! j

  end do ! k

!$acc end parallel

end subroutine example

Example 9: !$acc parallel loop directive

program main
  ! Adds two constant arrays on the accelerator with the combined
  ! "parallel loop" directive and checks the sum of the result on the host.
  implicit none
  integer(8), parameter :: n = 100
  integer(8) :: idx
  real(8) :: A(n), B(n), C(n)
  real(8) :: expected, actual

  A = 0
  B = 1
  C = 2
  expected = sum(B + C)

  ! A is only written, so it is copied out; B and C are only read.
  !$acc parallel loop copyout(A) copyin(B,C)
  do idx = 1, n
     A(idx) = B(idx) + C(idx)
  end do

  actual = sum(A)
  print *, 'expected = ', expected
  print *, 'actual   = ', actual

  if ( actual == expected ) then
     print *, "PASS"
     call exit(0)
  else
     print *, "FAIL"
     call exit(-1)
  end if

end program main

Example 10: using OPENACC_LIB

program main
  ! Calls the OpenACC runtime library from Fortran code.

  ! This example performs the same computation as Example 9,
  ! but the data transfers are done with runtime library
  ! routines instead of data clauses.
use openacc_lib
  implicit none

  integer(8), parameter :: n = 100
  integer(8) :: idx
  real(8) :: A(n), B(n), C(n)
  real(8) :: expected, actual

  A = 0
  B = 1
  C = 2
  expected = sum(B + C)

  ! Allocate A on the device and copy B and C to it.
  call acc_create ( A )
  ! could also use 'call acc_create( A(1), 8*n )'
  call acc_copyin ( B )
  ! could also use 'call acc_copyin( B(1), 8*n )'
  call acc_copyin ( C )
  ! could also use 'call acc_copyin( C(1), 8*n )'

  ! A, B and C are listed as "present" because the calls above
  ! already allocated and copied them.
  !$acc parallel loop present(A,B,C)
  do idx = 1, n
     A(idx) = B(idx) + C(idx)
  end do

  ! Copy the result back and release the device copies.
  call acc_copyout ( A )
  call acc_delete ( B )
  call acc_delete ( C )


  actual = sum(A)
  print *, 'expected = ', expected
  print *, 'actual   = ', actual

  if ( actual == expected ) then
     print *, "PASS"
     call exit(0)
  else
     print *, "FAIL"
     call exit(-1)
  end if

end program main

Example 11: !$acc routine directive

! This example shows how to use the "!$acc routine" directive to
! compile entire functions for a device and make function calls in
! compute regions.  Each acc routine specifies the type of worksharing
! used in that routine.

! Sequential device routine: adds one to x in place.
subroutine func4(x)
  implicit none
  !$acc routine seq
  real, intent(inout) :: x
  x = x + 1
end subroutine func4

! Vector-level device routine: calls func4 on each of the N elements of x.
subroutine func3(N, x)
  implicit none
  !$acc routine vector
  !$acc routine(func4) seq
  ! N is declared before x because it appears in the bounds of x.
  integer, intent(in) :: N
  real, intent(inout) :: x(N)
  integer :: i
  !$acc loop vector
  do i=1,N
     call func4(x(i))
  end do
end subroutine func3

! Worker-level device routine: calls func3 on each of the N columns of x.
subroutine func2(N, x)
  implicit none
  !$acc routine worker
  !$acc routine(func3) vector
  ! N is declared before x because it appears in the bounds of x.
  integer, intent(in) :: N
  real, intent(inout) :: x(N,N)
  integer :: i
  !$acc loop worker
  do i=1,N
     call func3(N, x(:,i))
  end do
end subroutine func2

! Gang-level device routine: calls func2 on each of the N planes of x.
subroutine func1(N, x)
  implicit none
  !$acc routine gang
  ! N is declared before x because it appears in the bounds of x.
  integer, intent(in) :: N
  real, intent(inout) :: x(N,N,N)
  integer :: i
  !$acc routine(func2) worker
  !$acc loop gang
  do i=1,N
     call func2(N, x(:,:,i))
  end do
end subroutine func1

program main
  ! Zeroes an N x N x N array, runs func1 on it inside a parallel region,
  ! and checks that the absolute values of the elements sum to N*N*N.
  implicit none
  !$acc routine(func1) gang
  integer, parameter :: N = 8
  real :: x(N,N,N)
  real :: checksum

  x = 0

  !$acc parallel copy(x)
  call func1(N, x)
  !$acc end parallel

  checksum = sum(abs(x))
  print *, 'result =', checksum

  if ( checksum == N*N*N ) then
     print *, 'PASS'
     call exit(0)
  else
     print *, 'FAIL'
     call exit(1)
  end if

end program main