GENWiki

Premier IT Outsourcing and Support Services within the UK

User Tools

Site Tools


archive:computers:bauer.lst

_OPTIMIZING IN A PARALLEL ENVIRONMENT_ by Barr E. Bauer

[LISTING ONE]

    program test                                                      1
*                                                                     2
* purpose is to test SGI parallelization scheme for loop selection,   3
* numerically-intensive calculations, and total reduction. See text   4
* for details.                                                        5
*                                                                     6
    parameter (MAXFIRST=250, MAXSECOND=250, MAXTHIRD=10)              7
    real*8 a(MAXTHIRD,MAXSECOND,MAXFIRST)                             8
    real*8 b(MAXTHIRD,MAXSECOND,MAXFIRST)                             9
    real*8 sub_total(MAXFIRST), partial_total(4) ! max 4 threads      10
    real*8 d(MAXTHIRD), c, tmp    ! local variables                   11
    real*8 dist(MAXSECOND,MAXFIRST)                                   12
    real*8 grand_total      ! test for proper operation               13
    logical parallel        ! selects 2-version loops                 14
    integer*4 iflag         ! used to show LASTLOCAL value            15
                                                                      16
    data parallel /.false./                                           17
    data sub_total, iflag /MAXFIRST*0.0, 0/                           18
*                                                                     19
* outer loop: contains both interior loops                            20
*                                                                     21
                                                                      22
* C$doacross local(k,j,i,tmp,d,c), share(a,b,sub_total,dist),         23
* C$& lastlocal(iflag)                                                24
                                                                      25
    do i = 1, MAXFIRST                                                26
*                                                                     27
* first inner loop: fills arrays a and b                              28
*                                                                     29
                                                                      30
* C$doacross local(j,k,c), share(i,a,b)                               31
                                                                      32
      do j = 1, MAXSECOND                                             33
        do k = 1, MAXTHIRD                                            34
          a(k,j,i) = dsqrt(dfloat(i*j*k))                             35
          c = 1.0 - a(k,j,i)                                          36
          if (c .le. 0.0 .and. i .lt. j*k) then                       37
            c = -c                                                    38
          else                                                        39
            c = c**2                                                  40
          endif                                                       41
          b(k,j,i) = 32*(dcos(c)**5)*dsin(c)-                         42
   1                 32*(dcos(c)**3)*dsin(c)+                         43
   2                  6*dcos(c)*dsin(c)                               44
        enddo                                                         45
      enddo                                                           46
*                                                                     47
* second inner loop: determines distance and starts summation         48
*                                                                     49
                                                                      50
* c$doacross local(j,k,d,tmp), share(i,a,b,dist,sub_total),           51
* c$& lastlocal(iflag)                                                52
                                                                      53
      do j=1, MAXSECOND                                               54
        tmp = 0.0                                                     55
        do k = 1, MAXTHIRD                                            56
          d(k) = a(k,j,i) - b(k,j,i)                                  57
        enddo                                                         58
        do k = 1, MAXTHIRD                                            59
          tmp = tmp + d(k)**2                                         60
        enddo                                                         61
        dist(j,i) = dsqrt(tmp)                                        62
        if (dist(j,i) .le. 0.1) iflag = iflag + 1                     63
        sub_total(j) = sub_total(j) + dist(j,i)                       64
      enddo                                                           65
    enddo                                                             66
*                                                                     67
* the next section is an example of sum reduction optimized to the    68
* parallel environment and the use of a more efficient 2 loop summation 69
*                                                                     70
* if -mp option is active, parallel is set to .true. which then       71
* selects the parallel version                                        72
*                                                                     73
                                                                      74
C$  parallel = .true.                                                 75
    grand_total = 0.0                                                 76
    if (parallel) then                     ! parallel version         77
C$    num_threads = mp_numthreads()                                   78
      ichunk = (MAXFIRST + (num_threads - 1))/num_threads             79
                                                                      80
C$doacross local(k,j),                                                81
C$& share(num_threads,partial_total,sub_total,ichunk)                 82
                                                                      83
      do k = 1, num_threads ! this loop is parallelized               84
        partial_total(k) = 0.0                                        85
        do j = k*ichunk - ichunk + 1, min(k*ichunk,MAXFIRST)          86
          partial_total(k) = partial_total(k) + sub_total(j)          87
        enddo                                                         88
      enddo                                                           89
      do j = 1, num_threads   ! smaller loop handled as scalar        90
        grand_total = grand_total + partial_total(j)                  91
      enddo                                                           92
    else                                   ! the scalar version       93
      do j = 1, MAXFIRST                                              94
        grand_total = grand_total + sub_total(j)                      95
      enddo                                                           96
    endif                                                             97
                                                                      98
    if (parallel) then                                                99
C$    write (*,10) grand_total, num_threads                           100
C$    write (*,20) iflag                                              101
    else                                                              102
      write (*,30) grand_total                                        103
      write (*,40) iflag                                              104
    endif                                                             105
    stop                                                              106
C$10 format(1x,'grand total = ',g10.3,'threads = ',i4)                107
C$20 format(1x,'parallel iflag = ',i10)                               108
30  format(1x,'grand total = ',g10.3)                                 109
40  format(1x,'scalar iflag = ',i10)                                  110
    end                                                               111

[LISTING TWO]

(source code)

    subroutine example(a, b, c, n)   ! c(i) = (a(i)*b(i))**2, i = 1..n
    integer*4 n                      ! number of elements in a, b and c
    real*4 a(n), b(n), c(n)
    (additional code)
c$doacross local(i, x)
    do i=1, n
      x = a(i) * b(i)                ! index with the loop variable i, not n
      c(i) = x**2
    enddo
    (additional code)
    return
    end

(the loop is transformed to)

    subroutine _example_1(
   1  _local_start,             ! index starting value
   2  _local_ntrip,             ! number of loop executions
   3  _incr,                    ! index increment
   4  _my_threadno)             ! unique process ID number
    integer*4 _local_start, _local_ntrip, _incr, _my_threadno
    integer*4  i                ! declared local
    real*4     x                ! declared local
    integer*4  _tmp             ! created local
    i = _local_start
    do _tmp = 1, _local_ntrip   ! counts trips; i is advanced separately
      x = a(i) * b(i)
      c(i) = x**2
      i = i + _incr             ! step the index by the given increment
    enddo
    return
    end

Example 1: A typical DO loop

do i = 1, n

 a(i) = x * b(i)   ! reads only b(i) and x; writes only a(i)

enddo

Example 2: A DO loop in which the array variable references a value that is not current with the index

do i = 2, n

 arr(i) = b(i) - arr(i-1)   ! reads arr(i-1), which the previous iteration (if any) assigned

enddo

Example 3: An example of load imbalance

do i = 1, n

        do j = 1, i   ! inner trip count equals i, so it grows with each outer iteration
             a(j, i) = a(j, i) * xmult
        enddo
    enddo

Example 4: Load balancing

    num_threads = mp_numthreads()

c$doacross local(i, j, k)

    do k = 1, num_threads   ! one outer iteration for each value 1..num_threads
        do i = k, n, num_threads   ! i = k, k+num_threads, ... up to n
             do j = 1, i   ! inner trip count equals i
                  a(j, i) = a(j, i) * xmult
             enddo
        enddo
    enddo

/home/gen.uk/domains/wiki.gen.uk/public_html/data/pages/archive/computers/bauer.lst.txt · Last modified: 2001/11/08 10:19 by 127.0.0.1

Donate Powered by PHP Valid HTML5 Valid CSS Driven by DokuWiki