! Michael MacMahon - Mech. Aerospace Eng, Syracuse University.
! mmcmahon@npac.syr.edu
! Saleh Elmohamed - NPAC, Syracuse University.
! saleh@npac.syr.edu                          Summer 1997.
!-------------------------------------------------------------
! This program uses HPF/Fortran 90 to simulate two-dimensional
! inviscid flow through a nozzle.  The numerical scheme is a finite
! volume cell centered scheme employing a fourth stage
! Runge-Kutta time stepping algorithm.  Artificial viscosity
! terms are added to stabilize the scheme in areas of large
! gradients, such as shock waves.  The program reads in a mesh
! from file ft03.dat and computes needed Delta x and Delta y values.
! The initial conditions are set up as simply uniform flow everywhere.
! The inlet and outlet boundary conditions are determined from
! the information provided by the user, the inlet mach number, back
! pressure and output condition (super  or subsonic).  
!
! The structure of the code consists of calls to the flux,
! artificial dissipation, and boundary condition routines
! for each stage of the Runge-Kutta algorithm.  
! Data is output for contour and line plots as needed.
!
! The code makes heavy use of FORALL statements, which
! are more succint than repeated DO loops, guarentee no
! dependencies and make conversion to HPF easier. 
! These do require more mememory than the traditional
! DO loops, but since this code was only tried on eight processors,
! effective use of the cache was not crucial.

! The solution vector V  consists of:
!	V(1,i,j)=density at point i,j
!	V(2,i,j)=density*(X component of Velocity) at point i,j
!	V(3,i,j)=density*(Y component of Velocity) at point i,
!	V(4,i,j)=internal energy at point i,j

      Program nozzle
      Implicit none
      Integer, Parameter :: lmax=4
      Integer, Parameter :: nx=256
      Integer, Parameter :: ny=64
      Real V(lmax,nx+2,ny+2)
      Real xloc(nx+1,ny+2),yloc(nx+1,ny+2)
      Real dnax(nx+1,ny+2),dnay(nx+1,ny+2)
      Real dnbx(nx+1,ny+2),dnby(nx+1,ny+2)
	  Real min,start,end
      Integer mx,my

! The following are the HPF statements.  PROCS is the one-
! dimensional abstract processor array.  It's size
! (the number of processors) is specified at run time.
! The solution vector V is distributed along its third
! dimension, while the first two dimensions are collapsed (notice the
! * in the line !HPF$ DISTRIBUTE  V (*,*,BLOCK) onto PROC) . 
! The second dimension of the other arrays
! corresponds to the third dimension of V,  and as such is 
! aligned with the third dimension of V. This alignment
! guarentees no unnescessary communication in statements
! involving both V and the other variables, of which there
! are many.

!HPF$ PROCESSORS PROC(NUMBER_OF_PROCESSORS())
!HPF$ DISTRIBUTE  V (*,*,BLOCK) onto PROC
!HPF$ ALIGN (*,:) with V(*,*,:) :: xloc,yloc,dnax,dnay,dnbx,dnby
      

      mx=nx+1
      my=ny+1
      
      Call Mesh(V,xloc,yloc,dnax,dnay,dnbx,dnby,lmax,mx,my)
      Call Init_cons(V,lmax,min,mx,my)
      Call James(V,xloc,yloc,dnax,dnay,dnbx,dnby,
     $ min,lmax,mx,my)

       End program

	
        subroutine timer(return_time, initial_time)
         real, intent(in)  :: initial_time
        real,  intent(out) :: return_time
        integer finish,rate
        CALL system_clock( COUNT=finish,COUNT_RATE=rate)
        return_time = FLOAT(finish) / FLOAT(rate) - initial_time
	end subroutine

! This routine reads in the x and y values of the mesh.
! The Delta x and Delta y values for the cell sides are computed
! and stored in dnax,dnay.


      Subroutine  Mesh(V,xloc,yloc,dnax,dnay,dnbx,dnby,lmax,mx,my)
      Implicit none
      Integer mx,my,lmax
      Real V(lmax,mx+1,my+1)
      Real xloc(mx,my+1),yloc(mx,my+1)
      Real dnax(mx,my+1),dnay(mx,my+1)
      Real dnbx(mx,my+1),dnby(mx,my+1)
      Integer i,j

! Notice the * in front of (*,*,:) and PROC in the 
! distribute directive.  This tells the compiler that
! any incoming distribution V had is okay, and thus
! no remapping occurs.  Similarly, the align statement
! shows *V(*,*,:) indicating incoming alignment is fine.
! If the subroutine is called many times and the programmer
! does not use the * notation, the compiler may
! remap the arrays every time, resulting in poor performance

! You will see this type of distribution in all
! the subroutines from now on.


!HPF$ PROCESSORS PROC(NUMBER_OF_PROCESSORS())
!HPF$ DISTRIBUTE V *(*,*,BLOCK) onto *PROC
!HPF$ ALIGN (*,:) with *V(*,*,:) :: xloc,yloc,dnax,dnay,dnbx,dnby

      open(unit=3,file='ft03.dat',status='old')
		dnax=0.0
	    dnay=0.0
		dnbx=0.0
		dnby=0.0
                xloc=0.0
                yloc=0.0
      Do i=1,mx
         Do j=1,my
            read(3,*) xloc(i,j),yloc(i,j)
         End Do
      End Do
!____________DEBUGGER MESH___________________________________
!	Forall(i=1:mx,j=1:my) xloc(i,j)=0.0133*real(i-1)  
!	Forall(i=1:mx,j=1:my) yloc(i,j)=0.015*real(j-1) 
!-----------------------------------------------------------
      Forall(i=2:mx,j=2:my) dnax(i,j)=yloc(i,j-1)-yloc(i-1,j-1)
      Forall(i=2:mx,j=2:my) dnay(i,j)=-1.0*(xloc(i,j-1)
     $ -xloc(i-1,j-1))

      Forall(i=2:mx,j=2:my) dnbx(i,j)=yloc(i,j)-yloc(i,j-1)
      Forall(i=2:mx,j=2:my) dnby(i,j)=-1.0*(xloc(i,j)
     $ -xloc(i,j-1))

         End subroutine

      
! Just set up the initial conditions of the nozzle,
! such as the density, pressure, velocity and internal energy.
!

      Subroutine Init_cons(V,lmax,min,mx,my)
      Implicit none
      Integer lmax,mx,my
      Real V(lmax,mx+1,my+1)
      Real p(mx+1,my+1)
	  Real min
      Integer l,i,j

! Again, distributing  V *(*,*,BLOCK) insures 
! incoming mapping is to be left alone.
! The third dimension of P is aligned with
! the third dimension of V.

!HPF$ PROCESSORS PROC(NUMBER_OF_PROCESSORS())
!HPF$ DISTRIBUTE  V *(*,*,BLOCK) onto *PROC
!HPF$ ALIGN (*,:) with V(*,*,:) :: p
       write(6,*) 'Enter inlet mach no.'
      read(5,*) min

      Forall(j=2:my) P(1,j)=1.0/(min**2/2.0 *(1.4-1.0)
     $  +1.0)**(1.4/0.4)
		write(6,*) 'Inlet pressure is:',p(1,2)
       Forall(j=2:my) V(1,1,j)=P(1,j)*(1.0
     $  + (1.4-1.0)/2.0*min**2)

         Forall(j=2:my) V(2,1,j)=V(1,1,j)*min
     $ *sqrt(1.4*p(1,j)/V(1,1,j))

          Forall(j=2:my) V(3,1,j)=0.0
       Forall(j=2:my)    V(4,1,j)=p(1,j)/(0.4) + 0.5*V(1,1,j)
     $ *(V(2,1,j)**2 / V(1,1,j)**2)

       Forall(l=1:lmax,i=2:mx+1,j=2:my) V(l,i,j)=V(l,1,j)


       End subroutine
       

! This is the four stage Runge-Kutta algorithm, which
! solves the necessary differential equations.  RES
! is the time derivative of the solution vector, and
! when it approaches zero out solution is converged.
! I have hardwired the appropriate number of iterations
! required into the code.  See note above the
! loop for more information.


      Subroutine James(V,xloc,yloc,dnax,dnay,dnbx,dnby,min,
     $ lmax,mx,my)
      Implicit none
      Integer lmax,mx,my
      Real V(lmax,mx+1,my+1)
      Real  xloc(mx,my+1),yloc(mx,my+1),dnbx(mx,my+1),dnby(mx,my+1)
      Real  dnax(mx,my+1),dnay(mx,my+1)
      Real Fluxab(lmax,mx+1,my+1),Fluxbc(lmax,mx+1,my+1)
      Real Fluxcd(lmax,mx+1,my+1),Fluxda(lmax,mx+1,my+1)
      Real Res(lmax,mx,my+1),h(lmax,mx+1,my+1)
      Real Q(lmax,mx+1,my+1)
      Real p(mx+1,my+1)
      Real avab(lmax,mx+1,my+1),avbc(lmax,mx+1,my+1),
     $ avcd(lmax,mx+1,my+1)
      Real avda(lmax,mx+1,my+1)
      Real dt(mx,my+1)
      Real area(mx,my+1)
      Real pback,cfl,min,start,end
      Integer l,i,j,z
      Integer exit,iter,itermax

! The incoming arrays are not remapped or realigned, a fact
! the * in front of (*,*,BLOCK) and *V(*,*,:) insures.
! The local arrays are then aligned with V so that no 
! extra communication occurs when they interact, which
! they do many times.  See the line where RES is calculated
! to validate this.  The expression for RES is choppy
! because it had to be hacked into small pieces.  Originally
! it was one large clean statement but the compiler 
! complained it contained too many run time calls, ergo
! the present form.

!HPF$ PROCESSORS PROC(NUMBER_OF_PROCESSORS())
!HPF$ DISTRIBUTE  V *(*,*,BLOCK) onto *PROC
!HPF$ ALIGN (*,:) with *V(*,*,:) :: xloc,yloc,dnax,dnay,dnbx,dnby
!HPF$ ALIGN (*,*,:) with V (*,*,:):: Q,Fluxab,Fluxbc,Fluxcd,Fluxda,h,
     $ avab,avbc,avcd,avda,res
!HPF$ ALIGN (*,:) with V (*,*,:) :: p,dt,area


      write(6,*) 'Please enter pback'
      Read(5,*) pback
      write(6,*) 'Please enter CFl number'
      read(5,*) cfl
      write(6,*) 'enter exit condition: 1=subsonic, 2=supersonic'
      read(5,*) exit
      iter=1

	if (exit .eq. 2) then
	  itermax=1350.
	end if

	if (exit .eq. 1) then
		itermax=4000. 
	end if

	call timer(start,0.0)
		res(3,4,5)=0.4

! These array operations set every element of the array to
! the prescribed value.  No communication nessecary.
	avab=0.0
	avbc=0.0
	avcd=0.0
	avda=0.0
	dt=9.9e-3
	  p=pback
	open(unit=8,file='err.dat',status='unknown')
	open(unit=9,file='in.dat',status='unknown')
	
      Forall(i=2:mx,j=2:my) area(i,j)=0.5*abs((xloc(i,j)
     $ -xloc(i-1,j-1))*(yloc(i-1,j)-yloc(i,j-1)) - 
     $ (xloc(i-1,j)-xloc(i,j-1))*(yloc(i,j)-yloc(i-1,j-1)))

!-----------Start looping-------------
!General form of the loop:
!        -Compute time step at each axial location
!        -Compute flux vectors for each "volume"
!        -Compute dissipation vectors
!        -Compute residual
!        -Compute stage of Runge Kutta
!        -Repeat for all 4 stages
!        -Update solution 

! Although not used, the  code is able to add a source term 
!(ie  heat transfer into the nozzle) to the equations using
! the source subroutine and H vector.  

      Do While(iter .le. itermax)
         Call time_step(V,dt,p,xloc,yloc,cfl,lmax,mx,my)
	
         Call Flux(V,Fluxab,Fluxbc,Fluxcd,Fluxda,xloc,yloc,p,lmax,mx,my)
	
	 Call Artvisc(V,avab,avbc,avcd,avda,xloc,yloc,
     $ dnax,dnay,dnbx,dnby,p,lmax,mx,my)
	
!         Call source(V,h,p,yloc,lmax,mx,my)
         Forall(l=1:lmax,i=2:mx,j=2:my)
	 Res(l,i,j)=-1.0/area(i,j)
     $ * (Fluxab(l,i,j)+Fluxbc(l,i,j)+Fluxcd(l,i,j)+Fluxda(l,i,j))
	Res(l,i,j)=Res(l,i,j)  
     $ + 1.0/area(i,j)*(avbc(l,i,j)*sqrt(dnbx(i,j)**2
     $ + dnby(i,j)**2)-avda(l,i,j)*sqrt(dnbx(i-1,j)**2
     $ + dnby(i-1,j)**2))
	Res(l,i,j)=Res(l,i,j) +1.0/area(i,j)*( avcd(l,i,j)
     $ *sqrt(dnax(i,j)**2
     $ + dnay(i,j)**2)-avab(l,i,j)*sqrt(dnax(i,j-1)**2+dnay(i,j-1)
     $ **2)) + h(l,i,j)
	 Q(l,i,j)=V(l,i,j)
     $         + 0.25 *Res(l,i,j)*dt(i,j)
	End forall
         Forall(i=2:mx,j=2:my)
	 p(i,j)=0.4*(Q(4,i,j)-0.5/Q(1,i,j)
     $ *(Q(2,i,j)**2 + Q(3,i,j)**2))
	 End Forall
	
	  Call bound(Q,p,min,pback,lmax,mx,my,exit)
	 	           
         
!          Call time_step(Q,dt,xloc,yloc,lmax,mx,my)
         Call Flux(Q,Fluxab,Fluxbc,Fluxcd,Fluxda,xloc,yloc,p,lmax,mx,my)
         Call Artvisc(Q,avab,avbc,avcd,avda,xloc,yloc,
     $ dnax,dnay,dnbx,dnby,p,lmax,mx,my)

!         Call source(Q,h,p,yloc,lmax,mx,my)
          Forall(l=1:lmax,i=2:mx,j=2:my)

         Res(l,i,j)=-1.0/area(i,j)
     $ * (Fluxab(l,i,j)+Fluxbc(l,i,j)+Fluxcd(l,i,j)+Fluxda(l,i,j))
        Res(l,i,j)=Res(l,i,j) + 
     $ + 1.0/area(i,j)*(avbc(l,i,j)*sqrt(dnbx(i,j)**2
     $ + dnby(i,j)**2)-avda(l,i,j)*sqrt(dnbx(i-1,j)**2
     $ + dnby(i-1,j)**2))
        Res(l,i,j)=Res(l,i,j) +1.0/area(i,j)*( avcd(l,i,j)
     $ *sqrt(dnax(i,j)**2
     $ + dnay(i,j)**2)-avab(l,i,j)*sqrt(dnax(i,j-1)**2+dnay(i,j-1)
     $ **2)) + h(l,i,j)


         Q(l,i,j)=V(l,i,j)
     $   + 1.0/3.0 *Res(l,i,j)*dt(i,j)
	End Forall
         Forall(i=2:mx,j=2:my)
	 p(i,j)=0.4*(Q(4,i,j)-0.5/Q(1,i,j)
     $ *(Q(2,i,j)**2 + Q(3,i,j)**2))
	 End Forall
         Call bound(Q,p,min,pback,lmax,mx,my,exit)

!         Call time_step(Q,dt,xloc,yloc,lmax,mx,my)
         Call Flux(Q,Fluxab,Fluxbc,Fluxcd,Fluxda,xloc,yloc,p,lmax,mx,my)
         Call Artvisc(Q,avab,avbc,avcd,avda,xloc,yloc,
     $ dnax,dnay,dnbx,dnby,p,lmax,mx,my)
!         Call source(Q,h,p,yloc,lmax,mx,my)
         Forall(l=1:lmax,i=2:mx,j=2:my)

 	Res(l,i,j)=-1.0/area(i,j)
     $ * (Fluxab(l,i,j)+Fluxbc(l,i,j)+Fluxcd(l,i,j)+Fluxda(l,i,j))
        Res(l,i,j)=Res(l,i,j) + 
     $ + 1.0/area(i,j)*(avbc(l,i,j)*sqrt(dnbx(i,j)**2
     $ + dnby(i,j)**2)-avda(l,i,j)*sqrt(dnbx(i-1,j)**2
     $ + dnby(i-1,j)**2))
        Res(l,i,j)=Res(l,i,j) +1.0/area(i,j)*( avcd(l,i,j)
     $ *sqrt(dnax(i,j)**2
     $ + dnay(i,j)**2)-avab(l,i,j)*sqrt(dnax(i,j-1)**2+dnay(i,j-1)
     $ **2)) + h(l,i,j)

         Q(l,i,j)=V(l,i,j)
     $         + 1.0/2.0 *Res(l,i,j)*dt(i,j)
	End Forall
         Forall(i=2:mx,j=2:my)
	 p(i,j)=0.4*(Q(4,i,j)-0.5/Q(1,i,j)
     $ *(Q(2,i,j)**2 + Q(3,i,j)**2))
	End Forall
         Call bound(Q,p,min,pback,lmax,mx,my,exit)


!         Call time_step(Q,dt,xloc,yloc,lmax,mx,my)
         Call Flux(Q,Fluxab,Fluxbc,Fluxcd,Fluxda,xloc,yloc,p,lmax,mx,my)
         Call Artvisc(Q,avab,avbc,avcd,avda,xloc,yloc,
     $ dnax,dnay,dnbx,dnby,p,lmax,mx,my)
!         Call source(Q,h,p,yloc,lmax,mx,my)
          Forall(l=1:lmax,i=2:mx,j=2:my)
	  Res(l,i,j)=-1.0/area(i,j)

     $ * (Fluxab(l,i,j)+Fluxbc(l,i,j)+Fluxcd(l,i,j)+Fluxda(l,i,j))
        Res(l,i,j)=Res(l,i,j) + 
     $ + 1.0/area(i,j)*(avbc(l,i,j)*sqrt(dnbx(i,j)**2
     $ + dnby(i,j)**2)-avda(l,i,j)*sqrt(dnbx(i-1,j)**2
     $ + dnby(i-1,j)**2))
        Res(l,i,j)=Res(l,i,j) +1.0/area(i,j)*( avcd(l,i,j)
     $ *sqrt(dnax(i,j)**2
     $ + dnay(i,j)**2)-avab(l,i,j)*sqrt(dnax(i,j-1)**2+dnay(i,j-1)
     $ **2)) + h(l,i,j)


         V(l,i,j)=V(l,i,j)
     $    + 1.0 *Res(l,i,j)*dt(i,j)
	End Forall
         Forall(i=2:mx,j=2:my)
	 p(i,j)=0.4*(V(4,i,j)-0.5/V(1,i,j)
     $ *(V(2,i,j)**2 + V(3,i,j)**2))
	End Forall

         Call bound(V,p,min,pback,lmax,mx,my,exit)
        
	iter=iter+1
	
	End do
	call timer(end,start)
	write(6,*) 'Execution took:',end,'seconds'
	call Contour(V,xloc,yloc,lmax,mx,my )
	
	
	do i=1,mx
	 write(8,*) xloc(i,8),sqrt((V(2,i,my)/V(1,i,my))**2
     $ + (V(3,i,my)/V(1,i,my))**2)/
     $ sqrt(1.4*p(i,my)/V(1,i,my)), sqrt((V(2,i,2)/V(1,i,2))**2
     $ + (V(3,i,2)/V(1,i,2))**2)/
     $ sqrt(1.4*p(i,2)/V(1,i,2)),v(3,i,2)/V(1,i,2)
	end do

	write(8,*) 'and this was last residual',maxval(abs(res))
         End subroutine
! This routine computes the flux vectors across the edges of
! our finite volume element.  The values of the  flow variables
! at the cell edges are assumed to be the average of the neighboring
! cell centers, hence the 0.5*(   everything).  The average
! is multipled by the appropriate edge Deltax or Deltay.
! Forall constructs are used throughout, but some of
! the statements appear choppy because I had to cut them
! into chunks.  The compiler complained that too many run-time
! calls were used when these were single statements.


      Subroutine Flux(V,Fluxab,Fluxbc,Fluxcd,Fluxda,xloc,yloc,
     $ p,lmax,mx,my)
      Implicit none
      Integer lmax,mx,my
      Real  V(lmax,mx+1,my+1)
      Real Fluxab(lmax,mx+1,my+1),Fluxbc(lmax,mx+1,my+1),
     $ Fluxcd(lmax,mx+1,my+1),Fluxda(lmax,mx+1,my+1)
      Real  xloc(mx,my+1),yloc(mx,my+1)
      Real  p(mx+1,my+1)
      Integer l,i,j

!All arrays used in this routine have already been distributed
! or aligned in the calling routine, thus the * notation.
! No remapping or realigning should occur. 

!HPF$ PROCESSORS PROC(NUMBER_OF_PROCESSORS())
!HPF$ DISTRIBUTE  V *(*,*,BLOCK) onto *PROC
!HPF$ ALIGN (*,:) with *V(*,*,:) :: xloc,yloc
!HPF$ ALIGN (*,*,:) with *V (*,*,:) ::Fluxab,Fluxbc,Fluxcd,Fluxda
!HPF$ ALIGN (*,:) with *V (*,*,:) :: p
!************Compute Boundary Fluxes***************
! --------------Top (solid) Boundary---------------
      Fluxcd=0.0
      Forall(i=2:mx)
	 Fluxcd(2,i,my)=p(i,my)*(yloc(i-1,my)-yloc(i,my))
	
         Fluxcd(3,i,my)=-1.0*p(i,my)*(xloc(i-1,my)
     $ -xloc(i,my))
         End Forall
! -------------Bottom (symmetry) Boundary-----------------
      Fluxab=0.0

      Forall(i=2:mx) 
      Fluxab(1,i,2)=0.0
       Fluxab(2,i,2)=(
     $   p(i,2))*(yloc(i,1)-yloc(i-1,1))
       Fluxab(3,i,2)=-1.0*p(i,2)
     $  *(xloc(i,1)-xloc(i-1,1))
       Fluxab(4,i,2)=0.0
       End Forall

!****************Start Interior Points*************

	Forall(i=2:mx,j=3:my)
        Fluxab(1,i,j)=0.5*(V(2,i,j)+
     $ V(2,i,j-1))*(yloc(i,j-1)-yloc(i-1,j-1)) 
     $ -0.5*(V(3,i,j)+V(3,i,j-1))*(xloc(i,j-1)-xloc(i-1,j-1))
	End forall
	
	
	Forall(i=2:mx,j=3:my)
         Fluxab(2,i,j)=0.5*(V(2,i,j)**2
     $ / V(1,i,j) +p(i,j) + V(2,i,j-1)**2/V(1,i,j-1) +p(i,j-1))
     $ *(yloc(i,j-1)-yloc(i-1,j-1))
	Fluxab(2,i,j)=Fluxab(2,i,j) -0.5*(V(2,i,j)*V(3,i,j)/V(1,i,j)
     $ + V(2,i,j-1)*V(3,i,j-1)/V(1,i,j-1))
     $ *(xloc(i,j-1)-xloc(i-1,j-1)) 
        End Forall

	Forall(i=2:mx,j=3:my)
         Fluxab(3,i,j)=0.5*(V(2,i,j)
     $  *v(3,i,j)/V(1,i,j) + V(2,i,j-1)*V(3,i,j-1)/V(1,i,j-1)
     $ )*(yloc(i,j-1)-yloc(i-1,j-1))
	Fluxab(3,i,j)=Fluxab(3,i,j) -0.5*(V(3,i,j)**2/V(1,i,j)
     $ +p(i,j) +p(i,j-1)+ V(3,i,j-1)**2/V(1,i,j-1))
     $ *(xloc(i,j-1)-xloc(i-1,j-1))
	 end forall

	Forall(i=2:mx,j=3:my)
         Fluxab(4,i,j)=(V(4,i,j)
     $ + p(i,j))*V(2,i,j)/V(1,i,j)
	 Fluxab(4,i,j)=Fluxab(4,i,j) + (V(4,i,j-1) +
     $ p(i,j-1))*V(2,i,j-1)/V(1,i,j-1)
	Fluxab(4,i,j)=0.5*Fluxab(4,i,j)*(yloc(i,j-1)-yloc(i-1,j-1))
	Fluxab(4,i,j)=Fluxab(4,i,j)
     $ -0.5*(V(4,i,j)+p(i,j))*V(3,i,j)/V(1,i,j)
     $ *(xloc(i,j-1)-xloc(i-1,j-1))
	 Fluxab(4,i,j)=Fluxab(4,i,j)-
     $ 0.5*(V(4,i,j-1)+p(i,j-1))*V(3,i,j-1)/V(1,i,j-1)
     $ * (xloc(i,j-1)-xloc(i-1,j-1))

	End Forall 
		
! Now bc fluxes

	Forall(i=1:mx,j=2:my)
        Fluxbc(1,i,j)=0.5*(V(2,i,j)
     $ + V(2,i+1,j))*(yloc(i,j)-yloc(i,j-1)) -0.5*(V(3,i,j)
     $ + V(3,i+1,j))*(xloc(i,j)-xloc(i,j-1))
	End Forall

	Forall(i=1:mx,j=2:my)
         Fluxbc(2,i,j)=0.5*(V(2,i,j)**2
     $ /V(1,i,j) + p(i,j)+V(2,i+1,j)**2/V(1,i+1,j)+p(i+1,j))
     $ *(yloc(i,j)-yloc(i,j-1))
	Fluxbc(2,i,j)=Fluxbc(2,i,j)
     $	 -0.5*(V(3,i,j)*V(2,i,j)/V(1,i,j)
     $ + V(3,i+1,j)*V(2,i+1,j)/V(1,i+1,j))
     $ * (xloc(i,j)-xloc(i,j-1))
	end Forall
	
	Forall(i=1:mx,j=2:my)
         Fluxbc(3,i,j)=0.5*(V(2,i,j)
     $  *V(3,i,j)/V(1,i,j) + V(2,i+1,j)*V(3,i+1,j)/V(1,i+1,j))
     $ *(yloc(i,j)-yloc(i,j-1)) 
	Fluxbc(3,i,j)=Fluxbc(3,i,j)-0.5*(V(3,i,j)**2/V(1,i,j)
     $ + V(3,i+1,j)**2/V(1,i+1,j) +p(i,j) +p(i+1,j))
     $ *(xloc(i,j)-xloc(i,j-1))
	End Forall

	Forall(i=1:mx,j=2:my)
         Fluxbc(4,i,j)=(V(4,i,j)
     $ + P(i,j))*V(2,i,j)/V(1,i,j)
	Fluxbc(4,i,j)=Fluxbc(4,i,j) + (V(4,i+1,j)+p(i+1,j))
     $ *V(2,i+1,j)/V(1,i+1,j)
	Fluxbc(4,i,j)=0.5*Fluxbc(4,i,j)*(yloc(i,j)-yloc(i,j-1))
     $  -0.5*(V(4,i,j)
     $ + P(i,j))*V(3,i,j)/V(1,i,j)*(xloc(i,j)-xloc(i,j-1))
        Fluxbc(4,i,j)=Fluxbc(4,i,j)-0.5* (V(4,i+1,j)+p(i+1,j))
     $ *V(3,i+1,j)/V(1,i+1,j)*(xloc(i,j)-xloc(i,j-1))
	End Forall
	

        Forall(l=1:lmax,i=2:mx,j=2:my-1)
        fluxcd(l,i,j)=
     $ -1.0*fluxab(l,i,j+1)
        End FORALL
	

        Forall(l=1:lmax,i=2:mx,j=2:my)
        fluxda(l,i,j)=
     $ -1.0*fluxbc(l,i-1,j)
        End Forall
	
    	
	End Subroutine
 

! This routine computes the arificial dissipation for the scheme.
! The dissipation vectors avab,avbc,avcd,acda are necessary to
! dampen the numerical oscillations that a shock wave produces.
! The vectors are the sum of a second order term and fourth
! order term.  The fourth order term is turned "off" when
! a pressure gradient sensor EPS2 reaches a critical value (kp4).
! Leaving it in would blow the solution up near shock waves.

! Two HPF_LOCAL routines are used because the boundary conditions
! the routine requires (Derivatives are zero at boundaries)
! somehow required communication when they shouldn't have.
! Using HPF_LOCAL insures the procedure is done on a single
! node with no communication.


	extrinsic (hpf_local) subroutine any(V,lmax,mx,my)
	integer lmax,mx,my
        Real   V(:,:,:)
        integer myid,nprocs,ub,lb,i,j,k
!HPF$ PROCESSORS PROC(NUMBER_OF_PROCESSORS())
!HPF$ DISTRIBUTE  V *(*,*,BLOCK) onto *PROC


	myid   = pghpf_myprocnum()
	nprocs = pghpf_nprocs()
! lb and ub are the upper and lower bound of the array's 3rd
! dimension

	lb = lbound(V,3)
        ub = ubound(V,3)
        Do i=2,mx
!Make derivaties zero at top and lower boundary
           Do k=1,lmax
             if (myid == nprocs-1) then
   	       V(k,i,ub)=V(k,i,ub-1)
             else if (myid == 0) then
	       V(k,i,lb)=V(k,i,lb+1)
             endif
           End Do
        End Do
       end subroutine

	
	extrinsic (hpf_local) subroutine notany(V,p,lmax,mx,my)
        integer mx,my,lmax
	Real V(:,:,:)
        Real   p(:,:)
        integer myid,nprocs,ub,lb,i,j,k
!HPF$ PROCESSORS PROC(NUMBER_OF_PROCESSORS())
!HPF$ DISTRIBUTE  V *(*,*,BLOCK) onto *PROC
!HPF$ ALIGN (*,:) with *V(*,*,:) :: p
        myid   = pghpf_myprocnum()
        nprocs = pghpf_nprocs()
 
        lb = lbound(p,2)
        ub = ubound(p,2)
! Make derivatives zero at top and lower boundary
        Do i=2,mx
          
             if (myid == nprocs-1) then
               p(i,ub)=p(i,ub-1)
             else if (myid == 0) then
               p(i,lb)=p(i,lb+1)
             endif
            End Do
       end subroutine


      Subroutine Artvisc(V,avab,avbc,avcd,avda,xloc,yloc,
     $ dnax,dnay,dnbx,dnby,p,lmax,mx,my)
      Implicit none
      Integer lmax,mx,my
      Real   V(lmax,mx+1,my+1)
      Real   avab(lmax,mx+1,my+1),avbc(lmax,mx+1,my+1),
     $ avcd(lmax,mx+1,my+1),avda(lmax,mx+1,my+1)
      Real  xloc(mx,my+1),yloc(mx,my+1),dnbx(mx,my+1),dnby(mx,my+1)
      Real   dnax(mx,my+1),dnay(mx,my+1)
      Real  p(mx+1,my+1)
      Real eps2(mx+1,my+1),eps4(mx+1,my+1)
      Real kp2,kp4,start,end
      Integer k, i,j,imy

! Execpt for eps2 and eps4, all arrays have been previously
! distributed and aligned.  Again, the * tells
! the compiler not to remap or realign.


!HPF$ PROCESSORS PROC(NUMBER_OF_PROCESSORS())
!HPF$ DISTRIBUTE  V *(*,*,BLOCK) onto *PROC
!HPF$ ALIGN (*,:) with *V(*,*,:) :: xloc,yloc,dnax,dnay,dnbx,dnby
!HPF$ ALIGN (*,*,:) with *V (*,*,:) ::avab,avbc,avcd,avda
!HPF$ ALIGN (*,:) with *V(*,*,:) :: p
!HPF$ ALIGN  (*,:) with V(*,*,:) :: eps2,eps4 
 
!! HPF_Local routines must be interfaced, and they
! must have assumed shaped (:,:,:) array declarations
! They also use same mapping.
	interface
	extrinsic (hpf_local) subroutine any(V,lmax,mx,my)
	integer lmax,mx,my
        Real   V(:,:,:)
        integer myid,nprocs,ub,lb,i,j,k
!HPF$ PROCESSORS PROC(NUMBER_OF_PROCESSORS())
!HPF$ DISTRIBUTE  V *(*,*,BLOCK) onto *PROC
	
	end subroutine
      	end interface

	interface
	extrinsic (hpf_local) subroutine notany(V,p,lmax,mx,my)
        integer mx,my,lmax
	Real V(:,:,:)
        Real   p(:,:)
        integer myid,nprocs,ub,lb,i,j,k
!HPF$ PROCESSORS PROC(NUMBER_OF_PROCESSORS())
!HPF$ DISTRIBUTE  V *(*,*,BLOCK) onto *PROC
!HPF$ ALIGN (*,:) with *V(*,*,:) :: p
	end subroutine
	end interface

      kp2=4.0/4.0
      kp4=4.0/256.0

	avab=0.0
	avbc=0.0
	avcd=0.0
	avda=0.0
	
     
!***********Side AB & CD fluxes**********
        
!       call  any(V,lmax,mx,my)
!	call notany(V,p,lmax,mx,my)

! What the  HPF_LOCAL routines do

	p(:,my+1) = p(:,my)
      p(:,1) = p(:,2)
	V(:,:,my+1)=V(:,:,my)
	V(:,:,1)=V(:,:,2)
      Forall(k=1:lmax,i=2:mx)
	 avab(k,i,2)=0.0
       avcd(k,i,my)=0.0
	End Forall

! Instead of the Forall Constructs, I could have used the
! following Independent DO loops with scalar eps2 and eps4 variables

!  !HPF$  INDEPENDENT, new(i)
!       DO j=3,my
!  !HPF$ INDEPENDENT, new(k,eps2,eps4)
!         DO i=2,mx
!  !HPF$ INDEPENDENT 
!	    DO k=1,4
!	 eps2=abs(p(i,j+1)
!     $-2.0*p(i,j)+p(i,j-1))/abs(p(i,j+1)+2.0*p(i,j) +
!     $ p(i,j-1))
!	eps2=kp2*max(eps2,abs(p(i,j)-2.0*p(i,j-1)+p(i,j-2))
!     $ /abs( p(i,j)+2.0*p(i,j-1)+p(i,j-2)))
!
!	eps4=max(0.0,(kp4-eps2))

!	 avab(k,i,j)=0.5*(V(2,i,j)/V(1,i,j)+V(2,i,j-1)/V(1,i,j-1))
!     $ *dnax(i,j)
! 
!        avab(k,i,j)=avab(k,i,j)+0.5*(V(3,i,j)/V(1,i,j)+
!     $V(3,i,j-1)/V(1,i,j-1))*
!     $ dnay(i,j)
!        avab(k,i,j)=abs(avab(k,i,j))
!        avab(k,i,j)=avab(k,i,j)/sqrt(dnax(i,j)**2 + dnay(i,j)**2)
!        avab(k,i,j)=avab(k,i,j)+sqrt(1.4*(p(i,j)+p(i,j-1))/
!     $ (V(1,i,j)+V(1,i,j-1)))
!        avab(k,i,j)=avab(k,i,j)*(eps2(i,j)*(V(k,i,j)-V(k,i,j-1))
!     $ -eps4(i,j)*(V(k,i,j+1)-V(k,i,j)-V(k,i,j)
!     	    End DO
!	  End DO
!       End DO 

      Forall(i=2:mx,j=3:my)

      eps2(i,j)=abs(p(i,j+1)
     $-2.0*p(i,j)+p(i,j-1))/abs(p(i,j+1)+2.0*p(i,j) +
     $ p(i,j-1))
	eps2(I,J)=kp2*MAX(eps2(i,j),abs(p(i,j)-2.0*p(i,j-1)+p(i,j-2))
     $ /abs( p(i,j)+2.0*p(i,j-1)+p(i,j-2)))
      
       eps4(i,j)=max(0.0,(kp4-eps2(i,j)))
       End Forall
	
	
       Forall(k=1:lmax,i=2:mx,j=3:my)

      	avab(k,i,j)=0.5*(V(2,i,j)/V(1,i,j)+V(2,i,j-1)/V(1,i,j-1))
     $ *dnax(i,j)

	avab(k,i,j)=avab(k,i,j)+0.5*(V(3,i,j)/V(1,i,j)+
     $V(3,i,j-1)/V(1,i,j-1))*
     $ dnay(i,j)
	avab(k,i,j)=abs(avab(k,i,j))
	avab(k,i,j)=avab(k,i,j)/sqrt(dnax(i,j)**2 + dnay(i,j)**2)
	avab(k,i,j)=avab(k,i,j)+sqrt(1.4*(p(i,j)+p(i,j-1))/
     $ (V(1,i,j)+V(1,i,j-1)))
	avab(k,i,j)=avab(k,i,j)*(eps2(i,j)*(V(k,i,j)-V(k,i,j-1))
     $ -eps4(i,j)*(V(k,i,j+1)-V(k,i,j)-V(k,i,j)-V(k,i,j)
     $ +V(k,i,j-1)+V(k,i,j-1)+V(k,i,j-1)-V(k,i,j-2)))
     
       End Forall 
	
    
      Forall(k=1:lmax,i=2:mx,j=2:my-1) avcd(k,i,j)=1.0*avab(k,i,j+1)
      
!***********now BC and DA dissipation *********

      Forall(k=1:lmax,j=1:my+1)
       avda(k,2,j)=0.0
       avbc(k,mx,j)=0.0
	avbc(k,mx+1,j)=0.0
       V(k,mx+1,j)=V(k,mx,j)
	V(k,1,j)=V(k,2,j)
      End Forall

      Forall(j=1:my+1)
       p(mx+1,j)=p(mx,j)
	p(1,j)=p(2,j)
      End Forall

      	
      Forall(i=2:mx-1,j=2:my)
      eps2(i,j)=abs(p(i+1,j)
     $ -2.0*p(i,j)+p(i-1,j))/abs(p(i+1,j)+2.0*p(i,j)+p(i-1,j))

	eps2(i,j)=kp2*max(eps2(i,j),
     $ abs(p(i+2,j)-2.0*p(i+1,j)+p(i,j))/abs(p(i+2,j)+2.0*p(i+1,j)
     $ +p(i,j)))

       eps4(i,j)=max(0.0,(kp4-eps2(i,j)))
       End Forall

      Forall(k=1:lmax,i=2:mx-1,j=2:my)
      avbc(k,i,j)=
     $ 0.5*
     $ (V(2,i,j)/V(1,i,j)+V(2,i+1,j)/V(1,i+1,j))*dnbx(i,j)

	 avbc(k,i,j)=avbc(k,i,j)+
     $ 0.5*(V(3,i,j)/V(1,i,j)+V(3,i+1,j)/V(1,i+1,j))*dnby(i,j)
	avbc(k,i,j)=abs(avbc(k,i,j))

	avbc(k,i,j)=avbc(k,i,j)
     $ /sqrt(dnbx(i,j)**2+dnby(i,j)**2) 
	avbc(k,i,j)=avbc(k,i,j)+ sqrt(1.4*(p(i,j)
     $ + p(i+1,j))/(V(1,i,j)+V(1,i+1,j)))

	avbc(k,i,j)=avbc(k,i,j)*(eps2(i,j)*(V(k,i+1,j)
     $ -V(k,i,j)) -eps4(i,j)*(V(k,i+2,j)-V(k,i+1,j)-V(k,i+1,j)
     $ - V(k,i+1,j)+V(k,i,j)+V(k,i,j)+V(k,i,j)
     $ -V(k,i-1,j)))
      End Forall
	
	
      Forall(k=1:lmax,i=3:mx,j=2:my) avda(k,i,j)=1.0*avbc(k,i-1,j)
		

	End subroutine

! This routine computes the local time step, which varies
! varies with mesh spacing.  This provides much quicker
! convergence than the global time step.


      Subroutine time_step(V,dt,p,xloc,yloc,cfl,lmax,mx,my)
      Implicit none
      Integer lmax,mx,my
      Real V(lmax,mx+1,my+1)
      Real xloc(mx,my+1),yloc(mx,my+1)
      Real p(mx+1,my+1)
      Real  dt(mx,my+1)
      Real cfl
      Integer i,j,l

! Simply distribute and align as done in the calling routine

!HPF$ PROCESSORS PROC(NUMBER_OF_PROCESSORS())
!HPF$ DISTRIBUTE  V *(*,*,BLOCK) onto *PROC
!HPF$ ALIGN (*,:) with *V(*,*,:) :: xloc,yloc,dt
!HPF$ ALIGN (*,:) with *V(*,*,:) :: p      

	
	Forall(i=2:mx,j=2:my)
	dt(i,j)=cfl/(abs(V(2,i,j)/V(1,i,j))/(xloc(i,j)-xloc(i-1,j))
     $ + abs(V(3,i,j)/V(1,i,j))/(yloc(i,j)-yloc(i,j-1)) + 
     $ sqrt(1.4*p(i,j)/v(1,i,j))*sqrt(1.0/(xloc(i,j)-xloc(i-1,j))**2
     $ +1.0/(yloc(i,j)-yloc(i,j-1))**2))
	End Forall 
	  End subroutine

!This routine computes the necessary boundary conditions
! at the nozzle exit and entrance.  For a subsonic inlet,
! pressure is simply extrapolated backwards from the
! upstream.  Supersonic inlet pressure is computed
! on the basis of given mach number.
! A supersonic outlet extrapolates the pressure, while
! a subsonic inlet sets the back pressure to the value
! specified by the user.


      Subroutine Bound(V,p,min,pback,lmax,mx,my,exit)
      Integer lmax,mx,my
      Real V(lmax,mx+1,my+1)
      Real  p(mx+1,my+1)
      Real mach(my+1)
      Real min,pback
      Integer exit
! Simple leave incoming distribution and alignment alone
!HPF$ PROCESSORS PROC(NUMBER_OF_PROCESSORS())
!HPF$ DISTRIBUTE  V *(*,*,BLOCK) onto * PROC
!HPF$ ALIGN (*,:) with *V (*,*,:) :: p

  
      If (min .gt. 1.0) then
       Forall(j=1:my+1)
        P(1,j)=1.0/((min**2)/2.0 *(1.4-1.0)
     $  +1.0)**(1.4/0.4)
        V(1,1,j)=P(1,j)*(1.0
     $  + (1.4-1.0)/2.0*min**2)
        V(2,1,j)=V(1,1,j)*min*
     $ sqrt(1.4*p(1,j)/V(1,1,j))
        V(3,1,j)=0.0
        V(4,1,j)=p(1,j)/(0.4) + 0.5*V(1,1,j)
     $ *(V(2,1,j)**2 / V(1,1,j)**2)
        end Forall
       End IF

       If (min .le. 1.0) then
        Forall(j=1:my+1)
        p(1,j)=2.0*p(2,j)-p(3,j)
	 mach(j)=2.0/(1.4-1.0)*(p(1,j)**
     $ ((1.0-1.4)/1.4) -1.0)
         V(1,1,j)=p(1,j)*
     $ (1.0+0.2*mach(j))
        V(2,1,j)=sqrt(mach(j))*sqrt(1.4/(1.0+0.2*mach(j)))
     $ *V(1,1,j)

        V(3,1,j)=0.0

        V(4,1,j)=p(1,j)/(0.4) + 0.5*V(1,1,j)
     $ *(V(2,1,j)**2 / V(1,1,j)**2)
        end Forall
       End IF

       If (exit .eq. 1) then
          Forall(j=1:my+1)
          p(mx+1,j)=pback
           V(1,mx+1,j)=2.0*V(1,mx,j)-V(1,mx-1,j)
           V(2,mx+1,j)=2.0*V(2,mx,j)-V(2,mx-1,j)
           V(3,mx+1,j)=2.0*V(3,mx,j)-V(3,mx-1,j)
           V(4,mx+1,j)=pback/0.4
     $ + 0.5*V(1,mx+1,j)*(V(2,mx+1,j)**2/v(1,mx+1,j)**2 +
     $ V(3,mx+1,j)**2/V(1,mx+1,j)**2)
           end Forall
        End if

        If (exit .eq. 2 ) then
           Forall(j=1:my+1)
           p(mx+1,j) =p(mx,j)

            V(1,mx+1,j)=V(1,mx,j)

           V(2,mx+1,j)=V(2,mx,j)

           V(3,mx+1,j)=V(3,mx,j)
	  
           V(4,mx+1,j)=p(mx+1,j)/0.4
     $ + 0.5*V(1,mx+1,j)*(V(2,mx+1,j)**2/v(1,mx+1,j)**2 +
     $ V(3,mx+1,j)**2/V(1,mx+1,j)**2)
           end Forall
        End if


        End subroutine


      Subroutine Contour(V,xloc,yloc,lmax,mx,my)
      Implicit none
      Integer lmax,mx,my
      Real V(lmax,mx+1,my+1)
      Real xloc(mx,my+1),yloc(mx,my+1)
      Real p(mx,my)
      Real wcell(mx,my)
      Real wnode(mx,my)
      Integer i,j

      open(unit=10,file='info.dat',status='unknown')
	write(10,*) 'VARIABLES= "x","y","u","v","mach","entropy","enthalpy"'
	write(10,*) 'ZONE I=',mx,' J=',my,'F=BLOCK'
1	Format(6f12.6)
      Forall(i=2:mx,j=2:my) p(i,j)=0.4*(V(4,i,j)
     $ -0.5/V(1,i,j)*(V(2,i,j)**2 + V(3,i,j)**2))
      write(10,1) ((xloc(i,j),i=1,mx),j=1,my)
      Write(10,1) ((yloc(i,j),i=1,mx),j=1,my)

!---------Put u component of velocity into file-----

      Forall(i=2:mx,j=2:my) wcell(i,j)=V(2,i,j)/V(1,i,j)
      Forall(i=2:mx-1,j=2:my-1) wnode(i,j)=0.25*(wcell(i+1,j)
     $ + wcell(i+1,j+1) + wcell(i,j+1) + wcell(i,j))

      wnode(1,1)=wcell(2,2)
      wnode(1,my)=wcell(2,my)
      wnode(mx,my)=wcell(mx,my)
      Wnode(mx,1)=wcell(mx,2)

      Forall(j=2:my-1) wnode(1,j)=0.5*(wcell(2,j+1)+
     $ wcell(2,j))
      Forall(j=2:my-1) wnode(mx,j)=0.5*(wcell(mx,j+1)
     $ + wcell(mx,j))
      Forall(i=2:mx-1) wnode(i,1)=0.5*(wcell(i,2)+
     $ wcell(i+1,2))
      Forall(i=2:mx-1) wnode(i,my)=0.5*(wcell(i,my)+wcell(i+1,my))
	write(10,1) ((wnode(i,j),i=1,mx),j=1,my)
!--------Put v component of velocity into file--------

      Forall(i=2:mx,j=2:my)  wcell(i,j)=V(3,i,j)/V(1,i,j) 
      Forall(i=2:mx-1,j=2:my-1) wnode(i,j)=0.25*(wcell(i+1,j)
     $ + wcell(i+1,j+1) + wcell(i,j+1) + wcell(i,j))

      wnode(1,1)=wcell(2,2)
      wnode(1,my)=wcell(2,my)
      wnode(mx,my)=wcell(mx,my)
      Wnode(mx,1)=wcell(mx,2)

      Forall(j=2:my-1) wnode(1,j)=0.5*(wcell(2,j+1)+
     $ wcell(2,j))
      Forall(j=2:my-1) wnode(mx,j)=0.5*(wcell(mx,j+1)
     $ + wcell(mx,j))
      Forall(i=2:mx-1) wnode(i,1)=0.5*(wcell(i,2)+
     $ wcell(i+1,2))
      Forall(i=2:mx-1) wnode(i,my)=0.5*(wcell(i,my)+wcell(i+1,my))
	write(10,1) ((wnode(i,j),i=1,mx),j=1,my)
!----------Put mach number into file----------------

      Forall(i=2:mx,j=2:my) wcell(i,j)=sqrt((V(2,i,j)/V(1,i,j))**2
     $ + (V(3,i,j)/V(1,i,j))**2)/sqrt(1.4*p(i,j)/V(1,i,j))
      
      Forall(i=2:mx-1,j=2:my-1) wnode(i,j)=0.25*(wcell(i+1,j)
     $ + wcell(i+1,j+1) + wcell(i,j+1) + wcell(i,j))

      wnode(1,1)=wcell(2,2)
      wnode(1,my)=wcell(2,my)
      wnode(mx,my)=wcell(mx,my)
      Wnode(mx,1)=wcell(mx,2)

      Forall(j=2:my-1) wnode(1,j)=0.5*(wcell(2,j+1)+
     $ wcell(2,j))
      Forall(j=2:my-1) wnode(mx,j)=0.5*(wcell(mx,j+1)
     $ + wcell(mx,j))
      Forall(i=2:mx-1) wnode(i,1)=0.5*(wcell(i,2)+
     $ wcell(i+1,2))
      Forall(i=2:mx-1) wnode(i,my)=0.5*(wcell(i,my)+wcell(i+1,my))

	write(10,1) ((wnode(i,j),i=1,mx),j=1,my)
!--------Stick entropy in there-------
       Forall(i=2:mx,j=2:my) wcell(i,j)=1.4/(1.4-1.0)*log(p(i,j)
     $ /V(1,i,j)) - log(p(i,j))

        Forall(i=2:mx-1,j=2:my-1) wnode(i,j)=0.25*(wcell(i+1,j)
     $ + wcell(i+1,j+1) + wcell(i,j+1) + wcell(i,j))

      wnode(1,1)=wcell(2,2)
      wnode(1,my)=wcell(2,my)
      wnode(mx,my)=wcell(mx,my)
      Wnode(mx,1)=wcell(mx,2)

      Forall(j=2:my-1) wnode(1,j)=0.5*(wcell(2,j+1)+
     $ wcell(2,j))
      Forall(j=2:my-1) wnode(mx,j)=0.5*(wcell(mx,j+1)
     $ + wcell(mx,j))
      Forall(i=2:mx-1) wnode(i,1)=0.5*(wcell(i,2)+
     $ wcell(i+1,2))
      Forall(i=2:mx-1) wnode(i,my)=0.5*(wcell(i,my)+wcell(i+1,my))

	write(10,1) ((wnode(i,j),i=1,mx),j=1,my)

!----------Enthalpy-----------------

	Forall(i=2:mx,j=2:my) wcell(i,j)=V(4,i,j)/V(1,i,j) +p(i,j)/V(1,i,j)
	 Forall(i=2:mx-1,j=2:my-1) wnode(i,j)=0.25*(wcell(i+1,j)
     $ + wcell(i+1,j+1) + wcell(i,j+1) + wcell(i,j))

      wnode(1,1)=wcell(2,2)
      wnode(1,my)=wcell(2,my)
      wnode(mx,my)=wcell(mx,my)
      Wnode(mx,1)=wcell(mx,2)

      Forall(j=2:my-1) wnode(1,j)=0.5*(wcell(2,j+1)+
     $ wcell(2,j))
      Forall(j=2:my-1) wnode(mx,j)=0.5*(wcell(mx,j+1)
     $ + wcell(mx,j))
      Forall(i=2:mx-1) wnode(i,1)=0.5*(wcell(i,2)+
     $ wcell(i+1,2))
      Forall(i=2:mx-1) wnode(i,my)=0.5*(wcell(i,my)+wcell(i+1,my))

	write(10,1) ((wnode(i,j),i=1,mx),j=1,my)
	close(unit=10)
      End subroutine