How to speed up the calculation?

#pragma TextEncoding = "UTF-8"
#pragma rtGlobals=3     // Use modern global access method and strict wave access.
function makekmesh(nk)
variable nk
variable i,j,k,t=0
make/d/o/n=(nk*nk*nk,3) kwave
for(i=0;i<nk;i++)
  for(j=0;j<nk;j++)
    for(k=0;k<nk;k++)
        kwave[t][0]=i/nk
        kwave[t][1]=j/nk
        kwave[t][2]=k/nk
        t++
    endfor
  endfor
endfor
end
 
function hkmo(nk)
variable nk
wave h1,h2,n1,n2,r1,r2,r3
wave kwave,kwavexs,kwaveys
make/d/o/n=(36,36,nk)/c hk=0
make/d/o/c/n=(36,36,nk) dhx=0,dhy=0
make/n=(36,36)/d/o/c temp1=0
make/d/o/n=(36,nk)/c ev=0
variable i,j,k
variable const=0.0005
    for(j=0;j<nk;j++)
      for(i=0;i<(18*18*(13*13*13));i++)
        if((r1[i]==0)&&(r2[i]==0)&&(r3[i]==0)&&(n1[i]==n2[i]))
          hk[n1[i]-1][n2[i]-1][j]+=(cmplx(h1[i],h2[i])+const)*exp(2*pi*cmplx(0,kwave[j][0]*r1[i]+kwave[j][1]*r2[i]+kwave[j][2]*r3[i]))
          hk[18+n1[i]-1][18+n2[i]-1][j]+=(cmplx(h1[i],h2[i])-const)*exp(2*pi*cmplx(0,kwave[j][0]*r1[i]+kwave[j][1]*r2[i]+kwave[j][2]*r3[i]))
        elseif((r1[i]==0)&&(r2[i]==0)&&(r3[i]==0)&&(n1[i]==3)&&(n2[i]==4))
          hk[n1[i]-1][n2[i]-1][j]+=(cmplx(h1[i],h2[i])-cmplx(0,1)*const)*exp(2*pi*cmplx(0,kwave[j][0]*r1[i]+kwave[j][1]*r2[i]+kwave[j][2]*r3[i]))
          hk[18+n1[i]-1][18+n2[i]-1][j]+=(cmplx(h1[i],h2[i])-cmplx(0,1)*const)*exp(2*pi*cmplx(0,kwave[j][0]*r1[i]+kwave[j][1]*r2[i]+kwave[j][2]*r3[i]))
        elseif((r1[i]==0)&&(r2[i]==0)&&(r3[i]==0)&&(n1[i]==12)&&(n2[i]==13))
          hk[n1[i]-1][n2[i]-1][j]+=(cmplx(h1[i],h2[i])-cmplx(0,1)*const)*exp(2*pi*cmplx(0,kwave[j][0]*r1[i]+kwave[j][1]*r2[i]+kwave[j][2]*r3[i]))
          hk[18+n1[i]-1][18+n2[i]-1][j]+=(cmplx(h1[i],h2[i])-cmplx(0,1)*const)*exp(2*pi*cmplx(0,kwave[j][0]*r1[i]+kwave[j][1]*r2[i]+kwave[j][2]*r3[i]))
        elseif((r1[i]==0)&&(r2[i]==0)&&(r3[i]==0)&&(n1[i]==4)&&(n2[i]==3))
          hk[n1[i]-1][n2[i]-1][j]+=(cmplx(h1[i],h2[i])+cmplx(0,1)*const)*exp(2*pi*cmplx(0,kwave[j][0]*r1[i]+kwave[j][1]*r2[i]+kwave[j][2]*r3[i]))
          hk[18+n1[i]-1][18+n2[i]-1][j]+=(cmplx(h1[i],h2[i])+cmplx(0,1)*const)*exp(2*pi*cmplx(0,kwave[j][0]*r1[i]+kwave[j][1]*r2[i]+kwave[j][2]*r3[i]))
        elseif((r1[i]==0)&&(r2[i]==0)&&(r3[i]==0)&&(n1[i]==13)&&(n2[i]==12))
          hk[n1[i]-1][n2[i]-1][j]+=(cmplx(h1[i],h2[i])+cmplx(0,1)*const)*exp(2*pi*cmplx(0,kwave[j][0]*r1[i]+kwave[j][1]*r2[i]+kwave[j][2]*r3[i]))
          hk[18+n1[i]-1][18+n2[i]-1][j]+=(cmplx(h1[i],h2[i])+cmplx(0,1)*const)*exp(2*pi*cmplx(0,kwave[j][0]*r1[i]+kwave[j][1]*r2[i]+kwave[j][2]*r3[i]))
        elseif((r1[i]==0)&&(r2[i]==0)&&(r3[i]==0)&&(n1[i]==6)&&(n2[i]==7))
          hk[n1[i]-1][n2[i]-1][j]+=(cmplx(h1[i],h2[i])-cmplx(0,1)*const)*exp(2*pi*cmplx(0,kwave[j][0]*r1[i]+kwave[j][1]*r2[i]+kwave[j][2]*r3[i]))
          hk[18+n1[i]-1][18+n2[i]-1][j]+=(cmplx(h1[i],h2[i])-cmplx(0,1)*const)*exp(2*pi*cmplx(0,kwave[j][0]*r1[i]+kwave[j][1]*r2[i]+kwave[j][2]*r3[i]))
        elseif((r1[i]==0)&&(r2[i]==0)&&(r3[i]==0)&&(n1[i]==15)&&(n2[i]==16))
          hk[n1[i]-1][n2[i]-1][j]+=(cmplx(h1[i],h2[i])-cmplx(0,1)*const)*exp(2*pi*cmplx(0,kwave[j][0]*r1[i]+kwave[j][1]*r2[i]+kwave[j][2]*r3[i]))
          hk[18+n1[i]-1][18+n2[i]-1][j]+=(cmplx(h1[i],h2[i])-cmplx(0,1)*const)*exp(2*pi*cmplx(0,kwave[j][0]*r1[i]+kwave[j][1]*r2[i]+kwave[j][2]*r3[i]))
        elseif((r1[i]==0)&&(r2[i]==0)&&(r3[i]==0)&&(n1[i]==7)&&(n2[i]==6))
          hk[n1[i]-1][n2[i]-1][j]+=(cmplx(h1[i],h2[i])+cmplx(0,1)*const)*exp(2*pi*cmplx(0,kwave[j][0]*r1[i]+kwave[j][1]*r2[i]+kwave[j][2]*r3[i]))
          hk[18+n1[i]-1][18+n2[i]-1][j]+=(cmplx(h1[i],h2[i])+cmplx(0,1)*const)*exp(2*pi*cmplx(0,kwave[j][0]*r1[i]+kwave[j][1]*r2[i]+kwave[j][2]*r3[i]))
        elseif((r1[i]==0)&&(r2[i]==0)&&(r3[i]==0)&&(n1[i]==16)&&(n2[i]==15))
          hk[n1[i]-1][n2[i]-1][j]+=(cmplx(h1[i],h2[i])+cmplx(0,1)*const)*exp(2*pi*cmplx(0,kwave[j][0]*r1[i]+kwave[j][1]*r2[i]+kwave[j][2]*r3[i]))
          hk[18+n1[i]-1][18+n2[i]-1][j]+=(cmplx(h1[i],h2[i])+cmplx(0,1)*const)*exp(2*pi*cmplx(0,kwave[j][0]*r1[i]+kwave[j][1]*r2[i]+kwave[j][2]*r3[i]))
        elseif((r1[i]==0)&&(r2[i]==0)&&(r3[i]==0)&&(n1[i]==8)&&(n2[i]==9))
          hk[n1[i]-1][n2[i]-1][j]+=(cmplx(h1[i],h2[i])-2*cmplx(0,1)*const)*exp(2*pi*cmplx(0,kwave[j][0]*r1[i]+kwave[j][1]*r2[i]+kwave[j][2]*r3[i]))
          hk[18+n1[i]-1][18+n2[i]-1][j]+=(cmplx(h1[i],h2[i])-2*cmplx(0,1)*const)*exp(2*pi*cmplx(0,kwave[j][0]*r1[i]+kwave[j][1]*r2[i]+kwave[j][2]*r3[i]))
        elseif((r1[i]==0)&&(r2[i]==0)&&(r3[i]==0)&&(n1[i]==17)&&(n2[i]==18))
          hk[n1[i]-1][n2[i]-1][j]+=(cmplx(h1[i],h2[i])-2*cmplx(0,1)*const)*exp(2*pi*cmplx(0,kwave[j][0]*r1[i]+kwave[j][1]*r2[i]+kwave[j][2]*r3[i]))
          hk[18+n1[i]-1][18+n2[i]-1][j]+=(cmplx(h1[i],h2[i])-2*cmplx(0,1)*const)*exp(2*pi*cmplx(0,kwave[j][0]*r1[i]+kwave[j][1]*r2[i]+kwave[j][2]*r3[i]))
        elseif((r1[i]==0)&&(r2[i]==0)&&(r3[i]==0)&&(n1[i]==9)&&(n2[i]==8))
          hk[n1[i]-1][n2[i]-1][j]+=(cmplx(h1[i],h2[i])+2*cmplx(0,1)*const)*exp(2*pi*cmplx(0,kwave[j][0]*r1[i]+kwave[j][1]*r2[i]+kwave[j][2]*r3[i]))
          hk[18+n1[i]-1][18+n2[i]-1][j]+=(cmplx(h1[i],h2[i])+2*cmplx(0,1)*const)*exp(2*pi*cmplx(0,kwave[j][0]*r1[i]+kwave[j][1]*r2[i]+kwave[j][2]*r3[i]))
        elseif((r1[i]==0)&&(r2[i]==0)&&(r3[i]==0)&&(n1[i]==18)&&(n2[i]==17))
          hk[n1[i]-1][n2[i]-1][j]+=(cmplx(h1[i],h2[i])+2*cmplx(0,1)*const)*exp(2*pi*cmplx(0,kwave[j][0]*r1[i]+kwave[j][1]*r2[i]+kwave[j][2]*r3[i]))
          hk[18+n1[i]-1][18+n2[i]-1][j]+=(cmplx(h1[i],h2[i])+2*cmplx(0,1)*const)*exp(2*pi*cmplx(0,kwave[j][0]*r1[i]+kwave[j][1]*r2[i]+kwave[j][2]*r3[i]))
        else
          hk[n1[i]-1][n2[i]-1][j]+=cmplx(h1[i],h2[i])*exp(2*pi*cmplx(0,kwave[j][0]*r1[i]+kwave[j][1]*r2[i]+kwave[j][2]*r3[i]))
          hk[18+n1[i]-1][18+n2[i]-1][j]+=cmplx(h1[i],h2[i])*exp(2*pi*cmplx(0,kwave[j][0]*r1[i]+kwave[j][1]*r2[i]+kwave[j][2]*r3[i]))
        endif
        dhx[n1[i]-1][n2[i]-1][j]+=cmplx(0,1)*(2.81*10^(-10)*(r2[i]+r3[i]))*(cmplx(h1[i],h2[i]))*exp(2*pi*cmplx(0,kwave[j][0]*r1[i]+kwave[j][1]*r2[i]+kwave[j][2]*r3[i])) 
        dhy[n1[i]-1][n2[i]-1][j]+=cmplx(0,1)*(2.81*10^(-10)*(r1[i]+r3[i]))*(cmplx(h1[i],h2[i]))*exp(2*pi*cmplx(0,kwave[j][0]*r1[i]+kwave[j][1]*r2[i]+kwave[j][2]*r3[i])) 
        dhx[18+n1[i]-1][18+n2[i]-1][j]+=cmplx(0,1)*(2.81*10^(-10)*(r2[i]+r3[i]))*(cmplx(h1[i],h2[i]))*exp(2*pi*cmplx(0,kwave[j][0]*r1[i]+kwave[j][1]*r2[i]+kwave[j][2]*r3[i])) 
        dhy[18+n1[i]-1][18+n2[i]-1][j]+=cmplx(0,1)*(2.81*10^(-10)*(r1[i]+r3[i]))*(cmplx(h1[i],h2[i]))*exp(2*pi*cmplx(0,kwave[j][0]*r1[i]+kwave[j][1]*r2[i]+kwave[j][2]*r3[i]))
      endfor
      temp1=hk[p][q][j]
      MatrixEigenV/SYM/EVEC temp1
      wave W_eigenValues
      ev[][j]=W_eigenValues[p]
      wave M_eigenVectors
      duplicate/o M_eigenVectors,$"v"+num2str(j)
      if(mod(j,100)==0)
        print j
      endif
    endfor
killwaves temp1
end
 
function conduct(nk)
variable nk
wave/c ev,hk
wave h1,h2,r1,r2,r3,n1,n2
wave kwave
make/d/o/n=1000/c txx=0,txy=0,txx2=0,txy2=0,txx3=0,txy3=0
wave/c dhx,dhy
variable i,j,k,c0,c1,l,const=0.0005
variable ita=0.2,ita2=0.3,ita3=0.4,omg
 
for(i=0;i<(nk);i++)
  if(mod(i,100)==0)
    print i
  endif
  duplicate/o $"v"+num2str(i),tt0
  for(j=0;j<16;j++)   
    for(k=16;k<36;k++)
     if((ev[k][i]<10))
      variable/c t0=0,t1=0,t2=0,t3=0
      for(c0=0;c0<36;c0++)
        for(c1=0;c1<36;c1++)          
          if((c0<9)&&(c1>8)&&(c1<18))          
            t0+=conj(tt0[c0][k])*dhx[c0][c1][i]*tt0[c1][j]-conj(tt0[c0][k])*hk[c0][c1][i]*tt0[c1][j]*cmplx(0,1)*(-0.25*2.81*2*10^(-10))
            t1+=conj(tt0[c0][j])*dhx[c0][c1][i]*tt0[c1][k]-conj(tt0[c0][j])*hk[c0][c1][i]*tt0[c1][k]*cmplx(0,1)*(-0.25*2.81*2*10^(-10))
            t2+=conj(tt0[c0][k])*dhy[c0][c1][i]*tt0[c1][j]-conj(tt0[c0][k])*hk[c0][c1][i]*tt0[c1][j]*cmplx(0,1)*(-0.25*2.81*2*10^(-10))
            t3+=conj(tt0[c0][j])*dhy[c0][c1][i]*tt0[c1][k]-conj(tt0[c0][j])*hk[c0][c1][i]*tt0[c1][k]*cmplx(0,1)*(-0.25*2.81*2*10^(-10))
          elseif((c0<9)&&(c1>26))
            t0+=conj(tt0[c0][k])*dhx[c0][c1][i]*tt0[c1][j]-conj(tt0[c0][k])*hk[c0][c1][i]*tt0[c1][j]*cmplx(0,1)*(-0.25*2.81*2*10^(-10))
            t1+=conj(tt0[c0][j])*dhx[c0][c1][i]*tt0[c1][k]-conj(tt0[c0][j])*hk[c0][c1][i]*tt0[c1][k]*cmplx(0,1)*(-0.25*2.81*2*10^(-10))
            t2+=conj(tt0[c0][k])*dhy[c0][c1][i]*tt0[c1][j]-conj(tt0[c0][k])*hk[c0][c1][i]*tt0[c1][j]*cmplx(0,1)*(-0.25*2.81*2*10^(-10))
            t3+=conj(tt0[c0][j])*dhy[c0][c1][i]*tt0[c1][k]-conj(tt0[c0][j])*hk[c0][c1][i]*tt0[c1][k]*cmplx(0,1)*(-0.25*2.81*2*10^(-10))
          elseif((c0>8)&&(c0<18)&&(c1<9))
            t0+=conj(tt0[c0][k])*dhx[c0][c1][i]*tt0[c1][j]-conj(tt0[c0][k])*hk[c0][c1][i]*tt0[c1][j]*cmplx(0,1)*(0.25*2.81*2*10^(-10))
            t1+=conj(tt0[c0][j])*dhx[c0][c1][i]*tt0[c1][k]-conj(tt0[c0][j])*hk[c0][c1][i]*tt0[c1][k]*cmplx(0,1)*(0.25*2.81*2*10^(-10))
            t2+=conj(tt0[c0][k])*dhy[c0][c1][i]*tt0[c1][j]-conj(tt0[c0][k])*hk[c0][c1][i]*tt0[c1][j]*cmplx(0,1)*(0.25*2.81*2*10^(-10))
            t3+=conj(tt0[c0][j])*dhy[c0][c1][i]*tt0[c1][k]-conj(tt0[c0][j])*hk[c0][c1][i]*tt0[c1][k]*cmplx(0,1)*(0.25*2.81*2*10^(-10))
          elseif((c0>8)&&(c0<18)&&(c1>17)&&(c1<27))
            t0+=conj(tt0[c0][k])*dhx[c0][c1][i]*tt0[c1][j]-conj(tt0[c0][k])*hk[c0][c1][i]*tt0[c1][j]*cmplx(0,1)*(0.25*2.81*2*10^(-10))
            t1+=conj(tt0[c0][j])*dhx[c0][c1][i]*tt0[c1][k]-conj(tt0[c0][j])*hk[c0][c1][i]*tt0[c1][k]*cmplx(0,1)*(0.25*2.81*2*10^(-10))
            t2+=conj(tt0[c0][k])*dhy[c0][c1][i]*tt0[c1][j]-conj(tt0[c0][k])*hk[c0][c1][i]*tt0[c1][j]*cmplx(0,1)*(0.25*2.81*2*10^(-10))
            t3+=conj(tt0[c0][j])*dhy[c0][c1][i]*tt0[c1][k]-conj(tt0[c0][j])*hk[c0][c1][i]*tt0[c1][k]*cmplx(0,1)*(0.25*2.81*2*10^(-10))
          elseif((c0>17)&&(c0<27)&&(c1>8)&&(c1<18))  
            t0+=conj(tt0[c0][k])*dhx[c0][c1][i]*tt0[c1][j]-conj(tt0[c0][k])*hk[c0][c1][i]*tt0[c1][j]*cmplx(0,1)*(-0.25*2.81*2*10^(-10))
            t1+=conj(tt0[c0][j])*dhx[c0][c1][i]*tt0[c1][k]-conj(tt0[c0][j])*hk[c0][c1][i]*tt0[c1][k]*cmplx(0,1)*(-0.25*2.81*2*10^(-10))
            t2+=conj(tt0[c0][k])*dhy[c0][c1][i]*tt0[c1][j]-conj(tt0[c0][k])*hk[c0][c1][i]*tt0[c1][j]*cmplx(0,1)*(-0.25*2.81*2*10^(-10))
            t3+=conj(tt0[c0][j])*dhy[c0][c1][i]*tt0[c1][k]-conj(tt0[c0][j])*hk[c0][c1][i]*tt0[c1][k]*cmplx(0,1)*(-0.25*2.81*2*10^(-10))
          elseif((c0>17)&&(c0<27)&&(c1>26))
            t0+=conj(tt0[c0][k])*dhx[c0][c1][i]*tt0[c1][j]-conj(tt0[c0][k])*hk[c0][c1][i]*tt0[c1][j]*cmplx(0,1)*(-0.25*2.81*2*10^(-10))
            t1+=conj(tt0[c0][j])*dhx[c0][c1][i]*tt0[c1][k]-conj(tt0[c0][j])*hk[c0][c1][i]*tt0[c1][k]*cmplx(0,1)*(-0.25*2.81*2*10^(-10))
            t2+=conj(tt0[c0][k])*dhy[c0][c1][i]*tt0[c1][j]-conj(tt0[c0][k])*hk[c0][c1][i]*tt0[c1][j]*cmplx(0,1)*(-0.25*2.81*2*10^(-10))
            t3+=conj(tt0[c0][j])*dhy[c0][c1][i]*tt0[c1][k]-conj(tt0[c0][j])*hk[c0][c1][i]*tt0[c1][k]*cmplx(0,1)*(-0.25*2.81*2*10^(-10))
          elseif((c0>26)&&(c1<9))
            t0+=conj(tt0[c0][k])*dhx[c0][c1][i]*tt0[c1][j]-conj(tt0[c0][k])*hk[c0][c1][i]*tt0[c1][j]*cmplx(0,1)*(0.25*2.81*2*10^(-10))
            t1+=conj(tt0[c0][j])*dhx[c0][c1][i]*tt0[c1][k]-conj(tt0[c0][j])*hk[c0][c1][i]*tt0[c1][k]*cmplx(0,1)*(0.25*2.81*2*10^(-10))
            t2+=conj(tt0[c0][k])*dhy[c0][c1][i]*tt0[c1][j]-conj(tt0[c0][k])*hk[c0][c1][i]*tt0[c1][j]*cmplx(0,1)*(0.25*2.81*2*10^(-10)) 
            t3+=conj(tt0[c0][j])*dhy[c0][c1][i]*tt0[c1][k]-conj(tt0[c0][j])*hk[c0][c1][i]*tt0[c1][k]*cmplx(0,1)*(0.25*2.81*2*10^(-10))
          elseif((c0>26)&&(c1>17)&&(c1<27))
            t0+=conj(tt0[c0][k])*dhx[c0][c1][i]*tt0[c1][j]-conj(tt0[c0][k])*hk[c0][c1][i]*tt0[c1][j]*cmplx(0,1)*(0.25*2.81*2*10^(-10))
            t1+=conj(tt0[c0][j])*dhx[c0][c1][i]*tt0[c1][k]-conj(tt0[c0][j])*hk[c0][c1][i]*tt0[c1][k]*cmplx(0,1)*(0.25*2.81*2*10^(-10))
            t2+=conj(tt0[c0][k])*dhy[c0][c1][i]*tt0[c1][j]-conj(tt0[c0][k])*hk[c0][c1][i]*tt0[c1][j]*cmplx(0,1)*(0.25*2.81*2*10^(-10))  
            t3+=conj(tt0[c0][j])*dhy[c0][c1][i]*tt0[c1][k]-conj(tt0[c0][j])*hk[c0][c1][i]*tt0[c1][k]*cmplx(0,1)*(0.25*2.81*2*10^(-10))  
          else
            t0+=conj(tt0[c0][k])*dhx[c0][c1][i]*tt0[c1][j]
            t1+=conj(tt0[c0][j])*dhx[c0][c1][i]*tt0[c1][k]
            t2+=conj(tt0[c0][k])*dhy[c0][c1][i]*tt0[c1][j]
            t3+=conj(tt0[c0][j])*dhy[c0][c1][i]*tt0[c1][k]
          endif
        endfor
      endfor
      for(omg=0;omg<9.999;omg+=0.01)
        variable difcv=ev[k][i]-ev[j][i]
        txx[omg*100]+=cmplx(0,1)*(t1*t0/(omg-difcv+cmplx(0,ita))+t0*t1/(omg+difcv+cmplx(0,ita)))/difcv
        txy[omg*100]+=cmplx(0,1)*(t1*t2/(omg-difcv+cmplx(0,ita))+t0*t3/(omg+difcv+cmplx(0,ita)))/difcv
        txx2[omg*100]+=cmplx(0,1)*(t1*t0/(omg-difcv+cmplx(0,ita2))+t0*t1/(omg+difcv+cmplx(0,ita2)))/difcv
        txy2[omg*100]+=cmplx(0,1)*(t1*t2/(omg-difcv+cmplx(0,ita2))+t0*t3/(omg+difcv+cmplx(0,ita2)))/difcv
        txx3[omg*100]+=cmplx(0,1)*(t1*t0/(omg-difcv+cmplx(0,ita3))+t0*t1/(omg+difcv+cmplx(0,ita3)))/difcv
        txy3[omg*100]+=cmplx(0,1)*(t1*t2/(omg-difcv+cmplx(0,ita3))+t0*t3/(omg+difcv+cmplx(0,ita3)))/difcv
      endfor
     endif
    endfor
  endfor
endfor
variable ee=1.6*10^(-19)
variable vprim=44.3761*10^(-30)
variable hbar=1.05457*10^(-34)
txx*=ee^2/vprim/(nk)/hbar
txy*=ee^2/vprim/(nk)/hbar
txx2*=ee^2/vprim/(nk)/hbar
txy2*=ee^2/vprim/(nk)/hbar
txx3*=ee^2/vprim/(nk)/hbar
txy3*=ee^2/vprim/(nk)/hbar
end

I have written the procedures above which should be run in the order for example makekmesh(52), hkmo(52*52*52), conduct(52*52*52). Because the kmesh contains so many k points for example 52*52*52 or 60*60*60, this set of procedures need to be run for a very long time. How to speed up the calculation? Thanks. The input waves h1,h2,n1,n2,r1,r2,r3 are larger than 25MB so I cannot upload the input.

At least you should precalculate things like 

- (18*18*(13*13*13)

- ee^2/vprim/(nk)/hbar

- (0.25*2.81*2*10^(-10))

etc, when used more than once, but unless you find alternatives for the nested loops I doubt that this will make a significant difference.

Using MultiThread for wave assignments also help. 

 

 

Two things strike me. First, you have multiple if-elseif conditional steps within a double for-loop construction, all enclosed within a double for-loop construction. You parse through all specific cases and do the general case deep within the constructions. I am struck that this approach may be made faster when calculate the general case first and then go back one at a time on the specifics. Also, I have to wonder whether the if-elseif conditions can be pre-calculated into a matrix of integers that are then used as case switches to work on a 1-D array. For example ...

make/FREE/N=(nx, ny) choice
for(j=0;j<nk;j++)
    for(i=0;i<(18*18*(13*13*13));i++)
        if((r1[i]==0)&&(r2[i]==0)&&(r3[i]==0)&&(n1[i]==n2[i]))
            choice[i][j] = 0
        elseif((r1[i]==0)&&(r2[i]==0)&&(r3[i]==0)&&(n1[i]==3)&&(n2[i]==4))
            choice[i][j] = 1
...
 
endfor
redimension ... // redimension all [i][j] waves to a 1-D array
for (j=0;j<nk*...,j++)
   switch(choice[j]
      case 0:
         ...
         break
      case 1:
         ...
         break
 
   endswitch
endfor
redimension ... // regenerate matrix

Secondly, you repeat comparable algebraic expressions inside the (else if) constructions, with the only parameters that change being the iteration indices. I would wonder whether you should construct a function that takes the input indices and does what is needed. For example, consider your long expression versus the function call ...

t0+=conj(tt0[c0][k])*dhx[c0][c1][i]*tt0[c1][j]-conj(tt0[c0][k])*hk[c0][c1][i]*tt0[c1][j]*cmplx(0,1)*(-0.25*2.81*2*10^(-10))
 
Function calc_tvalue( variable dhx, variable hkA, variable tt0A, variable tt0B )
    variable rtnv
    variable pf = -0.25*2.18*2*10^(-10)
    rtnv = dhx - pf*hkA*cmplx(0,1)
    rtnv *= conj(tt0A)*tt0B
    return rtnv
end

I may not have gotten the expression exactly correct. The point remains that the function is condensed, applicable to any case for the template input, and far more readable. 

Finally, I am curious whether your matrix has a symmetry that you can exploit. For example, can you calculate all the elements on one side or another of the [i][j][k] pairings, and then do a symmetry inversion to fill the other side?

 

Just as example, your function makemesh(nk) could be replaced by something like:

function foo(variable nk)
    make/d/o/N=(nk^3,3) kwave2 = 0
    MultiThread kwave2[][0] = trunc(p/nk^2) * 1/nk
    // kwave2[][1] = ....
    MultiThread kwave2[][2] = mod(p,nk)/nk
end

if you find an expression for the second column of kwave. On my machine and nk=200 this takes 0.5 sec vs 5 sec for your version.

 

 

 

Hello Yuan Fang,

The first issue that you may want to address is going over your loops and in each case look at the current code and identify elements that are repeated more than once within each loop.  For example,

cmplx(0,1)*(0.25*2.81*2*10^(-10)

appears multiple times in a loop and there is no reason to calculate it more than once outside the loop, save it in a local variable and use that in the loop expressions.  This simple revision will simplify your code and save on computation time.  I realize that you may place high value on code readability but here you take the readable code and factor out anything that is computed more than once in a loop.  I even define a constant for 2*pi so as to optimize that factor.

Your next step is to examine every expression in every loop.  If the expression does not depend on the loop iteration/index, factor it out of the loop and use a local variable in its place.

Next examine your code to see if you can replace some code with MultiThread or MatrixOP commands.

AG

I was curious about the code and asked ChatGPT for the likely purpose of the function hkmo(nk). This was the (shortened) answer:

"This Igor Pro function, hkmo(nk), appears to be constructing and diagonalizing a Hamiltonian matrix in momentum space for a solid-state physics or condensed matter physics problem. Here’s a breakdown of what the function is doing:

...

...

Conclusion

This function likely constructs a momentum-space tight-binding Hamiltonian, diagonalizes it to obtain the band structure, and computes derivatives for additional properties. The model seems to be describing a complex multi-band system (possibly a 2D or 3D material with 36 orbitals)."

 

Then I asked if he/she/it (?) can optimise it for speed. With a bit of manual tweaking the code compiles, but I have absolutely no idea if it does what it's supposed to do, but it looks far more readable. I hope @Yuan Fang can tell us.

Function hkmo_OPT(Variable  nk)
    Wave/Z h1, h2, n1, n2, r1, r2, r3
    Wave/Z kwave, kwavexs, kwaveys
    Make/D/O/N=(36,36,nk)/C hk=0
    Make/D/O/C/N=(36,36,nk) dhx=0, dhy=0
    Make/N=(36,36)/D/O/C temp1=0
    Make/D/O/N=(36,nk)/C ev=0
    
    Variable i, j, k
    Variable const = 0.0005
    Variable twoPi = 2 * Pi
    Complex expFactor
 
    // Precompute common values
    Make/N=(nk, 3)/O waveExpFactor
    For (j=0; j<nk; j++)
        waveExpFactor[j][0] = twoPi * kwave[j][0]
        waveExpFactor[j][1] = twoPi * kwave[j][1]
        waveExpFactor[j][2] = twoPi * kwave[j][2]
    EndFor
 
    // Loop over nk (momentum points)
    For (j=0; j<nk; j++)
        For (i=0; i<(18*18*13*13*13); i++)
            
            // Avoid redundant calculations
            If (r1[i] == 0 && r2[i] == 0 && r3[i] == 0)
                Variable idx1 = n1[i] - 1
                Variable idx2 = n2[i] - 1
                Variable idx1s = 18 + idx1
                Variable idx2s = 18 + idx2
                
                Complex hComplex = Cmplx(h1[i], h2[i])
                expFactor = Exp(Cmplx(0, waveExpFactor[j][0] * r1[i] + waveExpFactor[j][1] * r2[i] + waveExpFactor[j][2] * r3[i]))
 
                // Precompute adjustment terms
                Complex delta = 0
                If (n1[i] == n2[i])
                    delta = const
                ElseIf ((n1[i] == 3 && n2[i] == 4) || (n1[i] == 12 && n2[i] == 13) || (n1[i] == 6 && n2[i] == 7) || (n1[i] == 15 && n2[i] == 16))
                    delta = -Cmplx(0, 1) * const
                ElseIf ((n1[i] == 4 && n2[i] == 3) || (n1[i] == 13 && n2[i] == 12) || (n1[i] == 7 && n2[i] == 6) || (n1[i] == 16 && n2[i] == 15))
                    delta = Cmplx(0, 1) * const
                ElseIf ((n1[i] == 8 && n2[i] == 9) || (n1[i] == 17 && n2[i] == 18))
                    delta = -2 * Cmplx(0, 1) * const
                ElseIf ((n1[i] == 9 && n2[i] == 8) || (n1[i] == 18 && n2[i] == 17))
                    delta = 2 * Cmplx(0, 1) * const
                Endif
                
                Complex hkValue = (hComplex + delta) * expFactor
                hk[idx1][idx2][j] += hkValue
                hk[idx1s][idx2s][j] += hkValue
 
                // Compute derivatives dhx and dhy efficiently
                Variable dhFactorX = 2.81e-10 * (r2[i] + r3[i])
                Variable dhFactorY = 2.81e-10 * (r1[i] + r3[i])
                Complex dhValX = Cmplx(0, 1) * dhFactorX * hComplex * expFactor
                Complex dhValY = Cmplx(0, 1) * dhFactorY * hComplex * expFactor
                
                dhx[idx1][idx2][j] += dhValX
                dhy[idx1][idx2][j] += dhValY
                dhx[idx1s][idx2s][j] += dhValX
                dhy[idx1s][idx2s][j] += dhValY
            EndIf
        EndFor
 
        // Diagonalization (Eigenvalues and Eigenvectors)
        temp1 = hk[p][q][j]
        MatrixEigenV/SYM/EVEC temp1
        Wave W_eigenValues
        ev[][j] = W_eigenValues[p]
        Wave M_eigenVectors
        Duplicate/O M_eigenVectors, $"v" + num2str(j)
 
        If (Mod(j, 100) == 0)
            Print j
        EndIf
    EndFor
    KillWaves/Z temp1
End

 

EDIT:

 // Precompute common values
    Make/N=(nk, 3)/O waveExpFactor
    For (j=0; j<nk; j++)
        waveExpFactor[j][0] = twoPi * kwave[j][0]
        waveExpFactor[j][1] = twoPi * kwave[j][1]
        waveExpFactor[j][2] = twoPi * kwave[j][2]
    EndFor

can be written as 

 // Precompute common values
    Make/N=(nk, 3)/O waveExpFactor
    MultiThread waveExpFactor = twoPi * kwave[p][q]


 


 

Thank you all. My problem has been solved by translating from Igor to mpi4py and run the code on supercomputer.