/**********************************************************************
  Cluster_DFT.c:

     Cluster_DFT.c is a subroutine to perform cluster calculations

  Log of Cluster_DFT.c:

     22/Nov/2001  Released by T.Ozaki

***********************************************************************/

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include "openmx_common.h"

#define  measure_time   0

#ifdef nompi
#include "mimic_mpi.h"
#else
#include "mpi.h"
#endif

static double Cluster_collinear(
                   int SCF_iter,
                   int SpinP_switch,
                   double ***C,
                   double **ko,
                   double *****nh, 
                   double ****CntOLP,
                   double *****CDM,
                   double *****EDM,
                   double Eele0[2], double Eele1[2]);

static double Cluster_non_collinear(
                   int SCF_iter,
                   int SpinP_switch,
                   double *****nh,
                   double *****ImNL,
                   double ****CntOLP,
                   double *****CDM,
                   double *****EDM,
                   double Eele0[2], double Eele1[2]);



double Cluster_DFT(int SCF_iter,
                   int SpinP_switch,
                   double ***Cluster_ReCoes,
                   double **Cluster_ko,
                   double *****nh,
                   double *****ImNL,
                   double ****CntOLP,
                   double *****CDM,
                   double *****EDM,
                   double Eele0[2], double Eele1[2])
{
  static double time0;

  /****************************************************
         collinear without spin-orbit coupling
  ****************************************************/

  if ( (SpinP_switch==0 || SpinP_switch==1) && SO_switch==0 ){

    time0 = Cluster_collinear(SCF_iter,SpinP_switch,Cluster_ReCoes,Cluster_ko,
                              nh,CntOLP,CDM,EDM,Eele0,Eele1);
  }

  /****************************************************
           collinear with spin-orbit coupling
  ****************************************************/

  else if ( (SpinP_switch==0 || SpinP_switch==1) && SO_switch==1 ){
    printf("Spin-orbit coupling is not supported for collinear DFT calculations.\n");
    MPI_Finalize();
    exit(1);
  }

  /****************************************************
   non-collinear with and without spin-orbit coupling
  ****************************************************/

  else if (SpinP_switch==3){
    time0 = Cluster_non_collinear(SCF_iter,SpinP_switch,nh,ImNL,CntOLP,CDM,EDM,Eele0,Eele1);
  }

  return time0;
}



double Cluster_collinear(
                   int SCF_iter,
                   int SpinP_switch,
                   double ***C,
                   double **ko,
                   double *****nh, double ****CntOLP,
                   double *****CDM,
                   double *****EDM,
                   double Eele0[2], double Eele1[2])
{
  static int firsttime=1;
  int i,j,l,n,n2,n1,i1,j1,k1,l1;
  int wan,HOMO0,HOMO1;
  int *MP;
  int step,wstep,nstep,istart,iend;
  int spin,po,num0,num1;
  int ct_AN,k,wanA,tnoA,wanB,tnoB;
  int GA_AN,Anum,loopN,Gc_AN;
  int MA_AN,LB_AN,GB_AN,Bnum,MaxN;
  int wan1,mul,m,bcast_flag;
  int *is1,*ie1,*is2,*ie2;
  double time0,lumos,av_num;
  double *OneD_Mat1;
  double ***H;
  double TZ,my_sum,sum,sumE,max_x=50.0;
  double My_Eele1[2],tmp1,tmp2;
  double Num_State,x,FermiF,Dnum,Dnum2;
  double dum,ChemP_MAX,ChemP_MIN,spin_degeneracy;
  double TStime,TEtime;
  double FermiEps = 1.0e-13;
  double EV_cut0;
  int numprocs0,myid0;
  int numprocs1,myid1;
  int Num_Comm_World1;
  int ID,myworld1;
  int *NPROCS_ID1,*NPROCS_WD1;
  int *Comm_World1;
  int *Comm_World_StartID1;
  MPI_Comm *MPI_CommWD1;
  char *Name_Angular[Supported_MaxL+1][2*(Supported_MaxL+1)+1];
  char *Name_Multiple[20];
  double OLP_eigen_cut=Threshold_OLP_Eigen;
  char file_EV[YOUSO10] = ".EV";
  char buf[fp_bsize];          /* setvbuf */
  FILE *fp_EV;
  double stime, etime;
  double time1,time2,time3,time4,time5,time6,time7;

  MPI_Status *stat_send;
  MPI_Request *request_send;
  MPI_Request *request_recv;
 
  /* for time */
  MPI_Barrier(mpi_comm_level1);
  dtime(&TStime);

  /* MPI */
  MPI_Comm_size(mpi_comm_level1,&numprocs0);
  MPI_Comm_rank(mpi_comm_level1,&myid0);

  Num_Comm_World1 = SpinP_switch + 1; 

  stat_send = malloc(sizeof(MPI_Status)*numprocs0);
  request_send = malloc(sizeof(MPI_Request)*numprocs0);
  request_recv = malloc(sizeof(MPI_Request)*numprocs0);

  /***********************************************
      allocation of arrays for the first world 
  ***********************************************/

  NPROCS_ID1 = (int*)malloc(sizeof(int)*numprocs0); 
  Comm_World1 = (int*)malloc(sizeof(int)*numprocs0); 
  NPROCS_WD1 = (int*)malloc(sizeof(int)*Num_Comm_World1); 
  Comm_World_StartID1 = (int*)malloc(sizeof(int)*Num_Comm_World1); 
  MPI_CommWD1 = (MPI_Comm*)malloc(sizeof(MPI_Comm)*Num_Comm_World1);

  /*********************************************** 
            make the first level worlds 
  ***********************************************/

  Make_Comm_Worlds(mpi_comm_level1, myid0, numprocs0, Num_Comm_World1, &myworld1, MPI_CommWD1, 
                   NPROCS_ID1, Comm_World1, NPROCS_WD1, Comm_World_StartID1);

  /*********************************************** 
       for pallalel calculations in myworld1
  ***********************************************/

  MPI_Comm_size(MPI_CommWD1[myworld1],&numprocs1);
  MPI_Comm_rank(MPI_CommWD1[myworld1],&myid1);

  /****************************************************
             calculation of the array size
  ****************************************************/

  n = 0;
  for (i=1; i<=atomnum; i++){
    wanA  = WhatSpecies[i];
    n = n + Spe_Total_CNO[wanA];
  }
  n2 = n + 2;

  /****************************************************
   Allocation

   int     MP[List_YOUSO[1]]
   double  H[List_YOUSO[23]][n2][n2]  
  ****************************************************/

  MP = (int*)malloc(sizeof(int)*List_YOUSO[1]);

  H = (double***)malloc(sizeof(double**)*List_YOUSO[23]);
  for (i=0; i<List_YOUSO[23]; i++){
    H[i] = (double**)malloc(sizeof(double*)*n2);
    for (j=0; j<n2; j++){
      H[i][j] = (double*)malloc(sizeof(double)*n2);
    }
  }

  is1 = (int*)malloc(sizeof(int)*numprocs1);
  ie1 = (int*)malloc(sizeof(int)*numprocs1);

  is2 = (int*)malloc(sizeof(int)*numprocs1);
  ie2 = (int*)malloc(sizeof(int)*numprocs1);

  if (firsttime){
    PrintMemory("Cluster_DFT: H",sizeof(double)*List_YOUSO[23]*n2*n2,NULL);
  }

  /* for PrintMemory */
  firsttime=0;

  if (measure_time){
    time1 = 0.0;
    time2 = 0.0;
    time3 = 0.0;
    time4 = 0.0;
    time5 = 0.0;
    time6 = 0.0;
    time7 = 0.0;
  }

  if      (SpinP_switch==0) spin_degeneracy = 2.0;
  else if (SpinP_switch==1) spin_degeneracy = 1.0;

  /****************************************************
                  total core charge
  ****************************************************/

  TZ = 0.0;
  for (i=1; i<=atomnum; i++){
    wan = WhatSpecies[i];
    TZ = TZ + Spe_Core_Charge[wan];
  }

  /****************************************************
         set the matrix size n
  ****************************************************/

  n = Size_Total_Matrix;

  /****************************************************
         find the numbers of partions for MPI
  ****************************************************/

  if ( numprocs1<=n ){

    av_num = (double)n/(double)numprocs1;

    for (ID=0; ID<numprocs1; ID++){
      is1[ID] = (int)(av_num*(double)ID) + 1; 
      ie1[ID] = (int)(av_num*(double)(ID+1)); 
    }

    is1[0] = 1;
    ie1[numprocs1-1] = n; 

  }

  else{

    for (ID=0; ID<n; ID++){
      is1[ID] = ID + 1; 
      ie1[ID] = ID + 1;
    }
    for (ID=n; ID<numprocs1; ID++){
      is1[ID] =  1;
      ie1[ID] = -2;
    }
  }

  /****************************************************
       1. diagonalize the overlap matrix     
       2. search negative eigenvalues
  ****************************************************/

  MPI_Barrier(mpi_comm_level1);

  if (SCF_iter==1){
    Overlap_Cluster(CntOLP,S,MP);
  }

  for (spin=0; spin<=SpinP_switch; spin++){
    Hamiltonian_Cluster(nh[spin],H[spin],MP);
  } 

  if (SCF_iter==1){

    if (measure_time) dtime(&stime);

    /* all the processors have the full output S matrix */

    bcast_flag = 1;
    Eigen_PReHH(mpi_comm_level1,S,ko[0],n,n,bcast_flag);

    /* minus eigenvalues to 1.0e-14 */

    for (l=1; l<=n; l++){
      if (ko[0][l]<0.0) ko[0][l] = 1.0e-14;
      EV_S[l] = ko[0][l];
    }

    /* print to the standard output */

    if (2<=level_stdout && myid0==Host_ID){
      for (l=1; l<=n; l++){
	printf("  Eigenvalues of OLP  %2d  %18.15f\n",l,ko[0][l]);fflush(stdout);
      }
    }

    /* calculate S*1/sqrt(ko) */

    for (l=1; l<=n; l++){
      IEV_S[l] = 1.0/sqrt(ko[0][l]);
    }

    SP_PEV = 1;

    for (i1=1; i1<=n; i1++){
      for (j1=1; j1<=n; j1++){
	S[i1][j1] = S[i1][j1]*IEV_S[j1];
      }
    }

    if (measure_time){
      dtime(&etime);
      time1 += etime - stime; 
    }
  }

  /****************************************************
    calculations of eigenvalues for up and down spins

     Note:
         MP indicates the starting position of
              atom i in arraies H and S
  ****************************************************/

  MPI_Barrier(mpi_comm_level1);

  /* initialize ko */
  for (spin=0; spin<=SpinP_switch; spin++){
    for (i1=1; i1<=n; i1++){
      ko[spin][i1] = 10000.0;
    }
  }

  /* spin=myworld1 */

  spin = myworld1;

 diagonalize:

  if (measure_time) dtime(&stime);

  /* transpose S */
  for (i1=1; i1<=n; i1++){
    for (j1=i1+1; j1<=n; j1++){
      tmp1 = S[i1][j1];
      tmp2 = S[j1][i1];
      S[i1][j1] = tmp2;
      S[j1][i1] = tmp1;
    }
  }

  /* C is distributed by row in each processor */

  for (j1=is1[myid1]; j1<=ie1[myid1]; j1++){
    for (i1=1; i1<=n; i1++){
      sum = 0.0;
      for (l=1; l<=n; l++){
	sum += H[spin][i1][l]*S[j1][l];
      }
      C[spin][j1][i1] = sum;
    }
  }

  /* H is distributed by row in each processor */

  for (j1=is1[myid1]; j1<=ie1[myid1]; j1++){
    for (i1=1; i1<=n; i1++){
      sum = 0.0;
      for (l=1; l<=n; l++){
	sum += S[i1][l]*C[spin][j1][l];
      }
      H[spin][j1][i1] = sum;
    }
  }

  /* broadcast H[spin] */

  BroadCast_ReMatrix(MPI_CommWD1[myworld1],H[spin],n,is1,ie1,myid1,numprocs1,
                       stat_send,request_send,request_recv);

  /* H to C (transposition) */

  for (i1=1; i1<=n; i1++){
    for (j1=1; j1<=n; j1++){
      C[spin][j1][i1] = H[spin][i1][j1];
    }
  }

  /* penalty for ill-conditioning states */

  EV_cut0 = Threshold_OLP_Eigen;

  for (i1=1; i1<=n; i1++){

    if (EV_S[i1]<EV_cut0){
      C[spin][i1][i1] += pow((EV_S[i1]/EV_cut0),-2.0) - 1.0;
    }
 
    /* cutoff the interaction between the ill-conditioned state */
 
    if (1.0e+3<C[spin][i1][i1]){
      for (j1=1; j1<=n; j1++){
	C[spin][i1][j1] = 0.0;
	C[spin][j1][i1] = 0.0;
      }
      C[spin][i1][i1] = 1.0e+4;
    }
  }

  /* find the maximum states in solved eigenvalues */

  if (SCF_iter==1){
    MaxN = n;
  }
  else {
    lumos = (double)n*0.050;      
    if (lumos<60.0) lumos = 60.0;
    MaxN = (TZ-system_charge)/2 + (int)lumos;
    if (n<MaxN) MaxN = n;
  }

  if ( numprocs1<=MaxN ){

    av_num = (double)MaxN/(double)numprocs1;

    for (ID=0; ID<numprocs1; ID++){
      is2[ID] = (int)(av_num*(double)ID) + 1; 
      ie2[ID] = (int)(av_num*(double)(ID+1)); 
    }

    is2[0] = 1;
    ie2[numprocs1-1] = MaxN; 
  }

  else{
    for (ID=0; ID<MaxN; ID++){
      is2[ID] = ID + 1; 
      ie2[ID] = ID + 1;
    }
    for (ID=MaxN; ID<numprocs1; ID++){
      is2[ID] =  1;
      ie2[ID] = -2;
    }
  }

  if (measure_time){
    dtime(&etime);
    time2 += etime - stime;
  }

  /*  The output C matrix is distributed by column. */

  if (measure_time) dtime(&stime);

  bcast_flag = 0;
  Eigen_PReHH(MPI_CommWD1[myworld1],C[spin],ko[spin],n,MaxN,bcast_flag);

  /*  The H matrix is distributed by row */
  for (i1=1; i1<=n; i1++){
    for (j1=is2[myid1]; j1<=ie2[myid1]; j1++){
      H[spin][j1][i1] = C[spin][i1][j1];
    }
  }

  if (measure_time){
    dtime(&etime);
    time3 += etime - stime;
  }

  /****************************************************
      transformation to the original eigenvectors.
                       NOTE 244P
  ****************************************************/

  if (measure_time) dtime(&stime);

  /* transpose */

  for (i1=1; i1<=n; i1++){
    for (j1=i1+1; j1<=n; j1++){
      tmp1 = S[i1][j1];
      tmp2 = S[j1][i1];
      S[i1][j1] = tmp2;
      S[j1][i1] = tmp1;
    }
  }

  /* C is distributed by row in each processor */
  for (j1=is2[myid1]; j1<=ie2[myid1]; j1++){
    for (i1=1; i1<=n; i1++){
      sum = 0.0;
      for (l=1; l<=n; l++){
	sum += S[i1][l]*H[spin][j1][l];
      }
      C[spin][j1][i1] = sum;
    }
  }

  /****************************************************
   MPI: C

   Since is2 and ie2 depend on the spin index, we must
   call BroadCast_ReMatrix in the spin loop.
  ****************************************************/

  /* broadcast C:
     C is distributed by row in each processor
  */

  BroadCast_ReMatrix(MPI_CommWD1[myworld1],C[spin],n,is2,ie2,myid1,numprocs1,
                     stat_send,request_send,request_recv);

  if (measure_time){
    dtime(&etime);
    time4 += etime - stime;
  }

  if (SpinP_switch==1 && numprocs0==1 && spin==0){
    spin++;  
    goto diagonalize; 
  }

  if (measure_time) dtime(&stime);

  /*********************************************** 
    MPI: ko and C 
  ***********************************************/
 
  /* communicate every 100 MByte */

  nstep = 8*MaxN*n/(1000000*100) + 1;
  wstep = MaxN/nstep;

 /* allocate */
  OneD_Mat1 = (double*)malloc(sizeof(double)*(wstep+2)*(n+1));

  for (spin=0; spin<=SpinP_switch; spin++){

    for (step=0; step<nstep; step++){

      istart = step*wstep + 1;
      iend   = (nstep+1)*wstep;
      if (MaxN<iend)       iend = MaxN;
      if (step==(nstep-1)) iend = MaxN; 

      if (istart<=iend){

	for (i=istart; i<=iend; i++){
	  i1 = (i - istart)*n;
	  for (j=1; j<=n; j++){
	    k = i1 + j - 1;
	    OneD_Mat1[k] = C[spin][i][j];
	  }
	}

        MPI_Barrier(mpi_comm_level1);
	MPI_Bcast(&OneD_Mat1[0],(iend-istart+1)*n,MPI_DOUBLE,
		  Comm_World_StartID1[spin],mpi_comm_level1);

	for (i=istart; i<=iend; i++){
	  i1 = (i - istart)*n;
	  for (j=1; j<=n; j++){
	    k = i1 + j - 1;
	    C[spin][i][j] = OneD_Mat1[k];
	  }
	}
      }
    }
  }

  for (spin=0; spin<=SpinP_switch; spin++){
    MPI_Bcast(&ko[spin][1],MaxN,MPI_DOUBLE,Comm_World_StartID1[spin],mpi_comm_level1);
  }

  /* free */
  free(OneD_Mat1);

  /****************************************************
              searching of chemical potential
  ****************************************************/

  if (2<=level_stdout){
    for (i1=1; i1<=MaxN; i1++){
      if (SpinP_switch==0)
        printf("  Eigenvalues of Kohn-Sham %2d %15.12f %15.12f\n",
                    i1,ko[0][i1],ko[0][i1]);
      else 
        printf("  Eigenvalues of Kohn-Sham %2d %15.12f %15.12f\n",
                  i1,ko[0][i1],ko[1][i1]);
    }
  }

  po = 0;
  loopN = 0;

  ChemP_MAX = 30.0;  
  ChemP_MIN =-30.0;
  
  do {
    ChemP = 0.50*(ChemP_MAX + ChemP_MIN);
    Num_State = 0.0;
    for (spin=0; spin<=SpinP_switch; spin++){
      for (i1=1; i1<=MaxN; i1++){
        x = (ko[spin][i1] - ChemP)*Beta;
        if (x<=-max_x) x = -max_x;
        if (max_x<=x)  x = max_x;
        FermiF = 1.0/(1.0 + exp(x));
        Num_State = Num_State + spin_degeneracy*FermiF;
        if (0.5<FermiF) Cluster_HOMO[spin] = i1;
      }
    }
    Dnum = (TZ - Num_State) - system_charge;
    if (0.0<=Dnum) ChemP_MIN = ChemP;
    else           ChemP_MAX = ChemP;
    if (fabs(Dnum)<1.0e-13) po = 1;
    loopN++;
  } 
  while (po==0 && loopN<1000); 
  
  if (2<=level_stdout){
    printf("  ChemP=%15.12f\n",ChemP);
  }

  if (measure_time){
    dtime(&etime);
    time5 += etime - stime;
  }

  /****************************************************
            Energies by summing up eigenvalues
  ****************************************************/

  Eele0[0] = 0.0;
  Eele0[1] = 0.0;
  for (spin=0; spin<=SpinP_switch; spin++){
    for (i1=1; i1<=MaxN; i1++){

      x = (ko[spin][i1] - ChemP)*Beta;
      if (x<=-max_x) x = -max_x;
      if (max_x<=x)  x = max_x;
      FermiF = 1.0/(1.0 + exp(x));
      Eele0[spin] += ko[spin][i1]*FermiF;

    }
  }
  if (SpinP_switch==0){
    Eele0[1] = Eele0[0];
  }
 
 /****************************************************
      LCAO coefficients are stored for calculating
               values of MOs on grids
  ****************************************************/

  if (SpinP_switch==0){
    if ( (Cluster_HOMO[0]-num_HOMOs+1)<1 )  num_HOMOs = Cluster_HOMO[0];
    if ( (Cluster_HOMO[0]+num_LUMOs)>MaxN ) num_LUMOs = MaxN - Cluster_HOMO[0];
  }
  else if (SpinP_switch==1){
    if ( (Cluster_HOMO[0]-num_HOMOs+1)<1 )  num_HOMOs = Cluster_HOMO[0];
    if ( (Cluster_HOMO[1]-num_HOMOs+1)<1 )  num_HOMOs = Cluster_HOMO[1];
    if ( (Cluster_HOMO[0]+num_LUMOs)>MaxN ) num_LUMOs = MaxN - Cluster_HOMO[0];
    if ( (Cluster_HOMO[1]+num_LUMOs)>MaxN ) num_LUMOs = MaxN - Cluster_HOMO[1];
  }

  if (myid0==Host_ID){
    if (SpinP_switch==0 && 2<=level_stdout){
      printf("  HOMO = %2d\n",Cluster_HOMO[0]);
    }
    else if (SpinP_switch==1 && 2<=level_stdout){
      printf("  HOMO for up-spin   = %2d\n",Cluster_HOMO[0]);
      printf("  HOMO for down-spin = %2d\n",Cluster_HOMO[1]);
    }
  }

  if (MO_fileout==1){  

    /* HOMOs */

    for (spin=0; spin<=SpinP_switch; spin++){
      for (j=0; j<num_HOMOs; j++){
        j1 = Cluster_HOMO[spin] - j;
	for (GA_AN=1; GA_AN<=atomnum; GA_AN++){
	  wanA = WhatSpecies[GA_AN];
	  tnoA = Spe_Total_CNO[wanA];
	  Anum = MP[GA_AN];
	  for (i=0; i<tnoA; i++){
	    HOMOs_Coef[0][spin][j][GA_AN][i].r = C[spin][j1][Anum+i];
	  }
        }
      }
    }
      
    /* LUMOs */
   
    for (spin=0; spin<=SpinP_switch; spin++){
      for (j=0; j<num_LUMOs; j++){
	j1 = Cluster_HOMO[spin] + 1 + j;
	for (GA_AN=1; GA_AN<=atomnum; GA_AN++){
	  wanA = WhatSpecies[GA_AN];
	  tnoA = Spe_Total_CNO[wanA];
	  Anum = MP[GA_AN];
	  for (i=0; i<tnoA; i++){
	    LUMOs_Coef[0][spin][j][GA_AN][i].r = C[spin][j1][Anum+i];
	  }
        }
      }
    }
  }

  /****************************************************
       density matrix and energy density matrix
              for up and down spins
  ****************************************************/

  if (measure_time) dtime(&stime);

  for (spin=0; spin<=SpinP_switch; spin++){
    for (MA_AN=1; MA_AN<=Matomnum; MA_AN++){
      GA_AN = M2G[MA_AN];
      wanA = WhatSpecies[GA_AN];
      tnoA = Spe_Total_CNO[wanA];
      Anum = MP[GA_AN];
      for (LB_AN=0; LB_AN<=FNAN[GA_AN]; LB_AN++){
	GB_AN = natn[GA_AN][LB_AN];
	wanB = WhatSpecies[GB_AN];
	tnoB = Spe_Total_CNO[wanB];
	Bnum = MP[GB_AN];
	for (i=0; i<tnoA; i++){
	  for (j=0; j<tnoB; j++){
	    CDM[spin][MA_AN][LB_AN][i][j] = 0.0;
	    EDM[spin][MA_AN][LB_AN][i][j] = 0.0;
	  }
	}
      }
    }
  }

  for (spin=0; spin<=SpinP_switch; spin++){
    for (k=1; k<=MaxN; k++){

      x = (ko[spin][k] - ChemP)*Beta;
      if (x<=-max_x) x = -max_x;
      if (max_x<=x)  x = max_x;
      FermiF = 1.0/(1.0 + exp(x));

      if (FermiF>FermiEps) {

        for (MA_AN=1; MA_AN<=Matomnum; MA_AN++){
          GA_AN = M2G[MA_AN];
	  wanA = WhatSpecies[GA_AN];
	  tnoA = Spe_Total_CNO[wanA];
	  Anum = MP[GA_AN];
	  for (LB_AN=0; LB_AN<=FNAN[GA_AN]; LB_AN++){
	    GB_AN = natn[GA_AN][LB_AN];
	    wanB = WhatSpecies[GB_AN];
	    tnoB = Spe_Total_CNO[wanB];
	    Bnum = MP[GB_AN];
	    for (i=0; i<tnoA; i++){
	      for (j=0; j<tnoB; j++){
		dum = FermiF*C[spin][k][Anum+i]*C[spin][k][Bnum+j];
		CDM[spin][MA_AN][LB_AN][i][j] += dum;
		EDM[spin][MA_AN][LB_AN][i][j] += dum*ko[spin][k];
	      }
	    }
	  }
	}

      }
    }
  }

  /****************************************************
                      Bond Energies
  ****************************************************/
  
  My_Eele1[0] = 0.0;
  My_Eele1[1] = 0.0;

  for (spin=0; spin<=SpinP_switch; spin++){
    for (MA_AN=1; MA_AN<=Matomnum; MA_AN++){
      GA_AN = M2G[MA_AN];
      wanA = WhatSpecies[GA_AN];
      tnoA = Spe_Total_CNO[wanA];
      for (j=0; j<=FNAN[GA_AN]; j++){
	wanB = WhatSpecies[natn[GA_AN][j]];
	tnoB = Spe_Total_CNO[wanB];
	for (k=0; k<tnoA; k++){
	  for (l=0; l<tnoB; l++){
            My_Eele1[spin] += CDM[spin][MA_AN][j][k][l]*nh[spin][MA_AN][j][k][l];
          }
	}
      }
    }
  }
  
  /* MPI, My_Eele1 */
  for (spin=0; spin<=SpinP_switch; spin++){
    MPI_Allreduce(&My_Eele1[spin], &Eele1[spin], 1, MPI_DOUBLE, MPI_SUM, mpi_comm_level1);
  }

  if (SpinP_switch==0) Eele1[1] = Eele1[0];

  /*
  printf("Eele00=%15.12f Eele01=%15.12f\n",Eele0[0],Eele0[1]);
  printf("Eele10=%15.12f Eele11=%15.12f\n",Eele1[0],Eele1[1]);

  MPI_Finalize();
  exit(0);
  */

  if (measure_time){
    dtime(&etime);
    time6 += etime - stime;
  }

  /****************************************************
                        Output
  ****************************************************/

  if (measure_time) dtime(&stime);

  if (myid0==Host_ID){

    fnjoint(filepath,filename,file_EV);

    if ((fp_EV = fopen(file_EV,"w")) != NULL){

#ifdef xt3
      setvbuf(fp_EV,buf,_IOFBF,fp_bsize);  /* setvbuf */
#endif

      fprintf(fp_EV,"\n");
      fprintf(fp_EV,"***********************************************************\n");
      fprintf(fp_EV,"***********************************************************\n");
      fprintf(fp_EV,"           Eigenvalues (Hartree) for SCF KS-eq.           \n");
      fprintf(fp_EV,"***********************************************************\n");
      fprintf(fp_EV,"***********************************************************\n\n");

      fprintf(fp_EV,"   Chemical Potential (Hartree) = %18.14f\n",ChemP);
      fprintf(fp_EV,"   Number of States             = %18.14f\n",Num_State);
      if (SpinP_switch==0){
        fprintf(fp_EV,"   HOMO = %2d\n",Cluster_HOMO[0]);
      }
      else if (SpinP_switch==1){
        fprintf(fp_EV,"   HOMO for up-spin   = %2d\n",Cluster_HOMO[0]);
        fprintf(fp_EV,"   HOMO for down-spin = %2d\n",Cluster_HOMO[1]);
      }

      fprintf(fp_EV,"   Eigenvalues\n");
          fprintf(fp_EV,"                Up-spin            Down-spin\n");
      for (i1=1; i1<=MaxN; i1++){
        if (SpinP_switch==0)
          fprintf(fp_EV,"      %5d %18.14f %18.14f\n",i1,ko[0][i1],ko[0][i1]);
        else if (SpinP_switch==1)
          fprintf(fp_EV,"      %5d %18.14f %18.14f\n",i1,ko[0][i1],ko[1][i1]);
      }

      if (2<=level_fileout){

        fprintf(fp_EV,"\n");
        fprintf(fp_EV,"***********************************************************\n");
        fprintf(fp_EV,"***********************************************************\n");
        fprintf(fp_EV,"  Eigenvalues (Hartree) and Eigenvectors for SCF KS-eq.  \n");
        fprintf(fp_EV,"***********************************************************\n");
        fprintf(fp_EV,"***********************************************************\n");

        fprintf(fp_EV,"\n\n");
        fprintf(fp_EV,"   Chemical Potential (Hartree) = %18.14f\n",ChemP);
        if (SpinP_switch==0){
          fprintf(fp_EV,"   HOMO = %2d\n",Cluster_HOMO[0]);
        }
        else if (SpinP_switch==1){
          fprintf(fp_EV,"   HOMO for up-spin   = %2d\n",Cluster_HOMO[0]);
          fprintf(fp_EV,"   HOMO for down-spin = %2d\n",Cluster_HOMO[1]);
        }

        fprintf(fp_EV,"\n");
        fprintf(fp_EV,"   LCAO coefficients for up (U) and down (D) spins\n\n");

        num0 = 6;
        num1 = MaxN/num0 + 1*(MaxN%num0!=0);

        for (spin=0; spin<=SpinP_switch; spin++){
	  for (i=1; i<=num1; i++){

	    /* header */ 
  	    fprintf(fp_EV,"\n");
	    for (i1=-1; i1<=0; i1++){

              if      (i1==-1) fprintf(fp_EV,"                     ");
              else if (i1==0)  fprintf(fp_EV,"                      ");

	      for (j=1; j<=num0; j++){
	        j1 = num0*(i-1) + j;
	        if (j1<=MaxN){ 
	 	  if (i1==-1){
		    if (spin==0)      fprintf(fp_EV," %5d (U)",j1);
		    else if (spin==1) fprintf(fp_EV," %5d (D)",j1);
		  }
 		  else if (i1==0){
		    fprintf(fp_EV,"  %8.5f",ko[spin][j1]);
		  }
	        }
	      }
	      fprintf(fp_EV,"\n");
	      if (i1==0)  fprintf(fp_EV,"\n");
	    }

	    /* LCAO coefficients */ 

	    Name_Angular[0][0] = "s          ";
	    Name_Angular[1][0] = "px         ";
	    Name_Angular[1][1] = "py         ";
	    Name_Angular[1][2] = "pz         ";
	    Name_Angular[2][0] = "d3z^2-r^2  ";
	    Name_Angular[2][1] = "dx^2-y^2   ";
	    Name_Angular[2][2] = "dxy        ";
	    Name_Angular[2][3] = "dxz        ";
	    Name_Angular[2][4] = "dyz        ";
	    Name_Angular[3][0] = "f5z^2-3r^2 ";
	    Name_Angular[3][1] = "f5xz^2-xr^2";
	    Name_Angular[3][2] = "f5yz^2-yr^2";
	    Name_Angular[3][3] = "fzx^2-zy^2 ";
	    Name_Angular[3][4] = "fxyz       ";
	    Name_Angular[3][5] = "fx^3-3*xy^2";
	    Name_Angular[3][6] = "f3yx^2-y^3 ";
	    Name_Angular[4][0] = "g1         ";
	    Name_Angular[4][1] = "g2         ";
	    Name_Angular[4][2] = "g3         ";
	    Name_Angular[4][3] = "g4         ";
	    Name_Angular[4][4] = "g5         ";
	    Name_Angular[4][5] = "g6         ";
	    Name_Angular[4][6] = "g7         ";
	    Name_Angular[4][7] = "g8         ";
	    Name_Angular[4][8] = "g9         ";

	    Name_Multiple[0] = "0";
	    Name_Multiple[1] = "1";
	    Name_Multiple[2] = "2";
	    Name_Multiple[3] = "3";
	    Name_Multiple[4] = "4";
	    Name_Multiple[5] = "5";

	    i1 = 1; 

            for (Gc_AN=1; Gc_AN<=atomnum; Gc_AN++){

              wan1 = WhatSpecies[Gc_AN];
            
 	      for (l=0; l<=Supported_MaxL; l++){
 	        for (mul=0; mul<Spe_Num_CBasis[wan1][l]; mul++){
 	          for (m=0; m<(2*l+1); m++){

                    if (l==0 && mul==0 && m==0)
                      fprintf(fp_EV,"%4d %3s %s %s", 
                              Gc_AN,SpeName[wan1],Name_Multiple[mul],Name_Angular[l][m]);
                    else
                      fprintf(fp_EV,"         %s %s", 
                              Name_Multiple[mul],Name_Angular[l][m]);

		    for (j=1; j<=num0; j++){
		      j1 = num0*(i-1) + j;
		      if (0<i1 && j1<=MaxN){
			fprintf(fp_EV,"  %8.5f",C[spin][j1][i1]);
		      }
		    }
		  
		    fprintf(fp_EV,"\n");

                    i1++;
		  }
		}
	      }
	    }

	  }
        }

      }

      fclose(fp_EV);
    }
    else
      printf("Failure of saving the EV file.\n");
  }

  if (measure_time){
    dtime(&etime);
    time7 += etime - stime;
  }

  if (measure_time){
    printf("Cluster_DFT myid=%2d time1=%7.3f time2=%7.3f time3=%7.3f time4=%7.3f time5=%7.3f time6=%7.3f time7=%7.3f\n",
            myid0,time1,time2,time3,time4,time5,time6,time7);fflush(stdout); 
  }

  /****************************************************
                          Free
  ****************************************************/

  free(stat_send);
  free(request_send);
  free(request_recv);

  free(is1);
  free(ie1);

  free(is2);
  free(ie2);

  free(MP);

  for (i=0; i<List_YOUSO[23]; i++){
    for (j=0; j<n2; j++){
      free(H[i][j]);
    }
    free(H[i]);
  }
  free(H);  

  /* freeing of arrays for the first world */

  if (Num_Comm_World1<=numprocs0){
    MPI_Comm_free(&MPI_CommWD1[myworld1]);
  }

  free(NPROCS_ID1);
  free(Comm_World1);
  free(NPROCS_WD1);
  free(Comm_World_StartID1);
  free(MPI_CommWD1);

  /* for elapsed time */

  MPI_Barrier(mpi_comm_level1);
  dtime(&TEtime);
  time0 = TEtime - TStime;
  return time0;
}








double Cluster_non_collinear(
                   int SCF_iter,
                   int SpinP_switch,
                   double *****nh,
                   double *****ImNL,
                   double ****CntOLP,
                   double *****CDM,
                   double *****EDM,
                   double Eele0[2], double Eele1[2])
{
  static int firsttime=1;
  int i,j,l,n,n2,n1,i1,j1,k1,l1;
  int ii1,jj1,jj2,ki,kj;
  int wan,HOMO0,HOMO1;
  int *MP;
  int spin,po,num0,num1;
  int mul,m,wan1,Gc_AN,bcast_flag;
  double time0,lumos,av_num;
  int ct_AN,k,wanA,tnoA,wanB,tnoB;
  int GA_AN,Anum,loopN;
  int MA_AN,LB_AN,GB_AN,Bnum,MaxN;
  int *is1,*ie1,*is12,*ie12,*is2,*ie2;
  double *ko;
  dcomplex **H,**C;
  dcomplex Ctmp1,Ctmp2;
  double EV_cut0;
  double sum_i,sum_r,tmp1,tmp2;
  double sum_r0,sum_i0,sum_r1,sum_i1;
  double sum_r00,sum_i00;
  double sum_r01,sum_i01;
  double sum_r10,sum_i10;
  double sum_r11,sum_i11;
  double TZ,my_sum,sum,sumE,max_x=50.0;
  double My_Eele1[2];
  double Num_State,x,FermiF,Dnum,Dnum2;
  double dum,ChemP_MAX,ChemP_MIN;
  double TStime,TEtime;
  double FermiEps = 1.0e-13;
  int numprocs,myid,ID;
  double OLP_eigen_cut=Threshold_OLP_Eigen;
  char *Name_Angular[Supported_MaxL+1][2*(Supported_MaxL+1)+1];
  char *Name_Multiple[20];
  char file_EV[YOUSO10] = ".EV";
  FILE *fp_EV;
  char buf[fp_bsize];          /* setvbuf */
  double time1,time2,time3,time4,time5,time6,time7;
  double stime,etime;

  MPI_Status *stat_send;
  MPI_Request *request_send;
  MPI_Request *request_recv;

  /* MPI */
  MPI_Comm_size(mpi_comm_level1,&numprocs);
  MPI_Comm_rank(mpi_comm_level1,&myid);

  MPI_Barrier(mpi_comm_level1);
  dtime(&TStime);

  /* allocation of arrays */

  stat_send = malloc(sizeof(MPI_Status)*numprocs);
  request_send = malloc(sizeof(MPI_Request)*numprocs);
  request_recv = malloc(sizeof(MPI_Request)*numprocs);

  /****************************************************
             calculation of the array size
  ****************************************************/

  n = 0;
  for (i=1; i<=atomnum; i++){
    wanA  = WhatSpecies[i];
    n  = n + Spe_Total_CNO[wanA];
  }
  n2 = 2*n + 2;

  /****************************************************
   Allocation

   int     MP[List_YOUSO[1]]
   double  ko[n2]
   dcomplex H[n2][n2]  
   dcomplex C[n2][n2]
  ****************************************************/

  MP = (int*)malloc(sizeof(int)*List_YOUSO[1]);
  ko = (double*)malloc(sizeof(double)*n2);

  H = (dcomplex**)malloc(sizeof(dcomplex*)*n2);
  for (i=0; i<n2; i++){
    H[i] = (dcomplex*)malloc(sizeof(dcomplex)*n2);
  }

  C = (dcomplex**)malloc(sizeof(dcomplex*)*n2);
  for (i=0; i<n2; i++){
    C[i] = (dcomplex*)malloc(sizeof(dcomplex)*n2);
  }

  is1 = (int*)malloc(sizeof(int)*numprocs);
  ie1 = (int*)malloc(sizeof(int)*numprocs);

  is12 = (int*)malloc(sizeof(int)*numprocs);
  ie12 = (int*)malloc(sizeof(int)*numprocs);

  is2 = (int*)malloc(sizeof(int)*numprocs);
  ie2 = (int*)malloc(sizeof(int)*numprocs);

  if (firsttime){
    PrintMemory("Cluster_DFT: H",sizeof(dcomplex)*n2*n2,NULL);
    PrintMemory("Cluster_DFT: C",sizeof(dcomplex)*n2*n2,NULL);
  }

  /* for PrintMemory */
  firsttime=0;

  if (measure_time){
    time1 = 0.0;
    time2 = 0.0;
    time3 = 0.0;
    time4 = 0.0;
    time5 = 0.0;
    time6 = 0.0;
    time7 = 0.0;
  }

  /****************************************************
                  total core charge
  ****************************************************/

  TZ = 0.0;
  for (i=1; i<=atomnum; i++){
    wan = WhatSpecies[i];
    TZ = TZ + Spe_Core_Charge[wan];
  }

  /****************************************************
       1. diagonalize the overlap matrix     
       2. search negative eigenvalues
  ****************************************************/

  n = Size_Total_Matrix;

  /****************************************************
         find the numbers of partions for MPI
  ****************************************************/

  if ( numprocs<=n ){

    av_num = (double)n/(double)numprocs;

    for (ID=0; ID<numprocs; ID++){
      is1[ID] = (int)(av_num*(double)ID) + 1; 
      ie1[ID] = (int)(av_num*(double)(ID+1)); 
    }

    is1[0] = 1;
    ie1[numprocs-1] = n; 

  }

  else{

    for (ID=0; ID<n; ID++){
      is1[ID] = ID + 1; 
      ie1[ID] = ID + 1;
    }
    for (ID=n; ID<numprocs; ID++){
      is1[ID] =  1;
      ie1[ID] = -2;
    }
  }

  for (ID=0; ID<numprocs; ID++){
    is12[ID] = 2*is1[ID] - 1;
    ie12[ID] = 2*ie1[ID];
  }

  /****************************************************
       1. diagonalize the overlap matrix     
       2. search negative eigenvalues
  ****************************************************/

  if (SCF_iter<=1){

    if (measure_time) dtime(&stime);

    Overlap_Cluster(CntOLP,S,MP);

    /* all the processors have the full output S matrix */

    bcast_flag = 1;
    Eigen_PReHH(mpi_comm_level1,S,ko,n,n,bcast_flag);

    /* minus eigenvalues to 1.0e-14 */
    for (l=1; l<=n; l++){
      if (ko[l]<0.0) ko[l] = 1.0e-14;
      EV_S[l] = ko[l];
    }

    /* print to the standard output */

    if (2<=level_stdout){
      for (l=1; l<=n; l++){
	printf("  Eigenvalues of OLP  %2d  %18.15f\n",l,ko[l]);
      }
    }

    /* calculate S*1/sqrt(ko) */

    for (l=1; l<=n; l++){
      IEV_S[l] = 1.0/sqrt(ko[l]);
    }

    SP_PEV = 1;

    for (i1=1; i1<=n; i1++){
      for (j1=1; j1<=n; j1++){
	S[i1][j1] = S[i1][j1]*IEV_S[j1];
      }
    }

    if (measure_time){
      dtime(&etime);
      time1 += etime - stime; 
    }
  }

  /****************************************************
    Calculations of eigenvalues

     Note:
         MP indicates the starting position of
              atom i in arraies H and S
  ****************************************************/
   
  if (measure_time) dtime(&stime);
 
  Hamiltonian_Cluster_NC(nh, ImNL, H, MP);

  /* initialize ko */
  for (i1=1; i1<=2*n; i1++){
    ko[i1] = 10000.0;
  }

  /* transpose S */
  for (i1=1; i1<=n; i1++){
    for (j1=i1+1; j1<=n; j1++){
      tmp1 = S[i1][j1];
      tmp2 = S[j1][i1];
      S[i1][j1] = tmp2;
      S[j1][i1] = tmp1;
    }
  }

  /* C is distributed by row in each processor */
  /*            H * U * lambda^{-1/2}          */

  for (j1=is1[myid]; j1<=ie1[myid]; j1++){
    for (i1=1; i1<=2*n; i1++){

      sum_r0 = 0.0;
      sum_i0 = 0.0;

      sum_r1 = 0.0;
      sum_i1 = 0.0;

      for (l=1; l<=n; l++){
	sum_r0 += H[i1][l  ].r*S[j1][l];
	sum_i0 += H[i1][l  ].i*S[j1][l];
	sum_r1 += H[i1][n+l].r*S[j1][l];
	sum_i1 += H[i1][n+l].i*S[j1][l];
      }

      C[2*j1-1][i1].r = sum_r0;
      C[2*j1-1][i1].i = sum_i0;

      C[2*j1  ][i1].r = sum_r1;
      C[2*j1  ][i1].i = sum_i1;
    }
  }

  /* H is distributed by row in each processor */
  /* lambda^{-1/2} * U^+ H * U * lambda^{-1/2} */

  for (j1=is1[myid]; j1<=ie1[myid]; j1++){
    for (i1=1; i1<=n; i1++){

      sum_r00 = 0.0;
      sum_i00 = 0.0;

      sum_r01 = 0.0;
      sum_i01 = 0.0;

      sum_r10 = 0.0;
      sum_i10 = 0.0;

      sum_r11 = 0.0;
      sum_i11 = 0.0;

      jj1 = 2*j1 - 1;
      jj2 = 2*j1;

      for (l=1; l<=n; l++){

	sum_r00 += S[i1][l]*C[jj1][l  ].r;
	sum_i00 += S[i1][l]*C[jj1][l  ].i;

	sum_r01 += S[i1][l]*C[jj1][l+n].r;
	sum_i01 += S[i1][l]*C[jj1][l+n].i;

	sum_r10 += S[i1][l]*C[jj2][l  ].r;
	sum_i10 += S[i1][l]*C[jj2][l  ].i;

	sum_r11 += S[i1][l]*C[jj2][l+n].r;
	sum_i11 += S[i1][l]*C[jj2][l+n].i;
      }

      H[jj1][2*i1-1].r = sum_r00;
      H[jj1][2*i1-1].i = sum_i00;

      H[jj1][2*i1  ].r = sum_r01;
      H[jj1][2*i1  ].i = sum_i01;

      H[jj2][2*i1-1].r = sum_r10;
      H[jj2][2*i1-1].i = sum_i10;

      H[jj2][2*i1  ].r = sum_r11;
      H[jj2][2*i1  ].i = sum_i11;
    }
  }

  /* broadcast H */

  BroadCast_ComplexMatrix(mpi_comm_level1,H,2*n,is12,ie12,myid,numprocs,
                          stat_send,request_send,request_recv);

  /* H to C (transposition) */

  for (i1=1; i1<=2*n; i1++){
    for (j1=1; j1<=2*n; j1++){
      C[j1][i1].r = H[i1][j1].r;
      C[j1][i1].i = H[i1][j1].i;
    }
  }

  /* penalty for ill-conditioning states */

  EV_cut0 = Threshold_OLP_Eigen;

  for (i1=1; i1<=n; i1++){

    if (EV_S[i1]<EV_cut0){
      C[2*i1-1][2*i1-1].r += pow((EV_S[i1]/EV_cut0),-2.0) - 1.0;
      C[2*i1  ][2*i1  ].r += pow((EV_S[i1]/EV_cut0),-2.0) - 1.0;
    }

    /* cutoff the interaction between the ill-conditioned state */

    if (1.0e+3<C[2*i1-1][2*i1-1].r){
      for (j1=1; j1<=2*n; j1++){
	C[2*i1-1][j1    ] = Complex(0.0,0.0);
	C[j1    ][2*i1-1] = Complex(0.0,0.0);
	C[2*i1  ][j1    ] = Complex(0.0,0.0);
	C[j1    ][2*i1  ] = Complex(0.0,0.0);
      }
      C[2*i1-1][2*i1-1] = Complex(1.0e+4,0.0);
      C[2*i1  ][2*i1  ] = Complex(1.0e+4,0.0);
    }
  }

  /* find the maximum states in solved eigenvalues */
  
  n1 = 2*n;

  if (SCF_iter==1){
    MaxN = n1; 
  }   
  else {
    lumos = (double)n1*0.10;      
    if (lumos<60.0) lumos = 60.0;
    MaxN = Cluster_HOMO[0] + (int)lumos;
    if (n1<MaxN) MaxN = n1;
  }

  if ( numprocs<=MaxN ){

    av_num = (double)MaxN/(double)numprocs;

    for (ID=0; ID<numprocs; ID++){
      is2[ID] = (int)(av_num*(double)ID) + 1; 
      ie2[ID] = (int)(av_num*(double)(ID+1)); 
    }

    is2[0] = 1;
    ie2[numprocs-1] = MaxN; 
  }

  else{
    for (ID=0; ID<MaxN; ID++){
      is2[ID] = ID + 1; 
      ie2[ID] = ID + 1;
    }
    for (ID=MaxN; ID<numprocs; ID++){
      is2[ID] =  1;
      ie2[ID] = -2;
    }
  }

  if (measure_time){
    dtime(&etime);
    time2 += etime - stime;
  }

  /*  The output C matrix is distributed by column. */
  /*  solve eigenvalue problem                      */

  if (measure_time) dtime(&stime);

  bcast_flag = 0;
  Eigen_PHH(mpi_comm_level1, C, ko, n1, MaxN, bcast_flag);

  if (2<=level_stdout){
    for (i1=1; i1<=MaxN; i1++){
      printf("  Eigenvalues of Kohn-Sham %2d  %15.12f\n", i1,ko[i1]);
    }
  }

  /*  The H matrix is distributed by row */

  for (i1=1; i1<=2*n; i1++){
    for (j1=is2[myid]; j1<=ie2[myid]; j1++){
      H[j1][i1] = C[i1][j1];
    }
  }

  if (measure_time){
    dtime(&etime);
    time3 += etime - stime;
  }

  /****************************************************
      Transformation to the original eigenvectors.
      JRCAT NOTE 244P  C = U * lambda^{-1/2} * D
  ****************************************************/

  if (measure_time) dtime(&stime);

  /* transpose S */

  for (i1=1; i1<=n; i1++){
    for (j1=i1+1; j1<=n; j1++){
      tmp1 = S[i1][j1];
      tmp2 = S[j1][i1];
      S[i1][j1] = tmp2;
      S[j1][i1] = tmp1;
    }
  }

  for (i1=1; i1<=2*n; i1++){
    for (j1=1; j1<=2*n; j1++){
      C[i1][j1].r = 0.0;
      C[i1][j1].i = 0.0;
    }
  }

  /* C is distributed by row in each processor */

  for (j1=is2[myid]; j1<=ie2[myid]; j1++){
    for (i1=1; i1<=n; i1++){

      sum_r0 = 0.0;
      sum_i0 = 0.0;

      sum_r1 = 0.0;
      sum_i1 = 0.0;

      l1 = 1;
      for (l=1; l<=n; l++){
	sum_r0 += S[i1][l]*H[j1][l1].r;
	sum_i0 += S[i1][l]*H[j1][l1].i;  l1++;

	sum_r1 += S[i1][l]*H[j1][l1].r;
	sum_i1 += S[i1][l]*H[j1][l1].i;  l1++;
      }

      C[j1][i1  ].r = sum_r0;
      C[j1][i1  ].i = sum_i0;

      C[j1][i1+n].r = sum_r1;
      C[j1][i1+n].i = sum_i1;

    }
  }

  /****************************************************
     broadcast C
     C is distributed by row in each processor
  ****************************************************/

  BroadCast_ComplexMatrix(mpi_comm_level1,C,2*n,is2,ie2,myid,numprocs, 
                          stat_send,request_send,request_recv);

  if (measure_time){
    dtime(&etime);
    time4 += etime - stime;
  }

  /****************************************************
                  find chemical potential
  ****************************************************/

  if (measure_time) dtime(&stime);

  po = 0;
  loopN = 0;

  ChemP_MAX = 25.0;  
  ChemP_MIN =-25.0;

  do {
    ChemP = 0.50*(ChemP_MAX + ChemP_MIN);
    Num_State = 0.0;

    for (i1=1; i1<=MaxN; i1++){
      x = (ko[i1] - ChemP)*Beta;
      if (x<=-max_x) x = -max_x;
      if (max_x<=x)  x = max_x;
      FermiF = 1.0/(1.0 + exp(x));
      Num_State = Num_State + FermiF;
      if (0.5<FermiF) Cluster_HOMO[0] = i1;
    }

    Dnum = (TZ - Num_State) - system_charge;
    if (0.0<=Dnum) ChemP_MIN = ChemP;
    else           ChemP_MAX = ChemP;
    if (fabs(Dnum)<1.0e-13) po = 1;
    loopN++;
  } 
  while (po==0 && loopN<1000); 

  if (2<=level_stdout){
    printf("  ChemP=%15.12f\n",ChemP);
  }

  if (measure_time){
    dtime(&etime);
    time5 += etime - stime;
  }

  /****************************************************
            Energies by summing up eigenvalues
  ****************************************************/

  Eele0[0] = 0.0;
  Eele0[1] = 0.0;

  for (i1=1; i1<=MaxN; i1++){

    x = (ko[i1] - ChemP)*Beta;
    if (x<=-max_x) x = -max_x;
    if (max_x<=x)  x = max_x;
    FermiF = 1.0/(1.0 + exp(x));
    Eele0[0] = Eele0[0] + ko[i1]*FermiF;
  }

  /****************************************************
      LCAO coefficients are stored for calculating
               values of MOs on grids
  ****************************************************/

  if ( (Cluster_HOMO[0]-num_HOMOs+1)<1 )  num_HOMOs = Cluster_HOMO[0];
  if ( (Cluster_HOMO[0]+num_LUMOs)>MaxN ) num_LUMOs = MaxN - Cluster_HOMO[0];

  if (myid==Host_ID && 2<=level_stdout){
    printf("  HOMO = %2d\n",Cluster_HOMO[0]);
  }

  if (MO_fileout==1){  

    /* HOMOs */

    for (j=0; j<num_HOMOs; j++){
      j1 = Cluster_HOMO[0] - j;
      for (GA_AN=1; GA_AN<=atomnum; GA_AN++){
        wanA = WhatSpecies[GA_AN];
        tnoA = Spe_Total_CNO[wanA];
        Anum = MP[GA_AN];
	for (i=0; i<tnoA; i++){
	  HOMOs_Coef[0][0][j][GA_AN][i].r = C[j1][Anum+i].r;
	  HOMOs_Coef[0][0][j][GA_AN][i].i = C[j1][Anum+i].i;
	  HOMOs_Coef[0][1][j][GA_AN][i].r = C[j1][Anum+i+n].r;
	  HOMOs_Coef[0][1][j][GA_AN][i].i = C[j1][Anum+i+n].i;
	}
      }
    }
      
    /* LUMOs */

    for (j=0; j<num_LUMOs; j++){
      j1 = Cluster_HOMO[0] + 1 + j;
      for (GA_AN=1; GA_AN<=atomnum; GA_AN++){
	wanA = WhatSpecies[GA_AN];
	tnoA = Spe_Total_CNO[wanA];
	Anum = MP[GA_AN];
	for (i=0; i<tnoA; i++){
	  LUMOs_Coef[0][0][j][GA_AN][i].r = C[j1][Anum+i].r;
	  LUMOs_Coef[0][0][j][GA_AN][i].i = C[j1][Anum+i].i;
	  LUMOs_Coef[0][1][j][GA_AN][i].r = C[j1][Anum+i+n].r;
	  LUMOs_Coef[0][1][j][GA_AN][i].i = C[j1][Anum+i+n].i;
	}
      }
    }
  }

  /****************************************************
        density matrix and energy density matrix

        CDM[0]  Re alpha alpha density matrix
        CDM[1]  Re beta  beta  density matrix
        CDM[2]  Re alpha beta  density matrix
        CDM[3]  Im alpha beta  density matrix
        iDM[0][0]  Im alpha alpha density matrix
        iDM[0][1]  Im beta  beta  density matrix

        EDM[0]  Re alpha alpha energy density matrix
        EDM[1]  Re beta  beta  energy density matrix
        EDM[2]  Re alpha beta  energy density matrix
        EDM[3]  Im alpha beta  energy density matrix
  ****************************************************/

  if (measure_time) dtime(&stime);

  for (spin=0; spin<=SpinP_switch; spin++){
    for (MA_AN=1; MA_AN<=Matomnum; MA_AN++){
      GA_AN = M2G[MA_AN];
      wanA = WhatSpecies[GA_AN];
      tnoA = Spe_Total_CNO[wanA];
      Anum = MP[GA_AN];
      for (LB_AN=0; LB_AN<=FNAN[GA_AN]; LB_AN++){
	GB_AN = natn[GA_AN][LB_AN];
	wanB = WhatSpecies[GB_AN];
	tnoB = Spe_Total_CNO[wanB];
	Bnum = MP[GB_AN];
	for (i=0; i<tnoA; i++){
	  for (j=0; j<tnoB; j++){
	    CDM[spin][MA_AN][LB_AN][i][j] = 0.0;
	    EDM[spin][MA_AN][LB_AN][i][j] = 0.0;
	  }
	}

	for (i=0; i<tnoA; i++){
	  for (j=0; j<tnoB; j++){
	    iDM[0][0][MA_AN][LB_AN][i][j] = 0.0;
	    iDM[0][1][MA_AN][LB_AN][i][j] = 0.0;
	  }
	}

      }
    }
  }

  for (k=1; k<=MaxN; k++){

    x = (ko[k] - ChemP)*Beta;
    if (x<=-max_x) x = -max_x;
    if (max_x<=x)  x = max_x;
    FermiF = 1.0/(1.0 + exp(x));

    if (FermiF>FermiEps) {

      for (MA_AN=1; MA_AN<=Matomnum; MA_AN++){
        GA_AN = M2G[MA_AN];
        wanA = WhatSpecies[GA_AN];
	tnoA = Spe_Total_CNO[wanA];
	Anum = MP[GA_AN];
	for (LB_AN=0; LB_AN<=FNAN[GA_AN]; LB_AN++){
	  GB_AN = natn[GA_AN][LB_AN];
	  wanB = WhatSpecies[GB_AN];
	  tnoB = Spe_Total_CNO[wanB];
	  Bnum = MP[GB_AN];
	  for (i=0; i<tnoA; i++){
	    for (j=0; j<tnoB; j++){
              
              /* Re11 */
	      dum = FermiF*(C[k][Anum+i].r*C[k][Bnum+j].r
                          + C[k][Anum+i].i*C[k][Bnum+j].i);
	      CDM[0][MA_AN][LB_AN][i][j] += dum;
	      EDM[0][MA_AN][LB_AN][i][j] += dum*ko[k];

              /* Re22 */
	      dum = FermiF*(C[k][Anum+i+n].r*C[k][Bnum+j+n].r
                          + C[k][Anum+i+n].i*C[k][Bnum+j+n].i);
	      CDM[1][MA_AN][LB_AN][i][j] += dum;
	      EDM[1][MA_AN][LB_AN][i][j] += dum*ko[k];
               
              /* Re12 */
	      dum = FermiF*(C[k][Anum+i].r*C[k][Bnum+j+n].r
                          + C[k][Anum+i].i*C[k][Bnum+j+n].i);
	      CDM[2][MA_AN][LB_AN][i][j] += dum;
	      EDM[2][MA_AN][LB_AN][i][j] += dum*ko[k];
              
              /* Im12 */
	      dum = FermiF*(C[k][Anum+i].r*C[k][Bnum+j+n].i
                           -C[k][Anum+i].i*C[k][Bnum+j+n].r);
	      CDM[3][MA_AN][LB_AN][i][j] += dum;
	      EDM[3][MA_AN][LB_AN][i][j] += dum*ko[k];

              /* Im11 */
	      dum = FermiF*(C[k][Anum+i].r*C[k][Bnum+j].i
                           -C[k][Anum+i].i*C[k][Bnum+j].r);
  	      iDM[0][0][MA_AN][LB_AN][i][j] += dum;

              /* Im22 */
	      dum = FermiF*(C[k][Anum+i+n].r*C[k][Bnum+j+n].i
                           -C[k][Anum+i+n].i*C[k][Bnum+j+n].r);
              iDM[0][1][MA_AN][LB_AN][i][j] += dum;
	    }
	  }
        }
      }

    }
  }

  /****************************************************
                      Bond Energies
  ****************************************************/

  My_Eele1[0] = 0.0;
  My_Eele1[1] = 0.0;
  
  for (MA_AN=1; MA_AN<=Matomnum; MA_AN++){
    GA_AN = M2G[MA_AN];
    wanA = WhatSpecies[GA_AN];
    tnoA = Spe_Total_CNO[wanA];
    for (j=0; j<=FNAN[GA_AN]; j++){
      wanB = WhatSpecies[natn[GA_AN][j]];
      tnoB = Spe_Total_CNO[wanB];

      /* non-spin-orbit coupling and non-LDA+U */  
      if (SO_switch==0 && Hub_U_switch==0 && Constraint_NCS_switch==0 
          && Zeeman_NCS_switch==0 && Zeeman_NCO_switch==0){
	for (k=0; k<tnoA; k++){
	  for (l=0; l<tnoB; l++){
	    My_Eele1[0] += 
	      + CDM[0][MA_AN][j][k][l]*nh[0][MA_AN][j][k][l]
	      + CDM[1][MA_AN][j][k][l]*nh[1][MA_AN][j][k][l]
	      + 2.0*CDM[2][MA_AN][j][k][l]*nh[2][MA_AN][j][k][l]
	      - 2.0*CDM[3][MA_AN][j][k][l]*nh[3][MA_AN][j][k][l];
	  }
	}
      }

      /* spin-orbit coupling or LDA+U */
      else {
	for (k=0; k<tnoA; k++){
	  for (l=0; l<tnoB; l++){
	    My_Eele1[0] += 
   	         + CDM[0][MA_AN][j][k][l]*nh[0][MA_AN][j][k][l]
                 - iDM[0][0][MA_AN][j][k][l]*ImNL[0][MA_AN][j][k][l]
	         + CDM[1][MA_AN][j][k][l]*nh[1][MA_AN][j][k][l]
                 - iDM[0][1][MA_AN][j][k][l]*ImNL[1][MA_AN][j][k][l]
	         + 2.0*CDM[2][MA_AN][j][k][l]*nh[2][MA_AN][j][k][l]
	         - 2.0*CDM[3][MA_AN][j][k][l]*(nh[3][MA_AN][j][k][l]
                                            +ImNL[2][MA_AN][j][k][l]);
	  }
	}
      }

    }
  }

  /* MPI, My_Eele1 */
  MPI_Allreduce(&My_Eele1[0], &Eele1[0], 1, MPI_DOUBLE, MPI_SUM, mpi_comm_level1);
  MPI_Allreduce(&My_Eele1[1], &Eele1[1], 1, MPI_DOUBLE, MPI_SUM, mpi_comm_level1);

  if (2<=level_stdout && myid==Host_ID){
    printf("Eele0=%15.12f Eele1=%15.12f\n",Eele0[0],Eele1[0]);
  }

  if (measure_time){
    dtime(&etime);
    time6 += etime - stime;
  }

  /****************************************************
                        Output
  ****************************************************/

  if (measure_time) dtime(&stime);

  if (myid==Host_ID){

    fnjoint(filepath,filename,file_EV);

    if ((fp_EV = fopen(file_EV,"w")) != NULL){

#ifdef xt3
      setvbuf(fp_EV,buf,_IOFBF,fp_bsize);  /* setvbuf */
#endif

      fprintf(fp_EV,"\n");
      fprintf(fp_EV,"***********************************************************\n");
      fprintf(fp_EV,"***********************************************************\n");
      fprintf(fp_EV,"      Eigenvalues (Hartree) for non-collinear KS-eq.       \n");
      fprintf(fp_EV,"***********************************************************\n");
      fprintf(fp_EV,"***********************************************************\n\n");

      fprintf(fp_EV,"   Chemical Potential (Hartree) = %18.14f\n",ChemP);
      fprintf(fp_EV,"   Number of States             = %18.14f\n",Num_State);
      fprintf(fp_EV,"   HOMO = %2d\n",Cluster_HOMO[0]);

      fprintf(fp_EV,"   Eigenvalues\n");
          fprintf(fp_EV,"\n");
      for (i1=1; i1<=MaxN; i1++){
        fprintf(fp_EV,"      %5d %18.14f\n",i1,ko[i1]);
      }

      if (2<=level_fileout){


        fprintf(fp_EV,"\n");
        fprintf(fp_EV,"***********************************************************\n");
        fprintf(fp_EV,"***********************************************************\n");
        fprintf(fp_EV,"  Eigenvalues (Hartree) and Eigenvectors for SCF KS-eq.  \n");
        fprintf(fp_EV,"***********************************************************\n");
        fprintf(fp_EV,"***********************************************************\n");

        fprintf(fp_EV,"\n\n");
        fprintf(fp_EV,"   Chemical Potential (Hartree) = %18.14f\n",ChemP);
        fprintf(fp_EV,"   HOMO = %2d\n",Cluster_HOMO[0]);

        fprintf(fp_EV,"\n");
	fprintf(fp_EV,"   Real (Re) and imaginary (Im) parts of LCAO coefficients\n\n");

        num0 = 2;
        num1 = MaxN/num0 + 1*(MaxN%num0!=0);

	for (i=1; i<=num1; i++){

	  fprintf(fp_EV,"\n");

	  for (i1=-2; i1<=0; i1++){

	    fprintf(fp_EV,"                     ");

	    for (j=1; j<=num0; j++){
	      j1 = num0*(i-1) + j;

	      if (j1<=MaxN){ 
		if (i1==-2){
		  fprintf(fp_EV," %4d",j1);
		  fprintf(fp_EV,"                                   ");
		}
		else if (i1==-1){
		  fprintf(fp_EV,"   %8.5f",ko[j1]);
		  fprintf(fp_EV,"                             ");
		}

		else if (i1==0){
		  fprintf(fp_EV,"     Re(U)");
		  fprintf(fp_EV,"     Im(U)");
		  fprintf(fp_EV,"     Re(D)");
		  fprintf(fp_EV,"     Im(D)");
		}

	      }
	    }
	    fprintf(fp_EV,"\n");
	    if (i1==-1)  fprintf(fp_EV,"\n");
	    if (i1==0)   fprintf(fp_EV,"\n");
	  }

	  Name_Angular[0][0] = "s          ";
	  Name_Angular[1][0] = "px         ";
	  Name_Angular[1][1] = "py         ";
	  Name_Angular[1][2] = "pz         ";
	  Name_Angular[2][0] = "d3z^2-r^2  ";
	  Name_Angular[2][1] = "dx^2-y^2   ";
	  Name_Angular[2][2] = "dxy        ";
	  Name_Angular[2][3] = "dxz        ";
	  Name_Angular[2][4] = "dyz        ";
	  Name_Angular[3][0] = "f5z^2-3r^2 ";
	  Name_Angular[3][1] = "f5xz^2-xr^2";
	  Name_Angular[3][2] = "f5yz^2-yr^2";
	  Name_Angular[3][3] = "fzx^2-zy^2 ";
	  Name_Angular[3][4] = "fxyz       ";
	  Name_Angular[3][5] = "fx^3-3*xy^2";
	  Name_Angular[3][6] = "f3yx^2-y^3 ";
	  Name_Angular[4][0] = "g1         ";
	  Name_Angular[4][1] = "g2         ";
	  Name_Angular[4][2] = "g3         ";
	  Name_Angular[4][3] = "g4         ";
	  Name_Angular[4][4] = "g5         ";
	  Name_Angular[4][5] = "g6         ";
	  Name_Angular[4][6] = "g7         ";
	  Name_Angular[4][7] = "g8         ";
	  Name_Angular[4][8] = "g9         ";

	  Name_Multiple[0] = "0";
	  Name_Multiple[1] = "1";
	  Name_Multiple[2] = "2";
	  Name_Multiple[3] = "3";
	  Name_Multiple[4] = "4";
	  Name_Multiple[5] = "5";

	  i1 = 1; 

	  for (Gc_AN=1; Gc_AN<=atomnum; Gc_AN++){

	    wan1 = WhatSpecies[Gc_AN];

	    for (l=0; l<=Supported_MaxL; l++){
	      for (mul=0; mul<Spe_Num_CBasis[wan1][l]; mul++){
		for (m=0; m<(2*l+1); m++){

		  if (l==0 && mul==0 && m==0)
		    fprintf(fp_EV,"%4d %3s %s %s", 
			    Gc_AN,SpeName[wan1],Name_Multiple[mul],Name_Angular[l][m]);
		  else
		    fprintf(fp_EV,"         %s %s", 
			    Name_Multiple[mul],Name_Angular[l][m]);

		  for (j=1; j<=num0; j++){

		    j1 = num0*(i-1) + j;

		    if (0<i1 && j1<=MaxN){
		      fprintf(fp_EV,"  %8.5f",C[j1][i1].r);
		      fprintf(fp_EV,"  %8.5f",C[j1][i1].i);
		      fprintf(fp_EV,"  %8.5f",C[j1][i1+n].r);
		      fprintf(fp_EV,"  %8.5f",C[j1][i1+n].i);
		    }
		  }

		  fprintf(fp_EV,"\n");
		  if (i1==-1)  fprintf(fp_EV,"\n");
		  if (i1==0)   fprintf(fp_EV,"\n");

		  i1++;

		}
	      }
	    }
	  }

	}
      }

      fclose(fp_EV);
    }
    else
      printf("Failure of saving the EV file.\n");
  }

  if (measure_time){
    dtime(&etime);
    time7 += etime - stime;
  }

  if (measure_time){
    printf("Cluster_DFT myid=%2d time1=%7.3f time2=%7.3f time3=%7.3f time4=%7.3f time5=%7.3f time6=%7.3f time7=%7.3f\n",
            myid,time1,time2,time3,time4,time5,time6,time7);fflush(stdout); 
  }

  /****************************************************
                          Free
  ****************************************************/

  free(stat_send);
  free(request_send);
  free(request_recv);

  free(MP);
  free(ko);

  for (i=0; i<n2; i++){
    free(H[i]);
  }
  free(H);

  for (i=0; i<n2; i++){
    free(C[i]);
  }
  free(C);

  free(is1);
  free(ie1);

  free(is12);
  free(ie12);

  free(is2);
  free(ie2);

  /* for elapsed time */

  MPI_Barrier(mpi_comm_level1);
  dtime(&TEtime);
  time0 = TEtime - TStime;
  return time0;
}
