Hello! I have some confusing parallel timing results for my code, which performs
heavy numerical work.
Using 1 core, the timing is 882 seconds;
using 8 cores, the timing is 121 seconds (7.3x faster);
using 16 cores, the timing is 101 seconds (8.7x faster);
using 32 logical cores, the timing is 100 seconds, essentially the same as with 16 cores.
My questions focus on two points:
1. Why is the timing with hyper-threading (32 logical cores) the same as with 16 physical cores?
2. The 8-core timing shows nearly linear speedup, so why do 16 cores give only a
marginal improvement over 8 cores? (See the small efficiency calculation below.)
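To make the second question concrete, here is the arithmetic behind the speedup
factors quoted above, restated as parallel efficiency (speedup divided by core
count). This is just an illustrative calculation on my own numbers, not part of
the production code:

#include <cstdio>

int main() {
    // wall-clock timings from the runs above, in seconds
    double t1 = 882.0, t8 = 121.0, t16 = 101.0;
    // speedup = serial time / parallel time; efficiency = speedup / cores
    std::printf(" 8 cores: %.1fx speedup, %.0f%% efficiency\n", t1/t8,  100.0*(t1/t8)/8.0);
    std::printf("16 cores: %.1fx speedup, %.0f%% efficiency\n", t1/t16, 100.0*(t1/t16)/16.0);
    return 0;
}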
The working environment is:
Two 2.6 GHz 64-bit Intel Xeon E5-2600 8-core CPUs (16 physical cores, 32 logical cores in total)
32 GB of memory
Finally, let me give more details about my code. It computes four-body analytical
integrals in quantum chemistry over Gaussian-type functions.
This is the header file defining the class with operator():
UInt is just an alias for size_t.
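(For concreteness, I assume the alias amounts to the line below; the actual
declaration in my headers may differ:)
typedef size_t UInt;   // unsigned integer type used throughout the code below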
/**
* \class TBB_GInts4D
* the result is in form of matrix
*/
class TBB_GInts4DMatrix {
private:
//
// basic data needed for calculation
//
bool sameShellPair; ///< whether the bra and ket come from the same source
UInt nSpin; ///< number of spin states
Double thresh; ///< threshold value for gints4d work
Double spThresh; ///< threshold value for generating shell pairs
//
// reference for the input
//
// all of the data members here are constant references to the input .....
//
// the output data
//
Mtrx alphaMatrix; ///< alpha matrix
Mtrx betaMatrix; ///< beta matrix
public:
/**
* constructor
* \param spStatus whether bra and ket come from the same shell pair data
* \param ginfor information center for the gints job
* \param bra,ket shell pair information classes for the bra and ket sides
* \param b1,b2,k1,k2 references to the shell data
* \param alpha,beta input density matrices
*/
TBB_GInts4DMatrix(bool spStatus, const GIntJobInfor& ginfor0, const SigMolShellPairInfor& bra,
const SigMolShellPairInfor& ket, const MolShell& b1, const MolShell& b2,
const MolShell& k1, const MolShell& k2, const Mtrx& alpha, const Mtrx& beta);
/**
* destructor
*/
~TBB_GInts4DMatrix() { };
/**
* functional operator to generate the integral results
* this is used by parallel_reduce
*/
void operator()(const blocked_range<UInt>& r);
/**
* splitting constructor used by TBB
* we copy all of the reference data except the results,
* which should always be empty initially
*/
TBB_GInts4DMatrix(const TBB_GInts4DMatrix& tbb_gints4d, split);
/**
* assembles the per-thread results together
*/
void join(TBB_GInts4DMatrix& tbb_gints4d) {
// update the alpha/beta matrix
alphaMatrix.add(tbb_gints4d.alphaMatrix);
if (nSpin == 2) {
betaMatrix.add(tbb_gints4d.betaMatrix);
}
};
// .......
};
This is the cpp file for this class:
// constructor
TBB_GInts4DMatrix::TBB_GInts4DMatrix(bool spStatus, const GIntJobInfor& ginfor0,
const SigMolShellPairInfor& bra, const SigMolShellPairInfor& ket,
const MolShell& b1, const MolShell& b2, const MolShell& k1, const MolShell& k2,
const Mtrx& alpha, const Mtrx& beta):sameShellPair(spStatus),nSpin(ginfor0.getNSpin()),
thresh(ginfor0.int4DThreshVal()),spThresh(ginfor0.spThreshVal()),
bra1(b1),bra2(b2),ket1(k1),ket2(k2),braInfo(bra),ketInfo(ket),
ginfor(ginfor0),alphaDen(alpha),betaDen(beta)
{
// set up results
alphaMatrix.init(alphaDen.getRow(),alphaDen.getCol());
if (nSpin==2) {
betaMatrix.init(betaDen.getRow(),betaDen.getCol());
}
}
// copy constructor
TBB_GInts4DMatrix::TBB_GInts4DMatrix(const TBB_GInts4DMatrix& tbb_gints4d,
split):sameShellPair(tbb_gints4d.sameShellPair),
nSpin(tbb_gints4d.nSpin),thresh(tbb_gints4d.thresh),spThresh(tbb_gints4d.spThresh),
bra1(tbb_gints4d.bra1),bra2(tbb_gints4d.bra2),ket1(tbb_gints4d.ket1),
ket2(tbb_gints4d.ket2),braInfo(tbb_gints4d.braInfo),ketInfo(tbb_gints4d.ketInfo),
ginfor(tbb_gints4d.ginfor),alphaDen(tbb_gints4d.alphaDen),betaDen(tbb_gints4d.betaDen)
{
// set up results
alphaMatrix.init(alphaDen.getRow(),alphaDen.getCol());
if (nSpin==2) {
betaMatrix.init(betaDen.getRow(),betaDen.getCol());
}
}
// this is the body for defining operator ()
void TBB_GInts4DMatrix::operator()(const blocked_range<UInt>& r)
{
//
// setup scratch data for ERI calculation
// 1 atom shell pair (bra and ket)
// 2 heap memory management if necessary
// 3 integral array
// 4 J/K digestion
//
// basically, here I just allocate memory for the scratch data,
// using STL vectors backed by the TBB scalable allocator
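// (illustrative sketch only, not my original code: one such scratch
// buffer via tbb::scalable_allocator; maxIntLen is a hypothetical
// upper bound on the integral block size, and this needs
// #include <vector> and #include <tbb/scalable_allocator.h>)
std::vector<Double, tbb::scalable_allocator<Double> > scratch(maxIntLen, 0.0);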
// now real work begins
for( UInt n=r.begin(); n!=r.end(); ++n ) {
// here we perform the heavy numerical work to calculate
// the 4-body integrals. All variables are thread-local;
// no additional memory is needed beyond the scratch data above
// the calculation inside the loop takes 10^5 to 10^6 CPU
// instructions to finish
// the results are accumulated into alphaMatrix and
// betaMatrix, which are local per thread
}
}
This is the way I use the class:
// get the global information
const GlobalInfor& globInfor = ginfor.getGlobalInfor();
// possible timing code
tick_count t0 = tick_count::now();
// set up the scheduler
task_scheduler_init init(task_scheduler_init::deferred);
if (globInfor.useMultiThreads()) {
init.initialize();
}else{
init.initialize(1);
}
// do normal integral matrix calculation
UInt braLen = braInfor.getNSigAtomShellPairs();
bool sameShellPairs = true;
UInt len = braLen*(braLen+1)/2;
TBB_GInts4DMatrix intWork(sameShellPairs,ginfor,braInfor,ketInfor,s,s,s,s,den.getMtrx(0),den.getMtrx(1));
parallel_reduce(blocked_range<UInt>(0,len),intWork);
// possible timing code
tick_count t1 = tick_count::now();
Double t = (t1-t0).seconds();
if (globInfor.printTiming()) {
printf("%s %-12.6f\n", "GInts4D work with TBB threads, time in seconds ", t);
}
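For the scaling runs quoted at the top, the thread count can be pinned through the
same task_scheduler_init interface. A minimal sketch, assuming the old
task_scheduler_init API used above; nThreads is a hypothetical variable for the
core count under test, not part of my original code:

#include <tbb/task_scheduler_init.h>

int main() {
    // hypothetical core count for one timing run: 1, 8, 16 or 32
    int nThreads = 16;
    // fix the number of TBB worker threads for the lifetime of init;
    // equivalent to calling initialize(nThreads) on a deferred scheduler
    tbb::task_scheduler_init init(nThreads);
    // ... construct TBB_GInts4DMatrix and run parallel_reduce here ...
    return 0;
}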
Thank you so much for your help!!!
fenglai