1 #ifndef _PAR_FRIENDS_H_
2 #define _PAR_FRIENDS_H_
16 template <
class IT,
class NT,
class DER>
27 template <
typename IT,
typename NT>
35 else if (vecs.size() < 2)
42 typename vector< FullyDistVec<IT,NT> >::iterator it = vecs.begin();
43 shared_ptr<CommGrid> commGridPtr = it->
getcommgrid();
44 MPI_Comm World = commGridPtr->GetWorld();
46 IT nglen = it->TotalLength();
47 IT cumloclen = it->MyLocLength();
49 for(; it != vecs.end(); ++it)
51 if(*(commGridPtr) != *(it->getcommgrid()))
56 nglen += it->TotalLength();
57 cumloclen += it->MyLocLength();
60 int nprocs = commGridPtr->GetSize();
62 vector< vector< NT > > data(nprocs);
63 vector< vector< IT > > inds(nprocs);
65 for(it = vecs.begin(); it != vecs.end(); ++it)
67 IT loclen = it->LocArrSize();
68 for(IT i=0; i < loclen; ++i)
71 IT loffset = it->LengthUntil();
72 int owner = ConCat.Owner(gloffset+loffset+i, locind);
73 data[owner].push_back(it->arr[i]);
74 inds[owner].push_back(locind);
76 gloffset += it->TotalLength();
79 int * sendcnt =
new int[nprocs];
80 int * sdispls =
new int[nprocs];
81 for(
int i=0; i<nprocs; ++i)
82 sendcnt[i] = (
int) data[i].size();
84 int * rdispls =
new int[nprocs];
85 int * recvcnt =
new int[nprocs];
86 MPI_Alltoall(sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, World);
89 for(
int i=0; i<nprocs-1; ++i)
91 sdispls[i+1] = sdispls[i] + sendcnt[i];
92 rdispls[i+1] = rdispls[i] + recvcnt[i];
94 IT totrecv = accumulate(recvcnt,recvcnt+nprocs,static_cast<IT>(0));
95 NT * senddatabuf =
new NT[cumloclen];
96 for(
int i=0; i<nprocs; ++i)
98 copy(data[i].begin(), data[i].end(), senddatabuf+sdispls[i]);
99 vector<NT>().swap(data[i]);
101 NT * recvdatabuf =
new NT[totrecv];
102 MPI_Alltoallv(senddatabuf, sendcnt, sdispls, MPIType<NT>(), recvdatabuf, recvcnt, rdispls, MPIType<NT>(), World);
103 delete [] senddatabuf;
105 IT * sendindsbuf =
new IT[cumloclen];
106 for(
int i=0; i<nprocs; ++i)
108 copy(inds[i].begin(), inds[i].end(), sendindsbuf+sdispls[i]);
109 vector<IT>().swap(inds[i]);
111 IT * recvindsbuf =
new IT[totrecv];
112 MPI_Alltoallv(sendindsbuf, sendcnt, sdispls, MPIType<IT>(), recvindsbuf, recvcnt, rdispls, MPIType<IT>(), World);
113 DeleteAll(sendindsbuf, sendcnt, sdispls);
115 for(
int i=0; i<nprocs; ++i)
117 for(
int j = rdispls[i]; j < rdispls[i] + recvcnt[i]; ++j)
119 ConCat.arr[recvindsbuf[j]] = recvdatabuf[j];
122 DeleteAll(recvindsbuf, recvcnt, rdispls);
127 template <
typename MATRIXA,
typename MATRIXB>
130 if(A.getncol() != B.getnrow())
133 outs <<
"Can not multiply, dimensions does not match"<< endl;
134 outs << A.getncol() <<
" != " << B.getnrow() << endl;
139 if((
void*) &A == (
void*) &B)
142 outs <<
"Can not multiply, inputs alias (make a temporary copy of one of them first)"<< endl;
159 template <
typename SR,
typename NUO,
typename UDERO,
typename IU,
typename NU1,
typename NU2,
typename UDERA,
typename UDERB>
170 shared_ptr<CommGrid> GridC =
ProductGrid((A.commGrid).get(), (B.commGrid).
get(), stages, dummy, dummy);
171 IU C_m = A.spSeq->getnrow();
172 IU C_n = B.spSeq->getncol();
174 UDERA * A1seq =
new UDERA();
175 UDERA * A2seq =
new UDERA();
176 UDERB * B1seq =
new UDERB();
177 UDERB * B2seq =
new UDERB();
178 (A.spSeq)->Split( *A1seq, *A2seq);
179 const_cast< UDERB*
>(B.spSeq)->
Transpose();
180 (B.spSeq)->Split( *B1seq, *B2seq);
181 MPI_Barrier(GridC->GetWorld());
183 IU ** ARecvSizes = SpHelper::allocate2D<IU>(UDERA::esscount, stages);
184 IU ** BRecvSizes = SpHelper::allocate2D<IU>(UDERB::esscount, stages);
192 vector< SpTuples<IU,NUO> *> tomerge;
194 int Aself = (A.commGrid)->GetRankInProcRow();
195 int Bself = (B.commGrid)->GetRankInProcCol();
197 for(
int i = 0; i < stages; ++i)
206 ess.resize(UDERA::esscount);
207 for(
int j=0; j< UDERA::esscount; ++j)
209 ess[j] = ARecvSizes[j][i];
221 ess.resize(UDERB::esscount);
222 for(
int j=0; j< UDERB::esscount; ++j)
224 ess[j] = BRecvSizes[j][i];
234 if(!C_cont->isZero())
235 tomerge.push_back(C_cont);
239 if(clearA)
delete A1seq;
240 if(clearB)
delete B1seq;
247 for(
int i = 0; i < stages; ++i)
256 ess.resize(UDERA::esscount);
257 for(
int j=0; j< UDERA::esscount; ++j)
259 ess[j] = ARecvSizes[j][i];
273 ess.resize(UDERB::esscount);
274 for(
int j=0; j< UDERB::esscount; ++j)
276 ess[j] = BRecvSizes[j][i];
287 if(!C_cont->isZero())
288 tomerge.push_back(C_cont);
302 (A.spSeq)->Merge(*A1seq, *A2seq);
314 (B.spSeq)->Merge(*B1seq, *B2seq);
317 const_cast< UDERB*
>(B.spSeq)->
Transpose();
320 UDERO *
C =
new UDERO(MergeAll<SR>(tomerge, C_m, C_n,
true),
false);
331 template <
typename SR,
typename NUO,
typename UDERO,
typename IU,
typename NU1,
typename NU2,
typename UDERA,
typename UDERB>
341 shared_ptr<CommGrid> GridC =
ProductGrid((A.commGrid).get(), (B.commGrid).
get(), stages, dummy, dummy);
342 IU C_m = A.spSeq->getnrow();
343 IU C_n = B.spSeq->getncol();
345 const_cast< UDERB*
>(B.spSeq)->
Transpose();
346 MPI_Barrier(GridC->GetWorld());
348 IU ** ARecvSizes = SpHelper::allocate2D<IU>(UDERA::esscount, stages);
349 IU ** BRecvSizes = SpHelper::allocate2D<IU>(UDERB::esscount, stages);
357 vector< SpTuples<IU,NUO> *> tomerge;
359 int Aself = (A.commGrid)->GetRankInProcRow();
360 int Bself = (B.commGrid)->GetRankInProcCol();
362 for(
int i = 0; i < stages; ++i)
371 ess.resize(UDERA::esscount);
372 for(
int j=0; j< UDERA::esscount; ++j)
374 ess[j] = ARecvSizes[j][i];
388 ess.resize(UDERB::esscount);
389 for(
int j=0; j< UDERB::esscount; ++j)
391 ess[j] = BRecvSizes[j][i];
403 if(!C_cont->isZero())
404 tomerge.push_back(C_cont);
408 outs << i <<
"th SUMMA iteration"<< endl;
412 if(clearA && A.spSeq != NULL)
417 if(clearB && B.spSeq != NULL)
426 UDERO *
C =
new UDERO(MergeAll<SR>(tomerge, C_m, C_n,
true),
false);
431 const_cast< UDERB*
>(B.spSeq)->
Transpose();
437 template <
typename MATRIX,
typename VECTOR>
440 if(A.getncol() != x.TotalLength())
443 outs <<
"Can not multiply, dimensions does not match"<< endl;
444 outs << A.getncol() <<
" != " << x.TotalLength() << endl;
448 if(! ( *(A.getcommgrid()) == *(x.getcommgrid())) )
450 cout <<
"Grids are not comparable for SpMV" << endl;
456 template <
typename SR,
typename IU,
typename NUM,
typename UDER>
460 template <
typename SR,
typename IU,
typename NUM,
typename UDER>
466 return SpMV<SR>(A, x, indexisvalue, optbuf);
474 template<
typename IU,
typename NV>
477 int32_t xlocnz = (int32_t) x.
getlocnnz();
478 int32_t roffst = (int32_t) x.RowLenUntil();
480 IU luntil = x.LengthUntil();
481 int diagneigh = x.commGrid->GetComplementRank();
484 MPI_Sendrecv(&roffst, 1,
MPIType<int32_t>(), diagneigh,
TROST, &roffset, 1,
MPIType<int32_t>(), diagneigh,
TROST, World, &status);
485 MPI_Sendrecv(&xlocnz, 1,
MPIType<int32_t>(), diagneigh,
TRNNZ, &trxlocnz, 1,
MPIType<int32_t>(), diagneigh,
TRNNZ, World, &status);
486 MPI_Sendrecv(&luntil, 1, MPIType<IU>(), diagneigh,
TRLUT, &lenuntil, 1, MPIType<IU>(), diagneigh,
TRLUT, World, &status);
490 trxinds =
new int32_t[trxlocnz];
491 int32_t * temp_xind =
new int32_t[xlocnz];
492 for(
int i=0; i< xlocnz; ++i) temp_xind[i] = (int32_t) x.ind[i];
493 MPI_Sendrecv(temp_xind, xlocnz,
MPIType<int32_t>(), diagneigh,
TRI, trxinds, trxlocnz,
MPIType<int32_t>(), diagneigh,
TRI, World, &status);
497 trxnums =
new NV[trxlocnz];
498 MPI_Sendrecv(const_cast<NV*>(
SpHelper::p2a(x.num)), xlocnz, MPIType<NV>(), diagneigh,
TRX, trxnums, trxlocnz, MPIType<NV>(), diagneigh,
TRX, World, &status);
500 transform(trxinds, trxinds+trxlocnz, trxinds, bind2nd(plus<int32_t>(), roffset));
510 template<
typename IU,
typename NV>
511 void AllGatherVector(MPI_Comm & ColWorld,
int trxlocnz, IU lenuntil, int32_t * & trxinds, NV * & trxnums,
512 int32_t * & indacc, NV * & numacc,
int & accnz,
bool indexisvalue)
514 int colneighs, colrank;
515 MPI_Comm_size(ColWorld, &colneighs);
516 MPI_Comm_rank(ColWorld, &colrank);
517 int * colnz =
new int[colneighs];
518 colnz[colrank] = trxlocnz;
519 MPI_Allgather(MPI_IN_PLACE, 1, MPI_INT, colnz, 1, MPI_INT, ColWorld);
520 int * dpls =
new int[colneighs]();
521 std::partial_sum(colnz, colnz+colneighs-1, dpls+1);
522 accnz = std::accumulate(colnz, colnz+colneighs, 0);
523 indacc =
new int32_t[accnz];
524 numacc =
new NV[accnz];
534 double t0=MPI_Wtime();
542 if(colrank == 0) lenuntilcol = lenuntil;
543 MPI_Bcast(&lenuntilcol, 1, MPIType<IU>(), 0, ColWorld);
544 for(
int i=0; i< accnz; ++i)
546 numacc[i] = indacc[i] + lenuntilcol;
551 MPI_Allgatherv(trxnums, trxlocnz, MPIType<NV>(), numacc, colnz, dpls, MPIType<NV>(), ColWorld);
555 double t1=MPI_Wtime();
569 template<
typename SR,
typename IVT,
typename OVT,
typename IU,
typename NUM,
typename UDER>
571 int32_t * & sendindbuf, OVT * & sendnumbuf,
int * & sdispls,
int * sendcnt,
int accnz,
bool indexisvalue)
575 if(A.spSeq->getnsplit() > 0)
578 dcsc_gespmv_threaded_setbuffers<SR> (*(A.spSeq), indacc, numacc, accnz, optbuf.
inds, optbuf.
nums, sendcnt, optbuf.
dspls, rowneighs);
582 dcsc_gespmv<SR> (*(A.spSeq), indacc, numacc, accnz, optbuf.
inds, optbuf.
nums, sendcnt, optbuf.
dspls, rowneighs, indexisvalue);
588 if(A.spSeq->getnsplit() > 0)
591 int totalsent = dcsc_gespmv_threaded<SR> (*(A.spSeq), indacc, numacc, accnz, sendindbuf, sendnumbuf, sdispls, rowneighs);
594 for(
int i=0; i<rowneighs-1; ++i)
595 sendcnt[i] = sdispls[i+1] - sdispls[i];
596 sendcnt[rowneighs-1] = totalsent - sdispls[rowneighs-1];
601 vector< int32_t > indy;
604 dcsc_gespmv<SR>(*(A.spSeq), indacc, numacc, accnz, indy, numy);
607 int32_t bufsize = indy.size();
608 sendindbuf =
new int32_t[bufsize];
609 sendnumbuf =
new OVT[bufsize];
613 for(
int i=0; i<rowneighs; ++i)
615 int32_t end_this = (i==rowneighs-1) ? A.
getlocalrows(): (i+1)*perproc;
616 while(k < bufsize && indy[k] < end_this)
618 sendindbuf[k] = indy[k] - i*perproc;
619 sendnumbuf[k] = numy[k];
624 sdispls =
new int[rowneighs]();
625 partial_sum(sendcnt, sendcnt+rowneighs-1, sdispls+1);
630 template <
typename SR,
typename IU,
typename OVT>
634 vector<IU>().swap(y.ind);
635 vector<OVT>().swap(y.num);
638 IU ysize = y.MyLocLength();
639 bool * isthere =
new bool[ysize];
640 vector< pair<IU,OVT> > ts_pairs;
641 fill_n(isthere, ysize,
false);
645 for(
int i=0; i<rowneighs; ++i)
647 for(
int j=0; j< recvcnt[i]; ++j)
649 int32_t index = recvindbuf[rdispls[i] + j];
651 ts_pairs.push_back(make_pair(index, recvnumbuf[rdispls[i] + j]));
656 DeleteAll(isthere, recvindbuf, recvnumbuf);
657 sort(ts_pairs.begin(), ts_pairs.end());
658 int nnzy = ts_pairs.size();
661 for(
int i=0; i< nnzy; ++i)
663 y.ind[i] = ts_pairs[i].first;
664 y.num[i] = ts_pairs[i].second;
669 int32_t inf = numeric_limits<int32_t>::min();
670 int32_t sup = numeric_limits<int32_t>::max();
671 KNHeap< int32_t, int32_t > sHeap(sup, inf);
672 int * processed =
new int[rowneighs]();
673 for(
int i=0; i<rowneighs; ++i)
678 sHeap.insert(recvindbuf[rdispls[i]], i);
685 sHeap.deleteMin(&key, &locv);
686 y.ind.push_back( static_cast<IU>(key));
687 y.num.push_back(recvnumbuf[rdispls[locv]]);
689 if( (++(processed[locv])) < recvcnt[locv] )
690 sHeap.insert(recvindbuf[rdispls[locv]+processed[locv]], locv);
696 sHeap.deleteMin(&key, &locv);
697 IU deref = rdispls[locv] + processed[locv];
698 if(y.ind.back() ==
static_cast<IU
>(key))
700 y.num.back() =
SR::add(y.num.back(), recvnumbuf[deref]);
706 y.ind.push_back(static_cast<IU>(key));
707 y.num.push_back(recvnumbuf[deref]);
710 if( (++(processed[locv])) < recvcnt[locv] )
711 sHeap.insert(recvindbuf[rdispls[locv]+processed[locv]], locv);
727 template <
typename SR,
typename IVT,
typename OVT,
typename IU,
typename NUM,
typename UDER>
734 MPI_Comm World = x.commGrid->GetWorld();
735 MPI_Comm ColWorld = x.commGrid->GetColWorld();
736 MPI_Comm RowWorld = x.commGrid->GetRowWorld();
741 int32_t *trxinds, *indacc;
742 IVT *trxnums, *numacc;
743 TransposeVector(World, x, trxlocnz, lenuntil, trxinds, trxnums, indexisvalue);
744 AllGatherVector(ColWorld, trxlocnz, lenuntil, trxinds, trxnums, indacc, numacc, accnz, indexisvalue);
747 MPI_Comm_size(RowWorld, &rowneighs);
748 int * sendcnt =
new int[rowneighs]();
749 int32_t * sendindbuf;
752 LocalSpMV<SR>(A, rowneighs, optbuf, indacc, numacc, sendindbuf, sendnumbuf, sdispls, sendcnt, accnz, indexisvalue);
754 int * rdispls =
new int[rowneighs];
755 int * recvcnt =
new int[rowneighs];
756 MPI_Alltoall(sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, RowWorld);
760 for(
int i=0; i<rowneighs-1; ++i)
762 rdispls[i+1] = rdispls[i] + recvcnt[i];
764 int totrecv = accumulate(recvcnt,recvcnt+rowneighs,0);
765 int32_t * recvindbuf =
new int32_t[totrecv];
766 OVT * recvnumbuf =
new OVT[totrecv];
769 double t2=MPI_Wtime();
771 if(optbuf.totmax > 0 )
774 MPI_Alltoallv(optbuf.nums, sendcnt, optbuf.dspls, MPIType<OVT>(), recvnumbuf, recvcnt, rdispls, MPIType<OVT>(), RowWorld);
793 MPI_Alltoallv(sendnumbuf, sendcnt, sdispls, MPIType<OVT>(), recvnumbuf, recvcnt, rdispls, MPIType<OVT>(), RowWorld);
799 double t3=MPI_Wtime();
808 MergeContributions<SR>(y,recvcnt, rdispls, recvindbuf, recvnumbuf, rowneighs);
812 template <
typename SR,
typename IVT,
typename OVT,
typename IU,
typename NUM,
typename UDER>
816 SpMV<SR>(A, x, y, indexisvalue, optbuf);
824 template <
typename SR,
typename IU,
typename NUM,
typename UDER>
830 SpMV<SR>(A, x, y, indexisvalue, optbuf);
837 template <
typename SR,
typename IU,
typename NUM,
typename NUV,
typename UDER>
844 MPI_Comm World = x.commGrid->GetWorld();
845 MPI_Comm ColWorld = x.commGrid->GetColWorld();
846 MPI_Comm RowWorld = x.commGrid->GetRowWorld();
851 int diagneigh = x.commGrid->GetComplementRank();
853 MPI_Sendrecv(&xsize, 1, MPI_INT, diagneigh,
TRX, &trxsize, 1, MPI_INT, diagneigh,
TRX, World, &status);
855 NUV * trxnums =
new NUV[trxsize];
856 MPI_Sendrecv(const_cast<NUV*>(
SpHelper::p2a(x.arr)), xsize, MPIType<NUV>(), diagneigh,
TRX, trxnums, trxsize, MPIType<NUV>(), diagneigh,
TRX, World, &status);
858 int colneighs, colrank;
859 MPI_Comm_size(ColWorld, &colneighs);
860 MPI_Comm_rank(ColWorld, &colrank);
861 int * colsize =
new int[colneighs];
862 colsize[colrank] = trxsize;
863 MPI_Allgather(MPI_IN_PLACE, 1, MPI_INT, colsize, 1, MPI_INT, ColWorld);
864 int * dpls =
new int[colneighs]();
865 std::partial_sum(colsize, colsize+colneighs-1, dpls+1);
866 int accsize = std::accumulate(colsize, colsize+colneighs, 0);
867 NUV * numacc =
new NUV[accsize];
869 MPI_Allgatherv(trxnums, trxsize, MPIType<NUV>(), numacc, colsize, dpls, MPIType<NUV>(), ColWorld);
875 T_promote * localy =
new T_promote[ysize];
876 fill_n(localy, ysize,
id);
877 dcsc_gespmv<SR>(*(A.spSeq), numacc, localy);
885 MPI_Comm_size(RowWorld, &rowneighs);
888 for(
int i=0; i< rowneighs; ++i)
890 begptr = y.RowLenUntil(i);
897 endptr = y.RowLenUntil(i+1);
899 MPI_Reduce(localy+begptr,
SpHelper::p2a(y.arr), endptr-begptr, MPIType<T_promote>(),
SR::mpi_op(), i, RowWorld);
911 template <
typename SR,
typename IU,
typename NUM,
typename NUV,
typename UDER>
918 MPI_Comm World = x.commGrid->GetWorld();
919 MPI_Comm ColWorld = x.commGrid->GetColWorld();
920 MPI_Comm RowWorld = x.commGrid->GetRowWorld();
924 int roffst = x.RowLenUntil();
927 int diagneigh = x.commGrid->GetComplementRank();
929 MPI_Sendrecv(&xlocnz, 1, MPI_INT, diagneigh,
TRX, &trxlocnz, 1, MPI_INT, diagneigh,
TRX, World, &status);
930 MPI_Sendrecv(&roffst, 1, MPI_INT, diagneigh,
TROST, &offset, 1, MPI_INT, diagneigh,
TROST, World, &status);
932 IU * trxinds =
new IU[trxlocnz];
933 NUV * trxnums =
new NUV[trxlocnz];
934 MPI_Sendrecv(const_cast<IU*>(
SpHelper::p2a(x.ind)), xlocnz, MPIType<IU>(), diagneigh,
TRX, trxinds, trxlocnz, MPIType<IU>(), diagneigh,
TRX, World, &status);
935 MPI_Sendrecv(const_cast<NUV*>(
SpHelper::p2a(x.num)), xlocnz, MPIType<NUV>(), diagneigh,
TRX, trxnums, trxlocnz, MPIType<NUV>(), diagneigh,
TRX, World, &status);
936 transform(trxinds, trxinds+trxlocnz, trxinds, bind2nd(plus<IU>(), offset));
938 int colneighs, colrank;
939 MPI_Comm_size(ColWorld, &colneighs);
940 MPI_Comm_rank(ColWorld, &colrank);
941 int * colnz =
new int[colneighs];
942 colnz[colrank] = trxlocnz;
943 MPI_Allgather(MPI_IN_PLACE, 1, MPI_INT, colnz, 1, MPI_INT, ColWorld);
944 int * dpls =
new int[colneighs]();
945 std::partial_sum(colnz, colnz+colneighs-1, dpls+1);
946 int accnz = std::accumulate(colnz, colnz+colneighs, 0);
947 IU * indacc =
new IU[accnz];
948 NUV * numacc =
new NUV[accnz];
952 MPI_Allgatherv(trxinds, trxlocnz, MPIType<IU>(), indacc, colnz, dpls, MPIType<IU>(), ColWorld);
953 MPI_Allgatherv(trxnums, trxlocnz, MPIType<NUV>(), numacc, colnz, dpls, MPIType<NUV>(), ColWorld);
957 vector< int32_t > indy;
958 vector< T_promote > numy;
960 int32_t * tmpindacc =
new int32_t[accnz];
961 for(
int i=0; i< accnz; ++i) tmpindacc[i] = indacc[i];
964 dcsc_gespmv<SR>(*(A.spSeq), tmpindacc, numacc, accnz, indy, numy);
970 IU yintlen = y.MyRowLength();
973 MPI_Comm_size(RowWorld,&rowneighs);
974 vector< vector<IU> > sendind(rowneighs);
975 vector< vector<T_promote> > sendnum(rowneighs);
976 typename vector<int32_t>::size_type outnz = indy.size();
977 for(
typename vector<IU>::size_type i=0; i< outnz; ++i)
980 int rown = y.OwnerWithinRow(yintlen, static_cast<IU>(indy[i]), locind);
981 sendind[rown].push_back(locind);
982 sendnum[rown].push_back(numy[i]);
985 IU * sendindbuf =
new IU[outnz];
986 T_promote * sendnumbuf =
new T_promote[outnz];
987 int * sendcnt =
new int[rowneighs];
988 int * sdispls =
new int[rowneighs];
989 for(
int i=0; i<rowneighs; ++i)
990 sendcnt[i] = sendind[i].
size();
992 int * rdispls =
new int[rowneighs];
993 int * recvcnt =
new int[rowneighs];
994 MPI_Alltoall(sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, RowWorld);
998 for(
int i=0; i<rowneighs-1; ++i)
1000 sdispls[i+1] = sdispls[i] + sendcnt[i];
1001 rdispls[i+1] = rdispls[i] + recvcnt[i];
1003 int totrecv = accumulate(recvcnt,recvcnt+rowneighs,0);
1004 IU * recvindbuf =
new IU[totrecv];
1005 T_promote * recvnumbuf =
new T_promote[totrecv];
1007 for(
int i=0; i<rowneighs; ++i)
1009 copy(sendind[i].begin(), sendind[i].end(), sendindbuf+sdispls[i]);
1010 vector<IU>().swap(sendind[i]);
1012 for(
int i=0; i<rowneighs; ++i)
1014 copy(sendnum[i].begin(), sendnum[i].end(), sendnumbuf+sdispls[i]);
1015 vector<T_promote>().swap(sendnum[i]);
1017 MPI_Alltoallv(sendindbuf, sendcnt, sdispls, MPIType<IU>(), recvindbuf, recvcnt, rdispls, MPIType<IU>(), RowWorld);
1018 MPI_Alltoallv(sendnumbuf, sendcnt, sdispls, MPIType<T_promote>(), recvnumbuf, recvcnt, rdispls, MPIType<T_promote>(), RowWorld);
1021 DeleteAll(sendcnt, recvcnt, sdispls, rdispls);
1024 IU ysize = y.MyLocLength();
1025 T_promote * localy =
new T_promote[ysize];
1026 bool * isthere =
new bool[ysize];
1028 fill_n(isthere, ysize,
false);
1030 for(
int i=0; i< totrecv; ++i)
1032 if(!isthere[recvindbuf[i]])
1034 localy[recvindbuf[i]] = recvnumbuf[i];
1035 nzinds.push_back(recvindbuf[i]);
1036 isthere[recvindbuf[i]] =
true;
1040 localy[recvindbuf[i]] =
SR::add(localy[recvindbuf[i]], recvnumbuf[i]);
1043 DeleteAll(isthere, recvindbuf, recvnumbuf);
1044 sort(nzinds.begin(), nzinds.end());
1045 int nnzy = nzinds.size();
1048 for(
int i=0; i< nnzy; ++i)
1050 y.ind[i] = nzinds[i];
1051 y.num[i] = localy[nzinds[i]];
1058 template <
typename IU,
typename NU1,
typename NU2,
typename UDERA,
typename UDERB>
1065 if(*(A.commGrid) == *(B.commGrid))
1067 DER_promote * result =
new DER_promote(
EWiseMult(*(A.spSeq),*(B.spSeq),exclude) );
1072 cout <<
"Grids are not comparable elementwise multiplication" << endl;
1078 template <
typename RETT,
typename RETDER,
typename IU,
typename NU1,
typename NU2,
typename UDERA,
typename UDERB,
typename _BinaryOperation>
1082 if(*(A.commGrid) == *(B.commGrid))
1084 RETDER * result =
new RETDER( EWiseApply<RETT>(*(A.spSeq),*(B.spSeq), __binary_op, notB, defaultBVal) );
1089 cout <<
"Grids are not comparable elementwise apply" << endl;
1095 template <
typename RETT,
typename RETDER,
typename IU,
typename NU1,
typename NU2,
typename UDERA,
typename UDERB,
typename _BinaryOperation,
typename _BinaryPredicate>
1097 (
const SpParMat<IU,NU1,UDERA> & A,
const SpParMat<IU,NU2,UDERB> & B, _BinaryOperation __binary_op, _BinaryPredicate do_op,
bool allowANulls,
bool allowBNulls,
const NU1& ANullVal,
const NU2& BNullVal,
const bool allowIntersect,
const bool useExtendedBinOp)
1099 if(*(A.commGrid) == *(B.commGrid))
1101 RETDER * result =
new RETDER( EWiseApply<RETT>(*(A.spSeq),*(B.spSeq), __binary_op, do_op, allowANulls, allowBNulls, ANullVal, BNullVal, allowIntersect) );
1106 cout <<
"Grids are not comparable elementwise apply" << endl;
1113 template <
typename RETT,
typename RETDER,
typename IU,
typename NU1,
typename NU2,
typename UDERA,
typename UDERB,
typename _BinaryOperation,
typename _BinaryPredicate>
1115 EWiseApply (
const SpParMat<IU,NU1,UDERA> & A,
const SpParMat<IU,NU2,UDERB> & B, _BinaryOperation __binary_op, _BinaryPredicate do_op,
bool allowANulls,
bool allowBNulls,
const NU1& ANullVal,
const NU2& BNullVal,
const bool allowIntersect =
true)
1117 return EWiseApply<RETT, RETDER>(A, B,
1120 allowANulls, allowBNulls, ANullVal, BNullVal, allowIntersect,
true);
1129 template <
typename IU,
typename NU1,
typename NU2>
1135 if(*(V.commGrid) == *(W.commGrid))
1138 Product.length = V.length;
1139 if(Product.diagonal)
1143 IU
size= V.ind.size();
1144 for(IU i=0; i<size; ++i)
1146 if(W.arr.size() <= V.ind[i] || W.arr[V.ind[i]] == zero)
1148 Product.ind.push_back(V.ind[i]);
1149 Product.num.push_back(V.num[i]);
1155 IU
size= V.ind.size();
1156 for(IU i=0; i<size; ++i)
1158 if(W.arr.size() > V.ind[i] && W.arr[V.ind[i]] != zero)
1160 Product.ind.push_back(V.ind[i]);
1161 Product.num.push_back(V.num[i] * W.arr[V.ind[i]]);
1170 cout <<
"Grids are not comparable elementwise multiplication" << endl;
1180 template <
typename IU,
typename NU1,
typename NU2>
1186 if(*(V.commGrid) == *(W.commGrid))
1189 if(V.glen != W.glen)
1191 cerr <<
"Vector dimensions don't match for EWiseMult\n";
1196 Product.glen = V.glen;
1200 #if defined(_OPENMP) && defined(CBLAS_EXPERIMENTAL) // not faster than serial
1202 vector <IU> tlosizes (actual_splits, 0);
1203 vector < vector<IU> > tlinds(actual_splits);
1204 vector < vector<T_promote> > tlnums(actual_splits);
1205 IU tlsize = size / actual_splits;
1206 #pragma omp parallel for //schedule(dynamic, 1)
1207 for(IU t = 0; t < actual_splits; ++t)
1209 IU tlbegin = t*tlsize;
1210 IU tlend = (t==actual_splits-1)? size : (t+1)*tlsize;
1211 for(IU i=tlbegin; i<tlend; ++i)
1213 if(W.arr[V.ind[i]] == zero)
1215 tlinds[t].push_back(V.ind[i]);
1216 tlnums[t].push_back(V.num[i]);
1221 vector<IU> prefix_sum(actual_splits+1,0);
1222 partial_sum(tlosizes.begin(), tlosizes.end(), prefix_sum.begin()+1);
1223 Product.ind.resize(prefix_sum[actual_splits]);
1224 Product.num.resize(prefix_sum[actual_splits]);
1226 #pragma omp parallel for //schedule(dynamic, 1)
1227 for(IU t=0; t< actual_splits; ++t)
1229 copy(tlinds[t].begin(), tlinds[t].end(), Product.ind.begin()+prefix_sum[t]);
1230 copy(tlnums[t].begin(), tlnums[t].end(), Product.num.begin()+prefix_sum[t]);
1233 for(IU i=0; i<size; ++i)
1235 if(W.arr[V.ind[i]] == zero)
1237 Product.ind.push_back(V.ind[i]);
1238 Product.num.push_back(V.num[i]);
1245 for(IU i=0; i<size; ++i)
1247 if(W.arr[V.ind[i]] != zero)
1249 Product.ind.push_back(V.ind[i]);
1250 Product.num.push_back(V.num[i] * W.arr[V.ind[i]]);
1259 cout <<
"Grids are not comparable elementwise multiplication" << endl;
1283 template <
typename RET,
typename IU,
typename NU1,
typename NU2,
typename _BinaryOperation,
typename _BinaryPredicate>
1287 typedef RET T_promote;
1288 if(*(V.commGrid) == *(W.commGrid))
1292 if(V.TotalLength() != W.TotalLength())
1295 outs <<
"Vector dimensions don't match (" << V.TotalLength() <<
" vs " << W.TotalLength() <<
") for EWiseApply (short version)\n";
1301 Product.glen = V.glen;
1308 for(IU i=0; i<size; ++i)
1310 if(sp_iter < spsize && V.ind[sp_iter] == i)
1312 if (_doOp(V.num[sp_iter], W.arr[i],
false,
false))
1314 Product.ind.push_back(i);
1315 Product.num.push_back(_binary_op(V.num[sp_iter], W.arr[i],
false,
false));
1321 if (_doOp(Vzero, W.arr[i],
true,
false))
1323 Product.ind.push_back(i);
1324 Product.num.push_back(_binary_op(Vzero, W.arr[i],
true,
false));
1332 for(sp_iter = 0; sp_iter < spsize; ++sp_iter)
1334 if (_doOp(V.num[sp_iter], W.arr[V.ind[sp_iter]],
false,
false))
1336 Product.ind.push_back(V.ind[sp_iter]);
1337 Product.num.push_back(_binary_op(V.num[sp_iter], W.arr[V.ind[sp_iter]],
false,
false));
1346 cout <<
"Grids are not comparable for EWiseApply" << endl;
1373 template <
typename RET,
typename IU,
typename NU1,
typename NU2,
typename _BinaryOperation,
typename _BinaryPredicate>
1375 (
const FullyDistSpVec<IU,NU1> & V,
const FullyDistSpVec<IU,NU2> & W , _BinaryOperation _binary_op, _BinaryPredicate _doOp,
bool allowVNulls,
bool allowWNulls, NU1 Vzero, NU2 Wzero,
const bool allowIntersect,
const bool useExtendedBinOp)
1377 typedef RET T_promote;
1378 if(*(V.commGrid) == *(W.commGrid))
1381 if(V.glen != W.glen)
1384 outs <<
"Vector dimensions don't match (" << V.glen <<
" vs " << W.glen <<
") for EWiseApply (full version)\n";
1390 Product.glen = V.glen;
1391 typename vector< IU >::const_iterator indV = V.ind.begin();
1392 typename vector< NU1 >::const_iterator numV = V.num.begin();
1393 typename vector< IU >::const_iterator indW = W.ind.begin();
1394 typename vector< NU2 >::const_iterator numW = W.num.begin();
1396 while (indV < V.ind.end() && indW < W.ind.end())
1403 if (_doOp(*numV, *numW,
false,
false))
1405 Product.ind.push_back(*indV);
1406 Product.num.push_back(_binary_op(*numV, *numW,
false,
false));
1412 else if (*indV < *indW)
1417 if (_doOp(*numV, Wzero,
false,
true))
1419 Product.ind.push_back(*indV);
1420 Product.num.push_back(_binary_op(*numV, Wzero,
false,
true));
1430 if (_doOp(Vzero, *numW,
true,
false))
1432 Product.ind.push_back(*indW);
1433 Product.num.push_back(_binary_op(Vzero, *numW,
true,
false));
1440 while (allowWNulls && indV < V.ind.end())
1442 if (_doOp(*numV, Wzero,
false,
true))
1444 Product.ind.push_back(*indV);
1445 Product.num.push_back(_binary_op(*numV, Wzero,
false,
true));
1449 while (allowVNulls && indW < W.ind.end())
1451 if (_doOp(Vzero, *numW,
true,
false))
1453 Product.ind.push_back(*indW);
1454 Product.num.push_back(_binary_op(Vzero, *numW,
true,
false));
1463 cout <<
"Grids are not comparable for EWiseApply" << endl;
1470 template <
typename RET,
typename IU,
typename NU1,
typename NU2,
typename _BinaryOperation,
typename _BinaryPredicate>
1474 return EWiseApply<RET>(V, W,
1477 allowVNulls, Vzero,
true);
1480 template <
typename RET,
typename IU,
typename NU1,
typename NU2,
typename _BinaryOperation,
typename _BinaryPredicate>
1482 (
const FullyDistSpVec<IU,NU1> & V,
const FullyDistSpVec<IU,NU2> & W , _BinaryOperation _binary_op, _BinaryPredicate _doOp,
bool allowVNulls,
bool allowWNulls, NU1 Vzero, NU2 Wzero,
const bool allowIntersect =
true)
1484 return EWiseApply<RET>(V, W,
1487 allowVNulls, allowWNulls, Vzero, Wzero, allowIntersect,
true);