COMBINATORIAL_BLAS  1.3
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
ParFriends.h
Go to the documentation of this file.
1 #ifndef _PAR_FRIENDS_H_
2 #define _PAR_FRIENDS_H_
3 
4 #include "mpi.h"
5 #include <iostream>
6 #include <cstdarg>
7 #include "SpParMat.h"
8 #include "SpParHelper.h"
9 #include "MPIType.h"
10 #include "Friends.h"
11 #include "OptBuf.h"
12 
13 
14 using namespace std;
15 
16 template <class IT, class NT, class DER>
17 class SpParMat;
18 
19 /*************************************************************************************************/
20 /**************************** FRIEND FUNCTIONS FOR PARALLEL CLASSES ******************************/
21 /*************************************************************************************************/
22 
23 
27 template <typename IT, typename NT>
29 {
30  if(vecs.size() < 1)
31  {
32  SpParHelper::Print("Warning: Nothing to concatenate, returning empty ");
33  return FullyDistVec<IT,NT>();
34  }
35  else if (vecs.size() < 2)
36  {
37  return vecs[1];
38 
39  }
40  else
41  {
42  typename vector< FullyDistVec<IT,NT> >::iterator it = vecs.begin();
43  shared_ptr<CommGrid> commGridPtr = it->getcommgrid();
44  MPI_Comm World = commGridPtr->GetWorld();
45 
46  IT nglen = it->TotalLength(); // new global length
47  IT cumloclen = it->MyLocLength(); // existing cumulative local lengths
48  ++it;
49  for(; it != vecs.end(); ++it)
50  {
51  if(*(commGridPtr) != *(it->getcommgrid()))
52  {
53  SpParHelper::Print("Grids are not comparable for FullyDistVec<IT,NT>::EWiseApply\n");
54  MPI_Abort(MPI_COMM_WORLD, GRIDMISMATCH);
55  }
56  nglen += it->TotalLength();
57  cumloclen += it->MyLocLength();
58  }
59  FullyDistVec<IT,NT> ConCat (commGridPtr, nglen, NT());
60  int nprocs = commGridPtr->GetSize();
61 
62  vector< vector< NT > > data(nprocs);
63  vector< vector< IT > > inds(nprocs);
64  IT gloffset = 0;
65  for(it = vecs.begin(); it != vecs.end(); ++it)
66  {
67  IT loclen = it->LocArrSize();
68  for(IT i=0; i < loclen; ++i)
69  {
70  IT locind;
71  IT loffset = it->LengthUntil();
72  int owner = ConCat.Owner(gloffset+loffset+i, locind);
73  data[owner].push_back(it->arr[i]);
74  inds[owner].push_back(locind);
75  }
76  gloffset += it->TotalLength();
77  }
78 
79  int * sendcnt = new int[nprocs];
80  int * sdispls = new int[nprocs];
81  for(int i=0; i<nprocs; ++i)
82  sendcnt[i] = (int) data[i].size();
83 
84  int * rdispls = new int[nprocs];
85  int * recvcnt = new int[nprocs];
86  MPI_Alltoall(sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, World); // share the request counts
87  sdispls[0] = 0;
88  rdispls[0] = 0;
89  for(int i=0; i<nprocs-1; ++i)
90  {
91  sdispls[i+1] = sdispls[i] + sendcnt[i];
92  rdispls[i+1] = rdispls[i] + recvcnt[i];
93  }
94  IT totrecv = accumulate(recvcnt,recvcnt+nprocs,static_cast<IT>(0));
95  NT * senddatabuf = new NT[cumloclen];
96  for(int i=0; i<nprocs; ++i)
97  {
98  copy(data[i].begin(), data[i].end(), senddatabuf+sdispls[i]);
99  vector<NT>().swap(data[i]); // delete data vectors
100  }
101  NT * recvdatabuf = new NT[totrecv];
102  MPI_Alltoallv(senddatabuf, sendcnt, sdispls, MPIType<NT>(), recvdatabuf, recvcnt, rdispls, MPIType<NT>(), World); // send data
103  delete [] senddatabuf;
104 
105  IT * sendindsbuf = new IT[cumloclen];
106  for(int i=0; i<nprocs; ++i)
107  {
108  copy(inds[i].begin(), inds[i].end(), sendindsbuf+sdispls[i]);
109  vector<IT>().swap(inds[i]); // delete inds vectors
110  }
111  IT * recvindsbuf = new IT[totrecv];
112  MPI_Alltoallv(sendindsbuf, sendcnt, sdispls, MPIType<IT>(), recvindsbuf, recvcnt, rdispls, MPIType<IT>(), World); // send new inds
113  DeleteAll(sendindsbuf, sendcnt, sdispls);
114 
115  for(int i=0; i<nprocs; ++i)
116  {
117  for(int j = rdispls[i]; j < rdispls[i] + recvcnt[i]; ++j)
118  {
119  ConCat.arr[recvindsbuf[j]] = recvdatabuf[j];
120  }
121  }
122  DeleteAll(recvindsbuf, recvcnt, rdispls);
123  return ConCat;
124  }
125 }
126 
127 template <typename MATRIXA, typename MATRIXB>
128 bool CheckSpGEMMCompliance(const MATRIXA & A, const MATRIXB & B)
129 {
130  if(A.getncol() != B.getnrow())
131  {
132  ostringstream outs;
133  outs << "Can not multiply, dimensions does not match"<< endl;
134  outs << A.getncol() << " != " << B.getnrow() << endl;
135  SpParHelper::Print(outs.str());
136  MPI_Abort(MPI_COMM_WORLD, DIMMISMATCH);
137  return false;
138  }
139  if((void*) &A == (void*) &B)
140  {
141  ostringstream outs;
142  outs << "Can not multiply, inputs alias (make a temporary copy of one of them first)"<< endl;
143  SpParHelper::Print(outs.str());
144  MPI_Abort(MPI_COMM_WORLD, MATRIXALIAS);
145  return false;
146  }
147  return true;
148 }
149 
150 
159 template <typename SR, typename NUO, typename UDERO, typename IU, typename NU1, typename NU2, typename UDERA, typename UDERB>
161  (SpParMat<IU,NU1,UDERA> & A, SpParMat<IU,NU2,UDERB> & B, bool clearA = false, bool clearB = false )
162 
163 {
164  if(!CheckSpGEMMCompliance(A,B) )
165  {
166  return SpParMat< IU,NUO,UDERO >();
167  }
168 
169  int stages, dummy; // last two parameters of ProductGrid are ignored for Synch multiplication
170  shared_ptr<CommGrid> GridC = ProductGrid((A.commGrid).get(), (B.commGrid).get(), stages, dummy, dummy);
171  IU C_m = A.spSeq->getnrow();
172  IU C_n = B.spSeq->getncol();
173 
174  UDERA * A1seq = new UDERA();
175  UDERA * A2seq = new UDERA();
176  UDERB * B1seq = new UDERB();
177  UDERB * B2seq = new UDERB();
178  (A.spSeq)->Split( *A1seq, *A2seq);
179  const_cast< UDERB* >(B.spSeq)->Transpose();
180  (B.spSeq)->Split( *B1seq, *B2seq);
181  MPI_Barrier(GridC->GetWorld());
182 
183  IU ** ARecvSizes = SpHelper::allocate2D<IU>(UDERA::esscount, stages);
184  IU ** BRecvSizes = SpHelper::allocate2D<IU>(UDERB::esscount, stages);
185 
186  SpParHelper::GetSetSizes( *A1seq, ARecvSizes, (A.commGrid)->GetRowWorld());
187  SpParHelper::GetSetSizes( *B1seq, BRecvSizes, (B.commGrid)->GetColWorld());
188 
189  // Remotely fetched matrices are stored as pointers
190  UDERA * ARecv;
191  UDERB * BRecv;
192  vector< SpTuples<IU,NUO> *> tomerge;
193 
194  int Aself = (A.commGrid)->GetRankInProcRow();
195  int Bself = (B.commGrid)->GetRankInProcCol();
196 
197  for(int i = 0; i < stages; ++i)
198  {
199  vector<IU> ess;
200  if(i == Aself)
201  {
202  ARecv = A1seq; // shallow-copy
203  }
204  else
205  {
206  ess.resize(UDERA::esscount);
207  for(int j=0; j< UDERA::esscount; ++j)
208  {
209  ess[j] = ARecvSizes[j][i]; // essentials of the ith matrix in this row
210  }
211  ARecv = new UDERA(); // first, create the object
212  }
213  SpParHelper::BCastMatrix(GridC->GetRowWorld(), *ARecv, ess, i); // then, receive its elements
214  ess.clear();
215  if(i == Bself)
216  {
217  BRecv = B1seq; // shallow-copy
218  }
219  else
220  {
221  ess.resize(UDERB::esscount);
222  for(int j=0; j< UDERB::esscount; ++j)
223  {
224  ess[j] = BRecvSizes[j][i];
225  }
226  BRecv = new UDERB();
227  }
228  SpParHelper::BCastMatrix(GridC->GetColWorld(), *BRecv, ess, i); // then, receive its elements
229  SpTuples<IU,NUO> * C_cont = MultiplyReturnTuples<SR, NUO>
230  (*ARecv, *BRecv, // parameters themselves
231  false, true, // transpose information (B is transposed)
232  i != Aself, // 'delete A' condition
233  i != Bself); // 'delete B' condition
234  if(!C_cont->isZero())
235  tomerge.push_back(C_cont);
236  else
237  delete C_cont;
238  }
239  if(clearA) delete A1seq;
240  if(clearB) delete B1seq;
241 
242  // Set the new dimensions
243  SpParHelper::GetSetSizes( *A2seq, ARecvSizes, (A.commGrid)->GetRowWorld());
244  SpParHelper::GetSetSizes( *B2seq, BRecvSizes, (B.commGrid)->GetColWorld());
245 
246  // Start the second round
247  for(int i = 0; i < stages; ++i)
248  {
249  vector<IU> ess;
250  if(i == Aself)
251  {
252  ARecv = A2seq; // shallow-copy
253  }
254  else
255  {
256  ess.resize(UDERA::esscount);
257  for(int j=0; j< UDERA::esscount; ++j)
258  {
259  ess[j] = ARecvSizes[j][i]; // essentials of the ith matrix in this row
260  }
261  ARecv = new UDERA(); // first, create the object
262  }
263 
264  SpParHelper::BCastMatrix(GridC->GetRowWorld(), *ARecv, ess, i); // then, receive its elements
265  ess.clear();
266 
267  if(i == Bself)
268  {
269  BRecv = B2seq; // shallow-copy
270  }
271  else
272  {
273  ess.resize(UDERB::esscount);
274  for(int j=0; j< UDERB::esscount; ++j)
275  {
276  ess[j] = BRecvSizes[j][i];
277  }
278  BRecv = new UDERB();
279  }
280  SpParHelper::BCastMatrix(GridC->GetColWorld(), *BRecv, ess, i); // then, receive its elements
281 
282  SpTuples<IU,NUO> * C_cont = MultiplyReturnTuples<SR, NUO>
283  (*ARecv, *BRecv, // parameters themselves
284  false, true, // transpose information (B is transposed)
285  i != Aself, // 'delete A' condition
286  i != Bself); // 'delete B' condition
287  if(!C_cont->isZero())
288  tomerge.push_back(C_cont);
289  else
290  delete C_cont;
291  }
292  SpHelper::deallocate2D(ARecvSizes, UDERA::esscount);
293  SpHelper::deallocate2D(BRecvSizes, UDERB::esscount);
294  if(clearA)
295  {
296  delete A2seq;
297  delete A.spSeq;
298  A.spSeq = NULL;
299  }
300  else
301  {
302  (A.spSeq)->Merge(*A1seq, *A2seq);
303  delete A1seq;
304  delete A2seq;
305  }
306  if(clearB)
307  {
308  delete B2seq;
309  delete B.spSeq;
310  B.spSeq = NULL;
311  }
312  else
313  {
314  (B.spSeq)->Merge(*B1seq, *B2seq);
315  delete B1seq;
316  delete B2seq;
317  const_cast< UDERB* >(B.spSeq)->Transpose(); // transpose back to original
318  }
319 
320  UDERO * C = new UDERO(MergeAll<SR>(tomerge, C_m, C_n,true), false);
321  // First get the result in SpTuples, then convert to UDER
322  return SpParMat<IU,NUO,UDERO> (C, GridC); // return the result object
323 }
324 
325 
331 template <typename SR, typename NUO, typename UDERO, typename IU, typename NU1, typename NU2, typename UDERA, typename UDERB>
333  (SpParMat<IU,NU1,UDERA> & A, SpParMat<IU,NU2,UDERB> & B, bool clearA = false, bool clearB = false )
334 
335 {
336  if(!CheckSpGEMMCompliance(A,B) )
337  {
338  return SpParMat< IU,NUO,UDERO >();
339  }
340  int stages, dummy; // last two parameters of ProductGrid are ignored for Synch multiplication
341  shared_ptr<CommGrid> GridC = ProductGrid((A.commGrid).get(), (B.commGrid).get(), stages, dummy, dummy);
342  IU C_m = A.spSeq->getnrow();
343  IU C_n = B.spSeq->getncol();
344 
345  const_cast< UDERB* >(B.spSeq)->Transpose();
346  MPI_Barrier(GridC->GetWorld());
347 
348  IU ** ARecvSizes = SpHelper::allocate2D<IU>(UDERA::esscount, stages);
349  IU ** BRecvSizes = SpHelper::allocate2D<IU>(UDERB::esscount, stages);
350 
351  SpParHelper::GetSetSizes( *(A.spSeq), ARecvSizes, (A.commGrid)->GetRowWorld());
352  SpParHelper::GetSetSizes( *(B.spSeq), BRecvSizes, (B.commGrid)->GetColWorld());
353 
354  // Remotely fetched matrices are stored as pointers
355  UDERA * ARecv;
356  UDERB * BRecv;
357  vector< SpTuples<IU,NUO> *> tomerge;
358 
359  int Aself = (A.commGrid)->GetRankInProcRow();
360  int Bself = (B.commGrid)->GetRankInProcCol();
361 
362  for(int i = 0; i < stages; ++i)
363  {
364  vector<IU> ess;
365  if(i == Aself)
366  {
367  ARecv = A.spSeq; // shallow-copy
368  }
369  else
370  {
371  ess.resize(UDERA::esscount);
372  for(int j=0; j< UDERA::esscount; ++j)
373  {
374  ess[j] = ARecvSizes[j][i]; // essentials of the ith matrix in this row
375  }
376  ARecv = new UDERA(); // first, create the object
377  }
378 
379  SpParHelper::BCastMatrix(GridC->GetRowWorld(), *ARecv, ess, i); // then, receive its elements
380  ess.clear();
381 
382  if(i == Bself)
383  {
384  BRecv = B.spSeq; // shallow-copy
385  }
386  else
387  {
388  ess.resize(UDERB::esscount);
389  for(int j=0; j< UDERB::esscount; ++j)
390  {
391  ess[j] = BRecvSizes[j][i];
392  }
393  BRecv = new UDERB();
394  }
395  SpParHelper::BCastMatrix(GridC->GetColWorld(), *BRecv, ess, i); // then, receive its elements
396 
397  SpTuples<IU,NUO> * C_cont = MultiplyReturnTuples<SR, NUO>
398  (*ARecv, *BRecv, // parameters themselves
399  false, true, // transpose information (B is transposed)
400  i != Aself, // 'delete A' condition
401  i != Bself); // 'delete B' condition
402 
403  if(!C_cont->isZero())
404  tomerge.push_back(C_cont);
405 
406  #ifndef NDEBUG
407  ostringstream outs;
408  outs << i << "th SUMMA iteration"<< endl;
409  SpParHelper::Print(outs.str());
410  #endif
411  }
412  if(clearA && A.spSeq != NULL)
413  {
414  delete A.spSeq;
415  A.spSeq = NULL;
416  }
417  if(clearB && B.spSeq != NULL)
418  {
419  delete B.spSeq;
420  B.spSeq = NULL;
421  }
422 
423  SpHelper::deallocate2D(ARecvSizes, UDERA::esscount);
424  SpHelper::deallocate2D(BRecvSizes, UDERB::esscount);
425 
426  UDERO * C = new UDERO(MergeAll<SR>(tomerge, C_m, C_n,true), false);
427  // First get the result in SpTuples, then convert to UDER
428  // the last parameter to MergeAll deletes tomerge arrays
429 
430  if(!clearB)
431  const_cast< UDERB* >(B.spSeq)->Transpose(); // transpose back to original
432 
433  return SpParMat<IU,NUO,UDERO> (C, GridC); // return the result object
434 }
435 
436 
437 template <typename MATRIX, typename VECTOR>
438 void CheckSpMVCompliance(const MATRIX & A, const VECTOR & x)
439 {
440  if(A.getncol() != x.TotalLength())
441  {
442  ostringstream outs;
443  outs << "Can not multiply, dimensions does not match"<< endl;
444  outs << A.getncol() << " != " << x.TotalLength() << endl;
445  SpParHelper::Print(outs.str());
446  MPI_Abort(MPI_COMM_WORLD, DIMMISMATCH);
447  }
448  if(! ( *(A.getcommgrid()) == *(x.getcommgrid())) )
449  {
450  cout << "Grids are not comparable for SpMV" << endl;
451  MPI_Abort(MPI_COMM_WORLD, GRIDMISMATCH);
452  }
453 }
454 
455 
456 template <typename SR, typename IU, typename NUM, typename UDER>
458  (const SpParMat<IU,NUM,UDER> & A, const FullyDistSpVec<IU,IU> & x, bool indexisvalue, OptBuf<int32_t, typename promote_trait<NUM,IU>::T_promote > & optbuf);
459 
460 template <typename SR, typename IU, typename NUM, typename UDER>
462  (const SpParMat<IU,NUM,UDER> & A, const FullyDistSpVec<IU,IU> & x, bool indexisvalue)
463 {
464  typedef typename promote_trait<NUM,IU>::T_promote T_promote;
466  return SpMV<SR>(A, x, indexisvalue, optbuf);
467 }
468 
474 template<typename IU, typename NV>
475 void TransposeVector(MPI_Comm & World, const FullyDistSpVec<IU,NV> & x, int32_t & trxlocnz, IU & lenuntil, int32_t * & trxinds, NV * & trxnums, bool indexisvalue)
476 {
477  int32_t xlocnz = (int32_t) x.getlocnnz();
478  int32_t roffst = (int32_t) x.RowLenUntil(); // since trxinds is int32_t
479  int32_t roffset;
480  IU luntil = x.LengthUntil();
481  int diagneigh = x.commGrid->GetComplementRank();
482 
483  MPI_Status status;
484  MPI_Sendrecv(&roffst, 1, MPIType<int32_t>(), diagneigh, TROST, &roffset, 1, MPIType<int32_t>(), diagneigh, TROST, World, &status);
485  MPI_Sendrecv(&xlocnz, 1, MPIType<int32_t>(), diagneigh, TRNNZ, &trxlocnz, 1, MPIType<int32_t>(), diagneigh, TRNNZ, World, &status);
486  MPI_Sendrecv(&luntil, 1, MPIType<IU>(), diagneigh, TRLUT, &lenuntil, 1, MPIType<IU>(), diagneigh, TRLUT, World, &status);
487 
488  // ABAB: Important observation is that local indices (given by x.ind) is 32-bit addressible
489  // Copy them to 32 bit integers and transfer that to save 50% of off-node bandwidth
490  trxinds = new int32_t[trxlocnz];
491  int32_t * temp_xind = new int32_t[xlocnz];
492  for(int i=0; i< xlocnz; ++i) temp_xind[i] = (int32_t) x.ind[i];
493  MPI_Sendrecv(temp_xind, xlocnz, MPIType<int32_t>(), diagneigh, TRI, trxinds, trxlocnz, MPIType<int32_t>(), diagneigh, TRI, World, &status);
494  delete [] temp_xind;
495  if(!indexisvalue)
496  {
497  trxnums = new NV[trxlocnz];
498  MPI_Sendrecv(const_cast<NV*>(SpHelper::p2a(x.num)), xlocnz, MPIType<NV>(), diagneigh, TRX, trxnums, trxlocnz, MPIType<NV>(), diagneigh, TRX, World, &status);
499  }
500  transform(trxinds, trxinds+trxlocnz, trxinds, bind2nd(plus<int32_t>(), roffset)); // fullydist indexing (p pieces) -> matrix indexing (sqrt(p) pieces)
501 }
502 
510 template<typename IU, typename NV>
511 void AllGatherVector(MPI_Comm & ColWorld, int trxlocnz, IU lenuntil, int32_t * & trxinds, NV * & trxnums,
512  int32_t * & indacc, NV * & numacc, int & accnz, bool indexisvalue)
513 {
514  int colneighs, colrank;
515  MPI_Comm_size(ColWorld, &colneighs);
516  MPI_Comm_rank(ColWorld, &colrank);
517  int * colnz = new int[colneighs];
518  colnz[colrank] = trxlocnz;
519  MPI_Allgather(MPI_IN_PLACE, 1, MPI_INT, colnz, 1, MPI_INT, ColWorld);
520  int * dpls = new int[colneighs](); // displacements (zero initialized pid)
521  std::partial_sum(colnz, colnz+colneighs-1, dpls+1);
522  accnz = std::accumulate(colnz, colnz+colneighs, 0);
523  indacc = new int32_t[accnz];
524  numacc = new NV[accnz];
525 
526  // ABAB: Future issues here, colnz is of type int (MPI limitation)
527  // What if the aggregate vector size along the processor row/column is not 32-bit addressible?
528  // This will happen when n/sqrt(p) > 2^31
529  // Currently we can solve a small problem (scale 32) with 4096 processor
530  // For a medium problem (scale 35), we'll need 32K processors which gives sqrt(p) ~ 180
531  // 2^35 / 180 ~ 2^29 / 3 which is not an issue !
532 
533 #ifdef TIMING
534  double t0=MPI_Wtime();
535 #endif
536  MPI_Allgatherv(trxinds, trxlocnz, MPIType<int32_t>(), indacc, colnz, dpls, MPIType<int32_t>(), ColWorld);
537 
538  delete [] trxinds;
539  if(indexisvalue)
540  {
541  IU lenuntilcol;
542  if(colrank == 0) lenuntilcol = lenuntil;
543  MPI_Bcast(&lenuntilcol, 1, MPIType<IU>(), 0, ColWorld);
544  for(int i=0; i< accnz; ++i) // fill numerical values from indices
545  {
546  numacc[i] = indacc[i] + lenuntilcol;
547  }
548  }
549  else
550  {
551  MPI_Allgatherv(trxnums, trxlocnz, MPIType<NV>(), numacc, colnz, dpls, MPIType<NV>(), ColWorld);
552  delete [] trxnums;
553  }
554 #ifdef TIMING
555  double t1=MPI_Wtime();
556  cblas_allgathertime += (t1-t0);
557 #endif
558  DeleteAll(colnz,dpls);
559 }
560 
561 
562 
569 template<typename SR, typename IVT, typename OVT, typename IU, typename NUM, typename UDER>
570 void LocalSpMV(const SpParMat<IU,NUM,UDER> & A, int rowneighs, OptBuf<int32_t, OVT > & optbuf, int32_t * & indacc, IVT * & numacc,
571  int32_t * & sendindbuf, OVT * & sendnumbuf, int * & sdispls, int * sendcnt, int accnz, bool indexisvalue)
572 {
573  if(optbuf.totmax > 0) // graph500 optimization enabled
574  {
575  if(A.spSeq->getnsplit() > 0)
576  {
577  // optbuf.{inds/nums/dspls} and sendcnt are all pre-allocated and only filled by dcsc_gespmv_threaded
578  dcsc_gespmv_threaded_setbuffers<SR> (*(A.spSeq), indacc, numacc, accnz, optbuf.inds, optbuf.nums, sendcnt, optbuf.dspls, rowneighs);
579  }
580  else
581  {
582  dcsc_gespmv<SR> (*(A.spSeq), indacc, numacc, accnz, optbuf.inds, optbuf.nums, sendcnt, optbuf.dspls, rowneighs, indexisvalue);
583  }
584  DeleteAll(indacc,numacc);
585  }
586  else
587  {
588  if(A.spSeq->getnsplit() > 0)
589  {
590  // sendindbuf/sendnumbuf/sdispls are all allocated and filled by dcsc_gespmv_threaded
591  int totalsent = dcsc_gespmv_threaded<SR> (*(A.spSeq), indacc, numacc, accnz, sendindbuf, sendnumbuf, sdispls, rowneighs);
592 
593  DeleteAll(indacc, numacc);
594  for(int i=0; i<rowneighs-1; ++i)
595  sendcnt[i] = sdispls[i+1] - sdispls[i];
596  sendcnt[rowneighs-1] = totalsent - sdispls[rowneighs-1];
597  }
598  else
599  {
600  // serial SpMV with sparse vector
601  vector< int32_t > indy;
602  vector< OVT > numy;
603 
604  dcsc_gespmv<SR>(*(A.spSeq), indacc, numacc, accnz, indy, numy); // actual multiplication
605  DeleteAll(indacc, numacc);
606 
607  int32_t bufsize = indy.size(); // as compact as possible
608  sendindbuf = new int32_t[bufsize];
609  sendnumbuf = new OVT[bufsize];
610  int32_t perproc = A.getlocalrows() / rowneighs;
611 
612  int k = 0; // index to buffer
613  for(int i=0; i<rowneighs; ++i)
614  {
615  int32_t end_this = (i==rowneighs-1) ? A.getlocalrows(): (i+1)*perproc;
616  while(k < bufsize && indy[k] < end_this)
617  {
618  sendindbuf[k] = indy[k] - i*perproc;
619  sendnumbuf[k] = numy[k];
620  ++sendcnt[i];
621  ++k;
622  }
623  }
624  sdispls = new int[rowneighs]();
625  partial_sum(sendcnt, sendcnt+rowneighs-1, sdispls+1);
626  }
627  }
628 }
629 
630 template <typename SR, typename IU, typename OVT>
631 void MergeContributions(FullyDistSpVec<IU,OVT> & y, int * & recvcnt, int * & rdispls, int32_t * & recvindbuf, OVT * & recvnumbuf, int rowneighs)
632 {
633  // free memory of y, in case it was aliased
634  vector<IU>().swap(y.ind);
635  vector<OVT>().swap(y.num);
636 
637 #ifndef HEAPMERGE
638  IU ysize = y.MyLocLength(); // my local length is only O(n/p)
639  bool * isthere = new bool[ysize];
640  vector< pair<IU,OVT> > ts_pairs;
641  fill_n(isthere, ysize, false);
642 
643  // We don't need to keep a "merger" because minimum will always come from the processor
644  // with the smallest rank; so a linear sweep over the received buffer is enough
645  for(int i=0; i<rowneighs; ++i)
646  {
647  for(int j=0; j< recvcnt[i]; ++j)
648  {
649  int32_t index = recvindbuf[rdispls[i] + j];
650  if(!isthere[index])
651  ts_pairs.push_back(make_pair(index, recvnumbuf[rdispls[i] + j]));
652 
653  }
654  }
655  DeleteAll(recvcnt, rdispls);
656  DeleteAll(isthere, recvindbuf, recvnumbuf);
657  sort(ts_pairs.begin(), ts_pairs.end());
658  int nnzy = ts_pairs.size();
659  y.ind.resize(nnzy);
660  y.num.resize(nnzy);
661  for(int i=0; i< nnzy; ++i)
662  {
663  y.ind[i] = ts_pairs[i].first;
664  y.num[i] = ts_pairs[i].second;
665  }
666 #else
667  // Alternative 2: Heap-merge
668  int32_t hsize = 0;
669  int32_t inf = numeric_limits<int32_t>::min();
670  int32_t sup = numeric_limits<int32_t>::max();
671  KNHeap< int32_t, int32_t > sHeap(sup, inf);
672  int * processed = new int[rowneighs]();
673  for(int i=0; i<rowneighs; ++i)
674  {
675  if(recvcnt[i] > 0)
676  {
677  // key, proc_id
678  sHeap.insert(recvindbuf[rdispls[i]], i);
679  ++hsize;
680  }
681  }
682  int32_t key, locv;
683  if(hsize > 0)
684  {
685  sHeap.deleteMin(&key, &locv);
686  y.ind.push_back( static_cast<IU>(key));
687  y.num.push_back(recvnumbuf[rdispls[locv]]); // nothing is processed yet
688 
689  if( (++(processed[locv])) < recvcnt[locv] )
690  sHeap.insert(recvindbuf[rdispls[locv]+processed[locv]], locv);
691  else
692  --hsize;
693  }
694  while(hsize > 0)
695  {
696  sHeap.deleteMin(&key, &locv);
697  IU deref = rdispls[locv] + processed[locv];
698  if(y.ind.back() == static_cast<IU>(key)) // y.ind is surely not empty
699  {
700  y.num.back() = SR::add(y.num.back(), recvnumbuf[deref]);
701  // ABAB: Benchmark actually allows us to be non-deterministic in terms of parent selection
702  // We can just skip this addition operator (if it's a max/min select)
703  }
704  else
705  {
706  y.ind.push_back(static_cast<IU>(key));
707  y.num.push_back(recvnumbuf[deref]);
708  }
709 
710  if( (++(processed[locv])) < recvcnt[locv] )
711  sHeap.insert(recvindbuf[rdispls[locv]+processed[locv]], locv);
712  else
713  --hsize;
714  }
715  DeleteAll(recvcnt, rdispls,processed);
716  DeleteAll(recvindbuf, recvnumbuf);
717 #endif
718 
719 }
720 
727 template <typename SR, typename IVT, typename OVT, typename IU, typename NUM, typename UDER>
729  bool indexisvalue, OptBuf<int32_t, OVT > & optbuf)
730 {
731  CheckSpMVCompliance(A,x);
732  optbuf.MarkEmpty();
733 
734  MPI_Comm World = x.commGrid->GetWorld();
735  MPI_Comm ColWorld = x.commGrid->GetColWorld();
736  MPI_Comm RowWorld = x.commGrid->GetRowWorld();
737 
738  int accnz;
739  int32_t trxlocnz;
740  IU lenuntil;
741  int32_t *trxinds, *indacc;
742  IVT *trxnums, *numacc;
743  TransposeVector(World, x, trxlocnz, lenuntil, trxinds, trxnums, indexisvalue);
744  AllGatherVector(ColWorld, trxlocnz, lenuntil, trxinds, trxnums, indacc, numacc, accnz, indexisvalue);
745 
746  int rowneighs;
747  MPI_Comm_size(RowWorld, &rowneighs);
748  int * sendcnt = new int[rowneighs]();
749  int32_t * sendindbuf;
750  OVT * sendnumbuf;
751  int * sdispls;
752  LocalSpMV<SR>(A, rowneighs, optbuf, indacc, numacc, sendindbuf, sendnumbuf, sdispls, sendcnt, accnz, indexisvalue); // indacc/numacc deallocated, sendindbuf/sendnumbuf/sdispls allocated
753 
754  int * rdispls = new int[rowneighs];
755  int * recvcnt = new int[rowneighs];
756  MPI_Alltoall(sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, RowWorld); // share the request counts
757 
758  // receive displacements are exact whereas send displacements have slack
759  rdispls[0] = 0;
760  for(int i=0; i<rowneighs-1; ++i)
761  {
762  rdispls[i+1] = rdispls[i] + recvcnt[i];
763  }
764  int totrecv = accumulate(recvcnt,recvcnt+rowneighs,0);
765  int32_t * recvindbuf = new int32_t[totrecv];
766  OVT * recvnumbuf = new OVT[totrecv];
767 
768 #ifdef TIMING
769  double t2=MPI_Wtime();
770 #endif
771  if(optbuf.totmax > 0 ) // graph500 optimization enabled
772  {
773  MPI_Alltoallv(optbuf.inds, sendcnt, optbuf.dspls, MPIType<int32_t>(), recvindbuf, recvcnt, rdispls, MPIType<int32_t>(), RowWorld);
774  MPI_Alltoallv(optbuf.nums, sendcnt, optbuf.dspls, MPIType<OVT>(), recvnumbuf, recvcnt, rdispls, MPIType<OVT>(), RowWorld);
775 
776  delete [] sendcnt;
777  }
778  else
779  {
780  /* ofstream oput;
781  x.commGrid->OpenDebugFile("Send", oput);
782  oput << "To displacements: "; copy(sdispls, sdispls+rowneighs, ostream_iterator<int>(oput, " ")); oput << endl;
783  oput << "To counts: "; copy(sendcnt, sendcnt+rowneighs, ostream_iterator<int>(oput, " ")); oput << endl;
784  for(int i=0; i< rowneighs; ++i)
785  {
786  oput << "To neighbor: " << i << endl;
787  copy(sendindbuf+sdispls[i], sendindbuf+sdispls[i]+sendcnt[i], ostream_iterator<int32_t>(oput, " ")); oput << endl;
788  copy(sendnumbuf+sdispls[i], sendnumbuf+sdispls[i]+sendcnt[i], ostream_iterator<OVT>(oput, " ")); oput << endl;
789  }
790  oput.close(); */
791 
792  MPI_Alltoallv(sendindbuf, sendcnt, sdispls, MPIType<int32_t>(), recvindbuf, recvcnt, rdispls, MPIType<int32_t>(), RowWorld);
793  MPI_Alltoallv(sendnumbuf, sendcnt, sdispls, MPIType<OVT>(), recvnumbuf, recvcnt, rdispls, MPIType<OVT>(), RowWorld);
794 
795  DeleteAll(sendindbuf, sendnumbuf);
796  DeleteAll(sendcnt, sdispls);
797  }
798 #ifdef TIMING
799  double t3=MPI_Wtime();
800  cblas_alltoalltime += (t3-t2);
801 #endif
802 
803  // ofstream output;
804  // A.commGrid->OpenDebugFile("Recv", output);
805  // copy(recvindbuf, recvindbuf+totrecv, ostream_iterator<IU>(output," ")); output << endl;
806  // output.close();
807 
808  MergeContributions<SR>(y,recvcnt, rdispls, recvindbuf, recvnumbuf, rowneighs);
809 }
810 
811 
812 template <typename SR, typename IVT, typename OVT, typename IU, typename NUM, typename UDER>
813 void SpMV (const SpParMat<IU,NUM,UDER> & A, const FullyDistSpVec<IU,IVT> & x, FullyDistSpVec<IU,OVT> & y, bool indexisvalue)
814 {
816  SpMV<SR>(A, x, y, indexisvalue, optbuf);
817 }
818 
819 
824 template <typename SR, typename IU, typename NUM, typename UDER>
826 (const SpParMat<IU,NUM,UDER> & A, const FullyDistSpVec<IU,IU> & x, bool indexisvalue, OptBuf<int32_t, typename promote_trait<NUM,IU>::T_promote > & optbuf)
827 {
828  typedef typename promote_trait<NUM,IU>::T_promote T_promote;
829  FullyDistSpVec<IU, T_promote> y ( x.getcommgrid(), A.getnrow()); // identity doesn't matter for sparse vectors
830  SpMV<SR>(A, x, y, indexisvalue, optbuf);
831  return y;
832 }
833 
837 template <typename SR, typename IU, typename NUM, typename NUV, typename UDER>
840 {
841  typedef typename promote_trait<NUM,NUV>::T_promote T_promote;
842  CheckSpMVCompliance(A, x);
843 
844  MPI_Comm World = x.commGrid->GetWorld();
845  MPI_Comm ColWorld = x.commGrid->GetColWorld();
846  MPI_Comm RowWorld = x.commGrid->GetRowWorld();
847 
848  int xsize = (int) x.LocArrSize();
849  int trxsize = 0;
850 
851  int diagneigh = x.commGrid->GetComplementRank();
852  MPI_Status status;
853  MPI_Sendrecv(&xsize, 1, MPI_INT, diagneigh, TRX, &trxsize, 1, MPI_INT, diagneigh, TRX, World, &status);
854 
855  NUV * trxnums = new NUV[trxsize];
856  MPI_Sendrecv(const_cast<NUV*>(SpHelper::p2a(x.arr)), xsize, MPIType<NUV>(), diagneigh, TRX, trxnums, trxsize, MPIType<NUV>(), diagneigh, TRX, World, &status);
857 
858  int colneighs, colrank;
859  MPI_Comm_size(ColWorld, &colneighs);
860  MPI_Comm_rank(ColWorld, &colrank);
861  int * colsize = new int[colneighs];
862  colsize[colrank] = trxsize;
863  MPI_Allgather(MPI_IN_PLACE, 1, MPI_INT, colsize, 1, MPI_INT, ColWorld);
864  int * dpls = new int[colneighs](); // displacements (zero initialized pid)
865  std::partial_sum(colsize, colsize+colneighs-1, dpls+1);
866  int accsize = std::accumulate(colsize, colsize+colneighs, 0);
867  NUV * numacc = new NUV[accsize];
868 
869  MPI_Allgatherv(trxnums, trxsize, MPIType<NUV>(), numacc, colsize, dpls, MPIType<NUV>(), ColWorld);
870  delete [] trxnums;
871 
872  // serial SpMV with dense vector
873  T_promote id = SR::id();
874  IU ysize = A.getlocalrows();
875  T_promote * localy = new T_promote[ysize];
876  fill_n(localy, ysize, id);
877  dcsc_gespmv<SR>(*(A.spSeq), numacc, localy);
878 
879  DeleteAll(numacc,colsize, dpls);
880 
881  // FullyDistVec<IT,NT>(shared_ptr<CommGrid> grid, IT globallen, NT initval, NT id)
882  FullyDistVec<IU, T_promote> y ( x.commGrid, A.getnrow(), id);
883 
884  int rowneighs;
885  MPI_Comm_size(RowWorld, &rowneighs);
886 
887  IU begptr, endptr;
888  for(int i=0; i< rowneighs; ++i)
889  {
890  begptr = y.RowLenUntil(i);
891  if(i == rowneighs-1)
892  {
893  endptr = ysize;
894  }
895  else
896  {
897  endptr = y.RowLenUntil(i+1);
898  }
899  MPI_Reduce(localy+begptr, SpHelper::p2a(y.arr), endptr-begptr, MPIType<T_promote>(), SR::mpi_op(), i, RowWorld);
900  }
901  delete [] localy;
902  return y;
903 }
904 
905 
911 template <typename SR, typename IU, typename NUM, typename NUV, typename UDER>
914 {
915  typedef typename promote_trait<NUM,NUV>::T_promote T_promote;
916  CheckSpMVCompliance(A, x);
917 
918  MPI_Comm World = x.commGrid->GetWorld();
919  MPI_Comm ColWorld = x.commGrid->GetColWorld();
920  MPI_Comm RowWorld = x.commGrid->GetRowWorld();
921 
922  int xlocnz = (int) x.getlocnnz();
923  int trxlocnz = 0;
924  int roffst = x.RowLenUntil();
925  int offset;
926 
927  int diagneigh = x.commGrid->GetComplementRank();
928  MPI_Status status;
929  MPI_Sendrecv(&xlocnz, 1, MPI_INT, diagneigh, TRX, &trxlocnz, 1, MPI_INT, diagneigh, TRX, World, &status);
930  MPI_Sendrecv(&roffst, 1, MPI_INT, diagneigh, TROST, &offset, 1, MPI_INT, diagneigh, TROST, World, &status);
931 
932  IU * trxinds = new IU[trxlocnz];
933  NUV * trxnums = new NUV[trxlocnz];
934  MPI_Sendrecv(const_cast<IU*>(SpHelper::p2a(x.ind)), xlocnz, MPIType<IU>(), diagneigh, TRX, trxinds, trxlocnz, MPIType<IU>(), diagneigh, TRX, World, &status);
935  MPI_Sendrecv(const_cast<NUV*>(SpHelper::p2a(x.num)), xlocnz, MPIType<NUV>(), diagneigh, TRX, trxnums, trxlocnz, MPIType<NUV>(), diagneigh, TRX, World, &status);
936  transform(trxinds, trxinds+trxlocnz, trxinds, bind2nd(plus<IU>(), offset)); // fullydist indexing (n pieces) -> matrix indexing (sqrt(p) pieces)
937 
938  int colneighs, colrank;
939  MPI_Comm_size(ColWorld, &colneighs);
940  MPI_Comm_rank(ColWorld, &colrank);
941  int * colnz = new int[colneighs];
942  colnz[colrank] = trxlocnz;
943  MPI_Allgather(MPI_IN_PLACE, 1, MPI_INT, colnz, 1, MPI_INT, ColWorld);
944  int * dpls = new int[colneighs](); // displacements (zero initialized pid)
945  std::partial_sum(colnz, colnz+colneighs-1, dpls+1);
946  int accnz = std::accumulate(colnz, colnz+colneighs, 0);
947  IU * indacc = new IU[accnz];
948  NUV * numacc = new NUV[accnz];
949 
950  // ABAB: Future issues here, colnz is of type int (MPI limitation)
951  // What if the aggregate vector size along the processor row/column is not 32-bit addressible?
952  MPI_Allgatherv(trxinds, trxlocnz, MPIType<IU>(), indacc, colnz, dpls, MPIType<IU>(), ColWorld);
953  MPI_Allgatherv(trxnums, trxlocnz, MPIType<NUV>(), numacc, colnz, dpls, MPIType<NUV>(), ColWorld);
954  DeleteAll(trxinds, trxnums);
955 
956  // serial SpMV with sparse vector
957  vector< int32_t > indy;
958  vector< T_promote > numy;
959 
960  int32_t * tmpindacc = new int32_t[accnz];
961  for(int i=0; i< accnz; ++i) tmpindacc[i] = indacc[i];
962  delete [] indacc;
963 
964  dcsc_gespmv<SR>(*(A.spSeq), tmpindacc, numacc, accnz, indy, numy); // actual multiplication
965 
966  DeleteAll(tmpindacc, numacc);
967  DeleteAll(colnz, dpls);
968 
969  FullyDistSpVec<IU, T_promote> y ( x.commGrid, A.getnrow()); // identity doesn't matter for sparse vectors
970  IU yintlen = y.MyRowLength();
971 
972  int rowneighs;
973  MPI_Comm_size(RowWorld,&rowneighs);
974  vector< vector<IU> > sendind(rowneighs);
975  vector< vector<T_promote> > sendnum(rowneighs);
976  typename vector<int32_t>::size_type outnz = indy.size();
977  for(typename vector<IU>::size_type i=0; i< outnz; ++i)
978  {
979  IU locind;
980  int rown = y.OwnerWithinRow(yintlen, static_cast<IU>(indy[i]), locind);
981  sendind[rown].push_back(locind);
982  sendnum[rown].push_back(numy[i]);
983  }
984 
985  IU * sendindbuf = new IU[outnz];
986  T_promote * sendnumbuf = new T_promote[outnz];
987  int * sendcnt = new int[rowneighs];
988  int * sdispls = new int[rowneighs];
989  for(int i=0; i<rowneighs; ++i)
990  sendcnt[i] = sendind[i].size();
991 
992  int * rdispls = new int[rowneighs];
993  int * recvcnt = new int[rowneighs];
994  MPI_Alltoall(sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, RowWorld); // share the request counts
995 
996  sdispls[0] = 0;
997  rdispls[0] = 0;
998  for(int i=0; i<rowneighs-1; ++i)
999  {
1000  sdispls[i+1] = sdispls[i] + sendcnt[i];
1001  rdispls[i+1] = rdispls[i] + recvcnt[i];
1002  }
1003  int totrecv = accumulate(recvcnt,recvcnt+rowneighs,0);
1004  IU * recvindbuf = new IU[totrecv];
1005  T_promote * recvnumbuf = new T_promote[totrecv];
1006 
1007  for(int i=0; i<rowneighs; ++i)
1008  {
1009  copy(sendind[i].begin(), sendind[i].end(), sendindbuf+sdispls[i]);
1010  vector<IU>().swap(sendind[i]);
1011  }
1012  for(int i=0; i<rowneighs; ++i)
1013  {
1014  copy(sendnum[i].begin(), sendnum[i].end(), sendnumbuf+sdispls[i]);
1015  vector<T_promote>().swap(sendnum[i]);
1016  }
1017  MPI_Alltoallv(sendindbuf, sendcnt, sdispls, MPIType<IU>(), recvindbuf, recvcnt, rdispls, MPIType<IU>(), RowWorld);
1018  MPI_Alltoallv(sendnumbuf, sendcnt, sdispls, MPIType<T_promote>(), recvnumbuf, recvcnt, rdispls, MPIType<T_promote>(), RowWorld);
1019 
1020  DeleteAll(sendindbuf, sendnumbuf);
1021  DeleteAll(sendcnt, recvcnt, sdispls, rdispls);
1022 
1023  // define a SPA-like data structure
1024  IU ysize = y.MyLocLength();
1025  T_promote * localy = new T_promote[ysize];
1026  bool * isthere = new bool[ysize];
1027  vector<IU> nzinds; // nonzero indices
1028  fill_n(isthere, ysize, false);
1029 
1030  for(int i=0; i< totrecv; ++i)
1031  {
1032  if(!isthere[recvindbuf[i]])
1033  {
1034  localy[recvindbuf[i]] = recvnumbuf[i]; // initial assignment
1035  nzinds.push_back(recvindbuf[i]);
1036  isthere[recvindbuf[i]] = true;
1037  }
1038  else
1039  {
1040  localy[recvindbuf[i]] = SR::add(localy[recvindbuf[i]], recvnumbuf[i]);
1041  }
1042  }
1043  DeleteAll(isthere, recvindbuf, recvnumbuf);
1044  sort(nzinds.begin(), nzinds.end());
1045  int nnzy = nzinds.size();
1046  y.ind.resize(nnzy);
1047  y.num.resize(nnzy);
1048  for(int i=0; i< nnzy; ++i)
1049  {
1050  y.ind[i] = nzinds[i];
1051  y.num[i] = localy[nzinds[i]];
1052  }
1053  delete [] localy;
1054  return y;
1055 }
1056 
1057 
1058 template <typename IU, typename NU1, typename NU2, typename UDERA, typename UDERB>
1060  (const SpParMat<IU,NU1,UDERA> & A, const SpParMat<IU,NU2,UDERB> & B , bool exclude)
1061 {
1062  typedef typename promote_trait<NU1,NU2>::T_promote N_promote;
1063  typedef typename promote_trait<UDERA,UDERB>::T_promote DER_promote;
1064 
1065  if(*(A.commGrid) == *(B.commGrid))
1066  {
1067  DER_promote * result = new DER_promote( EWiseMult(*(A.spSeq),*(B.spSeq),exclude) );
1068  return SpParMat<IU, N_promote, DER_promote> (result, A.commGrid);
1069  }
1070  else
1071  {
1072  cout << "Grids are not comparable elementwise multiplication" << endl;
1073  MPI_Abort(MPI_COMM_WORLD, GRIDMISMATCH);
1075  }
1076 }
1077 
1078 template <typename RETT, typename RETDER, typename IU, typename NU1, typename NU2, typename UDERA, typename UDERB, typename _BinaryOperation>
1080  (const SpParMat<IU,NU1,UDERA> & A, const SpParMat<IU,NU2,UDERB> & B, _BinaryOperation __binary_op, bool notB, const NU2& defaultBVal)
1081 {
1082  if(*(A.commGrid) == *(B.commGrid))
1083  {
1084  RETDER * result = new RETDER( EWiseApply<RETT>(*(A.spSeq),*(B.spSeq), __binary_op, notB, defaultBVal) );
1085  return SpParMat<IU, RETT, RETDER> (result, A.commGrid);
1086  }
1087  else
1088  {
1089  cout << "Grids are not comparable elementwise apply" << endl;
1090  MPI_Abort(MPI_COMM_WORLD, GRIDMISMATCH);
1091  return SpParMat< IU,RETT,RETDER >();
1092  }
1093 }
1094 
1095 template <typename RETT, typename RETDER, typename IU, typename NU1, typename NU2, typename UDERA, typename UDERB, typename _BinaryOperation, typename _BinaryPredicate>
1097  (const SpParMat<IU,NU1,UDERA> & A, const SpParMat<IU,NU2,UDERB> & B, _BinaryOperation __binary_op, _BinaryPredicate do_op, bool allowANulls, bool allowBNulls, const NU1& ANullVal, const NU2& BNullVal, const bool allowIntersect, const bool useExtendedBinOp)
1098 {
1099  if(*(A.commGrid) == *(B.commGrid))
1100  {
1101  RETDER * result = new RETDER( EWiseApply<RETT>(*(A.spSeq),*(B.spSeq), __binary_op, do_op, allowANulls, allowBNulls, ANullVal, BNullVal, allowIntersect) );
1102  return SpParMat<IU, RETT, RETDER> (result, A.commGrid);
1103  }
1104  else
1105  {
1106  cout << "Grids are not comparable elementwise apply" << endl;
1107  MPI_Abort(MPI_COMM_WORLD, GRIDMISMATCH);
1108  return SpParMat< IU,RETT,RETDER >();
1109  }
1110 }
1111 
1112 // plain adapter
1113 template <typename RETT, typename RETDER, typename IU, typename NU1, typename NU2, typename UDERA, typename UDERB, typename _BinaryOperation, typename _BinaryPredicate>
1115 EWiseApply (const SpParMat<IU,NU1,UDERA> & A, const SpParMat<IU,NU2,UDERB> & B, _BinaryOperation __binary_op, _BinaryPredicate do_op, bool allowANulls, bool allowBNulls, const NU1& ANullVal, const NU2& BNullVal, const bool allowIntersect = true)
1116 {
1117  return EWiseApply<RETT, RETDER>(A, B,
1120  allowANulls, allowBNulls, ANullVal, BNullVal, allowIntersect, true);
1121 }
1122 // end adapter
1123 
1124 
1129 template <typename IU, typename NU1, typename NU2>
1131  (const SpParVec<IU,NU1> & V, const DenseParVec<IU,NU2> & W , bool exclude, NU2 zero)
1132 {
1133  typedef typename promote_trait<NU1,NU2>::T_promote T_promote;
1134 
1135  if(*(V.commGrid) == *(W.commGrid))
1136  {
1137  SpParVec< IU, T_promote> Product(V.commGrid);
1138  Product.length = V.length;
1139  if(Product.diagonal)
1140  {
1141  if(exclude)
1142  {
1143  IU size= V.ind.size();
1144  for(IU i=0; i<size; ++i)
1145  {
1146  if(W.arr.size() <= V.ind[i] || W.arr[V.ind[i]] == zero) // keep only those
1147  {
1148  Product.ind.push_back(V.ind[i]);
1149  Product.num.push_back(V.num[i]);
1150  }
1151  }
1152  }
1153  else
1154  {
1155  IU size= V.ind.size();
1156  for(IU i=0; i<size; ++i)
1157  {
1158  if(W.arr.size() > V.ind[i] && W.arr[V.ind[i]] != zero) // keep only those
1159  {
1160  Product.ind.push_back(V.ind[i]);
1161  Product.num.push_back(V.num[i] * W.arr[V.ind[i]]);
1162  }
1163  }
1164  }
1165  }
1166  return Product;
1167  }
1168  else
1169  {
1170  cout << "Grids are not comparable elementwise multiplication" << endl;
1171  MPI_Abort(MPI_COMM_WORLD, GRIDMISMATCH);
1172  return SpParVec< IU,T_promote>();
1173  }
1174 }
1175 
1180 template <typename IU, typename NU1, typename NU2>
1182  (const FullyDistSpVec<IU,NU1> & V, const FullyDistVec<IU,NU2> & W , bool exclude, NU2 zero)
1183 {
1184  typedef typename promote_trait<NU1,NU2>::T_promote T_promote;
1185 
1186  if(*(V.commGrid) == *(W.commGrid))
1187  {
1188  FullyDistSpVec< IU, T_promote> Product(V.commGrid);
1189  if(V.glen != W.glen)
1190  {
1191  cerr << "Vector dimensions don't match for EWiseMult\n";
1192  MPI_Abort(MPI_COMM_WORLD, DIMMISMATCH);
1193  }
1194  else
1195  {
1196  Product.glen = V.glen;
1197  IU size= V.getlocnnz();
1198  if(exclude)
1199  {
1200  #if defined(_OPENMP) && defined(CBLAS_EXPERIMENTAL) // not faster than serial
1201  int actual_splits = cblas_splits * 1; // 1 is the parallel slackness
1202  vector <IU> tlosizes (actual_splits, 0);
1203  vector < vector<IU> > tlinds(actual_splits);
1204  vector < vector<T_promote> > tlnums(actual_splits);
1205  IU tlsize = size / actual_splits;
1206  #pragma omp parallel for //schedule(dynamic, 1)
1207  for(IU t = 0; t < actual_splits; ++t)
1208  {
1209  IU tlbegin = t*tlsize;
1210  IU tlend = (t==actual_splits-1)? size : (t+1)*tlsize;
1211  for(IU i=tlbegin; i<tlend; ++i)
1212  {
1213  if(W.arr[V.ind[i]] == zero) // keep only those
1214  {
1215  tlinds[t].push_back(V.ind[i]);
1216  tlnums[t].push_back(V.num[i]);
1217  tlosizes[t]++;
1218  }
1219  }
1220  }
1221  vector<IU> prefix_sum(actual_splits+1,0);
1222  partial_sum(tlosizes.begin(), tlosizes.end(), prefix_sum.begin()+1);
1223  Product.ind.resize(prefix_sum[actual_splits]);
1224  Product.num.resize(prefix_sum[actual_splits]);
1225 
1226  #pragma omp parallel for //schedule(dynamic, 1)
1227  for(IU t=0; t< actual_splits; ++t)
1228  {
1229  copy(tlinds[t].begin(), tlinds[t].end(), Product.ind.begin()+prefix_sum[t]);
1230  copy(tlnums[t].begin(), tlnums[t].end(), Product.num.begin()+prefix_sum[t]);
1231  }
1232  #else
1233  for(IU i=0; i<size; ++i)
1234  {
1235  if(W.arr[V.ind[i]] == zero) // keep only those
1236  {
1237  Product.ind.push_back(V.ind[i]);
1238  Product.num.push_back(V.num[i]);
1239  }
1240  }
1241  #endif
1242  }
1243  else
1244  {
1245  for(IU i=0; i<size; ++i)
1246  {
1247  if(W.arr[V.ind[i]] != zero) // keep only those
1248  {
1249  Product.ind.push_back(V.ind[i]);
1250  Product.num.push_back(V.num[i] * W.arr[V.ind[i]]);
1251  }
1252  }
1253  }
1254  }
1255  return Product;
1256  }
1257  else
1258  {
1259  cout << "Grids are not comparable elementwise multiplication" << endl;
1260  MPI_Abort(MPI_COMM_WORLD, GRIDMISMATCH);
1262  }
1263 }
1264 
1265 
1283 template <typename RET, typename IU, typename NU1, typename NU2, typename _BinaryOperation, typename _BinaryPredicate>
1285  (const FullyDistSpVec<IU,NU1> & V, const FullyDistVec<IU,NU2> & W , _BinaryOperation _binary_op, _BinaryPredicate _doOp, bool allowVNulls, NU1 Vzero, const bool useExtendedBinOp)
1286 {
1287  typedef RET T_promote; //typedef typename promote_trait<NU1,NU2>::T_promote T_promote;
1288  if(*(V.commGrid) == *(W.commGrid))
1289  {
1290  FullyDistSpVec< IU, T_promote> Product(V.commGrid);
1291  FullyDistVec< IU, NU1> DV (V);
1292  if(V.TotalLength() != W.TotalLength())
1293  {
1294  ostringstream outs;
1295  outs << "Vector dimensions don't match (" << V.TotalLength() << " vs " << W.TotalLength() << ") for EWiseApply (short version)\n";
1296  SpParHelper::Print(outs.str());
1297  MPI_Abort(MPI_COMM_WORLD, DIMMISMATCH);
1298  }
1299  else
1300  {
1301  Product.glen = V.glen;
1302  IU size= W.LocArrSize();
1303  IU spsize = V.getlocnnz();
1304  IU sp_iter = 0;
1305  if (allowVNulls)
1306  {
1307  // iterate over the dense vector
1308  for(IU i=0; i<size; ++i)
1309  {
1310  if(sp_iter < spsize && V.ind[sp_iter] == i)
1311  {
1312  if (_doOp(V.num[sp_iter], W.arr[i], false, false))
1313  {
1314  Product.ind.push_back(i);
1315  Product.num.push_back(_binary_op(V.num[sp_iter], W.arr[i], false, false));
1316  }
1317  sp_iter++;
1318  }
1319  else
1320  {
1321  if (_doOp(Vzero, W.arr[i], true, false))
1322  {
1323  Product.ind.push_back(i);
1324  Product.num.push_back(_binary_op(Vzero, W.arr[i], true, false));
1325  }
1326  }
1327  }
1328  }
1329  else
1330  {
1331  // iterate over the sparse vector
1332  for(sp_iter = 0; sp_iter < spsize; ++sp_iter)
1333  {
1334  if (_doOp(V.num[sp_iter], W.arr[V.ind[sp_iter]], false, false))
1335  {
1336  Product.ind.push_back(V.ind[sp_iter]);
1337  Product.num.push_back(_binary_op(V.num[sp_iter], W.arr[V.ind[sp_iter]], false, false));
1338  }
1339  }
1340  }
1341  }
1342  return Product;
1343  }
1344  else
1345  {
1346  cout << "Grids are not comparable for EWiseApply" << endl;
1347  MPI_Abort(MPI_COMM_WORLD, GRIDMISMATCH);
1349  }
1350 }
1351 
1373 template <typename RET, typename IU, typename NU1, typename NU2, typename _BinaryOperation, typename _BinaryPredicate>
1375  (const FullyDistSpVec<IU,NU1> & V, const FullyDistSpVec<IU,NU2> & W , _BinaryOperation _binary_op, _BinaryPredicate _doOp, bool allowVNulls, bool allowWNulls, NU1 Vzero, NU2 Wzero, const bool allowIntersect, const bool useExtendedBinOp)
1376 {
1377  typedef RET T_promote; // typename promote_trait<NU1,NU2>::T_promote T_promote;
1378  if(*(V.commGrid) == *(W.commGrid))
1379  {
1380  FullyDistSpVec< IU, T_promote> Product(V.commGrid);
1381  if(V.glen != W.glen)
1382  {
1383  ostringstream outs;
1384  outs << "Vector dimensions don't match (" << V.glen << " vs " << W.glen << ") for EWiseApply (full version)\n";
1385  SpParHelper::Print(outs.str());
1386  MPI_Abort(MPI_COMM_WORLD, DIMMISMATCH);
1387  }
1388  else
1389  {
1390  Product.glen = V.glen;
1391  typename vector< IU >::const_iterator indV = V.ind.begin();
1392  typename vector< NU1 >::const_iterator numV = V.num.begin();
1393  typename vector< IU >::const_iterator indW = W.ind.begin();
1394  typename vector< NU2 >::const_iterator numW = W.num.begin();
1395 
1396  while (indV < V.ind.end() && indW < W.ind.end())
1397  {
1398  if (*indV == *indW)
1399  {
1400  // overlap
1401  if (allowIntersect)
1402  {
1403  if (_doOp(*numV, *numW, false, false))
1404  {
1405  Product.ind.push_back(*indV);
1406  Product.num.push_back(_binary_op(*numV, *numW, false, false));
1407  }
1408  }
1409  indV++; numV++;
1410  indW++; numW++;
1411  }
1412  else if (*indV < *indW)
1413  {
1414  // V has value but W does not
1415  if (allowWNulls)
1416  {
1417  if (_doOp(*numV, Wzero, false, true))
1418  {
1419  Product.ind.push_back(*indV);
1420  Product.num.push_back(_binary_op(*numV, Wzero, false, true));
1421  }
1422  }
1423  indV++; numV++;
1424  }
1425  else //(*indV > *indW)
1426  {
1427  // W has value but V does not
1428  if (allowVNulls)
1429  {
1430  if (_doOp(Vzero, *numW, true, false))
1431  {
1432  Product.ind.push_back(*indW);
1433  Product.num.push_back(_binary_op(Vzero, *numW, true, false));
1434  }
1435  }
1436  indW++; numW++;
1437  }
1438  }
1439  // clean up
1440  while (allowWNulls && indV < V.ind.end())
1441  {
1442  if (_doOp(*numV, Wzero, false, true))
1443  {
1444  Product.ind.push_back(*indV);
1445  Product.num.push_back(_binary_op(*numV, Wzero, false, true));
1446  }
1447  indV++; numV++;
1448  }
1449  while (allowVNulls && indW < W.ind.end())
1450  {
1451  if (_doOp(Vzero, *numW, true, false))
1452  {
1453  Product.ind.push_back(*indW);
1454  Product.num.push_back(_binary_op(Vzero, *numW, true, false));
1455  }
1456  indW++; numW++;
1457  }
1458  }
1459  return Product;
1460  }
1461  else
1462  {
1463  cout << "Grids are not comparable for EWiseApply" << endl;
1464  MPI_Abort(MPI_COMM_WORLD, GRIDMISMATCH);
1466  }
1467 }
1468 
1469 // plain callback versions
1470 template <typename RET, typename IU, typename NU1, typename NU2, typename _BinaryOperation, typename _BinaryPredicate>
1472  (const FullyDistSpVec<IU,NU1> & V, const FullyDistVec<IU,NU2> & W , _BinaryOperation _binary_op, _BinaryPredicate _doOp, bool allowVNulls, NU1 Vzero)
1473 {
1474  return EWiseApply<RET>(V, W,
1477  allowVNulls, Vzero, true);
1478 }
1479 
1480 template <typename RET, typename IU, typename NU1, typename NU2, typename _BinaryOperation, typename _BinaryPredicate>
1482  (const FullyDistSpVec<IU,NU1> & V, const FullyDistSpVec<IU,NU2> & W , _BinaryOperation _binary_op, _BinaryPredicate _doOp, bool allowVNulls, bool allowWNulls, NU1 Vzero, NU2 Wzero, const bool allowIntersect = true)
1483 {
1484  return EWiseApply<RET>(V, W,
1487  allowVNulls, allowWNulls, Vzero, Wzero, allowIntersect, true);
1488 }
1489 
1490 
1491 
1492 #endif
1493