COMBINATORIAL_BLAS  1.3
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
Friends.h
Go to the documentation of this file.
1 /****************************************************************/
2 /* Parallel Combinatorial BLAS Library (for Graph Computations) */
3 /* version 1.2 -------------------------------------------------*/
4 /* date: 10/06/2011 --------------------------------------------*/
5 /* authors: Aydin Buluc (abuluc@lbl.gov), Adam Lugowski --------*/
6 /****************************************************************/
7 /*
8 Copyright (c) 2011, Aydin Buluc
9 
10 Permission is hereby granted, free of charge, to any person obtaining a copy
11 of this software and associated documentation files (the "Software"), to deal
12 in the Software without restriction, including without limitation the rights
13 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 copies of the Software, and to permit persons to whom the Software is
15 furnished to do so, subject to the following conditions:
16 
17 The above copyright notice and this permission notice shall be included in
18 all copies or substantial portions of the Software.
19 
20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 THE SOFTWARE.
27 */
28 
29 #ifndef _FRIENDS_H_
30 #define _FRIENDS_H_
31 
32 #include <iostream>
33 #include "SpMat.h" // Best to include the base class first
34 #include "SpHelper.h"
35 #include "StackEntry.h"
36 #include "Isect.h"
37 #include "Deleter.h"
38 #include "SpImpl.h"
39 #include "SpImplNoSR.h"
40 #include "SpParHelper.h"
41 #include "Compare.h"
42 #include "CombBLAS.h"
43 using namespace std;
44 
45 template <class IU, class NU>
46 class SpTuples;
47 
48 template <class IU, class NU>
49 class SpDCCols;
50 
51 template <class IU, class NU>
52 class Dcsc;
53 
54 /*************************************************************************************************/
55 /**************************** SHARED ADDRESS SPACE FRIEND FUNCTIONS ******************************/
56 /****************************** MULTITHREADED LOGIC ALSO GOES HERE *******************************/
57 /*************************************************************************************************/
58 
60 template <typename SR, typename IU, typename NU, typename RHS, typename LHS>
61 void dcsc_gespmv (const SpDCCols<IU, NU> & A, const RHS * x, LHS * y)
62 {
63  if(A.nnz > 0)
64  {
65  for(IU j =0; j<A.dcsc->nzc; ++j) // for all nonzero columns
66  {
67  IU colid = A.dcsc->jc[j];
68  for(IU i = A.dcsc->cp[j]; i< A.dcsc->cp[j+1]; ++i)
69  {
70  IU rowid = A.dcsc->ir[i];
71  SR::axpy(A.dcsc->numx[i], x[colid], y[rowid]);
72  if (SR::returnedSAID())
73  {
74  cout << "the semiring returned SAID but that is not implemented. results will be incorrect." << endl;
75  throw string("the semiring returned SAID but that is not implemented. results will be incorrect.");
76  }
77  }
78  }
79  }
80 }
81 
86 template <typename SR, typename IU, typename NUM, typename IVT, typename OVT>
87 int dcsc_gespmv_threaded (const SpDCCols<IU, NUM> & A, const int32_t * indx, const IVT * numx, int32_t nnzx,
88  int32_t * & sendindbuf, OVT * & sendnumbuf, int * & sdispls, int p_c)
89 {
90  // FACTS: Split boundaries (for multithreaded execution) are independent of recipient boundaries
91  // Two splits might create output to the same recipient (needs to be merged)
92  // However, each split's output is distinct (no duplicate elimination is needed after merge)
93 
94  sdispls = new int[p_c](); // initialize to zero (as all indy might be empty)
95  if(A.getnnz() > 0 && nnzx > 0)
96  {
97  int splits = A.getnsplit();
98  if(splits > 0)
99  {
100  int32_t nlocrows = static_cast<int32_t>(A.getnrow());
101  int32_t perpiece = nlocrows / splits;
102  vector< vector< int32_t > > indy(splits);
103  vector< vector< OVT > > numy(splits);
104 
105  // Parallelize with OpenMP
106  #ifdef _OPENMP
107  #pragma omp parallel for // num_threads(6)
108  #endif
109  for(int i=0; i<splits; ++i)
110  {
111  if(i != splits-1)
112  SpMXSpV_ForThreading<SR>(*(A.GetDCSC(i)), perpiece, indx, numx, nnzx, indy[i], numy[i], i*perpiece);
113  else
114  SpMXSpV_ForThreading<SR>(*(A.GetDCSC(i)), nlocrows - perpiece*i, indx, numx, nnzx, indy[i], numy[i], i*perpiece);
115  }
116 
117  vector<int> accum(splits+1, 0);
118  for(int i=0; i<splits; ++i)
119  accum[i+1] = accum[i] + indy[i].size();
120 
121  sendindbuf = new int32_t[accum[splits]];
122  sendnumbuf = new OVT[accum[splits]];
123  int32_t perproc = nlocrows / p_c;
124  int32_t last_rec = p_c-1;
125 
126  // keep recipients of last entries in each split (-1 for an empty split)
127  // so that we can delete indy[] and numy[] contents as soon as they are processed
128  vector<int32_t> end_recs(splits);
129  for(int i=0; i<splits; ++i)
130  {
131  if(indy[i].empty())
132  end_recs[i] = -1;
133  else
134  end_recs[i] = min(indy[i].back() / perproc, last_rec);
135  }
136  #ifdef _OPENMP
137  #pragma omp parallel for // num_threads(6)
138  #endif
139  for(int i=0; i<splits; ++i)
140  {
141  if(!indy[i].empty()) // guarantee that .begin() and .end() are not null
142  {
143  // FACT: Data is sorted, so if the recipient of begin is the same as the owner of end,
144  // then the whole data is sent to the same processor
145  int32_t beg_rec = min( indy[i].front() / perproc, last_rec);
146 
147  // We have to test the previous "split", to see if we are marking a "recipient head"
148  // set displacement markers for the completed (previous) buffers only
149  if(i != 0)
150  {
151  int k = i-1;
152  while (k >= 0 && end_recs[k] == -1) k--; // loop backwards until seeing an non-empty split
153  if(k >= 0) // we found a non-empty split
154  {
155  fill(sdispls+end_recs[k]+1, sdispls+beg_rec+1, accum[i]); // last entry to be set is sdispls[beg_rec]
156  }
157  // else fill sdispls[1...beg_rec] with zero (already done)
158  }
159  // else set sdispls[0] to zero (already done)
160  if(beg_rec == end_recs[i]) // fast case
161  {
162  transform(indy[i].begin(), indy[i].end(), indy[i].begin(), bind2nd(minus<int32_t>(), perproc*beg_rec));
163  copy(indy[i].begin(), indy[i].end(), sendindbuf+accum[i]);
164  copy(numy[i].begin(), numy[i].end(), sendnumbuf+accum[i]);
165  }
166  else // slow case
167  {
168  // FACT: No matter how many splits or threads, there will be only one "recipient head"
169  // Therefore there are no race conditions for marking send displacements (sdispls)
170  int end = indy[i].size();
171  for(int cur=0; cur< end; ++cur)
172  {
173  int32_t cur_rec = min( indy[i][cur] / perproc, last_rec);
174  while(beg_rec != cur_rec)
175  {
176  sdispls[++beg_rec] = accum[i] + cur; // first entry to be set is sdispls[beg_rec+1]
177  }
178  sendindbuf[ accum[i] + cur ] = indy[i][cur] - perproc*beg_rec; // convert to receiver's local index
179  sendnumbuf[ accum[i] + cur ] = numy[i][cur];
180  }
181  }
182  vector<int32_t>().swap(indy[i]);
183  vector<OVT>().swap(numy[i]);
184  bool lastnonzero = true; // am I the last nonzero split?
185  for(int k=i+1; k < splits; ++k)
186  {
187  if(end_recs[k] != -1)
188  lastnonzero = false;
189  }
190  if(lastnonzero)
191  fill(sdispls+end_recs[i]+1, sdispls+p_c, accum[i+1]);
192  } // end_if(!indy[i].empty)
193  } // end parallel for
194  return accum[splits];
195  }
196  else
197  {
198  cout << "Something is wrong, splits should be nonzero for multithreaded execution" << endl;
199  return 0;
200  }
201  }
202  else
203  {
204  sendindbuf = NULL;
205  sendnumbuf = NULL;
206  return 0;
207  }
208 }
209 
210 
211 
218 template <typename SR, typename IU, typename NUM, typename IVT, typename OVT>
219 void dcsc_gespmv_threaded_setbuffers (const SpDCCols<IU, NUM> & A, const int32_t * indx, const IVT * numx, int32_t nnzx,
220  int32_t * sendindbuf, OVT * sendnumbuf, int * cnts, int * dspls, int p_c)
221 {
222  if(A.getnnz() > 0 && nnzx > 0)
223  {
224  int splits = A.getnsplit();
225  if(splits > 0)
226  {
227  vector< vector<int32_t> > indy(splits);
228  vector< vector< OVT > > numy(splits);
229  int32_t nlocrows = static_cast<int32_t>(A.getnrow());
230  int32_t perpiece = nlocrows / splits;
231 
232  #ifdef _OPENMP
233  #pragma omp parallel for
234  #endif
235  for(int i=0; i<splits; ++i)
236  {
237  if(i != splits-1)
238  SpMXSpV_ForThreading<SR>(*(A.GetDCSC(i)), perpiece, indx, numx, nnzx, indy[i], numy[i], i*perpiece);
239  else
240  SpMXSpV_ForThreading<SR>(*(A.GetDCSC(i)), nlocrows - perpiece*i, indx, numx, nnzx, indy[i], numy[i], i*perpiece);
241  }
242 
243  int32_t perproc = nlocrows / p_c;
244  int32_t last_rec = p_c-1;
245 
246  // keep recipients of last entries in each split (-1 for an empty split)
247  // so that we can delete indy[] and numy[] contents as soon as they are processed
248  vector<int32_t> end_recs(splits);
249  for(int i=0; i<splits; ++i)
250  {
251  if(indy[i].empty())
252  end_recs[i] = -1;
253  else
254  end_recs[i] = min(indy[i].back() / perproc, last_rec);
255  }
256 
257  int ** loc_rec_cnts = new int *[splits];
258  #ifdef _OPENMP
259  #pragma omp parallel for
260  #endif
261  for(int i=0; i<splits; ++i)
262  {
263  loc_rec_cnts[i] = new int[p_c](); // thread-local recipient data
264  if(!indy[i].empty()) // guarantee that .begin() and .end() are not null
265  {
266  int32_t cur_rec = min( indy[i].front() / perproc, last_rec);
267  int32_t lastdata = (cur_rec+1) * perproc; // one past last entry that goes to this current recipient
268  for(typename vector<int32_t>::iterator it = indy[i].begin(); it != indy[i].end(); ++it)
269  {
270 
271  if( ( (*it) >= lastdata ) && cur_rec != last_rec )
272  {
273  cur_rec = min( (*it) / perproc, last_rec);
274  lastdata = (cur_rec+1) * perproc;
275  }
276  ++loc_rec_cnts[i][cur_rec];
277  }
278  }
279  }
280  #ifdef _OPENMP
281  #pragma omp parallel for
282  #endif
283  for(int i=0; i<splits; ++i)
284  {
285  if(!indy[i].empty()) // guarantee that .begin() and .end() are not null
286  {
287  // FACT: Data is sorted, so if the recipient of begin is the same as the owner of end,
288  // then the whole data is sent to the same processor
289  int32_t beg_rec = min( indy[i].front() / perproc, last_rec);
290  int32_t alreadysent = 0; // already sent per recipient
291  for(int before = i-1; before >= 0; before--)
292  alreadysent += loc_rec_cnts[before][beg_rec];
293 
294  if(beg_rec == end_recs[i]) // fast case
295  {
296  transform(indy[i].begin(), indy[i].end(), indy[i].begin(), bind2nd(minus<int32_t>(), perproc*beg_rec));
297  copy(indy[i].begin(), indy[i].end(), sendindbuf + dspls[beg_rec] + alreadysent);
298  copy(numy[i].begin(), numy[i].end(), sendnumbuf + dspls[beg_rec] + alreadysent);
299  }
300  else // slow case
301  {
302  int32_t cur_rec = beg_rec;
303  int32_t lastdata = (cur_rec+1) * perproc; // one past last entry that goes to this current recipient
304  for(typename vector<int32_t>::iterator it = indy[i].begin(); it != indy[i].end(); ++it)
305  {
306  if( ( (*it) >= lastdata ) && cur_rec != last_rec )
307  {
308  cur_rec = min( (*it) / perproc, last_rec);
309  lastdata = (cur_rec+1) * perproc;
310 
311  // if this split switches to a new recipient after sending some data
312  // then it's sure that no data has been sent to that recipient yet
313  alreadysent = 0;
314  }
315  sendindbuf[ dspls[cur_rec] + alreadysent ] = (*it) - perproc*cur_rec; // convert to receiver's local index
316  sendnumbuf[ dspls[cur_rec] + (alreadysent++) ] = *(numy[i].begin() + (it-indy[i].begin()));
317  }
318  }
319  }
320  }
321  // Deallocated rec counts serially once all threads complete
322  for(int i=0; i< splits; ++i)
323  {
324  for(int j=0; j< p_c; ++j)
325  cnts[j] += loc_rec_cnts[i][j];
326  delete [] loc_rec_cnts[i];
327  }
328  delete [] loc_rec_cnts;
329  }
330  else
331  {
332  cout << "Something is wrong, splits should be nonzero for multithreaded execution" << endl;
333  }
334  }
335 }
336 
340 template <typename SR, typename MIND, typename VIND, typename NUM, typename IVT, typename OVT>
341 void dcsc_gespmv (const SpDCCols<MIND, NUM> & A, const VIND * indx, const IVT * numx, VIND nnzx, vector<VIND> & indy, vector<OVT> & numy)
342 {
343  if(A.getnnz() > 0 && nnzx > 0)
344  {
345  if(A.getnsplit() > 0)
346  {
347  cout << "Call dcsc_gespmv_threaded instead" << endl;
348  }
349  else
350  {
351  SpMXSpV<SR>(*(A.GetDCSC()), (VIND) A.getnrow(), indx, numx, nnzx, indy, numy);
352  }
353  }
354 }
355 
359 template <typename SR, typename IU, typename NUM, typename IVT, typename OVT>
360 void dcsc_gespmv (const SpDCCols<IU, NUM> & A, const int32_t * indx, const IVT * numx, int32_t nnzx,
361  int32_t * indy, OVT * numy, int * cnts, int * dspls, int p_c, bool indexisvalue)
362 {
363  if(A.getnnz() > 0 && nnzx > 0)
364  {
365  if(A.getnsplit() > 0)
366  {
367  SpParHelper::Print("Call dcsc_gespmv_threaded instead\n");
368  }
369  else
370  {
371  if(indexisvalue)
372  {
373  int32_t localm = (int32_t) A.getnrow();
374  BitMap * isthere = new BitMap(localm);
375  SpMXSpV(*(A.GetDCSC()), localm, indx, numx, nnzx, indy, numy, cnts, dspls, p_c, isthere);
376  delete isthere;
377  }
378  else
379  SpMXSpV<SR>(*(A.GetDCSC()), (int32_t) A.getnrow(), indx, numx, nnzx, indy, numy, cnts, dspls, p_c);
380  }
381  }
382 }
383 
384 
385 template<typename IU>
386 void BooleanRowSplit(SpDCCols<IU, bool> & A, int numsplits)
387 {
388  A.splits = numsplits;
389  IU perpiece = A.m / A.splits;
390  vector<IU> prevcolids(A.splits, -1); // previous column id's are set to -1
391  vector<IU> nzcs(A.splits, 0);
392  vector<IU> nnzs(A.splits, 0);
393  vector < vector < pair<IU,IU> > > colrowpairs(A.splits);
394  if(A.nnz > 0 && A.dcsc != NULL)
395  {
396  for(IU i=0; i< A.dcsc->nzc; ++i)
397  {
398  for(IU j = A.dcsc->cp[i]; j< A.dcsc->cp[i+1]; ++j)
399  {
400  IU colid = A.dcsc->jc[i];
401  IU rowid = A.dcsc->ir[j];
402  IU owner = min(rowid / perpiece, static_cast<IU>(A.splits-1));
403  colrowpairs[owner].push_back(make_pair(colid, rowid - owner*perpiece));
404 
405  if(prevcolids[owner] != colid)
406  {
407  prevcolids[owner] = colid;
408  ++nzcs[owner];
409  }
410  ++nnzs[owner];
411  }
412  }
413  }
414  delete A.dcsc; // claim memory
415  //copy(nzcs.begin(), nzcs.end(), ostream_iterator<IU>(cout," " )); cout << endl;
416  //copy(nnzs.begin(), nnzs.end(), ostream_iterator<IU>(cout," " )); cout << endl;
417  A.dcscarr = new Dcsc<IU,bool>*[A.splits];
418 
419  // To be parallelized with OpenMP
420  for(int i=0; i< A.splits; ++i)
421  {
422  sort(colrowpairs[i].begin(), colrowpairs[i].end()); // sort w.r.t. columns
423  A.dcscarr[i] = new Dcsc<IU,bool>(nnzs[i],nzcs[i]);
424  fill(A.dcscarr[i]->numx, A.dcscarr[i]->numx+nnzs[i], static_cast<bool>(1));
425  IU curnzc = 0; // number of nonzero columns constructed so far
426  IU cindex = colrowpairs[i][0].first;
427  IU rindex = colrowpairs[i][0].second;
428 
429  A.dcscarr[i]->ir[0] = rindex;
430  A.dcscarr[i]->jc[curnzc] = cindex;
431  A.dcscarr[i]->cp[curnzc++] = 0;
432 
433  for(IU j=1; j<nnzs[i]; ++j)
434  {
435  cindex = colrowpairs[i][j].first;
436  rindex = colrowpairs[i][j].second;
437 
438  A.dcscarr[i]->ir[j] = rindex;
439  if(cindex != A.dcscarr[i]->jc[curnzc-1])
440  {
441  A.dcscarr[i]->jc[curnzc] = cindex;
442  A.dcscarr[i]->cp[curnzc++] = j;
443  }
444  }
445  A.dcscarr[i]->cp[curnzc] = nnzs[i];
446  }
447 }
448 
449 
456 template<class SR, class NUO, class IU, class NU1, class NU2>
458  (const SpDCCols<IU, NU1> & A,
459  const SpDCCols<IU, NU2> & B,
460  bool clearA = false, bool clearB = false)
461 {
462  IU mdim = A.m;
463  IU ndim = B.m; // B is already transposed
464 
465  if(A.isZero() || B.isZero())
466  {
467  if(clearA) delete const_cast<SpDCCols<IU, NU1> *>(&A);
468  if(clearB) delete const_cast<SpDCCols<IU, NU2> *>(&B);
469  return new SpTuples< IU, NUO >(0, mdim, ndim); // just return an empty matrix
470  }
471  Isect<IU> *isect1, *isect2, *itr1, *itr2, *cols, *rows;
472  SpHelper::SpIntersect(*(A.dcsc), *(B.dcsc), cols, rows, isect1, isect2, itr1, itr2);
473 
474  IU kisect = static_cast<IU>(itr1-isect1); // size of the intersection ((itr1-isect1) == (itr2-isect2))
475  if(kisect == 0)
476  {
477  if(clearA) delete const_cast<SpDCCols<IU, NU1> *>(&A);
478  if(clearB) delete const_cast<SpDCCols<IU, NU2> *>(&B);
479  DeleteAll(isect1, isect2, cols, rows);
480  return new SpTuples< IU, NUO >(0, mdim, ndim);
481  }
482 
483  StackEntry< NUO, pair<IU,IU> > * multstack;
484 
485  IU cnz = SpHelper::SpCartesian< SR > (*(A.dcsc), *(B.dcsc), kisect, isect1, isect2, multstack);
486  DeleteAll(isect1, isect2, cols, rows);
487 
488  if(clearA) delete const_cast<SpDCCols<IU, NU1> *>(&A);
489  if(clearB) delete const_cast<SpDCCols<IU, NU2> *>(&B);
490  return new SpTuples<IU, NUO> (cnz, mdim, ndim, multstack);
491 }
492 
499 template<class SR, class NUO, class IU, class NU1, class NU2>
501  (const SpDCCols<IU, NU1> & A,
502  const SpDCCols<IU, NU2> & B,
503  bool clearA = false, bool clearB = false)
504 {
505  IU mdim = A.m;
506  IU ndim = B.n;
507  if(A.isZero() || B.isZero())
508  {
509  return new SpTuples<IU, NUO>(0, mdim, ndim);
510  }
511  StackEntry< NUO, pair<IU,IU> > * multstack;
512  IU cnz = SpHelper::SpColByCol< SR > (*(A.dcsc), *(B.dcsc), A.n, multstack);
513 
514  if(clearA)
515  delete const_cast<SpDCCols<IU, NU1> *>(&A);
516  if(clearB)
517  delete const_cast<SpDCCols<IU, NU2> *>(&B);
518 
519  return new SpTuples<IU, NUO> (cnz, mdim, ndim, multstack);
520 }
521 
522 
523 template<class SR, class NUO, class IU, class NU1, class NU2>
525  (const SpDCCols<IU, NU1> & A,
526  const SpDCCols<IU, NU2> & B,
527  bool clearA = false, bool clearB = false)
528 {
529  IU mdim = A.n;
530  IU ndim = B.m;
531  cout << "Tuples_AtXBt function has not been implemented yet !" << endl;
532 
533  return new SpTuples<IU, NUO> (0, mdim, ndim);
534 }
535 
536 template<class SR, class NUO, class IU, class NU1, class NU2>
538  (const SpDCCols<IU, NU1> & A,
539  const SpDCCols<IU, NU2> & B,
540  bool clearA = false, bool clearB = false)
541 {
542  IU mdim = A.n;
543  IU ndim = B.n;
544  cout << "Tuples_AtXBn function has not been implemented yet !" << endl;
545 
546  return new SpTuples<IU, NUO> (0, mdim, ndim);
547 }
548 
549 // Performs a balanced merge of the array of SpTuples
550 // Assumes the input parameters are already column sorted
551 template<class SR, class IU, class NU>
552 SpTuples<IU,NU> MergeAll( const vector<SpTuples<IU,NU> *> & ArrSpTups, IU mstar = 0, IU nstar = 0, bool delarrs = false )
553 {
554  int hsize = ArrSpTups.size();
555  if(hsize == 0)
556  {
557  return SpTuples<IU,NU>(0, mstar,nstar);
558  }
559  else
560  {
561  mstar = ArrSpTups[0]->m;
562  nstar = ArrSpTups[0]->n;
563  }
564  for(int i=1; i< hsize; ++i)
565  {
566  if((mstar != ArrSpTups[i]->m) || nstar != ArrSpTups[i]->n)
567  {
568  cerr << "Dimensions do not match on MergeAll()" << endl;
569  return SpTuples<IU,NU>(0,0,0);
570  }
571  }
572  if(hsize > 1)
573  {
574  ColLexiCompare<IU,int> heapcomp;
575  tuple<IU, IU, int> * heap = new tuple<IU, IU, int> [hsize]; // (rowindex, colindex, source-id)
576  IU * curptr = new IU[hsize];
577  fill_n(curptr, hsize, static_cast<IU>(0));
578  IU estnnz = 0;
579 
580  for(int i=0; i< hsize; ++i)
581  {
582  estnnz += ArrSpTups[i]->getnnz();
583  heap[i] = make_tuple(get<0>(ArrSpTups[i]->tuples[0]), get<1>(ArrSpTups[i]->tuples[0]), i);
584  }
585  make_heap(heap, heap+hsize, not2(heapcomp));
586 
587  tuple<IU, IU, NU> * ntuples = new tuple<IU,IU,NU>[estnnz];
588  IU cnz = 0;
589 
590  while(hsize > 0)
591  {
592  pop_heap(heap, heap + hsize, not2(heapcomp)); // result is stored in heap[hsize-1]
593  int source = get<2>(heap[hsize-1]);
594 
595  if( (cnz != 0) &&
596  ((get<0>(ntuples[cnz-1]) == get<0>(heap[hsize-1])) && (get<1>(ntuples[cnz-1]) == get<1>(heap[hsize-1]))) )
597  {
598  get<2>(ntuples[cnz-1]) = SR::add(get<2>(ntuples[cnz-1]), ArrSpTups[source]->numvalue(curptr[source]++));
599  }
600  else
601  {
602  ntuples[cnz++] = ArrSpTups[source]->tuples[curptr[source]++];
603  }
604 
605  if(curptr[source] != ArrSpTups[source]->getnnz()) // That array has not been depleted
606  {
607  heap[hsize-1] = make_tuple(get<0>(ArrSpTups[source]->tuples[curptr[source]]),
608  get<1>(ArrSpTups[source]->tuples[curptr[source]]), source);
609  push_heap(heap, heap+hsize, not2(heapcomp));
610  }
611  else
612  {
613  --hsize;
614  }
615  }
616  SpHelper::ShrinkArray(ntuples, cnz);
617  DeleteAll(heap, curptr);
618 
619  if(delarrs)
620  {
621  for(size_t i=0; i<ArrSpTups.size(); ++i)
622  delete ArrSpTups[i];
623  }
624  return SpTuples<IU,NU> (cnz, mstar, nstar, ntuples);
625  }
626  else
627  {
628  SpTuples<IU,NU> ret = *ArrSpTups[0];
629  if(delarrs)
630  delete ArrSpTups[0];
631  return ret;
632  }
633 }
634 
640 template <typename IU, typename NU1, typename NU2>
642 {
643  typedef typename promote_trait<NU1,NU2>::T_promote N_promote;
644  IU estnzc, estnz;
645  if(exclude)
646  {
647  estnzc = A.nzc;
648  estnz = A.nz;
649  }
650  else
651  {
652  estnzc = std::min(A.nzc, B->nzc);
653  estnz = std::min(A.nz, B->nz);
654  }
655 
656  Dcsc<IU,N_promote> temp(estnz, estnzc);
657 
658  IU curnzc = 0;
659  IU curnz = 0;
660  IU i = 0;
661  IU j = 0;
662  temp.cp[0] = 0;
663 
664  if(!exclude) // A = A .* B
665  {
666  while(i< A.nzc && B != NULL && j<B->nzc)
667  {
668  if(A.jc[i] > B->jc[j]) ++j;
669  else if(A.jc[i] < B->jc[j]) ++i;
670  else
671  {
672  IU ii = A.cp[i];
673  IU jj = B->cp[j];
674  IU prevnz = curnz;
675  while (ii < A.cp[i+1] && jj < B->cp[j+1])
676  {
677  if (A.ir[ii] < B->ir[jj]) ++ii;
678  else if (A.ir[ii] > B->ir[jj]) ++jj;
679  else
680  {
681  temp.ir[curnz] = A.ir[ii];
682  temp.numx[curnz++] = A.numx[ii++] * B->numx[jj++];
683  }
684  }
685  if(prevnz < curnz) // at least one nonzero exists in this column
686  {
687  temp.jc[curnzc++] = A.jc[i];
688  temp.cp[curnzc] = temp.cp[curnzc-1] + curnz-prevnz;
689  }
690  ++i;
691  ++j;
692  }
693  }
694  }
695  else // A = A .* not(B)
696  {
697  while(i< A.nzc && B != NULL && j< B->nzc)
698  {
699  if(A.jc[i] > B->jc[j]) ++j;
700  else if(A.jc[i] < B->jc[j])
701  {
702  temp.jc[curnzc++] = A.jc[i++];
703  for(IU k = A.cp[i-1]; k< A.cp[i]; k++)
704  {
705  temp.ir[curnz] = A.ir[k];
706  temp.numx[curnz++] = A.numx[k];
707  }
708  temp.cp[curnzc] = temp.cp[curnzc-1] + (A.cp[i] - A.cp[i-1]);
709  }
710  else
711  {
712  IU ii = A.cp[i];
713  IU jj = B->cp[j];
714  IU prevnz = curnz;
715  while (ii < A.cp[i+1] && jj < B->cp[j+1])
716  {
717  if (A.ir[ii] > B->ir[jj]) ++jj;
718  else if (A.ir[ii] < B->ir[jj])
719  {
720  temp.ir[curnz] = A.ir[ii];
721  temp.numx[curnz++] = A.numx[ii++];
722  }
723  else // eliminate those existing nonzeros
724  {
725  ++ii;
726  ++jj;
727  }
728  }
729  while (ii < A.cp[i+1])
730  {
731  temp.ir[curnz] = A.ir[ii];
732  temp.numx[curnz++] = A.numx[ii++];
733  }
734 
735  if(prevnz < curnz) // at least one nonzero exists in this column
736  {
737  temp.jc[curnzc++] = A.jc[i];
738  temp.cp[curnzc] = temp.cp[curnzc-1] + curnz-prevnz;
739  }
740  ++i;
741  ++j;
742  }
743  }
744  while(i< A.nzc)
745  {
746  temp.jc[curnzc++] = A.jc[i++];
747  for(IU k = A.cp[i-1]; k< A.cp[i]; ++k)
748  {
749  temp.ir[curnz] = A.ir[k];
750  temp.numx[curnz++] = A.numx[k];
751  }
752  temp.cp[curnzc] = temp.cp[curnzc-1] + (A.cp[i] - A.cp[i-1]);
753  }
754  }
755 
756  temp.Resize(curnzc, curnz);
757  return temp;
758 }
759 
760 template <typename N_promote, typename IU, typename NU1, typename NU2, typename _BinaryOperation>
761 Dcsc<IU, N_promote> EWiseApply(const Dcsc<IU,NU1> & A, const Dcsc<IU,NU2> * B, _BinaryOperation __binary_op, bool notB, const NU2& defaultBVal)
762 {
763  //typedef typename promote_trait<NU1,NU2>::T_promote N_promote;
764  IU estnzc, estnz;
765  if(notB)
766  {
767  estnzc = A.nzc;
768  estnz = A.nz;
769  }
770  else
771  {
772  estnzc = std::min(A.nzc, B->nzc);
773  estnz = std::min(A.nz, B->nz);
774  }
775 
776  Dcsc<IU,N_promote> temp(estnz, estnzc);
777 
778  IU curnzc = 0;
779  IU curnz = 0;
780  IU i = 0;
781  IU j = 0;
782  temp.cp[0] = 0;
783 
784  if(!notB) // A = A .* B
785  {
786  while(i< A.nzc && B != NULL && j<B->nzc)
787  {
788  if(A.jc[i] > B->jc[j]) ++j;
789  else if(A.jc[i] < B->jc[j]) ++i;
790  else
791  {
792  IU ii = A.cp[i];
793  IU jj = B->cp[j];
794  IU prevnz = curnz;
795  while (ii < A.cp[i+1] && jj < B->cp[j+1])
796  {
797  if (A.ir[ii] < B->ir[jj]) ++ii;
798  else if (A.ir[ii] > B->ir[jj]) ++jj;
799  else
800  {
801  temp.ir[curnz] = A.ir[ii];
802  temp.numx[curnz++] = __binary_op(A.numx[ii++], B->numx[jj++]);
803  }
804  }
805  if(prevnz < curnz) // at least one nonzero exists in this column
806  {
807  temp.jc[curnzc++] = A.jc[i];
808  temp.cp[curnzc] = temp.cp[curnzc-1] + curnz-prevnz;
809  }
810  ++i;
811  ++j;
812  }
813  }
814  }
815  else // A = A .* not(B)
816  {
817  while(i< A.nzc && B != NULL && j< B->nzc)
818  {
819  if(A.jc[i] > B->jc[j]) ++j;
820  else if(A.jc[i] < B->jc[j])
821  {
822  temp.jc[curnzc++] = A.jc[i++];
823  for(IU k = A.cp[i-1]; k< A.cp[i]; k++)
824  {
825  temp.ir[curnz] = A.ir[k];
826  temp.numx[curnz++] = __binary_op(A.numx[k], defaultBVal);
827  }
828  temp.cp[curnzc] = temp.cp[curnzc-1] + (A.cp[i] - A.cp[i-1]);
829  }
830  else
831  {
832  IU ii = A.cp[i];
833  IU jj = B->cp[j];
834  IU prevnz = curnz;
835  while (ii < A.cp[i+1] && jj < B->cp[j+1])
836  {
837  if (A.ir[ii] > B->ir[jj]) ++jj;
838  else if (A.ir[ii] < B->ir[jj])
839  {
840  temp.ir[curnz] = A.ir[ii];
841  temp.numx[curnz++] = __binary_op(A.numx[ii++], defaultBVal);
842  }
843  else // eliminate those existing nonzeros
844  {
845  ++ii;
846  ++jj;
847  }
848  }
849  while (ii < A.cp[i+1])
850  {
851  temp.ir[curnz] = A.ir[ii];
852  temp.numx[curnz++] = __binary_op(A.numx[ii++], defaultBVal);
853  }
854 
855  if(prevnz < curnz) // at least one nonzero exists in this column
856  {
857  temp.jc[curnzc++] = A.jc[i];
858  temp.cp[curnzc] = temp.cp[curnzc-1] + curnz-prevnz;
859  }
860  ++i;
861  ++j;
862  }
863  }
864  while(i< A.nzc)
865  {
866  temp.jc[curnzc++] = A.jc[i++];
867  for(IU k = A.cp[i-1]; k< A.cp[i]; ++k)
868  {
869  temp.ir[curnz] = A.ir[k];
870  temp.numx[curnz++] = __binary_op(A.numx[k], defaultBVal);
871  }
872  temp.cp[curnzc] = temp.cp[curnzc-1] + (A.cp[i] - A.cp[i-1]);
873  }
874  }
875 
876  temp.Resize(curnzc, curnz);
877  return temp;
878 }
879 
880 
881 template<typename IU, typename NU1, typename NU2>
883 {
884  typedef typename promote_trait<NU1,NU2>::T_promote N_promote;
885  assert(A.m == B.m);
886  assert(A.n == B.n);
887 
888  Dcsc<IU, N_promote> * tdcsc = NULL;
889  if(A.nnz > 0 && B.nnz > 0)
890  {
891  tdcsc = new Dcsc<IU, N_promote>(EWiseMult(*(A.dcsc), B.dcsc, exclude));
892  return SpDCCols<IU, N_promote> (A.m , A.n, tdcsc);
893  }
894  else if (A.nnz > 0 && exclude) // && B.nnz == 0
895  {
896  tdcsc = new Dcsc<IU, N_promote>(EWiseMult(*(A.dcsc), (const Dcsc<IU,NU2>*)NULL, exclude));
897  return SpDCCols<IU, N_promote> (A.m , A.n, tdcsc);
898  }
899  else
900  {
901  return SpDCCols<IU, N_promote> (A.m , A.n, tdcsc);
902  }
903 }
904 
905 
906 template<typename N_promote, typename IU, typename NU1, typename NU2, typename _BinaryOperation>
907 SpDCCols<IU, N_promote> EWiseApply (const SpDCCols<IU,NU1> & A, const SpDCCols<IU,NU2> & B, _BinaryOperation __binary_op, bool notB, const NU2& defaultBVal)
908 {
909  //typedef typename promote_trait<NU1,NU2>::T_promote N_promote;
910  assert(A.m == B.m);
911  assert(A.n == B.n);
912 
913  Dcsc<IU, N_promote> * tdcsc = NULL;
914  if(A.nnz > 0 && B.nnz > 0)
915  {
916  tdcsc = new Dcsc<IU, N_promote>(EWiseApply<N_promote>(*(A.dcsc), B.dcsc, __binary_op, notB, defaultBVal));
917  return SpDCCols<IU, N_promote> (A.m , A.n, tdcsc);
918  }
919  else if (A.nnz > 0 && notB) // && B.nnz == 0
920  {
921  tdcsc = new Dcsc<IU, N_promote>(EWiseApply<N_promote>(*(A.dcsc), (const Dcsc<IU,NU2>*)NULL, __binary_op, notB, defaultBVal));
922  return SpDCCols<IU, N_promote> (A.m , A.n, tdcsc);
923  }
924  else
925  {
926  return SpDCCols<IU, N_promote> (A.m , A.n, tdcsc);
927  }
928 }
929 
939 template <typename RETT, typename IU, typename NU1, typename NU2, typename _BinaryOperation, typename _BinaryPredicate>
940 Dcsc<IU, RETT> EWiseApply(const Dcsc<IU,NU1> * Ap, const Dcsc<IU,NU2> * Bp, _BinaryOperation __binary_op, _BinaryPredicate do_op, bool allowANulls, bool allowBNulls, const NU1& ANullVal, const NU2& BNullVal, const bool allowIntersect)
941 {
942  if (Ap == NULL && Bp == NULL)
943  return Dcsc<IU,RETT>(0, 0);
944 
945  if (Ap == NULL && Bp != NULL)
946  {
947  if (!allowANulls)
948  return Dcsc<IU,RETT>(0, 0);
949 
950  const Dcsc<IU,NU2> & B = *Bp;
951  IU estnzc = B.nzc;
952  IU estnz = B.nz;
953  Dcsc<IU,RETT> temp(estnz, estnzc);
954 
955  IU curnzc = 0;
956  IU curnz = 0;
957  //IU i = 0;
958  IU j = 0;
959  temp.cp[0] = 0;
960  while(j<B.nzc)
961  {
962  // Based on the if statement below which handles A null values.
963  j++;
964  IU prevnz = curnz;
965  temp.jc[curnzc++] = B.jc[j-1];
966  for(IU k = B.cp[j-1]; k< B.cp[j]; ++k)
967  {
968  if (do_op(ANullVal, B.numx[k], true, false))
969  {
970  temp.ir[curnz] = B.ir[k];
971  temp.numx[curnz++] = __binary_op(ANullVal, B.numx[k], true, false);
972  }
973  }
974  //temp.cp[curnzc] = temp.cp[curnzc-1] + (B.cp[j] - B.cp[j-1]);
975  temp.cp[curnzc] = temp.cp[curnzc-1] + curnz-prevnz;
976  }
977  temp.Resize(curnzc, curnz);
978  return temp;
979  }
980 
981  if (Ap != NULL && Bp == NULL)
982  {
983  if (!allowBNulls)
984  return Dcsc<IU,RETT>(0, 0);
985 
986  const Dcsc<IU,NU1> & A = *Ap;
987  IU estnzc = A.nzc;
988  IU estnz = A.nz;
989  Dcsc<IU,RETT> temp(estnz, estnzc);
990 
991  IU curnzc = 0;
992  IU curnz = 0;
993  IU i = 0;
994  //IU j = 0;
995  temp.cp[0] = 0;
996  while(i< A.nzc)
997  {
998  i++;
999  IU prevnz = curnz;
1000  temp.jc[curnzc++] = A.jc[i-1];
1001  for(IU k = A.cp[i-1]; k< A.cp[i]; k++)
1002  {
1003  if (do_op(A.numx[k], BNullVal, false, true))
1004  {
1005  temp.ir[curnz] = A.ir[k];
1006  temp.numx[curnz++] = __binary_op(A.numx[k], BNullVal, false, true);
1007  }
1008  }
1009  //temp.cp[curnzc] = temp.cp[curnzc-1] + (A.cp[i] - A.cp[i-1]);
1010  temp.cp[curnzc] = temp.cp[curnzc-1] + curnz-prevnz;
1011  }
1012  temp.Resize(curnzc, curnz);
1013  return temp;
1014  }
1015 
1016  // both A and B are non-NULL at this point
1017  const Dcsc<IU,NU1> & A = *Ap;
1018  const Dcsc<IU,NU2> & B = *Bp;
1019 
1020  IU estnzc = A.nzc + B.nzc;
1021  IU estnz = A.nz + B.nz;
1022  Dcsc<IU,RETT> temp(estnz, estnzc);
1023 
1024  IU curnzc = 0;
1025  IU curnz = 0;
1026  IU i = 0;
1027  IU j = 0;
1028  temp.cp[0] = 0;
1029  while(i< A.nzc && j<B.nzc)
1030  {
1031  if(A.jc[i] > B.jc[j])
1032  {
1033  j++;
1034  if (allowANulls)
1035  {
1036  IU prevnz = curnz;
1037  temp.jc[curnzc++] = B.jc[j-1];
1038  for(IU k = B.cp[j-1]; k< B.cp[j]; ++k)
1039  {
1040  if (do_op(ANullVal, B.numx[k], true, false))
1041  {
1042  temp.ir[curnz] = B.ir[k];
1043  temp.numx[curnz++] = __binary_op(ANullVal, B.numx[k], true, false);
1044  }
1045  }
1046  //temp.cp[curnzc] = temp.cp[curnzc-1] + (B.cp[j] - B.cp[j-1]);
1047  temp.cp[curnzc] = temp.cp[curnzc-1] + curnz-prevnz;
1048  }
1049  }
1050  else if(A.jc[i] < B.jc[j])
1051  {
1052  i++;
1053  if (allowBNulls)
1054  {
1055  IU prevnz = curnz;
1056  temp.jc[curnzc++] = A.jc[i-1];
1057  for(IU k = A.cp[i-1]; k< A.cp[i]; k++)
1058  {
1059  if (do_op(A.numx[k], BNullVal, false, true))
1060  {
1061  temp.ir[curnz] = A.ir[k];
1062  temp.numx[curnz++] = __binary_op(A.numx[k], BNullVal, false, true);
1063  }
1064  }
1065  //temp.cp[curnzc] = temp.cp[curnzc-1] + (A.cp[i] - A.cp[i-1]);
1066  temp.cp[curnzc] = temp.cp[curnzc-1] + curnz-prevnz;
1067  }
1068  }
1069  else
1070  {
1071  temp.jc[curnzc++] = A.jc[i];
1072  IU ii = A.cp[i];
1073  IU jj = B.cp[j];
1074  IU prevnz = curnz;
1075  while (ii < A.cp[i+1] && jj < B.cp[j+1])
1076  {
1077  if (A.ir[ii] < B.ir[jj])
1078  {
1079  if (allowBNulls && do_op(A.numx[ii], BNullVal, false, true))
1080  {
1081  temp.ir[curnz] = A.ir[ii];
1082  temp.numx[curnz++] = __binary_op(A.numx[ii++], BNullVal, false, true);
1083  }
1084  else
1085  ii++;
1086  }
1087  else if (A.ir[ii] > B.ir[jj])
1088  {
1089  if (allowANulls && do_op(ANullVal, B.numx[jj], true, false))
1090  {
1091  temp.ir[curnz] = B.ir[jj];
1092  temp.numx[curnz++] = __binary_op(ANullVal, B.numx[jj++], true, false);
1093  }
1094  else
1095  jj++;
1096  }
1097  else
1098  {
1099  if (allowIntersect && do_op(A.numx[ii], B.numx[jj], false, false))
1100  {
1101  temp.ir[curnz] = A.ir[ii];
1102  temp.numx[curnz++] = __binary_op(A.numx[ii++], B.numx[jj++], false, false); // might include zeros
1103  }
1104  else
1105  {
1106  ii++;
1107  jj++;
1108  }
1109  }
1110  }
1111  while (ii < A.cp[i+1])
1112  {
1113  if (allowBNulls && do_op(A.numx[ii], BNullVal, false, true))
1114  {
1115  temp.ir[curnz] = A.ir[ii];
1116  temp.numx[curnz++] = __binary_op(A.numx[ii++], BNullVal, false, true);
1117  }
1118  else
1119  ii++;
1120  }
1121  while (jj < B.cp[j+1])
1122  {
1123  if (allowANulls && do_op(ANullVal, B.numx[jj], true, false))
1124  {
1125  temp.ir[curnz] = B.ir[jj];
1126  temp.numx[curnz++] = __binary_op(ANullVal, B.numx[jj++], true, false);
1127  }
1128  else
1129  jj++;
1130  }
1131  temp.cp[curnzc] = temp.cp[curnzc-1] + curnz-prevnz;
1132  ++i;
1133  ++j;
1134  }
1135  }
1136  while(allowBNulls && i< A.nzc) // remaining A elements after B ran out
1137  {
1138  IU prevnz = curnz;
1139  temp.jc[curnzc++] = A.jc[i++];
1140  for(IU k = A.cp[i-1]; k< A.cp[i]; ++k)
1141  {
1142  if (do_op(A.numx[k], BNullVal, false, true))
1143  {
1144  temp.ir[curnz] = A.ir[k];
1145  temp.numx[curnz++] = __binary_op(A.numx[k], BNullVal, false, true);
1146  }
1147  }
1148  //temp.cp[curnzc] = temp.cp[curnzc-1] + (A.cp[i] - A.cp[i-1]);
1149  temp.cp[curnzc] = temp.cp[curnzc-1] + curnz-prevnz;
1150  }
1151  while(allowANulls && j < B.nzc) // remaining B elements after A ran out
1152  {
1153  IU prevnz = curnz;
1154  temp.jc[curnzc++] = B.jc[j++];
1155  for(IU k = B.cp[j-1]; k< B.cp[j]; ++k)
1156  {
1157  if (do_op(ANullVal, B.numx[k], true, false))
1158  {
1159  temp.ir[curnz] = B.ir[k];
1160  temp.numx[curnz++] = __binary_op(ANullVal, B.numx[k], true, false);
1161  }
1162  }
1163  //temp.cp[curnzc] = temp.cp[curnzc-1] + (B.cp[j] - B.cp[j-1]);
1164  temp.cp[curnzc] = temp.cp[curnzc-1] + curnz-prevnz;
1165  }
1166  temp.Resize(curnzc, curnz);
1167  return temp;
1168 }
1169 
1170 template <typename RETT, typename IU, typename NU1, typename NU2, typename _BinaryOperation, typename _BinaryPredicate>
1171 SpDCCols<IU,RETT> EWiseApply (const SpDCCols<IU,NU1> & A, const SpDCCols<IU,NU2> & B, _BinaryOperation __binary_op, _BinaryPredicate do_op, bool allowANulls, bool allowBNulls, const NU1& ANullVal, const NU2& BNullVal, const bool allowIntersect)
1172 {
1173  assert(A.m == B.m);
1174  assert(A.n == B.n);
1175 
1176  Dcsc<IU, RETT> * tdcsc = new Dcsc<IU, RETT>(EWiseApply<RETT>(A.dcsc, B.dcsc, __binary_op, do_op, allowANulls, allowBNulls, ANullVal, BNullVal, allowIntersect));
1177  return SpDCCols<IU, RETT> (A.m , A.n, tdcsc);
1178 }
1179 
1180 
1181 #endif