Actual source code: ParDelta.hh

  1: #ifndef included_ALE_ParDelta_hh
  2: #define included_ALE_ParDelta_hh

  4: #ifndef  included_ALE_Sifter_hh
  5: #include <Sifter.hh>
  6: #endif



 10: //
 11: // Classes and methods implementing the parallel Overlap and Fusion algorithms on ASifter-like objects.
 12: //
 13: namespace ALE {

 15:     template <typename RightConeSequence_>
 16:     class RightSequenceDuplicator {
 17:       // Replicate the cone sequence on the right in the overlap graph.
 18:       int debug;
 19:     public:
 20:       //Encapsulated types
 21:       typedef RightConeSequence_                            right_sequence_type;
 22:       typedef typename right_sequence_type::target_type     right_target_type;
 23:       //
 24:       typedef typename right_sequence_type::source_type     fusion_source_type;
 25:       typedef typename right_sequence_type::target_type     fusion_target_type;
 26:       typedef typename right_sequence_type::color_type      fusion_color_type;
 27:     public:
 28:       //
 29:       // Basic interface
 30:       //
 31:       RightSequenceDuplicator(int debug = 0) : debug(debug) {};
 32:       RightSequenceDuplicator(const RightSequenceDuplicator& f) : debug(f.debug) {};
 33:       virtual ~RightSequenceDuplicator() {};

 35:       template <typename left_target_type>
 36:       fusion_target_type
 37:       fuseBasePoints(const left_target_type&  ltarget, const right_target_type& rtarget) {
 38:         return rtarget;
 39:       };

 41:       // FIX: lcone and rcone should be passed as const references, but begin() and end() aren't const methods
 42:       template <typename left_sequence_type, typename fusion_sequence_type>
 43:       void
 44:       fuseCones(left_sequence_type&  lcone, right_sequence_type& rcone, const Obj<fusion_sequence_type>& fcone) {
 45:         for(typename right_sequence_type::iterator rci = rcone.begin(); rci != rcone.end(); rci++) {
 46:           fcone->addArrow(rci.arrow());
 47:         }
 48:       };
 49:     }; // class RightSequenceDuplicator
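
    // Illustrative sketch (not part of the original source): how a fuser is driven.  The
    // sequences 'lcone', 'rcone' and 'fcone' are hypothetical placeholders for a local cone,
    // a cone received from a neighbor, and the fusion sequence being assembled, and
    // 'my_arrow_type' stands for the concrete arrow type:
    //
    //   typedef RightSequenceDuplicator<ConeArraySequence<my_arrow_type> > fuser_type;
    //   fuser_type fuser;
    //   fuser.fuseCones(lcone, rcone, fcone);  // calls fcone->addArrow() for every arrow of rcone
    //   // fuseBasePoints(lt, rt) always answers rt, so fused points keep their right-hand names.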


 52:     template <typename Arrow_>
 53:     class ConeArraySequence {
 54:       // ConeArraySequence wraps a raw array of cone_arrow_type records -- (source, color) pairs --
 55:       // presenting it as a cone sequence for a given target.
 56:     public:
 57:       typedef Arrow_                           arrow_type;
 58:       typedef typename arrow_type::source_type source_type;
 59:       typedef typename arrow_type::target_type target_type;
 60:       typedef typename arrow_type::color_type  color_type;
 61:       //
 62:       struct cone_arrow_type {
 63:         source_type source;
 64:         color_type color;
 65:         //
 66:         cone_arrow_type(const arrow_type&  a) : source(a.source), color(a.color)          {};
 67:         cone_arrow_type(const source_type& s, const color_type& c) : source(s), color(c)  {};
 68:         cone_arrow_type(const cone_arrow_type& ca) : source(ca.source), color(ca.color)   {};
 69:         //
 70:         static void place(cone_arrow_type* ca_ptr, const arrow_type& a) {
 71:           // WARNING: an unsafe method in that it has no way of checking the validity of ca_ptr
 72:           ca_ptr->source = a.source;
 73:           ca_ptr->color  = a.color;
 74:         };
 75:         static void place(cone_arrow_type* ca_ptr, const source_type& s, const color_type& c) {
 76:           // WARNING: an unsafe method in that it has no way of checking the validity of ca_ptr
 77:           ca_ptr->source = s;
 78:           ca_ptr->color  = c;
 79:         };
 80:       };
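
      // Illustrative sketch (not part of the original source): place() is how cone data is
      // serialized into a pre-allocated buffer before being shipped to another process.
      // 'raw', 'n' and 'arrows' are hypothetical:
      //
      //   cone_arrow_type* buf = reinterpret_cast<cone_arrow_type*>(raw); // caller guarantees capacity
      //   for(size_t k = 0; k < n; ++k)
      //     cone_arrow_type::place(buf + k, arrows[k]);                   // writes (source, color) into slot k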
 81:     protected:
 82:       typedef cone_arrow_type* cone_arrow_array;
 83:       target_type        _target;
 84:       cone_arrow_array   _arr_ptr;
 85:       size_t             _seq_size;
 86:     public:
 87:       class iterator {
 88:         target_type        _target;
 89:         cone_arrow_type*   _ptr;
 90:       public:
 91:         iterator(const target_type& target, const cone_arrow_array& ptr) : _target(target),     _ptr(ptr)     {};
 92:         iterator(const iterator& it)                                       : _target(it._target), _ptr(it._ptr) {};
 93:         virtual ~iterator() {};
 94:         //
 95:         virtual source_type        operator*() const { return this->_ptr->source;};
 96:         virtual iterator           operator++()      {this->_ptr++; return *this;};
 97:         virtual iterator           operator++(int n) {iterator tmp(this->_target, this->_ptr); this->_ptr++; return tmp;};
 98:         virtual bool               operator!=(const iterator& it) {return ((it._target != this->_target)||(it._ptr != this->_ptr));};
 99:         //
100:         virtual const source_type& source() const    {return this->_ptr->source;};
101:         virtual const color_type&  color()  const    {return this->_ptr->color; };
102:         virtual const target_type& target() const    {return this->_target;     };
103:         virtual const arrow_type   arrow()  const    {
104:           return arrow_type(this->_ptr->source,this->_target,this->_ptr->color);
105:         };
106:       };
107:       // Basic interface
108:       ConeArraySequence(cone_arrow_array arr_ptr, const size_t& seq_size, const target_type& target) :
109:         _target(target), _arr_ptr(arr_ptr), _seq_size(seq_size) {};
110:       ConeArraySequence(const ConeArraySequence& seq) :
111:         _target(seq._target), _arr_ptr(seq._arr_ptr), _seq_size(seq._seq_size) {};
112:       virtual ~ConeArraySequence() {};
113:       //
114:       virtual iterator begin() { return iterator(this->_target, this->_arr_ptr); };
115:       virtual iterator end()   { return iterator(this->_target, this->_arr_ptr+this->_seq_size); };
116:       virtual size_t   size()  { return this->_seq_size; };
117:       virtual bool     empty() { return (this->size() == 0); };

119:       template<typename ostream_type>
120:       void view(ostream_type& os, const bool& useColor = false, const char* label = NULL){
121:         if(label != NULL) {
122:           os << "Viewing " << label << " sequence:" << std::endl;
123:         }
124:         os << "[";
125:         for(iterator i = this->begin(); i != this->end(); i++) {
126:           os << " (" << *i;
127:           if(useColor) {
128:             os << "," << i.color();
129:           }
130:           os  << ")";
131:         }
132:         os << " ]" << std::endl;
133:       };
134:     };// class ConeArraySequence
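
    // Illustrative sketch (not part of the original source): wrapping a received buffer as a
    // cone sequence.  'buf' is a hypothetical array of n (source, color) records over target t:
    //
    //   typedef ConeArraySequence<my_arrow_type> cone_seq;
    //   cone_seq cone(buf, n, t);
    //   for(cone_seq::iterator i = cone.begin(); i != cone.end(); ++i)
    //     std::cout << i.source() << " --" << i.color() << "--> " << i.target() << std::endl;
    //   cone.view(std::cout, true, "received cone");  // equivalent colored dump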


137:     template <typename ParSifter_,
138:               typename Fuser_ = RightSequenceDuplicator<ConeArraySequence<typename ParSifter_::traits::arrow_type> >,
139:               typename FusionSifter_ = typename ParSifter_::template rebind<typename Fuser_::fusion_source_type,
140:                                                                             typename Fuser_::fusion_target_type,
141:                                                                             typename Fuser_::fusion_color_type>::type
142:     >
143:     class ParConeDelta { // class ParConeDelta
144:     public:
145:       // Here we specialize to Sifters based on Points in order to enable parallel overlap discovery.
146:       // We also assume that the Points in the base are ordered appropriately so we can use baseSequence.begin() and
147:       // baseSequence.end() as the extrema for global reduction.
148:       typedef ParConeDelta<ParSifter_, Fuser_, FusionSifter_>                                   delta_type;
149:       typedef ParSifter_                                                                        graph_type;
150:       typedef Fuser_                                                                            fuser_type;
151:       // These are default "return" types, although methods are templated on their main input/return types
152:       typedef ASifter<int, ALE::Point, ALE::pair<ALE::Point, ALE::pair<int,int> >, SifterDef::uniColor>                 overlap_type;
153:       typedef ASifter<int, ALE::pair<int,ALE::Point>, ALE::pair<ALE::Point, ALE::pair<int,int> >, SifterDef::uniColor>  bioverlap_type;
154:       typedef FusionSifter_                                                                                             fusion_type;

156:       //
157:       #undef  __FUNCT__
158:       #define __FUNCT__ "overlap"  // restored from the numbering gap; the exact name string is assumed
159:       static Obj<overlap_type>
160:       overlap(const Obj<graph_type> graph) {
161:         ALE_LOG_EVENT_BEGIN;
162:         Obj<overlap_type> overlap = new overlap_type(graph->comm());
163:         // If this is a serial object, we return an empty overlap
164:         if((graph->comm() != PETSC_COMM_SELF) && (graph->commSize() > 1)) {
165:           computeOverlap(graph, overlap);
166:         }
167:         ALE_LOG_EVENT_END;
168:         return overlap;
169:       };
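
      // Illustrative sketch (not part of the original source): 'MySifter' and 'g' are
      // hypothetical, standing for a concrete sifter type and an Obj<graph_type> distributed
      // over several processes:
      //
      //   Obj<ParConeDelta<MySifter>::overlap_type> o = ParConeDelta<MySifter>::overlap(g);
      //   // o holds arrows (neighbor rank) --(p, remote cone size, local cone size)--> p
      //   // for every shared base point p; on a serial communicator it comes back empty.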

171:       template <typename Overlap_>
172:       static void computeOverlap(const Obj<graph_type>& graph, Obj<Overlap_>& overlap){
173:         __computeOverlapNew(graph, overlap);
174:       };

176:       #undef  __FUNCT__
177:       #define __FUNCT__ "overlap"  // restored from the numbering gap; the exact name string is assumed
178:       static Obj<bioverlap_type>
179:       overlap(const Obj<graph_type> graphA, const Obj<graph_type> graphB) {
180:         ALE_LOG_EVENT_BEGIN;
181:         Obj<bioverlap_type> overlap = new bioverlap_type(graphA->comm());
182:         PetscMPIInt         comp;

184:         MPI_Comm_compare(graphA->comm(), graphB->comm(), &comp);
185:         if (comp != MPI_IDENT) {
186:           throw ALE::Exception("Non-matching communicators for overlap");
187:         }
188:         computeOverlap(graphA, graphB, overlap);
189:         ALE_LOG_EVENT_END;
190:         return overlap;
191:       };

193:       template <typename Overlap_>
194:       static void computeOverlap(const Obj<graph_type>& graphA, const Obj<graph_type>& graphB, Obj<Overlap_>& overlap){
195:         __computeOverlapNew(graphA, graphB, overlap);
196:       };

198:       template <typename Overlap_>
199:       static Obj<fusion_type>
200:       fusion(const Obj<graph_type>& graph, const Obj<Overlap_>& overlap, const Obj<fuser_type>& fuser = new fuser_type()) {
201:         Obj<fusion_type> fusion = new fusion_type(graph->comm());
202:         // If this is a serial object, we return an empty delta
203:         if((graph->comm() != PETSC_COMM_SELF) && (graph->commSize() > 1)) {
204:           computeFusion(graph, overlap, fusion, fuser);
205:         }
206:         return fusion;
207:       };

209:       template <typename Overlap_>
210:       static void computeFusion(const Obj<graph_type>& graph, const Obj<Overlap_>& overlap, Obj<fusion_type>& fusion, const Obj<fuser_type>& fuser = new fuser_type()){
211:         __computeFusionNew(graph, overlap, fusion, fuser);
212:       };

214:       template <typename Overlap_>
215:       static Obj<fusion_type>
216:       fusion(const Obj<graph_type>& graphA, const Obj<graph_type>& graphB, const Obj<Overlap_>& overlap, const Obj<fuser_type>& fuser = new fuser_type()) {
217:         Obj<fusion_type> fusion = new fusion_type(graphA->comm());
218:         PetscMPIInt       comp;

220:         MPI_Comm_compare(graphA->comm(), graphB->comm(), &comp);
221:         if (comp != MPI_IDENT) {
222:           throw ALE::Exception("Non-matching communicators for overlap");
223:         }
224:         computeFusion(graphA, graphB, overlap, fusion, fuser);
225:         return fusion;
226:       };

228:       template <typename Overlap_>
229:       static void computeFusion(const Obj<graph_type>& graphA, const Obj<graph_type>& graphB, const Obj<Overlap_>& overlap, Obj<fusion_type>& fusion, const Obj<fuser_type>& fuser = new fuser_type()){
230:         PetscMPIInt       comp;

232:         MPI_Comm_compare(graphA->comm(), graphB->comm(), &comp);
233:         if (comp != MPI_IDENT) {
234:           throw ALE::Exception("Non-matching communicators for overlap");
235:         }
236:         __computeFusionNew(graphA, graphB, overlap, fusion, fuser);
237:       };
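
      // Illustrative sketch (not part of the original source): relating the bases of two
      // sifters A and B on the same communicator.  'MySifter', 'gA' and 'gB' are hypothetical:
      //
      //   typedef ParConeDelta<MySifter> delta;
      //   Obj<delta::bioverlap_type> o = delta::overlap(gA, gB); // throws unless the comms are MPI_IDENT
      //   Obj<delta::fusion_type>    f = delta::fusion(gA, gB, o);
      //   // Base points of o coming from A are tagged ALE::pair<int,Point>(0, p), those from B
      //   // are tagged (1, p), so the two bases coexist in a single sifter.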

239:     protected:
240:       static int                debug;
241:       // Internal type definitions to ensure compatibility with the legacy code in the parallel subroutines
242:       typedef ALE::Point                                Point;
243:       typedef int                                            int32_t;
244:       typedef std::pair<int32_t, int32_t>                    int_pair;
245:       typedef std::set<std::pair<int32_t, int32_t> >         int_pair_set;
246:       typedef std::map<int32_t,int32_t>                      int__int;
247:       typedef std::map<Point, int32_t>                       Point__int;
248:       typedef std::map<Point, std::pair<int32_t,int32_t> >   Point__int_int;
249:       typedef std::map<Point, int_pair_set>                  Point__int_pair_set;

251:     protected:
252:       //--------------------------------------------------------------------------------------------------------
253:       template <typename Sequence>
254:       static void __determinePointOwners(const Obj<graph_type> _graph, const Obj<Sequence>& points, int32_t *LeaseData, int__int& owner) {
255:         PetscErrorCode ierr; // restored from the numbering gap; CHKERROR() below requires ierr
256:         // The Sequence points will be referred to as 'base' throughout, although it may in fact represent a cap.
257:         MPI_Comm comm = _graph->comm();
258:         int  size     = _graph->commSize();
259:         int  rank     = _graph->commRank();

261:         // We need to partition global nodes among lessors, which we do by global prefix
262:         // First we determine the extent of global prefixes and the bounds on the indices with each global prefix.
263:         int minGlobalPrefix = 0;
264:         // Determine the local extent of global domains
265:         for(typename Sequence::iterator point_itor = points->begin(); point_itor != points->end(); point_itor++) {
266:           Point p = (*point_itor);
267:           if((p.prefix < 0) && (p.prefix < minGlobalPrefix)) {
268:             minGlobalPrefix = p.prefix;
269:           }
270:         }
271:         int MinGlobalPrefix;
272:         MPI_Allreduce(&minGlobalPrefix, &MinGlobalPrefix, 1, MPIU_INT, MPI_MIN, comm);
273:         CHKERROR(ierr, "Error in MPI_Allreduce");
274: 
275:         int__int BaseLowerBound, BaseUpperBound; // global quantities computed from the local quantities below
276:         int__int BaseMaxSize;                    // the maximum size of the global base index space by global prefix
277:         int__int BaseSliceScale, BaseSliceSize, BaseSliceOffset;
278: 
279:         if(MinGlobalPrefix < 0) { // if we actually do have global base points
280:           // Determine the upper and lower bounds on the indices of base points with each global prefix.
281:           // We use maps to keep track of these quantities with different global prefixes.
282:           int__int baseLowerBound, baseUpperBound; // local quantities
283:           // Initialize local bound maps with the upper below lower so we can later recognize omitted prefixes.
284:           for(int d = -1; d >= MinGlobalPrefix; d--) {
285:             baseLowerBound[d] = 0; baseUpperBound[d] = -1;
286:           }
287:           // Compute local bounds
288:           for(typename Sequence::iterator point_itor = points->begin(); point_itor != points->end(); point_itor++) {
289:             Point p = (*point_itor);
290:             int d = p.prefix;
291:             int i = p.index;
292:             if(d < 0) { // it is indeed a global prefix
293:               if (i < baseLowerBound[d]) {
294:                 baseLowerBound[d] = i;
295:               }
296:               if (i > baseUpperBound[d]) {
297:                 baseUpperBound[d] = i;
298:               }
299:             }
300:           }
301:           // Compute global bounds
302:           for(int d = -1; d >= MinGlobalPrefix; d--){
303:             int lowerBound, upperBound, maxSize;
304:             MPI_Allreduce(&baseLowerBound[d],&lowerBound,1,MPIU_INT,MPI_MIN,comm);
305:             CHKERROR(ierr, "Error in MPI_Allreduce");
306:             MPI_Allreduce(&baseUpperBound[d],&upperBound,1,MPIU_INT,MPI_MAX,comm);
307:             CHKERROR(ierr, "Error in MPI_Allreduce");
308:             maxSize = upperBound - lowerBound + 1;
309:             if(maxSize > 0) { // there are actually some indices in this global prefix
310:               BaseLowerBound[d] = lowerBound;
311:               BaseUpperBound[d] = upperBound;
312:               BaseMaxSize[d]    = maxSize;
313: 
314:               // Each processor (at least potentially) owns a slice of the base indices within each global prefix.
315:               // The size of the slice with global prefix d is BaseMaxSize[d]/size + 1 (except if rank == size-1,
316:               // where the slice size can be smaller; +1 is for safety).
317: 
318:               // For a non-empty domain d we compute and store the slice size in BaseSliceScale[d] (the 'typical' slice size) and
319:               // BaseSliceSize[d] (the 'actual' slice size, which only differs from 'typical' for the processor with rank == size-1).
320:               // Likewise, each processor has to keep track of the index offset for each slice it owns and stores it in BaseSliceOffset[d].
321:               BaseSliceScale[d]  = BaseMaxSize[d]/size + 1;
322:               BaseSliceSize[d]   = BaseSliceScale[d];
323:               if (rank == size-1) {
324:                 BaseSliceSize[d] =  BaseMaxSize[d] - BaseSliceScale[d]*(size-1);
325:               }
326:               BaseSliceSize[d]   = PetscMax(1,BaseSliceSize[d]);
327:               BaseSliceOffset[d] = BaseLowerBound[d] + BaseSliceScale[d]*rank;
328:             }// if(maxSize > 0)
329:           }
330:         }// if(MinGlobalPrefix < 0)
331: 
332:         for (typename Sequence::iterator point_itor = points->begin(); point_itor != points->end(); point_itor++) {
333:           Point p = (*point_itor);
334:           // Determine which slice p falls into
335:           // ASSUMPTION on Point type
336:           int d = p.prefix;
337:           int i = p.index;
338:           int proc;
339:           if(d < 0) { // global domain -- determine the owner by which slice p falls into
340:             proc = (i-BaseLowerBound[d])/BaseSliceScale[d];
341:           }
342:           else { // local domain -- must refer to a rank within the comm
343:             if(d >= size) {
344:               throw ALE::Exception("Local domain outside of comm size");
345:             }
346:             proc = d;
347:           }
348:           // FIX: keying the owner map on p.index alone drops p.prefix, so points from different global prefixes can collide
349:           owner[p.index]      = proc;
350:           LeaseData[2*proc+1] = 1;                 // flag: we lease at least one point from proc
351:           LeaseData[2*proc]++;                     // count of how many points we lease from proc
352:         }

354:         // Base was empty
355:         if(points->begin() == points->end()) {
356:           for(int p = 0; p < size; p++) {
357:             LeaseData[2*p+0] = 0;
358:             LeaseData[2*p+1] = 0;
359:           }
360:         }
361:       }; // __determinePointOwners()
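
      // Worked example (not part of the original source) of the slice arithmetic above, assuming
      // size = 4 and a single global prefix d whose indices span -3..6:
      //   BaseLowerBound[d] = -3, BaseUpperBound[d] = 6, BaseMaxSize[d] = 10,
      //   BaseSliceScale[d] = 10/4 + 1 = 3, so ranks 0..2 own 3 indices each while rank 3 owns
      //   BaseSliceSize[d] = 10 - 3*3 = 1; BaseSliceOffset[d] = -3 + 3*rank.
      //   A point (d, i) is then owned by proc = (i - (-3))/3; e.g. i = 5 gives proc 2.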


364:       //-------------------------------------------------------------------------------------------------------
365:       #undef  __FUNCT__
366:       #define __FUNCT__ "__computeOverlapNew"  // restored from the numbering gap; the exact name string is assumed
367:       template <typename Overlap_>
368:       static void __computeOverlapNew(const Obj<graph_type>& _graph, Obj<Overlap_>& overlap) {
369:         typedef typename graph_type::traits::baseSequence Sequence;
370:         MPI_Comm       comm = _graph->comm();
371:         int            size = _graph->commSize();
372:         int            rank = _graph->commRank();
373:         PetscObject    petscObj = _graph->petscObj();
374:         PetscMPIInt    tag1, tag2, tag3;
375:         PetscErrorCode ierr; // restored from the numbering gap; CHKERROR() below requires ierr
376:         // The base we are going to work with
377:         Obj<Sequence> points = _graph->base();
378:         // 2 ints per processor: the number of points we buy from it and a purchase flag (0 or 1).
379:         int *BuyData;
380:         PetscMalloc(2*size * sizeof(int), &BuyData);CHKERROR(ierr, "Error in PetscMalloc");
381:         PetscMemzero(BuyData, 2*size * sizeof(int));CHKERROR(ierr, "Error in PetscMemzero");
382:         // Map from points to the process managing its bin (seller)
383:         int__int owner;

385:         // determine owners of each base node and save it in a map
386:         __determinePointOwners(_graph, points, BuyData, owner);

388:         int  msgSize = 3;           // A point is 2 ints, and the cone size is 1
389:         int  BuyCount = 0;          // The number of sellers with which this process (buyer) communicates
390:         int *BuySizes = PETSC_NULL; // The number of points to buy from each seller
391:         int *Sellers = PETSC_NULL;  // The process for each seller
392:         int *offsets = new int[size];
393:         for(int p = 0; p < size; ++p) {BuyCount += BuyData[2*p+1];}
394:         PetscMalloc2(BuyCount,int,&BuySizes,BuyCount,int,&Sellers);CHKERROR(ierr, "Error in PetscMalloc");
395:         for(int p = 0, buyNum = 0; p < size; ++p) {
396:           if (BuyData[2*p]) {
397:             Sellers[buyNum]    = p;
398:             BuySizes[buyNum++] = BuyData[2*p];
399:           }
400:           if (p == 0) {
401:             offsets[p] = 0;
402:           } else {
403:             offsets[p] = offsets[p-1] + msgSize*BuyData[2*(p-1)];
404:           }
405:         }

407:         // All points are bought from someone
408:         int32_t *BuyPoints;
409:         PetscMalloc(msgSize*points->size() *sizeof(int32_t),&BuyPoints);CHKERROR(ierr,"Error in PetscMalloc");
410:         for (typename Sequence::iterator p_itor = points->begin(); p_itor != points->end(); p_itor++) {
411:           BuyPoints[offsets[owner[*p_itor]]++] = (*p_itor).prefix;
412:           BuyPoints[offsets[owner[*p_itor]]++] = (*p_itor).index;
413:           BuyPoints[offsets[owner[*p_itor]]++] = _graph->cone(*p_itor)->size();
414:         }
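
        // Layout sketch (not part of the original source): with msgSize = 3, BuyPoints is a flat
        // array of (prefix, index, coneSize) triples grouped by seller; e.g. two points (0,7) and
        // (-1,2) bought from the same seller occupy
        //   BuyPoints = [ 0, 7, |cone(0,7)|,  -1, 2, |cone(-1,2)|, ... ]
        // and the offsets[] cursors above advance through each seller's block.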
415:         for(int b = 0, o = 0; b < BuyCount; ++b) {
416:           if (offsets[Sellers[b]] - o != msgSize*BuySizes[b]) {
417:             throw ALE::Exception("Invalid point size");
418:           }
419:           o += msgSize*BuySizes[b];
420:         }
421:         delete [] offsets;

423:         int  SellCount;                   // The number of buyers with which this process (seller) communicates
424:         int *SellSizes = PETSC_NULL;      // The number of points to sell to each buyer
425:         int *Buyers = PETSC_NULL;         // The process for each buyer
426:         int  MaxSellSize;                 // The maximum number of messages to be sold to any buyer
427:         int32_t *SellPoints = PETSC_NULL; // The points and cone sizes from all buyers
428:         PetscMaxSum(comm, BuyData, &MaxSellSize, &SellCount);CHKERROR(ierr,"Error in PetscMaxSum");
429:         PetscMalloc2(SellCount,int,&SellSizes,SellCount,int,&Buyers);CHKERROR(ierr, "Error in PetscMalloc");
430:         for(int s = 0; s < SellCount; s++) {
431:           SellSizes[s] = MaxSellSize;
432:           Buyers[s]    = MPI_ANY_SOURCE;
433:         }
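
        // Worked example (not part of the original source): per its use here, PetscMaxSum()
        // combines the BuyData arrays so that each rank p obtains max_q BuyData_q[2*p] and
        // sum_q BuyData_q[2*p+1].  With ranks 1 and 2 buying 4 and 7 points from rank 0, rank 0
        // gets MaxSellSize = max(4,7) = 7 and SellCount = 1+1 = 2, so it prepares two receive
        // slots of up to 7 points (7*msgSize ints) each from MPI_ANY_SOURCE.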

435:         if (debug) {
436:           ostringstream txt;

438:           for(int p = 0; p < (int) points->size(); p++) {
439:             txt << "["<<rank<<"]: BuyPoints["<<p<<"]: ("<<BuyPoints[p*msgSize]<<", "<<BuyPoints[p*msgSize+1]<<") coneSize "<<BuyPoints[p*msgSize+2]<<std::endl;
440:           }
441:           PetscSynchronizedPrintf(comm, txt.str().c_str()); CHKERROR(ierr, "Error in PetscSynchronizedPrintf");
442:           PetscSynchronizedFlush(comm);CHKERROR(ierr,"Error in PetscSynchronizedFlush");
443:         }

445:         // First tell sellers which points we want to buy
446:         PetscObjectGetNewTag(petscObj, &tag1); CHKERROR(ierr, "Failed on PetscObjectGetNewTag");
447:         commCycle(comm, tag1, msgSize, BuyCount, BuySizes, Sellers, BuyPoints, SellCount, SellSizes, Buyers, &SellPoints);

449:         if (debug) {
450:           ostringstream txt;

452:           if (!rank) {txt << "Unsquished" << std::endl;}
453:           for(int p = 0; p < SellCount*MaxSellSize; p++) {
454:             txt << "["<<rank<<"]: SellPoints["<<p<<"]: ("<<SellPoints[p*msgSize]<<", "<<SellPoints[p*msgSize+1]<<") coneSize "<<SellPoints[p*msgSize+2]<<std::endl;
455:           }
456:           PetscSynchronizedPrintf(comm, txt.str().c_str()); CHKERROR(ierr, "Error in PetscSynchronizedPrintf");
457:           PetscSynchronizedFlush(comm);CHKERROR(ierr,"Error in PetscSynchronizedFlush");
458:         }

460:         // Since we gave maximum sizes, we need to squeeze SellPoints
461:         for(int s = 0, offset = 0; s < SellCount; s++) {
462:           if (offset != s*MaxSellSize*msgSize) {
463:             PetscMemmove(&SellPoints[offset], &SellPoints[s*MaxSellSize*msgSize], SellSizes[s]*msgSize*sizeof(int32_t));CHKERROR(ierr,"Error in PetscMemmove");
464:           }
465:           offset += SellSizes[s]*msgSize;
466:         }

468:         if (debug) {
469:           ostringstream txt;
470:           int SellSize = 0;

472:           if (!rank) {txt << "Squished" << std::endl;}
473:           for(int s = 0; s < SellCount; s++) {
474:             SellSize += SellSizes[s];
475:             txt << "SellSizes["<<s<<"]: "<<SellSizes[s]<< std::endl;
476:           }
477:           for(int p = 0; p < SellSize; p++) {
478:             txt << "["<<rank<<"]: SellPoints["<<p<<"]: ("<<SellPoints[p*msgSize]<<", "<<SellPoints[p*msgSize+1]<<") coneSize "<<SellPoints[p*msgSize+2]<<std::endl;
479:           }
480:           PetscSynchronizedPrintf(comm, txt.str().c_str()); CHKERROR(ierr, "Error in PetscSynchronizedPrintf");
481:           PetscSynchronizedFlush(comm);CHKERROR(ierr,"Error in PetscSynchronizedFlush");
482:         }

484:         // SellSizes, Buyers, and SellPoints are output
485:         Point__int_pair_set BillOfSale;

487:         for(int s = 0, offset = 0; s < SellCount; s++) {
488:           for(int m = 0; m < SellSizes[s]; m++) {
489:             Point point = Point(SellPoints[offset], SellPoints[offset+1]);

491:             BillOfSale[point].insert(int_pair(Buyers[s], SellPoints[offset+2]));
492:             offset += msgSize;
493:           }
494:         }
495:         for(int s = 0, offset = 0; s < SellCount; s++) {
496:           for(int m = 0; m < SellSizes[s]; m++) {
497:             Point point = Point(SellPoints[offset], SellPoints[offset+1]);

499:             // Decrement the buyer count so as not to count the current buyer itself
500:             SellPoints[offset+2] = BillOfSale[point].size()-1;
501:             offset += msgSize;
502:           }
503:         }
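
        // Worked example (not part of the original source): if base point p was bought by buyers
        // {1, 4, 6}, BillOfSale[p] now holds three (buyer, cone size) records and the second pass
        // rewrites each buyer's SellPoints[offset+2] to 2, the number of *other* buyers; each
        // buyer later reads 2+1 = 3 (rank, cone size) records, one per buyer including itself.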

505:         // Then tell buyers how many other buyers there were
506:         PetscObjectGetNewTag(petscObj, &tag2); CHKERROR(ierr, "Failed on PetscObjectGetNewTag");
507:         commCycle(comm, tag2, msgSize, SellCount, SellSizes, Buyers, SellPoints, BuyCount, BuySizes, Sellers, &BuyPoints);

509:         int      BuyConesSize  = 0;
510:         int      SellConesSize = 0;
511:         int     *BuyConesSizes = PETSC_NULL;  // The number of (rank, cone size) records to receive from each seller
512:         int     *SellConesSizes = PETSC_NULL; // The number of (rank, cone size) records to send to each buyer
513:         int32_t *SellCones = PETSC_NULL;      // The (rank, cone size) records sent to buyers for each sold point
514:         int32_t *overlapInfo = PETSC_NULL;    // The (rank, cone size) records received back for each bought point
515:         PetscMalloc2(BuyCount,int,&BuyConesSizes,SellCount,int,&SellConesSizes);CHKERROR(ierr, "Error in PetscMalloc");
516:         for(int s = 0, offset = 0; s < SellCount; s++) {
517:           SellConesSizes[s] = 0;

519:           for(int m = 0; m < SellSizes[s]; m++) {
520:             SellConesSizes[s] += SellPoints[offset+2]+1;
521:             offset            += msgSize;
522:           }
523:           SellConesSize += SellConesSizes[s];
524:         }

526:         for(int b = 0, offset = 0; b < BuyCount; b++) {
527:           BuyConesSizes[b] = 0;

529:           for(int m = 0; m < BuySizes[b]; m++) {
530:             BuyConesSizes[b] += BuyPoints[offset+2]+1;
531:             offset           += msgSize;
532:           }
533:           BuyConesSize += BuyConesSizes[b];
534:         }

536:         int cMsgSize = 2;
537:         PetscMalloc(SellConesSize*cMsgSize * sizeof(int32_t), &SellCones);CHKERROR(ierr, "Error in PetscMalloc");
538:         for(int s = 0, offset = 0, cOffset = 0, SellConeSize = 0; s < SellCount; s++) {
539:           for(int m = 0; m < SellSizes[s]; m++) {
540:             Point point(SellPoints[offset],SellPoints[offset+1]);

542:             for(typename int_pair_set::iterator p_iter = BillOfSale[point].begin(); p_iter != BillOfSale[point].end(); ++p_iter) {
543:               SellCones[cOffset+0] = (*p_iter).first;
544:               SellCones[cOffset+1] = (*p_iter).second;
545:               cOffset += cMsgSize;
546:             }
547:             offset += msgSize;
548:           }
549:           if (cOffset - cMsgSize*SellConeSize != cMsgSize*SellConesSizes[s]) {
550:             throw ALE::Exception("Nonmatching sizes");
551:           }
552:           SellConeSize += SellConesSizes[s];
553:         }

555:         // Then send buyers a (rank, cone size) for all buyers of the same points
556:         PetscObjectGetNewTag(petscObj, &tag3); CHKERROR(ierr, "Failed on PetscObjectGetNewTag");
557:         commCycle(comm, tag3, cMsgSize, SellCount, SellConesSizes, Buyers, SellCones, BuyCount, BuyConesSizes, Sellers, &overlapInfo);

559:         // Finally build the overlap sifter
560:         //   (remote rank) ---(base overlap point, remote cone size, local cone size)---> (base overlap point)
561:         for(int b = 0, offset = 0, cOffset = 0; b < BuyCount; b++) {
562:           for(int m = 0; m < BuySizes[b]; m++) {
563:             Point p(BuyPoints[offset],BuyPoints[offset+1]);

565:             for(int n = 0; n <= BuyPoints[offset+2]; n++) {
566:               int neighbor = overlapInfo[cOffset+0];
567:               int coneSize = overlapInfo[cOffset+1];

569:               if (neighbor != rank) {
570:                 // Record the point, size of the cone over p coming in from neighbor, and going out to the neighbor for the arrow color
571:                 overlap->addArrow(neighbor, p, ALE::pair<Point,ALE::pair<int,int> >(p, ALE::pair<int,int>(coneSize, _graph->cone(p)->size())) );
572:               }
573:               cOffset += cMsgSize;
574:             }
575:             offset += msgSize;
576:           }
577:         }
578:       };
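
      // Illustrative sketch (not part of the original source): if ranks 1 and 2 both hold base
      // point p = (0,7) with local cone sizes 3 and 5 respectively, rank 1's overlap gains
      //   2 --(p, ALE::pair<int,int>(5, 3))--> p
      // i.e. the arrow color carries (remote cone size, local cone size), and rank 2 gains the
      // mirror arrow 1 --(p, (3, 5))--> p.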

580:       #undef  __FUNCT__
581:       #define __FUNCT__ "__computeOverlapNew"  // restored from the numbering gap; the exact name string is assumed
582:       template <typename Overlap_>
583:       static void __computeOverlapNew(const Obj<graph_type>& _graphA, const Obj<graph_type>& _graphB, Obj<Overlap_>& overlap) {
584:         typedef typename graph_type::traits::baseSequence Sequence;
585:         MPI_Comm       comm = _graphA->comm();
586:         int            size = _graphA->commSize();
587:         int            rank = _graphA->commRank();
588:         PetscObject    petscObj = _graphA->petscObj();
589:         PetscMPIInt    tag1, tag2, tag3, tag4, tag5, tag6;
590:         PetscErrorCode ierr; // restored from the numbering gap; CHKERROR() below requires ierr
591:         // The bases we are going to work with
592:         Obj<Sequence> pointsA = _graphA->base();
593:         Obj<Sequence> pointsB = _graphB->base();

595:         // We MUST have the same sellers for points in A and B (same point owner determination)
596:         int *BuyDataA; // 2 ints per processor: number of A base points we buy from it and a purchase flag (0 or 1).
597:         int *BuyDataB; // 2 ints per processor: number of B base points we buy from it and a purchase flag (0 or 1).
598:         PetscMalloc2(2*size,int,&BuyDataA,2*size,int,&BuyDataB);CHKERROR(ierr, "Error in PetscMalloc");
599:         PetscMemzero(BuyDataA, 2*size * sizeof(int));CHKERROR(ierr, "Error in PetscMemzero");
600:         PetscMemzero(BuyDataB, 2*size * sizeof(int));CHKERROR(ierr, "Error in PetscMemzero");
601:         // Map from points to the process managing its bin (seller)
602:         int__int ownerA, ownerB;

604:         // determine owners of each base node and save it in a map
605:         __determinePointOwners(_graphA, pointsA, BuyDataA, ownerA);
606:         __determinePointOwners(_graphB, pointsB, BuyDataB, ownerB);

608:         int  msgSize = 3;   // A point is 2 ints, and the cone size is 1
609:         int  BuyCountA = 0; // The number of sellers with which this process (A buyer) communicates
610:         int  BuyCountB = 0; // The number of sellers with which this process (B buyer) communicates
611:         int *BuySizesA;     // The number of A points to buy from each seller
612:         int *BuySizesB;     // The number of B points to buy from each seller
613:         int *SellersA;      // The process for each seller of A points
614:         int *SellersB;      // The process for each seller of B points
615:         int *offsetsA = new int[size];
616:         int *offsetsB = new int[size];
617:         for(int p = 0; p < size; ++p) {
618:           BuyCountA += BuyDataA[2*p+1];
619:           BuyCountB += BuyDataB[2*p+1];
620:         }
621:         PetscMalloc2(BuyCountA,int,&BuySizesA,BuyCountA,int,&SellersA);CHKERROR(ierr, "Error in PetscMalloc");
622:         PetscMalloc2(BuyCountB,int,&BuySizesB,BuyCountB,int,&SellersB);CHKERROR(ierr, "Error in PetscMalloc");
623:         for(int p = 0, buyNumA = 0, buyNumB = 0; p < size; ++p) {
624:           if (BuyDataA[2*p+1]) {
625:             SellersA[buyNumA]    = p;
626:             BuySizesA[buyNumA++] = BuyDataA[2*p];
627:           }
628:           if (BuyDataB[2*p+1]) {
629:             SellersB[buyNumB]    = p;
630:             BuySizesB[buyNumB++] = BuyDataB[2*p];
631:           }
632:           if (p == 0) {
633:             offsetsA[p] = 0;
634:             offsetsB[p] = 0;
635:           } else {
636:             offsetsA[p] = offsetsA[p-1] + msgSize*BuyDataA[2*(p-1)];
637:             offsetsB[p] = offsetsB[p-1] + msgSize*BuyDataB[2*(p-1)];
638:           }
639:         }

641:         // All points are bought from someone
642:         int32_t *BuyPointsA; // (point, coneSize) for each A point bought from a seller
643:         int32_t *BuyPointsB; // (point, coneSize) for each B point bought from a seller
644:         PetscMalloc2(msgSize*pointsA->size(),int32_t,&BuyPointsA,msgSize*pointsB->size(),int32_t,&BuyPointsB);CHKERROR(ierr,"Error in PetscMalloc");
645:         for (typename Sequence::iterator p_itor = pointsA->begin(); p_itor != pointsA->end(); p_itor++) {
646:           BuyPointsA[offsetsA[ownerA[*p_itor]]++] = (*p_itor).prefix;
647:           BuyPointsA[offsetsA[ownerA[*p_itor]]++] = (*p_itor).index;
648:           BuyPointsA[offsetsA[ownerA[*p_itor]]++] = _graphA->cone(*p_itor)->size();
649:         }
650:         for (typename Sequence::iterator p_itor = pointsB->begin(); p_itor != pointsB->end(); p_itor++) {
651:           BuyPointsB[offsetsB[ownerB[*p_itor]]++] = (*p_itor).prefix;
652:           BuyPointsB[offsetsB[ownerB[*p_itor]]++] = (*p_itor).index;
653:           BuyPointsB[offsetsB[ownerB[*p_itor]]++] = _graphB->cone(*p_itor)->size();
654:         }
655:         for(int b = 0, o = 0; b < BuyCountA; ++b) {
656:           if (offsetsA[SellersA[b]] - o != msgSize*BuySizesA[b]) {
657:             throw ALE::Exception("Invalid A point size");
658:           }
659:           o += msgSize*BuySizesA[b];
660:         }
661:         for(int b = 0, o = 0; b < BuyCountB; ++b) {
662:           if (offsetsB[SellersB[b]] - o != msgSize*BuySizesB[b]) {
663:             throw ALE::Exception("Invalid B point size");
664:           }
665:           o += msgSize*BuySizesB[b];
666:         }
667:         delete [] offsetsA;
668:         delete [] offsetsB;

670:         int  SellCountA;     // The number of A point buyers with which this process (seller) communicates
671:         int  SellCountB;     // The number of B point buyers with which this process (seller) communicates
672:         int *SellSizesA;     // The number of A points to sell to each buyer
673:         int *SellSizesB;     // The number of B points to sell to each buyer
674:         int *BuyersA;        // The process for each A point buyer
675:         int *BuyersB;        // The process for each B point buyer
676:         int  MaxSellSizeA;   // The maximum number of messages to be sold to any A point buyer
677:         int  MaxSellSizeB;   // The maximum number of messages to be sold to any B point buyer
678:         int32_t *SellPointsA = PETSC_NULL; // The points and cone sizes from all buyers
679:         int32_t *SellPointsB = PETSC_NULL; // The points and cone sizes from all buyers
680:         PetscMaxSum(comm, BuyDataA, &MaxSellSizeA, &SellCountA);CHKERROR(ierr,"Error in PetscMaxSum");
681:         PetscMaxSum(comm, BuyDataB, &MaxSellSizeB, &SellCountB);CHKERROR(ierr,"Error in PetscMaxSum");
682:         PetscMalloc2(SellCountA,int,&SellSizesA,SellCountA,int,&BuyersA);CHKERROR(ierr, "Error in PetscMalloc");
683:         PetscMalloc2(SellCountB,int,&SellSizesB,SellCountB,int,&BuyersB);CHKERROR(ierr, "Error in PetscMalloc");
684:         for(int s = 0; s < SellCountA; s++) {
685:           SellSizesA[s] = MaxSellSizeA;
686:           BuyersA[s]    = MPI_ANY_SOURCE;
687:         }
688:         for(int s = 0; s < SellCountB; s++) {
689:           SellSizesB[s] = MaxSellSizeB;
690:           BuyersB[s]    = MPI_ANY_SOURCE;
691:         }

693:         if (debug) {
694:           ostringstream txt;

696:           for(int s = 0; s < BuyCountA; s++) {
697:             txt << "BuySizesA["<<s<<"]: "<<BuySizesA[s]<<" from seller "<<SellersA[s]<< std::endl;
698:           }
699:           for(int p = 0; p < (int) pointsA->size(); p++) {
700:             txt << "["<<rank<<"]: BuyPointsA["<<p<<"]: ("<<BuyPointsA[p*msgSize]<<", "<<BuyPointsA[p*msgSize+1]<<") coneSize "<<BuyPointsA[p*msgSize+2]<<std::endl;
701:           }
702:           for(int s = 0; s < BuyCountB; s++) {
703:             txt << "BuySizesB["<<s<<"]: "<<BuySizesB[s]<<" from seller "<<SellersB[s]<< std::endl;
704:           }
705:           for(int p = 0; p < (int) pointsB->size(); p++) {
706:             txt << "["<<rank<<"]: BuyPointsB["<<p<<"]: ("<<BuyPointsB[p*msgSize]<<", "<<BuyPointsB[p*msgSize+1]<<") coneSize "<<BuyPointsB[p*msgSize+2]<<std::endl;
707:           }
708:           PetscSynchronizedPrintf(comm, txt.str().c_str()); CHKERROR(ierr, "Error in PetscSynchronizedPrintf");
709:           PetscSynchronizedFlush(comm);CHKERROR(ierr,"Error in PetscSynchronizedFlush");
710:         }

712:         // First tell sellers which points we want to buy
713:         //   SellSizes, Buyers, and SellPoints are output
714:         PetscObjectGetNewTag(petscObj, &tag1); CHKERROR(ierr, "Failed on PetscObjectGetNewTag");
715:         commCycle(comm, tag1, msgSize, BuyCountA, BuySizesA, SellersA, BuyPointsA, SellCountA, SellSizesA, BuyersA, &SellPointsA);
716:         PetscObjectGetNewTag(petscObj, &tag2); CHKERROR(ierr, "Failed on PetscObjectGetNewTag");
717:         commCycle(comm, tag2, msgSize, BuyCountB, BuySizesB, SellersB, BuyPointsB, SellCountB, SellSizesB, BuyersB, &SellPointsB);

719:         if (debug) {
720:           ostringstream txt;

722:           if (!rank) {txt << "Unsquished" << std::endl;}
723:           for(int p = 0; p < SellCountA*MaxSellSizeA; p++) {
724:             txt << "["<<rank<<"]: SellPointsA["<<p<<"]: ("<<SellPointsA[p*msgSize]<<", "<<SellPointsA[p*msgSize+1]<<") coneSize "<<SellPointsA[p*msgSize+2]<<std::endl;
725:           }
726:           for(int p = 0; p < SellCountB*MaxSellSizeB; p++) {
727:             txt << "["<<rank<<"]: SellPointsB["<<p<<"]: ("<<SellPointsB[p*msgSize]<<", "<<SellPointsB[p*msgSize+1]<<") coneSize "<<SellPointsB[p*msgSize+2]<<std::endl;
728:           }
729:           PetscSynchronizedPrintf(comm, txt.str().c_str()); CHKERROR(ierr, "Error in PetscSynchronizedPrintf");
730:           PetscSynchronizedFlush(comm);CHKERROR(ierr,"Error in PetscSynchronizedFlush");
731:         }

733:         // Since we gave maximum sizes, we need to squeeze SellPoints
734:         for(int s = 0, offset = 0; s < SellCountA; s++) {
735:           if (offset != s*MaxSellSizeA*msgSize) {
736:             PetscMemmove(&SellPointsA[offset], &SellPointsA[s*MaxSellSizeA*msgSize], SellSizesA[s]*msgSize*sizeof(int32_t));CHKERROR(ierr,"Error in PetscMemmove");
737:           }
738:           offset += SellSizesA[s]*msgSize;
739:         }
740:         for(int s = 0, offset = 0; s < SellCountB; s++) {
741:           if (offset != s*MaxSellSizeB*msgSize) {
742:             PetscMemmove(&SellPointsB[offset], &SellPointsB[s*MaxSellSizeB*msgSize], SellSizesB[s]*msgSize*sizeof(int32_t));CHKERROR(ierr,"Error in PetscMemmove");
743:           }
744:           offset += SellSizesB[s]*msgSize;
745:         }

747:         if (debug) {
748:           ostringstream txt;
749:           int SellSizeA = 0, SellSizeB = 0;

751:           if (!rank) {txt << "Squished" << std::endl;}
752:           for(int s = 0; s < SellCountA; s++) {
753:             SellSizeA += SellSizesA[s];
754:             txt << "SellSizesA["<<s<<"]: "<<SellSizesA[s]<<" from buyer "<<BuyersA[s]<< std::endl;
755:           }
756:           for(int p = 0; p < SellSizeA; p++) {
757:             txt << "["<<rank<<"]: SellPointsA["<<p<<"]: ("<<SellPointsA[p*msgSize]<<", "<<SellPointsA[p*msgSize+1]<<") coneSize "<<SellPointsA[p*msgSize+2]<<std::endl;
758:           }
759:           for(int s = 0; s < SellCountB; s++) {
760:             SellSizeB += SellSizesB[s];
761:             txt << "SellSizesB["<<s<<"]: "<<SellSizesB[s]<<" from buyer "<<BuyersB[s]<< std::endl;
762:           }
763:           for(int p = 0; p < SellSizeB; p++) {
764:             txt << "["<<rank<<"]: SellPointsB["<<p<<"]: ("<<SellPointsB[p*msgSize]<<", "<<SellPointsB[p*msgSize+1]<<") coneSize "<<SellPointsB[p*msgSize+2]<<std::endl;
765:           }
766:           PetscSynchronizedPrintf(comm, txt.str().c_str()); CHKERROR(ierr, "Error in PetscSynchronizedPrintf");
767:           PetscSynchronizedFlush(comm);CHKERROR(ierr,"Error in PetscSynchronizedFlush");
768:         }

770:         // Map from A base points to (B process, B coneSize) pairs
771:         Point__int_pair_set BillOfSaleAtoB;
772:         // Map from B base points to (A process, A coneSize) pairs
773:         Point__int_pair_set BillOfSaleBtoA;

775:         // Find the A points being sold to B buyers and record the B cone size
776:         for(int s = 0, offset = 0; s < SellCountA; s++) {
777:           for(int m = 0; m < SellSizesA[s]; m++) {
778:             Point point = Point(SellPointsA[offset], SellPointsA[offset+1]);
779:             // Just insert the point; operator[] default-constructs an empty set for it
780:             int size = BillOfSaleAtoB[point].size();
781:             // size is always 0 in this loop; the test merely uses 'size' to avoid an unused-variable warning
782:             if (!size) offset += msgSize;
783:           }
784:         }
785:         for(int s = 0, offset = 0; s < SellCountB; s++) {
786:           for(int m = 0; m < SellSizesB[s]; m++) {
787:             Point point = Point(SellPointsB[offset], SellPointsB[offset+1]);

789:             if (BillOfSaleAtoB.find(point) != BillOfSaleAtoB.end()) {
790:               BillOfSaleAtoB[point].insert(int_pair(BuyersB[s], SellPointsB[offset+2]));
791:             }
792:             offset += msgSize;
793:           }
794:         }
795:         // Find the B points being sold to A buyers and record the A cone size
796:         for(int s = 0, offset = 0; s < SellCountB; s++) {
797:           for(int m = 0; m < SellSizesB[s]; m++) {
798:             Point point = Point(SellPointsB[offset], SellPointsB[offset+1]);
799:             // Just insert the point; operator[] default-constructs an empty set for it
800:             int size = BillOfSaleBtoA[point].size();
801:             // size is always 0 in this loop; the test merely uses 'size' to avoid an unused-variable warning
802:             if (!size) offset += msgSize;
803:           }
804:         }
805:         for(int s = 0, offset = 0; s < SellCountA; s++) {
806:           for(int m = 0; m < SellSizesA[s]; m++) {
807:             Point point = Point(SellPointsA[offset], SellPointsA[offset+1]);

809:             if (BillOfSaleBtoA.find(point) != BillOfSaleBtoA.end()) {
810:               BillOfSaleBtoA[point].insert(int_pair(BuyersA[s], SellPointsA[offset+2]));
811:             }
812:             offset += msgSize;
813:           }
814:         }
815:         // Calculate number of B buyers for A base points
816:         for(int s = 0, offset = 0; s < SellCountA; s++) {
817:           for(int m = 0; m < SellSizesA[s]; m++) {
818:             Point point = Point(SellPointsA[offset], SellPointsA[offset+1]);

820:             SellPointsA[offset+2] = BillOfSaleAtoB[point].size();
821:             offset += msgSize;
822:           }
823:         }
824:         // Calculate number of A buyers for B base points
825:         for(int s = 0, offset = 0; s < SellCountB; s++) {
826:           for(int m = 0; m < SellSizesB[s]; m++) {
827:             Point point = Point(SellPointsB[offset], SellPointsB[offset+1]);

829:             SellPointsB[offset+2] = BillOfSaleBtoA[point].size();
830:             offset += msgSize;
831:           }
832:         }

834:         // Tell A buyers how many B buyers there were (contained in BuyPointsA)
835:         // Tell B buyers how many A buyers there were (contained in BuyPointsB)
836:         PetscObjectGetNewTag(petscObj, &tag3); CHKERROR(ierr, "Failed on PetscObjectGetNewTag");
837:         commCycle(comm, tag3, msgSize, SellCountA, SellSizesA, BuyersA, SellPointsA, BuyCountA, BuySizesA, SellersA, &BuyPointsA);
838:         PetscObjectGetNewTag(petscObj, &tag4); CHKERROR(ierr, "Failed on PetscObjectGetNewTag");
839:         commCycle(comm, tag4, msgSize, SellCountB, SellSizesB, BuyersB, SellPointsB, BuyCountB, BuySizesB, SellersB, &BuyPointsB);

841:         if (debug) {
842:           ostringstream txt;
843:           int BuySizeA = 0, BuySizeB = 0;

845:           if (!rank) {txt << "Got other B and A buyers" << std::endl;}
846:           for(int s = 0; s < BuyCountA; s++) {
847:             BuySizeA += BuySizesA[s];
848:             txt << "BuySizesA["<<s<<"]: "<<BuySizesA[s]<<" from seller "<<SellersA[s]<< std::endl;
849:           }
850:           for(int p = 0; p < BuySizeA; p++) {
851:             txt << "["<<rank<<"]: BuyPointsA["<<p<<"]: ("<<BuyPointsA[p*msgSize]<<", "<<BuyPointsA[p*msgSize+1]<<") B buyers "<<BuyPointsA[p*msgSize+2]<<std::endl;
852:           }
853:           for(int s = 0; s < BuyCountB; s++) {
854:             BuySizeB += BuySizesB[s];
855:             txt << "BuySizesB["<<s<<"]: "<<BuySizesB[s]<<" from seller "<<SellersB[s]<< std::endl;
856:           }
857:           for(int p = 0; p < BuySizeB; p++) {
858:             txt << "["<<rank<<"]: BuyPointsB["<<p<<"]: ("<<BuyPointsB[p*msgSize]<<", "<<BuyPointsB[p*msgSize+1]<<") A buyers "<<BuyPointsB[p*msgSize+2]<<std::endl;
859:           }
860:           PetscSynchronizedPrintf(comm, txt.str().c_str()); CHKERROR(ierr, "Error in PetscSynchronizedPrintf");
861:           PetscSynchronizedFlush(comm);CHKERROR(ierr,"Error in PetscSynchronizedFlush");
862:         }

864:         int      BuyConesSizeA  = 0;
865:         int      BuyConesSizeB  = 0;
866:         int      SellConesSizeA = 0;
867:         int      SellConesSizeB = 0;
868:         int     *BuyConesSizesA;  // The number of (rank, cone size) records to receive for A points from each seller
869:         int     *BuyConesSizesB;  // The number of (rank, cone size) records to receive for B points from each seller
870:         int     *SellConesSizesA; // The number of (rank, cone size) records to send for A points to each buyer
871:         int     *SellConesSizesB; // The number of (rank, cone size) records to send for B points to each buyer
872:         int32_t *SellConesA;      // The (rank, B cone size) for each A point from all other B buyers
873:         int32_t *SellConesB;      // The (rank, A cone size) for each B point from all other A buyers
874:         int32_t *overlapInfoA = PETSC_NULL; // The (rank, B cone size) for each A point from all other B buyers
875:         int32_t *overlapInfoB = PETSC_NULL; // The (rank, A cone size) for each B point from all other A buyers
876:         PetscMalloc2(BuyCountA,int,&BuyConesSizesA,SellCountA,int,&SellConesSizesA);CHKERROR(ierr, "Error in PetscMalloc");
877:         PetscMalloc2(BuyCountB,int,&BuyConesSizesB,SellCountB,int,&SellConesSizesB);CHKERROR(ierr, "Error in PetscMalloc");
878:         for(int s = 0, offset = 0; s < SellCountA; s++) {
879:           SellConesSizesA[s] = 0;

881:           for(int m = 0; m < SellSizesA[s]; m++) {
882:             SellConesSizesA[s] += SellPointsA[offset+2];
883:             offset             += msgSize;
884:           }
885:           SellConesSizeA += SellConesSizesA[s];
886:         }
887:         for(int s = 0, offset = 0; s < SellCountB; s++) {
888:           SellConesSizesB[s] = 0;

890:           for(int m = 0; m < SellSizesB[s]; m++) {
891:             SellConesSizesB[s] += SellPointsB[offset+2];
892:             offset             += msgSize;
893:           }
894:           SellConesSizeB += SellConesSizesB[s];
895:         }

897:         for(int b = 0, offset = 0; b < BuyCountA; b++) {
898:           BuyConesSizesA[b] = 0;

900:           for(int m = 0; m < BuySizesA[b]; m++) {
901:             BuyConesSizesA[b] += BuyPointsA[offset+2];
902:             offset            += msgSize;
903:           }
904:           BuyConesSizeA += BuyConesSizesA[b];
905:         }
906:         for(int b = 0, offset = 0; b < BuyCountB; b++) {
907:           BuyConesSizesB[b] = 0;

909:           for(int m = 0; m < BuySizesB[b]; m++) {
910:             BuyConesSizesB[b] += BuyPointsB[offset+2];
911:             offset            += msgSize;
912:           }
913:           BuyConesSizeB += BuyConesSizesB[b];
914:         }

916:         int cMsgSize = 2;
917:         PetscMalloc2(SellConesSizeA*cMsgSize,int32_t,&SellConesA,SellConesSizeB*cMsgSize,int32_t,&SellConesB);CHKERROR(ierr, "Error in PetscMalloc");
918:         for(int s = 0, offset = 0, cOffset = 0, SellConeSize = 0; s < SellCountA; s++) {
919:           for(int m = 0; m < SellSizesA[s]; m++) {
920:             Point point(SellPointsA[offset],SellPointsA[offset+1]);

922:             for(typename int_pair_set::iterator p_iter = BillOfSaleAtoB[point].begin(); p_iter != BillOfSaleAtoB[point].end(); ++p_iter) {
923:               SellConesA[cOffset+0] = (*p_iter).first;
924:               SellConesA[cOffset+1] = (*p_iter).second;
925:               cOffset += cMsgSize;
926:             }
927:             offset += msgSize;
928:           }
929:           if (cOffset - cMsgSize*SellConeSize != cMsgSize*SellConesSizesA[s]) {
930:             throw ALE::Exception("Nonmatching sizes");
931:           }
932:           SellConeSize += SellConesSizesA[s];
933:         }
934:         for(int s = 0, offset = 0, cOffset = 0, SellConeSize = 0; s < SellCountB; s++) {
935:           for(int m = 0; m < SellSizesB[s]; m++) {
936:             Point point(SellPointsB[offset],SellPointsB[offset+1]);

938:             for(typename int_pair_set::iterator p_iter = BillOfSaleBtoA[point].begin(); p_iter != BillOfSaleBtoA[point].end(); ++p_iter) {
939:               SellConesB[cOffset+0] = (*p_iter).first;
940:               SellConesB[cOffset+1] = (*p_iter).second;
941:               cOffset += cMsgSize;
942:             }
943:             offset += msgSize;
944:           }
945:           if (cOffset - cMsgSize*SellConeSize != cMsgSize*SellConesSizesB[s]) {
946:             throw ALE::Exception("Nonmatching sizes");
947:           }
948:           SellConeSize += SellConesSizesB[s];
949:         }

951:         // Then send A buyers a (rank, cone size) for all B buyers of the same points
952:         PetscObjectGetNewTag(petscObj, &tag5); CHKERROR(ierr, "Failed on PetscObjectGetNewTag");
953:         commCycle(comm, tag5, cMsgSize, SellCountA, SellConesSizesA, BuyersA, SellConesA, BuyCountA, BuyConesSizesA, SellersA, &overlapInfoA);
954:         PetscObjectGetNewTag(petscObj, &tag6); CHKERROR(ierr, "Failed on PetscObjectGetNewTag");
955:         commCycle(comm, tag6, cMsgSize, SellCountB, SellConesSizesB, BuyersB, SellConesB, BuyCountB, BuyConesSizesB, SellersB, &overlapInfoB);

957:         // Finally build the A-->B overlap sifter
958:         //   (remote rank) ---(base A overlap point, remote cone size, local cone size)---> (base A overlap point)
959:         for(int b = 0, offset = 0, cOffset = 0; b < BuyCountA; b++) {
960:           for(int m = 0; m < BuySizesA[b]; m++) {
961:             Point p(BuyPointsA[offset],BuyPointsA[offset+1]);

963:             for(int n = 0; n < BuyPointsA[offset+2]; n++) {
964:               int neighbor = overlapInfoA[cOffset+0];
965:               int coneSize = overlapInfoA[cOffset+1];

967:               // Record the point, size of the cone over p coming in from neighbor, and going out to the neighbor for the arrow color
968:               overlap->addArrow(neighbor, ALE::pair<int,Point>(0, p), ALE::pair<Point,ALE::pair<int,int> >(p, ALE::pair<int,int>(coneSize, _graphA->cone(p)->size())) );
969:               cOffset += cMsgSize;
970:             }
971:             offset += msgSize;
972:           }
973:         }

975:         // Finally build the B-->A overlap sifter
976:         //   (remote rank) ---(base B overlap point, remote cone size, local cone size)---> (base B overlap point)
977:         for(int b = 0, offset = 0, cOffset = 0; b < BuyCountB; b++) {
978:           for(int m = 0; m < BuySizesB[b]; m++) {
979:             Point p(BuyPointsB[offset],BuyPointsB[offset+1]);

981:             for(int n = 0; n < BuyPointsB[offset+2]; n++) {
982:               int neighbor = overlapInfoB[cOffset+0];
983:               int coneSize = overlapInfoB[cOffset+1];

985:               // Record the point, size of the cone over p coming in from neighbor, and going out to the neighbor for the arrow color
986:               overlap->addArrow(neighbor, ALE::pair<int,Point>(1, p), ALE::pair<Point,ALE::pair<int,int> >(p, ALE::pair<int,int>(coneSize, _graphB->cone(p)->size())) );
987:               cOffset += cMsgSize;
988:             }
989:             offset += msgSize;
990:           }
991:         }
992:       };
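
      // Illustrative sketch (not part of the original source): for an A base point p on this rank
      // that rank r holds in the base of B with cone size c, the bioverlap gains
      //   r --(p, (c, |local A cone over p|))--> ALE::pair<int,Point>(0, p)
      // while B-side points are targeted as (1, p), letting both bases share one sifter.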

994:       #undef  __FUNCT__
995:       #define __FUNCT__ "__computeOverlap"  // restored from the numbering gap; the exact name string is assumed
996:       template <typename Overlap_>
997:       static void __computeOverlap(const Obj<graph_type>& _graph, Obj<Overlap_>& overlap) {
998:         typedef typename graph_type::traits::baseSequence Sequence;
999:         PetscErrorCode ierr; // restored from the numbering gap; CHKERROR() below requires ierr
1000:         MPI_Comm comm = _graph->comm();
1001:         int      size = _graph->commSize();
1002:         int      rank = _graph->commRank();
1003:         PetscObject petscObj = _graph->petscObj();

1005:         bool debug  = delta_type::debug > 0;
1006:         bool debug2 = delta_type::debug > 1;

1008:         // Allocate space for the ownership data
1009:         int32_t *LeaseData; // 2 ints per processor: number of leased nodes and number of leases (0 or 1).
1010:         PetscMalloc(2*size*sizeof(PetscInt),&LeaseData);CHKERROR(ierr, "Error in PetscMalloc");
1011:         PetscMemzero(LeaseData,2*size*sizeof(PetscInt));CHKERROR(ierr, "Error in PetscMemzero");
1012: 
1013:         // The base we are going to work with
1014:         Obj<Sequence> points = _graph->base();

1016:         // determine owners of each base node and save it in a map
1017:         Point__int owner;
1018:         __determinePointOwners(_graph, _graph->base(), LeaseData, owner);
1019: 
1020:         // Now we accumulate the max lease size and the total number of renters
1021:         // Determine the owners of base nodes and collect the lease data for each processor:
1022:         // the number of nodes leased and the number of leases (0 or 1).
1023:         int32_t MaxLeaseSize, RenterCount;
1024:         PetscMaxSum(comm,LeaseData,&MaxLeaseSize,&RenterCount);CHKERROR(ierr,"Error in PetscMaxSum");
1025:         //PetscInfo1(0,"%s: Number of renters %d\n", __FUNCT__, RenterCount);
1026:         //CHKERROR(ierr,"Error in PetscInfo");

1028:         if(debug) { /* -------------------------------------------------------------- */
1029:           PetscSynchronizedPrintf(comm, "[%d]: %s: RenterCount = %d, MaxLeaseSize = %d\n",
1030:                                          rank, __FUNCT__, RenterCount, MaxLeaseSize);
1031:           CHKERROR(ierr, "Error in PetscSynchronizedPrintf");
1032:           PetscSynchronizedFlush(comm);
1033:           CHKERROR(ierr, "Error in PetscSynchronizedFlush");
1034:         } /* ----------------------------------------------------------------------- */
1035: 
1036:         // post receives for all Rented nodes; we will be receiving 3 data items per rented node,
1037:         // and at most MaxLeaseSize nodes per renter
1038:         PetscMPIInt    tag1;
1039:         PetscObjectGetNewTag(petscObj, &tag1); CHKERROR(ierr, "Failed on PetscObjectGetNewTag");
1040:         int32_t *RentedNodes;
1041:         MPI_Request *Renter_waits;
1042:         if(RenterCount){
1043:           PetscMalloc((RenterCount)*(3*MaxLeaseSize+1)*sizeof(int32_t),&RentedNodes);  CHKERROR(ierr,"Error in PetscMalloc");
1044:           PetscMemzero(RentedNodes,(RenterCount)*(3*MaxLeaseSize+1)*sizeof(int32_t));  CHKERROR(ierr,"Error in PetscMemzero");
1045:           PetscMalloc((RenterCount)*sizeof(MPI_Request),&Renter_waits);                CHKERROR(ierr,"Error in PetscMalloc");
1046:         }
1047:         for (int32_t i=0; i<RenterCount; i++) {
1048:           MPI_Irecv(RentedNodes+3*MaxLeaseSize*i,3*MaxLeaseSize,MPIU_INT,MPI_ANY_SOURCE,tag1,comm,Renter_waits+i);
1049:           CHKERROR(ierr,"Error in MPI_Irecv");
1050:         }
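        // Layout sketch of one incoming lease message (illustrative; ASSUMPTION on Point
        // type as elsewhere in this file): a renter leasing k nodes sends 3*k ints,
        //
        //   [ p0.prefix, p0.index, coneSize(p0),
        //     p1.prefix, p1.index, coneSize(p1), ... ]
        //
        // with k <= MaxLeaseSize, which is why each receive slot above is 3*MaxLeaseSize ints.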
1051: 
1052:         int32_t LessorCount;
1053:         LessorCount = 0; for (int32_t i=0; i<size; i++) LessorCount += LeaseData[2*i+1];
1054:         //PetscInfo1(0,"%s: Number of lessors %d\n",__FUNCT__, LessorCount);
1055:         //CHKERROR(ierr,"Error in PetscInfo");
1056:         if(debug) { /* -------------------------------------------------------------- */
1057:           PetscSynchronizedPrintf(comm, "[%d]: %s: LessorCount = %d\n", rank, __FUNCT__, LessorCount);
1058:           CHKERROR(ierr, "Error in PetscSynchronizedPrintf");
1059:           PetscSynchronizedFlush(comm);
1060:           CHKERROR(ierr, "Error in PetscSynchronizedFlush");
1061:         } /* ----------------------------------------------------------------------- */
1062: 
1063:         // We keep only the data about the real lessors -- those that own the nodes we lease
1064:         int32_t *LeaseSizes, *Lessors;
1065:         if(LessorCount) {
1066:           PetscMalloc(sizeof(int32_t)*(LessorCount), &LeaseSizes); CHKERROR(ierr, "Error in PetscMalloc");
1067:           PetscMalloc(sizeof(int32_t)*(LessorCount), &Lessors);    CHKERROR(ierr, "Error in PetscMalloc");
1068:         }
1069:         // We also need the inverse of the Lessors array: after looking up a rank i in the owner map
1070:         // we must convert it to the corresponding index cntr.  We use a map LessorIndex; it is likely to be small -- ASSUMPTION
1071:         int__int LessorIndex;
1072:         // Traverse all processes in ascending order
1073:         int32_t cntr = 0; // keep track of entered records
1074:         for(int32_t i = 0; i < size; i++) {
1075:           if(LeaseData[2*i]) { // if there are nodes leased from process i, record it
1076:             LeaseSizes[cntr] = LeaseData[2*i];
1077:             Lessors[cntr] = i;
1078:             LessorIndex[i] = cntr;
1079:             cntr++;
1080:           }
1081:         }
1082:         PetscFree(LeaseData); CHKERROR(ierr, "Error in PetscFree");
1083:         if(debug2) { /* ----------------------------------- */
1084:           ostringstream txt;
1085:           txt << "[" << rank << "]: " << __FUNCT__ << ": lessor data [index, rank, lease size]: ";
1086:           for(int32_t i = 0; i < LessorCount; i++) {
1087:             txt << "[" << i << ", " << Lessors[i] << ", " << LeaseSizes[i] << "] ";
1088:           }
1089:           txt << "\n";
1090:           PetscSynchronizedPrintf(comm, txt.str().c_str()); CHKERROR(ierr, "Error in PetscSynchronizedPrintf");
1091:           PetscSynchronizedFlush(comm);CHKERROR(ierr,"Error in PetscSynchronizedFlush");
1092:         }/* -----------------------------------  */
1093:         if(debug2) { /* ----------------------------------- */
1094:           ostringstream txt;
1095:           txt << "[" << rank << "]: " << __FUNCT__ << ": LessorIndex: ";
1096:           for(int__int::iterator li_itor = LessorIndex.begin(); li_itor!= LessorIndex.end(); li_itor++) {
1097:             int32_t i = (*li_itor).first;
1098:             int32_t j = (*li_itor).second;
1099:             txt << i << "-->" << j << "; ";
1100:           }
1101:           txt << "\n";
1102:           PetscSynchronizedPrintf(comm, txt.str().c_str()); CHKERROR(ierr, "Error in PetscSynchronizedPrintf");
1103:           PetscSynchronizedFlush(comm);CHKERROR(ierr,"Error in PetscSynchronizedFlush");
1104:         }/* -----------------------------------  */
1105: 
1106: 
1107:         // Pack messages to the lessors containing the lists of leased base nodes and their cone sizes
1108:         int32_t LeasedNodeCount = points->size(); // all points are considered leased from someone
1109:         int32_t *LeasedNodes;
1110:         int32_t *LessorOffsets;
1111:         // We need 3 ints per leased node -- 2 per Point and 1 for the cone size
1112:         if(LeasedNodeCount) {
1113:           PetscMalloc((3*LeasedNodeCount)*sizeof(PetscInt),&LeasedNodes); CHKERROR(ierr,"Error in PetscMalloc");
1114:         }
1115:         if(LessorCount) {
1116:           PetscMalloc((LessorCount)*sizeof(PetscInt),&LessorOffsets);     CHKERROR(ierr,"Error in PetscMalloc");
1117:           LessorOffsets[0] = 0;
1118:         }
1119:         for (int32_t i=1; i<LessorCount; i++) { LessorOffsets[i] = LessorOffsets[i-1] + 3*LeaseSizes[i-1];}
1120:         for (typename Sequence::iterator point_itor = points->begin(); point_itor != points->end(); point_itor++) {
1121:           Point p = (*point_itor);
1122:           int32_t ow = owner[p];
1123:           int32_t ind  = LessorIndex[ow];
1124:           LeasedNodes[LessorOffsets[ind]++] = p.prefix;
1125:           LeasedNodes[LessorOffsets[ind]++] = p.index;
1126:           LeasedNodes[LessorOffsets[ind]++] = _graph->cone(p)->size();
1127:         }
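        // Packing advanced LessorOffsets past each segment; the offsets are rebuilt below so
        // that the sends again start at the beginning of each lessor's segment.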
1128:         if(LessorCount) {
1129:           LessorOffsets[0] = 0;
1130:         }
1131:         for (int32_t i=1; i<LessorCount; i++) { LessorOffsets[i] = LessorOffsets[i-1] + 3*LeaseSizes[i-1];}
1132: 
1133:         // send the messages to the lessors
1134:         MPI_Request *Lessor_waits;
1135:         if(LessorCount) {
1136:           PetscMalloc((LessorCount)*sizeof(MPI_Request),&Lessor_waits);CHKERROR(ierr,"Error in PetscMalloc");
1137:         }
1138:         for (int32_t i=0; i<LessorCount; i++) {
1139:           MPI_Isend(LeasedNodes+LessorOffsets[i],3*LeaseSizes[i],MPIU_INT,Lessors[i],tag1,comm,&Lessor_waits[i]);
1140:           CHKERROR(ierr,"Error in MPI_Isend");
1141:         }
1142: 
1143:         // Wait on the receive requests and record the identities of the responding renters and their lease sizes
1144:         int__int Renters, RenterLeaseSizes;
1145:         // Prepare to compute the set of renters of each owned node along with the cone sizes held by those renters over the node.
1146:         // Since we don't have a unique ordering on the owned nodes a priori, we will utilize a map.
1147:         Point__int_pair_set NodeRenters;
1148:         cntr  = RenterCount;
1149:         while (cntr) {
1150:           int32_t arrivalNumber;
1151:           MPI_Status Renter_status;
1152:           MPI_Waitany(RenterCount,Renter_waits,&arrivalNumber,&Renter_status);
1153:           CHKMPIERROR(ierr,ERRORMSG("Error in MPI_Waitany"));
1154:           int32_t renter = Renter_status.MPI_SOURCE;
1155:           Renters[arrivalNumber] = renter;
1156:           MPI_Get_count(&Renter_status,MPIU_INT,&RenterLeaseSizes[arrivalNumber]); CHKERROR(ierr,"Error in MPI_Get_count");
1157:           // Since there are 3 ints per leased node, the lease size is computed by dividing the received count by 3;
1158:           RenterLeaseSizes[arrivalNumber] = RenterLeaseSizes[arrivalNumber]/3;
1159:           // Record the renters for each node
1160:           for (int32_t i=0; i<RenterLeaseSizes[arrivalNumber]; i++) {
1161:             // Compute the offset into the RentedNodes array for the arrived lease.
1162:             int32_t LeaseOffset = arrivalNumber*3*MaxLeaseSize;
1163:             // ASSUMPTION on Point type
1164:             Point node = Point(RentedNodes[LeaseOffset + 3*i], RentedNodes[LeaseOffset + 3*i+1]);
1165:             int32_t coneSize = RentedNodes[LeaseOffset + 3*i + 2];
1166:             NodeRenters[node].insert(int_pair(renter,coneSize));
1167:           }
1168:           cntr--;
1169:         }
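        // For example (illustrative values only): if node (0,7) was leased by ranks 1 and 3
        // with cone sizes 4 and 2, then at this point NodeRenters[Point(0,7)] == {(1,4), (3,2)}.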
1170: 
1171:         if (debug) { /* -----------------------------------  */
1172:           // We need to collect all the data to be submitted to PetscSynchronizedPrintf
1173:           // We use a C++ string stream for that
1174:           ostringstream txt;
1175:           for (Point__int_pair_set::iterator nodeRenters_itor=NodeRenters.begin();nodeRenters_itor!= NodeRenters.end();nodeRenters_itor++) {
1176:             Point node = (*nodeRenters_itor).first;
1177:             int_pair_set renterSet   = (*nodeRenters_itor).second;
1178:             // ASSUMPTION on point type
1179:             txt << "[" << rank << "]: " << __FUNCT__ << ": node (" << node.prefix << "," << node.index << ") is rented by " << renterSet.size() << " renters (renter, cone size):  ";
1180:             for (int_pair_set::iterator renterSet_itor = renterSet.begin(); renterSet_itor != renterSet.end(); renterSet_itor++)
1181:             {
1182:               txt << "(" << (*renterSet_itor).first << "," << (*renterSet_itor).second << ") ";
1183:             }
1184:             txt << "\n";
1185:           }
1186:           // Now send the C-string behind txt to PetscSynchronizedPrintf
1187:           PetscSynchronizedPrintf(comm, txt.str().c_str()); CHKERROR(ierr, "Error in PetscSynchronizedPrintf");
1188:           PetscSynchronizedFlush(comm);CHKERROR(ierr,"Error in PetscSynchronizedFlush");
1189:         }/* -----------------------------------  */
1190: 
1191:         // wait on the original sends to the lessors
1192:         MPI_Status *Lessor_status;
1193:         if (LessorCount) {
1194:           PetscMalloc((LessorCount)*sizeof(MPI_Status),&Lessor_status); CHKERROR(ierr,"Error in PetscMalloc");
1195:           MPI_Waitall(LessorCount,Lessor_waits,Lessor_status);          CHKERROR(ierr,"Error in MPI_Waitall");
1196:         }
1197: 
1198: 
1199:         // Neighbor counts: here the renters receive from the lessors the number of other renters sharing each leased node.
1200:         // Prepare to receive three integers per leased node: two for the node itself and one for the number of neighbors over that node.
1201:         // The buffer has the same structure as LeasedNodes, hence LessorOffsets can be reused.
1202:         // IMPROVE: can probably reduce the message size by a factor of 3 if we assume an ordering on the nodes received from each lessor.
1203:         // ASSUMPTION on Point type
1204:         int32_t *NeighborCounts;
1205:         if(LeasedNodeCount) {
1206:           PetscMalloc(3*(LeasedNodeCount)*sizeof(PetscInt),&NeighborCounts); CHKERROR(ierr,"Error in PetscMalloc");
1207:         }
1208:         // Post receives for NeighborCounts
1209:         PetscMPIInt    tag2;
1210:         PetscObjectGetNewTag(petscObj, &tag2); CHKERROR(ierr, "Failed on PetscObjectGetNewTag");
1211:         for (int32_t i=0; i<LessorCount; i++) {
1212:           MPI_Irecv(NeighborCounts+LessorOffsets[i],3*LeaseSizes[i],MPIU_INT,Lessors[i],tag2,comm,&Lessor_waits[i]);
1213:           CHKERROR(ierr,"Error in MPI_Irecv");
1214:         }
1215:         // pack and send messages back to renters; we need to send 3 integers per rental (2 for Point, 1 for sharer count)
1216:         // grouped by the renter
1217:         // ASSUMPTION on Point type
1218:         // first we compute the total number of rentals
1219:         int32_t TotalRentalCount = 0;
1220:         for(Point__int_pair_set::iterator nodeRenters_itor=NodeRenters.begin();nodeRenters_itor!=NodeRenters.end();nodeRenters_itor++){
1221:           TotalRentalCount += (*nodeRenters_itor).second.size();
1222:         }
1223:         if(debug2) {
1224:           PetscSynchronizedPrintf(comm, "[%d]: TotalRentalCount %d\n", rank, TotalRentalCount);
1225:           CHKERROR(ierr, "Error in PetscSynchronizedPrintf");
1226:           PetscSynchronizedFlush(comm); CHKERROR(ierr, "PetscSynchronizedFlush");
1227:         }/* -----------------------------------  */
1228: 
1229:         // Allocate sharer counts array for all rentals
1230:         int32_t *SharerCounts;
1231:         if(TotalRentalCount) {
1232:           PetscMalloc(3*(TotalRentalCount)*sizeof(int32_t),&SharerCounts); CHKERROR(ierr,"Error in PetscMalloc");
1233:         }
1234:         // Renters are traversed in the order of their original arrival, indexed by the arrival number a
1235:         int32_t RenterOffset = 0;
1236:         cntr = 0;
1237:         for(int32_t a = 0; a < RenterCount; a++) {
1238:           // traverse the nodes leased by the renter
1239:           int32_t RenterLeaseOffset = a*3*MaxLeaseSize;
1240:           for(int32_t i = 0; i < RenterLeaseSizes[a]; i++) {
1241:             // ASSUMPTION on Point type
1242:             Point node;
1243:             node.prefix = RentedNodes[RenterLeaseOffset + 3*i];
1244:             node.index  = RentedNodes[RenterLeaseOffset + 3*i + 1];
1245:             SharerCounts[cntr++]   = node.prefix;
1246:             SharerCounts[cntr++]   = node.index;
1247:             // Decrement the sharer count by one so as not to count the current renter itself (with arrival number a).
1248:             SharerCounts[cntr++] = NodeRenters[node].size()-1;
1249:           }
1250:           // Send message to renter
1251:           MPI_Isend(SharerCounts+RenterOffset,3*RenterLeaseSizes[a],MPIU_INT,Renters[a],tag2,comm,Renter_waits+a);
1252:           CHKERROR(ierr, "Error in MPI_Isend");
1253:           // Offset is advanced by thrice the number of leased nodes, since we store 3 integers per leased node: the Point (2 ints) and its sharer count
1254:           RenterOffset = cntr;
1255:         }
1256:         // Wait on receives from lessors with the neighbor counts
1257:         if (LessorCount) {
1258:           MPI_Waitall(LessorCount,Lessor_waits,Lessor_status); CHKERROR(ierr,"Error in MPI_Waitall");
1259:         }
1260:         // Wait on the original sends to the renters
1261:         MPI_Status *Renter_status;
1262:         PetscMalloc((RenterCount)*sizeof(MPI_Status),&Renter_status);CHKERROR(ierr,"Error in PetscMalloc");
1263:         if(RenterCount) {
1264:           MPI_Waitall(RenterCount, Renter_waits, Renter_status);CHKERROR(ierr,"Error in MPI_Waitall");
1265:         }
1266: 
1267:         if (debug) { /* -----------------------------------  */
1268:           // Use a C++ string stream to report the numbers of shared nodes leased from each lessor
1269:           ostringstream txt;
1270:           cntr = 0;
1271:           txt << "[" << rank << "]: " << __FUNCT__ << ": neighbor counts by lessor-node [lessor rank, (node), neighbor count]:  ";
1272:           for(int32_t i = 0; i < LessorCount; i++) {
1273:             // ASSUMPTION on point type
1274:             for(int32_t j = 0; j < LeaseSizes[i]; j++)
1275:             {
1276:               int32_t prefix, index, sharerCount;
1277:               prefix      = NeighborCounts[cntr++];
1278:               index       = NeighborCounts[cntr++];
1279:               sharerCount = NeighborCounts[cntr++];
1280:               txt << "[" << Lessors[i] <<", (" << prefix << "," << index << "), " << sharerCount << "] ";
1281:             }
1282:           }
1283:           txt << "\n";
1284:           PetscSynchronizedPrintf(comm, txt.str().c_str()); CHKERROR(ierr, "Error in PetscSynchronizedPrintf");
1285:           PetscSynchronizedFlush(comm); CHKERROR(ierr, "PetscSynchronizedFlush");
1286:         }/* -----------------------------------  */
1287: 
1288: 
1289:         // Now we allocate an array to receive the neighbor ranks and the remote cone sizes for each leased node,
1290:         // hence the total array size is 2*TotalNeighborCount.
1291:         // Note that the lessor offsets must be recalculated, since they are no longer based on the number of nodes
1292:         // leased from that lessor, but on the number of neighbors over the nodes leased from that lessor.
1293: 
1294:         // First we compute the numbers of neighbors over the nodes leased from a given lessor.
1295:         // NeighborCountsByLessor[lessor] = # of neighbors on that lessor
1296:         int32_t TotalNeighborCount = 0;
1297:         int32_t *NeighborCountsByLessor;
1298:         if(LessorCount) {
1299:           PetscMalloc((LessorCount)*sizeof(int32_t), &NeighborCountsByLessor); CHKERROR(ierr, "Error in PetscMalloc");
1300:         }
1301:         cntr = 0;
1302:         for(int32_t i = 0; i < LessorCount; i++) {
1303:           int32_t neighborCountByLessor = 0;
1304:           for(int32_t j = 0; j < LeaseSizes[i]; j++) {
1305:             // ASSUMPTION on Point type: skip the 2 ints encoding the Point before reading the neighbor count
1306:             cntr += 2;
1307:             neighborCountByLessor += NeighborCounts[cntr++];
1308:           }
1309:           NeighborCountsByLessor[i] = neighborCountByLessor;
1310:           TotalNeighborCount       += neighborCountByLessor;
1311:         }
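        // Sketch of the traversal above (illustrative values): NeighborCounts is a stride-3
        // array [prefix, index, count | prefix, index, count | ...] grouped by lessor, so with
        // LeaseSizes = {2, 1} we would get
        //   lessor 0: (p0, n0), (p1, n1)  -->  NeighborCountsByLessor[0] = n0 + n1
        //   lessor 1: (p2, n2)            -->  NeighborCountsByLessor[1] = n2
        // and TotalNeighborCount = n0 + n1 + n2.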
1312:         if (debug2) { /* -----------------------------------  */
1313:           // Use a C++ string stream to report the number of neighbors over the nodes leased from each lessor
1314:           ostringstream txt;
1315:           cntr = 0;
1316:           txt << "[" << rank << "]: " << __FUNCT__ << ": NeighborCountsByLessor [rank, count]:  ";
1317:           for(int32_t i = 0; i < LessorCount; i++) {
1318:             txt << "[" << Lessors[i] <<","  <<  NeighborCountsByLessor[i] << "]; ";
1319:           }
1320:           txt << std::endl;
1321:           txt << "[" << rank << "]: " << __FUNCT__ << ": TotalNeighborCount: " << TotalNeighborCount << std::endl;
1322:           PetscSynchronizedPrintf(comm, txt.str().c_str()); CHKERROR(ierr, "Error in PetscSynchronizedPrintf");
1323:           PetscSynchronizedFlush(comm); CHKERROR(ierr, "PetscSynchronizedFlush");
1324:         }/* -----------------------------------  */
1325:         int32_t *Neighbors = 0;
1326:         if(TotalNeighborCount) {
1327:           PetscMalloc((2*TotalNeighborCount)*sizeof(int32_t),&Neighbors); CHKERROR(ierr,"Error in PetscMalloc");
1328:         }
1329: 
1330:         // Post receives for Neighbors
1331:         PetscMPIInt    tag3;
1332:         PetscObjectGetNewTag(petscObj, &tag3); CHKERROR(ierr, "Failed on PetscObjectGetNewTag");
1333:         int32_t lessorOffset = 0;
1334:         for(int32_t i=0; i<LessorCount; i++) {
1335:           if(NeighborCountsByLessor[i]) { // We expect messages from lessors with a non-zero NeighborCountsByLessor entry only
1336:             MPI_Irecv(Neighbors+lessorOffset,2*NeighborCountsByLessor[i],MPIU_INT,Lessors[i],tag3,comm,&Lessor_waits[i]);
1337:             CHKERROR(ierr,"Error in MPI_Irecv");
1338:             lessorOffset += 2*NeighborCountsByLessor[i];
1339:           }
1340:         }
1341:         if (lessorOffset != 2*TotalNeighborCount) {
1342:           ostringstream msg;

1344:           msg << "["<<rank<<"]Invalid lessor offset " << lessorOffset << " should be " << 2*TotalNeighborCount << std::endl;
1345:           throw ALE::Exception(msg.str().c_str());
1346:         }
1347:         // Pack and send messages back to renters.
1348:         // For each node p and each renter r (hence for each rental (p,r)) we must send to r a segment consisting of the list of all
1349:         // (rr,cc) such that (p,rr) is a share and cc is the cone size over p at rr.
1350:         // ALTERNATIVE, SCALABILITY:
1351:         //             1. allocate an array capable of holding all messages to all renters and send one message per renter (more memory)
1352:         //             2. allocate an array capable of holding all rentals for all nodes and send one message per share (more messages).
1353:         // Here we choose 1, since we assume that the memory requirement is modest and the communication time is much more expensive;
1354:         // however, this is likely to be application-dependent, and a switch should be introduced to change this behavior at will.
1355:         // The rental segments are grouped by the recipient renter and, within each renter, by node in the same order as in SharerCounts.
1356: 
1357:         // We need to compute the send buffer size using the SharerCounts array.
1358:         // Traverse the renters in order of their original arrival, indexed by the arrival number a, and then by the nodes leased by a.
1359:         // Add up all entries at indices equal to 2 mod 3 in SharerCounts (indices 0 & 1 mod 3 hold node IDs, ASSUMPTION on Point type)
1360:         // and double that number to account for sharer ranks AND the cone sizes we are sending.
1361:         int32_t SharersSize = 0; // 'Sharers' buffer size
1362:         cntr = 0;
1363:         for(int32_t a = 0; a < RenterCount; a++) {
1364:           // traverse the nodes leased by the renter
1365:           for(int32_t i = 0; i < RenterLeaseSizes[a]; i++) {
1366:             SharersSize += SharerCounts[3*cntr+2];
1367:             cntr++;
1368:           }
1369:         }
1370:         SharersSize *= 2;
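        // Worked example (illustrative values): SharerCounts = [p,i,2, q,j,0, r,k,1] gives
        // SharersSize = (2 + 0 + 1) * 2 = 6 ints -- one (rank, coneSize) pair per sharer.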
1371:         // Allocate the Sharers array
1372:         int32_t *Sharers;
1373:         if(SharersSize) {
1374:           PetscMalloc(SharersSize*sizeof(int32_t),&Sharers); CHKERROR(ierr,"Error in PetscMalloc");
1375:         }
1376:         // Now pack the messages and send them off.
1377:         // Renters are traversed in the order of their original arrival, indexed by the arrival number a
1378:         ostringstream txt; // DEBUG
1379:         if(debug2) {
1380:           txt << "[" << rank << "]: " << __FUNCT__ << ": RenterCount = " << RenterCount << "\n";
1381:         }
1382:         RenterOffset = 0; // this is the current offset into Sharers needed for the send statement
1383:         for(int32_t a = 0; a < RenterCount; a++) {
1384:           int32_t r = Renters[a];
1385:           int32_t RenterLeaseOffset = a*3*MaxLeaseSize;
1386:           int32_t SegmentSize = 0;
1387:           // traverse the nodes leased by the renter
1388:           for(int32_t i = 0; i < RenterLeaseSizes[a]; i++) {
1389:             // Get a node p rented to r
1390:             // ASSUMPTION on Point type
1391:             Point p;
1392:             p.prefix = RentedNodes[RenterLeaseOffset + 3*i];
1393:             p.index  = RentedNodes[RenterLeaseOffset + 3*i + 1];
1394:             if(debug) {
1395:               txt << "[" << rank << "]: " << __FUNCT__ << ": renters sharing with " << r << " of node  (" << p.prefix << "," << p.index << ")  [rank, cone size]:  ";
1396:             }
1397:             // now traverse the set of all the renters of p
1398:             for(int_pair_set::iterator pRenters_itor=NodeRenters[p].begin(); pRenters_itor!=NodeRenters[p].end(); pRenters_itor++) {
1399:               int32_t rr = (*pRenters_itor).first;  // rank of a pRenter
1400:               int32_t cc = (*pRenters_itor).second; // cone size over p at rr
1401:               // skip r itself
1402:               if(rr != r){
1403:                 Sharers[RenterOffset+SegmentSize++] = rr;
1404:                 Sharers[RenterOffset+SegmentSize++] = cc;
1405:                 if(debug) {
1406:                   txt << "[" << rr << ","  << cc << "]; ";
1407:                 }
1408:               }
1409:             }// for(int_pair_set::iterator pRenters_itor=NodeRenters[p].begin(); pRenters_itor!=NodeRenters[p].end(); pRenters_itor++) {
1410:             if(debug) {
1411:               txt << "\n";
1412:             }
1413:           }// for(int32_t i = 0; i < RenterLeaseSizes[a]; i++) {
1414:           // Send message to renter only if the segment size is positive
1415:           if(SegmentSize > 0) {
1416:             MPI_Isend(Sharers+RenterOffset,SegmentSize,MPIU_INT,Renters[a],tag3,comm,Renter_waits+a);
1417:             CHKERROR(ierr, "Error in MPI_Isend");
1418:           }
1419:           // Offset is advanced by SegmentSize
1420:           RenterOffset += SegmentSize;
1421:         }//  for(int32_t a = 0; a < RenterCount; a++) {
1422:         if(debug) {
1423:           PetscSynchronizedPrintf(comm, txt.str().c_str()); CHKERROR(ierr, "Error in PetscSynchronizedPrintf");
1424:           PetscSynchronizedFlush(comm); CHKERROR(ierr, "PetscSynchronizedFlush");
1425:         }
1426: 
1427:         // Wait on the receives from lessors carrying the neighbor data (ranks and cone sizes)
1428:         if (LessorCount) {
1429:           MPI_Waitall(LessorCount,Lessor_waits,Lessor_status);CHKERROR(ierr,"Error in MPI_Waitall");
1430:         }
1431:         if (debug) { /* -----------------------------------  */
1432:           // To report the neighbors at each lessor we use a C++ string stream
1433:           ostringstream txt;
1434:           int32_t cntr1 = 0;
1435:           int32_t cntr2 = 0;
1436:           for(int32_t i = 0; i < LessorCount; i++) {
1437:             // ASSUMPTION on point type
1438:             txt << "[" <<rank<< "]: " << __FUNCT__ << ": neighbors over nodes leased from " <<Lessors[i]<< ":\n";
1439:             int32_t activeLessor = 0;
1440:             for(int32_t j = 0; j < LeaseSizes[i]; j++)
1441:             {
1442:               int32_t prefix, index, sharerCount;
1443:               prefix = NeighborCounts[cntr1++];
1444:               index = NeighborCounts[cntr1++];
1445:               sharerCount = NeighborCounts[cntr1++];
1446:               if(sharerCount > 0) {
1447:                 txt <<"[" << rank << "]:\t(" << prefix <<","<<index<<"):  [rank, coneSize]: ";
1448:                 activeLessor++;
1449:               }
1450:               for(int32_t k = 0; k < sharerCount; k++) {
1451:                 int32_t sharer = Neighbors[cntr2++];
1452:                 int32_t coneSize = Neighbors[cntr2++];
1453:                 txt << "[" <<sharer <<", "<< coneSize << "] ";
1454:               }
1455:             }// for(int32_t j = 0; j < LeaseSizes[i]; j++)
1456:             if(!activeLessor) {
1457:               txt <<"[" << rank << "]:\tnone";
1458:             }
1459:             txt << "\n";
1460:           }// for(int32_t i = 0; i < LessorCount; i++)
1461:           PetscSynchronizedPrintf(comm,txt.str().c_str());CHKERROR(ierr,"Error in PetscSynchronizedPrintf");
1462:           PetscSynchronizedFlush(comm);CHKERROR(ierr,"Error in PetscSynchronizedFlush");
1463:         }/* -----------------------------------  */
1464: 
1465:         // This concludes the interaction of lessors and renters, and the exchange is completed by a peer-to-peer neighbor cone swap
1466:         // (except we still have to wait on our last sends to the renters -- see below).
1467:         // However, we don't free all of the arrays associated with the lessor-renter exchanges, since some of the data
1468:         // still lives in those structures.  Here are the arrays we can get rid of:
1469:         if(RenterCount) {
1470:           PetscFree(RentedNodes);  CHKERROR(ierr, "Error in PetscFree");
1471:         }
1472:         if(SharersSize) {PetscFree(Sharers); CHKERROR(ierr, "Error in PetscFree");}
1473:         if(LessorCount) {
1474:           PetscFree(NeighborCountsByLessor); CHKERROR(ierr, "Error in PetscFree");
1475:           PetscFree(Lessor_status);          CHKERROR(ierr,"Error in PetscFree");
1476:           PetscFree(Lessor_waits);           CHKERROR(ierr,"Error in PetscFree");
1477:           PetscFree(LessorOffsets);          CHKERROR(ierr,"Error in PetscFree");
1478:           PetscFree(LeaseSizes);             CHKERROR(ierr,"Error in PetscFree");
1479:           PetscFree(Lessors);                CHKERROR(ierr,"Error in PetscFree");
1480:         }
1481:         if(LeasedNodeCount) {
1482:           PetscFree(LeasedNodes); CHKERROR(ierr,"Error in PetscFree");
1483:         }
1484: 
1485:         // Now we record, for each node, its neighbors and the sizes of the cones over the node to be received from or sent to each neighbor.
1486:         // cntr keeps track of the current position within the Neighbors array; node boundaries are delineated using NeighborCounts.
1487:         // ASSUMPTION: 'Neighbors' stores node renter segments in the same order as NeighborCounts stores the node data.
1488:         cntr = 0;
1489:         for(int32_t i = 0; i < LeasedNodeCount; i++) {
1490:           // ASSUMPTION on Point type
1491:           Point p;
1492:           p.prefix = NeighborCounts[3*i];
1493:           p.index  = NeighborCounts[3*i+1];
1494:           int32_t pNeighborsCount = NeighborCounts[3*i+2]; // recall that NeighborCounts lists the number of neighbors after each node
1495:           // extract the renters of p from Neighbors
1496:           for(int32_t j = 0; j < pNeighborsCount; j++) {
1497:             int32_t neighbor = Neighbors[cntr++];
1498:             int32_t coneSize = Neighbors[cntr++];
1499:             // Record the size of the cone over p coming in from neighbor and going out to the neighbor as a pair of integers
1500:             // which is the color of the overlap arrow from neighbor to p
1501:             overlap->addArrow(neighbor, p, ALE::pair<Point,ALE::pair<int,int> >(p, ALE::pair<int,int>(coneSize, _graph->cone(p)->size())) );
1502:           }
1503:         }// for(int32_t i = 0; i < LeasedNodeCount; i++)
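        // Schematically, the overlap arrows recorded above look like (our summary):
        //   neighbor --( p, (coneSizeIn, coneSizeOut) )--> p
        // where coneSizeIn is the neighbor's cone size over p and coneSizeOut is ours;
        // __computeFusion() below reads these two sizes back off si.color().second.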

1505:         // Wait on the original sends to the renters (the last vestige of the lessor-renter exchange epoch; we delayed it to afford the
1506:         // greatest opportunity for a communication-computation overlap).
1507:         if(RenterCount) {
1508:           MPI_Waitall(RenterCount, Renter_waits, Renter_status); CHKERROR(ierr,"Error in MPI_Waitall");
1509:         }
1510:         if(RenterCount) {
1511:           PetscFree(Renter_waits); CHKERROR(ierr, "Error in PetscFree");
1512:           PetscFree(Renter_status); CHKERROR(ierr, "Error in PetscFree");
1513:         }

1515:         if(LeasedNodeCount) {PetscFree(NeighborCounts); CHKERROR(ierr,"Error in PetscFree");}
1516:         if(TotalNeighborCount) {PetscFree(Neighbors);   CHKERROR(ierr, "Error in PetscFree");}
1517:         if(TotalRentalCount){PetscFree(SharerCounts);   CHKERROR(ierr, "Error in PetscFree");}

1519:       };// __computeOverlap()

1521:       #undef  __FUNCT__
1522:       #define __FUNCT__ "commCycle"
1523:       /*
1524:         Seller:       A possessor of data
1525:         Buyer:        A requestor of data

1527:         Note that in this routine, the caller functions as BOTH a buyer and seller.

1529:         When we post receives, we use a buffer of the maximum size for each message
1530:         in order to simplify the size calculations (less communication).

1532:         BuyCount:     The number of sellers with which this process (buyer) communicates
1533:                       This is calculated locally
1534:         BuySizes:     The number of messages to buy from each seller
1535:         Sellers:      The process for each seller
1536:         BuyData:      The data to be bought from each seller. There are BuySizes[p] messages
1537:                       to be purchased from each process p, in order of rank.
1538:         SellCount:    The number of buyers with which this process (seller) communicates
1539:                       This requires communication
1540:         SellSizes:    The number of messages to be sold to each buyer
1541:         Buyers:       The process for each buyer
1542:         msgSize:      The number of integers in each message
1543:         SellData:     The data to be sold to each buyer. There are SellSizes[p] messages
1544:                       to be sold to each process p, in order of rank.
1545:       */
1546:       static void commCycle(MPI_Comm comm, PetscMPIInt tag, int msgSize, int BuyCount, int BuySizes[], int Sellers[], int32_t BuyData[], int SellCount, int SellSizes[], int Buyers[], int32_t *SellData[]) {
1547:         int32_t     *locSellData; // Messages to sell to buyers (received from buyers)
1548:         int          SellSize = 0;
1549:         int         *BuyOffsets = PETSC_NULL, *SellOffsets = PETSC_NULL;
1550:         MPI_Request *buyWaits = PETSC_NULL,  *sellWaits = PETSC_NULL;
1551:         MPI_Status  *buyStatus = PETSC_NULL;

1554:         // Allocation
1555:         PetscMallocValidate(__LINE__,__FUNCT__,__FILE__,__SDIR__);CHKERROR(ierr,"Memory corruption");
1556:         for(int s = 0; s < SellCount; s++) {SellSize += SellSizes[s];}
1557:         PetscMalloc2(BuyCount,int,&BuyOffsets,SellCount,int,&SellOffsets);CHKERROR(ierr,"Error in PetscMalloc");
1558:         PetscMalloc3(BuyCount,MPI_Request,&buyWaits,SellCount,MPI_Request,&sellWaits,BuyCount,MPI_Status,&buyStatus);
1559:         CHKERROR(ierr,"Error in PetscMalloc");
1560:         if (*SellData) {
1561:           locSellData = *SellData;
1562:         } else {
1563:           // Workaround: MPICH fails when handed 0-length storage, so always allocate at least one element
1564:           PetscMalloc(PetscMax(1, msgSize*SellSize) * sizeof(int32_t), &locSellData);CHKERROR(ierr,"Error in PetscMalloc");
1565:         }
1566:         // Initialization
1567:         for(int b = 0; b < BuyCount; b++) {
1568:           if (b == 0) {
1569:             BuyOffsets[0] = 0;
1570:           } else {
1571:             BuyOffsets[b] = BuyOffsets[b-1] + msgSize*BuySizes[b-1];
1572:           }
1573:         }
1574:         for(int s = 0; s < SellCount; s++) {
1575:           if (s == 0) {
1576:             SellOffsets[0] = 0;
1577:           } else {
1578:             SellOffsets[s] = SellOffsets[s-1] + msgSize*SellSizes[s-1];
1579:           }
1580:         }
1581:         PetscMemzero(locSellData, msgSize*SellSize * sizeof(int32_t));CHKERROR(ierr,"Error in PetscMemzero");

1583:         // Post receives for bill of sale (data request)
1584:         for(int s = 0; s < SellCount; s++) {
1585:           MPI_Irecv(&locSellData[SellOffsets[s]], msgSize*SellSizes[s], MPIU_INT, Buyers[s], tag, comm, &sellWaits[s]);
1586:           CHKERROR(ierr,"Error in MPI_Irecv");
1587:         }
1588:         // Post sends with bill of sale (data request)
1589:         for(int b = 0; b < BuyCount; b++) {
1590:           MPI_Isend(&BuyData[BuyOffsets[b]], msgSize*BuySizes[b], MPIU_INT, Sellers[b], tag, comm, &buyWaits[b]);
1591:           CHKERROR(ierr,"Error in MPI_Isend");
1592:         }
1593:         // Receive the bill of sale from buyer
1594:         for(int s = 0; s < SellCount; s++) {
1595:           MPI_Status sellStatus;
1596:           int        num;

1598:           MPI_Waitany(SellCount, sellWaits, &num, &sellStatus);CHKMPIERROR(ierr,ERRORMSG("Error in MPI_Waitany"));
1599:           // OUTPUT: Overwriting input buyer process
1600:           Buyers[num] = sellStatus.MPI_SOURCE;
1601:           // OUTPUT: Overwriting input sell size
1602:           MPI_Get_count(&sellStatus, MPIU_INT, &SellSizes[num]);CHKERROR(ierr,"Error in MPI_Get_count");
1603:           SellSizes[num] /= msgSize;
1604:         }
1605:         // Wait on send for bill of sale
1606:         if (BuyCount) {
1607:           MPI_Waitall(BuyCount, buyWaits, buyStatus); CHKERROR(ierr,"Error in MPI_Waitall");
1608:         }

1610:         PetscFree2(BuyOffsets, SellOffsets);CHKERROR(ierr,"Error in PetscFree");
1611:         PetscFree3(buyWaits, sellWaits, buyStatus);CHKERROR(ierr,"Error in PetscFree");
1612:         // OUTPUT: Providing data out
1613:         *SellData = locSellData;
1614:       }
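      // A usage sketch for commCycle() (hypothetical buffers; compare the symmetric call in
      // __computeFusionNew() below): with N neighbors known on both sides, the outgoing data
      // rides in the "Buy" slots and the incoming data lands in the "Sell" slots, which
      // commCycle() allocates itself when *recvBuf is NULL:
      //
      //   int32_t *recvBuf = PETSC_NULL;
      //   commCycle(comm, tag, msgSize,
      //             N, sendSizes, neighbors, sendBuf,   // what we transmit
      //             N, recvSizes, neighbors, &recvBuf); // what we receive
      //
      // On return the "Sell"-side sizes and ranks (recvSizes, neighbors) have been overwritten
      // with the actual message sizes and source ranks.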

1616:       // -------------------------------------------------------------------------------------------------------------------
1617:       #undef  __FUNCT__
1618:       #define __FUNCT__ "__computeFusion"
1619:       template <typename Overlap_, typename Fusion_>
1620:       static void __computeFusion(const Obj<graph_type>& _graph, const Obj<Overlap_>& overlap, Obj<Fusion_> fusion, const Obj<fuser_type>& fuser) {
1621:         //
1622:         typedef ConeArraySequence<typename graph_type::traits::arrow_type> cone_array_sequence;
1623:         typedef typename cone_array_sequence::cone_arrow_type              cone_arrow_type;
1625:         MPI_Comm comm = _graph->comm();
1626:         int      rank = _graph->commRank();
1627:         PetscObject petscObj = _graph->petscObj();

1629:         bool debug = delta_type::debug > 0;
1630:         bool debug2 = delta_type::debug > 1;

1632:         // Compute the incoming cone sizes by neighbor and the total incoming cone size.
1633:         // Also count the total number of neighbors we will be communicating with
1634:         int32_t  NeighborCountIn = 0;
1635:         int__int NeighborConeSizeIn;
1636:         int32_t  ConeSizeIn = 0;
1637:         ostringstream txt3;
1638:         // Traverse all of the neighbors  from whom we will be receiving cones -- the cap of the overlap.
1639:         typename Overlap_::traits::capSequence overlapCap = overlap->cap();
1640:         for(typename Overlap_::traits::capSequence::iterator ci  = overlapCap.begin(); ci != overlapCap.end(); ci++)
1641:         { // traversing overlap.cap()
1642:           int32_t neighborIn = *ci;
1643:           // Traverse the supports of the overlap graph under each neighbor rank and accumulate the sizes of the cones to be received
1644:           typename Overlap_::traits::supportSequence supp = overlap->support(*ci);
1645:           if(debug2) {
1646:             //txt3 << "[" << rank << "]: " << __FUNCT__ << ": overlap: support of rank " << neighborIn << ": " << std::endl;
1647:             //txt3 << supp;
1648:           }
1649:           int32_t coneSizeIn = 0;
1650:           for(typename Overlap_::traits::supportSequence::iterator si = supp.begin(); si != supp.end(); si++) {
1651:             // FIX: replace si.color() type: Point --> ALE::pair
1652:             //coneSizeIn += si.color().prefix;
1653:             coneSizeIn += si.color().second.first;
1654:           }
1655:           if(coneSizeIn > 0) {
1656:             // Accumulate the total cone size
1657:             ConeSizeIn += coneSizeIn;
1658:             NeighborConeSizeIn[neighborIn] = coneSizeIn;
1659:             NeighborCountIn++;
1660:             txt3  << "[" << rank << "]: " << "NeighborConeSizeIn[" << neighborIn << "]: " << NeighborConeSizeIn[neighborIn] << "\n";
1661:           }
1662:         }
1663:         if(debug2) {
1664:           if(NeighborCountIn == 0) {
1665:             txt3  << "[" << rank << "]: no incoming Neighbors" << std::endl;
1666:           }
1667:           PetscSynchronizedPrintf(comm,txt3.str().c_str());CHKERROR(ierr,"Error in PetscSynchronizedPrintf");
1668:           PetscSynchronizedFlush(comm);CHKERROR(ierr,"Error in PetscSynchronizedFlush");
1669:         }
1670:         if(debug) {/* --------------------------------------------------------------------------------------------- */
1671:           ostringstream txt;
1672:           txt << "[" << rank << "]: " << __FUNCT__ << ": total size of incoming cone: " << ConeSizeIn << "\n";
1673:           for(int__int::iterator np_itor = NeighborConeSizeIn.begin();np_itor!=NeighborConeSizeIn.end();np_itor++)
1674:           {
1675:             int32_t neighbor = (*np_itor).first;
1676:             int32_t coneSize = (*np_itor).second;
1677:             txt << "[" << rank << "]: " << __FUNCT__ << ": size of cone from " << neighbor << ": " << coneSize << "\n";
1678: 
1679:           }//int__int::iterator np_itor=NeighborConeSizeIn.begin();np_itor!=NeighborConeSizeIn.end();np_itor++)
1680:           PetscSynchronizedPrintf(comm, txt.str().c_str());
1681:           CHKERROR(ierr, "Error in PetscSynchronizedPrintf");
1682:           PetscSynchronizedFlush(comm);
1683:           CHKERROR(ierr, "Error in PetscSynchronizedFlush");
1684:         }/* --------------------------------------------------------------------------------------------- */
1685:         // Compute the size of a cone element
1686:         size_t cone_arrow_size = sizeof(cone_arrow_type);
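        // Shipping cone_arrow_type as raw MPI_BYTE assumes every process lays the struct out
        // identically (homogeneous cluster, same compiler).  A cheap runtime sanity check one
        // could add (a sketch, not part of the original code):
        //
        //   int mySize = (int) sizeof(cone_arrow_type), maxSize;
        //   MPI_Allreduce(&mySize, &maxSize, 1, MPI_INT, MPI_MAX, comm);
        //   if (maxSize != mySize) throw ALE::Exception("cone_arrow_type layout mismatch");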
1687:         // Now we can allocate a receive buffer to receive all of the remote cones from neighbors
1688:         cone_arrow_type *ConesIn;
1689:         if(ConeSizeIn) {
1690:           PetscMalloc(ConeSizeIn*cone_arrow_size,&ConesIn); CHKERROR(ierr,"Error in PetscMalloc");
1691:         }
1692:         // Allocate receive requests
1693:         MPI_Request *NeighborsIn_waits;
1694:         if(NeighborCountIn) {
1695:           PetscMalloc((NeighborCountIn)*sizeof(MPI_Request),&NeighborsIn_waits);CHKERROR(ierr,"Error in PetscMalloc");
1696:         }
1697:         // Post receives for ConesIn
1698:         PetscMPIInt    tag4;
1699:         PetscObjectGetNewTag(petscObj, &tag4); CHKERROR(ierr, "Failed on PetscObjectGetNewTag");
1700:         // Traverse all neighbors from whom we are receiving cones
1701:         cone_arrow_type *NeighborOffsetIn = ConesIn;
1702:         if(debug2) {
1703:           PetscSynchronizedPrintf(comm, "[%d]: %s: NeighborConeSizeIn.size() = %d\n",rank, __FUNCT__, NeighborConeSizeIn.size());
1704:           CHKERROR(ierr, "Error in PetscSynchronizedPrintf");
1705:           PetscSynchronizedFlush(comm);
1706:           CHKERROR(ierr, "Error in PetscSynchronizedFlush");
1707:           if(NeighborConeSizeIn.size()) {
1708:             ierr=PetscSynchronizedPrintf(comm, "[%d]: %s: *NeighborConeSizeIn.begin() = (%d,%d)\n",
1709:                                          rank, __FUNCT__, (*NeighborConeSizeIn.begin()).first, (*NeighborConeSizeIn.begin()).second);
1710:             CHKERROR(ierr, "Error in PetscSynchronizedPrintf");
1711:             PetscSynchronizedFlush(comm);
1712:             CHKERROR(ierr, "Error in PetscSynchronizedFlush");
1713: 
1714:           }
1715:         }
1716:         int32_t n = 0;
1717:         for(std::map<int32_t, int32_t>::iterator n_itor = NeighborConeSizeIn.begin(); n_itor!=NeighborConeSizeIn.end(); n_itor++) {
1718:           int32_t neighborIn = (*n_itor).first;
1719:           int32_t coneSizeIn = (*n_itor).second;
1720:           MPI_Irecv(NeighborOffsetIn,cone_arrow_size*coneSizeIn,MPI_BYTE,neighborIn,tag4,comm, NeighborsIn_waits+n);
1721:           CHKERROR(ierr, "Error in MPI_Irecv");
1722:           NeighborOffsetIn += coneSizeIn;
1723:           n++;
1724:         }
1725: 
1726:         // Compute the outgoing cone sizes by neighbor and the total outgoing cone size.
1727:         int__int NeighborConeSizeOut;
1728:         int32_t  ConeSizeOut = 0;
1729:         int32_t NeighborCountOut = 0;
1730:         for(typename Overlap_::traits::capSequence::iterator ci  = overlapCap.begin(); ci != overlapCap.end(); ci++)
1731:         { // traversing overlap.cap()
1732:           int32_t neighborOut = *ci;
1733:           // Traverse the supports of the overlap graph under each neighbor rank and accumulate the sizes of the cones to be sent
1734:           typename Overlap_::traits::supportSequence supp = overlap->support(*ci);
1735:           if(debug2) {
1736:             //txt3 << "[" << rank << "]: " << __FUNCT__ << ": overlap: support of rank " << neighborOut << ": " << std::endl;
1737:             //txt3 << supp;
1738:           }
1739:           int32_t coneSizeOut = 0;
1740:           for(typename Overlap_::traits::supportSequence::iterator si = supp.begin(); si != supp.end(); si++) {
1741:             // FIX: replace si.color() Point --> ALE::pair
1742:             //coneSizeOut += si.color().index;
1743:             coneSizeOut += si.color().second.second;
1744:           }
1745:           if(coneSizeOut > 0) {
1746:             // Accumulate the total cone size
1747:             ConeSizeOut += coneSizeOut;
1748:             NeighborConeSizeOut[neighborOut] = coneSizeOut;
1749:             NeighborCountOut++;
1750:           }
1751:         }//traversing overlap.cap()
1752: 
1753:         if(debug) {/* --------------------------------------------------------------------------------------------- */
1754:           ostringstream txt;
1755:           txt << "[" << rank << "]: " << __FUNCT__ << ": total size of outgoing cone: " << ConeSizeOut << "\n";
1756:           for(int__int::iterator np_itor = NeighborConeSizeOut.begin();np_itor!=NeighborConeSizeOut.end();np_itor++)
1757:           {
1758:             int32_t neighborOut = (*np_itor).first;
1759:             int32_t coneSizeOut = (*np_itor).second;
1760:             txt << "[" << rank << "]: " << __FUNCT__ << ": size of cone to " << neighborOut << ": " << coneSizeOut << "\n";
1761: 
1762:           }//int__int::iterator np_itor=NeighborConeSizeOut.begin();np_itor!=NeighborConeSizeOut.end();np_itor++)
1763:           PetscSynchronizedPrintf(comm, txt.str().c_str());
1764:           CHKERROR(ierr, "Error in PetscSynchronizedPrintf");
1765:           PetscSynchronizedFlush(comm);
1766:           CHKERROR(ierr, "Error in PetscSynchronizedFlush");
1767:         }/* --------------------------------------------------------------------------------------------- */
1768: 
1769:         // Now we can allocate a send buffer to send all of the remote cones to neighbors
1770:         cone_arrow_type *ConesOut;
1771:         if(ConeSizeOut) {
1772:           PetscMalloc(cone_arrow_size*ConeSizeOut,&ConesOut); CHKERROR(ierr,"Error in PetscMalloc");
1773:         }
1774:         // Allocate send requests
1775:         MPI_Request *NeighborsOut_waits;
1776:         if(NeighborCountOut) {
1777:           PetscMalloc((NeighborCountOut)*sizeof(MPI_Request),&NeighborsOut_waits);CHKERROR(ierr,"Error in PetscMalloc");
1778:         }
1779: 
1780:         // Pack and send messages
1781:         cone_arrow_type *NeighborOffsetOut = ConesOut;
1782:         int32_t cntr = 0; // arrow counter
1783:         n = 0;    // neighbor counter
1784:         ostringstream txt2;
1785:         // Traverse all neighbors to whom we are sending cones
1786:         for(typename Overlap_::traits::capSequence::iterator ci  = overlapCap.begin(); ci != overlapCap.end(); ci++)
1787:         { // traversing overlap.cap()
1788:           int32_t neighborOut = *ci;

1790:           // Make sure we have a cone going out to this neighbor
1791:           if(NeighborConeSizeOut.find(neighborOut) != NeighborConeSizeOut.end()) { // if there is anything to send
1792:             if(debug) { /* ------------------------------------------------------------ */
1793:               txt2  << "[" << rank << "]: " << __FUNCT__ << ": outgoing cones destined for " << neighborOut << "\n";
1794:             }/* ----------------------------------------------------------------------- */
1795:             int32_t coneSizeOut = NeighborConeSizeOut[neighborOut];
1796:             // ASSUMPTION: all overlap supports are "symmetric" with respect to swapping processes, so we can safely assume that
1797:             //             the receiver will be expecting points in the same order as they appear in the support here.
1798:             // Traverse all the points within the overlap with this neighbor
1799:             typename Overlap_::traits::supportSequence supp = overlap->support(*ci);
1800:             for(typename Overlap_::traits::supportSequence::iterator si = supp.begin(); si != supp.end(); si++) {
1801:               Point p = *si;
1802:               if(debug) { /* ------------------------------------------------------------ */
1803:                 txt2  << "[" << rank << "]: \t cone over " << p << ":  ";
1804:               }/* ----------------------------------------------------------------------- */
1805:               // Traverse the cone over p in the local _graph and place corresponding TargetArrows in ConesOut
1806:               typename graph_type::traits::coneSequence cone = _graph->cone(p);
1807:               for(typename graph_type::traits::coneSequence::iterator cone_itor = cone.begin(); cone_itor != cone.end(); cone_itor++) {
1808:                 // Place a TargetArrow into the ConesOut buffer
1809:                 // WARNING: pointer arithmetic involving ConesOut takes place here
1810:                 //cone_arrow_type::place(ConesOut+cntr, cone_itor.arrow());
1811:                 cone_arrow_type::place(ConesOut+cntr, typename graph_type::traits::arrow_type(*cone_itor,p,cone_itor.color()));
1812:                 cntr++;
1813:                 if(debug) { /* ------------------------------------------------------------ */
1814:                   txt2  << " " << *cone_itor;
1815:                 }/* ----------------------------------------------------------------------- */
1816:               }
1817:               if(debug) { /* ------------------------------------------------------------ */
1818:                 txt2  << std::endl;
1819:               }/* ----------------------------------------------------------------------- */
1820:             }
1821:             MPI_Isend(NeighborOffsetOut,cone_arrow_size*coneSizeOut,MPI_BYTE,neighborOut,tag4,comm, NeighborsOut_waits+n);
1822:             CHKERROR(ierr, "Error in MPI_Isend");
1823:             // WARNING: pointer arithmetic involving NeighborOffsetOut takes place here
1824:             NeighborOffsetOut += coneSizeOut; // keep track of offset
1825:             n++;  // count neighbors
1826:           }// if there is anything to send
1827:         }// traversing overlap.cap()
1828:         if(debug && NeighborCountOut) {/* --------------------------------------------------------------- */
1829:           PetscSynchronizedPrintf(comm, txt2.str().c_str());
1830:           CHKERROR(ierr, "Error in PetscSynchronizedPrintf");
1831:           PetscSynchronizedFlush(comm);
1832:           CHKERROR(ierr, "Error in PetscSynchronizedFlush");
1833:         }/* --------------------------------------------------------------------------------------------- */
1834: 
1835:         // Allocate an In status array
1836:         MPI_Status *NeighborIn_status;
1837:         if(NeighborCountIn) {
1838:           PetscMalloc((NeighborCountIn)*sizeof(MPI_Status),&NeighborIn_status);CHKERROR(ierr,"Error in PetscMalloc");
1839:         }
1840: 
1841:         // Wait on the receives
1842:         if(NeighborCountIn) {
1843:           ostringstream txt;
1844:           txt << "[" << _graph->commRank() << "]: Error in MPI_Waitall";
1845:           MPI_Waitall(NeighborCountIn, NeighborsIn_waits, NeighborIn_status); CHKERROR(ierr,txt.str().c_str());
1846:         }
1847: 
1848:         // Now we unpack the received cones, fuse them with the local cones and store the result in the completion graph.
1849:         // Traverse all neighbors  from whom we are expecting cones
1850:         cntr = 0; // arrow counter
1851:         NeighborOffsetIn = ConesIn;
1852:         ostringstream txt;
1853:         for(typename Overlap_::traits::capSequence::iterator ci  = overlapCap.begin(); ci != overlapCap.end(); ci++)
1854:         { // traversing overlap.cap()
1855:           // Traverse all the points within the overlap with this neighbor
1856:           // ASSUMPTION: points are sorted within each neighbor, so we are expecting points in the same order as they arrived in ConesIn
1857:           typename Overlap_::traits::supportSequence supp = overlap->support(*ci);
1858:           for(typename Overlap_::traits::supportSequence::iterator si = supp.begin(); si != supp.end(); si++)
1859:           {
1860:             Point p = *si;
1861:             //int32_t coneSizeIn = si.color().prefix; // FIX: color() type Point --> ALE::pair
1862:             int32_t coneSizeIn = si.color().second.first;
1863:             // NOTE: coneSizeIn may be 0, which is legal, since the fuser in principle can operate on an empty cone.
1864:             // Extract the local cone into a coneSequence
1865:             typename graph_type::traits::coneSequence lcone = _graph->cone(p);
1866:             // Wrap the arrived cone in a cone_array_sequence
1867:             cone_array_sequence rcone(NeighborOffsetIn, coneSizeIn, p);
1868:             if(debug) { /* ---------------------------------------------------------------------------------------*/
1869:               txt << "[" << rank << "]: "<<__FUNCT__<< ": received a cone over " << p << " of size " << coneSizeIn << " from rank "<<*ci<< ":" << std::endl;
1870:               rcone.view(txt, true);
1871:             }/* --------------------------------------------------------------------------------------------------*/
1872:             // Fuse the cones
1873:             fuser->fuseCones(lcone, rcone, fusion->cone(fuser->fuseBasePoints(p,p)));
1874:             if(debug) {
1875:               //ostringstream txt;
1876:               //txt << "[" << rank << "]: ... after fusing the cone over" << p << std::endl;
1877:               //fusion->view(std::cout, txt.str().c_str());
1878:             }
1879:             NeighborOffsetIn += coneSizeIn;
1880:           }
1881:         }
1882:         if(debug) { /* ---------------------------------------------------------------------------------------*/
1883:           if(NeighborCountIn == 0) {
1884:             txt << "[" << rank << "]: no cones to fuse in" << std::endl;
1885:           }
1886:           PetscSynchronizedPrintf(comm, txt.str().c_str());
1887:           CHKERROR(ierr, "Error in PetscSynchronizedPrintf");
1888:           PetscSynchronizedFlush(comm);
1889:           CHKERROR(ierr, "Error in PetscSynchronizedFlush");
1890:         }

1892:         // Wait on the original sends
1893:         // Allocate an Out status array
1894:         MPI_Status *NeighborOut_status;
1895:         if(NeighborCountOut) {
1896:           PetscMalloc((NeighborCountOut)*sizeof(MPI_Status),&NeighborOut_status);CHKERROR(ierr,"Error in PetscMalloc");
1897:           MPI_Waitall(NeighborCountOut, NeighborsOut_waits, NeighborOut_status); CHKERROR(ierr,"Error in MPI_Waitall");
1898:         }
1899: 
1900:         // Computation complete; freeing memory.
1901:         // Some of these can probably be freed earlier, if memory is needed.
1902:         // However, be careful while freeing memory that may be in use implicitly.
1903:         // For instance, ConesOut is a send buffer and should probably be retained until all send requests have been waited on.
1904:         if(NeighborCountOut){
1905:           PetscFree(NeighborsOut_waits); CHKERROR(ierr, "Error in PetscFree");
1906:           PetscFree(NeighborOut_status); CHKERROR(ierr, "Error in PetscFree");
1907:         }
1908:         if(NeighborCountIn){
1909:           PetscFree(NeighborsIn_waits);  CHKERROR(ierr, "Error in PetscFree");
1910:           PetscFree(NeighborIn_status); CHKERROR(ierr, "Error in PetscFree");
1911:         }
1912: 
1913:         if(ConeSizeIn) {PetscFree(ConesIn);           CHKERROR(ierr, "Error in PetscFree");}
1914:         if(ConeSizeOut){PetscFree(ConesOut);          CHKERROR(ierr, "Error in PetscFree");}
1915: 
1916:         // Done!
1917:       };// __computeFusion()

1918:       #undef  __FUNCT__
1919:       #define __FUNCT__ "__computeFusionNew"
1921:       template <typename Overlap_, typename Fusion_>
1922:       static void __computeFusionNew(const Obj<graph_type>& _graph, const Obj<Overlap_>& overlap, Obj<Fusion_> fusion, const Obj<fuser_type>& fuser) {
1923:         typedef ConeArraySequence<typename graph_type::traits::arrow_type> cone_array_sequence;
1924:         typedef typename cone_array_sequence::cone_arrow_type              cone_arrow_type;
1925:         MPI_Comm       comm = _graph->comm();
1926:         int            rank = _graph->commRank();
1927:         int            size = _graph->commSize();
1928:         PetscObject    petscObj = _graph->petscObj();
1929:         PetscMPIInt    tag1;

1932:         Obj<typename Overlap_::traits::capSequence> overlapCap = overlap->cap();
1933:         int msgSize = sizeof(cone_arrow_type)/sizeof(int); // Messages are arrows
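        // msgSize assumes sizeof(cone_arrow_type) is an exact multiple of sizeof(int); any
        // trailing padding would silently truncate every message.  A cheap guard (a sketch,
        // not part of the original code):
        //
        //   if (sizeof(cone_arrow_type) % sizeof(int)) {
        //     throw ALE::Exception("cone_arrow_type size is not a multiple of sizeof(int)");
        //   }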

1935:         int NeighborCount = overlapCap->size();
1936:         int *Neighbors = PETSC_NULL, *NeighborByProc = PETSC_NULL; // Neighbor processes and the reverse map
1937:         int *SellSizes = PETSC_NULL, *BuySizes = PETSC_NULL;       // Sizes of the cones to transmit and receive
1938:         int *SellCones = PETSC_NULL, *BuyCones = PETSC_NULL;       // Outgoing and incoming cone data
1939:         int n, offset;
1940:         PetscMalloc2(NeighborCount,int,&Neighbors,size,int,&NeighborByProc);CHKERROR(ierr, "Error in PetscMalloc");
1941:         PetscMalloc2(NeighborCount,int,&SellSizes,NeighborCount,int,&BuySizes);CHKERROR(ierr, "Error in PetscMalloc");

1943:         n = 0;
1944:         for(typename Overlap_::traits::capSequence::iterator neighbor = overlapCap->begin(); neighbor != overlapCap->end(); ++neighbor) {
1945:           Neighbors[n] = *neighbor;
1946:           NeighborByProc[*neighbor] = n;
1947:           BuySizes[n] = 0;
1948:           SellSizes[n] = 0;
1949:           n++;
1950:         }

1952:         n = 0;
1953:         offset = 0;
1954:         for(typename Overlap_::traits::capSequence::iterator neighbor = overlapCap->begin(); neighbor != overlapCap->end(); ++neighbor) {
1955:           Obj<typename Overlap_::traits::supportSequence> support = overlap->support(*neighbor);

1957:           for(typename Overlap_::traits::supportSequence::iterator p_iter = support->begin(); p_iter != support->end(); ++p_iter) {
1958:             BuySizes[n] += p_iter.color().second.first;
1959:             SellSizes[n] += p_iter.color().second.second;
1960:             offset += _graph->cone(*p_iter)->size();
1961:           }
1962:           n++;
1963:         }

1965:         PetscMalloc(offset*msgSize * sizeof(int), &SellCones);CHKERROR(ierr, "Error in PetscMalloc");
1966:         cone_arrow_type *ConesOut = (cone_arrow_type *) SellCones;
1967:         offset = 0;
1968:         for(typename Overlap_::traits::capSequence::iterator neighbor = overlapCap->begin(); neighbor != overlapCap->end(); ++neighbor) {
1969:           Obj<typename Overlap_::traits::supportSequence> support = overlap->support(*neighbor);
1970:           for(typename Overlap_::traits::supportSequence::iterator p_iter = support->begin(); p_iter != support->end(); ++p_iter) {
1971:             Obj<typename graph_type::traits::coneSequence> cone = _graph->cone(*p_iter);

1973:             for(typename graph_type::traits::coneSequence::iterator c_iter = cone->begin(); c_iter != cone->end(); ++c_iter) {
1974:               if (debug) {
1975:                 ostringstream txt;

1977:                 txt << "["<<rank<<"]Packing arrow for " << *neighbor << "  " << *c_iter << "--" << c_iter.color() << "-->" << *p_iter << std::endl;
1978:                 ierr = PetscSynchronizedPrintf(comm, txt.str().c_str()); CHKERROR(ierr, "Error in PetscSynchronizedPrintf");
1979:               }
1980:               cone_arrow_type::place(ConesOut+offset, typename graph_type::traits::arrow_type(*c_iter, *p_iter, c_iter.color()));
1981:               offset++;
1982:             }
1983:             if (p_iter.color().second.second != (int) cone->size()) {
1984:               throw ALE::Exception("Non-matching sizes");
1985:             }
1986:           }
1987:         }
1988:         if (debug) {
1989:           ierr = PetscSynchronizedFlush(comm);CHKERROR(ierr,"Error in PetscSynchronizedFlush");
1990:         }

1992:         // Send and retrieve cones of the base overlap
1993:         ierr = PetscObjectGetNewTag(petscObj, &tag1); CHKERROR(ierr, "Failed on PetscObjectGetNewTag");
1994:         commCycle(comm, tag1, msgSize, NeighborCount, SellSizes, Neighbors, SellCones, NeighborCount, BuySizes, Neighbors, &BuyCones);

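        // Unpack in the same neighbor/point order used for packing; each point's remote
        // cone occupies remoteConeSize consecutive arrows in the receive buffer.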
1996:         cone_arrow_type *ConesIn = (cone_arrow_type *) BuyCones;
1997:         offset = 0;
1998:         for(typename Overlap_::traits::capSequence::iterator neighbor = overlapCap->begin(); neighbor != overlapCap->end(); ++neighbor) {
1999:           Obj<typename Overlap_::traits::supportSequence> support = overlap->support(*neighbor);

2001:           for(typename Overlap_::traits::supportSequence::iterator p_iter = support->begin(); p_iter != support->end(); ++p_iter) {
2002:             const Obj<typename graph_type::traits::coneSequence>& localCone = _graph->cone(*p_iter);
2003:             int remoteConeSize = p_iter.color().second.first;
2004:             cone_array_sequence remoteCone(&ConesIn[offset], remoteConeSize, *p_iter);
2005:             if (debug) {
2006:               ostringstream txt;

2008:               txt << "["<<rank<<"]Unpacking cone for " << *p_iter << std::endl;
2009:               remoteCone.view(txt, true);
2010:             ierr = PetscSynchronizedPrintf(comm, txt.str().c_str()); CHKERROR(ierr, "Error in PetscSynchronizedPrintf");
2011:             }
2012:             // Fuse in received cones
2013:             fuser->fuseCones(localCone, remoteCone, fusion->cone(fuser->fuseBasePoints(*p_iter, *p_iter)));
2014:             offset += remoteConeSize;
2015:           }
2016:         }
2017:         if (debug) {
2018:           ierr = PetscSynchronizedFlush(comm);CHKERROR(ierr,"Error in PetscSynchronizedFlush");
2019:         }
2020:       };// __computeFusionNew()

2024:       template <typename Overlap_, typename Fusion_>
2025:       static void __computeFusionNew(const Obj<graph_type>& _graphA, const Obj<graph_type>& _graphB, const Obj<Overlap_>& overlap, Obj<Fusion_> fusion, const Obj<fuser_type>& fuser) {
2026:         typedef ConeArraySequence<typename graph_type::traits::arrow_type> cone_array_sequence;
2027:         typedef typename cone_array_sequence::cone_arrow_type              cone_arrow_type;
2028:         MPI_Comm       comm = _graphA->comm();
2029:         int            rank = _graphA->commRank();
2030:         PetscObject    petscObj = _graphA->petscObj();
2031:         PetscMPIInt    tag1;
2032:         PetscErrorCode ierr;

2034:         Obj<typename Overlap_::traits::capSequence> overlapCap = overlap->cap();
2035:         int msgSize = sizeof(cone_arrow_type)/sizeof(int); // Messages are arrows

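        // Support points of the bipartite overlap are tagged 0 (graph A) or 1 (graph B);
        // count, for each side, the neighbors with at least one nonempty exchange.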
2037:         int NeighborCountA = 0, NeighborCountB = 0;
2038:         for(typename Overlap_::traits::capSequence::iterator neighbor = overlapCap->begin(); neighbor != overlapCap->end(); ++neighbor) {
2039:           Obj<typename Overlap_::traits::supportSequence> support = overlap->support(*neighbor);

2041:           for(typename Overlap_::traits::supportSequence::iterator p_iter = support->begin(); p_iter != support->end(); ++p_iter) {
2042:             if (((*p_iter).first == 0) && (p_iter.color().second.first || p_iter.color().second.second)) {
2043:               NeighborCountA++;
2044:               break;
2045:             }
2046:           }
2047:           for(typename Overlap_::traits::supportSequence::iterator p_iter = support->begin(); p_iter != support->end(); ++p_iter) {
2048:             if (((*p_iter).first == 1) && (p_iter.color().second.first || p_iter.color().second.second)) {
2049:               NeighborCountB++;
2050:               break;
2051:             }
2052:           }
2053:         }

2055:         int *NeighborsA, *NeighborsB; // Neighbor processes
2056:         int *SellSizesA, *BuySizesA;  // Sizes of the A cones to transmit and B cones to receive
2057:         int *SellSizesB, *BuySizesB;  // Sizes of the B cones to transmit and A cones to receive
2058:         int *SellConesA = PETSC_NULL, *BuyConesA = PETSC_NULL;
2059:         int *SellConesB = PETSC_NULL, *BuyConesB = PETSC_NULL;
2060:         int nA, nB, offsetA, offsetB;
2061:         ierr = PetscMalloc2(NeighborCountA,int,&NeighborsA,NeighborCountB,int,&NeighborsB);CHKERROR(ierr, "Error in PetscMalloc");
2062:         ierr = PetscMalloc2(NeighborCountA,int,&SellSizesA,NeighborCountA,int,&BuySizesA);CHKERROR(ierr, "Error in PetscMalloc");
2063:         ierr = PetscMalloc2(NeighborCountB,int,&SellSizesB,NeighborCountB,int,&BuySizesB);CHKERROR(ierr, "Error in PetscMalloc");

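        // First pass: record the neighbor ranks participating on each side and zero the
        // buy/sell counters.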
2065:         nA = 0;
2066:         nB = 0;
2067:         for(typename Overlap_::traits::capSequence::iterator neighbor = overlapCap->begin(); neighbor != overlapCap->end(); ++neighbor) {
2068:           Obj<typename Overlap_::traits::supportSequence> support = overlap->support(*neighbor);

2070:           for(typename Overlap_::traits::supportSequence::iterator p_iter = support->begin(); p_iter != support->end(); ++p_iter) {
2071:             if (((*p_iter).first == 0) && (p_iter.color().second.first || p_iter.color().second.second)) {
2072:               NeighborsA[nA] = *neighbor;
2073:               BuySizesA[nA] = 0;
2074:               SellSizesA[nA] = 0;
2075:               nA++;
2076:               break;
2077:             }
2078:           }
2079:           for(typename Overlap_::traits::supportSequence::iterator p_iter = support->begin(); p_iter != support->end(); ++p_iter) {
2080:             if (((*p_iter).first == 1) && (p_iter.color().second.first || p_iter.color().second.second)) {
2081:               NeighborsB[nB] = *neighbor;
2082:               BuySizesB[nB] = 0;
2083:               SellSizesB[nB] = 0;
2084:               nB++;
2085:               break;
2086:             }
2087:           }
2088:         }
2089:         if ((nA != NeighborCountA) || (nB != NeighborCountB)) {
2090:           throw ALE::Exception("Invalid neighbor count");
2091:         }

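        // Second pass: tally per-neighbor buy/sell sizes; offsetA and offsetB accumulate
        // the total arrows going out of graph A and graph B respectively.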
2093:         nA = 0;
2094:         offsetA = 0;
2095:         nB = 0;
2096:         offsetB = 0;
2097:         for(typename Overlap_::traits::capSequence::iterator neighbor = overlapCap->begin(); neighbor != overlapCap->end(); ++neighbor) {
2098:           Obj<typename Overlap_::traits::supportSequence> support = overlap->support(*neighbor);
2099:           int foundA = 0, foundB = 0;

2101:           for(typename Overlap_::traits::supportSequence::iterator p_iter = support->begin(); p_iter != support->end(); ++p_iter) {
2102:             if (((*p_iter).first == 0) && (p_iter.color().second.first || p_iter.color().second.second)) {
2103:               BuySizesA[nA] += p_iter.color().second.first;
2104:               SellSizesA[nA] += p_iter.color().second.second;
2105:               offsetA += _graphA->cone((*p_iter).second)->size();
2106:               foundA = 1;
2107:             } else if (((*p_iter).first == 1) && (p_iter.color().second.first || p_iter.color().second.second)) {
2108:               BuySizesB[nB] += p_iter.color().second.first;
2109:               SellSizesB[nB] += p_iter.color().second.second;
2110:               offsetB += _graphB->cone((*p_iter).second)->size();
2111:               foundB = 1;
2112:             }
2113:           }
2114:           if (foundA) nA++;
2115:           if (foundB) nB++;
2116:         }

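        // Pack the A-side and B-side cones into separate contiguous send buffers,
        // grouped by neighbor, mirroring the single-graph packing above.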
2118:         ierr = PetscMalloc2(offsetA*msgSize,int,&SellConesA,offsetB*msgSize,int,&SellConesB);CHKERROR(ierr, "Error in PetscMalloc");
2119:         cone_arrow_type *ConesOutA = (cone_arrow_type *) SellConesA;
2120:         cone_arrow_type *ConesOutB = (cone_arrow_type *) SellConesB;
2121:         offsetA = 0;
2122:         offsetB = 0;
2123:         for(typename Overlap_::traits::capSequence::iterator neighbor = overlapCap->begin(); neighbor != overlapCap->end(); ++neighbor) {
2124:           Obj<typename Overlap_::traits::supportSequence> support = overlap->support(*neighbor);

2126:           for(typename Overlap_::traits::supportSequence::iterator p_iter = support->begin(); p_iter != support->end(); ++p_iter) {
2127:             Obj<typename graph_type::traits::coneSequence> cone;
2128:             const Point& p = (*p_iter).second;

2130:             if ((*p_iter).first == 0) {
2131:               cone = _graphA->cone(p);
2132:               for(typename graph_type::traits::coneSequence::iterator c_iter = cone->begin(); c_iter != cone->end(); ++c_iter) {
2133:                 if (debug) {
2134:                   ostringstream txt;

2136:                   txt << "["<<rank<<"]Packing A arrow for " << *neighbor << "  " << *c_iter << "--" << c_iter.color() << "-->" << p << std::endl;
2137:                   ierr = PetscSynchronizedPrintf(comm, txt.str().c_str()); CHKERROR(ierr, "Error in PetscSynchronizedPrintf");
2138:                 }
2139:                 cone_arrow_type::place(ConesOutA+offsetA, typename graph_type::traits::arrow_type(*c_iter, p, c_iter.color()));
2140:                 offsetA++;
2141:               }
2142:             } else {
2143:               cone = _graphB->cone(p);
2144:               for(typename graph_type::traits::coneSequence::iterator c_iter = cone->begin(); c_iter != cone->end(); ++c_iter) {
2145:                 if (debug) {
2146:                   ostringstream txt;

2148:                   txt << "["<<rank<<"]Packing B arrow for " << *neighbor << "  " << *c_iter << "--" << c_iter.color() << "-->" << p << std::endl;
2149:                   ierr = PetscSynchronizedPrintf(comm, txt.str().c_str()); CHKERROR(ierr, "Error in PetscSynchronizedPrintf");
2150:                 }
2151:                 cone_arrow_type::place(ConesOutB+offsetB, typename graph_type::traits::arrow_type(*c_iter, p, c_iter.color()));
2152:                 offsetB++;
2153:               }
2154:             }
2155:             if (p_iter.color().second.second != (int) cone->size()) {
2156:               std::cout << "["<<rank<<"] " << p_iter.color() << " does not match cone size " << cone->size() << std::endl;
2157:               throw ALE::Exception("Non-matching sizes");
2158:             }
2159:           }
2160:         }
2161:         if (debug) {
2162:           ierr = PetscSynchronizedFlush(comm);CHKERROR(ierr,"Error in PetscSynchronizedFlush");
2163:         }

2165:         // Send and retrieve cones of the base overlap
2166:         ierr = PetscObjectGetNewTag(petscObj, &tag1); CHKERROR(ierr, "Failed on PetscObjectGetNewTag");
2167:         commCycle(comm, tag1, msgSize, NeighborCountA, SellSizesA, NeighborsA, SellConesA, NeighborCountB, BuySizesB, NeighborsB, &BuyConesB);
2168:         commCycle(comm, tag1, msgSize, NeighborCountB, SellSizesB, NeighborsB, SellConesB, NeighborCountA, BuySizesA, NeighborsA, &BuyConesA);

2170:         // Must unpack with the BtoA overlap
2171:         //cone_arrow_type *ConesInA = (cone_arrow_type *) BuyConesA;
2172:         cone_arrow_type *ConesInB = (cone_arrow_type *) BuyConesB;
2173:         offsetA = 0;
2174:         offsetB = 0;
2175:         for(typename Overlap_::traits::capSequence::iterator neighbor = overlapCap->begin(); neighbor != overlapCap->end(); ++neighbor) {
2176:           Obj<typename Overlap_::traits::supportSequence> support = overlap->support(*neighbor);

2178:           for(typename Overlap_::traits::supportSequence::iterator p_iter = support->begin(); p_iter != support->end(); ++p_iter) {
2179:             Obj<typename graph_type::traits::coneSequence> localCone;
2180:             const Point& p = (*p_iter).second;
2181:             int remoteConeSize = p_iter.color().second.first;

2183:             // Right now we only provide the A->B fusion
2184:             if ((*p_iter).first == 0) {
2185: #if 0
2186:               cone_array_sequence remoteCone(&ConesInA[offsetA], remoteConeSize, p);

2188:               localCone = _graphA->cone(p);
2189:               offsetA += remoteConeSize;
2190:               if (debug) {
2191:                 ostringstream txt;

2193:                 txt << "["<<rank<<"]Unpacking B cone for " << p << " from " << *neighbor << std::endl;
2194:                 remoteCone.view(txt, true);
2195:                 ierr = PetscSynchronizedPrintf(comm, txt.str().c_str()); CHKERROR(ierr, "Error in PetscSynchronizedPrintf");
2196:               }
2197:               // Fuse in received cones
2198:               fuser->fuseCones(localCone, remoteCone, fusion->cone(fuser->fuseBasePoints(p, p)));
2199: #endif
2200:             } else {
2201:               cone_array_sequence remoteCone(&ConesInB[offsetB], remoteConeSize, p);

2203:               localCone = _graphB->cone(p);
2204:               offsetB += remoteConeSize;
2205:               if (debug) {
2206:                 ostringstream txt;

2208:                 txt << "["<<rank<<"]Unpacking A cone for " << p <<  " from " << *neighbor << std::endl;
2209:                 remoteCone.view(txt, true);
2210:                 ierr = PetscSynchronizedPrintf(comm, txt.str().c_str()); CHKERROR(ierr, "Error in PetscSynchronizedPrintf");
2211:               }
2212:               // Fuse in received cones
2213:               fuser->fuseCones(localCone, remoteCone, fusion->cone(fuser->fuseBasePoints(p, p)));
2214:             }
2215:           }
2216:         }
2217:         if (debug) {
2218:           ierr = PetscSynchronizedFlush(comm);CHKERROR(ierr,"Error in PetscSynchronizedFlush");
2219:         }
2220:       };// __computeFusionNew()

2222:     public:
2223:       static void setDebug(int debug) {ParConeDelta::debug = debug;};
2224:       static int  getDebug() {return ParConeDelta::debug;};
2225:     }; // class ParConeDelta
2226: 
2227:     template <typename ParSifter_, typename Fuser_, typename FusionSifter_>
2228:     int ParConeDelta<ParSifter_, Fuser_, FusionSifter_>::debug = 0;
2229: 

2231:     //
2232:     // Auxiliary type
2233:     //
2234:     template <typename Sifter_>
2235:     class Flip { // class Flip
2236:     public:
2237:       typedef Sifter_       graph_type;
2238:       typedef Flip<Sifter_> flip_type;
2239:     protected:
2240:       Obj<graph_type> _graph;
2241:     public:
2242:       //
2243:       struct traits {
2244:         // Basic types
2245:         typedef typename graph_type::traits::arrow_type::flip::type                 arrow_type;
2246:         typedef typename arrow_type::source_type                                    source_type;
2247:         typedef typename arrow_type::target_type                                    target_type;
2248:         typedef typename arrow_type::color_type                                     color_type;
2249:         // Sequences
2250:         // Be careful: use only a limited set of iterator methods: NO arrow(), source(), target() etc; operator*() and color() are OK.
2251:         typedef typename graph_type::traits::coneSequence                           supportSequence;
2252:         typedef typename graph_type::traits::supportSequence                        coneSequence;
2253:         typedef typename graph_type::traits::baseSequence                           capSequence;
2254:         typedef typename graph_type::traits::capSequence                            baseSequence;
2255:       };
2256:       // Basic interface
2257:       Flip(const Obj<graph_type>& graph) : _graph(graph) {};
2258:       Flip(const Flip& flip) : _graph(flip._graph) {};
2259:       virtual ~Flip() {};
2260:       // Redirect
2261:       // Only a limited set of methods is redirected: simple cone, support, base, cap and arrow insertion.
2262:       //
2263:       // Query methods
2264:       //
2265:       MPI_Comm    comm()     const {return this->_graph->comm();};
2266:       int         commSize() const {return this->_graph->commSize();};
2267:       int         commRank() const {return this->_graph->commRank();};
2268:       PetscObject petscObj() const {return this->_graph->petscObj();};

2270:       int view(const char* label = NULL) {return this->_graph->view(label);};
2271: 
2272:       // FIX: need const_cap, const_base returning const capSequence etc, but those need to have const_iterators, const_begin etc.
2273:       Obj<typename traits::capSequence> cap() {
2274:         return this->_graph->base();
2275:       };
2276:       Obj<typename traits::baseSequence> base() {
2277:         return this->_graph->cap();
2278:       };
2279: 
2280:       Obj<typename traits::coneSequence>
2281:       cone(const typename traits::target_type& p) {
2282:         return this->_graph->support(p);
2283:       };
2284: 
2285:       Obj<typename traits::coneSequence>
2286:       cone(const typename traits::target_type& p, const typename traits::color_type& color) {
2287:         return this->_graph->support(p, color);
2288:       };

2290:       template<typename PointCheck>
2291:       bool coneContains(const typename traits::target_type& p, const PointCheck& checker) {
2292:         return this->_graph->supportContains(p, checker);
2293:       };

2295:       template<typename PointProcess>
2296:       void coneApply(const typename traits::target_type& p, const PointProcess& processor) {
2297:         this->_graph->supportApply(p, processor);
2298:       };
2299: 
2300:       Obj<typename traits::supportSequence>
2301:       support(const typename traits::source_type& p) {
2302:         return this->_graph->cone(p);
2303:       };
2304: 
2305:       Obj<typename traits::supportSequence>
2306:       support(const typename traits::source_type& p, const typename traits::color_type& color) {
2307:         return this->_graph->cone(p,color);
2308:       };
2309: 
2310:       virtual void addArrow(const typename traits::source_type& p, const typename traits::target_type& q) {
2311:         this->_graph->addArrow(q, p);
2312:       };
2313: 
2314:       virtual void addArrow(const typename traits::source_type& p, const typename traits::target_type& q, const typename traits::color_type& color) {
2315:         this->_graph->addArrow(q, p, color);
2316:       };
2317: 
2318:       virtual void addArrow(const typename traits::arrow_type& a) {
2319:         this->_graph->addArrow(a.target, a.source, a.color);
2320:       };
2321: 
2322:     };// class Flip
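    // A minimal usage sketch for Flip (the Sifter type 'MySifter' and the point/arrow
    // names are illustrative): wrapping a graph swaps the roles of cone and support, so
    // support-oriented computations can reuse the cone-oriented ParConeDelta unchanged.
    //
    //   Obj<MySifter>        g = new MySifter(comm);
    //   Obj<Flip<MySifter> > f = Flip<MySifter>(g);
    //   f->cone(p);      // delegates to g->support(p)
    //   f->addArrow(a);  // inserts the reversed arrow a.target -> a.source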


2325:     // WARNING: must pass in a 'flipped' Fuser, that is, a fuser that acts on cones instead of supports
2326:     template<typename ParSifter_,
2327:              typename Fuser_ = RightSequenceDuplicator<ConeArraySequence<typename ParSifter_::traits::arrow_type::flip::type> >,
2328:              typename FusionSifter_ = typename ParSifter_::template rebind<typename Fuser_::fusion_target_type,
2329:                                                                              typename Fuser_::fusion_source_type,
2330:                                                                              typename Fuser_::fusion_color_type>::type>
2331:     class ParSupportDelta {
2332:     public:
2333:       // Here we specialize to Sifters based on Points in order to enable parallel overlap discovery.
2334:       // We also assume that the Points in the base are ordered appropriately so we can use baseSequence.begin() and
2335:       // baseSequence.end() as the extrema for global reduction.
2336:       typedef ParSupportDelta<ParSifter_, Fuser_, FusionSifter_>                                delta_type;
2337:       typedef ParSifter_                                                                        graph_type;
2338:       typedef Fuser_                                                                            fuser_type;
2339:       typedef ASifter<ALE::Point, int, ALE::pair<ALE::Point, ALE::pair<int,int> >, SifterDef::uniColor>                overlap_type;
2340:       typedef ASifter<ALE::pair<int,ALE::Point>, int, ALE::pair<ALE::Point, ALE::pair<int,int> >, SifterDef::uniColor> bioverlap_type;
2341:       typedef FusionSifter_                                                                                            fusion_type;
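      // Note on the overlap color encoding (judging by the packing/unpacking code above):
      // each support arrow is colored with (remote point, (remote cone size, local cone
      // size)), i.e. color().second.first arrows are received for that point and
      // color().second.second arrows are sent.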
2342:       //

2344:       //
2345:       // FIX: Is there a way to inherit this from ParConeDelta?  Right now it is a verbatim copy.
2348:       static Obj<overlap_type>
2349:       overlap(const Obj<graph_type> graph) {
2350:         ALE_LOG_EVENT_BEGIN;
2351:         Obj<overlap_type> overlap = new overlap_type(graph->comm());
2352:         // If this is a serial object, we return an empty overlap
2353:         if((graph->comm() != PETSC_COMM_SELF) && (graph->commSize() > 1)) {
2354:           computeOverlap(graph, overlap);
2355:         }
2356:         ALE_LOG_EVENT_END;
2357:         return overlap;
2358:       };

2360:       template <typename Overlap_>
2361:       static void computeOverlap(const Obj<graph_type>& graph, Obj<Overlap_>& overlap){
2362:         // Flip the graph and the overlap and use ParConeDelta's method
2363:         Obj<Flip<graph_type> > graph_flip   = Flip<graph_type>(graph);
2364:         Obj<Flip<Overlap_> >   overlap_flip = Flip<Overlap_>(overlap);
2365:         ParConeDelta<Flip<graph_type>, fuser_type, Flip<fusion_type> >::computeOverlap(graph_flip, overlap_flip);
2366:       };

2370:       static Obj<bioverlap_type>
2371:       overlap(const Obj<graph_type> graphA, const Obj<graph_type> graphB) {
2372:         ALE_LOG_EVENT_BEGIN;
2373:         Obj<bioverlap_type> overlap = new bioverlap_type(graphA->comm());
2374:         PetscMPIInt         comp;

2376:         MPI_Comm_compare(graphA->comm(), graphB->comm(), &comp);
2377:         if (comp != MPI_IDENT) {
2378:           throw ALE::Exception("Non-matching communicators for overlap");
2379:         }
2380:         Obj<Flip<graph_type> >     graphA_flip  = Flip<graph_type>(graphA);
2381:         Obj<Flip<graph_type> >     graphB_flip  = Flip<graph_type>(graphB);
2382:         Obj<Flip<bioverlap_type> > overlap_flip = Flip<bioverlap_type>(overlap);

2384:         ParConeDelta<Flip<graph_type>, fuser_type, Flip<fusion_type> >::computeOverlap(graphA_flip, graphB_flip, overlap_flip);
2385:         ALE_LOG_EVENT_END;
2386:         return overlap;
2387:       };

2389:       template <typename Overlap_>
2390:       static Obj<fusion_type>
2391:       fusion(const Obj<graph_type>& graphA, const Obj<graph_type>& graphB, const Obj<Overlap_>& overlap, const Obj<fuser_type>& fuser = fuser_type()) {
2392:         Obj<fusion_type> fusion = new fusion_type(graphA->comm());
2393:         PetscMPIInt      comp;

2395:         MPI_Comm_compare(graphA->comm(), graphB->comm(), &comp);
2396:         if (comp != MPI_IDENT) {
2397:           throw ALE::Exception("Non-matching communicators for overlap");
2398:         }
2399:         Obj<Flip<graph_type> >  graphA_flip  = Flip<graph_type>(graphA);
2400:         Obj<Flip<graph_type> >  graphB_flip  = Flip<graph_type>(graphB);
2401:         Obj<Flip<Overlap_> >    overlap_flip = Flip<Overlap_>(overlap);
2402:         Obj<Flip<fusion_type> > fusion_flip  = Flip<fusion_type>(fusion);

2404:         ParConeDelta<Flip<graph_type>, fuser_type, Flip<fusion_type> >::computeFusion(graphA_flip, graphB_flip, overlap_flip, fusion_flip);
2405:         return fusion;
2406:       };

2408:       // FIX: Is there a way to inherit this from ParConeDelta?  Right now it is a verbatim copy.
2409:       template <typename Overlap_>
2410:       static Obj<fusion_type>
2411:       fusion(const Obj<graph_type>& graph, const Obj<Overlap_>& overlap, const Obj<fuser_type>& fuser = new fuser_type()) {
2412:         Obj<fusion_type> fusion = new fusion_type(graph->comm());
2413:         // If this is a serial object, we return an empty fusion
2414:         if((graph->comm() != PETSC_COMM_SELF) && (graph->commSize() > 1)) {
2415:           computeFusion(graph, overlap, fusion, fuser);
2416:         }
2417:         return fusion;
2418:       };

2420:       template <typename Overlap_, typename Fusion_>
2421:       static void computeFusion(const Obj<graph_type>& graph, const Obj<Overlap_>& overlap, Obj<Fusion_> fusion, const Obj<fuser_type>& fuser = new fuser_type()){
2422:         // Flip the graph, the overlap, and the fusion, then use ParConeDelta's method
2423:         Obj<Flip<graph_type> > graph_flip   = Flip<graph_type>(graph);
2424:         Obj<Flip<Overlap_> >   overlap_flip = Flip<Overlap_>(overlap);
2425:         Obj<Flip<Fusion_> >    fusion_flip  = Flip<Fusion_>(fusion);
2426:         ParConeDelta<Flip<graph_type>, fuser_type, Flip<fusion_type> >::computeFusion(graph_flip, overlap_flip, fusion_flip);
2427:       };
2428:     public:
2429:       static void setDebug(int debug) {ParConeDelta<Flip<graph_type>, fuser_type, Flip<fusion_type> >::setDebug(debug);};
2430:       static int  getDebug() {return ParConeDelta<Flip<graph_type>, fuser_type, Flip<fusion_type> >::getDebug();};
2431:     }; // class ParSupportDelta
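    // A typical driver, sketched with a hypothetical Sifter instantiation 'MySifter':
    //
    //   typedef ALE::ParSupportDelta<MySifter> delta_type;
    //   Obj<delta_type::overlap_type> overlap = delta_type::overlap(graph);
    //   Obj<delta_type::fusion_type>  fusion  = delta_type::fusion(graph, overlap);
    //
    // ParConeDelta is driven the same way for cone-oriented exchange; ParSupportDelta
    // simply runs it on Flip'ed arguments.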
2432: 
2433: } // namespace ALE

2435: #endif