001    /*
002     * Copyright (c) 2007-2014 Concurrent, Inc. All Rights Reserved.
003     *
004     * Project and contact information: http://www.cascading.org/
005     *
006     * This file is part of the Cascading project.
007     *
008     * Licensed under the Apache License, Version 2.0 (the "License");
009     * you may not use this file except in compliance with the License.
010     * You may obtain a copy of the License at
011     *
012     *     http://www.apache.org/licenses/LICENSE-2.0
013     *
014     * Unless required by applicable law or agreed to in writing, software
015     * distributed under the License is distributed on an "AS IS" BASIS,
016     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
017     * See the License for the specific language governing permissions and
018     * limitations under the License.
019     */
020    
021    package cascading.flow.planner;
022    
023    import java.util.ArrayList;
024    import java.util.Arrays;
025    import java.util.Collection;
026    import java.util.Collections;
027    import java.util.HashSet;
028    import java.util.List;
029    import java.util.Map;
030    import java.util.Set;
031    
032    import cascading.flow.AssemblyPlanner;
033    import cascading.flow.Flow;
034    import cascading.flow.FlowConnector;
035    import cascading.flow.FlowDef;
036    import cascading.flow.FlowElement;
037    import cascading.operation.AssertionLevel;
038    import cascading.operation.DebugLevel;
039    import cascading.pipe.Checkpoint;
040    import cascading.pipe.CoGroup;
041    import cascading.pipe.Each;
042    import cascading.pipe.Every;
043    import cascading.pipe.Group;
044    import cascading.pipe.GroupBy;
045    import cascading.pipe.HashJoin;
046    import cascading.pipe.Merge;
047    import cascading.pipe.OperatorException;
048    import cascading.pipe.Pipe;
049    import cascading.pipe.Splice;
050    import cascading.pipe.SubAssembly;
051    import cascading.property.PropertyUtil;
052    import cascading.scheme.Scheme;
053    import cascading.tap.Tap;
054    import cascading.tap.TapException;
055    import cascading.tuple.Fields;
056    import cascading.util.Util;
057    import org.jgrapht.GraphPath;
058    import org.jgrapht.Graphs;
059    import org.slf4j.Logger;
060    import org.slf4j.LoggerFactory;
061    
062    import static cascading.flow.planner.ElementGraphs.*;
063    import static java.util.Arrays.asList;
064    
065    /** Class FlowPlanner is the base class for all planner implementations. */
066    public abstract class FlowPlanner<F extends Flow, Config>
067      {
  /** Field LOG */
  private static final Logger LOG = LoggerFactory.getLogger( FlowPlanner.class );

  /** Field properties holds the properties handed to {@link #initialize(FlowConnector, Map)}. */
  protected Map<Object, Object> properties;

  /**
   * Field checkpointRootPath is the prefix passed to {@link #makeTempTap(String, String)} when a temporary tap is
   * created for a {@link Checkpoint} pipe. It is assigned by {@link #createElementGraph(FlowDef, Pipe[])} and
   * stays {@code null} when the FlowDef has no run id.
   */
  protected String checkpointRootPath = null;

  /** Field assertionLevel is the planner default, used when the FlowDef does not declare an AssertionLevel. */
  protected AssertionLevel assertionLevel;
  /** Field debugLevel is the planner default, used when the FlowDef does not declare a DebugLevel. */
  protected DebugLevel debugLevel;
080    
081      /**
082       * Method getAssertionLevel returns the configured target planner {@link cascading.operation.AssertionLevel}.
083       *
084       * @param properties of type Map<Object, Object>
085       * @return AssertionLevel the configured AssertionLevel
086       */
087      static AssertionLevel getAssertionLevel( Map<Object, Object> properties )
088        {
089        String assertionLevel = PropertyUtil.getProperty( properties, "cascading.flowconnector.assertionlevel", AssertionLevel.STRICT.name() );
090    
091        return AssertionLevel.valueOf( assertionLevel );
092        }
093    
094      /**
095       * Method getDebugLevel returns the configured target planner {@link cascading.operation.DebugLevel}.
096       *
097       * @param properties of type Map<Object, Object>
098       * @return DebugLevel the configured DebugLevel
099       */
100      static DebugLevel getDebugLevel( Map<Object, Object> properties )
101        {
102        String debugLevel = PropertyUtil.getProperty( properties, "cascading.flowconnector.debuglevel", DebugLevel.DEFAULT.name() );
103    
104        return DebugLevel.valueOf( debugLevel );
105        }
106    
107      public Map<Object, Object> getProperties()
108        {
109        return properties;
110        }
111    
  /**
   * Method getConfig returns the Config instance of this planner.
   * <p/>
   * NOTE(review): presumably the platform specific configuration object; confirm against the implementations.
   *
   * @return Config
   */
  public abstract Config getConfig();
113    
  /**
   * Method getPlatformInfo returns the {@link PlatformInfo} of this planner. The value is handed to every
   * ElementGraph created by {@link #createElementGraph(FlowDef, Pipe[])}.
   *
   * @return PlatformInfo
   */
  public abstract PlatformInfo getPlatformInfo();
115    
116      public void initialize( FlowConnector flowConnector, Map<Object, Object> properties )
117        {
118        this.properties = properties;
119        this.assertionLevel = getAssertionLevel( properties );
120        this.debugLevel = getDebugLevel( properties );
121        }
122    
  /**
   * Method createFlow creates the Flow instance for the given FlowDef.
   * <p/>
   * NOTE(review): how this relates to {@link #buildFlow(FlowDef)} is not visible here; confirm against the
   * implementations.
   *
   * @param flowDef of type FlowDef
   * @return Flow
   */
  protected abstract Flow createFlow( FlowDef flowDef );
124    
  /**
   * Method buildFlow renders the actual Flow instance.
   *
   * @param flowDef of type FlowDef, the definition to plan
   * @return F the planned Flow
   */
  public abstract F buildFlow( FlowDef flowDef );
132    
133      protected Pipe[] resolveTails( FlowDef flowDef, Flow<Config> flow )
134        {
135        Pipe[] tails = flowDef.getTailsArray();
136    
137        tails = resolveAssemblyPlanners( flowDef, flow, tails );
138    
139        return tails;
140        }
141    
142      protected Pipe[] resolveAssemblyPlanners( FlowDef flowDef, Flow flow, Pipe[] pipes )
143        {
144        List<Pipe> tails = Arrays.asList( pipes );
145    
146        List<AssemblyPlanner> assemblyPlanners = flowDef.getAssemblyPlanners();
147    
148        for( AssemblyPlanner assemblyPlanner : assemblyPlanners )
149          {
150          tails = assemblyPlanner.resolveTails( new AssemblyPlannerContext( flowDef, flow, tails ) );
151    
152          if( tails.isEmpty() )
153            throw new PlannerException( "assembly planner: " + assemblyPlanner + ", returned zero tails" );
154    
155          tails = Collections.unmodifiableList( tails );
156          }
157    
158        return tails.toArray( new Pipe[ tails.size() ] );
159        }
160    
161      protected void verifyAssembly( FlowDef flowDef, Pipe[] tails )
162        {
163        verifyPipeAssemblyEndPoints( flowDef, tails );
164        verifyTraps( flowDef, tails );
165        verifyCheckpoints( flowDef, tails );
166        }
167    
168      protected void verifyAllTaps( FlowDef flowDef )
169        {
170        verifySourceNotSinks( flowDef.getSources(), flowDef.getSinks() );
171    
172        verifyTaps( flowDef.getSources(), true, true );
173        verifyTaps( flowDef.getSinks(), false, true );
174        verifyTaps( flowDef.getTraps(), false, false );
175    
176        // are both sources and sinks
177        verifyTaps( flowDef.getCheckpoints(), true, false );
178        verifyTaps( flowDef.getCheckpoints(), false, false );
179        }
180    
181      protected ElementGraph createElementGraph( FlowDef flowDef, Pipe[] flowTails )
182        {
183        Map<String, Tap> sources = flowDef.getSourcesCopy();
184        Map<String, Tap> sinks = flowDef.getSinksCopy();
185        Map<String, Tap> traps = flowDef.getTrapsCopy();
186        Map<String, Tap> checkpoints = flowDef.getCheckpointsCopy();
187    
188        AssertionLevel assertionLevel = flowDef.getAssertionLevel() == null ? this.assertionLevel : flowDef.getAssertionLevel();
189        DebugLevel debugLevel = flowDef.getDebugLevel() == null ? this.debugLevel : flowDef.getDebugLevel();
190    
191        checkpointRootPath = makeCheckpointRootPath( flowDef );
192    
193        return new ElementGraph( getPlatformInfo(), flowTails, sources, sinks, traps, checkpoints, checkpointRootPath != null, assertionLevel, debugLevel );
194        }
195    
196      private String makeCheckpointRootPath( FlowDef flowDef )
197        {
198        String flowName = flowDef.getName();
199        String runID = flowDef.getRunID();
200    
201        if( runID == null )
202          return null;
203    
204        if( flowName == null )
205          throw new PlannerException( "flow name is required when providing a run id" );
206    
207        return flowName + "/" + runID;
208        }
209    
210    
211      protected void verifySourceNotSinks( Map<String, Tap> sources, Map<String, Tap> sinks )
212        {
213        Collection<Tap> sourcesSet = sources.values();
214    
215        for( Tap tap : sinks.values() )
216          {
217          if( sourcesSet.contains( tap ) )
218            throw new PlannerException( "tap may not be used as both source and sink in the same Flow: " + tap );
219          }
220        }
221    
222      /**
223       * Method verifyTaps ...
224       *
225       * @param taps          of type Map<String, Tap>
226       * @param areSources    of type boolean
227       * @param mayNotBeEmpty of type boolean
228       */
229      protected void verifyTaps( Map<String, Tap> taps, boolean areSources, boolean mayNotBeEmpty )
230        {
231        if( mayNotBeEmpty && taps.isEmpty() )
232          throw new PlannerException( ( areSources ? "source" : "sink" ) + " taps are required" );
233    
234        for( String tapName : taps.keySet() )
235          {
236          if( areSources && !taps.get( tapName ).isSource() )
237            throw new PlannerException( "tap named: '" + tapName + "', cannot be used as a source: " + taps.get( tapName ) );
238          else if( !areSources && !taps.get( tapName ).isSink() )
239            throw new PlannerException( "tap named: '" + tapName + "', cannot be used as a sink: " + taps.get( tapName ) );
240          }
241        }
242    
243      /**
244       * Method verifyEndPoints verifies
245       * <p/>
246       * there aren't dupe names in heads or tails.
247       * all the sink and source tap names match up with tail and head pipes
248       */
249      // todo: force dupe names to throw exceptions
250      protected void verifyPipeAssemblyEndPoints( FlowDef flowDef, Pipe[] flowTails )
251        {
252        Set<String> tapNames = new HashSet<String>();
253    
254        tapNames.addAll( flowDef.getSources().keySet() );
255        tapNames.addAll( flowDef.getSinks().keySet() );
256    
257        // handle tails
258        Set<Pipe> tails = new HashSet<Pipe>();
259        Set<String> tailNames = new HashSet<String>();
260    
261        for( Pipe pipe : flowTails )
262          {
263          if( pipe instanceof SubAssembly )
264            {
265            for( Pipe tail : ( (SubAssembly) pipe ).getTails() )
266              {
267              String tailName = tail.getName();
268    
269              if( !tapNames.contains( tailName ) )
270                throw new PlannerException( tail, "pipe name not found in either sink or source map: '" + tailName + "'" );
271    
272              if( tailNames.contains( tailName ) && !tails.contains( tail ) )
273                LOG.warn( "duplicate tail name found: '{}'", tailName );
274    //            throw new PlannerException( pipe, "duplicate tail name found: " + tailName );
275    
276              tailNames.add( tailName );
277              tails.add( tail );
278              }
279            }
280          else
281            {
282            String tailName = pipe.getName();
283    
284            if( !tapNames.contains( tailName ) )
285              throw new PlannerException( pipe, "pipe name not found in either sink or source map: '" + tailName + "'" );
286    
287            if( tailNames.contains( tailName ) && !tails.contains( pipe ) )
288              LOG.warn( "duplicate tail name found: '{}'", tailName );
289    //            throw new PlannerException( pipe, "duplicate tail name found: " + tailName );
290    
291            tailNames.add( tailName );
292            tails.add( pipe );
293            }
294          }
295    
296    //    Set<String> allTailNames = new HashSet<String>( tailNames );
297        tailNames.removeAll( flowDef.getSinks().keySet() );
298        Set<String> remainingSinks = new HashSet<String>( flowDef.getSinks().keySet() );
299        remainingSinks.removeAll( tailNames );
300    
301        if( tailNames.size() != 0 )
302          throw new PlannerException( "not all tail pipes bound to sink taps, remaining tail pipe names: [" + Util.join( Util.quote( tailNames, "'" ), ", " ) + "], remaining sink tap names: [" + Util.join( Util.quote( remainingSinks, "'" ), ", " ) + "]" );
303    
304        // unlike heads, pipes can input to another pipe and simultaneously be a sink
305        // so there is no way to know all the intentional tails, so they aren't listed below in the exception
306        remainingSinks = new HashSet<String>( flowDef.getSinks().keySet() );
307        remainingSinks.removeAll( asList( Pipe.names( flowTails ) ) );
308    
309        if( remainingSinks.size() != 0 )
310          throw new PlannerException( "not all sink taps bound to tail pipes, remaining sink tap names: [" + Util.join( Util.quote( remainingSinks, "'" ), ", " ) + "]" );
311    
312        // handle heads
313        Set<Pipe> heads = new HashSet<Pipe>();
314        Set<String> headNames = new HashSet<String>();
315    
316        for( Pipe pipe : flowTails )
317          {
318          for( Pipe head : pipe.getHeads() )
319            {
320            String headName = head.getName();
321    
322            if( !tapNames.contains( headName ) )
323              throw new PlannerException( head, "pipe name not found in either sink or source map: '" + headName + "'" );
324    
325            if( headNames.contains( headName ) && !heads.contains( head ) )
326              LOG.warn( "duplicate head name found, not an error but heads should have unique names: '{}'", headName );
327    //          throw new PlannerException( pipe, "duplicate head name found: " + headName );
328    
329            headNames.add( headName );
330            heads.add( head );
331            }
332          }
333    
334        Set<String> allHeadNames = new HashSet<String>( headNames );
335        headNames.removeAll( flowDef.getSources().keySet() );
336        Set<String> remainingSources = new HashSet<String>( flowDef.getSources().keySet() );
337        remainingSources.removeAll( headNames );
338    
339        if( headNames.size() != 0 )
340          throw new PlannerException( "not all head pipes bound to source taps, remaining head pipe names: [" + Util.join( Util.quote( headNames, "'" ), ", " ) + "], remaining source tap names: [" + Util.join( Util.quote( remainingSources, "'" ), ", " ) + "]" );
341    
342        remainingSources = new HashSet<String>( flowDef.getSources().keySet() );
343        remainingSources.removeAll( allHeadNames );
344    
345        if( remainingSources.size() != 0 )
346          throw new PlannerException( "not all source taps bound to head pipes, remaining source tap names: [" + Util.join( Util.quote( remainingSources, "'" ), ", " ) + "], remaining head pipe names: [" + Util.join( Util.quote( headNames, "'" ), ", " ) + "]" );
347    
348        }
349    
350      protected void verifyTraps( FlowDef flowDef, Pipe[] flowTails )
351        {
352        verifyNotSourcesSinks( flowDef.getTraps(), flowDef.getSources(), flowDef.getSinks(), "trap" );
353    
354        Set<String> names = new HashSet<String>( asList( Pipe.names( flowTails ) ) );
355    
356        for( String name : flowDef.getTraps().keySet() )
357          {
358          if( !names.contains( name ) )
359            throw new PlannerException( "trap name not found in assembly: '" + name + "'" );
360          }
361        }
362    
363      protected void verifyCheckpoints( FlowDef flowDef, Pipe[] flowTails )
364        {
365        verifyNotSourcesSinks( flowDef.getCheckpoints(), flowDef.getSources(), flowDef.getSinks(), "checkpoint" );
366    
367        for( Tap checkpointTap : flowDef.getCheckpoints().values() )
368          {
369          Scheme scheme = checkpointTap.getScheme();
370    
371          if( scheme.getSourceFields().equals( Fields.UNKNOWN ) && scheme.getSinkFields().equals( Fields.ALL ) )
372            continue;
373    
374          throw new PlannerException( "checkpoint tap scheme must be undeclared, source fields must be UNKNOWN, and sink fields ALL, got: " + scheme.toString() );
375          }
376    
377        Set<String> names = new HashSet<String>( asList( Pipe.names( flowTails ) ) );
378    
379        for( String name : flowDef.getCheckpoints().keySet() )
380          {
381          if( !names.contains( name ) )
382            throw new PlannerException( "checkpoint name not found in assembly: '" + name + "'" );
383    
384          Set<Pipe> pipes = new HashSet<Pipe>( asList( Pipe.named( name, flowTails ) ) );
385    
386          int count = 0;
387    
388          for( Pipe pipe : pipes )
389            {
390            if( pipe instanceof Checkpoint )
391              count++;
392            }
393    
394          if( count == 0 )
395            throw new PlannerException( "no checkpoint with name found in assembly: '" + name + "'" );
396    
397          if( count > 1 )
398            throw new PlannerException( "more than one checkpoint with name found in assembly: '" + name + "'" );
399          }
400        }
401    
402      private void verifyNotSourcesSinks( Map<String, Tap> taps, Map<String, Tap> sources, Map<String, Tap> sinks, String role )
403        {
404        Collection<Tap> sourceTaps = sources.values();
405        Collection<Tap> sinkTaps = sinks.values();
406    
407        for( Tap tap : taps.values() )
408          {
409          if( sourceTaps.contains( tap ) )
410            throw new PlannerException( "tap may not be used as both a " + role + " and a source in the same Flow: " + tap );
411    
412          if( sinkTaps.contains( tap ) )
413            throw new PlannerException( "tap may not be used as both a " + role + " and a sink in the same Flow: " + tap );
414          }
415        }
416    
417      /**
418       * Verifies that there are not only GroupAssertions following any given Group instance. This will adversely
419       * affect the stream entering any subsequent Tap of Each instances.
420       */
421      protected void failOnLoneGroupAssertion( ElementGraph elementGraph )
422        {
423        List<Group> groups = elementGraph.findAllGroups();
424    
425        // walk Every instances after Group
426        for( Group group : groups )
427          {
428          for( GraphPath<FlowElement, Scope> path : elementGraph.getAllShortestPathsFrom( group ) )
429            {
430            List<FlowElement> flowElements = Graphs.getPathVertexList( path ); // last element is tail
431    
432            int everies = 0;
433            int assertions = 0;
434    
435            for( FlowElement flowElement : flowElements )
436              {
437              if( flowElement instanceof Group )
438                continue;
439    
440              if( !( flowElement instanceof Every ) )
441                break;
442    
443              everies++;
444    
445              Every every = (Every) flowElement;
446    
447              if( every.getPlannerLevel() != null )
448                assertions++;
449              }
450    
451            if( everies != 0 && everies == assertions )
452              throw new PlannerException( "group assertions must be accompanied by aggregator operations" );
453            }
454          }
455        }
456    
457      protected void failOnMissingGroup( ElementGraph elementGraph )
458        {
459        List<Every> everies = elementGraph.findAllEveries();
460    
461        // walk Every instances after Group
462        for( Every every : everies )
463          {
464          for( GraphPath<FlowElement, Scope> path : elementGraph.getAllShortestPathsTo( every ) )
465            {
466            List<FlowElement> flowElements = Graphs.getPathVertexList( path ); // last element is every
467            Collections.reverse( flowElements ); // first element is every
468    
469            for( FlowElement flowElement : flowElements )
470              {
471              if( flowElement instanceof Every || flowElement.getClass() == Pipe.class )
472                continue;
473    
474              if( flowElement instanceof GroupBy || flowElement instanceof CoGroup )
475                break;
476    
477              throw new PlannerException( (Pipe) flowElement, "Every may only be preceded by another Every or a Group pipe, found: " + flowElement );
478              }
479            }
480          }
481        }
482    
483      protected void failOnMisusedBuffer( ElementGraph elementGraph )
484        {
485        List<Every> everies = elementGraph.findAllEveries();
486    
487        // walk Every instances after Group
488        for( Every every : everies )
489          {
490          for( GraphPath<FlowElement, Scope> path : elementGraph.getAllShortestPathsTo( every ) )
491            {
492            List<FlowElement> flowElements = Graphs.getPathVertexList( path ); // last element is every
493            Collections.reverse( flowElements ); // first element is every
494    
495            Every last = null;
496            boolean foundBuffer = false;
497            int foundEveries = -1;
498    
499            for( FlowElement flowElement : flowElements )
500              {
501              if( flowElement instanceof Each )
502                throw new PlannerException( (Pipe) flowElement, "Every may only be preceded by another Every or a GroupBy or CoGroup pipe, found: " + flowElement );
503    
504              if( flowElement instanceof Every )
505                {
506                foundEveries++;
507    
508                boolean isBuffer = ( (Every) flowElement ).isBuffer();
509    
510                if( foundEveries != 0 && ( isBuffer || foundBuffer ) )
511                  throw new PlannerException( (Pipe) flowElement, "Only one Every with a Buffer may follow a GroupBy or CoGroup pipe, no other Every instances are allowed immediately before or after, found: " + flowElement + " before: " + last );
512    
513                if( !foundBuffer )
514                  foundBuffer = isBuffer;
515    
516                last = (Every) flowElement;
517                }
518    
519              if( flowElement instanceof Group )
520                break;
521              }
522            }
523          }
524        }
525    
526      protected void failOnGroupEverySplit( ElementGraph elementGraph )
527        {
528        List<Group> groups = new ArrayList<Group>();
529    
530        elementGraph.findAllOfType( 1, 2, Group.class, groups );
531    
532        for( Group group : groups )
533          {
534          Set<FlowElement> children = elementGraph.getAllChildrenNotExactlyType( group, Pipe.class );
535    
536          for( FlowElement flowElement : children )
537            {
538            if( flowElement instanceof Every )
539              throw new PlannerException( (Every) flowElement, "Every instances may not split after a GroupBy or CoGroup pipe, found: " + flowElement + " after: " + group );
540            }
541          }
542        }
543    
544      protected PlannerException handleExceptionDuringPlanning( Exception exception, ElementGraph elementGraph )
545        {
546        if( exception instanceof PlannerException )
547          {
548          ( (PlannerException) exception ).elementGraph = elementGraph;
549    
550          return (PlannerException) exception;
551          }
552        else if( exception instanceof ElementGraphException )
553          {
554          Throwable cause = exception.getCause();
555    
556          if( cause == null )
557            cause = exception;
558    
559          // captures pipegraph for debugging
560          // forward message in case cause or trace is lost
561          String message = String.format( "could not build flow from assembly: [%s]", cause.getMessage() );
562    
563          if( cause instanceof OperatorException )
564            return new PlannerException( message, cause, elementGraph );
565    
566          if( cause instanceof TapException )
567            return new PlannerException( message, cause, elementGraph );
568    
569          return new PlannerException( ( (ElementGraphException) exception ).getPipe(), message, cause, elementGraph );
570          }
571        else
572          {
573          // captures pipegraph for debugging
574          // forward message in case cause or trace is lost
575          String message = String.format( "could not build flow from assembly: [%s]", exception.getMessage() );
576          return new PlannerException( message, exception, elementGraph );
577          }
578        }
579    
580      protected void handleNonSafeOperations( ElementGraph elementGraph )
581        {
582        // if there was a graph change, iterate paths again.
583        while( !internalNonSafeOperations( elementGraph ) )
584          ;
585        }
586    
587      private boolean internalNonSafeOperations( ElementGraph elementGraph )
588        {
589        Set<Pipe> tapInsertions = new HashSet<Pipe>();
590    
591        List<Pipe> splits = elementGraph.findAllPipeSplits();
592    
593        // if any predecessor is unsafe, insert temp
594        for( Pipe split : splits )
595          {
596          List<GraphPath<FlowElement, Scope>> paths = elementGraph.getAllShortestPathsTo( split );
597    
598          for( GraphPath<FlowElement, Scope> path : paths )
599            {
600            List<FlowElement> elements = Graphs.getPathVertexList( path );
601            Collections.reverse( elements );
602    
603            for( FlowElement element : elements )
604              {
605              if( !( element instanceof Each ) && element.getClass() != Pipe.class )
606                break;
607    
608              if( element.getClass() == Pipe.class )
609                continue;
610    
611              if( !( (Each) element ).getOperation().isSafe() )
612                {
613                tapInsertions.add( split );
614                break;
615                }
616              }
617            }
618          }
619    
620        for( Pipe pipe : tapInsertions )
621          insertTempTapAfter( elementGraph, pipe );
622    
623        return tapInsertions.isEmpty();
624        }
625    
626      /**
627       * Method insertTapAfter ...
628       *
629       * @param graph of type PipeGraph
630       * @param pipe  of type Pipe
631       */
632      protected void insertTempTapAfter( ElementGraph graph, Pipe pipe )
633        {
634        LOG.debug( "inserting tap after: {}", pipe );
635    
636        Tap checkpointTap = graph.getCheckpointsMap().get( pipe.getName() );
637    
638        if( checkpointTap != null )
639          LOG.info( "found checkpoint: {}, using tap: {}", pipe.getName(), checkpointTap );
640    
641        if( checkpointTap == null )
642          {
643          // only restart from a checkpoint pipe or checkpoint tap below
644          if( pipe instanceof Checkpoint )
645            checkpointTap = makeTempTap( checkpointRootPath, pipe.getName() );
646          else
647            checkpointTap = makeTempTap( pipe.getName() );
648          }
649    
650        graph.insertFlowElementAfter( pipe, checkpointTap );
651        }
652    
653      protected Tap makeTempTap( String name )
654        {
655        return makeTempTap( null, name );
656        }
657    
  /**
   * Method makeTempTap creates the tap inserted by {@link #insertTempTapAfter(ElementGraph, Pipe)} when no
   * checkpoint tap is registered for a pipe.
   *
   * @param prefix of type String, the checkpoint root path for Checkpoint pipes, null otherwise
   * @param name   of type String, the name of the pipe the tap is inserted after
   * @return Tap
   */
  protected abstract Tap makeTempTap( String prefix, String name );
659    
660      /**
661       * Inserts a temporary Tap between logical MR jobs.
662       * <p/>
663       * Since all joins are at groups or splices, depth first search is safe
664       * <p/>
665       * todo: refactor so that rules are applied to path segments bounded by taps
666       * todo: this would allow balancing of operations within paths instead of pushing
667       * todo: all operations up. may allow for consolidation of rules
668       *
669       * @param elementGraph of type PipeGraph
670       */
671      protected void handleJobPartitioning( ElementGraph elementGraph )
672        {
673        // if there was a graph change, iterate paths again. prevents many temp taps from being inserted in front of a group
674        while( !internalJobPartitioning( elementGraph ) )
675          ;
676        }
677    
678      private boolean internalJobPartitioning( ElementGraph elementGraph )
679        {
680        for( GraphPath<FlowElement, Scope> path : elementGraph.getAllShortestPathsBetweenExtents() )
681          {
682          List<FlowElement> flowElements = Graphs.getPathVertexList( path );
683          List<Pipe> tapInsertions = new ArrayList<Pipe>();
684    
685          boolean foundGroup = false;
686    
687          for( int i = 0; i < flowElements.size(); i++ )
688            {
689            FlowElement flowElement = flowElements.get( i );
690    
691            if( flowElement instanceof ElementGraph.Extent ) // is an extent: head or tail
692              continue;
693            else if( flowElement instanceof Tap && flowElements.get( i - 1 ) instanceof ElementGraph.Extent )  // is a source tap
694              continue;
695    
696            if( flowElement instanceof Group && !foundGroup )
697              {
698              foundGroup = true;
699              }
700            else if( flowElement instanceof Splice && foundGroup ) // add tap between groups, push joins/merge map side
701              {
702              tapInsertions.add( (Pipe) flowElements.get( i - 1 ) );
703    
704              if( !( flowElement instanceof Group ) )
705                foundGroup = false;
706              }
707            else if( flowElement instanceof Checkpoint ) // add tap after checkpoint
708              {
709              if( flowElements.get( i + 1 ) instanceof Tap ) // don't keep inserting
710                continue;
711    
712              tapInsertions.add( (Pipe) flowElement );
713              foundGroup = false;
714              }
715            else if( flowElement instanceof Tap )
716              {
717              foundGroup = false;
718              }
719            }
720    
721          for( Pipe pipe : tapInsertions )
722            insertTempTapAfter( elementGraph, pipe );
723    
724          if( !tapInsertions.isEmpty() )
725            return false;
726          }
727    
728        return true;
729        }
730    
731      /**
732       * Prevent leftmost sources from sourcing a downstream join on the rightmost side intra-task by inserting a
733       * temp tap between the left-sourced join and right-sourced join.
734       *
735       * @param elementGraph
736       */
737      protected void handleJoins( ElementGraph elementGraph )
738        {
739        while( !internalJoins( elementGraph ) )
740          ;
741        }
742    
743      private boolean internalJoins( ElementGraph elementGraph )
744        {
745        List<GraphPath<FlowElement, Scope>> paths = elementGraph.getAllShortestPathsBetweenExtents();
746    
747        // large to small
748        Collections.reverse( paths );
749    
750        for( GraphPath<FlowElement, Scope> path : paths )
751          {
752          List<FlowElement> flowElements = Graphs.getPathVertexList( path );
753          List<Pipe> tapInsertions = new ArrayList<Pipe>();
754          List<HashJoin> joins = new ArrayList<HashJoin>();
755          List<Merge> merges = new ArrayList<Merge>();
756    
757          FlowElement lastSourceElement = null;
758    
759          for( int i = 0; i < flowElements.size(); i++ )
760            {
761            FlowElement flowElement = flowElements.get( i );
762    
763            if( flowElement instanceof Merge )
764              {
765              merges.add( (Merge) flowElement );
766              }
767            else if( flowElement instanceof HashJoin )
768              {
769              HashJoin join = (HashJoin) flowElement;
770    
771              Map<Integer, Integer> pathCounts = countOrderedDirectPathsBetween( elementGraph, lastSourceElement, join, true );
772    
773              // is this path streamed
774              int pathPosition = pathPositionInto( path, join );
775              boolean thisPathIsStreamed = pathPosition == 0;
776    
777              boolean isAccumulatedAndStreamed = isBothAccumulatedAndStreamedPath( pathCounts ); // has streamed and accumulated paths
778              int pathCount = countPaths( pathCounts );
779    
780              int priorJoins = countTypesBetween( elementGraph, lastSourceElement, join, HashJoin.class );
781    
782              if( priorJoins == 0 )
783                {
784                // if same source is leading into the hashjoin, insert tap on the accumulated side
785                if( pathCount == 2 && isAccumulatedAndStreamed && !thisPathIsStreamed )
786                  {
787                  tapInsertions.add( (Pipe) flowElements.get( flowElements.indexOf( join ) - 1 ) );
788                  break;
789                  }
790    
791                // if more than one path into streamed and accumulated branches, insert tap on streamed side
792                if( pathCount > 2 && isAccumulatedAndStreamed && thisPathIsStreamed )
793                  {
794                  tapInsertions.add( (Pipe) flowElements.get( flowElements.indexOf( join ) - 1 ) );
795                  break;
796                  }
797                }
798    
799              if( !merges.isEmpty() )
800                {
801                // if a Merge is prior to a HashJoin, and its an accumulated path, force Merge results to disk
802                int joinPos = flowElements.indexOf( join );
803                int mergePos = nearest( flowElements, joinPos, merges );
804    
805                if( mergePos != -1 && joinPos > mergePos )
806                  {
807                  // if all paths are accumulated and streamed, insert
808                  // else if just if this path is accumulated
809                  if( ( isAccumulatedAndStreamed && thisPathIsStreamed ) || !thisPathIsStreamed )
810                    {
811                    tapInsertions.add( (Pipe) flowElements.get( flowElements.indexOf( join ) - 1 ) );
812                    break;
813                    }
814                  }
815                }
816    
817              joins.add( (HashJoin) flowElement );
818              }
819            else if( flowElement instanceof Tap || flowElement instanceof Group )
820              {
821              for( int j = 0; j < joins.size(); j++ )
822                {
823                HashJoin join = joins.get( j );
824    
825                int pathPosition = pathPositionInto( path, join );
826                boolean thisPathIsStreamed = pathPosition == 0;
827    
828                Map<Integer, Integer> pathCounts = countOrderedDirectPathsBetween( elementGraph, lastSourceElement, join, true );
829    
830                boolean isAccumulatedAndStreamed = isBothAccumulatedAndStreamedPath( pathCounts ); // has streamed and accumulated paths
831                int pathCount = countPaths( pathCounts );
832    
833                if( pathCount >= 2 && isAccumulatedAndStreamed && thisPathIsStreamed )
834                  {
835                  tapInsertions.add( (Pipe) flowElements.get( flowElements.indexOf( join ) - 1 ) );
836                  break;
837                  }
838    
839                if( thisPathIsStreamed )
840                  continue;
841    
842                if( j == 0 ) // is accumulated on first join
843                  break;
844    
845                // prevent a streamed path from being accumulated by injecting a tap before the
846                // current HashJoin
847                tapInsertions.add( (Pipe) flowElements.get( flowElements.indexOf( join ) - 1 ) );
848                break;
849                }
850    
851              if( !tapInsertions.isEmpty() )
852                break;
853    
854              lastSourceElement = flowElement;
855              merges.clear();
856              joins.clear();
857              }
858            }
859    
860          for( Pipe pipe : tapInsertions )
861            insertTempTapAfter( elementGraph, pipe );
862    
863          if( !tapInsertions.isEmpty() )
864            return false;
865          }
866    
867        return true;
868        }
869    
870      private int nearest( List<FlowElement> flowElements, int index, List<Merge> merges )
871        {
872        List<Merge> reversed = new ArrayList<Merge>( merges );
873        Collections.reverse( reversed );
874    
875        for( Merge merge : reversed )
876          {
877          int pos = flowElements.indexOf( merge );
878          if( pos < index )
879            return pos;
880          }
881    
882        return -1;
883        }
884      }