001    /*
002     * Copyright (c) 2007-2014 Concurrent, Inc. All Rights Reserved.
003     *
004     * Project and contact information: http://www.cascading.org/
005     *
006     * This file is part of the Cascading project.
007     *
008     * Licensed under the Apache License, Version 2.0 (the "License");
009     * you may not use this file except in compliance with the License.
010     * You may obtain a copy of the License at
011     *
012     *     http://www.apache.org/licenses/LICENSE-2.0
013     *
014     * Unless required by applicable law or agreed to in writing, software
015     * distributed under the License is distributed on an "AS IS" BASIS,
016     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
017     * See the License for the specific language governing permissions and
018     * limitations under the License.
019     */
020    
021    package cascading.flow.hadoop.planner;
022    
023    import java.net.URI;
024    import java.util.Collections;
025    import java.util.Comparator;
026    import java.util.HashMap;
027    import java.util.HashSet;
028    import java.util.Iterator;
029    import java.util.List;
030    import java.util.Map;
031    import java.util.Properties;
032    import java.util.Set;
033    import java.util.TreeSet;
034    
035    import cascading.flow.FlowConnector;
036    import cascading.flow.FlowDef;
037    import cascading.flow.FlowElement;
038    import cascading.flow.hadoop.HadoopFlow;
039    import cascading.flow.hadoop.util.HadoopUtil;
040    import cascading.flow.planner.ElementGraph;
041    import cascading.flow.planner.ElementGraphs;
042    import cascading.flow.planner.FlowPlanner;
043    import cascading.flow.planner.FlowStepGraph;
044    import cascading.flow.planner.PlatformInfo;
045    import cascading.flow.planner.Scope;
046    import cascading.pipe.CoGroup;
047    import cascading.pipe.Every;
048    import cascading.pipe.Group;
049    import cascading.pipe.Pipe;
050    import cascading.property.AppProps;
051    import cascading.property.PropertyUtil;
052    import cascading.tap.Tap;
053    import cascading.tap.hadoop.Hfs;
054    import cascading.tap.hadoop.util.TempHfs;
055    import cascading.util.Util;
056    import org.apache.hadoop.mapred.JobConf;
057    import org.jgrapht.GraphPath;
058    import org.jgrapht.Graphs;
059    import org.slf4j.Logger;
060    import org.slf4j.LoggerFactory;
061    
062    import static cascading.flow.planner.ElementGraphs.getAllShortestPathsBetween;
063    
064    /**
065     * Class HadoopPlanner is the core Hadoop MapReduce planner.
066     * <p/>
067     * Notes:
068     * <p/>
069     * <strong>Custom JobConf properties</strong><br/>
070     * A custom JobConf instance can be passed to this planner by calling {@link #copyJobConf(java.util.Map, org.apache.hadoop.mapred.JobConf)}
071     * on a map properties object before constructing a new {@link cascading.flow.hadoop.HadoopFlowConnector}.
072     * <p/>
073     * A better practice would be to set Hadoop properties directly on the map properties object handed to the FlowConnector.
074     * All values in the map will be passed to a new default JobConf instance to be used as defaults for all resulting
075     * Flow instances.
076     * <p/>
 * For example, {@code properties.put("mapred.child.java.opts","-Xmx512m");} would convince Hadoop
078     * to spawn all child jvms with a heap of 512MB.
079     */
080    public class HadoopPlanner extends FlowPlanner<HadoopFlow, JobConf>
081      {
  /** Field LOG is the SLF4J logger for this planner class. */
  private static final Logger LOG = LoggerFactory.getLogger( HadoopPlanner.class );

  /** Field jobConf is the planner wide JobConf, assigned in initialize and returned by getConfig. */
  private JobConf jobConf;
  /** Field intermediateSchemeClass is the scheme type handed to every temporary tap, obtained from the FlowConnector in initialize. */
  private Class intermediateSchemeClass;
089    
090      /**
091       * Method copyJobConf adds the given JobConf values to the given properties object. Use this method to pass
092       * custom default Hadoop JobConf properties to Hadoop.
093       *
094       * @param properties of type Map
095       * @param jobConf    of type JobConf
096       */
097      public static void copyJobConf( Map<Object, Object> properties, JobConf jobConf )
098        {
099        for( Map.Entry<String, String> entry : jobConf )
100          properties.put( entry.getKey(), entry.getValue() );
101        }
102    
103      /**
104       * Method createJobConf returns a new JobConf instance using the values in the given properties argument.
105       *
106       * @param properties of type Map
107       * @return a JobConf instance
108       */
109      public static JobConf createJobConf( Map<Object, Object> properties )
110        {
111        JobConf conf = new JobConf();
112    
113        copyProperties( conf, properties );
114    
115        return conf;
116        }
117    
118      /**
119       * Method copyProperties adds the given Map values to the given JobConf object.
120       *
121       * @param jobConf    of type JobConf
122       * @param properties of type Map
123       */
124      public static void copyProperties( JobConf jobConf, Map<Object, Object> properties )
125        {
126        if( properties instanceof Properties )
127          {
128          Properties props = (Properties) properties;
129          Set<String> keys = props.stringPropertyNames();
130    
131          for( String key : keys )
132            jobConf.set( key, props.getProperty( key ) );
133          }
134        else
135          {
136          for( Map.Entry<Object, Object> entry : properties.entrySet() )
137            {
138            if( entry.getValue() != null )
139              jobConf.set( entry.getKey().toString(), entry.getValue().toString() );
140            }
141          }
142        }
143    
144      /**
145       * Method setNormalizeHeterogeneousSources adds the given doNormalize boolean to the given properties object.
146       * Use this method if additional jobs should be planned in to handle incompatible InputFormat classes.
147       * <p/>
148       * Normalization is off by default and should only be enabled by advanced users. Typically this will decrease
149       * application performance.
150       *
151       * @param properties  of type Map
152       * @param doNormalize of type boolean
153       */
154      @Deprecated
155      public static void setNormalizeHeterogeneousSources( Map<Object, Object> properties, boolean doNormalize )
156        {
157        properties.put( "cascading.multimapreduceplanner.normalizesources", Boolean.toString( doNormalize ) );
158        }
159    
160      /**
161       * Method getNormalizeHeterogeneousSources returns if this planner will normalize heterogeneous input sources.
162       *
163       * @param properties of type Map
164       * @return a boolean
165       */
166      @Deprecated
167      public static boolean getNormalizeHeterogeneousSources( Map<Object, Object> properties )
168        {
169        return Boolean.parseBoolean( PropertyUtil.getProperty( properties, "cascading.multimapreduceplanner.normalizesources", "false" ) );
170        }
171    
172      @Override
173      public JobConf getConfig()
174        {
175        return jobConf;
176        }
177    
178      @Override
179      public PlatformInfo getPlatformInfo()
180        {
181        return HadoopUtil.getPlatformInfo();
182        }
183    
184      @Override
185      public void initialize( FlowConnector flowConnector, Map<Object, Object> properties )
186        {
187        super.initialize( flowConnector, properties );
188    
189        jobConf = HadoopUtil.createJobConf( properties, createJobConf( properties ) );
190        intermediateSchemeClass = flowConnector.getIntermediateSchemeClass( properties );
191    
192        Class type = AppProps.getApplicationJarClass( properties );
193        if( jobConf.getJar() == null && type != null )
194          jobConf.setJarByClass( type );
195    
196        String path = AppProps.getApplicationJarPath( properties );
197        if( jobConf.getJar() == null && path != null )
198          jobConf.setJar( path );
199    
200        if( jobConf.getJar() == null )
201          jobConf.setJarByClass( HadoopUtil.findMainClass( HadoopPlanner.class ) );
202    
203        AppProps.setApplicationJarPath( properties, jobConf.getJar() );
204    
205        LOG.info( "using application jar: {}", jobConf.getJar() );
206        }
207    
208      @Override
209      protected HadoopFlow createFlow( FlowDef flowDef )
210        {
211        return new HadoopFlow( getPlatformInfo(), getProperties(), getConfig(), flowDef );
212        }
213    
214      @Override
215      public HadoopFlow buildFlow( FlowDef flowDef )
216        {
217        ElementGraph elementGraph = null;
218    
219        try
220          {
221          // generic
222          verifyAllTaps( flowDef );
223    
224          HadoopFlow flow = createFlow( flowDef );
225    
226          Pipe[] tails = resolveTails( flowDef, flow );
227    
228          verifyAssembly( flowDef, tails );
229    
230          elementGraph = createElementGraph( flowDef, tails );
231    
232          // rules
233          failOnLoneGroupAssertion( elementGraph );
234          failOnMissingGroup( elementGraph );
235          failOnMisusedBuffer( elementGraph );
236          failOnGroupEverySplit( elementGraph );
237    
238          // m/r specific
239          handleWarnEquivalentPaths( elementGraph );
240          handleSplit( elementGraph );
241          handleJobPartitioning( elementGraph );
242          handleJoins( elementGraph );
243          handleNonSafeOperations( elementGraph );
244    
245          if( getNormalizeHeterogeneousSources( properties ) )
246            handleHeterogeneousSources( elementGraph );
247    
248          // generic
249          elementGraph.removeUnnecessaryPipes(); // groups must be added before removing pipes
250          elementGraph.resolveFields();
251    
252          elementGraph = flow.updateSchemes( elementGraph );
253    
254          // m/r specific
255          handleAdjacentTaps( elementGraph );
256    
257          FlowStepGraph flowStepGraph = new HadoopStepGraph( flowDef.getName(), elementGraph );
258    
259          flow.initialize( elementGraph, flowStepGraph );
260    
261          return flow;
262          }
263        catch( Exception exception )
264          {
265          throw handleExceptionDuringPlanning( exception, elementGraph );
266          }
267        }
268    
269      private void handleWarnEquivalentPaths( ElementGraph elementGraph )
270        {
271        List<CoGroup> coGroups = elementGraph.findAllCoGroups();
272    
273        for( CoGroup coGroup : coGroups )
274          {
275          List<GraphPath<FlowElement, Scope>> graphPaths = elementGraph.getAllShortestPathsTo( coGroup );
276    
277          List<List<FlowElement>> paths = ElementGraphs.asPathList( graphPaths );
278    
279          if( !areEquivalentPaths( elementGraph, paths ) )
280            continue;
281    
282          LOG.warn( "found equivalent paths from: {} to: {}", paths.get( 0 ).get( 1 ), coGroup );
283    
284          // in order to remove dupe paths, we need to verify there isn't any branching
285          }
286        }
287    
288      private boolean areEquivalentPaths( ElementGraph elementGraph, List<List<FlowElement>> paths )
289        {
290        int length = sameLength( paths );
291    
292        if( length == -1 )
293          return false;
294    
295        Set<FlowElement> elements = new TreeSet<FlowElement>( new EquivalenceComparator( elementGraph ) );
296    
297        for( int i = 0; i < length; i++ )
298          {
299          elements.clear();
300    
301          for( List<FlowElement> path : paths )
302            elements.add( path.get( i ) );
303    
304          if( elements.size() != 1 )
305            return false;
306          }
307    
308        return true;
309        }
310    
311      private class EquivalenceComparator implements Comparator<FlowElement>
312        {
313        private final ElementGraph elementGraph;
314    
315        public EquivalenceComparator( ElementGraph elementGraph )
316          {
317          this.elementGraph = elementGraph;
318          }
319    
320        @Override
321        public int compare( FlowElement lhs, FlowElement rhs )
322          {
323          boolean areEquivalent = lhs.isEquivalentTo( rhs );
324          boolean sameIncoming = elementGraph.inDegreeOf( lhs ) == elementGraph.inDegreeOf( rhs );
325          boolean sameOutgoing = elementGraph.outDegreeOf( lhs ) == elementGraph.outDegreeOf( rhs );
326    
327          if( areEquivalent && sameIncoming && sameOutgoing )
328            return 0;
329    
330          return System.identityHashCode( lhs ) - System.identityHashCode( rhs );
331          }
332        }
333    
334      private int sameLength( List<List<FlowElement>> paths )
335        {
336        int lastSize = paths.get( 0 ).size();
337    
338        for( int i = 1; i < paths.size(); i++ )
339          {
340          if( paths.get( i ).size() != lastSize )
341            return -1;
342          }
343    
344        return lastSize;
345        }
346    
347      /**
348       * optimized for this case
349       * <pre>
350       *         e - t           e1 - e - t
351       * t - e1 -       -- > t -
352       *         e - t           e1 - e - t
353       * </pre>
354       * <p/>
355       * this should run in two map/red jobs, not 3. needs to be a flag on e1 to prevent this
356       * <p/>
357       * <pre>
358       *        g - t                 g - t
359       * g - e -       --> g - e - t -
360       *        g - t                 g - t
361       * </pre>
362       * <p/>
363       * <pre>
364       *             - e - e                            e - e
365       * t - e1 - e2         - g  --> t - e1 - e2 - t -       - g
366       *             - e - e                            e - e
367       * </pre>
368       *
369       * @param elementGraph
370       */
371      private void handleSplit( ElementGraph elementGraph )
372        {
373        // if there was a graph change, iterate paths again.
374        while( !internalSplit( elementGraph ) )
375          ;
376        }
377    
378      private boolean internalSplit( ElementGraph elementGraph )
379        {
380        List<GraphPath<FlowElement, Scope>> paths = elementGraph.getAllShortestPathsBetweenExtents();
381    
382        for( GraphPath<FlowElement, Scope> path : paths )
383          {
384          List<FlowElement> flowElements = Graphs.getPathVertexList( path );
385          Set<Pipe> tapInsertions = new HashSet<Pipe>();
386          FlowElement lastInsertable = null;
387    
388          for( int i = 0; i < flowElements.size(); i++ )
389            {
390            FlowElement flowElement = flowElements.get( i );
391    
392            if( flowElement instanceof ElementGraph.Extent ) // is an extent: head or tail
393              continue;
394    
395            // if Tap, Group, or Every - we insert the tap here
396            if( flowElement instanceof Tap || flowElement instanceof Group || flowElement instanceof Every )
397              lastInsertable = flowElement;
398    
399            // support splits on Pipe unless the previous is a Tap
400            if( flowElement.getClass() == Pipe.class && flowElements.get( i - 1 ) instanceof Tap )
401              continue;
402    
403            if( flowElement instanceof Tap )
404              continue;
405    
406            if( elementGraph.outDegreeOf( flowElement ) <= 1 )
407              continue;
408    
409            // we are at the root of a split here
410    
411            // do any split paths converge on a single Group?
412            int maxPaths = elementGraph.getMaxNumPathsBetweenElementAndGroupingMergeJoin( flowElement );
413            if( maxPaths <= 1 && lastInsertable instanceof Tap )
414              continue;
415    
416            tapInsertions.add( (Pipe) flowElement );
417            }
418    
419          for( Pipe pipe : tapInsertions )
420            insertTempTapAfter( elementGraph, pipe );
421    
422          if( !tapInsertions.isEmpty() )
423            return false;
424          }
425    
426        return true;
427        }
428    
429      /**
430       * will collapse adjacent and equivalent taps.
431       * equivalence is based on the tap adjacent taps using the same filesystem
432       * and the sink being symmetrical, and having the same fields as the temp tap.
433       * <p/>
434       * <p/>
435       * must be run after fields are resolved so temp taps have fully defined scheme instances.
436       *
437       * @param elementGraph
438       */
439      private void handleAdjacentTaps( ElementGraph elementGraph )
440        {
441        // if there was a graph change, iterate paths again.
442        while( !internalAdjacentTaps( elementGraph ) )
443          ;
444        }
445    
446      private boolean internalAdjacentTaps( ElementGraph elementGraph )
447        {
448        List<Tap> taps = elementGraph.findAllTaps();
449    
450        for( Tap tap : taps )
451          {
452          if( !( tap instanceof TempHfs ) )
453            continue;
454    
455          for( FlowElement successor : elementGraph.getAllSuccessors( tap ) )
456            {
457            if( !( successor instanceof Hfs ) )
458              continue;
459    
460            Hfs successorTap = (Hfs) successor;
461    
462            // does this scheme source what it sinks
463            if( !successorTap.getScheme().isSymmetrical() )
464              continue;
465    
466            URI tempURIScheme = getDefaultURIScheme( tap ); // temp uses default fs
467            URI successorURIScheme = getURIScheme( successorTap );
468    
469            if( !tempURIScheme.equals( successorURIScheme ) )
470              continue;
471    
472            // safe, both are symmetrical
473            // should be called after fields are resolved
474            if( !tap.getSourceFields().equals( successorTap.getSourceFields() ) )
475              continue;
476    
477            elementGraph.replaceElementWith( tap, successor );
478    
479            return false;
480            }
481          }
482    
483        return true;
484        }
485    
486      private URI getDefaultURIScheme( Tap tap )
487        {
488        return ( (Hfs) tap ).getDefaultFileSystemURIScheme( jobConf );
489        }
490    
491      private URI getURIScheme( Tap tap )
492        {
493        return ( (Hfs) tap ).getURIScheme( jobConf );
494        }
495    
496      private void handleHeterogeneousSources( ElementGraph elementGraph )
497        {
498        while( !internalHeterogeneousSources( elementGraph ) )
499          ;
500        }
501    
502      private boolean internalHeterogeneousSources( ElementGraph elementGraph )
503        {
504        // find all Groups
505        List<Group> groups = elementGraph.findAllMergeJoinGroups();
506    
507        // compare group sources
508        Map<Group, Set<Tap>> normalizeGroups = new HashMap<Group, Set<Tap>>();
509    
510        for( Group group : groups )
511          {
512          Set<Tap> taps = new HashSet<Tap>();
513    
514          // iterate each shortest path to current group finding each tap sourcing the merge/join
515          for( GraphPath<FlowElement, Scope> path : elementGraph.getAllShortestPathsTo( group ) )
516            {
517            List<FlowElement> flowElements = Graphs.getPathVertexList( path ); // last element is group
518            Collections.reverse( flowElements ); // first element is group
519    
520            for( FlowElement previousElement : flowElements )
521              {
522              if( previousElement instanceof Tap )
523                {
524                taps.add( (Tap) previousElement );
525                break; // stop finding taps in this path
526                }
527              }
528            }
529    
530          if( taps.size() == 1 )
531            continue;
532    
533          Iterator<Tap> iterator = taps.iterator();
534          Tap commonTap = iterator.next();
535    
536          while( iterator.hasNext() )
537            {
538            Tap tap = iterator.next();
539    
540            // making assumption hadoop can handle multiple filesytems, but not multiple inputformats
541            // in the same job
542            // possibly could test for common input format
543            if( getSchemeClass( tap ) != getSchemeClass( commonTap ) )
544              {
545              normalizeGroups.put( group, taps );
546              break;
547              }
548            }
549          }
550    
551        // if incompatible, insert Tap after its join/merge pipe
552        for( Group group : normalizeGroups.keySet() )
553          {
554          Set<Tap> taps = normalizeGroups.get( group );
555    
556          for( Tap tap : taps )
557            {
558            if( tap instanceof TempHfs || getSchemeClass( tap ).equals( intermediateSchemeClass ) ) // we normalize to TempHfs
559              continue;
560    
561            // handle case where there is a split on a pipe between the tap and group
562            for( GraphPath<FlowElement, Scope> path : getAllShortestPathsBetween( elementGraph, tap, group ) )
563              {
564              List<FlowElement> flowElements = Graphs.getPathVertexList( path ); // shortest path tap -> group
565              Collections.reverse( flowElements ); // group -> tap
566    
567              FlowElement flowElement = flowElements.get( 1 );
568    
569              if( flowElement instanceof TempHfs )
570                continue;
571    
572              LOG.warn( "inserting step to normalize incompatible sources: {}", tap );
573    
574              insertTempTapAfter( elementGraph, (Pipe) flowElement );
575    
576              return false;
577              }
578            }
579          }
580    
581        return normalizeGroups.isEmpty();
582        }
583    
584      @Override
585      protected Tap makeTempTap( String prefix, String name )
586        {
587        // must give Taps unique names
588        return new TempHfs( jobConf, Util.makePath( prefix, name ), intermediateSchemeClass, prefix == null );
589        }
590    
591      private Class getSchemeClass( Tap tap )
592        {
593        if( tap instanceof TempHfs )
594          return ( (TempHfs) tap ).getSchemeClass();
595        else
596          return tap.getScheme().getClass();
597        }
598      }