/*
 * Copyright (c) 2007-2014 Concurrent, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cascading.flow.hadoop;

import java.io.IOException;
import java.util.Iterator;

import cascading.CascadingException;
import cascading.flow.FlowException;
import cascading.flow.FlowSession;
import cascading.flow.FlowStep;
import cascading.flow.SliceCounters;
import cascading.flow.hadoop.stream.HadoopMapStreamGraph;
import cascading.flow.hadoop.util.HadoopUtil;
import cascading.flow.stream.Duct;
import cascading.flow.stream.ElementDuct;
import cascading.flow.stream.SourceStage;
import cascading.tap.Tap;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapRunnable;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import static cascading.flow.hadoop.util.HadoopUtil.deserializeBase64;
import static cascading.flow.hadoop.util.HadoopUtil.readStateFromDistCache;
/**
 * Class FlowMapper is the Hadoop Mapper implementation. It implements {@link MapRunnable},
 * rather than {@link org.apache.hadoop.mapred.Mapper}, so it can drive the record read loop itself.
 */
public class FlowMapper implements MapRunnable
  {
  private static final Logger LOG = LoggerFactory.getLogger( FlowMapper.class );

  /** Field streamGraph */
  private HadoopMapStreamGraph streamGraph;
  /** Field currentProcess */
  private HadoopFlowProcess currentProcess;

  /** Constructor FlowMapper creates a new FlowMapper instance. */
  public FlowMapper()
    {
    }

  @Override
  public void configure( JobConf jobConf )
    {
    try
      {
      HadoopUtil.initLog4j( jobConf );

      LOG.info( "cascading version: {}", jobConf.get( "cascading.version", "" ) );
      LOG.info( "child jvm opts: {}", jobConf.get( "mapred.child.java.opts", "" ) );

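      // the trailing 'true' marks this flow process as running on the map side of the job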
      currentProcess = new HadoopFlowProcess( new FlowSession(), jobConf, true );

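      // retrieve the serialized flow step; when it was not embedded directly in the
      // job conf, fall back to the copy Cascading wrote to the distributed cache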
      String stepState = jobConf.getRaw( "cascading.flow.step" );

      if( stepState == null )
        stepState = readStateFromDistCache( jobConf, jobConf.get( FlowStep.CASCADING_FLOW_STEP_ID ) );

      HadoopFlowStep step = deserializeBase64( stepState, jobConf, HadoopFlowStep.class );
      Tap source = step.getTapForID( step.getSources(), jobConf.get( "cascading.step.source" ) );

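      // assemble the map-side stream graph: a network of Duct instances wiring the
      // source tap through the pipe assembly to this mapper's sinks and traps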
      streamGraph = new HadoopMapStreamGraph( currentProcess, step, source );

      for( Duct head : streamGraph.getHeads() )
        LOG.info( "sourcing from: {}", ( (ElementDuct) head ).getFlowElement() );

      for( Duct tail : streamGraph.getTails() )
        LOG.info( "sinking to: {}", ( (ElementDuct) tail ).getFlowElement() );

      for( Tap trap : step.getMapperTraps().values() )
        LOG.info( "trapping to: {}", trap );
      }
    catch( Throwable throwable )
      {
      if( throwable instanceof CascadingException )
        throw (CascadingException) throwable;

      throw new FlowException( "internal error during mapper configuration", throwable );
      }
    }

  @Override
  public void run( RecordReader input, OutputCollector output, Reporter reporter ) throws IOException
    {
    currentProcess.setReporter( reporter );
    currentProcess.increment( SliceCounters.Process_Begin_Time, System.currentTimeMillis() );
    currentProcess.setOutputCollector( output );

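    // initialize all ducts in the graph before any records flow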
    streamGraph.prepare();

    SourceStage streamedHead = streamGraph.getStreamedHead();
    Iterator<Duct> iterator = streamGraph.getHeads().iterator();

    try
      {
      try
        {
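        // drive any accumulated heads to completion first (e.g. sources feeding the
        // accumulated side of a HashJoin); they read their own taps, hence the null input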
        while( iterator.hasNext() )
          {
          Duct next = iterator.next();

          if( next != streamedHead )
            ( (SourceStage) next ).run( null );
          }

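        // then pump this task's input split through the streamed head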
        streamedHead.run( input );
        }
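      // OutOfMemoryError and IOException are rethrown unwrapped so the Hadoop
      // framework sees them as-is; everything else is normalized below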
      catch( OutOfMemoryError error )
        {
        throw error;
        }
      catch( IOException exception )
        {
        throw exception;
        }
      catch( Throwable throwable )
        {
        if( throwable instanceof CascadingException )
          throw (CascadingException) throwable;

        throw new FlowException( "internal error during mapper execution", throwable );
        }
      }
    finally
      {
      try
        {
        streamGraph.cleanup();
        }
      finally
        {
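        // record the end-of-processing time even if cleanup itself throws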
        currentProcess.increment( SliceCounters.Process_End_Time, System.currentTimeMillis() );
        }
      }
    }
  }