/*
 * Copyright (c) 2007-2014 Concurrent, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cascading.flow.hadoop.planner;

import java.net.URI;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.TreeSet;

import cascading.flow.FlowConnector;
import cascading.flow.FlowDef;
import cascading.flow.FlowElement;
import cascading.flow.hadoop.HadoopFlow;
import cascading.flow.hadoop.util.HadoopUtil;
import cascading.flow.planner.ElementGraph;
import cascading.flow.planner.ElementGraphs;
import cascading.flow.planner.FlowPlanner;
import cascading.flow.planner.FlowStepGraph;
import cascading.flow.planner.PlatformInfo;
import cascading.flow.planner.Scope;
import cascading.pipe.CoGroup;
import cascading.pipe.Every;
import cascading.pipe.Group;
import cascading.pipe.Pipe;
import cascading.property.AppProps;
import cascading.property.PropertyUtil;
import cascading.tap.Tap;
import cascading.tap.hadoop.Hfs;
import cascading.tap.hadoop.util.TempHfs;
import cascading.util.Util;
import org.apache.hadoop.mapred.JobConf;
import org.jgrapht.GraphPath;
import org.jgrapht.Graphs;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import static cascading.flow.planner.ElementGraphs.getAllShortestPathsBetween;

/**
 * Class HadoopPlanner is the core Hadoop MapReduce planner.
 * <p/>
 * Notes:
 * <p/>
 * <strong>Custom JobConf properties</strong><br/>
 * A custom JobConf instance can be passed to this planner by calling {@link #copyJobConf(java.util.Map, org.apache.hadoop.mapred.JobConf)}
 * on a map properties object before constructing a new {@link cascading.flow.hadoop.HadoopFlowConnector}.
 * <p/>
 * A better practice would be to set Hadoop properties directly on the map properties object handed to the FlowConnector.
 * All values in the map will be passed to a new default JobConf instance to be used as defaults for all resulting
 * Flow instances.
 * <p/>
 * For example, {@code properties.set("mapred.child.java.opts","-Xmx512m");} would convince Hadoop
 * to spawn all child jvms with a heap of 512MB.
 */
public class HadoopPlanner extends FlowPlanner<HadoopFlow, JobConf>
  {
  /** Field LOG */
  private static final Logger LOG = LoggerFactory.getLogger( HadoopPlanner.class );

  /** Field jobConf - the default JobConf all planned Flow instances inherit from, built in {@link #initialize} */
  private JobConf jobConf;
  /** Field intermediateSchemeClass - the Scheme class used by temporary taps inserted between MapReduce jobs */
  private Class intermediateSchemeClass;

  /**
   * Method copyJobConf adds the given JobConf values to the given properties object. Use this method to pass
   * custom default Hadoop JobConf properties to Hadoop.
   *
   * @param properties of type Map
   * @param jobConf    of type JobConf
   */
  public static void copyJobConf( Map<Object, Object> properties, JobConf jobConf )
    {
    for( Map.Entry<String, String> entry : jobConf )
      properties.put( entry.getKey(), entry.getValue() );
    }

  /**
   * Method createJobConf returns a new JobConf instance using the values in the given properties argument.
   *
   * @param properties of type Map
   * @return a JobConf instance
   */
  public static JobConf createJobConf( Map<Object, Object> properties )
    {
    JobConf conf = new JobConf();

    copyProperties( conf, properties );

    return conf;
    }

  /**
   * Method copyProperties adds the given Map values to the given JobConf object.
   *
   * @param jobConf    of type JobConf
   * @param properties of type Map
   */
  public static void copyProperties( JobConf jobConf, Map<Object, Object> properties )
    {
    if( properties instanceof Properties )
      {
      // stringPropertyNames() also captures values set on any nested default Properties instances
      Properties props = (Properties) properties;
      Set<String> keys = props.stringPropertyNames();

      for( String key : keys )
        jobConf.set( key, props.getProperty( key ) );
      }
    else
      {
      for( Map.Entry<Object, Object> entry : properties.entrySet() )
        {
        if( entry.getValue() != null ) // JobConf#set rejects null values
          jobConf.set( entry.getKey().toString(), entry.getValue().toString() );
        }
      }
    }

  /**
   * Method setNormalizeHeterogeneousSources adds the given doNormalize boolean to the given properties object.
   * Use this method if additional jobs should be planned in to handle incompatible InputFormat classes.
   * <p/>
   * Normalization is off by default and should only be enabled by advanced users. Typically this will decrease
   * application performance.
   *
   * @param properties  of type Map
   * @param doNormalize of type boolean
   * @deprecated set the property directly on the properties object handed to the FlowConnector
   */
  @Deprecated
  public static void setNormalizeHeterogeneousSources( Map<Object, Object> properties, boolean doNormalize )
    {
    properties.put( "cascading.multimapreduceplanner.normalizesources", Boolean.toString( doNormalize ) );
    }

  /**
   * Method getNormalizeHeterogeneousSources returns if this planner will normalize heterogeneous input sources.
   *
   * @param properties of type Map
   * @return a boolean
   * @deprecated read the property directly from the properties object handed to the FlowConnector
   */
  @Deprecated
  public static boolean getNormalizeHeterogeneousSources( Map<Object, Object> properties )
    {
    return Boolean.parseBoolean( PropertyUtil.getProperty( properties, "cascading.multimapreduceplanner.normalizesources", "false" ) );
    }

  @Override
  public JobConf getConfig()
    {
    return jobConf;
    }

  @Override
  public PlatformInfo getPlatformInfo()
    {
    return HadoopUtil.getPlatformInfo();
    }

  @Override
  public void initialize( FlowConnector flowConnector, Map<Object, Object> properties )
    {
    super.initialize( flowConnector, properties );

    jobConf = HadoopUtil.createJobConf( properties, createJobConf( properties ) );
    intermediateSchemeClass = flowConnector.getIntermediateSchemeClass( properties );

    // resolve the application jar in priority order: explicit jar class, explicit jar path,
    // then fall back to the jar containing the application main class
    Class type = AppProps.getApplicationJarClass( properties );
    if( jobConf.getJar() == null && type != null )
      jobConf.setJarByClass( type );

    String path = AppProps.getApplicationJarPath( properties );
    if( jobConf.getJar() == null && path != null )
      jobConf.setJar( path );

    if( jobConf.getJar() == null )
      jobConf.setJarByClass( HadoopUtil.findMainClass( HadoopPlanner.class ) );

    AppProps.setApplicationJarPath( properties, jobConf.getJar() );

    LOG.info( "using application jar: {}", jobConf.getJar() );
    }

  @Override
  protected HadoopFlow createFlow( FlowDef flowDef )
    {
    return new HadoopFlow( getPlatformInfo(), getProperties(), getConfig(), flowDef );
    }

  @Override
  public HadoopFlow buildFlow( FlowDef flowDef )
    {
    ElementGraph elementGraph = null;

    try
      {
      // generic
      verifyAllTaps( flowDef );

      HadoopFlow flow = createFlow( flowDef );

      Pipe[] tails = resolveTails( flowDef, flow );

      verifyAssembly( flowDef, tails );

      elementGraph = createElementGraph( flowDef, tails );

      // rules
      failOnLoneGroupAssertion( elementGraph );
      failOnMissingGroup( elementGraph );
      failOnMisusedBuffer( elementGraph );
      failOnGroupEverySplit( elementGraph );

      // m/r specific
      handleWarnEquivalentPaths( elementGraph );
      handleSplit( elementGraph );
      handleJobPartitioning( elementGraph );
      handleJoins( elementGraph );
      handleNonSafeOperations( elementGraph );

      if( getNormalizeHeterogeneousSources( properties ) )
        handleHeterogeneousSources( elementGraph );

      // generic
      elementGraph.removeUnnecessaryPipes(); // groups must be added before removing pipes
      elementGraph.resolveFields();

      elementGraph = flow.updateSchemes( elementGraph );

      // m/r specific
      handleAdjacentTaps( elementGraph );

      FlowStepGraph flowStepGraph = new HadoopStepGraph( flowDef.getName(), elementGraph );

      flow.initialize( elementGraph, flowStepGraph );

      return flow;
      }
    catch( Exception exception )
      {
      throw handleExceptionDuringPlanning( exception, elementGraph );
      }
    }

  /**
   * Warns when multiple incoming paths to a CoGroup appear element-wise equivalent, since such
   * duplicate paths are likely unintentional and cause redundant work.
   *
   * @param elementGraph of type ElementGraph
   */
  private void handleWarnEquivalentPaths( ElementGraph elementGraph )
    {
    List<CoGroup> coGroups = elementGraph.findAllCoGroups();

    for( CoGroup coGroup : coGroups )
      {
      List<GraphPath<FlowElement, Scope>> graphPaths = elementGraph.getAllShortestPathsTo( coGroup );

      List<List<FlowElement>> paths = ElementGraphs.asPathList( graphPaths );

      if( !areEquivalentPaths( elementGraph, paths ) )
        continue;

      LOG.warn( "found equivalent paths from: {} to: {}", paths.get( 0 ).get( 1 ), coGroup );

      // in order to remove dupe paths, we need to verify there isn't any branching
      }
    }

  /**
   * Returns true if all given paths have the same length and, at every position, all paths hold
   * pairwise-equivalent elements (per {@link EquivalenceComparator}).
   *
   * @param elementGraph of type ElementGraph
   * @param paths        element lists for each path, all leading to the same element
   * @return a boolean
   */
  private boolean areEquivalentPaths( ElementGraph elementGraph, List<List<FlowElement>> paths )
    {
    int length = sameLength( paths );

    if( length == -1 )
      return false;

    // a TreeSet with the equivalence comparator collapses equivalent elements into one entry
    Set<FlowElement> elements = new TreeSet<FlowElement>( new EquivalenceComparator( elementGraph ) );

    for( int i = 0; i < length; i++ )
      {
      elements.clear();

      for( List<FlowElement> path : paths )
        elements.add( path.get( i ) );

      if( elements.size() != 1 ) // more than one distinct element at this position
        return false;
      }

    return true;
    }

  /**
   * Comparator treating two FlowElements as equal when they are semantically equivalent and have the
   * same in/out degree in the graph. Note this ordering is only used to collapse equivalent elements
   * inside a transient TreeSet; non-equal elements are ordered arbitrarily by identity hash code.
   */
  private class EquivalenceComparator implements Comparator<FlowElement>
    {
    private final ElementGraph elementGraph;

    public EquivalenceComparator( ElementGraph elementGraph )
      {
      this.elementGraph = elementGraph;
      }

    @Override
    public int compare( FlowElement lhs, FlowElement rhs )
      {
      boolean areEquivalent = lhs.isEquivalentTo( rhs );
      boolean sameIncoming = elementGraph.inDegreeOf( lhs ) == elementGraph.inDegreeOf( rhs );
      boolean sameOutgoing = elementGraph.outDegreeOf( lhs ) == elementGraph.outDegreeOf( rhs );

      if( areEquivalent && sameIncoming && sameOutgoing )
        return 0;

      // arbitrary but stable (within a JVM run) ordering for non-equivalent elements;
      // Integer.compare avoids the subtraction comparison idiom
      return Integer.compare( System.identityHashCode( lhs ), System.identityHashCode( rhs ) );
      }
    }

  /**
   * Returns the common size of all given paths, or -1 if the paths differ in length.
   *
   * @param paths element lists for each path
   * @return the shared path length, or -1 when lengths differ
   */
  private int sameLength( List<List<FlowElement>> paths )
    {
    int lastSize = paths.get( 0 ).size();

    for( int i = 1; i < paths.size(); i++ )
      {
      if( paths.get( i ).size() != lastSize )
        return -1;
      }

    return lastSize;
    }

  /**
   * optimized for this case
   * <pre>
   *         e - t           e1 - e - t
   * t - e1 -       -- >  t -
   *         e - t           e1 - e - t
   * </pre>
   * <p/>
   * this should run in two map/red jobs, not 3. needs to be a flag on e1 to prevent this
   * <p/>
   * <pre>
   *        g - t                 g - t
   * g - e -       --> g - e - t -
   *        g - t                 g - t
   * </pre>
   * <p/>
   * <pre>
   *                e - e                e - e
   * t - e1 - e2 -       --> t - e1 - e2 - t -
   *                e - e                e - e
   * </pre>
   *
   * @param elementGraph
   */
  private void handleSplit( ElementGraph elementGraph )
    {
    // if there was a graph change, iterate paths again.
    while( !internalSplit( elementGraph ) )
      ;
    }

  /**
   * Performs one pass over all head-to-tail paths inserting temp taps at split points;
   * returns false when a tap was inserted (caller must re-iterate the modified graph).
   *
   * @param elementGraph of type ElementGraph
   * @return true when the graph was left unchanged by this pass
   */
  private boolean internalSplit( ElementGraph elementGraph )
    {
    List<GraphPath<FlowElement, Scope>> paths = elementGraph.getAllShortestPathsBetweenExtents();

    for( GraphPath<FlowElement, Scope> path : paths )
      {
      List<FlowElement> flowElements = Graphs.getPathVertexList( path );
      Set<Pipe> tapInsertions = new HashSet<Pipe>();
      FlowElement lastInsertable = null;

      for( int i = 0; i < flowElements.size(); i++ )
        {
        FlowElement flowElement = flowElements.get( i );

        if( flowElement instanceof ElementGraph.Extent ) // is an extent: head or tail
          continue;

        // if Tap, Group, or Every - we insert the tap here
        if( flowElement instanceof Tap || flowElement instanceof Group || flowElement instanceof Every )
          lastInsertable = flowElement;

        // support splits on Pipe unless the previous is a Tap
        if( flowElement.getClass() == Pipe.class && flowElements.get( i - 1 ) instanceof Tap )
          continue;

        if( flowElement instanceof Tap )
          continue;

        if( elementGraph.outDegreeOf( flowElement ) <= 1 )
          continue;

        // we are at the root of a split here

        // do any split paths converge on a single Group?
        int maxPaths = elementGraph.getMaxNumPathsBetweenElementAndGroupingMergeJoin( flowElement );
        if( maxPaths <= 1 && lastInsertable instanceof Tap )
          continue;

        tapInsertions.add( (Pipe) flowElement );
        }

      for( Pipe pipe : tapInsertions )
        insertTempTapAfter( elementGraph, pipe );

      if( !tapInsertions.isEmpty() )
        return false; // graph changed, restart iteration from fresh paths
      }

    return true;
    }

  /**
   * will collapse adjacent and equivalent taps.
   * equivalence is based on the tap adjacent taps using the same filesystem
   * and the sink being symmetrical, and having the same fields as the temp tap.
   * <p/>
   * <p/>
   * must be run after fields are resolved so temp taps have fully defined scheme instances.
   *
   * @param elementGraph
   */
  private void handleAdjacentTaps( ElementGraph elementGraph )
    {
    // if there was a graph change, iterate paths again.
    while( !internalAdjacentTaps( elementGraph ) )
      ;
    }

  /**
   * Performs one pass collapsing a TempHfs tap into an equivalent adjacent Hfs successor;
   * returns false when a replacement was made (caller must re-iterate the modified graph).
   *
   * @param elementGraph of type ElementGraph
   * @return true when the graph was left unchanged by this pass
   */
  private boolean internalAdjacentTaps( ElementGraph elementGraph )
    {
    List<Tap> taps = elementGraph.findAllTaps();

    for( Tap tap : taps )
      {
      if( !( tap instanceof TempHfs ) )
        continue;

      for( FlowElement successor : elementGraph.getAllSuccessors( tap ) )
        {
        if( !( successor instanceof Hfs ) )
          continue;

        Hfs successorTap = (Hfs) successor;

        // does this scheme source what it sinks
        if( !successorTap.getScheme().isSymmetrical() )
          continue;

        URI tempURIScheme = getDefaultURIScheme( tap ); // temp uses default fs
        URI successorURIScheme = getURIScheme( successorTap );

        if( !tempURIScheme.equals( successorURIScheme ) )
          continue;

        // safe, both are symmetrical
        // should be called after fields are resolved
        if( !tap.getSourceFields().equals( successorTap.getSourceFields() ) )
          continue;

        elementGraph.replaceElementWith( tap, successor );

        return false;
        }
      }

    return true;
    }

  /**
   * Returns the default filesystem URI scheme for the given (Hfs) tap using the planner JobConf.
   *
   * @param tap of type Tap, must be an Hfs instance
   * @return a URI
   */
  private URI getDefaultURIScheme( Tap tap )
    {
    return ( (Hfs) tap ).getDefaultFileSystemURIScheme( jobConf );
    }

  /**
   * Returns the URI scheme of the given (Hfs) tap using the planner JobConf.
   *
   * @param tap of type Tap, must be an Hfs instance
   * @return a URI
   */
  private URI getURIScheme( Tap tap )
    {
    return ( (Hfs) tap ).getURIScheme( jobConf );
    }

  /**
   * Inserts temp taps until every merge/join Group is sourced by taps sharing a single Scheme class,
   * since a single MapReduce job cannot mix InputFormat classes.
   *
   * @param elementGraph of type ElementGraph
   */
  private void handleHeterogeneousSources( ElementGraph elementGraph )
    {
    while( !internalHeterogeneousSources( elementGraph ) )
      ;
    }

  /**
   * Performs one normalization pass; returns false when a temp tap was inserted
   * (caller must re-iterate the modified graph).
   *
   * @param elementGraph of type ElementGraph
   * @return true when no group had incompatible sources
   */
  private boolean internalHeterogeneousSources( ElementGraph elementGraph )
    {
    // find all Groups
    List<Group> groups = elementGraph.findAllMergeJoinGroups();

    // compare group sources
    Map<Group, Set<Tap>> normalizeGroups = new HashMap<Group, Set<Tap>>();

    for( Group group : groups )
      {
      Set<Tap> taps = new HashSet<Tap>();

      // iterate each shortest path to current group finding each tap sourcing the merge/join
      for( GraphPath<FlowElement, Scope> path : elementGraph.getAllShortestPathsTo( group ) )
        {
        List<FlowElement> flowElements = Graphs.getPathVertexList( path ); // last element is group
        Collections.reverse( flowElements ); // first element is group

        for( FlowElement previousElement : flowElements )
          {
          if( previousElement instanceof Tap )
            {
            taps.add( (Tap) previousElement );
            break; // stop finding taps in this path
            }
          }
        }

      if( taps.size() == 1 )
        continue;

      Iterator<Tap> iterator = taps.iterator();
      Tap commonTap = iterator.next();

      while( iterator.hasNext() )
        {
        Tap tap = iterator.next();

        // making assumption hadoop can handle multiple filesytems, but not multiple inputformats
        // in the same job
        // possibly could test for common input format
        if( getSchemeClass( tap ) != getSchemeClass( commonTap ) )
          {
          normalizeGroups.put( group, taps );
          break;
          }
        }
      }

    // if incompatible, insert Tap after its join/merge pipe
    for( Map.Entry<Group, Set<Tap>> entry : normalizeGroups.entrySet() )
      {
      Group group = entry.getKey();
      Set<Tap> taps = entry.getValue();

      for( Tap tap : taps )
        {
        if( tap instanceof TempHfs || getSchemeClass( tap ).equals( intermediateSchemeClass ) ) // we normalize to TempHfs
          continue;

        // handle case where there is a split on a pipe between the tap and group
        for( GraphPath<FlowElement, Scope> path : getAllShortestPathsBetween( elementGraph, tap, group ) )
          {
          List<FlowElement> flowElements = Graphs.getPathVertexList( path ); // shortest path tap -> group
          Collections.reverse( flowElements ); // group -> tap

          FlowElement flowElement = flowElements.get( 1 );

          if( flowElement instanceof TempHfs )
            continue;

          LOG.warn( "inserting step to normalize incompatible sources: {}", tap );

          insertTempTapAfter( elementGraph, (Pipe) flowElement );

          return false;
          }
        }
      }

    return normalizeGroups.isEmpty();
    }

  @Override
  protected Tap makeTempTap( String prefix, String name )
    {
    // must give Taps unique names
    return new TempHfs( jobConf, Util.makePath( prefix, name ), intermediateSchemeClass, prefix == null );
    }

  /**
   * Returns the Scheme class backing the given tap; TempHfs taps carry their scheme class directly.
   *
   * @param tap of type Tap
   * @return a Class
   */
  private Class getSchemeClass( Tap tap )
    {
    if( tap instanceof TempHfs )
      return ( (TempHfs) tap ).getSchemeClass();
    else
      return tap.getScheme().getClass();
    }
  }