/*
 * Copyright (c) 2007-2014 Concurrent, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cascading.flow.planner;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import cascading.flow.AssemblyPlanner;
import cascading.flow.Flow;
import cascading.flow.FlowConnector;
import cascading.flow.FlowDef;
import cascading.flow.FlowElement;
import cascading.operation.AssertionLevel;
import cascading.operation.DebugLevel;
import cascading.pipe.Checkpoint;
import cascading.pipe.CoGroup;
import cascading.pipe.Each;
import cascading.pipe.Every;
import cascading.pipe.Group;
import cascading.pipe.GroupBy;
import cascading.pipe.HashJoin;
import cascading.pipe.Merge;
import cascading.pipe.OperatorException;
import cascading.pipe.Pipe;
import cascading.pipe.Splice;
import cascading.pipe.SubAssembly;
import cascading.property.PropertyUtil;
import cascading.scheme.Scheme;
import cascading.tap.Tap;
import cascading.tap.TapException;
import cascading.tuple.Fields;
import cascading.util.Util;
import org.jgrapht.GraphPath;
import org.jgrapht.Graphs;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import static cascading.flow.planner.ElementGraphs.*;
import static java.util.Arrays.asList;

/** Class FlowPlanner is the base class for all planner implementations. */
public abstract class FlowPlanner<F extends Flow, Config>
{
  /** Field LOG */
  private static final Logger LOG = LoggerFactory.getLogger( FlowPlanner.class );

  /** Field properties */
  protected Map<Object, Object> properties;

  /** Field checkpointRootPath is assigned by {@link #createElementGraph(FlowDef, Pipe[])}, null when the FlowDef has no run id */
  protected String checkpointRootPath = null;

  /** Field assertionLevel */
  protected AssertionLevel assertionLevel;
  /** Field debugLevel */
  protected DebugLevel debugLevel;

  /**
   * Method getAssertionLevel returns the configured target planner {@link cascading.operation.AssertionLevel}.
   * <p/>
   * Reads the property {@code cascading.flowconnector.assertionlevel}, defaulting to {@code STRICT}.
   *
   * @param properties of type Map<Object, Object>
   * @return AssertionLevel the configured AssertionLevel
   */
  static AssertionLevel getAssertionLevel( Map<Object, Object> properties )
  {
    String assertionLevel = PropertyUtil.getProperty( properties, "cascading.flowconnector.assertionlevel", AssertionLevel.STRICT.name() );

    return AssertionLevel.valueOf( assertionLevel );
  }

  /**
   * Method getDebugLevel returns the configured target planner {@link cascading.operation.DebugLevel}.
   * <p/>
   * Reads the property {@code cascading.flowconnector.debuglevel}, defaulting to {@code DEFAULT}.
   *
   * @param properties of type Map<Object, Object>
   * @return DebugLevel the configured DebugLevel
   */
  static DebugLevel getDebugLevel( Map<Object, Object> properties )
  {
    String debugLevel = PropertyUtil.getProperty( properties, "cascading.flowconnector.debuglevel", DebugLevel.DEFAULT.name() );

    return DebugLevel.valueOf( debugLevel );
  }

  /**
   * Method getProperties returns the properties handed to {@link #initialize(FlowConnector, Map)}.
   *
   * @return Map<Object, Object> the same map instance, not a copy
   */
  public Map<Object, Object> getProperties()
  {
    return properties;
  }

  public abstract Config getConfig();

  public abstract PlatformInfo getPlatformInfo();

  /**
   * Method initialize stores the given properties and derives the default assertion and debug levels from them.
   *
   * @param flowConnector of type FlowConnector, not used by this base implementation
   * @param properties    of type Map<Object, Object>
   */
  public void initialize( FlowConnector flowConnector, Map<Object, Object> properties )
  {
    this.properties = properties;
    this.assertionLevel = getAssertionLevel( properties );
    this.debugLevel = getDebugLevel( properties );
  }

  protected abstract Flow createFlow( FlowDef flowDef );

  /**
   * Method buildFlow renders the actual Flow instance.
   *
   * @param flowDef of type FlowDef
   * @return Flow
   */
  public abstract F buildFlow( FlowDef flowDef );

  /**
   * Method resolveTails returns the tails declared on the FlowDef after every registered
   * {@link AssemblyPlanner} has been applied to them.
   *
   * @param flowDef of type FlowDef
   * @param flow    of type Flow
   * @return Pipe[] the resolved tails
   */
  protected Pipe[] resolveTails( FlowDef flowDef, Flow<Config> flow )
  {
    Pipe[] tails = flowDef.getTailsArray();

    tails = resolveAssemblyPlanners( flowDef, flow, tails );

    return tails;
  }

  /**
   * Method resolveAssemblyPlanners feeds the given pipes through each {@link AssemblyPlanner} in order, each
   * planner receiving the tails produced by the previous one.
   *
   * @param flowDef of type FlowDef
   * @param flow    of type Flow
   * @param pipes   of type Pipe[] the initial tails
   * @return Pipe[] the tails returned by the last planner, or the given pipes if there are no planners
   * @throws PlannerException if a planner returns null or no tails
   */
  protected Pipe[] resolveAssemblyPlanners( FlowDef flowDef, Flow flow, Pipe[] pipes )
  {
    List<Pipe> tails = Arrays.asList( pipes );

    List<AssemblyPlanner> assemblyPlanners = flowDef.getAssemblyPlanners();

    for( AssemblyPlanner assemblyPlanner : assemblyPlanners )
    {
      tails = assemblyPlanner.resolveTails( new AssemblyPlannerContext( flowDef, flow, tails ) );

      // a null result is reported the same way as an empty one instead of surfacing as a NullPointerException
      if( tails == null || tails.isEmpty() )
        throw new PlannerException( "assembly planner: " + assemblyPlanner + ", returned zero tails" );

      // the next planner must not modify the list it is handed
      tails = Collections.unmodifiableList( tails );
    }

    return tails.toArray( new Pipe[ tails.size() ] );
  }

  /**
   * Method verifyAssembly verifies the end points, traps and checkpoints of the given assembly against the FlowDef.
   *
   * @param flowDef of type FlowDef
   * @param tails   of type Pipe[]
   */
  protected void verifyAssembly( FlowDef flowDef, Pipe[] tails )
  {
    verifyPipeAssemblyEndPoints( flowDef, tails );
    verifyTraps( flowDef, tails );
    verifyCheckpoints( flowDef, tails );
  }

  /**
   * Method verifyAllTaps verifies that every tap on the FlowDef can be used in the role it was registered for.
   * Sources and sinks are required, traps and checkpoints are optional.
   *
   * @param flowDef of type FlowDef
   */
  protected void verifyAllTaps( FlowDef flowDef )
  {
    verifySourceNotSinks( flowDef.getSources(), flowDef.getSinks() );

    verifyTaps( flowDef.getSources(), true, true );
    verifyTaps( flowDef.getSinks(), false, true );
    verifyTaps( flowDef.getTraps(), false, false );

    // are both sources and sinks
    verifyTaps( flowDef.getCheckpoints(), true, false );
    verifyTaps( flowDef.getCheckpoints(), false, false );
  }

  /**
   * Method createElementGraph creates a new ElementGraph from the given tails and the taps on the FlowDef.
   * <p/>
   * Assertion and debug levels set on the FlowDef take precedence over the planner defaults. As a side effect
   * the {@link #checkpointRootPath} field is (re)assigned.
   *
   * @param flowDef   of type FlowDef
   * @param flowTails of type Pipe[]
   * @return ElementGraph
   */
  protected ElementGraph createElementGraph( FlowDef flowDef, Pipe[] flowTails )
  {
    Map<String, Tap> sources = flowDef.getSourcesCopy();
    Map<String, Tap> sinks = flowDef.getSinksCopy();
    Map<String, Tap> traps = flowDef.getTrapsCopy();
    Map<String, Tap> checkpoints = flowDef.getCheckpointsCopy();

    AssertionLevel assertionLevel = flowDef.getAssertionLevel() == null ? this.assertionLevel : flowDef.getAssertionLevel();
    DebugLevel debugLevel = flowDef.getDebugLevel() == null ? this.debugLevel : flowDef.getDebugLevel();

    checkpointRootPath = makeCheckpointRootPath( flowDef );

    return new ElementGraph( getPlatformInfo(), flowTails, sources, sinks, traps, checkpoints, checkpointRootPath != null, assertionLevel, debugLevel );
  }

  /**
   * Method makeCheckpointRootPath returns {@code flowName/runID}, or null if the FlowDef has no run id.
   *
   * @param flowDef of type FlowDef
   * @return String
   * @throws PlannerException if a run id is given without a flow name
   */
  private String makeCheckpointRootPath( FlowDef flowDef )
  {
    String flowName = flowDef.getName();
    String runID = flowDef.getRunID();

    if( runID == null )
      return null;

    if( flowName == null )
      throw new PlannerException( "flow name is required when providing a run id" );

    return flowName + "/" + runID;
  }

  /**
   * Method verifySourceNotSinks fails if any sink tap is also registered as a source tap.
   *
   * @param sources of type Map<String, Tap>
   * @param sinks   of type Map<String, Tap>
   * @throws PlannerException if a tap is used as both source and sink
   */
  protected void verifySourceNotSinks( Map<String, Tap> sources, Map<String, Tap> sinks )
  {
    Collection<Tap> sourcesSet = sources.values();

    for( Tap tap : sinks.values() )
    {
      if( sourcesSet.contains( tap ) )
        throw new PlannerException( "tap may not be used as both source and sink in the same Flow: " + tap );
    }
  }

  /**
   * Method verifyTaps verifies every given tap supports the requested role.
   *
   * @param taps          of type Map<String, Tap>
   * @param areSources    of type boolean, true to require {@code isSource()}, false to require {@code isSink()}
   * @param mayNotBeEmpty of type boolean, true to fail when no taps are given
   * @throws PlannerException if the map is empty but required, or a tap does not support the role
   */
  protected void verifyTaps( Map<String, Tap> taps, boolean areSources, boolean mayNotBeEmpty )
  {
    if( mayNotBeEmpty && taps.isEmpty() )
      throw new PlannerException( ( areSources ? "source" : "sink" ) + " taps are required" );

    for( Map.Entry<String, Tap> entry : taps.entrySet() )
    {
      String tapName = entry.getKey();
      Tap tap = entry.getValue();

      if( areSources && !tap.isSource() )
        throw new PlannerException( "tap named: '" + tapName + "', cannot be used as a source: " + tap );
      else if( !areSources && !tap.isSink() )
        throw new PlannerException( "tap named: '" + tapName + "', cannot be used as a sink: " + tap );
    }
  }

  /**
   * Method verifyPipeAssemblyEndPoints verifies
   * <p/>
   * there aren't dupe names in heads or tails (currently only logged as a warning).
   * all the sink and source tap names match up with tail and head pipes
   *
   * @param flowDef   of type FlowDef
   * @param flowTails of type Pipe[]
   * @throws PlannerException if a head or tail is not bound to a tap, or a tap is not bound to a head or tail
   */
  // todo: force dupe names to throw exceptions
  protected void verifyPipeAssemblyEndPoints( FlowDef flowDef, Pipe[] flowTails )
  {
    Set<String> sourceNames = flowDef.getSources().keySet();
    Set<String> sinkNames = flowDef.getSinks().keySet();

    Set<String> tapNames = new HashSet<String>();

    tapNames.addAll( sourceNames );
    tapNames.addAll( sinkNames );

    // handle tails
    Set<Pipe> tails = new HashSet<Pipe>();
    Set<String> tailNames = new HashSet<String>();

    for( Pipe pipe : flowTails )
    {
      if( pipe instanceof SubAssembly )
      {
        for( Pipe tail : ( (SubAssembly) pipe ).getTails() )
          registerTail( tail, tapNames, tailNames, tails );
      }
      else
      {
        registerTail( pipe, tapNames, tailNames, tails );
      }
    }

    // sinks having no tail of the same name, must be computed before tailNames is reduced to the unbound tails
    Set<String> remainingSinks = new HashSet<String>( sinkNames );
    remainingSinks.removeAll( tailNames );

    tailNames.removeAll( sinkNames );

    if( !tailNames.isEmpty() )
      throw new PlannerException( "not all tail pipes bound to sink taps, remaining tail pipe names: [" + Util.join( Util.quote( tailNames, "'" ), ", " ) + "], remaining sink tap names: [" + Util.join( Util.quote( remainingSinks, "'" ), ", " ) + "]" );

    // unlike heads, pipes can input to another pipe and simultaneously be a sink
    // so there is no way to know all the intentional tails, so they aren't listed below in the exception
    remainingSinks = new HashSet<String>( sinkNames );
    remainingSinks.removeAll( asList( Pipe.names( flowTails ) ) );

    if( !remainingSinks.isEmpty() )
      throw new PlannerException( "not all sink taps bound to tail pipes, remaining sink tap names: [" + Util.join( Util.quote( remainingSinks, "'" ), ", " ) + "]" );

    // handle heads
    Set<Pipe> heads = new HashSet<Pipe>();
    Set<String> headNames = new HashSet<String>();

    for( Pipe pipe : flowTails )
    {
      for( Pipe head : pipe.getHeads() )
      {
        String headName = head.getName();

        if( !tapNames.contains( headName ) )
          throw new PlannerException( head, "pipe name not found in either sink or source map: '" + headName + "'" );

        // same name on a different head instance, see the todo above
        if( headNames.contains( headName ) && !heads.contains( head ) )
          LOG.warn( "duplicate head name found, not an error but heads should have unique names: '{}'", headName );

        headNames.add( headName );
        heads.add( head );
      }
    }

    // sources having no head of the same name
    Set<String> remainingSources = new HashSet<String>( sourceNames );
    remainingSources.removeAll( headNames );

    // heads having no source of the same name
    Set<String> unboundHeadNames = new HashSet<String>( headNames );
    unboundHeadNames.removeAll( sourceNames );

    if( !unboundHeadNames.isEmpty() )
      throw new PlannerException( "not all head pipes bound to source taps, remaining head pipe names: [" + Util.join( Util.quote( unboundHeadNames, "'" ), ", " ) + "], remaining source tap names: [" + Util.join( Util.quote( remainingSources, "'" ), ", " ) + "]" );

    if( !remainingSources.isEmpty() )
      throw new PlannerException( "not all source taps bound to head pipes, remaining source tap names: [" + Util.join( Util.quote( remainingSources, "'" ), ", " ) + "], remaining head pipe names: [" + Util.join( Util.quote( unboundHeadNames, "'" ), ", " ) + "]" );
  }

  /**
   * Verifies the given tail is named after a known tap, warns if a different tail already used the same name,
   * then records the tail and its name in the given sets.
   */
  private void registerTail( Pipe tail, Set<String> tapNames, Set<String> tailNames, Set<Pipe> tails )
  {
    String tailName = tail.getName();

    if( !tapNames.contains( tailName ) )
      throw new PlannerException( tail, "pipe name not found in either sink or source map: '" + tailName + "'" );

    if( tailNames.contains( tailName ) && !tails.contains( tail ) )
      LOG.warn( "duplicate tail name found: '{}'", tailName );

    tailNames.add( tailName );
    tails.add( tail );
  }

  /**
   * Method verifyTraps verifies no trap tap is also a source or sink, and that every trap name matches a
   * name returned by {@code Pipe.names( flowTails )}.
   *
   * @param flowDef   of type FlowDef
   * @param flowTails of type Pipe[]
   * @throws PlannerException if either condition is violated
   */
  protected void verifyTraps( FlowDef flowDef, Pipe[] flowTails )
  {
    verifyNotSourcesSinks( flowDef.getTraps(), flowDef.getSources(), flowDef.getSinks(), "trap" );

    Set<String> names = new HashSet<String>( asList( Pipe.names( flowTails ) ) );

    for( String name : flowDef.getTraps().keySet() )
    {
      if( !names.contains( name ) )
        throw new PlannerException( "trap name not found in assembly: '" + name + "'" );
    }
  }

  /**
   * Method verifyCheckpoints verifies
   * <p/>
   * no checkpoint tap is also a source or sink.
   * every checkpoint tap scheme declares UNKNOWN source fields and ALL sink fields.
   * every checkpoint name matches exactly one {@link Checkpoint} pipe in the assembly.
   *
   * @param flowDef   of type FlowDef
   * @param flowTails of type Pipe[]
   * @throws PlannerException if any condition is violated
   */
  protected void verifyCheckpoints( FlowDef flowDef, Pipe[] flowTails )
  {
    verifyNotSourcesSinks( flowDef.getCheckpoints(), flowDef.getSources(), flowDef.getSinks(), "checkpoint" );

    for( Tap checkpointTap : flowDef.getCheckpoints().values() )
    {
      Scheme scheme = checkpointTap.getScheme();

      if( scheme.getSourceFields().equals( Fields.UNKNOWN ) && scheme.getSinkFields().equals( Fields.ALL ) )
        continue;

      throw new PlannerException( "checkpoint tap scheme must be undeclared, source fields must be UNKNOWN, and sink fields ALL, got: " + scheme.toString() );
    }

    Set<String> names = new HashSet<String>( asList( Pipe.names( flowTails ) ) );

    for( String name : flowDef.getCheckpoints().keySet() )
    {
      if( !names.contains( name ) )
        throw new PlannerException( "checkpoint name not found in assembly: '" + name + "'" );

      Set<Pipe> pipes = new HashSet<Pipe>( asList( Pipe.named( name, flowTails ) ) );

      // other pipe types may share the name, only Checkpoint instances are counted
      int count = 0;

      for( Pipe pipe : pipes )
      {
        if( pipe instanceof Checkpoint )
          count++;
      }

      if( count == 0 )
        throw new PlannerException( "no checkpoint with name found in assembly: '" + name + "'" );

      if( count > 1 )
        throw new PlannerException( "more than one checkpoint with name found in assembly: '" + name + "'" );
    }
  }

  /**
   * Fails if any of the given taps is also registered as a source or a sink.
   *
   * @param taps    the taps to test
   * @param sources the source taps
   * @param sinks   the sink taps
   * @param role    the role name of the tested taps, only used in the exception message
   */
  private void verifyNotSourcesSinks( Map<String, Tap> taps, Map<String, Tap> sources, Map<String, Tap> sinks, String role )
  {
    Collection<Tap> sourceTaps = sources.values();
    Collection<Tap> sinkTaps = sinks.values();

    for( Tap tap : taps.values() )
    {
      if( sourceTaps.contains( tap ) )
        throw new PlannerException( "tap may not be used as both a " + role + " and a source in the same Flow: " + tap );

      if( sinkTaps.contains( tap ) )
        throw new PlannerException( "tap may not be used as both a " + role + " and a sink in the same Flow: " + tap );
    }
  }

  /**
   * Verifies that there are not only GroupAssertions following any given Group instance. This will adversely
   * affect the stream entering any subsequent Tap or Each instances.
   *
   * @param elementGraph of type ElementGraph
   * @throws PlannerException if every Every following a Group has a planner level set
   */
  protected void failOnLoneGroupAssertion( ElementGraph elementGraph )
  {
    List<Group> groups = elementGraph.findAllGroups();

    // walk Every instances after Group
    for( Group group : groups )
    {
      for( GraphPath<FlowElement, Scope> path : elementGraph.getAllShortestPathsFrom( group ) )
      {
        List<FlowElement> flowElements = Graphs.getPathVertexList( path ); // last element is tail

        int everies = 0;
        int assertions = 0;

        for( FlowElement flowElement : flowElements )
        {
          if( flowElement instanceof Group )
            continue;

          // only the run of Every instances directly behind the Group is of interest
          if( !( flowElement instanceof Every ) )
            break;

          everies++;

          Every every = (Every) flowElement;

          // a non-null planner level is counted as an assertion
          if( every.getPlannerLevel() != null )
            assertions++;
        }

        if( everies != 0 && everies == assertions )
          throw new PlannerException( "group assertions must be accompanied by aggregator operations" );
      }
    }
  }

  /**
   * Verifies that, walking upstream from every Every, only other Every instances or plain Pipe instances are
   * found before reaching a GroupBy or CoGroup.
   *
   * @param elementGraph of type ElementGraph
   * @throws PlannerException if any other element is found first
   */
  protected void failOnMissingGroup( ElementGraph elementGraph )
  {
    List<Every> everies = elementGraph.findAllEveries();

    // walk backwards from each Every towards its Group
    for( Every every : everies )
    {
      for( GraphPath<FlowElement, Scope> path : elementGraph.getAllShortestPathsTo( every ) )
      {
        List<FlowElement> flowElements = Graphs.getPathVertexList( path ); // last element is every
        Collections.reverse( flowElements ); // first element is every

        for( FlowElement flowElement : flowElements )
        {
          if( flowElement instanceof Every || flowElement.getClass() == Pipe.class )
            continue;

          if( flowElement instanceof GroupBy || flowElement instanceof CoGroup )
            break;

          String message = "Every may only be preceded by another Every or a Group pipe, found: " + flowElement;

          // the offending element is not necessarily a Pipe (e.g. a Tap), a blind cast would replace
          // this error with a ClassCastException
          if( flowElement instanceof Pipe )
            throw new PlannerException( (Pipe) flowElement, message );

          throw new PlannerException( message );
        }
      }
    }
  }

  /**
   * Verifies that an Every holding a Buffer is the only Every between it and the preceding Group, and that no
   * Each is found between an Every and the preceding Group.
   *
   * @param elementGraph of type ElementGraph
   * @throws PlannerException if either condition is violated
   */
  protected void failOnMisusedBuffer( ElementGraph elementGraph )
  {
    List<Every> everies = elementGraph.findAllEveries();

    // walk backwards from each Every towards its Group
    for( Every every : everies )
    {
      for( GraphPath<FlowElement, Scope> path : elementGraph.getAllShortestPathsTo( every ) )
      {
        List<FlowElement> flowElements = Graphs.getPathVertexList( path ); // last element is every
        Collections.reverse( flowElements ); // first element is every

        Every last = null;
        boolean foundBuffer = false;
        int foundEveries = -1; // becomes zero on the first Every found

        for( FlowElement flowElement : flowElements )
        {
          if( flowElement instanceof Each )
            throw new PlannerException( (Pipe) flowElement, "Every may only be preceded by another Every or a GroupBy or CoGroup pipe, found: " + flowElement );

          if( flowElement instanceof Every )
          {
            foundEveries++;

            boolean isBuffer = ( (Every) flowElement ).isBuffer();

            // a second Every is only legal if neither it nor any Every seen so far is a Buffer
            if( foundEveries != 0 && ( isBuffer || foundBuffer ) )
              throw new PlannerException( (Pipe) flowElement, "Only one Every with a Buffer may follow a GroupBy or CoGroup pipe, no other Every instances are allowed immediately before or after, found: " + flowElement + " before: " + last );

            if( !foundBuffer )
              foundBuffer = isBuffer;

            last = (Every) flowElement;
          }

          if( flowElement instanceof Group )
            break;
        }
      }
    }
  }

  /**
   * Verifies that no Every is found among the children returned by
   * {@code getAllChildrenNotExactlyType( group, Pipe.class )} for the Group instances selected by
   * {@code findAllOfType( 1, 2, Group.class, ... )}.
   *
   * @param elementGraph of type ElementGraph
   * @throws PlannerException if such an Every is found
   */
  protected void failOnGroupEverySplit( ElementGraph elementGraph )
  {
    List<Group> groups = new ArrayList<Group>();

    elementGraph.findAllOfType( 1, 2, Group.class, groups );

    for( Group group : groups )
    {
      Set<FlowElement> children = elementGraph.getAllChildrenNotExactlyType( group, Pipe.class );

      for( FlowElement flowElement : children )
      {
        if( flowElement instanceof Every )
          throw new PlannerException( (Every) flowElement, "Every instances may not split after a GroupBy or CoGroup pipe, found: " + flowElement + " after: " + group );
      }
    }
  }

  /**
   * Method handleExceptionDuringPlanning converts any exception thrown while planning into a PlannerException
   * that carries the given ElementGraph.
   * <p/>
   * A PlannerException is returned as is after the graph is attached to it. Any other exception becomes the
   * cause of a new PlannerException.
   *
   * @param exception    of type Exception
   * @param elementGraph of type ElementGraph
   * @return PlannerException to be thrown by the caller
   */
  protected PlannerException handleExceptionDuringPlanning( Exception exception, ElementGraph elementGraph )
  {
    if( exception instanceof PlannerException )
    {
      ( (PlannerException) exception ).elementGraph = elementGraph;

      return (PlannerException) exception;
    }
    else if( exception instanceof ElementGraphException )
    {
      Throwable cause = exception.getCause();

      if( cause == null )
        cause = exception;

      // captures pipegraph for debugging
      // forward message in case cause or trace is lost
      String message = String.format( "could not build flow from assembly: [%s]", cause.getMessage() );

      // for these causes the pipe of the ElementGraphException is not passed along
      if( cause instanceof OperatorException || cause instanceof TapException )
        return new PlannerException( message, cause, elementGraph );

      return new PlannerException( ( (ElementGraphException) exception ).getPipe(), message, cause, elementGraph );
    }
    else
    {
      // captures pipegraph for debugging
      // forward message in case cause or trace is lost
      String message = String.format( "could not build flow from assembly: [%s]", exception.getMessage() );
      return new PlannerException( message, exception, elementGraph );
    }
  }

  /**
   * Method handleNonSafeOperations inserts a temp tap after every pipe split that is directly preceded by an
   * Each whose operation is not safe, repeating until a pass makes no insertion.
   *
   * @param elementGraph of type ElementGraph
   */
  protected void handleNonSafeOperations( ElementGraph elementGraph )
  {
    // if there was a graph change, iterate paths again.
    while( !internalNonSafeOperations( elementGraph ) )
      ;
  }

  /**
   * Runs a single pass of {@link #handleNonSafeOperations(ElementGraph)}.
   *
   * @return true if no tap was inserted during this pass
   */
  private boolean internalNonSafeOperations( ElementGraph elementGraph )
  {
    Set<Pipe> tapInsertions = new HashSet<Pipe>();

    List<Pipe> splits = elementGraph.findAllPipeSplits();

    // if any predecessor is unsafe, insert temp
    for( Pipe split : splits )
    {
      List<GraphPath<FlowElement, Scope>> paths = elementGraph.getAllShortestPathsTo( split );

      for( GraphPath<FlowElement, Scope> path : paths )
      {
        List<FlowElement> elements = Graphs.getPathVertexList( path );
        Collections.reverse( elements ); // walk upstream, starting at the split

        for( FlowElement element : elements )
        {
          // stop at the first element that is neither an Each nor a plain Pipe
          if( !( element instanceof Each ) && element.getClass() != Pipe.class )
            break;

          if( element.getClass() == Pipe.class )
            continue;

          if( !( (Each) element ).getOperation().isSafe() )
          {
            tapInsertions.add( split );
            break;
          }
        }
      }
    }

    for( Pipe pipe : tapInsertions )
      insertTempTapAfter( elementGraph, pipe );

    return tapInsertions.isEmpty();
  }

  /**
   * Method insertTempTapAfter inserts a tap into the graph after the given pipe.
   * <p/>
   * If a checkpoint tap is registered under the pipe name, it is used. Otherwise a temp tap is created, being
   * prefixed with the checkpoint root path only if the pipe is a {@link Checkpoint}.
   *
   * @param graph of type ElementGraph
   * @param pipe  of type Pipe
   */
  protected void insertTempTapAfter( ElementGraph graph, Pipe pipe )
  {
    LOG.debug( "inserting tap after: {}", pipe );

    Tap checkpointTap = graph.getCheckpointsMap().get( pipe.getName() );

    if( checkpointTap != null )
    {
      LOG.info( "found checkpoint: {}, using tap: {}", pipe.getName(), checkpointTap );
    }
    else
    {
      // only restart from a checkpoint pipe or checkpoint tap below
      if( pipe instanceof Checkpoint )
        checkpointTap = makeTempTap( checkpointRootPath, pipe.getName() );
      else
        checkpointTap = makeTempTap( pipe.getName() );
    }

    graph.insertFlowElementAfter( pipe, checkpointTap );
  }

  /**
   * Method makeTempTap creates a temp tap with the given name and no prefix.
   *
   * @param name of type String
   * @return Tap
   */
  protected Tap makeTempTap( String name )
  {
    return makeTempTap( null, name );
  }

  protected abstract Tap makeTempTap( String prefix, String name );

  /**
   * Inserts a temporary Tap between logical MR jobs.
   * <p/>
   * Since all joins are at groups or splices, depth first search is safe
   * <p/>
   * todo: refactor so that rules are applied to path segments bounded by taps
   * todo: this would allow balancing of operations within paths instead of pushing
   * todo: all operations up. may allow for consolidation of rules
   *
   * @param elementGraph of type ElementGraph
   */
  protected void handleJobPartitioning( ElementGraph elementGraph )
  {
    // if there was a graph change, iterate paths again. prevents many temp taps from being inserted in front of a group
    while( !internalJobPartitioning( elementGraph ) )
      ;
  }

  /**
   * Runs a single pass of {@link #handleJobPartitioning(ElementGraph)}, returning after the first path that
   * required an insertion.
   *
   * @return true if no tap was inserted during this pass
   */
  private boolean internalJobPartitioning( ElementGraph elementGraph )
  {
    for( GraphPath<FlowElement, Scope> path : elementGraph.getAllShortestPathsBetweenExtents() )
    {
      List<FlowElement> flowElements = Graphs.getPathVertexList( path );
      List<Pipe> tapInsertions = new ArrayList<Pipe>();

      boolean foundGroup = false;

      for( int i = 0; i < flowElements.size(); i++ )
      {
        FlowElement flowElement = flowElements.get( i );

        if( flowElement instanceof ElementGraph.Extent ) // is an extent: head or tail
          continue;
        else if( flowElement instanceof Tap && flowElements.get( i - 1 ) instanceof ElementGraph.Extent ) // is a source tap
          continue;

        if( flowElement instanceof Group && !foundGroup )
        {
          foundGroup = true;
        }
        else if( flowElement instanceof Splice && foundGroup ) // add tap between groups, push joins/merge map side
        {
          tapInsertions.add( (Pipe) flowElements.get( i - 1 ) );

          if( !( flowElement instanceof Group ) )
            foundGroup = false;
        }
        else if( flowElement instanceof Checkpoint ) // add tap after checkpoint
        {
          if( flowElements.get( i + 1 ) instanceof Tap ) // don't keep inserting
            continue;

          tapInsertions.add( (Pipe) flowElement );
          foundGroup = false;
        }
        else if( flowElement instanceof Tap )
        {
          foundGroup = false;
        }
      }

      for( Pipe pipe : tapInsertions )
        insertTempTapAfter( elementGraph, pipe );

      // the graph changed, the remaining paths are stale
      if( !tapInsertions.isEmpty() )
        return false;
    }

    return true;
  }

  /**
   * Prevent leftmost sources from sourcing a downstream join on the rightmost side intra-task by inserting a
   * temp tap between the left-sourced join and right-sourced join.
   *
   * @param elementGraph of type ElementGraph
   */
  protected void handleJoins( ElementGraph elementGraph )
  {
    // if there was a graph change, iterate paths again.
    while( !internalJoins( elementGraph ) )
      ;
  }

  /**
   * Runs a single pass of {@link #handleJoins(ElementGraph)}, returning after the first path that required
   * an insertion.
   *
   * @return true if no tap was inserted during this pass
   */
  private boolean internalJoins( ElementGraph elementGraph )
  {
    List<GraphPath<FlowElement, Scope>> paths = elementGraph.getAllShortestPathsBetweenExtents();

    // large to small
    Collections.reverse( paths );

    for( GraphPath<FlowElement, Scope> path : paths )
    {
      List<FlowElement> flowElements = Graphs.getPathVertexList( path );
      List<Pipe> tapInsertions = new ArrayList<Pipe>();
      List<HashJoin> joins = new ArrayList<HashJoin>();
      List<Merge> merges = new ArrayList<Merge>();

      // the last Tap or Group passed on this path, null until one is found
      FlowElement lastSourceElement = null;

      for( int i = 0; i < flowElements.size(); i++ )
      {
        FlowElement flowElement = flowElements.get( i );

        if( flowElement instanceof Merge )
        {
          merges.add( (Merge) flowElement );
        }
        else if( flowElement instanceof HashJoin )
        {
          HashJoin join = (HashJoin) flowElement;

          Map<Integer, Integer> pathCounts = countOrderedDirectPathsBetween( elementGraph, lastSourceElement, join, true );

          // is this path streamed
          int pathPosition = pathPositionInto( path, join );
          boolean thisPathIsStreamed = pathPosition == 0;

          boolean isAccumulatedAndStreamed = isBothAccumulatedAndStreamedPath( pathCounts ); // has streamed and accumulated paths
          int pathCount = countPaths( pathCounts );

          int priorJoins = countTypesBetween( elementGraph, lastSourceElement, join, HashJoin.class );

          if( priorJoins == 0 )
          {
            // if same source is leading into the hashjoin, insert tap on the accumulated side
            if( pathCount == 2 && isAccumulatedAndStreamed && !thisPathIsStreamed )
            {
              tapInsertions.add( (Pipe) flowElements.get( flowElements.indexOf( join ) - 1 ) );
              break;
            }

            // if more than one path into streamed and accumulated branches, insert tap on streamed side
            if( pathCount > 2 && isAccumulatedAndStreamed && thisPathIsStreamed )
            {
              tapInsertions.add( (Pipe) flowElements.get( flowElements.indexOf( join ) - 1 ) );
              break;
            }
          }

          if( !merges.isEmpty() )
          {
            // if a Merge is prior to a HashJoin, and its an accumulated path, force Merge results to disk
            int joinPos = flowElements.indexOf( join );
            int mergePos = nearest( flowElements, joinPos, merges );

            if( mergePos != -1 && joinPos > mergePos )
            {
              // if all paths are accumulated and streamed, insert
              // else if just if this path is accumulated
              if( ( isAccumulatedAndStreamed && thisPathIsStreamed ) || !thisPathIsStreamed )
              {
                tapInsertions.add( (Pipe) flowElements.get( flowElements.indexOf( join ) - 1 ) );
                break;
              }
            }
          }

          joins.add( (HashJoin) flowElement );
        }
        else if( flowElement instanceof Tap || flowElement instanceof Group )
        {
          // re-examine the joins collected since the previous Tap or Group
          for( int j = 0; j < joins.size(); j++ )
          {
            HashJoin join = joins.get( j );

            int pathPosition = pathPositionInto( path, join );
            boolean thisPathIsStreamed = pathPosition == 0;

            Map<Integer, Integer> pathCounts = countOrderedDirectPathsBetween( elementGraph, lastSourceElement, join, true );

            boolean isAccumulatedAndStreamed = isBothAccumulatedAndStreamedPath( pathCounts ); // has streamed and accumulated paths
            int pathCount = countPaths( pathCounts );

            if( pathCount >= 2 && isAccumulatedAndStreamed && thisPathIsStreamed )
            {
              tapInsertions.add( (Pipe) flowElements.get( flowElements.indexOf( join ) - 1 ) );
              break;
            }

            if( thisPathIsStreamed )
              continue;

            if( j == 0 ) // is accumulated on first join
              break;

            // prevent a streamed path from being accumulated by injecting a tap before the
            // current HashJoin
            tapInsertions.add( (Pipe) flowElements.get( flowElements.indexOf( join ) - 1 ) );
            break;
          }

          if( !tapInsertions.isEmpty() )
            break;

          // start a new segment at this Tap or Group
          lastSourceElement = flowElement;
          merges.clear();
          joins.clear();
        }
      }

      for( Pipe pipe : tapInsertions )
        insertTempTapAfter( elementGraph, pipe );

      // the graph changed, the remaining paths are stale
      if( !tapInsertions.isEmpty() )
        return false;
    }

    return true;
  }

  /**
   * Returns the position in flowElements of the most recently collected Merge found before the given index.
   * <p/>
   * Note a Merge missing from flowElements yields -1 from {@code indexOf} and is returned immediately.
   *
   * @param flowElements the path elements to search
   * @param index        the exclusive upper bound of the position
   * @param merges       the candidate merges, in the order they were collected
   * @return int the position, or -1 if no merge is positioned before the index
   */
  private int nearest( List<FlowElement> flowElements, int index, List<Merge> merges )
  {
    List<Merge> reversed = new ArrayList<Merge>( merges );
    Collections.reverse( reversed );

    for( Merge merge : reversed )
    {
      int pos = flowElements.indexOf( merge );
      if( pos < index )
        return pos;
    }

    return -1;
  }
}