001 /* 002 * Copyright (c) 2007-2014 Concurrent, Inc. All Rights Reserved. 003 * 004 * Project and contact information: http://www.cascading.org/ 005 * 006 * This file is part of the Cascading project. 007 * 008 * Licensed under the Apache License, Version 2.0 (the "License"); 009 * you may not use this file except in compliance with the License. 010 * You may obtain a copy of the License at 011 * 012 * http://www.apache.org/licenses/LICENSE-2.0 013 * 014 * Unless required by applicable law or agreed to in writing, software 015 * distributed under the License is distributed on an "AS IS" BASIS, 016 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 017 * See the License for the specific language governing permissions and 018 * limitations under the License. 019 */ 020 021 package cascading.pipe.assembly; 022 023 import java.beans.ConstructorProperties; 024 import java.io.Serializable; 025 import java.util.ArrayList; 026 import java.util.Arrays; 027 import java.util.Collections; 028 import java.util.LinkedHashMap; 029 import java.util.List; 030 import java.util.Map; 031 032 import cascading.flow.FlowProcess; 033 import cascading.operation.Aggregator; 034 import cascading.operation.BaseOperation; 035 import cascading.operation.Function; 036 import cascading.operation.FunctionCall; 037 import cascading.operation.OperationCall; 038 import cascading.pipe.Each; 039 import cascading.pipe.Every; 040 import cascading.pipe.GroupBy; 041 import cascading.pipe.Pipe; 042 import cascading.pipe.SubAssembly; 043 import cascading.tuple.Fields; 044 import cascading.tuple.Tuple; 045 import cascading.tuple.TupleEntry; 046 import cascading.tuple.TupleEntryCollector; 047 import cascading.tuple.util.TupleViews; 048 import org.slf4j.Logger; 049 import org.slf4j.LoggerFactory; 050 051 /** 052 * Class AggregateBy is a {@link SubAssembly} that serves two roles for handling aggregate operations. 053 * <p/> 054 * The first role is as a base class for composable aggregate operations that have a MapReduce Map side optimization for the 055 * Reduce side aggregation. For example 'summing' a value within a grouping can be performed partially Map side and 056 * completed Reduce side. Summing is associative and commutative. 057 * <p/> 058 * AggregateBy also supports operations that are not associative/commutative like 'counting'. Counting 059 * would result in 'counting' value occurrences Map side but summing those counts Reduce side. (Yes, counting can be 060 * transposed to summing Map and Reduce sides by emitting 1's before the first sum, but that's three operations over 061 * two, and a hack) 062 * <p/> 063 * Think of this mechanism as a MapReduce Combiner, but more efficient as no values are serialized, 064 * deserialized, saved to disk, and multi-pass sorted in the process, which consume cpu resources in trade of 065 * memory and a little or no IO. 066 * <p/> 067 * Further, Combiners are limited to only associative/commutative operations. 068 * <p/> 069 * Additionally the Cascading planner can move the Map side optimization 070 * to the previous Reduce operation further increasing IO performance (between the preceding Reduce and Map phase which 071 * is over HDFS). 072 * <p/> 073 * The second role of the AggregateBy class is to allow for composition of AggregateBy 074 * sub-classes. That is, {@link SumBy} and {@link CountBy} AggregateBy sub-classes can be performed 075 * in parallel on the same grouping keys. 076 * </p> 077 * Custom AggregateBy classes can be created by sub-classing this class and implementing a special 078 * {@link Functor} for use on the Map side. Multiple Functor instances are managed by the {@link CompositeFunction} 079 * class allowing them all to share the same LRU value map for more efficiency. 080 * <p/> 081 * AggregateBy instances return {@code argumentFields} which are used internally to control the values passed to 082 * internal Functor instances. If any argumentFields also have {@link java.util.Comparator}s, they will be used 083 * to for secondary sorting (see {@link GroupBy} {@code sortFields}. This feature is used by {@link FirstBy} to 084 * control which Tuple is seen first for a grouping. 085 * <p/> 086 * <p/> 087 * To tune the LRU, set the {@code threshold} value to a high enough value to utilize available memory. Or set a 088 * default value via the {@link #AGGREGATE_BY_THRESHOLD} property. The current default ({@link CompositeFunction#DEFAULT_THRESHOLD}) 089 * is {@code 10, 000} unique keys. Note "flushes" from the LRU will be logged in threshold increments along with memory 090 * information. 091 * <p/> 092 * Note using a AggregateBy instance automatically inserts a {@link GroupBy} into the resulting {@link cascading.flow.Flow}. 093 * And passing multiple AggregateBy instances to a parent AggregateBy instance still results in one GroupBy. 094 * <p/> 095 * Also note that {@link Unique} is not a CompositeAggregator and is slightly more optimized internally. 096 * <p/> 097 * Keep in mind the {@link cascading.tuple.Hasher} interface is not honored here (for storing keys in the cache). Thus 098 * arrays of primitives and object, like {@code byte[]} will not be properly stored. This is a known issue and will 099 * be resolved in a future release. 100 * 101 * @see SumBy 102 * @see CountBy 103 * @see Unique 104 */ 105 public class AggregateBy extends SubAssembly 106 { 107 private static final Logger LOG = LoggerFactory.getLogger( AggregateBy.class ); 108 109 public static final int USE_DEFAULT_THRESHOLD = 0; 110 public static final int DEFAULT_THRESHOLD = CompositeFunction.DEFAULT_THRESHOLD; 111 public static final String AGGREGATE_BY_THRESHOLD = "cascading.aggregateby.threshold"; 112 113 private String name; 114 private int threshold; 115 private Fields groupingFields; 116 private Fields[] argumentFields; 117 private Functor[] functors; 118 private Aggregator[] aggregators; 119 private transient GroupBy groupBy; 120 121 public enum Flush 122 { 123 Num_Keys_Flushed 124 } 125 126 /** 127 * Interface Functor provides a means to create a simple function for use with the {@link CompositeFunction} class. 128 * <p/> 129 * Note the {@link FlowProcess} argument provides access to the underlying properties and counter APIs. 130 */ 131 public interface Functor extends Serializable 132 { 133 /** 134 * Method getDeclaredFields returns the declaredFields of this Functor object. 135 * 136 * @return the declaredFields (type Fields) of this Functor object. 137 */ 138 Fields getDeclaredFields(); 139 140 /** 141 * Method aggregate operates on the given args in tandem (optionally) with the given context values. 142 * <p/> 143 * The context argument is the result of the previous call to this method. Use it to store values between aggregate 144 * calls (the current count, or sum of the args). 145 * <p/> 146 * On the very first invocation of aggregate for a given grouping key, context will be {@code null}. All subsequent 147 * invocations context will be the value returned on the previous invocation. 148 * 149 * @param flowProcess of type FlowProcess 150 * @param args of type TupleEntry 151 * @param context of type Tuple @return Tuple 152 */ 153 Tuple aggregate( FlowProcess flowProcess, TupleEntry args, Tuple context ); 154 155 /** 156 * Method complete allows the final aggregate computation to be performed before the return value is collected. 157 * <p/> 158 * The number of values in the returned {@link Tuple} instance must match the number of declaredFields. 159 * <p/> 160 * It is safe to return the context object as the result value. 161 * 162 * @param flowProcess of type FlowProcess 163 * @param context of type Tuple @return Tuple 164 */ 165 Tuple complete( FlowProcess flowProcess, Tuple context ); 166 } 167 168 /** 169 * Class CompositeFunction takes multiple Functor instances and manages them as a single {@link Function}. 170 * 171 * @see Functor 172 */ 173 public static class CompositeFunction extends BaseOperation<CompositeFunction.Context> implements Function<CompositeFunction.Context> 174 { 175 public static final int DEFAULT_THRESHOLD = 10000; 176 177 private int threshold = 0; 178 private final Fields groupingFields; 179 private final Fields[] argumentFields; 180 private final Fields[] functorFields; 181 private final Functor[] functors; 182 183 public static class Context 184 { 185 LinkedHashMap<Tuple, Tuple[]> lru; 186 TupleEntry[] arguments; 187 Tuple result; 188 } 189 190 /** 191 * Constructor CompositeFunction creates a new CompositeFunction instance. 192 * 193 * @param groupingFields of type Fields 194 * @param argumentFields of type Fields 195 * @param functor of type Functor 196 * @param threshold of type int 197 */ 198 public CompositeFunction( Fields groupingFields, Fields argumentFields, Functor functor, int threshold ) 199 { 200 this( groupingFields, Fields.fields( argumentFields ), new Functor[]{functor}, threshold ); 201 } 202 203 /** 204 * Constructor CompositeFunction creates a new CompositeFunction instance. 205 * 206 * @param groupingFields of type Fields 207 * @param argumentFields of type Fields[] 208 * @param functors of type Functor[] 209 * @param threshold of type int 210 */ 211 public CompositeFunction( Fields groupingFields, Fields[] argumentFields, Functor[] functors, int threshold ) 212 { 213 super( getFields( groupingFields, functors ) ); // todo: groupingFields should lookup incoming type information 214 this.groupingFields = groupingFields; 215 this.argumentFields = argumentFields; 216 this.functors = functors; 217 this.threshold = threshold; 218 219 this.functorFields = new Fields[ functors.length ]; 220 221 for( int i = 0; i < functors.length; i++ ) 222 this.functorFields[ i ] = functors[ i ].getDeclaredFields(); 223 } 224 225 private static Fields getFields( Fields groupingFields, Functor[] functors ) 226 { 227 Fields fields = groupingFields; 228 229 for( Functor functor : functors ) 230 fields = fields.append( functor.getDeclaredFields() ); 231 232 return fields; 233 } 234 235 @Override 236 public void prepare( final FlowProcess flowProcess, final OperationCall<CompositeFunction.Context> operationCall ) 237 { 238 if( threshold == 0 ) 239 { 240 Integer value = flowProcess.getIntegerProperty( AGGREGATE_BY_THRESHOLD ); 241 242 if( value != null && value > 0 ) 243 threshold = value; 244 else 245 threshold = DEFAULT_THRESHOLD; 246 } 247 248 LOG.info( "using threshold value: {}", threshold ); 249 250 Fields[] fields = new Fields[ functors.length + 1 ]; 251 252 fields[ 0 ] = groupingFields; 253 254 for( int i = 0; i < functors.length; i++ ) 255 fields[ i + 1 ] = functors[ i ].getDeclaredFields(); 256 257 final Context context = new Context(); 258 259 context.arguments = new TupleEntry[ functors.length ]; 260 261 for( int i = 0; i < context.arguments.length; i++ ) 262 { 263 Fields resolvedArgumentFields = operationCall.getArgumentFields(); 264 265 int[] pos; 266 267 if( argumentFields[ i ].isAll() ) 268 pos = resolvedArgumentFields.getPos(); 269 else 270 pos = resolvedArgumentFields.getPos( argumentFields[ i ] ); // returns null if selector is ALL 271 272 Tuple narrow = TupleViews.createNarrow( pos ); 273 274 Fields currentFields; 275 276 if( this.argumentFields[ i ].isSubstitution() ) 277 currentFields = resolvedArgumentFields.select( this.argumentFields[ i ] ); // attempt to retain comparator 278 else 279 currentFields = Fields.asDeclaration( this.argumentFields[ i ] ); 280 281 context.arguments[ i ] = new TupleEntry( currentFields, narrow ); 282 } 283 284 context.result = TupleViews.createComposite( fields ); 285 286 context.lru = new LinkedHashMap<Tuple, Tuple[]>( threshold, 0.75f, true ) 287 { 288 long flushes = 0; 289 290 @Override 291 protected boolean removeEldestEntry( Map.Entry<Tuple, Tuple[]> eldest ) 292 { 293 boolean doRemove = size() > threshold; 294 295 if( doRemove ) 296 { 297 completeFunctors( flowProcess, ( (FunctionCall) operationCall ).getOutputCollector(), context.result, eldest ); 298 flowProcess.increment( Flush.Num_Keys_Flushed, 1 ); 299 300 if( flushes % threshold == 0 ) // every multiple, write out data 301 { 302 Runtime runtime = Runtime.getRuntime(); 303 long freeMem = runtime.freeMemory() / 1024 / 1024; 304 long maxMem = runtime.maxMemory() / 1024 / 1024; 305 long totalMem = runtime.totalMemory() / 1024 / 1024; 306 307 LOG.info( "flushed keys num times: {}, with threshold: {}", flushes + 1, threshold ); 308 LOG.info( "mem on flush (mb), free: " + freeMem + ", total: " + totalMem + ", max: " + maxMem ); 309 310 float percent = (float) totalMem / (float) maxMem; 311 312 if( percent < 0.80F ) 313 LOG.info( "total mem is {}% of max mem, to better utilize unused memory consider increasing current LRU threshold with system property \"{}\"", (int) ( percent * 100.0F ), AGGREGATE_BY_THRESHOLD ); 314 } 315 316 flushes++; 317 } 318 319 return doRemove; 320 } 321 }; 322 323 operationCall.setContext( context ); 324 } 325 326 @Override 327 public void operate( FlowProcess flowProcess, FunctionCall<CompositeFunction.Context> functionCall ) 328 { 329 TupleEntry arguments = functionCall.getArguments(); 330 Tuple key = arguments.selectTupleCopy( groupingFields ); 331 332 Context context = functionCall.getContext(); 333 Tuple[] functorContext = context.lru.get( key ); 334 335 if( functorContext == null ) 336 { 337 functorContext = new Tuple[ functors.length ]; 338 context.lru.put( key, functorContext ); 339 } 340 341 for( int i = 0; i < functors.length; i++ ) 342 { 343 TupleViews.reset( context.arguments[ i ].getTuple(), arguments.getTuple() ); 344 functorContext[ i ] = functors[ i ].aggregate( flowProcess, context.arguments[ i ], functorContext[ i ] ); 345 } 346 } 347 348 @Override 349 public void flush( FlowProcess flowProcess, OperationCall<CompositeFunction.Context> operationCall ) 350 { 351 // need to drain context 352 TupleEntryCollector collector = ( (FunctionCall) operationCall ).getOutputCollector(); 353 354 Tuple result = operationCall.getContext().result; 355 LinkedHashMap<Tuple, Tuple[]> context = operationCall.getContext().lru; 356 357 for( Map.Entry<Tuple, Tuple[]> entry : context.entrySet() ) 358 completeFunctors( flowProcess, collector, result, entry ); 359 360 operationCall.setContext( null ); 361 } 362 363 private void completeFunctors( FlowProcess flowProcess, TupleEntryCollector outputCollector, Tuple result, Map.Entry<Tuple, Tuple[]> entry ) 364 { 365 Tuple[] results = new Tuple[ functors.length + 1 ]; 366 367 results[ 0 ] = entry.getKey(); 368 369 Tuple[] values = entry.getValue(); 370 371 for( int i = 0; i < functors.length; i++ ) 372 results[ i + 1 ] = functors[ i ].complete( flowProcess, values[ i ] ); 373 374 TupleViews.reset( result, results ); 375 376 outputCollector.add( result ); 377 } 378 379 @Override 380 public boolean equals( Object object ) 381 { 382 if( this == object ) 383 return true; 384 if( !( object instanceof CompositeFunction ) ) 385 return false; 386 if( !super.equals( object ) ) 387 return false; 388 389 CompositeFunction that = (CompositeFunction) object; 390 391 if( threshold != that.threshold ) 392 return false; 393 if( !Arrays.equals( argumentFields, that.argumentFields ) ) 394 return false; 395 if( !Arrays.equals( functorFields, that.functorFields ) ) 396 return false; 397 if( !Arrays.equals( functors, that.functors ) ) 398 return false; 399 if( groupingFields != null ? !groupingFields.equals( that.groupingFields ) : that.groupingFields != null ) 400 return false; 401 402 return true; 403 } 404 405 @Override 406 public int hashCode() 407 { 408 int result = super.hashCode(); 409 result = 31 * result + threshold; 410 result = 31 * result + ( groupingFields != null ? groupingFields.hashCode() : 0 ); 411 result = 31 * result + ( argumentFields != null ? Arrays.hashCode( argumentFields ) : 0 ); 412 result = 31 * result + ( functorFields != null ? Arrays.hashCode( functorFields ) : 0 ); 413 result = 31 * result + ( functors != null ? Arrays.hashCode( functors ) : 0 ); 414 return result; 415 } 416 } 417 418 /** 419 * Constructor CompositeAggregator creates a new CompositeAggregator instance. 420 * 421 * @param name of type String 422 * @param threshold of type int 423 */ 424 protected AggregateBy( String name, int threshold ) 425 { 426 this.name = name; 427 this.threshold = threshold; 428 } 429 430 /** 431 * Constructor CompositeAggregator creates a new CompositeAggregator instance. 432 * 433 * @param argumentFields of type Fields 434 * @param functor of type Functor 435 * @param aggregator of type Aggregator 436 */ 437 protected AggregateBy( Fields argumentFields, Functor functor, Aggregator aggregator ) 438 { 439 this.argumentFields = Fields.fields( argumentFields ); 440 this.functors = new Functor[]{functor}; 441 this.aggregators = new Aggregator[]{aggregator}; 442 } 443 444 /** 445 * Constructor CompositeAggregator creates a new CompositeAggregator instance. 446 * 447 * @param pipe of type Pipe 448 * @param groupingFields of type Fields 449 * @param assemblies of type CompositeAggregator... 450 */ 451 @ConstructorProperties({"pipe", "groupingFields", "assemblies"}) 452 public AggregateBy( Pipe pipe, Fields groupingFields, AggregateBy... assemblies ) 453 { 454 this( null, Pipe.pipes( pipe ), groupingFields, 0, assemblies ); 455 } 456 457 /** 458 * Constructor CompositeAggregator creates a new CompositeAggregator instance. 459 * 460 * @param pipe of type Pipe 461 * @param groupingFields of type Fields 462 * @param threshold of type int 463 * @param assemblies of type CompositeAggregator... 464 */ 465 @ConstructorProperties({"pipe", "groupingFields", "threshold", "assemblies"}) 466 public AggregateBy( Pipe pipe, Fields groupingFields, int threshold, AggregateBy... assemblies ) 467 { 468 this( null, Pipe.pipes( pipe ), groupingFields, threshold, assemblies ); 469 } 470 471 /** 472 * Constructor CompositeAggregator creates a new CompositeAggregator instance. 473 * 474 * @param pipe of type Pipe 475 * @param groupingFields of type Fields 476 * @param threshold of type int 477 * @param assemblies of type CompositeAggregator... 478 */ 479 @ConstructorProperties({"name", "pipe", "groupingFields", "threshold", "assemblies"}) 480 public AggregateBy( String name, Pipe pipe, Fields groupingFields, int threshold, AggregateBy... assemblies ) 481 { 482 this( name, Pipe.pipes( pipe ), groupingFields, threshold, assemblies ); 483 } 484 485 /** 486 * Constructor CompositeAggregator creates a new CompositeAggregator instance. 487 * 488 * @param name of type String 489 * @param pipes of type Pipe[] 490 * @param groupingFields of type Fields 491 * @param assemblies of type CompositeAggregator... 492 */ 493 @ConstructorProperties({"name", "pipes", "groupingFields", "assemblies"}) 494 public AggregateBy( String name, Pipe[] pipes, Fields groupingFields, AggregateBy... assemblies ) 495 { 496 this( name, pipes, groupingFields, 0, assemblies ); 497 } 498 499 /** 500 * Constructor CompositeAggregator creates a new CompositeAggregator instance. 501 * 502 * @param name of type String 503 * @param pipes of type Pipe[] 504 * @param groupingFields of type Fields 505 * @param threshold of type int 506 * @param assemblies of type CompositeAggregator... 507 */ 508 @ConstructorProperties({"name", "pipes", "groupingFields", "threshold", "assemblies"}) 509 public AggregateBy( String name, Pipe[] pipes, Fields groupingFields, int threshold, AggregateBy... assemblies ) 510 { 511 this( name, threshold ); 512 513 List<Fields> arguments = new ArrayList<Fields>(); 514 List<Functor> functors = new ArrayList<Functor>(); 515 List<Aggregator> aggregators = new ArrayList<Aggregator>(); 516 517 for( int i = 0; i < assemblies.length; i++ ) 518 { 519 AggregateBy assembly = assemblies[ i ]; 520 521 Collections.addAll( arguments, assembly.getArgumentFields() ); 522 Collections.addAll( functors, assembly.getFunctors() ); 523 Collections.addAll( aggregators, assembly.getAggregators() ); 524 } 525 526 initialize( groupingFields, pipes, arguments.toArray( new Fields[ arguments.size() ] ), functors.toArray( new Functor[ functors.size() ] ), aggregators.toArray( new Aggregator[ aggregators.size() ] ) ); 527 } 528 529 protected AggregateBy( String name, Pipe[] pipes, Fields groupingFields, Fields argumentFields, Functor functor, Aggregator aggregator, int threshold ) 530 { 531 this( name, threshold ); 532 initialize( groupingFields, pipes, argumentFields, functor, aggregator ); 533 } 534 535 protected void initialize( Fields groupingFields, Pipe[] pipes, Fields argumentFields, Functor functor, Aggregator aggregator ) 536 { 537 initialize( groupingFields, pipes, Fields.fields( argumentFields ), 538 new Functor[]{functor}, 539 new Aggregator[]{aggregator} ); 540 } 541 542 protected void initialize( Fields groupingFields, Pipe[] pipes, Fields[] argumentFields, Functor[] functors, Aggregator[] aggregators ) 543 { 544 setPrevious( pipes ); 545 546 this.groupingFields = groupingFields; 547 this.argumentFields = argumentFields; 548 this.functors = functors; 549 this.aggregators = aggregators; 550 551 verify(); 552 553 Fields sortFields = Fields.copyComparators( Fields.merge( this.argumentFields ), this.argumentFields ); 554 Fields argumentSelector = Fields.merge( this.groupingFields, sortFields ); 555 556 if( argumentSelector.equals( Fields.NONE ) ) 557 argumentSelector = Fields.ALL; 558 559 Pipe[] functions = new Pipe[ pipes.length ]; 560 561 CompositeFunction function = new CompositeFunction( this.groupingFields, this.argumentFields, this.functors, threshold ); 562 563 for( int i = 0; i < functions.length; i++ ) 564 functions[ i ] = new Each( pipes[ i ], argumentSelector, function, Fields.RESULTS ); 565 566 groupBy = new GroupBy( name, functions, this.groupingFields, sortFields.hasComparators() ? sortFields : null ); 567 568 Pipe pipe = groupBy; 569 570 for( int i = 0; i < aggregators.length; i++ ) 571 pipe = new Every( pipe, this.functors[ i ].getDeclaredFields(), this.aggregators[ i ], Fields.ALL ); 572 573 setTails( pipe ); 574 } 575 576 /** Method verify should be overridden by sub-classes if any values must be tested before the calling constructor returns. */ 577 protected void verify() 578 { 579 580 } 581 582 /** 583 * Method getGroupingFields returns the Fields this instances will be grouping against. 584 * 585 * @return the current grouping fields 586 */ 587 public Fields getGroupingFields() 588 { 589 return groupingFields; 590 } 591 592 /** 593 * Method getFieldDeclarations returns an array of Fields where each Field element in the array corresponds to the 594 * field declaration of the given Aggregator operations. 595 * <p/> 596 * Note the actual Fields values are returned, not planner resolved Fields. 597 * 598 * @return and array of Fields 599 */ 600 public Fields[] getFieldDeclarations() 601 { 602 Fields[] fields = new Fields[ this.aggregators.length ]; 603 604 for( int i = 0; i < aggregators.length; i++ ) 605 fields[ i ] = aggregators[ i ].getFieldDeclaration(); 606 607 return fields; 608 } 609 610 protected Fields[] getArgumentFields() 611 { 612 return argumentFields; 613 } 614 615 protected Functor[] getFunctors() 616 { 617 return functors; 618 } 619 620 protected Aggregator[] getAggregators() 621 { 622 return aggregators; 623 } 624 625 /** 626 * Method getGroupBy returns the internal {@link GroupBy} instance so that any custom properties 627 * can be set on it via {@link cascading.pipe.Pipe#getStepConfigDef()}. 628 * 629 * @return GroupBy type 630 */ 631 public GroupBy getGroupBy() 632 { 633 return groupBy; 634 } 635 }