001 /* 002 * Copyright (c) 2007-2014 Concurrent, Inc. All Rights Reserved. 003 * 004 * Project and contact information: http://www.cascading.org/ 005 * 006 * This file is part of the Cascading project. 007 * 008 * Licensed under the Apache License, Version 2.0 (the "License"); 009 * you may not use this file except in compliance with the License. 010 * You may obtain a copy of the License at 011 * 012 * http://www.apache.org/licenses/LICENSE-2.0 013 * 014 * Unless required by applicable law or agreed to in writing, software 015 * distributed under the License is distributed on an "AS IS" BASIS, 016 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 017 * See the License for the specific language governing permissions and 018 * limitations under the License. 019 */ 020 021 package cascading.pipe.assembly; 022 023 import java.beans.ConstructorProperties; 024 025 import cascading.flow.FlowProcess; 026 import cascading.operation.aggregator.First; 027 import cascading.pipe.Pipe; 028 import cascading.tuple.Fields; 029 import cascading.tuple.Tuple; 030 import cascading.tuple.TupleEntry; 031 032 /** 033 * Class FirstBy is used to return the first encountered Tuple in a tuple stream grouping. 034 * <p/> 035 * Typically finding the first Tuple in a tuple stream grouping relies on a {@link cascading.pipe.GroupBy} and a 036 * {@link cascading.operation.aggregator.First} {@link cascading.operation.Aggregator} operation. 037 * <p/> 038 * If the {@code firstFields} argument has custom {@link java.util.Comparator} instances, they will be used 039 * as the GroupBy {@code sortFields}. 040 * <p/> 041 * This SubAssembly also uses the {@link cascading.pipe.assembly.FirstBy.FirstPartials} 042 * {@link cascading.pipe.assembly.AggregateBy.Functor} 043 * to collect field values before the GroupBy operator to reduce IO over the network. 044 * <p/> 045 * This strategy is similar to using {@code combiners}, except no sorting or serialization is invoked and results 046 * in a much simpler mechanism. 047 * <p/> 048 * The {@code threshold} value tells the underlying FirstPartials functions how many unique key counts to accumulate 049 * in the LRU cache, before emitting the least recently used entry. 050 * <p/> 051 * By default, either the value of {@link #AGGREGATE_BY_THRESHOLD} System property or {@link AggregateBy#DEFAULT_THRESHOLD} 052 * will be used. 053 * 054 * @see AggregateBy 055 */ 056 public class FirstBy extends AggregateBy 057 { 058 /** 059 * Class CountPartials is a {@link cascading.pipe.assembly.AggregateBy.Functor} that is used to count observed duplicates from the tuple stream. 060 * <p/> 061 * Use this class typically in tandem with a {@link cascading.operation.aggregator.Sum} 062 * {@link cascading.operation.Aggregator} in order to improve counting performance by removing as many values 063 * as possible before the intermediate {@link cascading.pipe.GroupBy} operator. 064 * 065 * @see cascading.pipe.assembly.FirstBy 066 */ 067 public static class FirstPartials implements Functor 068 { 069 private final Fields declaredFields; 070 071 /** 072 * Constructor FirstPartials creates a new FirstPartials instance. 073 * 074 * @param declaredFields of type Fields 075 */ 076 public FirstPartials( Fields declaredFields ) 077 { 078 this.declaredFields = declaredFields; 079 080 if( !declaredFields.isDeclarator() || declaredFields.size() != 1 ) 081 throw new IllegalArgumentException( "declaredFields should declare only one field name" ); 082 } 083 084 @Override 085 public Fields getDeclaredFields() 086 { 087 return declaredFields; 088 } 089 090 @Override 091 public Tuple aggregate( FlowProcess flowProcess, TupleEntry args, Tuple context ) 092 { 093 if( context == null || args.getFields().compare( context, args.getTuple() ) > 0 ) 094 return args.getTupleCopy(); 095 096 return context; 097 } 098 099 @Override 100 public Tuple complete( FlowProcess flowProcess, Tuple context ) 101 { 102 return context; 103 } 104 } 105 106 /** 107 * Constructor FirstBy creates a new FirstBy instance. Use this constructor when used with a {@link AggregateBy} 108 * instance. 109 * 110 * @param firstFields of type Fields 111 */ 112 @ConstructorProperties({"firstFields"}) 113 public FirstBy( Fields firstFields ) 114 { 115 super( firstFields, new FirstPartials( firstFields ), new First( firstFields ) ); 116 } 117 118 /** 119 * Constructor FirstBy creates a new FirstBy instance. Use this constructor when used with a {@link AggregateBy} 120 * instance. 121 * 122 * @param firstFields of type Fields 123 */ 124 @ConstructorProperties({"argumentFields", "firstFields"}) 125 public FirstBy( Fields argumentFields, Fields firstFields ) 126 { 127 super( argumentFields, new FirstPartials( argumentFields ), new First( firstFields ) ); 128 } 129 130 /////// 131 132 /** 133 * Constructor FirstBy creates a new FirstBy instance. 134 * 135 * @param pipe of type Pipe 136 * @param groupingFields of type Fields 137 * @param firstFields of type Fields 138 */ 139 @ConstructorProperties({"pipe", "groupingFields", "firstFields"}) 140 public FirstBy( Pipe pipe, Fields groupingFields, Fields firstFields ) 141 { 142 this( null, pipe, groupingFields, firstFields ); 143 } 144 145 /** 146 * Constructor FirstBy creates a new FirstBy instance. 147 * 148 * @param pipe of type Pipe 149 * @param groupingFields of type Fields 150 * @param firstFields fo type Fields 151 * @param threshold of type int 152 */ 153 @ConstructorProperties({"pipe", "groupingFields", "firstFields", "threshold"}) 154 public FirstBy( Pipe pipe, Fields groupingFields, Fields firstFields, int threshold ) 155 { 156 this( null, pipe, groupingFields, firstFields, threshold ); 157 } 158 159 /** 160 * Constructor FirstBy creates a new FirstBy instance. 161 * 162 * @param name of type String 163 * @param pipe of type Pipe 164 * @param groupingFields of type Fields 165 * @param firstFields of type Fields 166 */ 167 @ConstructorProperties({"name", "pipe", "groupingFields", "firstFields"}) 168 public FirstBy( String name, Pipe pipe, Fields groupingFields, Fields firstFields ) 169 { 170 this( name, pipe, groupingFields, firstFields, USE_DEFAULT_THRESHOLD ); 171 } 172 173 /** 174 * Constructor FirstBy creates a new FirstBy instance. 175 * 176 * @param name of type String 177 * @param pipe of type Pipe 178 * @param groupingFields of type Fields 179 * @param firstFields of type Fields 180 * @param threshold of type int 181 */ 182 @ConstructorProperties({"name", "pipe", "groupingFields", "firstFields", "threshold"}) 183 public FirstBy( String name, Pipe pipe, Fields groupingFields, Fields firstFields, int threshold ) 184 { 185 this( name, Pipe.pipes( pipe ), groupingFields, firstFields, threshold ); 186 } 187 188 /** 189 * Constructor FirstBy creates a new FirstBy instance. 190 * 191 * @param pipes of type Pipe[] 192 * @param groupingFields of type Fields 193 * @param firstFields of type Fields 194 */ 195 @ConstructorProperties({"pipes", "groupingFields", "firstFields"}) 196 public FirstBy( Pipe[] pipes, Fields groupingFields, Fields firstFields ) 197 { 198 this( null, pipes, groupingFields, firstFields, USE_DEFAULT_THRESHOLD ); 199 } 200 201 /** 202 * Constructor FirstBy creates a new FirstBy instance. 203 * 204 * @param pipes of type Pipe[] 205 * @param groupingFields of type Fields 206 * @param firstFields of type Fields 207 * @param threshold of type int 208 */ 209 @ConstructorProperties({"pipes", "groupingFields", "firstFields", "threshold"}) 210 public FirstBy( Pipe[] pipes, Fields groupingFields, Fields firstFields, int threshold ) 211 { 212 this( null, pipes, groupingFields, firstFields, threshold ); 213 } 214 215 /** 216 * Constructor FirstBy creates a new FirstBy instance. 217 * 218 * @param name of type String 219 * @param pipes of type Pipe[] 220 * @param groupingFields of type Fields 221 * @param firstFields of type Fields 222 */ 223 @ConstructorProperties({"name", "pipes", "groupingFields", "firstFields"}) 224 public FirstBy( String name, Pipe[] pipes, Fields groupingFields, Fields firstFields ) 225 { 226 this( name, pipes, groupingFields, firstFields, USE_DEFAULT_THRESHOLD ); 227 } 228 229 /** 230 * Constructor FirstBy creates a new FirstBy instance. 231 * 232 * @param name of type String 233 * @param pipes of type Pipe[] 234 * @param groupingFields of type Fields 235 * @param firstFields of type Fields 236 * @param threshold of type int 237 */ 238 @ConstructorProperties({"name", "pipes", "groupingFields", "firstFields", "threshold"}) 239 public FirstBy( String name, Pipe[] pipes, Fields groupingFields, Fields firstFields, int threshold ) 240 { 241 super( name, pipes, groupingFields, firstFields, new FirstPartials( firstFields ), new First( firstFields ), threshold ); 242 } 243 }