001    /*
002     * Copyright (c) 2007-2014 Concurrent, Inc. All Rights Reserved.
003     *
004     * Project and contact information: http://www.cascading.org/
005     *
006     * This file is part of the Cascading project.
007     *
008     * Licensed under the Apache License, Version 2.0 (the "License");
009     * you may not use this file except in compliance with the License.
010     * You may obtain a copy of the License at
011     *
012     *     http://www.apache.org/licenses/LICENSE-2.0
013     *
014     * Unless required by applicable law or agreed to in writing, software
015     * distributed under the License is distributed on an "AS IS" BASIS,
016     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
017     * See the License for the specific language governing permissions and
018     * limitations under the License.
019     */
020    
021    package cascading;
022    
023    import java.io.Serializable;
024    import java.util.Comparator;
025    import java.util.HashMap;
026    import java.util.HashSet;
027    import java.util.List;
028    import java.util.Map;
029    import java.util.Set;
030    
031    import cascading.flow.Flow;
032    import cascading.operation.Function;
033    import cascading.operation.Identity;
034    import cascading.operation.aggregator.First;
035    import cascading.operation.expression.ExpressionFunction;
036    import cascading.operation.regex.RegexFilter;
037    import cascading.operation.regex.RegexSplitter;
038    import cascading.pipe.CoGroup;
039    import cascading.pipe.Each;
040    import cascading.pipe.Every;
041    import cascading.pipe.GroupBy;
042    import cascading.pipe.HashJoin;
043    import cascading.pipe.Pipe;
044    import cascading.pipe.joiner.InnerJoin;
045    import cascading.pipe.joiner.Joiner;
046    import cascading.pipe.joiner.LeftJoin;
047    import cascading.pipe.joiner.MixedJoin;
048    import cascading.pipe.joiner.OuterJoin;
049    import cascading.pipe.joiner.RightJoin;
050    import cascading.tap.SinkMode;
051    import cascading.tap.Tap;
052    import cascading.tuple.Fields;
053    import cascading.tuple.Hasher;
054    import cascading.tuple.Tuple;
055    import org.junit.Test;
056    
057    import static data.InputData.*;
058    
059    
060    public class JoinFieldedPipesPlatformTest extends PlatformTestCase
061      {
/**
 * Creates the test case. The three arguments are handed to the PlatformTestCase constructor unchanged;
 * NOTE(review): the meaning of the numeric arguments (4, 1) is defined by the superclass - confirm there.
 */
public JoinFieldedPipesPlatformTest()
  {
  // leave cluster testing enabled
  super( true, 4, 1 );
  }
066    
067      @Test
068      public void testCross() throws Exception
069        {
070        getPlatform().copyFromLocal( inputFileLhs );
071        getPlatform().copyFromLocal( inputFileRhs );
072    
073        Map sources = new HashMap();
074    
075        sources.put( "lhs", getPlatform().getTextFile( inputFileLhs ) );
076        sources.put( "rhs", getPlatform().getTextFile( inputFileRhs ) );
077    
078        Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "cross" ), SinkMode.REPLACE );
079    
080        Pipe pipeLower = new Each( "lhs", new Fields( "line" ), new RegexSplitter( new Fields( "numLHS", "charLHS" ), " " ) );
081        Pipe pipeUpper = new Each( "rhs", new Fields( "line" ), new RegexSplitter( new Fields( "numRHS", "charRHS" ), " " ) );
082    
083        Pipe cross = new HashJoin( pipeLower, new Fields( "numLHS" ), pipeUpper, new Fields( "numRHS" ), new InnerJoin() );
084    
085        Flow flow = getPlatform().getFlowConnector().connect( sources, sink, cross );
086    
087        flow.complete();
088    
089        validateLength( flow, 37, null );
090    
091        List<Tuple> values = getSinkAsList( flow );
092    
093        assertTrue( values.contains( new Tuple( "1\ta\t1\tA" ) ) );
094        assertTrue( values.contains( new Tuple( "1\ta\t1\tB" ) ) );
095        }
096    
097      @Test
098      public void testJoin() throws Exception
099        {
100        getPlatform().copyFromLocal( inputFileLower );
101        getPlatform().copyFromLocal( inputFileUpper );
102    
103        Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
104        Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper );
105    
106        Map sources = new HashMap();
107    
108        sources.put( "lower", sourceLower );
109        sources.put( "upper", sourceUpper );
110    
111        Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "join" ), SinkMode.REPLACE );
112    
113        Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );
114    
115        Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
116        Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );
117    
118        Pipe splice = new HashJoin( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), Fields.size( 4 ) );
119    
120        Map<Object, Object> properties = getProperties();
121    
122        Flow flow = getPlatform().getFlowConnector( properties ).connect( sources, sink, splice );
123    
124        flow.complete();
125    
126        validateLength( flow, 5 );
127    
128        List<Tuple> values = getSinkAsList( flow );
129    
130        assertTrue( values.contains( new Tuple( "1\ta\t1\tA" ) ) );
131        assertTrue( values.contains( new Tuple( "2\tb\t2\tB" ) ) );
132        }
133    
134      @Test
135      public void testJoinSamePipeName() throws Exception
136        {
137        getPlatform().copyFromLocal( inputFileLower );
138        getPlatform().copyFromLocal( inputFileUpper );
139    
140        Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
141        Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper );
142    
143        Map sources = new HashMap();
144    
145        sources.put( "lower", sourceLower );
146        sources.put( "upper", sourceUpper );
147    
148        Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "renamedpipes" ), SinkMode.REPLACE );
149    
150        Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );
151    
152        Pipe pipeLower = new Pipe( "lower" );
153        Pipe pipeUpper = new Pipe( "upper" );
154    
155        // these pipes will hide the source name, and could cause one to be lost
156        pipeLower = new Pipe( "same", pipeLower );
157        pipeUpper = new Pipe( "same", pipeUpper );
158    
159        pipeLower = new Each( pipeLower, new Fields( "line" ), splitter );
160        pipeUpper = new Each( pipeUpper, new Fields( "line" ), splitter );
161    
162    //    pipeLower = new Each( pipeLower, new Fields( "num", "char" ), new Identity( new Fields( "num", "char" ) ) );
163    //    pipeUpper = new Each( pipeUpper, new Fields( "num", "char" ), new Identity( new Fields( "num", "char" ) ) );
164    
165        pipeLower = new Pipe( "left", pipeLower );
166        pipeUpper = new Pipe( "right", pipeUpper );
167    
168    //    pipeLower = new Each( pipeLower, new Debug( true ) );
169    //    pipeUpper = new Each( pipeUpper, new Debug( true ) );
170    
171        Pipe splice = new HashJoin( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), Fields.size( 4 ) );
172    
173    //    splice = new Each( splice, new Debug( true ) );
174        splice = new Pipe( "splice", splice );
175        splice = new Pipe( "tail", splice );
176    
177        Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice );
178    
179        flow.complete();
180    
181        validateLength( flow, 5 );
182    
183        List<Tuple> values = getSinkAsList( flow );
184    
185        assertTrue( values.contains( new Tuple( "1\ta\t1\tA" ) ) );
186        assertTrue( values.contains( new Tuple( "2\tb\t2\tB" ) ) );
187        }
188    
189      @Test
190      public void testJoinWithUnknowns() throws Exception
191        {
192        getPlatform().copyFromLocal( inputFileLower );
193        getPlatform().copyFromLocal( inputFileUpper );
194    
195        Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
196        Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper );
197    
198        Map sources = new HashMap();
199    
200        sources.put( "lower", sourceLower );
201        sources.put( "upper", sourceUpper );
202    
203        Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "unknown" ), SinkMode.REPLACE );
204    
205        Function splitter = new RegexSplitter( Fields.UNKNOWN, " " );
206    
207        Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
208        Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );
209    
210        Pipe splice = new HashJoin( pipeLower, new Fields( 0 ), pipeUpper, new Fields( 0 ), Fields.size( 4 ) );
211    
212        Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice );
213    
214        flow.complete();
215    
216        validateLength( flow, 5 );
217    
218        List<Tuple> values = getSinkAsList( flow );
219    
220        assertTrue( values.contains( new Tuple( "1\ta\t1\tA" ) ) );
221        assertTrue( values.contains( new Tuple( "2\tb\t2\tB" ) ) );
222        }
223    
224      /**
225       * this test intentionally filters out all values so the intermediate tap is empty. this tap is cogrouped with
226       * a new stream using an outerjoin.
227       *
228       * @throws Exception
229       */
230      @Test
231      public void testJoinFilteredBranch() throws Exception
232        {
233        getPlatform().copyFromLocal( inputFileLower );
234        getPlatform().copyFromLocal( inputFileUpper );
235    
236        Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
237        Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper );
238    
239        Map sources = new HashMap();
240    
241        sources.put( "lower", sourceLower );
242        sources.put( "upper", sourceUpper );
243    
244        Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joinfilteredbranch" ), SinkMode.REPLACE );
245    
246        Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );
247    
248        Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
249        Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );
250        pipeUpper = new Each( pipeUpper, new Fields( "num" ), new RegexFilter( "^fobar" ) ); // intentionally filtering all
251        pipeUpper = new GroupBy( pipeUpper, new Fields( "num" ) );
252    
253        Pipe splice = new HashJoin( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), Fields.size( 4 ), new OuterJoin() );
254    
255        Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice );
256    
257        flow.complete();
258    
259        validateLength( flow, 5 );
260    
261        List<Tuple> values = getSinkAsList( flow );
262    
263        assertTrue( values.contains( new Tuple( "1\ta\tnull\tnull" ) ) );
264        assertTrue( values.contains( new Tuple( "2\tb\tnull\tnull" ) ) );
265        }
266    
267      @Test
268      public void testJoinSelf() throws Exception
269        {
270        getPlatform().copyFromLocal( inputFileLower );
271    
272        Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
273        Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
274    
275        Map sources = new HashMap();
276    
277        sources.put( "lower", sourceLower );
278        sources.put( "upper", sourceUpper );
279    
280        Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joinself" ), SinkMode.REPLACE );
281    
282        Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );
283    
284        Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
285        Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );
286    
287        Pipe splice = new HashJoin( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), Fields.size( 4 ) );
288    
289        Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice );
290    
291        flow.complete();
292    
293        validateLength( flow, 5 );
294    
295        List<Tuple> values = getSinkAsList( flow );
296    
297        assertTrue( values.contains( new Tuple( "1\ta\t1\ta" ) ) );
298        assertTrue( values.contains( new Tuple( "2\tb\t2\tb" ) ) );
299        }
300    
301      /**
302       * Method testCoGroupAfterEvery tests that a tmp tap is inserted after the Every in the cogroup join
303       *
304       * @throws Exception when
305       */
306      @Test
307      public void testJoinAfterEvery() throws Exception
308        {
309        getPlatform().copyFromLocal( inputFileLower );
310        getPlatform().copyFromLocal( inputFileUpper );
311    
312        Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
313        Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper );
314    
315        Map sources = new HashMap();
316    
317        sources.put( "lower", sourceLower );
318        sources.put( "upper", sourceUpper );
319    
320        Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "afterevery" ), SinkMode.REPLACE );
321    
322        Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );
323    
324        Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
325        pipeLower = new GroupBy( pipeLower, new Fields( "num" ) );
326        pipeLower = new Every( pipeLower, new Fields( "char" ), new First(), Fields.ALL );
327    
328        Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );
329        pipeUpper = new GroupBy( pipeUpper, new Fields( "num" ) );
330        pipeUpper = new Every( pipeUpper, new Fields( "char" ), new First(), Fields.ALL );
331    
332        Pipe splice = new HashJoin( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), Fields.size( 4 ) );
333    
334        Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice );
335    
336        flow.complete();
337    
338        validateLength( flow, 5, null );
339    
340        List<Tuple> values = getSinkAsList( flow );
341    
342        assertTrue( values.contains( new Tuple( "1\ta\t1\tA" ) ) );
343        assertTrue( values.contains( new Tuple( "2\tb\t2\tB" ) ) );
344        }
345    
346      @Test
347      public void testJoinInnerSingleField() throws Exception
348        {
349        getPlatform().copyFromLocal( inputFileLowerOffset );
350        getPlatform().copyFromLocal( inputFileUpper );
351    
352        Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLowerOffset );
353        Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper );
354    
355        Map sources = new HashMap();
356    
357        sources.put( "lower", sourceLower );
358        sources.put( "upper", sourceUpper );
359    
360        Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joininnersingle" ), SinkMode.REPLACE );
361    
362        Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), new RegexSplitter( new Fields( "num1", "char" ), " " ), new Fields( "num1" ) );
363        Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), new RegexSplitter( new Fields( "num2", "char" ), " " ), new Fields( "num2" ) );
364    
365        Pipe join = new HashJoin( pipeLower, new Fields( "num1" ), pipeUpper, new Fields( "num2" ) );
366    
367        Flow flow = getPlatform().getFlowConnector().connect( sources, sink, join );
368    
369        flow.complete();
370    
371        validateLength( flow, 3, null );
372    
373        Set<Tuple> results = new HashSet<Tuple>();
374    
375        results.add( new Tuple( "1\t1" ) );
376        results.add( new Tuple( "5\t5" ) );
377    
378        List<Tuple> actual = getSinkAsList( flow );
379    
380        results.removeAll( actual );
381    
382        assertEquals( 0, results.size() );
383        }
384    
385      /**
386       * 1 a1
387       * 1 a2
388       * 1 a3
389       * 2 b1
390       * 3 c1
391       * 4 d1
392       * 4 d2
393       * 4 d3
394       * 5 e1
395       * 5 e2
396       * 5 e3
397       * 7 g1
398       * 7 g2
399       * 7 g3
400       * 7 g4
401       * 7 g5
402       * null h1
403       * <p/>
404       * 1 A1
405       * 1 A2
406       * 1 A3
407       * 2 B1
408       * 2 B2
409       * 2 B3
410       * 4 D1
411       * 6 F1
412       * 6 F2
413       * null H1
414       * <p/>
415       * 1  a1      1       A1
416       * 1  a1      1       A2
417       * 1  a1      1       A3
418       * 1  a2      1       A1
419       * 1  a2      1       A2
420       * 1  a2      1       A3
421       * 1  a3      1       A1
422       * 1  a3      1       A2
423       * 1  a3      1       A3
424       * 2  b1      2       B1
425       * 2  b1      2       B2
426       * 2  b1      2       B3
427       * 4  d1      4       D1
428       * 4  d2      4       D1
429       * 4  d3      4       D1
430       * null h1  null  H1
431       *
432       * @throws Exception
433       */
434      @Test
435      public void testJoinInner() throws Exception
436        {
437        HashSet<Tuple> results = new HashSet<Tuple>();
438    
439        results.add( new Tuple( "1", "a1", "1", "A1" ) );
440        results.add( new Tuple( "1", "a1", "1", "A2" ) );
441        results.add( new Tuple( "1", "a1", "1", "A3" ) );
442        results.add( new Tuple( "1", "a2", "1", "A1" ) );
443        results.add( new Tuple( "1", "a2", "1", "A2" ) );
444        results.add( new Tuple( "1", "a2", "1", "A3" ) );
445        results.add( new Tuple( "1", "a3", "1", "A1" ) );
446        results.add( new Tuple( "1", "a3", "1", "A2" ) );
447        results.add( new Tuple( "1", "a3", "1", "A3" ) );
448        results.add( new Tuple( "2", "b1", "2", "B1" ) );
449        results.add( new Tuple( "2", "b1", "2", "B2" ) );
450        results.add( new Tuple( "2", "b1", "2", "B3" ) );
451        results.add( new Tuple( "4", "d1", "4", "D1" ) );
452        results.add( new Tuple( "4", "d2", "4", "D1" ) );
453        results.add( new Tuple( "4", "d3", "4", "D1" ) );
454        results.add( new Tuple( null, "h1", null, "H1" ) );
455    
456        handleJoins( "joininner", new InnerJoin(), results );
457        }
458    
459      /**
460       * /**
461       * 1 a1
462       * 1 a2
463       * 1 a3
464       * 2 b1
465       * 3 c1
466       * 4 d1
467       * 4 d2
468       * 4 d3
469       * 5 e1
470       * 5 e2
471       * 5 e3
472       * 7 g1
473       * 7 g2
474       * 7 g3
475       * 7 g4
476       * 7 g5
477       * null h1
478       * <p/>
479       * 1 A1
480       * 1 A2
481       * 1 A3
482       * 2 B1
483       * 2 B2
484       * 2 B3
485       * 4 D1
486       * 6 F1
487       * 6 F2
488       * null H1
489       * <p/>
490       * 1  a1      1       A1
491       * 1  a1      1       A2
492       * 1  a1      1       A3
493       * 1  a2      1       A1
494       * 1  a2      1       A2
495       * 1  a2      1       A3
496       * 1  a3      1       A1
497       * 1  a3      1       A2
498       * 1  a3      1       A3
499       * 2  b1      2       B1
500       * 2  b1      2       B2
501       * 2  b1      2       B3
502       * 3  c1      null    null
503       * 4  d1      4       D1
504       * 4  d2      4       D1
505       * 4  d3      4       D1
506       * 5  e1      null    null
507       * 5  e2      null    null
508       * 5  e3      null    null
509       * null       null    6       F1
510       * null       null    6       F2
511       * 7  g1      null    null
512       * 7  g2      null    null
513       * 7  g3      null    null
514       * 7  g4      null    null
515       * 7  g5      null    null
516       * null h1  null  H1
517       *
518       * @throws Exception
519       */
520      @Test
521      public void testJoinOuter() throws Exception
522        {
523        // skip if hadoop cluster mode, outer joins don't behave the same
524        if( getPlatform().isMapReduce() && getPlatform().isUseCluster() )
525          return;
526    
527        Set<Tuple> results = new HashSet<Tuple>();
528    
529        results.add( new Tuple( "1", "a1", "1", "A1" ) );
530        results.add( new Tuple( "1", "a1", "1", "A2" ) );
531        results.add( new Tuple( "1", "a1", "1", "A3" ) );
532        results.add( new Tuple( "1", "a2", "1", "A1" ) );
533        results.add( new Tuple( "1", "a2", "1", "A2" ) );
534        results.add( new Tuple( "1", "a2", "1", "A3" ) );
535        results.add( new Tuple( "1", "a3", "1", "A1" ) );
536        results.add( new Tuple( "1", "a3", "1", "A2" ) );
537        results.add( new Tuple( "1", "a3", "1", "A3" ) );
538        results.add( new Tuple( "2", "b1", "2", "B1" ) );
539        results.add( new Tuple( "2", "b1", "2", "B2" ) );
540        results.add( new Tuple( "2", "b1", "2", "B3" ) );
541        results.add( new Tuple( "3", "c1", null, null ) );
542        results.add( new Tuple( "4", "d1", "4", "D1" ) );
543        results.add( new Tuple( "4", "d2", "4", "D1" ) );
544        results.add( new Tuple( "4", "d3", "4", "D1" ) );
545        results.add( new Tuple( "5", "e1", null, null ) );
546        results.add( new Tuple( "5", "e2", null, null ) );
547        results.add( new Tuple( "5", "e3", null, null ) );
548        results.add( new Tuple( null, null, "6", "F1" ) );
549        results.add( new Tuple( null, null, "6", "F2" ) );
550        results.add( new Tuple( "7", "g1", null, null ) );
551        results.add( new Tuple( "7", "g2", null, null ) );
552        results.add( new Tuple( "7", "g3", null, null ) );
553        results.add( new Tuple( "7", "g4", null, null ) );
554        results.add( new Tuple( "7", "g5", null, null ) );
555        results.add( new Tuple( null, "h1", null, "H1" ) );
556    
557        handleJoins( "joinouter", new OuterJoin(), results );
558        }
559    
560      /**
561       * 1 a1
562       * 1 a2
563       * 1 a3
564       * 2 b1
565       * 3 c1
566       * 4 d1
567       * 4 d2
568       * 4 d3
569       * 5 e1
570       * 5 e2
571       * 5 e3
572       * 7 g1
573       * 7 g2
574       * 7 g3
575       * 7 g4
576       * 7 g5
577       * null h1
578       * <p/>
579       * 1 A1
580       * 1 A2
581       * 1 A3
582       * 2 B1
583       * 2 B2
584       * 2 B3
585       * 4 D1
586       * 6 F1
587       * 6 F2
588       * null H1
589       * <p/>
590       * 1  a1      1       A1
591       * 1  a1      1       A2
592       * 1  a1      1       A3
593       * 1  a2      1       A1
594       * 1  a2      1       A2
595       * 1  a2      1       A3
596       * 1  a3      1       A1
597       * 1  a3      1       A2
598       * 1  a3      1       A3
599       * 2  b1      2       B1
600       * 2  b1      2       B2
601       * 2  b1      2       B3
602       * 3  c1      null    null
603       * 4  d1      4       D1
604       * 4  d2      4       D1
605       * 4  d3      4       D1
606       * 5  e1      null    null
607       * 5  e2      null    null
608       * 5  e3      null    null
609       * 7  g1      null    null
610       * 7  g2      null    null
611       * 7  g3      null    null
612       * 7  g4      null    null
613       * 7  g5      null    null
614       * null h1    null    H1
615       *
616       * @throws Exception
617       */
618      @Test
619      public void testJoinInnerOuter() throws Exception
620        {
621        Set<Tuple> results = new HashSet<Tuple>();
622    
623        results.add( new Tuple( "1", "a1", "1", "A1" ) );
624        results.add( new Tuple( "1", "a1", "1", "A2" ) );
625        results.add( new Tuple( "1", "a1", "1", "A3" ) );
626        results.add( new Tuple( "1", "a2", "1", "A1" ) );
627        results.add( new Tuple( "1", "a2", "1", "A2" ) );
628        results.add( new Tuple( "1", "a2", "1", "A3" ) );
629        results.add( new Tuple( "1", "a3", "1", "A1" ) );
630        results.add( new Tuple( "1", "a3", "1", "A2" ) );
631        results.add( new Tuple( "1", "a3", "1", "A3" ) );
632        results.add( new Tuple( "2", "b1", "2", "B1" ) );
633        results.add( new Tuple( "2", "b1", "2", "B2" ) );
634        results.add( new Tuple( "2", "b1", "2", "B3" ) );
635        results.add( new Tuple( "3", "c1", null, null ) );
636        results.add( new Tuple( "4", "d1", "4", "D1" ) );
637        results.add( new Tuple( "4", "d2", "4", "D1" ) );
638        results.add( new Tuple( "4", "d3", "4", "D1" ) );
639        results.add( new Tuple( "5", "e1", null, null ) );
640        results.add( new Tuple( "5", "e2", null, null ) );
641        results.add( new Tuple( "5", "e3", null, null ) );
642        results.add( new Tuple( "7", "g1", null, null ) );
643        results.add( new Tuple( "7", "g2", null, null ) );
644        results.add( new Tuple( "7", "g3", null, null ) );
645        results.add( new Tuple( "7", "g4", null, null ) );
646        results.add( new Tuple( "7", "g5", null, null ) );
647        results.add( new Tuple( null, "h1", null, "H1" ) );
648    
649        handleJoins( "joininnerouter", new LeftJoin(), results );
650        }
651    
652      /**
653       * 1 a1
654       * 1 a2
655       * 1 a3
656       * 2 b1
657       * 3 c1
658       * 4 d1
659       * 4 d2
660       * 4 d3
661       * 5 e1
662       * 5 e2
663       * 5 e3
664       * 7 g1
665       * 7 g2
666       * 7 g3
667       * 7 g4
668       * 7 g5
669       * null h1
670       * <p/>
671       * 1 A1
672       * 1 A2
673       * 1 A3
674       * 2 B1
675       * 2 B2
676       * 2 B3
677       * 4 D1
678       * 6 F1
679       * 6 F2
680       * null H1
681       * <p/>
682       * 1  a1      1       A1
683       * 1  a1      1       A2
684       * 1  a1      1       A3
685       * 1  a2      1       A1
686       * 1  a2      1       A2
687       * 1  a2      1       A3
688       * 1  a3      1       A1
689       * 1  a3      1       A2
690       * 1  a3      1       A3
691       * 2  b1      2       B1
692       * 2  b1      2       B2
693       * 2  b1      2       B3
694       * 4  d1      4       D1
695       * 4  d2      4       D1
696       * 4  d3      4       D1
697       * null       null    6       F1
698       * null       null    6       F2
699       * null h1    null    H1
700       *
701       * @throws Exception
702       */
703      @Test
704      public void testJoinOuterInner() throws Exception
705        {
706        // skip if hadoop cluster mode, outer joins don't behave the same
707        if( getPlatform().isMapReduce() && getPlatform().isUseCluster() )
708          return;
709    
710        Set<Tuple> results = new HashSet<Tuple>();
711    
712        results.add( new Tuple( "1", "a1", "1", "A1" ) );
713        results.add( new Tuple( "1", "a1", "1", "A2" ) );
714        results.add( new Tuple( "1", "a1", "1", "A3" ) );
715        results.add( new Tuple( "1", "a2", "1", "A1" ) );
716        results.add( new Tuple( "1", "a2", "1", "A2" ) );
717        results.add( new Tuple( "1", "a2", "1", "A3" ) );
718        results.add( new Tuple( "1", "a3", "1", "A1" ) );
719        results.add( new Tuple( "1", "a3", "1", "A2" ) );
720        results.add( new Tuple( "1", "a3", "1", "A3" ) );
721        results.add( new Tuple( "2", "b1", "2", "B1" ) );
722        results.add( new Tuple( "2", "b1", "2", "B2" ) );
723        results.add( new Tuple( "2", "b1", "2", "B3" ) );
724        results.add( new Tuple( "4", "d1", "4", "D1" ) );
725        results.add( new Tuple( "4", "d2", "4", "D1" ) );
726        results.add( new Tuple( "4", "d3", "4", "D1" ) );
727        results.add( new Tuple( null, null, "6", "F1" ) );
728        results.add( new Tuple( null, null, "6", "F2" ) );
729        results.add( new Tuple( null, "h1", null, "H1" ) );
730    
731        handleJoins( "joinouterinner", new RightJoin(), results );
732        }
733    
734      private void handleJoins( String path, Joiner joiner, Set<Tuple> results ) throws Exception
735        {
736        getPlatform().copyFromLocal( inputFileLhsSparse );
737        getPlatform().copyFromLocal( inputFileRhsSparse );
738    
739        Fields fields = new Fields( "num", "char" ).applyTypes( Integer.class, String.class );
740        Tap sourceLower = getPlatform().getDelimitedFile( fields, " ", inputFileLhsSparse );
741        Tap sourceUpper = getPlatform().getDelimitedFile( fields, " ", inputFileRhsSparse );
742    
743        Map sources = new HashMap();
744    
745        sources.put( "lower", sourceLower );
746        sources.put( "upper", sourceUpper );
747    
748        Tap sink = getPlatform().getDelimitedFile( Fields.size( 4, String.class ), "\t", getOutputPath( path ), SinkMode.REPLACE );
749    
750        Pipe pipeLower = new Pipe( "lower" );
751        Pipe pipeUpper = new Pipe( "upper" );
752    
753        Fields declaredFields = new Fields( "num", "char", "num2", "char2" );
754        Fields groupingFields = new Fields( "num" );
755    
756        Pipe splice = new HashJoin( pipeLower, groupingFields, pipeUpper, groupingFields, declaredFields, joiner );
757    
758        splice = new Each( splice, Fields.ALL, new Identity(), Fields.RESULTS );
759    
760        Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice );
761    
762        flow.complete();
763    
764        validateLength( flow, results.size() );
765    
766        List<Tuple> actual = getSinkAsList( flow );
767    
768        results.removeAll( actual );
769    
770        assertEquals( 0, results.size() );
771        }
772    
773      /**
774       * 1 a
775       * 5 b
776       * 6 c
777       * 5 b
778       * 5 e
779       * <p/>
780       * 1 A
781       * 2 B
782       * 3 C
783       * 4 D
784       * 5 E
785       * <p/>
786       * 1 a
787       * 2 b
788       * 3 c
789       * 4 d
790       * 5 e
791       * <p/>
792       * 1  a       1       A  1  a
793       * -  -   2   B  2  b
794       * -  -   3   C  3  c
795       * -  -   4   D  4  d
796       * 5  b       5   E  5  e
797       * 5  e       5   E  5  e
798       *
799       * @throws Exception
800       */
801      @Test
802      public void testJoinMixed() throws Exception
803        {
804        // skip if hadoop cluster mode, outer joins don't behave the same
805        if( getPlatform().isMapReduce() && getPlatform().isUseCluster() )
806          return;
807    
808        getPlatform().copyFromLocal( inputFileLowerOffset );
809        getPlatform().copyFromLocal( inputFileLower );
810        getPlatform().copyFromLocal( inputFileUpper );
811    
812        Tap sourceLowerOffset = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLowerOffset );
813        Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper );
814        Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
815    
816        Map sources = new HashMap();
817    
818        sources.put( "loweroffset", sourceLowerOffset );
819        sources.put( "lower", sourceLower );
820        sources.put( "upper", sourceUpper );
821    
822        Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joinmixed" ), SinkMode.REPLACE );
823    
824        Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );
825    
826        Pipe pipeLowerOffset = new Each( new Pipe( "loweroffset" ), new Fields( "line" ), splitter );
827        Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
828        Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );
829    
830        Pipe[] pipes = Pipe.pipes( pipeLowerOffset, pipeUpper, pipeLower );
831        Fields[] fields = Fields.fields( new Fields( "num" ), new Fields( "num" ), new Fields( "num" ) );
832    
833        MixedJoin join = new MixedJoin( new boolean[]{MixedJoin.OUTER, MixedJoin.INNER, MixedJoin.OUTER} );
834        Pipe splice = new HashJoin( pipes, fields, Fields.size( 6 ), join );
835    
836        Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice );
837    
838        flow.complete();
839    
840        validateLength( flow, 6 );
841    
842        Set<Tuple> results = new HashSet<Tuple>();
843    
844        results.add( new Tuple( "1\ta\t1\tA\t1\ta" ) );
845        results.add( new Tuple( "null\tnull\t2\tB\t2\tb" ) );
846        results.add( new Tuple( "null\tnull\t3\tC\t3\tc" ) );
847        results.add( new Tuple( "null\tnull\t4\tD\t4\td" ) );
848        results.add( new Tuple( "5\tb\t5\tE\t5\te" ) );
849        results.add( new Tuple( "5\te\t5\tE\t5\te" ) );
850    
851        List<Tuple> actual = getSinkAsList( flow );
852    
853        results.removeAll( actual );
854    
855        assertEquals( 0, results.size() );
856        }
857    
858      @Test
859      public void testJoinDiffFields() throws Exception
860        {
861        getPlatform().copyFromLocal( inputFileLower );
862        getPlatform().copyFromLocal( inputFileUpper );
863    
864        Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
865        Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper );
866    
867        Map sources = new HashMap();
868    
869        sources.put( "lower", sourceLower );
870        sources.put( "upper", sourceUpper );
871    
872        Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "difffields" ), SinkMode.REPLACE );
873    
874        Function splitterLower = new RegexSplitter( new Fields( "numA", "lower" ), " " );
875        Function splitterUpper = new RegexSplitter( new Fields( "numB", "upper" ), " " );
876    
877        Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitterLower );
878        Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitterUpper );
879    
880        Pipe pipe = new HashJoin( pipeLower, new Fields( "numA" ), pipeUpper, new Fields( "numB" ) );
881    
882        Flow flow = getPlatform().getFlowConnector().connect( sources, sink, pipe );
883    
884        flow.complete();
885    
886        validateLength( flow, 5 );
887    
888        List<Tuple> actual = getSinkAsList( flow );
889    
890        assertTrue( actual.contains( new Tuple( "1\ta\t1\tA" ) ) );
891        assertTrue( actual.contains( new Tuple( "2\tb\t2\tB" ) ) );
892        }
893    
894      @Test
895      public void testJoinGroupBy() throws Exception
896        {
897        getPlatform().copyFromLocal( inputFileLower );
898        getPlatform().copyFromLocal( inputFileUpper );
899    
900        Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
901        Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper );
902    
903        Map sources = new HashMap();
904    
905        sources.put( "lower", sourceLower );
906        sources.put( "upper", sourceUpper );
907    
908        Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joingroupby" ), SinkMode.REPLACE );
909    
910        Function splitterLower = new RegexSplitter( new Fields( "numA", "lower" ), " " );
911        Function splitterUpper = new RegexSplitter( new Fields( "numB", "upper" ), " " );
912    
913        Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitterLower );
914        Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitterUpper );
915    
916        Pipe pipe = new HashJoin( pipeLower, new Fields( "numA" ), pipeUpper, new Fields( "numB" ) );
917    
918        Pipe groupby = new GroupBy( pipe, new Fields( "numA" ) );
919    
920        Flow flow = getPlatform().getFlowConnector().connect( sources, sink, groupby );
921    
922        flow.complete();
923    
924        validateLength( flow, 5, null );
925    
926        List<Tuple> actual = getSinkAsList( flow );
927    
928        assertTrue( actual.contains( new Tuple( "1\ta\t1\tA" ) ) );
929        assertTrue( actual.contains( new Tuple( "2\tb\t2\tB" ) ) );
930        }
931    
932      @Test
933      public void testJoinSamePipe() throws Exception
934        {
935        getPlatform().copyFromLocal( inputFileLower );
936    
937        Tap source = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
938    
939        Map sources = new HashMap();
940    
941        sources.put( "lower", source );
942    
943        Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "samepipe" ), SinkMode.REPLACE );
944    
945        Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );
946    
947        Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
948    
949        Pipe pipe = new HashJoin( pipeLower, new Fields( "num" ), 1, new Fields( "num1", "char1", "num2", "char2" ) );
950    
951        Flow flow = getPlatform().getFlowConnector().connect( sources, sink, pipe );
952    
953        flow.complete();
954    
955        validateLength( flow, 5, null );
956    
957        List<Tuple> actual = getSinkAsList( flow );
958    
959        assertTrue( actual.contains( new Tuple( "1\ta\t1\ta" ) ) );
960        assertTrue( actual.contains( new Tuple( "2\tb\t2\tb" ) ) );
961        }
962    
963      @Test
964      public void testJoinSamePipe2() throws Exception
965        {
966        getPlatform().copyFromLocal( inputFileLower );
967    
968        Tap source = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
969    
970        Map sources = new HashMap();
971    
972        sources.put( "lower", source );
973    
974        Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "samepipe2" ), SinkMode.REPLACE );
975    
976        Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );
977    
978        Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
979    
980        Pipe join = new HashJoin( pipeLower, new Fields( "num" ), pipeLower, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2" ) );
981    
982        Flow flow = getPlatform().getFlowConnector().connect( sources, sink, join );
983    
984        flow.complete();
985    
986        validateLength( flow, 5, null );
987    
988        List<Tuple> actual = getSinkAsList( flow );
989    
990        assertTrue( actual.contains( new Tuple( "1\ta\t1\ta" ) ) );
991        assertTrue( actual.contains( new Tuple( "2\tb\t2\tb" ) ) );
992        }
993    
994      @Test
995      public void testJoinSamePipe3() throws Exception
996        {
997        getPlatform().copyFromLocal( inputFileLower );
998    
999        Tap source = getPlatform().getDelimitedFile( new Fields( "num", "char" ), " ", inputFileLower );
1000    
1001        Map sources = new HashMap();
1002    
1003        sources.put( "lower", source );
1004    
1005        Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "samepipe3" ), SinkMode.REPLACE );
1006    
1007        Pipe pipe = new Pipe( "lower" );
1008    
1009        Pipe lhs = new Pipe( "lhs", pipe );
1010        Pipe rhs = new Pipe( "rhs", pipe );
1011    
1012        Pipe join = new HashJoin( lhs, new Fields( "num" ), rhs, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2" ) );
1013    
1014        Flow flow = getPlatform().getFlowConnector().connect( sources, sink, join );
1015    
1016        flow.complete();
1017    
1018        validateLength( flow, 5, null );
1019    
1020        List<Tuple> actual = getSinkAsList( flow );
1021    
1022        assertTrue( actual.contains( new Tuple( "1\ta\t1\ta" ) ) );
1023        assertTrue( actual.contains( new Tuple( "2\tb\t2\tb" ) ) );
1024        }
1025    
1026      /**
1027       * Same source as rightmost
1028       * <p/>
1029       * should be a single job as the same file accumulates into the joins
1030       *
1031       * @throws Exception
1032       */
1033      @Test
1034      public void testJoinAroundJoinRightMost() throws Exception
1035        {
1036        getPlatform().copyFromLocal( inputFileLower );
1037        getPlatform().copyFromLocal( inputFileUpper );
1038    
1039        Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
1040        Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper );
1041    
1042        Map sources = new HashMap();
1043    
1044        sources.put( "lower", sourceLower );
1045        sources.put( "upper1", sourceUpper );
1046        sources.put( "upper2", sourceUpper );
1047    
1048        Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joinaroundjoinrightmost" ), SinkMode.REPLACE );
1049    
1050        Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );
1051    
1052        Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
1053        Pipe pipeUpper1 = new Each( new Pipe( "upper1" ), new Fields( "line" ), splitter );
1054        Pipe pipeUpper2 = new Each( new Pipe( "upper2" ), new Fields( "line" ), splitter );
1055    
1056        Pipe splice1 = new HashJoin( pipeLower, new Fields( "num" ), pipeUpper1, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2" ) );
1057    
1058        splice1 = new Each( splice1, new Identity() );
1059    
1060        Pipe splice2 = new HashJoin( splice1, new Fields( "num1" ), pipeUpper2, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2", "num3", "char3" ) );
1061    
1062        Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice2 );
1063    
1064    //    flow.writeDOT( "joinaroundrightmost.dot" );
1065    
1066        if( getPlatform().isMapReduce() )
1067          assertEquals( "wrong number of steps", 1, flow.getFlowSteps().size() );
1068    
1069        flow.complete();
1070    
1071        validateLength( flow, 5, null );
1072    
1073        List<Tuple> actual = getSinkAsList( flow );
1074    
1075        assertTrue( actual.contains( new Tuple( "1\ta\t1\tA\t1\tA" ) ) );
1076        assertTrue( actual.contains( new Tuple( "2\tb\t2\tB\t2\tB" ) ) );
1077        }
1078    
1079      /**
1080       * Same source as leftmost
1081       *
1082       * @throws Exception
1083       */
1084      @Test
1085      public void testJoinAroundJoinLeftMost() throws Exception
1086        {
1087        getPlatform().copyFromLocal( inputFileLower );
1088        getPlatform().copyFromLocal( inputFileUpper );
1089    
1090        Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
1091        Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper );
1092    
1093        Map sources = new HashMap();
1094    
1095        sources.put( "lower", sourceLower );
1096        sources.put( "upper1", sourceUpper );
1097        sources.put( "upper2", sourceUpper );
1098    
1099        Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joinaroundjoinleftmost" ), SinkMode.REPLACE );
1100    
1101        Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );
1102    
1103        Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
1104        Pipe pipeUpper1 = new Each( new Pipe( "upper1" ), new Fields( "line" ), splitter );
1105        Pipe pipeUpper2 = new Each( new Pipe( "upper2" ), new Fields( "line" ), splitter );
1106    
1107        Pipe splice1 = new HashJoin( pipeUpper1, new Fields( "num" ), pipeUpper2, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2" ) );
1108    
1109        splice1 = new Each( splice1, new Identity() );
1110    
1111        Pipe splice2 = new HashJoin( splice1, new Fields( "num1" ), pipeLower, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2", "num3", "char3" ) );
1112    
1113        Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice2 );
1114    
1115    //    flow.writeDOT( "joinaroundleftmost.dot" );
1116    
1117        if( getPlatform().isMapReduce() )
1118          assertEquals( "wrong number of steps", 2, flow.getFlowSteps().size() );
1119    
1120        flow.complete();
1121    
1122        validateLength( flow, 5, null );
1123    
1124        List<Tuple> actual = getSinkAsList( flow );
1125    
1126        assertTrue( actual.contains( new Tuple( "1\tA\t1\tA\t1\ta" ) ) );
1127        assertTrue( actual.contains( new Tuple( "2\tB\t2\tB\t2\tb" ) ) );
1128        }
1129    
1130      /**
1131       * Upper as leftmost and rightmost forcing two jobs
1132       *
1133       * @throws Exception
1134       */
1135      @Test
1136      public void testJoinAroundJoinRightMostSwapped() throws Exception
1137        {
1138        getPlatform().copyFromLocal( inputFileLower );
1139        getPlatform().copyFromLocal( inputFileUpper );
1140    
1141        Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
1142        Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper );
1143    
1144        Map sources = new HashMap();
1145    
1146        sources.put( "lower", sourceLower );
1147        sources.put( "upper1", sourceUpper );
1148        sources.put( "upper2", sourceUpper );
1149    
1150        Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joinaroundjoinswapped" ), SinkMode.REPLACE );
1151    
1152        Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );
1153    
1154        Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
1155        Pipe pipeUpper1 = new Each( new Pipe( "upper1" ), new Fields( "line" ), splitter );
1156        Pipe pipeUpper2 = new Each( new Pipe( "upper2" ), new Fields( "line" ), splitter );
1157    
1158        Pipe splice1 = new HashJoin( pipeLower, new Fields( "num" ), pipeUpper1, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2" ) );
1159    
1160        splice1 = new Each( splice1, new Identity() );
1161    
1162        // upper2 becomes leftmost, forcing a tap between the joins
1163        Pipe splice2 = new HashJoin( pipeUpper2, new Fields( "num" ), splice1, new Fields( "num1" ), new Fields( "num1", "char1", "num2", "char2", "num3", "char3" ) );
1164    
1165        Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice2 );
1166    
1167        if( getPlatform().isMapReduce() )
1168          assertEquals( "wrong number of steps", 2, flow.getFlowSteps().size() );
1169    
1170        flow.complete();
1171    
1172        validateLength( flow, 5, null );
1173    
1174        List<Tuple> actual = getSinkAsList( flow );
1175    
1176        assertTrue( actual.contains( new Tuple( "1\tA\t1\ta\t1\tA" ) ) );
1177        assertTrue( actual.contains( new Tuple( "2\tB\t2\tb\t2\tB" ) ) );
1178        }
1179    
1180      @Test
1181      public void testJoinGroupByJoin() throws Exception
1182        {
1183        getPlatform().copyFromLocal( inputFileLower );
1184        getPlatform().copyFromLocal( inputFileUpper );
1185        getPlatform().copyFromLocal( inputFileJoined );
1186    
1187        Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
1188        Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper );
1189        Tap sourceJoined = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileJoined );
1190    
1191        Map sources = new HashMap();
1192    
1193        sources.put( "lower", sourceLower );
1194        sources.put( "upper", sourceUpper );
1195        sources.put( "joined", sourceJoined );
1196    
1197        Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joingroupbyjoin" ), SinkMode.REPLACE );
1198    
1199        Function splitterLower = new RegexSplitter( new Fields( "numA", "lower" ), " " );
1200        Function splitterUpper = new RegexSplitter( new Fields( "numB", "upper" ), " " );
1201        Function splitterJoined = new RegexSplitter( new Fields( "numC", "lowerC", "upperC" ), "\t" );
1202    
1203        Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitterLower );
1204        Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitterUpper );
1205        Pipe pipeJoined = new Each( new Pipe( "joined" ), new Fields( "line" ), splitterJoined );
1206    
1207        Pipe pipe = new HashJoin( pipeLower, new Fields( "numA" ), pipeUpper, new Fields( "numB" ) );
1208    
1209        pipe = new GroupBy( pipe, new Fields( "numA" ) );
1210    
1211        pipe = new HashJoin( pipe, new Fields( "numA" ), pipeJoined, new Fields( "numC" ) );
1212    
1213        Flow flow = getPlatform().getFlowConnector().connect( sources, sink, pipe );
1214    
1215        if( getPlatform().isMapReduce() )
1216          assertEquals( "wrong number of steps", 2, flow.getFlowSteps().size() );
1217    
1218        flow.complete();
1219    
1220        validateLength( flow, 5, null );
1221    
1222        List<Tuple> actual = getSinkAsList( flow );
1223    
1224        assertTrue( actual.contains( new Tuple( "1\ta\t1\tA\t1\ta\tA" ) ) );
1225        assertTrue( actual.contains( new Tuple( "2\tb\t2\tB\t2\tb\tB" ) ) );
1226        }
1227    
1228      /**
1229       * here the same file is fed into the same HashJoin.
1230       * <p/>
1231       * This is three jobs.
1232       * <p/>
1233       * a temp tap is inserted before the accumulated branch for two reasons on the common HashJoin
1234       * <p/>
1235       * it is assumed the accumulated side is filtered down, so pushing to disk will preserve io
1236       * if accumulated side was streamed instead via a fork, only part of the file will accumulate into the HashJoin
1237       * <p/>
1238       * /-T-\ <-- accumulated
1239       * T      HJ
1240       * \---/ <-- streamed
1241       *
1242       * @throws Exception
1243       */
1244      @Test
1245      public void testJoinSameSourceIntoJoin() throws Exception
1246        {
1247        getPlatform().copyFromLocal( inputFileLower );
1248        getPlatform().copyFromLocal( inputFileUpper );
1249    
1250        Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
1251        Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper );
1252    
1253        Map sources = new HashMap();
1254    
1255        sources.put( "lower", sourceLower );
1256        sources.put( "upper1", sourceUpper );
1257        sources.put( "upper2", sourceUpper );
1258    
1259        Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joinsamesourceintojoin" ), SinkMode.REPLACE );
1260    
1261        Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );
1262    
1263        Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
1264        Pipe pipeUpper1 = new Each( new Pipe( "upper1" ), new Fields( "line" ), splitter );
1265        Pipe pipeUpper2 = new Each( new Pipe( "upper2" ), new Fields( "line" ), splitter );
1266    
1267        Pipe splice1 = new HashJoin( pipeUpper1, new Fields( "num" ), pipeUpper2, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2" ) );
1268    
1269        splice1 = new Each( splice1, new Identity() );
1270    
1271        Pipe splice2 = new HashJoin( pipeLower, new Fields( "num" ), splice1, new Fields( "num1" ), new Fields( "num1", "char1", "num2", "char2", "num3", "char3" ) );
1272    
1273        Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice2 );
1274    
1275    //    flow.writeDOT( "joinsamesourceintojoin.dot" );
1276    
1277        if( getPlatform().isMapReduce() )
1278          assertEquals( "wrong number of steps", 3, flow.getFlowSteps().size() );
1279    
1280        flow.complete();
1281    
1282        validateLength( flow, 5, null );
1283    
1284        List<Tuple> actual = getSinkAsList( flow );
1285    
1286        assertTrue( actual.contains( new Tuple( "1\ta\t1\tA\t1\tA" ) ) );
1287        assertTrue( actual.contains( new Tuple( "2\tb\t2\tB\t2\tB" ) ) );
1288        }
1289    
1290      /**
1291       * Tests that two independent streamed sources with loadable tributaries properly plan into a GroupBy
1292       * without loading unused sources
1293       *
1294       * @throws Exception
1295       */
1296      @Test
1297      public void testJoinsIntoGroupBy() throws Exception
1298        {
1299        getPlatform().copyFromLocal( inputFileLower );
1300        getPlatform().copyFromLocal( inputFileUpper );
1301    
1302        getPlatform().copyFromLocal( inputFileLhs );
1303        getPlatform().copyFromLocal( inputFileRhs );
1304    
1305        Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
1306        Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper );
1307    
1308        Tap sourceLhs = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLhs );
1309        Tap sourceRhs = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileRhs );
1310    
1311        Map sources = new HashMap();
1312    
1313        sources.put( "lower", sourceLower );
1314        sources.put( "upper", sourceUpper );
1315        sources.put( "lhs", sourceLhs );
1316        sources.put( "rhs", sourceRhs );
1317    
1318        Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joinsintogroupby" ), SinkMode.REPLACE );
1319    
1320        Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );
1321    
1322        Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
1323        Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );
1324    
1325        Pipe pipeLhs = new Each( new Pipe( "lhs" ), new Fields( "line" ), splitter );
1326        Pipe pipeRhs = new Each( new Pipe( "rhs" ), new Fields( "line" ), splitter );
1327    
1328        Pipe upperLower = new HashJoin( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2" ) );
1329    
1330        upperLower = new Each( upperLower, new Identity() );
1331    
1332        Pipe lhsRhs = new HashJoin( pipeLhs, new Fields( "num" ), pipeRhs, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2" ) );
1333    
1334        lhsRhs = new Each( lhsRhs, new Identity() );
1335    
1336        Pipe grouped = new GroupBy( "merging", Pipe.pipes( upperLower, lhsRhs ), new Fields( "num1" ) );
1337    
1338        Flow flow = getPlatform().getFlowConnector().connect( sources, sink, grouped );
1339    
1340        if( getPlatform().isMapReduce() )
1341          assertEquals( "wrong number of steps", 1, flow.getFlowSteps().size() );
1342    
1343        flow.complete();
1344    
1345        validateLength( flow, 42, null );
1346    
1347        List<Tuple> actual = getSinkAsList( flow );
1348    
1349        assertTrue( actual.contains( new Tuple( "1\ta\t1\tA" ) ) );
1350        assertTrue( actual.contains( new Tuple( "5\te\t5\tE" ) ) );
1351        }
1352    
1353      @Test
1354      public void testJoinSamePipeAroundGroupBy() throws Exception
1355        {
1356        getPlatform().copyFromLocal( inputFileLower );
1357    
1358        Tap source = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
1359        Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "samepipearoundgroupby" ), SinkMode.REPLACE );
1360    
1361        Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );
1362    
1363        Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
1364    
1365        Pipe lhsPipe = new Each( new Pipe( "lhs", pipeLower ), new Identity() );
1366    
1367        Pipe rhsPipe = new Each( new Pipe( "rhs", pipeLower ), new Identity() );
1368    
1369        rhsPipe = new GroupBy( rhsPipe, new Fields( "num" ) );
1370    
1371        rhsPipe = new Each( rhsPipe, new Identity() );
1372    
1373        Pipe pipe = new HashJoin( lhsPipe, new Fields( "num" ), rhsPipe, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2" ) );
1374    
1375        Flow flow = getPlatform().getFlowConnector().connect( source, sink, pipe );
1376    
1377        flow.complete();
1378    
1379        validateLength( flow, 5, null );
1380    
1381        List<Tuple> actual = getSinkAsList( flow );
1382    
1383        assertTrue( actual.contains( new Tuple( "1\ta\t1\ta" ) ) );
1384        assertTrue( actual.contains( new Tuple( "2\tb\t2\tb" ) ) );
1385        }
1386    
1387      /**
1388       * This test results in two MR jobs because one join feeds into the accumulated side of the second. A mapper
1389       * can only stream on branch at a time forcing a temp file between the mappers. see next test for swapped join
1390       *
1391       * @throws Exception
1392       */
1393      @Test
1394      public void testJoinsIntoCoGroupLhs() throws Exception
1395        {
1396        getPlatform().copyFromLocal( inputFileLower );
1397        getPlatform().copyFromLocal( inputFileUpper );
1398    
1399        getPlatform().copyFromLocal( inputFileLhs );
1400        getPlatform().copyFromLocal( inputFileRhs );
1401    
1402        Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
1403        Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper );
1404    
1405        Tap sourceLhs = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLhs );
1406        Tap sourceRhs = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileRhs );
1407    
1408        Map sources = new HashMap();
1409    
1410        sources.put( "lower", sourceLower );
1411        sources.put( "upper", sourceUpper );
1412        sources.put( "lhs", sourceLhs );
1413        sources.put( "rhs", sourceRhs );
1414    
1415        Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joinsintocogrouplhs" ), SinkMode.REPLACE );
1416    
1417        Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );
1418    
1419        Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
1420        Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );
1421    
1422        Pipe pipeLhs = new Each( new Pipe( "lhs" ), new Fields( "line" ), splitter );
1423        Pipe pipeRhs = new Each( new Pipe( "rhs" ), new Fields( "line" ), splitter );
1424    
1425        Pipe upperLower = new HashJoin( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), new Fields( "numUpperLower", "charUpperLower", "num2UpperLower", "char2UpperLower" ) );
1426    
1427        upperLower = new Each( upperLower, new Identity() );
1428    
1429        Pipe lhsUpperLower = new HashJoin( pipeLhs, new Fields( "num" ), upperLower, new Fields( "numUpperLower" ), new Fields( "numLhs", "charLhs", "numUpperLower", "charUpperLower", "num2UpperLower", "char2UpperLower" ) );
1430    
1431        lhsUpperLower = new Each( lhsUpperLower, new Identity() );
1432    
1433        Pipe grouped = new CoGroup( "cogrouping", lhsUpperLower, new Fields( "numLhs" ), pipeRhs, new Fields( "num" ) );
1434    
1435        Flow flow = getPlatform().getFlowConnector().connect( sources, sink, grouped );
1436    
1437        if( getPlatform().isMapReduce() )
1438          assertEquals( "wrong number of steps", 2, flow.getFlowSteps().size() );
1439    
1440        flow.complete();
1441    
1442        validateLength( flow, 37, null );
1443    
1444        List<Tuple> actual = getSinkAsList( flow );
1445    
1446        assertTrue( actual.contains( new Tuple( "1\ta\t1\ta\t1\tA\t1\tA" ) ) );
1447        assertTrue( actual.contains( new Tuple( "5\ta\t5\te\t5\tE\t5\tA" ) ) );
1448        }
1449    
1450      /**
1451       * This test results in one MR jobs because one join feeds into the streamed side of the second.
1452       *
1453       * @throws Exception
1454       */
1455      @Test
1456      public void testJoinsIntoCoGroupLhsSwappedJoin() throws Exception
1457        {
1458        getPlatform().copyFromLocal( inputFileLower );
1459        getPlatform().copyFromLocal( inputFileUpper );
1460    
1461        getPlatform().copyFromLocal( inputFileLhs );
1462        getPlatform().copyFromLocal( inputFileRhs );
1463    
1464        Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
1465        Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper );
1466    
1467        Tap sourceLhs = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLhs );
1468        Tap sourceRhs = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileRhs );
1469    
1470        Map sources = new HashMap();
1471    
1472        sources.put( "lower", sourceLower );
1473        sources.put( "upper", sourceUpper );
1474        sources.put( "lhs", sourceLhs );
1475        sources.put( "rhs", sourceRhs );
1476    
1477        Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joinsintocogrouplhsswappedjoin" ), SinkMode.REPLACE );
1478    
1479        Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );
1480    
1481        Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
1482        Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );
1483    
1484        Pipe pipeLhs = new Each( new Pipe( "lhs" ), new Fields( "line" ), splitter );
1485        Pipe pipeRhs = new Each( new Pipe( "rhs" ), new Fields( "line" ), splitter );
1486    
1487        Pipe upperLower = new HashJoin( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), new Fields( "numUpperLower", "charUpperLower", "num2UpperLower", "char2UpperLower" ) );
1488    
1489        upperLower = new Each( upperLower, new Identity() );
1490    
1491        Pipe lhsUpperLower = new HashJoin( upperLower, new Fields( "numUpperLower" ), pipeLhs, new Fields( "num" ), new Fields( "numUpperLower", "charUpperLower", "num2UpperLower", "char2UpperLower", "numLhs", "charLhs" ) );
1492    
1493        lhsUpperLower = new Each( lhsUpperLower, new Identity() );
1494    
1495        Pipe grouped = new CoGroup( "cogrouping", lhsUpperLower, new Fields( "numLhs" ), pipeRhs, new Fields( "num" ) );
1496    
1497        Flow flow = getPlatform().getFlowConnector().connect( sources, sink, grouped );
1498    
1499        if( getPlatform().isMapReduce() )
1500          assertEquals( "wrong number of steps", 1, flow.getFlowSteps().size() );
1501    
1502        flow.complete();
1503    
1504        validateLength( flow, 37, null );
1505    
1506        List<Tuple> actual = getSinkAsList( flow );
1507    
1508        assertTrue( actual.contains( new Tuple( "1\ta\t1\tA\t1\ta\t1\tA" ) ) );
1509        assertTrue( actual.contains( new Tuple( "5\te\t5\tE\t5\te\t5\tE" ) ) );
1510        }
1511    
1512      @Test
1513      public void testJoinsIntoCoGroupRhs() throws Exception
1514        {
1515        getPlatform().copyFromLocal( inputFileLower );
1516        getPlatform().copyFromLocal( inputFileUpper );
1517    
1518        getPlatform().copyFromLocal( inputFileLhs );
1519        getPlatform().copyFromLocal( inputFileRhs );
1520    
1521        Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
1522        Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper );
1523    
1524        Tap sourceLhs = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLhs );
1525        Tap sourceRhs = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileRhs );
1526    
1527        Map sources = new HashMap();
1528    
1529        sources.put( "lower", sourceLower );
1530        sources.put( "upper", sourceUpper );
1531        sources.put( "lhs", sourceLhs );
1532        sources.put( "rhs", sourceRhs );
1533    
1534        Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joinsintocogrouprhs" ), SinkMode.REPLACE );
1535    
1536        Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );
1537    
1538        Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
1539        Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );
1540    
1541        Pipe pipeLhs = new Each( new Pipe( "lhs" ), new Fields( "line" ), splitter );
1542        Pipe pipeRhs = new Each( new Pipe( "rhs" ), new Fields( "line" ), splitter );
1543    
1544        Pipe upperLower = new HashJoin( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), new Fields( "numUpperLower", "charUpperLower", "num2UpperLower", "char2UpperLower" ) );
1545    
1546        upperLower = new Each( upperLower, new Identity() );
1547    
1548        Pipe lhsUpperLower = new HashJoin( pipeLhs, new Fields( "num" ), upperLower, new Fields( "numUpperLower" ), new Fields( "numLhs", "charLhs", "numUpperLower", "charUpperLower", "num2UpperLower", "char2UpperLower" ) );
1549    
1550        lhsUpperLower = new Each( lhsUpperLower, new Identity() );
1551    
1552        Pipe grouped = new CoGroup( "cogrouping", pipeRhs, new Fields( "num" ), lhsUpperLower, new Fields( "numLhs" ) );
1553    
1554        Flow flow = getPlatform().getFlowConnector().connect( sources, sink, grouped );
1555    
1556        if( getPlatform().isMapReduce() )
1557          assertEquals( "wrong number of steps", 2, flow.getFlowSteps().size() );
1558    
1559        flow.complete();
1560    
1561        validateLength( flow, 37, null );
1562    
1563        List<Tuple> actual = getSinkAsList( flow );
1564    
1565        assertTrue( actual.contains( new Tuple( "1\tA\t1\ta\t1\ta\t1\tA" ) ) );
1566        assertTrue( actual.contains( new Tuple( "5\tE\t5\te\t5\te\t5\tE" ) ) );
1567        }
1568    
1569      @Test
1570      public void testJoinsIntoCoGroup() throws Exception
1571        {
1572        getPlatform().copyFromLocal( inputFileLower );
1573        getPlatform().copyFromLocal( inputFileUpper );
1574    
1575        getPlatform().copyFromLocal( inputFileLhs );
1576        getPlatform().copyFromLocal( inputFileRhs );
1577    
1578        Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
1579        Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper );
1580    
1581        Tap sourceLhs = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLhs );
1582        Tap sourceRhs = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileRhs );
1583    
1584        Map sources = new HashMap();
1585    
1586        sources.put( "lower", sourceLower );
1587        sources.put( "upper", sourceUpper );
1588        sources.put( "lhs", sourceLhs );
1589        sources.put( "rhs", sourceRhs );
1590    
1591        Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joinsintocogroup" ), SinkMode.REPLACE );
1592    
1593        Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );
1594    
1595        Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
1596        Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );
1597    
1598        Pipe pipeLhs = new Each( new Pipe( "lhs" ), new Fields( "line" ), splitter );
1599        Pipe pipeRhs = new Each( new Pipe( "rhs" ), new Fields( "line" ), splitter );
1600    
1601        Pipe upperLower = new HashJoin( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), new Fields( "numUpperLower1", "charUpperLower1", "numUpperLower2", "charUpperLower2" ) );
1602    
1603        upperLower = new Each( upperLower, new Identity() );
1604    
1605        Pipe lhsRhs = new HashJoin( pipeLhs, new Fields( "num" ), pipeRhs, new Fields( "num" ), new Fields( "numLhsRhs1", "charLhsRhs1", "numLhsRhs2", "charLhsRhs2" ) );
1606    
1607        lhsRhs = new Each( lhsRhs, new Identity() );
1608    
1609        Pipe grouped = new CoGroup( "cogrouping", upperLower, new Fields( "numUpperLower1" ), lhsRhs, new Fields( "numLhsRhs1" ) );
1610    
1611        Flow flow = getPlatform().getFlowConnector().connect( sources, sink, grouped );
1612    
1613        if( getPlatform().isMapReduce() )
1614          assertEquals( "wrong number of steps", 1, flow.getFlowSteps().size() );
1615    
1616        flow.complete();
1617    
1618        validateLength( flow, 37, null );
1619    
1620        List<Tuple> actual = getSinkAsList( flow );
1621    
1622        assertTrue( actual.contains( new Tuple( "1\ta\t1\tA\t1\ta\t1\tA" ) ) );
1623        assertTrue( actual.contains( new Tuple( "5\te\t5\tE\t5\te\t5\tE" ) ) );
1624        }
1625    
1626      public static class AllComparator implements Comparator<Comparable>, Hasher<Comparable>, Serializable
1627        {
1628    
1629        @Override
1630        public int compare( Comparable lhs, Comparable rhs )
1631          {
1632          return lhs.toString().compareTo( rhs.toString() );
1633          }
1634    
1635        @Override
1636        public int hashCode( Comparable value )
1637          {
1638          if( value == null )
1639            return 0;
1640    
1641          return value.toString().hashCode();
1642          }
1643        }
1644    
1645      /**
1646       * Tests Hasher being honored even if default comparator is null.
1647       *
1648       * @throws Exception
1649       */
1650      @Test
1651      public void testJoinWithHasher() throws Exception
1652        {
1653        getPlatform().copyFromLocal( inputFileLower );
1654        getPlatform().copyFromLocal( inputFileUpper );
1655    
1656        Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
1657        Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper );
1658    
1659        Map sources = new HashMap();
1660    
1661        sources.put( "lower", sourceLower );
1662        sources.put( "upper", sourceUpper );
1663    
1664        Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joinhasher" ), SinkMode.REPLACE );
1665    
1666        Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );
1667    
1668        Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
1669    
1670        pipeLower = new Each( pipeLower, new Fields( "num" ), new ExpressionFunction( Fields.ARGS, "Integer.parseInt( num )", String.class ), Fields.REPLACE );
1671    
1672        Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );
1673    
1674        Fields num = new Fields( "num" );
1675        num.setComparator( "num", new AllComparator() );
1676    
1677        Pipe splice = new HashJoin( pipeLower, num, pipeUpper, new Fields( "num" ), Fields.size( 4 ) );
1678    
1679        Map<Object, Object> properties = getProperties();
1680    
1681        Flow flow = getPlatform().getFlowConnector( properties ).connect( sources, sink, splice );
1682    
1683        flow.complete();
1684    
1685        validateLength( flow, 5 );
1686    
1687        List<Tuple> values = getSinkAsList( flow );
1688    
1689        assertTrue( values.contains( new Tuple( "1\ta\t1\tA" ) ) );
1690        assertTrue( values.contains( new Tuple( "2\tb\t2\tB" ) ) );
1691        }
1692    
1693      @Test
1694      public void testJoinNone() throws Exception
1695        {
1696        getPlatform().copyFromLocal( inputFileLower );
1697        getPlatform().copyFromLocal( inputFileUpper );
1698    
1699        Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
1700        Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper );
1701    
1702        Map sources = new HashMap();
1703    
1704        sources.put( "lower", sourceLower );
1705        sources.put( "upper", sourceUpper );
1706    
1707        Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joinnone" ), SinkMode.REPLACE );
1708    
1709        Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );
1710    
1711        Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
1712        Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );
1713    
1714        Pipe splice = new HashJoin( pipeLower, Fields.NONE, pipeUpper, Fields.NONE, Fields.size( 4 ) );
1715    
1716        Map<Object, Object> properties = getProperties();
1717    
1718        Flow flow = getPlatform().getFlowConnector( properties ).connect( sources, sink, splice );
1719    
1720        flow.complete();
1721    
1722        validateLength( flow, 25 );
1723    
1724        List<Tuple> values = getSinkAsList( flow );
1725    
1726        assertTrue( values.contains( new Tuple( "1\ta\t1\tA" ) ) );
1727        assertTrue( values.contains( new Tuple( "1\ta\t2\tB" ) ) );
1728        assertTrue( values.contains( new Tuple( "2\tb\t2\tB" ) ) );
1729        }
1730      }