/*
 * Copyright (c) 2007-2014 Concurrent, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cascading;

import java.io.Serializable;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import cascading.flow.Flow;
import cascading.operation.Function;
import cascading.operation.Identity;
import cascading.operation.aggregator.First;
import cascading.operation.expression.ExpressionFunction;
import cascading.operation.regex.RegexFilter;
import cascading.operation.regex.RegexSplitter;
import cascading.pipe.CoGroup;
import cascading.pipe.Each;
import cascading.pipe.Every;
import cascading.pipe.GroupBy;
import cascading.pipe.HashJoin;
import cascading.pipe.Pipe;
import cascading.pipe.joiner.InnerJoin;
import cascading.pipe.joiner.Joiner;
import cascading.pipe.joiner.LeftJoin;
import cascading.pipe.joiner.MixedJoin;
import cascading.pipe.joiner.OuterJoin;
import cascading.pipe.joiner.RightJoin;
import cascading.tap.SinkMode;
import cascading.tap.Tap;
import cascading.tuple.Fields;
import cascading.tuple.Hasher;
import cascading.tuple.Tuple;
055 import org.junit.Test; 056 057 import static data.InputData.*; 058 059 060 public class JoinFieldedPipesPlatformTest extends PlatformTestCase 061 { 062 public JoinFieldedPipesPlatformTest() 063 { 064 super( true, 4, 1 ); // leave cluster testing enabled 065 } 066 067 @Test 068 public void testCross() throws Exception 069 { 070 getPlatform().copyFromLocal( inputFileLhs ); 071 getPlatform().copyFromLocal( inputFileRhs ); 072 073 Map sources = new HashMap(); 074 075 sources.put( "lhs", getPlatform().getTextFile( inputFileLhs ) ); 076 sources.put( "rhs", getPlatform().getTextFile( inputFileRhs ) ); 077 078 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "cross" ), SinkMode.REPLACE ); 079 080 Pipe pipeLower = new Each( "lhs", new Fields( "line" ), new RegexSplitter( new Fields( "numLHS", "charLHS" ), " " ) ); 081 Pipe pipeUpper = new Each( "rhs", new Fields( "line" ), new RegexSplitter( new Fields( "numRHS", "charRHS" ), " " ) ); 082 083 Pipe cross = new HashJoin( pipeLower, new Fields( "numLHS" ), pipeUpper, new Fields( "numRHS" ), new InnerJoin() ); 084 085 Flow flow = getPlatform().getFlowConnector().connect( sources, sink, cross ); 086 087 flow.complete(); 088 089 validateLength( flow, 37, null ); 090 091 List<Tuple> values = getSinkAsList( flow ); 092 093 assertTrue( values.contains( new Tuple( "1\ta\t1\tA" ) ) ); 094 assertTrue( values.contains( new Tuple( "1\ta\t1\tB" ) ) ); 095 } 096 097 @Test 098 public void testJoin() throws Exception 099 { 100 getPlatform().copyFromLocal( inputFileLower ); 101 getPlatform().copyFromLocal( inputFileUpper ); 102 103 Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ); 104 Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper ); 105 106 Map sources = new HashMap(); 107 108 sources.put( "lower", sourceLower ); 109 sources.put( "upper", sourceUpper ); 110 111 Tap sink = getPlatform().getTextFile( new Fields( "line" ), 
getOutputPath( "join" ), SinkMode.REPLACE ); 112 113 Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); 114 115 Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter ); 116 Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter ); 117 118 Pipe splice = new HashJoin( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), Fields.size( 4 ) ); 119 120 Map<Object, Object> properties = getProperties(); 121 122 Flow flow = getPlatform().getFlowConnector( properties ).connect( sources, sink, splice ); 123 124 flow.complete(); 125 126 validateLength( flow, 5 ); 127 128 List<Tuple> values = getSinkAsList( flow ); 129 130 assertTrue( values.contains( new Tuple( "1\ta\t1\tA" ) ) ); 131 assertTrue( values.contains( new Tuple( "2\tb\t2\tB" ) ) ); 132 } 133 134 @Test 135 public void testJoinSamePipeName() throws Exception 136 { 137 getPlatform().copyFromLocal( inputFileLower ); 138 getPlatform().copyFromLocal( inputFileUpper ); 139 140 Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ); 141 Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper ); 142 143 Map sources = new HashMap(); 144 145 sources.put( "lower", sourceLower ); 146 sources.put( "upper", sourceUpper ); 147 148 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "renamedpipes" ), SinkMode.REPLACE ); 149 150 Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); 151 152 Pipe pipeLower = new Pipe( "lower" ); 153 Pipe pipeUpper = new Pipe( "upper" ); 154 155 // these pipes will hide the source name, and could cause one to be lost 156 pipeLower = new Pipe( "same", pipeLower ); 157 pipeUpper = new Pipe( "same", pipeUpper ); 158 159 pipeLower = new Each( pipeLower, new Fields( "line" ), splitter ); 160 pipeUpper = new Each( pipeUpper, new Fields( "line" ), splitter ); 161 162 // pipeLower = new Each( pipeLower, new 
Fields( "num", "char" ), new Identity( new Fields( "num", "char" ) ) ); 163 // pipeUpper = new Each( pipeUpper, new Fields( "num", "char" ), new Identity( new Fields( "num", "char" ) ) ); 164 165 pipeLower = new Pipe( "left", pipeLower ); 166 pipeUpper = new Pipe( "right", pipeUpper ); 167 168 // pipeLower = new Each( pipeLower, new Debug( true ) ); 169 // pipeUpper = new Each( pipeUpper, new Debug( true ) ); 170 171 Pipe splice = new HashJoin( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), Fields.size( 4 ) ); 172 173 // splice = new Each( splice, new Debug( true ) ); 174 splice = new Pipe( "splice", splice ); 175 splice = new Pipe( "tail", splice ); 176 177 Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice ); 178 179 flow.complete(); 180 181 validateLength( flow, 5 ); 182 183 List<Tuple> values = getSinkAsList( flow ); 184 185 assertTrue( values.contains( new Tuple( "1\ta\t1\tA" ) ) ); 186 assertTrue( values.contains( new Tuple( "2\tb\t2\tB" ) ) ); 187 } 188 189 @Test 190 public void testJoinWithUnknowns() throws Exception 191 { 192 getPlatform().copyFromLocal( inputFileLower ); 193 getPlatform().copyFromLocal( inputFileUpper ); 194 195 Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ); 196 Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper ); 197 198 Map sources = new HashMap(); 199 200 sources.put( "lower", sourceLower ); 201 sources.put( "upper", sourceUpper ); 202 203 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "unknown" ), SinkMode.REPLACE ); 204 205 Function splitter = new RegexSplitter( Fields.UNKNOWN, " " ); 206 207 Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter ); 208 Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter ); 209 210 Pipe splice = new HashJoin( pipeLower, new Fields( 0 ), pipeUpper, new Fields( 0 ), Fields.size( 4 ) ); 211 212 
Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice ); 213 214 flow.complete(); 215 216 validateLength( flow, 5 ); 217 218 List<Tuple> values = getSinkAsList( flow ); 219 220 assertTrue( values.contains( new Tuple( "1\ta\t1\tA" ) ) ); 221 assertTrue( values.contains( new Tuple( "2\tb\t2\tB" ) ) ); 222 } 223 224 /** 225 * this test intentionally filters out all values so the intermediate tap is empty. this tap is cogrouped with 226 * a new stream using an outerjoin. 227 * 228 * @throws Exception 229 */ 230 @Test 231 public void testJoinFilteredBranch() throws Exception 232 { 233 getPlatform().copyFromLocal( inputFileLower ); 234 getPlatform().copyFromLocal( inputFileUpper ); 235 236 Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ); 237 Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper ); 238 239 Map sources = new HashMap(); 240 241 sources.put( "lower", sourceLower ); 242 sources.put( "upper", sourceUpper ); 243 244 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joinfilteredbranch" ), SinkMode.REPLACE ); 245 246 Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); 247 248 Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter ); 249 Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter ); 250 pipeUpper = new Each( pipeUpper, new Fields( "num" ), new RegexFilter( "^fobar" ) ); // intentionally filtering all 251 pipeUpper = new GroupBy( pipeUpper, new Fields( "num" ) ); 252 253 Pipe splice = new HashJoin( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), Fields.size( 4 ), new OuterJoin() ); 254 255 Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice ); 256 257 flow.complete(); 258 259 validateLength( flow, 5 ); 260 261 List<Tuple> values = getSinkAsList( flow ); 262 263 assertTrue( values.contains( new Tuple( 
"1\ta\tnull\tnull" ) ) ); 264 assertTrue( values.contains( new Tuple( "2\tb\tnull\tnull" ) ) ); 265 } 266 267 @Test 268 public void testJoinSelf() throws Exception 269 { 270 getPlatform().copyFromLocal( inputFileLower ); 271 272 Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ); 273 Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ); 274 275 Map sources = new HashMap(); 276 277 sources.put( "lower", sourceLower ); 278 sources.put( "upper", sourceUpper ); 279 280 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joinself" ), SinkMode.REPLACE ); 281 282 Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); 283 284 Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter ); 285 Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter ); 286 287 Pipe splice = new HashJoin( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), Fields.size( 4 ) ); 288 289 Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice ); 290 291 flow.complete(); 292 293 validateLength( flow, 5 ); 294 295 List<Tuple> values = getSinkAsList( flow ); 296 297 assertTrue( values.contains( new Tuple( "1\ta\t1\ta" ) ) ); 298 assertTrue( values.contains( new Tuple( "2\tb\t2\tb" ) ) ); 299 } 300 301 /** 302 * Method testCoGroupAfterEvery tests that a tmp tap is inserted after the Every in the cogroup join 303 * 304 * @throws Exception when 305 */ 306 @Test 307 public void testJoinAfterEvery() throws Exception 308 { 309 getPlatform().copyFromLocal( inputFileLower ); 310 getPlatform().copyFromLocal( inputFileUpper ); 311 312 Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ); 313 Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper ); 314 315 Map sources = new HashMap(); 316 317 sources.put( "lower", sourceLower ); 318 
sources.put( "upper", sourceUpper ); 319 320 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "afterevery" ), SinkMode.REPLACE ); 321 322 Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); 323 324 Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter ); 325 pipeLower = new GroupBy( pipeLower, new Fields( "num" ) ); 326 pipeLower = new Every( pipeLower, new Fields( "char" ), new First(), Fields.ALL ); 327 328 Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter ); 329 pipeUpper = new GroupBy( pipeUpper, new Fields( "num" ) ); 330 pipeUpper = new Every( pipeUpper, new Fields( "char" ), new First(), Fields.ALL ); 331 332 Pipe splice = new HashJoin( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), Fields.size( 4 ) ); 333 334 Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice ); 335 336 flow.complete(); 337 338 validateLength( flow, 5, null ); 339 340 List<Tuple> values = getSinkAsList( flow ); 341 342 assertTrue( values.contains( new Tuple( "1\ta\t1\tA" ) ) ); 343 assertTrue( values.contains( new Tuple( "2\tb\t2\tB" ) ) ); 344 } 345 346 @Test 347 public void testJoinInnerSingleField() throws Exception 348 { 349 getPlatform().copyFromLocal( inputFileLowerOffset ); 350 getPlatform().copyFromLocal( inputFileUpper ); 351 352 Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLowerOffset ); 353 Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper ); 354 355 Map sources = new HashMap(); 356 357 sources.put( "lower", sourceLower ); 358 sources.put( "upper", sourceUpper ); 359 360 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joininnersingle" ), SinkMode.REPLACE ); 361 362 Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), new RegexSplitter( new Fields( "num1", "char" ), " " ), new Fields( "num1" ) ); 363 Pipe 
pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), new RegexSplitter( new Fields( "num2", "char" ), " " ), new Fields( "num2" ) ); 364 365 Pipe join = new HashJoin( pipeLower, new Fields( "num1" ), pipeUpper, new Fields( "num2" ) ); 366 367 Flow flow = getPlatform().getFlowConnector().connect( sources, sink, join ); 368 369 flow.complete(); 370 371 validateLength( flow, 3, null ); 372 373 Set<Tuple> results = new HashSet<Tuple>(); 374 375 results.add( new Tuple( "1\t1" ) ); 376 results.add( new Tuple( "5\t5" ) ); 377 378 List<Tuple> actual = getSinkAsList( flow ); 379 380 results.removeAll( actual ); 381 382 assertEquals( 0, results.size() ); 383 } 384 385 /** 386 * 1 a1 387 * 1 a2 388 * 1 a3 389 * 2 b1 390 * 3 c1 391 * 4 d1 392 * 4 d2 393 * 4 d3 394 * 5 e1 395 * 5 e2 396 * 5 e3 397 * 7 g1 398 * 7 g2 399 * 7 g3 400 * 7 g4 401 * 7 g5 402 * null h1 403 * <p/> 404 * 1 A1 405 * 1 A2 406 * 1 A3 407 * 2 B1 408 * 2 B2 409 * 2 B3 410 * 4 D1 411 * 6 F1 412 * 6 F2 413 * null H1 414 * <p/> 415 * 1 a1 1 A1 416 * 1 a1 1 A2 417 * 1 a1 1 A3 418 * 1 a2 1 A1 419 * 1 a2 1 A2 420 * 1 a2 1 A3 421 * 1 a3 1 A1 422 * 1 a3 1 A2 423 * 1 a3 1 A3 424 * 2 b1 2 B1 425 * 2 b1 2 B2 426 * 2 b1 2 B3 427 * 4 d1 4 D1 428 * 4 d2 4 D1 429 * 4 d3 4 D1 430 * null h1 null H1 431 * 432 * @throws Exception 433 */ 434 @Test 435 public void testJoinInner() throws Exception 436 { 437 HashSet<Tuple> results = new HashSet<Tuple>(); 438 439 results.add( new Tuple( "1", "a1", "1", "A1" ) ); 440 results.add( new Tuple( "1", "a1", "1", "A2" ) ); 441 results.add( new Tuple( "1", "a1", "1", "A3" ) ); 442 results.add( new Tuple( "1", "a2", "1", "A1" ) ); 443 results.add( new Tuple( "1", "a2", "1", "A2" ) ); 444 results.add( new Tuple( "1", "a2", "1", "A3" ) ); 445 results.add( new Tuple( "1", "a3", "1", "A1" ) ); 446 results.add( new Tuple( "1", "a3", "1", "A2" ) ); 447 results.add( new Tuple( "1", "a3", "1", "A3" ) ); 448 results.add( new Tuple( "2", "b1", "2", "B1" ) ); 449 results.add( new Tuple( "2", 
"b1", "2", "B2" ) ); 450 results.add( new Tuple( "2", "b1", "2", "B3" ) ); 451 results.add( new Tuple( "4", "d1", "4", "D1" ) ); 452 results.add( new Tuple( "4", "d2", "4", "D1" ) ); 453 results.add( new Tuple( "4", "d3", "4", "D1" ) ); 454 results.add( new Tuple( null, "h1", null, "H1" ) ); 455 456 handleJoins( "joininner", new InnerJoin(), results ); 457 } 458 459 /** 460 * /** 461 * 1 a1 462 * 1 a2 463 * 1 a3 464 * 2 b1 465 * 3 c1 466 * 4 d1 467 * 4 d2 468 * 4 d3 469 * 5 e1 470 * 5 e2 471 * 5 e3 472 * 7 g1 473 * 7 g2 474 * 7 g3 475 * 7 g4 476 * 7 g5 477 * null h1 478 * <p/> 479 * 1 A1 480 * 1 A2 481 * 1 A3 482 * 2 B1 483 * 2 B2 484 * 2 B3 485 * 4 D1 486 * 6 F1 487 * 6 F2 488 * null H1 489 * <p/> 490 * 1 a1 1 A1 491 * 1 a1 1 A2 492 * 1 a1 1 A3 493 * 1 a2 1 A1 494 * 1 a2 1 A2 495 * 1 a2 1 A3 496 * 1 a3 1 A1 497 * 1 a3 1 A2 498 * 1 a3 1 A3 499 * 2 b1 2 B1 500 * 2 b1 2 B2 501 * 2 b1 2 B3 502 * 3 c1 null null 503 * 4 d1 4 D1 504 * 4 d2 4 D1 505 * 4 d3 4 D1 506 * 5 e1 null null 507 * 5 e2 null null 508 * 5 e3 null null 509 * null null 6 F1 510 * null null 6 F2 511 * 7 g1 null null 512 * 7 g2 null null 513 * 7 g3 null null 514 * 7 g4 null null 515 * 7 g5 null null 516 * null h1 null H1 517 * 518 * @throws Exception 519 */ 520 @Test 521 public void testJoinOuter() throws Exception 522 { 523 // skip if hadoop cluster mode, outer joins don't behave the same 524 if( getPlatform().isMapReduce() && getPlatform().isUseCluster() ) 525 return; 526 527 Set<Tuple> results = new HashSet<Tuple>(); 528 529 results.add( new Tuple( "1", "a1", "1", "A1" ) ); 530 results.add( new Tuple( "1", "a1", "1", "A2" ) ); 531 results.add( new Tuple( "1", "a1", "1", "A3" ) ); 532 results.add( new Tuple( "1", "a2", "1", "A1" ) ); 533 results.add( new Tuple( "1", "a2", "1", "A2" ) ); 534 results.add( new Tuple( "1", "a2", "1", "A3" ) ); 535 results.add( new Tuple( "1", "a3", "1", "A1" ) ); 536 results.add( new Tuple( "1", "a3", "1", "A2" ) ); 537 results.add( new Tuple( "1", "a3", "1", "A3" ) ); 538 
results.add( new Tuple( "2", "b1", "2", "B1" ) ); 539 results.add( new Tuple( "2", "b1", "2", "B2" ) ); 540 results.add( new Tuple( "2", "b1", "2", "B3" ) ); 541 results.add( new Tuple( "3", "c1", null, null ) ); 542 results.add( new Tuple( "4", "d1", "4", "D1" ) ); 543 results.add( new Tuple( "4", "d2", "4", "D1" ) ); 544 results.add( new Tuple( "4", "d3", "4", "D1" ) ); 545 results.add( new Tuple( "5", "e1", null, null ) ); 546 results.add( new Tuple( "5", "e2", null, null ) ); 547 results.add( new Tuple( "5", "e3", null, null ) ); 548 results.add( new Tuple( null, null, "6", "F1" ) ); 549 results.add( new Tuple( null, null, "6", "F2" ) ); 550 results.add( new Tuple( "7", "g1", null, null ) ); 551 results.add( new Tuple( "7", "g2", null, null ) ); 552 results.add( new Tuple( "7", "g3", null, null ) ); 553 results.add( new Tuple( "7", "g4", null, null ) ); 554 results.add( new Tuple( "7", "g5", null, null ) ); 555 results.add( new Tuple( null, "h1", null, "H1" ) ); 556 557 handleJoins( "joinouter", new OuterJoin(), results ); 558 } 559 560 /** 561 * 1 a1 562 * 1 a2 563 * 1 a3 564 * 2 b1 565 * 3 c1 566 * 4 d1 567 * 4 d2 568 * 4 d3 569 * 5 e1 570 * 5 e2 571 * 5 e3 572 * 7 g1 573 * 7 g2 574 * 7 g3 575 * 7 g4 576 * 7 g5 577 * null h1 578 * <p/> 579 * 1 A1 580 * 1 A2 581 * 1 A3 582 * 2 B1 583 * 2 B2 584 * 2 B3 585 * 4 D1 586 * 6 F1 587 * 6 F2 588 * null H1 589 * <p/> 590 * 1 a1 1 A1 591 * 1 a1 1 A2 592 * 1 a1 1 A3 593 * 1 a2 1 A1 594 * 1 a2 1 A2 595 * 1 a2 1 A3 596 * 1 a3 1 A1 597 * 1 a3 1 A2 598 * 1 a3 1 A3 599 * 2 b1 2 B1 600 * 2 b1 2 B2 601 * 2 b1 2 B3 602 * 3 c1 null null 603 * 4 d1 4 D1 604 * 4 d2 4 D1 605 * 4 d3 4 D1 606 * 5 e1 null null 607 * 5 e2 null null 608 * 5 e3 null null 609 * 7 g1 null null 610 * 7 g2 null null 611 * 7 g3 null null 612 * 7 g4 null null 613 * 7 g5 null null 614 * null h1 null H1 615 * 616 * @throws Exception 617 */ 618 @Test 619 public void testJoinInnerOuter() throws Exception 620 { 621 Set<Tuple> results = new HashSet<Tuple>(); 622 623 
results.add( new Tuple( "1", "a1", "1", "A1" ) ); 624 results.add( new Tuple( "1", "a1", "1", "A2" ) ); 625 results.add( new Tuple( "1", "a1", "1", "A3" ) ); 626 results.add( new Tuple( "1", "a2", "1", "A1" ) ); 627 results.add( new Tuple( "1", "a2", "1", "A2" ) ); 628 results.add( new Tuple( "1", "a2", "1", "A3" ) ); 629 results.add( new Tuple( "1", "a3", "1", "A1" ) ); 630 results.add( new Tuple( "1", "a3", "1", "A2" ) ); 631 results.add( new Tuple( "1", "a3", "1", "A3" ) ); 632 results.add( new Tuple( "2", "b1", "2", "B1" ) ); 633 results.add( new Tuple( "2", "b1", "2", "B2" ) ); 634 results.add( new Tuple( "2", "b1", "2", "B3" ) ); 635 results.add( new Tuple( "3", "c1", null, null ) ); 636 results.add( new Tuple( "4", "d1", "4", "D1" ) ); 637 results.add( new Tuple( "4", "d2", "4", "D1" ) ); 638 results.add( new Tuple( "4", "d3", "4", "D1" ) ); 639 results.add( new Tuple( "5", "e1", null, null ) ); 640 results.add( new Tuple( "5", "e2", null, null ) ); 641 results.add( new Tuple( "5", "e3", null, null ) ); 642 results.add( new Tuple( "7", "g1", null, null ) ); 643 results.add( new Tuple( "7", "g2", null, null ) ); 644 results.add( new Tuple( "7", "g3", null, null ) ); 645 results.add( new Tuple( "7", "g4", null, null ) ); 646 results.add( new Tuple( "7", "g5", null, null ) ); 647 results.add( new Tuple( null, "h1", null, "H1" ) ); 648 649 handleJoins( "joininnerouter", new LeftJoin(), results ); 650 } 651 652 /** 653 * 1 a1 654 * 1 a2 655 * 1 a3 656 * 2 b1 657 * 3 c1 658 * 4 d1 659 * 4 d2 660 * 4 d3 661 * 5 e1 662 * 5 e2 663 * 5 e3 664 * 7 g1 665 * 7 g2 666 * 7 g3 667 * 7 g4 668 * 7 g5 669 * null h1 670 * <p/> 671 * 1 A1 672 * 1 A2 673 * 1 A3 674 * 2 B1 675 * 2 B2 676 * 2 B3 677 * 4 D1 678 * 6 F1 679 * 6 F2 680 * null H1 681 * <p/> 682 * 1 a1 1 A1 683 * 1 a1 1 A2 684 * 1 a1 1 A3 685 * 1 a2 1 A1 686 * 1 a2 1 A2 687 * 1 a2 1 A3 688 * 1 a3 1 A1 689 * 1 a3 1 A2 690 * 1 a3 1 A3 691 * 2 b1 2 B1 692 * 2 b1 2 B2 693 * 2 b1 2 B3 694 * 4 d1 4 D1 695 * 4 d2 4 D1 696 * 4 
d3 4 D1 697 * null null 6 F1 698 * null null 6 F2 699 * null h1 null H1 700 * 701 * @throws Exception 702 */ 703 @Test 704 public void testJoinOuterInner() throws Exception 705 { 706 // skip if hadoop cluster mode, outer joins don't behave the same 707 if( getPlatform().isMapReduce() && getPlatform().isUseCluster() ) 708 return; 709 710 Set<Tuple> results = new HashSet<Tuple>(); 711 712 results.add( new Tuple( "1", "a1", "1", "A1" ) ); 713 results.add( new Tuple( "1", "a1", "1", "A2" ) ); 714 results.add( new Tuple( "1", "a1", "1", "A3" ) ); 715 results.add( new Tuple( "1", "a2", "1", "A1" ) ); 716 results.add( new Tuple( "1", "a2", "1", "A2" ) ); 717 results.add( new Tuple( "1", "a2", "1", "A3" ) ); 718 results.add( new Tuple( "1", "a3", "1", "A1" ) ); 719 results.add( new Tuple( "1", "a3", "1", "A2" ) ); 720 results.add( new Tuple( "1", "a3", "1", "A3" ) ); 721 results.add( new Tuple( "2", "b1", "2", "B1" ) ); 722 results.add( new Tuple( "2", "b1", "2", "B2" ) ); 723 results.add( new Tuple( "2", "b1", "2", "B3" ) ); 724 results.add( new Tuple( "4", "d1", "4", "D1" ) ); 725 results.add( new Tuple( "4", "d2", "4", "D1" ) ); 726 results.add( new Tuple( "4", "d3", "4", "D1" ) ); 727 results.add( new Tuple( null, null, "6", "F1" ) ); 728 results.add( new Tuple( null, null, "6", "F2" ) ); 729 results.add( new Tuple( null, "h1", null, "H1" ) ); 730 731 handleJoins( "joinouterinner", new RightJoin(), results ); 732 } 733 734 private void handleJoins( String path, Joiner joiner, Set<Tuple> results ) throws Exception 735 { 736 getPlatform().copyFromLocal( inputFileLhsSparse ); 737 getPlatform().copyFromLocal( inputFileRhsSparse ); 738 739 Fields fields = new Fields( "num", "char" ).applyTypes( Integer.class, String.class ); 740 Tap sourceLower = getPlatform().getDelimitedFile( fields, " ", inputFileLhsSparse ); 741 Tap sourceUpper = getPlatform().getDelimitedFile( fields, " ", inputFileRhsSparse ); 742 743 Map sources = new HashMap(); 744 745 sources.put( "lower", 
sourceLower ); 746 sources.put( "upper", sourceUpper ); 747 748 Tap sink = getPlatform().getDelimitedFile( Fields.size( 4, String.class ), "\t", getOutputPath( path ), SinkMode.REPLACE ); 749 750 Pipe pipeLower = new Pipe( "lower" ); 751 Pipe pipeUpper = new Pipe( "upper" ); 752 753 Fields declaredFields = new Fields( "num", "char", "num2", "char2" ); 754 Fields groupingFields = new Fields( "num" ); 755 756 Pipe splice = new HashJoin( pipeLower, groupingFields, pipeUpper, groupingFields, declaredFields, joiner ); 757 758 splice = new Each( splice, Fields.ALL, new Identity(), Fields.RESULTS ); 759 760 Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice ); 761 762 flow.complete(); 763 764 validateLength( flow, results.size() ); 765 766 List<Tuple> actual = getSinkAsList( flow ); 767 768 results.removeAll( actual ); 769 770 assertEquals( 0, results.size() ); 771 } 772 773 /** 774 * 1 a 775 * 5 b 776 * 6 c 777 * 5 b 778 * 5 e 779 * <p/> 780 * 1 A 781 * 2 B 782 * 3 C 783 * 4 D 784 * 5 E 785 * <p/> 786 * 1 a 787 * 2 b 788 * 3 c 789 * 4 d 790 * 5 e 791 * <p/> 792 * 1 a 1 A 1 a 793 * - - 2 B 2 b 794 * - - 3 C 3 c 795 * - - 4 D 4 d 796 * 5 b 5 E 5 e 797 * 5 e 5 E 5 e 798 * 799 * @throws Exception 800 */ 801 @Test 802 public void testJoinMixed() throws Exception 803 { 804 // skip if hadoop cluster mode, outer joins don't behave the same 805 if( getPlatform().isMapReduce() && getPlatform().isUseCluster() ) 806 return; 807 808 getPlatform().copyFromLocal( inputFileLowerOffset ); 809 getPlatform().copyFromLocal( inputFileLower ); 810 getPlatform().copyFromLocal( inputFileUpper ); 811 812 Tap sourceLowerOffset = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLowerOffset ); 813 Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper ); 814 Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ); 815 816 Map sources = new HashMap(); 817 818 sources.put( 
"loweroffset", sourceLowerOffset ); 819 sources.put( "lower", sourceLower ); 820 sources.put( "upper", sourceUpper ); 821 822 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joinmixed" ), SinkMode.REPLACE ); 823 824 Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); 825 826 Pipe pipeLowerOffset = new Each( new Pipe( "loweroffset" ), new Fields( "line" ), splitter ); 827 Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter ); 828 Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter ); 829 830 Pipe[] pipes = Pipe.pipes( pipeLowerOffset, pipeUpper, pipeLower ); 831 Fields[] fields = Fields.fields( new Fields( "num" ), new Fields( "num" ), new Fields( "num" ) ); 832 833 MixedJoin join = new MixedJoin( new boolean[]{MixedJoin.OUTER, MixedJoin.INNER, MixedJoin.OUTER} ); 834 Pipe splice = new HashJoin( pipes, fields, Fields.size( 6 ), join ); 835 836 Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice ); 837 838 flow.complete(); 839 840 validateLength( flow, 6 ); 841 842 Set<Tuple> results = new HashSet<Tuple>(); 843 844 results.add( new Tuple( "1\ta\t1\tA\t1\ta" ) ); 845 results.add( new Tuple( "null\tnull\t2\tB\t2\tb" ) ); 846 results.add( new Tuple( "null\tnull\t3\tC\t3\tc" ) ); 847 results.add( new Tuple( "null\tnull\t4\tD\t4\td" ) ); 848 results.add( new Tuple( "5\tb\t5\tE\t5\te" ) ); 849 results.add( new Tuple( "5\te\t5\tE\t5\te" ) ); 850 851 List<Tuple> actual = getSinkAsList( flow ); 852 853 results.removeAll( actual ); 854 855 assertEquals( 0, results.size() ); 856 } 857 858 @Test 859 public void testJoinDiffFields() throws Exception 860 { 861 getPlatform().copyFromLocal( inputFileLower ); 862 getPlatform().copyFromLocal( inputFileUpper ); 863 864 Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ); 865 Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper ); 
866 867 Map sources = new HashMap(); 868 869 sources.put( "lower", sourceLower ); 870 sources.put( "upper", sourceUpper ); 871 872 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "difffields" ), SinkMode.REPLACE ); 873 874 Function splitterLower = new RegexSplitter( new Fields( "numA", "lower" ), " " ); 875 Function splitterUpper = new RegexSplitter( new Fields( "numB", "upper" ), " " ); 876 877 Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitterLower ); 878 Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitterUpper ); 879 880 Pipe pipe = new HashJoin( pipeLower, new Fields( "numA" ), pipeUpper, new Fields( "numB" ) ); 881 882 Flow flow = getPlatform().getFlowConnector().connect( sources, sink, pipe ); 883 884 flow.complete(); 885 886 validateLength( flow, 5 ); 887 888 List<Tuple> actual = getSinkAsList( flow ); 889 890 assertTrue( actual.contains( new Tuple( "1\ta\t1\tA" ) ) ); 891 assertTrue( actual.contains( new Tuple( "2\tb\t2\tB" ) ) ); 892 } 893 894 @Test 895 public void testJoinGroupBy() throws Exception 896 { 897 getPlatform().copyFromLocal( inputFileLower ); 898 getPlatform().copyFromLocal( inputFileUpper ); 899 900 Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ); 901 Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper ); 902 903 Map sources = new HashMap(); 904 905 sources.put( "lower", sourceLower ); 906 sources.put( "upper", sourceUpper ); 907 908 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joingroupby" ), SinkMode.REPLACE ); 909 910 Function splitterLower = new RegexSplitter( new Fields( "numA", "lower" ), " " ); 911 Function splitterUpper = new RegexSplitter( new Fields( "numB", "upper" ), " " ); 912 913 Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitterLower ); 914 Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( 
"line" ), splitterUpper ); 915 916 Pipe pipe = new HashJoin( pipeLower, new Fields( "numA" ), pipeUpper, new Fields( "numB" ) ); 917 918 Pipe groupby = new GroupBy( pipe, new Fields( "numA" ) ); 919 920 Flow flow = getPlatform().getFlowConnector().connect( sources, sink, groupby ); 921 922 flow.complete(); 923 924 validateLength( flow, 5, null ); 925 926 List<Tuple> actual = getSinkAsList( flow ); 927 928 assertTrue( actual.contains( new Tuple( "1\ta\t1\tA" ) ) ); 929 assertTrue( actual.contains( new Tuple( "2\tb\t2\tB" ) ) ); 930 } 931 932 @Test 933 public void testJoinSamePipe() throws Exception 934 { 935 getPlatform().copyFromLocal( inputFileLower ); 936 937 Tap source = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ); 938 939 Map sources = new HashMap(); 940 941 sources.put( "lower", source ); 942 943 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "samepipe" ), SinkMode.REPLACE ); 944 945 Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); 946 947 Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter ); 948 949 Pipe pipe = new HashJoin( pipeLower, new Fields( "num" ), 1, new Fields( "num1", "char1", "num2", "char2" ) ); 950 951 Flow flow = getPlatform().getFlowConnector().connect( sources, sink, pipe ); 952 953 flow.complete(); 954 955 validateLength( flow, 5, null ); 956 957 List<Tuple> actual = getSinkAsList( flow ); 958 959 assertTrue( actual.contains( new Tuple( "1\ta\t1\ta" ) ) ); 960 assertTrue( actual.contains( new Tuple( "2\tb\t2\tb" ) ) ); 961 } 962 963 @Test 964 public void testJoinSamePipe2() throws Exception 965 { 966 getPlatform().copyFromLocal( inputFileLower ); 967 968 Tap source = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ); 969 970 Map sources = new HashMap(); 971 972 sources.put( "lower", source ); 973 974 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "samepipe2" ), 
SinkMode.REPLACE ); 975 976 Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); 977 978 Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter ); 979 980 Pipe join = new HashJoin( pipeLower, new Fields( "num" ), pipeLower, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2" ) ); 981 982 Flow flow = getPlatform().getFlowConnector().connect( sources, sink, join ); 983 984 flow.complete(); 985 986 validateLength( flow, 5, null ); 987 988 List<Tuple> actual = getSinkAsList( flow ); 989 990 assertTrue( actual.contains( new Tuple( "1\ta\t1\ta" ) ) ); 991 assertTrue( actual.contains( new Tuple( "2\tb\t2\tb" ) ) ); 992 } 993 994 @Test 995 public void testJoinSamePipe3() throws Exception 996 { 997 getPlatform().copyFromLocal( inputFileLower ); 998 999 Tap source = getPlatform().getDelimitedFile( new Fields( "num", "char" ), " ", inputFileLower ); 1000 1001 Map sources = new HashMap(); 1002 1003 sources.put( "lower", source ); 1004 1005 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "samepipe3" ), SinkMode.REPLACE ); 1006 1007 Pipe pipe = new Pipe( "lower" ); 1008 1009 Pipe lhs = new Pipe( "lhs", pipe ); 1010 Pipe rhs = new Pipe( "rhs", pipe ); 1011 1012 Pipe join = new HashJoin( lhs, new Fields( "num" ), rhs, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2" ) ); 1013 1014 Flow flow = getPlatform().getFlowConnector().connect( sources, sink, join ); 1015 1016 flow.complete(); 1017 1018 validateLength( flow, 5, null ); 1019 1020 List<Tuple> actual = getSinkAsList( flow ); 1021 1022 assertTrue( actual.contains( new Tuple( "1\ta\t1\ta" ) ) ); 1023 assertTrue( actual.contains( new Tuple( "2\tb\t2\tb" ) ) ); 1024 } 1025 1026 /** 1027 * Same source as rightmost 1028 * <p/> 1029 * should be a single job as the same file accumulates into the joins 1030 * 1031 * @throws Exception 1032 */ 1033 @Test 1034 public void testJoinAroundJoinRightMost() throws Exception 1035 { 1036 
getPlatform().copyFromLocal( inputFileLower ); 1037 getPlatform().copyFromLocal( inputFileUpper ); 1038 1039 Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ); 1040 Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper ); 1041 1042 Map sources = new HashMap(); 1043 1044 sources.put( "lower", sourceLower ); 1045 sources.put( "upper1", sourceUpper ); 1046 sources.put( "upper2", sourceUpper ); 1047 1048 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joinaroundjoinrightmost" ), SinkMode.REPLACE ); 1049 1050 Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); 1051 1052 Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter ); 1053 Pipe pipeUpper1 = new Each( new Pipe( "upper1" ), new Fields( "line" ), splitter ); 1054 Pipe pipeUpper2 = new Each( new Pipe( "upper2" ), new Fields( "line" ), splitter ); 1055 1056 Pipe splice1 = new HashJoin( pipeLower, new Fields( "num" ), pipeUpper1, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2" ) ); 1057 1058 splice1 = new Each( splice1, new Identity() ); 1059 1060 Pipe splice2 = new HashJoin( splice1, new Fields( "num1" ), pipeUpper2, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2", "num3", "char3" ) ); 1061 1062 Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice2 ); 1063 1064 // flow.writeDOT( "joinaroundrightmost.dot" ); 1065 1066 if( getPlatform().isMapReduce() ) 1067 assertEquals( "wrong number of steps", 1, flow.getFlowSteps().size() ); 1068 1069 flow.complete(); 1070 1071 validateLength( flow, 5, null ); 1072 1073 List<Tuple> actual = getSinkAsList( flow ); 1074 1075 assertTrue( actual.contains( new Tuple( "1\ta\t1\tA\t1\tA" ) ) ); 1076 assertTrue( actual.contains( new Tuple( "2\tb\t2\tB\t2\tB" ) ) ); 1077 } 1078 1079 /** 1080 * Same source as leftmost: the upper file feeds both sides of the first HashJoin (as upper1 and upper2), and that join is the left side of a second HashJoin against lower. On MapReduce the flow is asserted to plan into two steps. 1081 * 1082 * @throws Exception 1083 */ 1084 @Test 1085 public
void testJoinAroundJoinLeftMost() throws Exception 1086 { 1087 getPlatform().copyFromLocal( inputFileLower ); 1088 getPlatform().copyFromLocal( inputFileUpper ); 1089 1090 Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ); 1091 Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper ); 1092 1093 Map sources = new HashMap(); 1094 1095 sources.put( "lower", sourceLower ); 1096 sources.put( "upper1", sourceUpper ); 1097 sources.put( "upper2", sourceUpper ); 1098 1099 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joinaroundjoinleftmost" ), SinkMode.REPLACE ); 1100 1101 Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); 1102 1103 Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter ); 1104 Pipe pipeUpper1 = new Each( new Pipe( "upper1" ), new Fields( "line" ), splitter ); 1105 Pipe pipeUpper2 = new Each( new Pipe( "upper2" ), new Fields( "line" ), splitter ); 1106 1107 Pipe splice1 = new HashJoin( pipeUpper1, new Fields( "num" ), pipeUpper2, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2" ) ); 1108 1109 splice1 = new Each( splice1, new Identity() ); 1110 1111 Pipe splice2 = new HashJoin( splice1, new Fields( "num1" ), pipeLower, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2", "num3", "char3" ) ); 1112 1113 Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice2 ); 1114 1115 // flow.writeDOT( "joinaroundleftmost.dot" ); 1116 1117 if( getPlatform().isMapReduce() ) 1118 assertEquals( "wrong number of steps", 2, flow.getFlowSteps().size() ); 1119 1120 flow.complete(); 1121 1122 validateLength( flow, 5, null ); 1123 1124 List<Tuple> actual = getSinkAsList( flow ); 1125 1126 assertTrue( actual.contains( new Tuple( "1\tA\t1\tA\t1\ta" ) ) ); 1127 assertTrue( actual.contains( new Tuple( "2\tB\t2\tB\t2\tb" ) ) ); 1128 } 1129 1130 /** 1131 * Upper as leftmost and
rightmost forcing two jobs: lower is joined with upper1 first, then upper2 is the left side of a second HashJoin that takes the first join as its right side. 1132 * 1133 * @throws Exception 1134 */ 1135 @Test 1136 public void testJoinAroundJoinRightMostSwapped() throws Exception 1137 { 1138 getPlatform().copyFromLocal( inputFileLower ); 1139 getPlatform().copyFromLocal( inputFileUpper ); 1140 1141 Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ); 1142 Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper ); 1143 1144 Map sources = new HashMap(); 1145 1146 sources.put( "lower", sourceLower ); 1147 sources.put( "upper1", sourceUpper ); 1148 sources.put( "upper2", sourceUpper ); 1149 1150 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joinaroundjoinswapped" ), SinkMode.REPLACE ); 1151 1152 Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); 1153 1154 Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter ); 1155 Pipe pipeUpper1 = new Each( new Pipe( "upper1" ), new Fields( "line" ), splitter ); 1156 Pipe pipeUpper2 = new Each( new Pipe( "upper2" ), new Fields( "line" ), splitter ); 1157 1158 Pipe splice1 = new HashJoin( pipeLower, new Fields( "num" ), pipeUpper1, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2" ) ); 1159 1160 splice1 = new Each( splice1, new Identity() ); 1161 1162 // upper2 becomes leftmost, forcing a tap between the joins 1163 Pipe splice2 = new HashJoin( pipeUpper2, new Fields( "num" ), splice1, new Fields( "num1" ), new Fields( "num1", "char1", "num2", "char2", "num3", "char3" ) ); 1164 1165 Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice2 ); 1166 1167 if( getPlatform().isMapReduce() ) 1168 assertEquals( "wrong number of steps", 2, flow.getFlowSteps().size() ); 1169 1170 flow.complete(); 1171 1172 validateLength( flow, 5, null ); 1173 1174 List<Tuple> actual = getSinkAsList( flow ); 1175 1176 assertTrue( actual.contains( new Tuple( "1\tA\t1\ta\t1\tA" ) ) ); 1177 assertTrue( 
actual.contains( new Tuple( "2\tB\t2\tb\t2\tB" ) ) ); 1178 } 1179 /** HashJoins lower with upper on numA and numB, groups the result by numA, then HashJoins the grouped stream with the joined file on numA and numC. On MapReduce the flow is asserted to plan into two steps, and the sink is validated with a length of 5. */ 1180 @Test 1181 public void testJoinGroupByJoin() throws Exception 1182 { 1183 getPlatform().copyFromLocal( inputFileLower ); 1184 getPlatform().copyFromLocal( inputFileUpper ); 1185 getPlatform().copyFromLocal( inputFileJoined ); 1186 1187 Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ); 1188 Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper ); 1189 Tap sourceJoined = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileJoined ); 1190 1191 Map sources = new HashMap(); 1192 1193 sources.put( "lower", sourceLower ); 1194 sources.put( "upper", sourceUpper ); 1195 sources.put( "joined", sourceJoined ); 1196 1197 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joingroupbyjoin" ), SinkMode.REPLACE ); 1198 1199 Function splitterLower = new RegexSplitter( new Fields( "numA", "lower" ), " " ); 1200 Function splitterUpper = new RegexSplitter( new Fields( "numB", "upper" ), " " ); 1201 Function splitterJoined = new RegexSplitter( new Fields( "numC", "lowerC", "upperC" ), "\t" ); 1202 1203 Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitterLower ); 1204 Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitterUpper ); 1205 Pipe pipeJoined = new Each( new Pipe( "joined" ), new Fields( "line" ), splitterJoined ); 1206 1207 Pipe pipe = new HashJoin( pipeLower, new Fields( "numA" ), pipeUpper, new Fields( "numB" ) ); 1208 1209 pipe = new GroupBy( pipe, new Fields( "numA" ) ); 1210 1211 pipe = new HashJoin( pipe, new Fields( "numA" ), pipeJoined, new Fields( "numC" ) ); 1212 1213 Flow flow = getPlatform().getFlowConnector().connect( sources, sink, pipe ); 1214 1215 if( getPlatform().isMapReduce() ) 1216 assertEquals( "wrong number of steps", 2, flow.getFlowSteps().size() ); 1217 1218 flow.complete(); 1219 1220 validateLength( flow, 5, null 
); 1221 1222 List<Tuple> actual = getSinkAsList( flow ); 1223 1224 assertTrue( actual.contains( new Tuple( "1\ta\t1\tA\t1\ta\tA" ) ) ); 1225 assertTrue( actual.contains( new Tuple( "2\tb\t2\tB\t2\tb\tB" ) ) ); 1226 } 1227 1228 /** 1229 * Here the same file (upper) is fed into both sides of the same HashJoin. 1230 * <p/> 1231 * This is three jobs. 1232 * <p/> 1233 * A temp tap is inserted before the accumulated branch of the common HashJoin for two reasons: 1234 * <p/> 1235 * it is assumed the accumulated side is filtered down, so pushing to disk will preserve io; 1236 * if the accumulated side was streamed instead via a fork, only part of the file would accumulate into the HashJoin. 1237 * <p/> 1238 * /-T-\ <-- accumulated 1239 * T HJ 1240 * \---/ <-- streamed 1241 * 1242 * @throws Exception 1243 */ 1244 @Test 1245 public void testJoinSameSourceIntoJoin() throws Exception 1246 { 1247 getPlatform().copyFromLocal( inputFileLower ); 1248 getPlatform().copyFromLocal( inputFileUpper ); 1249 1250 Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ); 1251 Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper ); 1252 1253 Map sources = new HashMap(); 1254 1255 sources.put( "lower", sourceLower ); 1256 sources.put( "upper1", sourceUpper ); 1257 sources.put( "upper2", sourceUpper ); 1258 1259 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joinsamesourceintojoin" ), SinkMode.REPLACE ); 1260 1261 Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); 1262 1263 Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter ); 1264 Pipe pipeUpper1 = new Each( new Pipe( "upper1" ), new Fields( "line" ), splitter ); 1265 Pipe pipeUpper2 = new Each( new Pipe( "upper2" ), new Fields( "line" ), splitter ); 1266 1267 Pipe splice1 = new HashJoin( pipeUpper1, new Fields( "num" ), pipeUpper2, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2" ) ); 1268 1269 
splice1 = new Each( splice1, new Identity() ); 1270 1271 Pipe splice2 = new HashJoin( pipeLower, new Fields( "num" ), splice1, new Fields( "num1" ), new Fields( "num1", "char1", "num2", "char2", "num3", "char3" ) ); 1272 1273 Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice2 ); 1274 1275 // flow.writeDOT( "joinsamesourceintojoin.dot" ); 1276 1277 if( getPlatform().isMapReduce() ) 1278 assertEquals( "wrong number of steps", 3, flow.getFlowSteps().size() ); 1279 1280 flow.complete(); 1281 1282 validateLength( flow, 5, null ); 1283 1284 List<Tuple> actual = getSinkAsList( flow ); 1285 1286 assertTrue( actual.contains( new Tuple( "1\ta\t1\tA\t1\tA" ) ) ); 1287 assertTrue( actual.contains( new Tuple( "2\tb\t2\tB\t2\tB" ) ) ); 1288 } 1289 1290 /** 1291 * Tests that two independent streamed sources with loadable tributaries properly plan into a GroupBy 1292 * without loading unused sources 1293 * 1294 * @throws Exception 1295 */ 1296 @Test 1297 public void testJoinsIntoGroupBy() throws Exception 1298 { 1299 getPlatform().copyFromLocal( inputFileLower ); 1300 getPlatform().copyFromLocal( inputFileUpper ); 1301 1302 getPlatform().copyFromLocal( inputFileLhs ); 1303 getPlatform().copyFromLocal( inputFileRhs ); 1304 1305 Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ); 1306 Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper ); 1307 1308 Tap sourceLhs = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLhs ); 1309 Tap sourceRhs = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileRhs ); 1310 1311 Map sources = new HashMap(); 1312 1313 sources.put( "lower", sourceLower ); 1314 sources.put( "upper", sourceUpper ); 1315 sources.put( "lhs", sourceLhs ); 1316 sources.put( "rhs", sourceRhs ); 1317 1318 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joinsintogroupby" ), SinkMode.REPLACE ); 1319 1320 Function 
splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); 1321 1322 Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter ); 1323 Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter ); 1324 1325 Pipe pipeLhs = new Each( new Pipe( "lhs" ), new Fields( "line" ), splitter ); 1326 Pipe pipeRhs = new Each( new Pipe( "rhs" ), new Fields( "line" ), splitter ); 1327 1328 Pipe upperLower = new HashJoin( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2" ) ); 1329 1330 upperLower = new Each( upperLower, new Identity() ); 1331 1332 Pipe lhsRhs = new HashJoin( pipeLhs, new Fields( "num" ), pipeRhs, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2" ) ); 1333 1334 lhsRhs = new Each( lhsRhs, new Identity() ); 1335 1336 Pipe grouped = new GroupBy( "merging", Pipe.pipes( upperLower, lhsRhs ), new Fields( "num1" ) ); 1337 1338 Flow flow = getPlatform().getFlowConnector().connect( sources, sink, grouped ); 1339 1340 if( getPlatform().isMapReduce() ) 1341 assertEquals( "wrong number of steps", 1, flow.getFlowSteps().size() ); 1342 1343 flow.complete(); 1344 1345 validateLength( flow, 42, null ); 1346 1347 List<Tuple> actual = getSinkAsList( flow ); 1348 1349 assertTrue( actual.contains( new Tuple( "1\ta\t1\tA" ) ) ); 1350 assertTrue( actual.contains( new Tuple( "5\te\t5\tE" ) ) ); 1351 } 1352 /** Forks one split pipe into an lhs and an rhs branch, groups the rhs branch by num, then HashJoins lhs with the grouped rhs on num. The sink is validated with a length of 5. */ 1353 @Test 1354 public void testJoinSamePipeAroundGroupBy() throws Exception 1355 { 1356 getPlatform().copyFromLocal( inputFileLower ); 1357 1358 Tap source = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ); 1359 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "samepipearoundgroupby" ), SinkMode.REPLACE ); 1360 1361 Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); 1362 1363 Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter ); 1364 1365 Pipe lhsPipe = new 
Each( new Pipe( "lhs", pipeLower ), new Identity() ); 1366 1367 Pipe rhsPipe = new Each( new Pipe( "rhs", pipeLower ), new Identity() ); 1368 1369 rhsPipe = new GroupBy( rhsPipe, new Fields( "num" ) ); 1370 1371 rhsPipe = new Each( rhsPipe, new Identity() ); 1372 1373 Pipe pipe = new HashJoin( lhsPipe, new Fields( "num" ), rhsPipe, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2" ) ); 1374 1375 Flow flow = getPlatform().getFlowConnector().connect( source, sink, pipe ); 1376 1377 flow.complete(); 1378 1379 validateLength( flow, 5, null ); 1380 1381 List<Tuple> actual = getSinkAsList( flow ); 1382 1383 assertTrue( actual.contains( new Tuple( "1\ta\t1\ta" ) ) ); 1384 assertTrue( actual.contains( new Tuple( "2\tb\t2\tb" ) ) ); 1385 } 1386 1387 /** 1388 * This test results in two MR jobs because one join feeds into the accumulated side of the second. A mapper 1389 * can only stream one branch at a time, forcing a temp file between the mappers. See the next test for the swapped join. 1390 * 1391 * @throws Exception 1392 */ 1393 @Test 1394 public void testJoinsIntoCoGroupLhs() throws Exception 1395 { 1396 getPlatform().copyFromLocal( inputFileLower ); 1397 getPlatform().copyFromLocal( inputFileUpper ); 1398 1399 getPlatform().copyFromLocal( inputFileLhs ); 1400 getPlatform().copyFromLocal( inputFileRhs ); 1401 1402 Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ); 1403 Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper ); 1404 1405 Tap sourceLhs = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLhs ); 1406 Tap sourceRhs = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileRhs ); 1407 1408 Map sources = new HashMap(); 1409 1410 sources.put( "lower", sourceLower ); 1411 sources.put( "upper", sourceUpper ); 1412 sources.put( "lhs", sourceLhs ); 1413 sources.put( "rhs", sourceRhs ); 1414 1415 Tap sink = getPlatform().getTextFile( new Fields( "line" 
), getOutputPath( "joinsintocogrouplhs" ), SinkMode.REPLACE ); 1416 1417 Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); 1418 1419 Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter ); 1420 Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter ); 1421 1422 Pipe pipeLhs = new Each( new Pipe( "lhs" ), new Fields( "line" ), splitter ); 1423 Pipe pipeRhs = new Each( new Pipe( "rhs" ), new Fields( "line" ), splitter ); 1424 1425 Pipe upperLower = new HashJoin( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), new Fields( "numUpperLower", "charUpperLower", "num2UpperLower", "char2UpperLower" ) ); 1426 1427 upperLower = new Each( upperLower, new Identity() ); 1428 1429 Pipe lhsUpperLower = new HashJoin( pipeLhs, new Fields( "num" ), upperLower, new Fields( "numUpperLower" ), new Fields( "numLhs", "charLhs", "numUpperLower", "charUpperLower", "num2UpperLower", "char2UpperLower" ) ); 1430 1431 lhsUpperLower = new Each( lhsUpperLower, new Identity() ); 1432 1433 Pipe grouped = new CoGroup( "cogrouping", lhsUpperLower, new Fields( "numLhs" ), pipeRhs, new Fields( "num" ) ); 1434 1435 Flow flow = getPlatform().getFlowConnector().connect( sources, sink, grouped ); 1436 1437 if( getPlatform().isMapReduce() ) 1438 assertEquals( "wrong number of steps", 2, flow.getFlowSteps().size() ); 1439 1440 flow.complete(); 1441 1442 validateLength( flow, 37, null ); 1443 1444 List<Tuple> actual = getSinkAsList( flow ); 1445 1446 assertTrue( actual.contains( new Tuple( "1\ta\t1\ta\t1\tA\t1\tA" ) ) ); 1447 assertTrue( actual.contains( new Tuple( "5\ta\t5\te\t5\tE\t5\tA" ) ) ); 1448 } 1449 1450 /** 1451 * This test results in one MR job because one join feeds into the streamed side of the second.
1452 * 1453 * @throws Exception 1454 */ 1455 @Test 1456 public void testJoinsIntoCoGroupLhsSwappedJoin() throws Exception 1457 { 1458 getPlatform().copyFromLocal( inputFileLower ); 1459 getPlatform().copyFromLocal( inputFileUpper ); 1460 1461 getPlatform().copyFromLocal( inputFileLhs ); 1462 getPlatform().copyFromLocal( inputFileRhs ); 1463 1464 Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ); 1465 Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper ); 1466 1467 Tap sourceLhs = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLhs ); 1468 Tap sourceRhs = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileRhs ); 1469 1470 Map sources = new HashMap(); 1471 1472 sources.put( "lower", sourceLower ); 1473 sources.put( "upper", sourceUpper ); 1474 sources.put( "lhs", sourceLhs ); 1475 sources.put( "rhs", sourceRhs ); 1476 1477 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joinsintocogrouplhsswappedjoin" ), SinkMode.REPLACE ); 1478 1479 Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); 1480 1481 Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter ); 1482 Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter ); 1483 1484 Pipe pipeLhs = new Each( new Pipe( "lhs" ), new Fields( "line" ), splitter ); 1485 Pipe pipeRhs = new Each( new Pipe( "rhs" ), new Fields( "line" ), splitter ); 1486 1487 Pipe upperLower = new HashJoin( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), new Fields( "numUpperLower", "charUpperLower", "num2UpperLower", "char2UpperLower" ) ); 1488 1489 upperLower = new Each( upperLower, new Identity() ); 1490 1491 Pipe lhsUpperLower = new HashJoin( upperLower, new Fields( "numUpperLower" ), pipeLhs, new Fields( "num" ), new Fields( "numUpperLower", "charUpperLower", "num2UpperLower", "char2UpperLower", "numLhs", "charLhs" 
) ); 1492 1493 lhsUpperLower = new Each( lhsUpperLower, new Identity() ); 1494 1495 Pipe grouped = new CoGroup( "cogrouping", lhsUpperLower, new Fields( "numLhs" ), pipeRhs, new Fields( "num" ) ); 1496 1497 Flow flow = getPlatform().getFlowConnector().connect( sources, sink, grouped ); 1498 1499 if( getPlatform().isMapReduce() ) 1500 assertEquals( "wrong number of steps", 1, flow.getFlowSteps().size() ); 1501 1502 flow.complete(); 1503 1504 validateLength( flow, 37, null ); 1505 1506 List<Tuple> actual = getSinkAsList( flow ); 1507 1508 assertTrue( actual.contains( new Tuple( "1\ta\t1\tA\t1\ta\t1\tA" ) ) ); 1509 assertTrue( actual.contains( new Tuple( "5\te\t5\tE\t5\te\t5\tE" ) ) ); 1510 } 1511 /** Builds the same two HashJoins as testJoinsIntoCoGroupLhs (lower with upper, then lhs with that result on the right), but passes the nested join as the right-hand side of the CoGroup and rhs as the left-hand side. On MapReduce the flow is asserted to plan into two steps. */ 1512 @Test 1513 public void testJoinsIntoCoGroupRhs() throws Exception 1514 { 1515 getPlatform().copyFromLocal( inputFileLower ); 1516 getPlatform().copyFromLocal( inputFileUpper ); 1517 1518 getPlatform().copyFromLocal( inputFileLhs ); 1519 getPlatform().copyFromLocal( inputFileRhs ); 1520 1521 Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ); 1522 Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper ); 1523 1524 Tap sourceLhs = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLhs ); 1525 Tap sourceRhs = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileRhs ); 1526 1527 Map sources = new HashMap(); 1528 1529 sources.put( "lower", sourceLower ); 1530 sources.put( "upper", sourceUpper ); 1531 sources.put( "lhs", sourceLhs ); 1532 sources.put( "rhs", sourceRhs ); 1533 1534 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joinsintocogrouprhs" ), SinkMode.REPLACE ); 1535 1536 Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); 1537 1538 Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter ); 1539 Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter ); 1540 
1541 Pipe pipeLhs = new Each( new Pipe( "lhs" ), new Fields( "line" ), splitter ); 1542 Pipe pipeRhs = new Each( new Pipe( "rhs" ), new Fields( "line" ), splitter ); 1543 1544 Pipe upperLower = new HashJoin( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), new Fields( "numUpperLower", "charUpperLower", "num2UpperLower", "char2UpperLower" ) ); 1545 1546 upperLower = new Each( upperLower, new Identity() ); 1547 1548 Pipe lhsUpperLower = new HashJoin( pipeLhs, new Fields( "num" ), upperLower, new Fields( "numUpperLower" ), new Fields( "numLhs", "charLhs", "numUpperLower", "charUpperLower", "num2UpperLower", "char2UpperLower" ) ); 1549 1550 lhsUpperLower = new Each( lhsUpperLower, new Identity() ); 1551 1552 Pipe grouped = new CoGroup( "cogrouping", pipeRhs, new Fields( "num" ), lhsUpperLower, new Fields( "numLhs" ) ); 1553 1554 Flow flow = getPlatform().getFlowConnector().connect( sources, sink, grouped ); 1555 1556 if( getPlatform().isMapReduce() ) 1557 assertEquals( "wrong number of steps", 2, flow.getFlowSteps().size() ); 1558 1559 flow.complete(); 1560 1561 validateLength( flow, 37, null ); 1562 1563 List<Tuple> actual = getSinkAsList( flow ); 1564 1565 assertTrue( actual.contains( new Tuple( "1\tA\t1\ta\t1\ta\t1\tA" ) ) ); 1566 assertTrue( actual.contains( new Tuple( "5\tE\t5\te\t5\te\t5\tE" ) ) ); 1567 } 1568 /** HashJoins lower with upper and, independently, lhs with rhs, then CoGroups the two join results on the first num field of each. On MapReduce the flow is asserted to plan into one step. */ 1569 @Test 1570 public void testJoinsIntoCoGroup() throws Exception 1571 { 1572 getPlatform().copyFromLocal( inputFileLower ); 1573 getPlatform().copyFromLocal( inputFileUpper ); 1574 1575 getPlatform().copyFromLocal( inputFileLhs ); 1576 getPlatform().copyFromLocal( inputFileRhs ); 1577 1578 Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ); 1579 Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper ); 1580 1581 Tap sourceLhs = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLhs ); 1582 Tap sourceRhs = getPlatform().getTextFile( new Fields( 
"offset", "line" ), inputFileRhs ); 1583 1584 Map sources = new HashMap(); 1585 1586 sources.put( "lower", sourceLower ); 1587 sources.put( "upper", sourceUpper ); 1588 sources.put( "lhs", sourceLhs ); 1589 sources.put( "rhs", sourceRhs ); 1590 1591 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joinsintocogroup" ), SinkMode.REPLACE ); 1592 1593 Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); 1594 1595 Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter ); 1596 Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter ); 1597 1598 Pipe pipeLhs = new Each( new Pipe( "lhs" ), new Fields( "line" ), splitter ); 1599 Pipe pipeRhs = new Each( new Pipe( "rhs" ), new Fields( "line" ), splitter ); 1600 1601 Pipe upperLower = new HashJoin( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), new Fields( "numUpperLower1", "charUpperLower1", "numUpperLower2", "charUpperLower2" ) ); 1602 1603 upperLower = new Each( upperLower, new Identity() ); 1604 1605 Pipe lhsRhs = new HashJoin( pipeLhs, new Fields( "num" ), pipeRhs, new Fields( "num" ), new Fields( "numLhsRhs1", "charLhsRhs1", "numLhsRhs2", "charLhsRhs2" ) ); 1606 1607 lhsRhs = new Each( lhsRhs, new Identity() ); 1608 1609 Pipe grouped = new CoGroup( "cogrouping", upperLower, new Fields( "numUpperLower1" ), lhsRhs, new Fields( "numLhsRhs1" ) ); 1610 1611 Flow flow = getPlatform().getFlowConnector().connect( sources, sink, grouped ); 1612 1613 if( getPlatform().isMapReduce() ) 1614 assertEquals( "wrong number of steps", 1, flow.getFlowSteps().size() ); 1615 1616 flow.complete(); 1617 1618 validateLength( flow, 37, null ); 1619 1620 List<Tuple> actual = getSinkAsList( flow ); 1621 1622 assertTrue( actual.contains( new Tuple( "1\ta\t1\tA\t1\ta\t1\tA" ) ) ); 1623 assertTrue( actual.contains( new Tuple( "5\te\t5\tE\t5\te\t5\tE" ) ) ); 1624 } 1625 /** Comparator and Hasher that order and hash values by their toString() form. NOTE(review): compare() calls toString() on both arguments without a null check, whereas hashCode() returns 0 for null; confirm that null keys never reach compare(). */ 1626 public static class AllComparator implements 
Comparator<Comparable>, Hasher<Comparable>, Serializable 1627 { 1628 1629 @Override 1630 public int compare( Comparable lhs, Comparable rhs ) 1631 { 1632 return lhs.toString().compareTo( rhs.toString() ); 1633 } 1634 1635 @Override 1636 public int hashCode( Comparable value ) 1637 { 1638 if( value == null ) 1639 return 0; 1640 1641 return value.toString().hashCode(); 1642 } 1643 } 1644 1645 /** 1646 * Tests Hasher being honored even if default comparator is null. The lower num field is rewritten through an Integer.parseInt expression while the upper num field is left as split, and the lower join key carries an AllComparator. NOTE(review): presumably the two sides then hold different value types and only match through the toString-based compare and hash; confirm. 1647 * 1648 * @throws Exception 1649 */ 1650 @Test 1651 public void testJoinWithHasher() throws Exception 1652 { 1653 getPlatform().copyFromLocal( inputFileLower ); 1654 getPlatform().copyFromLocal( inputFileUpper ); 1655 1656 Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ); 1657 Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper ); 1658 1659 Map sources = new HashMap(); 1660 1661 sources.put( "lower", sourceLower ); 1662 sources.put( "upper", sourceUpper ); 1663 1664 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joinhasher" ), SinkMode.REPLACE ); 1665 1666 Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); 1667 1668 Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter ); 1669 1670 pipeLower = new Each( pipeLower, new Fields( "num" ), new ExpressionFunction( Fields.ARGS, "Integer.parseInt( num )", String.class ), Fields.REPLACE ); 1671 1672 Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter ); 1673 1674 Fields num = new Fields( "num" ); 1675 num.setComparator( "num", new AllComparator() ); 1676 1677 Pipe splice = new HashJoin( pipeLower, num, pipeUpper, new Fields( "num" ), Fields.size( 4 ) ); 1678 1679 Map<Object, Object> properties = getProperties(); 1680 1681 Flow flow = getPlatform().getFlowConnector( properties ).connect( sources, sink, splice ); 1682 1683 flow.complete(); 1684 1685 validateLength( flow, 5 ); 
1686 1687 List<Tuple> values = getSinkAsList( flow ); 1688 1689 assertTrue( values.contains( new Tuple( "1\ta\t1\tA" ) ) ); 1690 assertTrue( values.contains( new Tuple( "2\tb\t2\tB" ) ) ); 1691 } 1692 /** HashJoins lower and upper with Fields.NONE as the join key on both sides. The sink is validated with a length of 25 (presumably the full cross product of the two inputs; confirm against the input files). */ 1693 @Test 1694 public void testJoinNone() throws Exception 1695 { 1696 getPlatform().copyFromLocal( inputFileLower ); 1697 getPlatform().copyFromLocal( inputFileUpper ); 1698 1699 Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ); 1700 Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper ); 1701 1702 Map sources = new HashMap(); 1703 1704 sources.put( "lower", sourceLower ); 1705 sources.put( "upper", sourceUpper ); 1706 1707 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joinnone" ), SinkMode.REPLACE ); 1708 1709 Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); 1710 1711 Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter ); 1712 Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter ); 1713 1714 Pipe splice = new HashJoin( pipeLower, Fields.NONE, pipeUpper, Fields.NONE, Fields.size( 4 ) ); 1715 1716 Map<Object, Object> properties = getProperties(); 1717 1718 Flow flow = getPlatform().getFlowConnector( properties ).connect( sources, sink, splice ); 1719 1720 flow.complete(); 1721 1722 validateLength( flow, 25 ); 1723 1724 List<Tuple> values = getSinkAsList( flow ); 1725 1726 assertTrue( values.contains( new Tuple( "1\ta\t1\tA" ) ) ); 1727 assertTrue( values.contains( new Tuple( "1\ta\t2\tB" ) ) ); 1728 assertTrue( values.contains( new Tuple( "2\tb\t2\tB" ) ) ); 1729 } 1730 }