/*
 * Copyright (c) 2007-2014 Concurrent, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cascading.tap.local;

import java.beans.ConstructorProperties;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.Properties;

import cascading.flow.FlowProcess;
import cascading.tap.SinkMode;
import cascading.tap.Tap;
import cascading.tap.local.io.TapFileOutputStream;
import cascading.tap.partition.BasePartitionTap;
import cascading.tap.partition.Partition;
import cascading.tuple.TupleEntrySchemeCollector;
import cascading.tuple.TupleEntrySchemeIterator;

/**
 * Class PartitionTap can be used to write tuple streams out to files and sub-directories based on the values in the
 * current {@link cascading.tuple.Tuple} instance.
 * <p/>
 * The constructor takes a {@link cascading.tap.local.FileTap} {@link cascading.tap.Tap} and a {@link Partition}
 * implementation. This allows Tuple values at given positions to be used as directory names.
 * <p/>
 * {@code openWritesThreshold} limits the number of open files to be output to. This value defaults to 300 files.
 * Each time the threshold is exceeded, 10% of the least recently used open files will be closed.
 * <p/>
 * PartitionTap will populate a given {@code partition} without regard to case of the values being used. Thus
 * the resulting paths {@code 2012/June/} and {@code 2012/june/} will likely result in two open files into the same
 * location. Forcing the case to be consistent with a custom Partition implementation or an upstream
 * {@link cascading.operation.Function} is recommended, see {@link cascading.operation.expression.ExpressionFunction}.
 */
public class PartitionTap extends BasePartitionTap<Properties, InputStream, OutputStream>
  {
  /**
   * Constructor PartitionTap creates a new PartitionTap instance using the given parent
   * {@link cascading.tap.local.FileTap} Tap as the base path and default {@link cascading.scheme.Scheme},
   * and the partition.
   * <p/>
   * Delegates to {@link #PartitionTap(FileTap, Partition, int)} with {@code OPEN_WRITES_THRESHOLD_DEFAULT}.
   *
   * @param parent    of type FileTap
   * @param partition of type Partition
   */
  @ConstructorProperties({"parent", "partition"})
  public PartitionTap( FileTap parent, Partition partition )
    {
    this( parent, partition, OPEN_WRITES_THRESHOLD_DEFAULT );
    }

  /**
   * Constructor PartitionTap creates a new PartitionTap instance using the given parent
   * {@link cascading.tap.local.FileTap} Tap as the base path and default {@link cascading.scheme.Scheme},
   * and the partition.
   * <p/>
   * {@code openWritesThreshold} limits the number of open files to be output to.
   *
   * @param parent              of type FileTap
   * @param partition           of type Partition
   * @param openWritesThreshold of type int
   */
  @ConstructorProperties({"parent", "partition", "openWritesThreshold"})
  public PartitionTap( FileTap parent, Partition partition, int openWritesThreshold )
    {
    super( parent, partition, openWritesThreshold );
    }

  /**
   * Constructor PartitionTap creates a new PartitionTap instance using the given parent
   * {@link cascading.tap.local.FileTap} Tap as the base path and default {@link cascading.scheme.Scheme},
   * and the partition.
   *
   * @param parent    of type FileTap
   * @param partition of type Partition
   * @param sinkMode  of type SinkMode
   */
  @ConstructorProperties({"parent", "partition", "sinkMode"})
  public PartitionTap( FileTap parent, Partition partition, SinkMode sinkMode )
    {
    super( parent, partition, sinkMode );
    }

  /**
   * Constructor PartitionTap creates a new PartitionTap instance using the given parent
   * {@link cascading.tap.local.FileTap} Tap as the base path and default {@link cascading.scheme.Scheme},
   * and the partition.
   * <p/>
   * {@code keepParentOnDelete}, when set to true, prevents the parent Tap from being deleted when
   * {@link #deleteResource(Object)} is called, typically an issue when used inside a
   * {@link cascading.cascade.Cascade}.
   * <p/>
   * Delegates to {@link #PartitionTap(FileTap, Partition, SinkMode, boolean, int)} with
   * {@code OPEN_WRITES_THRESHOLD_DEFAULT}.
   *
   * @param parent             of type FileTap
   * @param partition          of type Partition
   * @param sinkMode           of type SinkMode
   * @param keepParentOnDelete of type boolean
   */
  @ConstructorProperties({"parent", "partition", "sinkMode", "keepParentOnDelete"})
  public PartitionTap( FileTap parent, Partition partition, SinkMode sinkMode, boolean keepParentOnDelete )
    {
    this( parent, partition, sinkMode, keepParentOnDelete, OPEN_WRITES_THRESHOLD_DEFAULT );
    }

  /**
   * Constructor PartitionTap creates a new PartitionTap instance using the given parent
   * {@link cascading.tap.local.FileTap} Tap as the base path and default {@link cascading.scheme.Scheme},
   * and the partition.
   * <p/>
   * {@code keepParentOnDelete}, when set to true, prevents the parent Tap from being deleted when
   * {@link #deleteResource(Object)} is called, typically an issue when used inside a
   * {@link cascading.cascade.Cascade}.
   * <p/>
   * {@code openWritesThreshold} limits the number of open files to be output to.
   *
   * @param parent              of type FileTap
   * @param partition           of type Partition
   * @param sinkMode            of type SinkMode
   * @param keepParentOnDelete  of type boolean
   * @param openWritesThreshold of type int
   */
  @ConstructorProperties({"parent", "partition", "sinkMode", "keepParentOnDelete", "openWritesThreshold"})
  public PartitionTap( FileTap parent, Partition partition, SinkMode sinkMode, boolean keepParentOnDelete, int openWritesThreshold )
    {
    super( parent, partition, sinkMode, keepParentOnDelete, openWritesThreshold );
    }

  /**
   * Method getCurrentIdentifier always returns {@code null} in this local implementation.
   *
   * @param flowProcess of type FlowProcess, unused
   * @return always {@code null}
   */
  @Override
  protected String getCurrentIdentifier( FlowProcess<Properties> flowProcess )
    {
    return null;
    }

  /**
   * Method deleteResource deletes every child identifier reported by the parent {@link FileTap}.
   * <p/>
   * Returns {@code true} when the parent reports no children. Otherwise returns {@code true} if at least one
   * {@link File#delete()} call succeeded, so a partial failure is still reported as success.
   * <p/>
   * NOTE(review): this method only deletes the returned children; it neither deletes the parent resource nor
   * reads {@code keepParentOnDelete} itself. Confirm against the base class that this is intended.
   *
   * @param conf of type Properties
   * @return {@code true} if there was nothing to delete or at least one child was deleted
   * @throws IOException when the child identifiers cannot be retrieved
   */
  @Override
  public boolean deleteResource( Properties conf ) throws IOException
    {
    String[] childIdentifiers = ( (FileTap) parent ).getChildIdentifiers( conf, Integer.MAX_VALUE, false );

    if( childIdentifiers.length == 0 )
      return true;

    boolean result = false;

    for( String childIdentifier : childIdentifiers )
      result |= new File( childIdentifier ).delete();

    return result;
    }

  /**
   * Method createTupleEntrySchemeCollector opens the given path in append mode and wraps the stream in a
   * new {@link TupleEntrySchemeCollector}.
   * <p/>
   * If the collector cannot be constructed, the stream opened here is closed before the failure propagates.
   *
   * @param flowProcess of type FlowProcess
   * @param parent      of type Tap
   * @param path        of type String, the file to append to
   * @param sequence    of type long, unused by this implementation
   * @return a collector writing to {@code path}
   * @throws IOException when the output stream or the collector cannot be created
   */
  @Override
  protected TupleEntrySchemeCollector createTupleEntrySchemeCollector( FlowProcess<Properties> flowProcess, Tap parent, String path, long sequence ) throws IOException
    {
    TapFileOutputStream output = new TapFileOutputStream( parent, path, true ); // always append

    boolean created = false;

    try
      {
      TupleEntrySchemeCollector collector = new TupleEntrySchemeCollector<Properties, OutputStream>( flowProcess, parent, output );

      created = true;

      return collector;
      }
    finally
      {
      // on failure nobody else holds a reference to the stream, so release it here
      if( !created )
        {
        try
          {
          output.close();
          }
        catch( IOException ignored )
          {
          // best effort only, let the original failure propagate
          }
        }
      }
    }

  /**
   * Method createTupleEntrySchemeIterator wraps the given input stream in a new {@link TupleEntrySchemeIterator},
   * opening a {@link FileInputStream} on {@code path} when {@code input} is {@code null}.
   * <p/>
   * If the iterator cannot be constructed, a stream opened by this method is closed before the failure
   * propagates. A stream supplied by the caller is never closed here.
   *
   * @param flowProcess of type FlowProcess
   * @param parent      of type Tap, provides the Scheme used by the iterator
   * @param path        of type String, the file to read and the identifier handed to the iterator
   * @param input       of type InputStream, may be {@code null}
   * @return an iterator reading from {@code input} or from the file at {@code path}
   * @throws FileNotFoundException when {@code input} is {@code null} and {@code path} cannot be opened
   */
  @Override
  protected TupleEntrySchemeIterator createTupleEntrySchemeIterator( FlowProcess<Properties> flowProcess, Tap parent, String path, InputStream input ) throws FileNotFoundException
    {
    boolean openedHere = input == null;

    if( openedHere )
      input = new FileInputStream( path );

    boolean created = false;

    try
      {
      TupleEntrySchemeIterator iterator = new TupleEntrySchemeIterator( flowProcess, parent.getScheme(), input, path );

      created = true;

      return iterator;
      }
    finally
      {
      // only release what this method opened, a caller supplied stream stays under the caller's control
      if( openedHere && !created )
        {
        try
          {
          input.close();
          }
        catch( IOException ignored )
          {
          // best effort only, let the original failure propagate
          }
        }
      }
    }
  }