001 /* 002 * Copyright (c) 2007-2014 Concurrent, Inc. All Rights Reserved. 003 * 004 * Project and contact information: http://www.cascading.org/ 005 * 006 * This file is part of the Cascading project. 007 * 008 * Licensed under the Apache License, Version 2.0 (the "License"); 009 * you may not use this file except in compliance with the License. 010 * You may obtain a copy of the License at 011 * 012 * http://www.apache.org/licenses/LICENSE-2.0 013 * 014 * Unless required by applicable law or agreed to in writing, software 015 * distributed under the License is distributed on an "AS IS" BASIS, 016 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 017 * See the License for the specific language governing permissions and 018 * limitations under the License. 019 */ 020 021 package cascading.scheme.local; 022 023 import java.beans.ConstructorProperties; 024 import java.io.IOException; 025 import java.io.InputStream; 026 import java.io.InputStreamReader; 027 import java.io.LineNumberReader; 028 import java.io.OutputStream; 029 import java.io.OutputStreamWriter; 030 import java.io.PrintWriter; 031 import java.io.UnsupportedEncodingException; 032 import java.nio.charset.Charset; 033 import java.util.Properties; 034 035 import cascading.flow.FlowProcess; 036 import cascading.scheme.Scheme; 037 import cascading.scheme.SinkCall; 038 import cascading.scheme.SourceCall; 039 import cascading.tap.Tap; 040 import cascading.tap.TapException; 041 import cascading.tuple.Fields; 042 import cascading.tuple.TupleEntry; 043 044 /** 045 * A TextLine is a type of {@link cascading.scheme.Scheme} for plain text files. Files are broken into 046 * lines. Either line-feed or carriage-return are used to signal end of line. 047 * <p/> 048 * By default, this scheme returns a {@link cascading.tuple.Tuple} with two fields, "num" and "line". Where "num" 049 * is the line number for "line". 050 * <p/> 051 * Many of the constructors take both "sourceFields" and "sinkFields". sourceFields denote the field names 052 * to be used instead of the names "num" and "line". sinkFields is a selector and is by default {@link Fields#ALL}. 053 * Any available field names can be given if only a subset of the incoming fields should be used. 054 * <p/> 055 * If a {@link Fields} instance is passed on the constructor as sourceFields having only one field, the return tuples 056 * will simply be the "line" value using the given field name. 057 * <p/> 058 * Note that TextLine will concatenate all the Tuple values for the selected fields with a TAB delimiter before 059 * writing out the line. 060 * <p/> 061 * By default, all text is encoded/decoded as UTF-8. This can be changed via the {@code charsetName} constructor 062 * argument. 063 */ 064 public class TextLine extends Scheme<Properties, InputStream, OutputStream, LineNumberReader, PrintWriter> 065 { 066 public static final String DEFAULT_CHARSET = "UTF-8"; 067 068 private String charsetName = DEFAULT_CHARSET; 069 070 /** 071 * Creates a new TextLine instance that sources "num" and "line" fields, and sinks all incoming fields, where 072 * "num" is the line number of the line in the input file. 073 */ 074 public TextLine() 075 { 076 super( new Fields( "num", "line" ), Fields.ALL ); 077 } 078 079 /** 080 * Creates a new TextLine instance. If sourceFields has one field, only the text line will be returned in the 081 * subsequent tuples. 082 * 083 * @param sourceFields of Fields 084 */ 085 @ConstructorProperties({"sourceFields"}) 086 public TextLine( Fields sourceFields ) 087 { 088 super( sourceFields ); 089 090 verify( sourceFields ); 091 } 092 093 /** 094 * Creates a new TextLine instance. If sourceFields has one field, only the text line will be returned in the 095 * subsequent tuples. 096 * 097 * @param sourceFields of Fields 098 * @param charsetName of type String 099 */ 100 @ConstructorProperties({"sourceFields", "charsetName"}) 101 public TextLine( Fields sourceFields, String charsetName ) 102 { 103 super( sourceFields ); 104 105 // throws an exception if not found 106 setCharsetName( charsetName ); 107 108 verify( sourceFields ); 109 } 110 111 /** 112 * Creates a new TextLine instance. If sourceFields has one field, only the text line will be returned in the 113 * subsequent tuples. 114 * 115 * @param sourceFields of Fields 116 * @param sinkFields of Fields 117 */ 118 @ConstructorProperties({"sourceFields", "sinkFields"}) 119 public TextLine( Fields sourceFields, Fields sinkFields ) 120 { 121 super( sourceFields, sinkFields ); 122 123 verify( sourceFields ); 124 } 125 126 /** 127 * Creates a new TextLine instance. If sourceFields has one field, only the text line will be returned in the 128 * subsequent tuples. 129 * 130 * @param sourceFields of Fields 131 * @param sinkFields of Fields 132 * @param charsetName of type String 133 */ 134 @ConstructorProperties({"sourceFields", "sinkFields", "charsetName"}) 135 public TextLine( Fields sourceFields, Fields sinkFields, String charsetName ) 136 { 137 super( sourceFields, sinkFields ); 138 139 // throws an exception if not found 140 setCharsetName( charsetName ); 141 142 verify( sourceFields ); 143 } 144 145 private void setCharsetName( String charsetName ) 146 { 147 if( charsetName != null ) 148 this.charsetName = charsetName; 149 150 Charset.forName( this.charsetName ); 151 } 152 153 protected void verify( Fields sourceFields ) 154 { 155 if( sourceFields.size() < 1 || sourceFields.size() > 2 ) 156 throw new IllegalArgumentException( "this scheme requires either one or two source fields, given [" + sourceFields + "]" ); 157 } 158 159 public LineNumberReader createInput( InputStream inputStream ) 160 { 161 try 162 { 163 return new LineNumberReader( new InputStreamReader( inputStream, charsetName ) ); 164 } 165 catch( UnsupportedEncodingException exception ) 166 { 167 throw new TapException( exception ); 168 } 169 } 170 171 public PrintWriter createOutput( OutputStream outputStream ) 172 { 173 try 174 { 175 return new PrintWriter( new OutputStreamWriter( outputStream, charsetName ) ); 176 } 177 catch( UnsupportedEncodingException exception ) 178 { 179 throw new TapException( exception ); 180 } 181 } 182 183 @Override 184 public void presentSourceFields( FlowProcess<Properties> process, Tap tap, Fields fields ) 185 { 186 // do nothing 187 } 188 189 @Override 190 public void presentSinkFields( FlowProcess<Properties> process, Tap tap, Fields fields ) 191 { 192 // do nothing 193 } 194 195 @Override 196 public void sourceConfInit( FlowProcess<Properties> flowProcess, Tap<Properties, InputStream, OutputStream> tap, Properties conf ) 197 { 198 } 199 200 @Override 201 public void sinkConfInit( FlowProcess<Properties> flowProcess, Tap<Properties, InputStream, OutputStream> tap, Properties conf ) 202 { 203 } 204 205 @Override 206 public void sourcePrepare( FlowProcess<Properties> flowProcess, SourceCall<LineNumberReader, InputStream> sourceCall ) throws IOException 207 { 208 sourceCall.setContext( createInput( sourceCall.getInput() ) ); 209 } 210 211 @Override 212 public boolean source( FlowProcess<Properties> flowProcess, SourceCall<LineNumberReader, InputStream> sourceCall ) throws IOException 213 { 214 // first line is 0, this matches offset being zero, so when throwing out the first line for comments 215 int lineNumber = sourceCall.getContext().getLineNumber(); 216 String line = sourceCall.getContext().readLine(); 217 218 if( line == null ) 219 return false; 220 221 TupleEntry incomingEntry = sourceCall.getIncomingEntry(); 222 223 if( getSourceFields().size() == 1 ) 224 { 225 incomingEntry.setObject( 0, line ); 226 } 227 else 228 { 229 incomingEntry.setInteger( 0, lineNumber ); 230 incomingEntry.setString( 1, line ); 231 } 232 233 return true; 234 } 235 236 @Override 237 public void sourceCleanup( FlowProcess<Properties> flowProcess, SourceCall<LineNumberReader, InputStream> sourceCall ) throws IOException 238 { 239 sourceCall.setContext( null ); 240 } 241 242 @Override 243 public void sinkPrepare( FlowProcess<Properties> flowProcess, SinkCall<PrintWriter, OutputStream> sinkCall ) throws IOException 244 { 245 sinkCall.setContext( createOutput( sinkCall.getOutput() ) ); 246 } 247 248 @Override 249 public void sink( FlowProcess<Properties> flowProcess, SinkCall<PrintWriter, OutputStream> sinkCall ) throws IOException 250 { 251 sinkCall.getContext().println( sinkCall.getOutgoingEntry().getTuple().toString() ); 252 } 253 254 @Override 255 public void sinkCleanup( FlowProcess<Properties> flowProcess, SinkCall<PrintWriter, OutputStream> sinkCall ) throws IOException 256 { 257 sinkCall.getContext().flush(); 258 sinkCall.setContext( null ); 259 } 260 }