001    /*
002     * Copyright (c) 2007-2014 Concurrent, Inc. All Rights Reserved.
003     *
004     * Project and contact information: http://www.cascading.org/
005     *
006     * This file is part of the Cascading project.
007     *
008     * Licensed under the Apache License, Version 2.0 (the "License");
009     * you may not use this file except in compliance with the License.
010     * You may obtain a copy of the License at
011     *
012     *     http://www.apache.org/licenses/LICENSE-2.0
013     *
014     * Unless required by applicable law or agreed to in writing, software
015     * distributed under the License is distributed on an "AS IS" BASIS,
016     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
017     * See the License for the specific language governing permissions and
018     * limitations under the License.
019     */
020    
021    package cascading.scheme.local;
022    
023    import java.beans.ConstructorProperties;
024    import java.io.IOException;
025    import java.io.InputStream;
026    import java.io.InputStreamReader;
027    import java.io.LineNumberReader;
028    import java.io.OutputStream;
029    import java.io.OutputStreamWriter;
030    import java.io.PrintWriter;
031    import java.io.UnsupportedEncodingException;
032    import java.nio.charset.Charset;
033    import java.util.Properties;
034    
035    import cascading.flow.FlowProcess;
036    import cascading.scheme.Scheme;
037    import cascading.scheme.SinkCall;
038    import cascading.scheme.SourceCall;
039    import cascading.tap.Tap;
040    import cascading.tap.TapException;
041    import cascading.tuple.Fields;
042    import cascading.tuple.TupleEntry;
043    
044    /**
045     * A TextLine is a type of {@link cascading.scheme.Scheme} for plain text files. Files are broken into
046     * lines. Either line-feed or carriage-return are used to signal end of line.
047     * <p/>
048     * By default, this scheme returns a {@link cascading.tuple.Tuple} with two fields, "num" and "line". Where "num"
049     * is the line number for "line".
050     * <p/>
051     * Many of the constructors take both "sourceFields" and "sinkFields". sourceFields denote the field names
052     * to be used instead of the names "num" and "line". sinkFields is a selector and is by default {@link Fields#ALL}.
053     * Any available field names can be given if only a subset of the incoming fields should be used.
054     * <p/>
055     * If a {@link Fields} instance is passed on the constructor as sourceFields having only one field, the return tuples
056     * will simply be the "line" value using the given field name.
057     * <p/>
058     * Note that TextLine will concatenate all the Tuple values for the selected fields with a TAB delimiter before
059     * writing out the line.
060     * <p/>
061     * By default, all text is encoded/decoded as UTF-8. This can be changed via the {@code charsetName} constructor
062     * argument.
063     */
064    public class TextLine extends Scheme<Properties, InputStream, OutputStream, LineNumberReader, PrintWriter>
065      {
066      public static final String DEFAULT_CHARSET = "UTF-8";
067    
068      private String charsetName = DEFAULT_CHARSET;
069    
070      /**
071       * Creates a new TextLine instance that sources "num" and "line" fields, and sinks all incoming fields, where
072       * "num" is the line number of the line in the input file.
073       */
074      public TextLine()
075        {
076        super( new Fields( "num", "line" ), Fields.ALL );
077        }
078    
079      /**
080       * Creates a new TextLine instance. If sourceFields has one field, only the text line will be returned in the
081       * subsequent tuples.
082       *
083       * @param sourceFields of Fields
084       */
085      @ConstructorProperties({"sourceFields"})
086      public TextLine( Fields sourceFields )
087        {
088        super( sourceFields );
089    
090        verify( sourceFields );
091        }
092    
093      /**
094       * Creates a new TextLine instance. If sourceFields has one field, only the text line will be returned in the
095       * subsequent tuples.
096       *
097       * @param sourceFields of Fields
098       * @param charsetName  of type String
099       */
100      @ConstructorProperties({"sourceFields", "charsetName"})
101      public TextLine( Fields sourceFields, String charsetName )
102        {
103        super( sourceFields );
104    
105        // throws an exception if not found
106        setCharsetName( charsetName );
107    
108        verify( sourceFields );
109        }
110    
111      /**
112       * Creates a new TextLine instance. If sourceFields has one field, only the text line will be returned in the
113       * subsequent tuples.
114       *
115       * @param sourceFields of Fields
116       * @param sinkFields   of Fields
117       */
118      @ConstructorProperties({"sourceFields", "sinkFields"})
119      public TextLine( Fields sourceFields, Fields sinkFields )
120        {
121        super( sourceFields, sinkFields );
122    
123        verify( sourceFields );
124        }
125    
126      /**
127       * Creates a new TextLine instance. If sourceFields has one field, only the text line will be returned in the
128       * subsequent tuples.
129       *
130       * @param sourceFields of Fields
131       * @param sinkFields   of Fields
132       * @param charsetName  of type String
133       */
134      @ConstructorProperties({"sourceFields", "sinkFields", "charsetName"})
135      public TextLine( Fields sourceFields, Fields sinkFields, String charsetName )
136        {
137        super( sourceFields, sinkFields );
138    
139        // throws an exception if not found
140        setCharsetName( charsetName );
141    
142        verify( sourceFields );
143        }
144    
145      private void setCharsetName( String charsetName )
146        {
147        if( charsetName != null )
148          this.charsetName = charsetName;
149    
150        Charset.forName( this.charsetName );
151        }
152    
153      protected void verify( Fields sourceFields )
154        {
155        if( sourceFields.size() < 1 || sourceFields.size() > 2 )
156          throw new IllegalArgumentException( "this scheme requires either one or two source fields, given [" + sourceFields + "]" );
157        }
158    
159      public LineNumberReader createInput( InputStream inputStream )
160        {
161        try
162          {
163          return new LineNumberReader( new InputStreamReader( inputStream, charsetName ) );
164          }
165        catch( UnsupportedEncodingException exception )
166          {
167          throw new TapException( exception );
168          }
169        }
170    
171      public PrintWriter createOutput( OutputStream outputStream )
172        {
173        try
174          {
175          return new PrintWriter( new OutputStreamWriter( outputStream, charsetName ) );
176          }
177        catch( UnsupportedEncodingException exception )
178          {
179          throw new TapException( exception );
180          }
181        }
182    
183      @Override
184      public void presentSourceFields( FlowProcess<Properties> process, Tap tap, Fields fields )
185        {
186        // do nothing
187        }
188    
189      @Override
190      public void presentSinkFields( FlowProcess<Properties> process, Tap tap, Fields fields )
191        {
192        // do nothing
193        }
194    
195      @Override
196      public void sourceConfInit( FlowProcess<Properties> flowProcess, Tap<Properties, InputStream, OutputStream> tap, Properties conf )
197        {
198        }
199    
200      @Override
201      public void sinkConfInit( FlowProcess<Properties> flowProcess, Tap<Properties, InputStream, OutputStream> tap, Properties conf )
202        {
203        }
204    
205      @Override
206      public void sourcePrepare( FlowProcess<Properties> flowProcess, SourceCall<LineNumberReader, InputStream> sourceCall ) throws IOException
207        {
208        sourceCall.setContext( createInput( sourceCall.getInput() ) );
209        }
210    
211      @Override
212      public boolean source( FlowProcess<Properties> flowProcess, SourceCall<LineNumberReader, InputStream> sourceCall ) throws IOException
213        {
214        // first line is 0, this matches offset being zero, so when throwing out the first line for comments
215        int lineNumber = sourceCall.getContext().getLineNumber();
216        String line = sourceCall.getContext().readLine();
217    
218        if( line == null )
219          return false;
220    
221        TupleEntry incomingEntry = sourceCall.getIncomingEntry();
222    
223        if( getSourceFields().size() == 1 )
224          {
225          incomingEntry.setObject( 0, line );
226          }
227        else
228          {
229          incomingEntry.setInteger( 0, lineNumber );
230          incomingEntry.setString( 1, line );
231          }
232    
233        return true;
234        }
235    
236      @Override
237      public void sourceCleanup( FlowProcess<Properties> flowProcess, SourceCall<LineNumberReader, InputStream> sourceCall ) throws IOException
238        {
239        sourceCall.setContext( null );
240        }
241    
242      @Override
243      public void sinkPrepare( FlowProcess<Properties> flowProcess, SinkCall<PrintWriter, OutputStream> sinkCall ) throws IOException
244        {
245        sinkCall.setContext( createOutput( sinkCall.getOutput() ) );
246        }
247    
248      @Override
249      public void sink( FlowProcess<Properties> flowProcess, SinkCall<PrintWriter, OutputStream> sinkCall ) throws IOException
250        {
251        sinkCall.getContext().println( sinkCall.getOutgoingEntry().getTuple().toString() );
252        }
253    
254      @Override
255      public void sinkCleanup( FlowProcess<Properties> flowProcess, SinkCall<PrintWriter, OutputStream> sinkCall ) throws IOException
256        {
257        sinkCall.getContext().flush();
258        sinkCall.setContext( null );
259        }
260      }