001    /*
002     * Copyright (c) 2007-2014 Concurrent, Inc. All Rights Reserved.
003     *
004     * Project and contact information: http://www.cascading.org/
005     *
006     * This file is part of the Cascading project.
007     *
008     * Licensed under the Apache License, Version 2.0 (the "License");
009     * you may not use this file except in compliance with the License.
010     * You may obtain a copy of the License at
011     *
012     *     http://www.apache.org/licenses/LICENSE-2.0
013     *
014     * Unless required by applicable law or agreed to in writing, software
015     * distributed under the License is distributed on an "AS IS" BASIS,
016     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
017     * See the License for the specific language governing permissions and
018     * limitations under the License.
019     */
020    
021    package cascading.tap.local;
022    
import java.beans.ConstructorProperties;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.Properties;
031    
032    import cascading.flow.FlowProcess;
033    import cascading.tap.SinkMode;
034    import cascading.tap.Tap;
035    import cascading.tap.local.io.TapFileOutputStream;
036    import cascading.tap.partition.BasePartitionTap;
037    import cascading.tap.partition.Partition;
038    import cascading.tuple.TupleEntrySchemeCollector;
039    import cascading.tuple.TupleEntrySchemeIterator;
040    
041    /**
042     * Class PartitionTap can be used to write tuple streams out to files and sub-directories based on the values in the
043     * current {@link cascading.tuple.Tuple} instance.
044     * <p/>
045     * The constructor takes a {@link cascading.tap.local.FileTap} {@link cascading.tap.Tap} and a {@link Partition}
046     * implementation. This allows Tuple values at given positions to be used as directory names.
047     * <p/>
048     * {@code openWritesThreshold} limits the number of open files to be output to. This value defaults to 300 files.
049     * Each time the threshold is exceeded, 10% of the least recently used open files will be closed.
050     * <p/>
051     * PartitionTap will populate a given {@code partition} without regard to case of the values being used. Thus
052     * the resulting paths {@code 2012/June/} and {@code 2012/june/} will likely result in two open files into the same
053     * location. Forcing the case to be consistent with a custom Partition implementation or an upstream
054     * {@link cascading.operation.Function} is recommended, see {@link cascading.operation.expression.ExpressionFunction}.
055     */
056    public class PartitionTap extends BasePartitionTap<Properties, InputStream, OutputStream>
057      {
058      /**
059       * Constructor PartitionTap creates a new PartitionTap instance using the given parent {@link cascading.tap.local.FileTap} Tap as the
060       * base path and default {@link cascading.scheme.Scheme}, and the partition.
061       *
062       * @param parent    of type Tap
063       * @param partition of type String
064       */
065      @ConstructorProperties({"parent", "partition"})
066      public PartitionTap( FileTap parent, Partition partition )
067        {
068        this( parent, partition, OPEN_WRITES_THRESHOLD_DEFAULT );
069        }
070    
071      /**
072       * Constructor PartitionTap creates a new PartitionTap instance using the given parent {@link cascading.tap.local.FileTap} Tap as the
073       * base path and default {@link cascading.scheme.Scheme}, and the partition.
074       * <p/>
075       * {@code openWritesThreshold} limits the number of open files to be output to.
076       *
077       * @param parent              of type Hfs
078       * @param partition           of type String
079       * @param openWritesThreshold of type int
080       */
081      @ConstructorProperties({"parent", "partition", "openWritesThreshold"})
082      public PartitionTap( FileTap parent, Partition partition, int openWritesThreshold )
083        {
084        super( parent, partition, openWritesThreshold );
085        }
086    
087      /**
088       * Constructor PartitionTap creates a new PartitionTap instance using the given parent {@link cascading.tap.local.FileTap} Tap as the
089       * base path and default {@link cascading.scheme.Scheme}, and the partition.
090       *
091       * @param parent    of type Tap
092       * @param partition of type String
093       * @param sinkMode  of type SinkMode
094       */
095      @ConstructorProperties({"parent", "partition", "sinkMode"})
096      public PartitionTap( FileTap parent, Partition partition, SinkMode sinkMode )
097        {
098        super( parent, partition, sinkMode );
099        }
100    
101      /**
102       * Constructor PartitionTap creates a new PartitionTap instance using the given parent {@link cascading.tap.local.FileTap} Tap as the
103       * base path and default {@link cascading.scheme.Scheme}, and the partition.
104       * <p/>
105       * {@code keepParentOnDelete}, when set to true, prevents the parent Tap from being deleted when {@link #deleteResource(Object)}
106       * is called, typically an issue when used inside a {@link cascading.cascade.Cascade}.
107       *
108       * @param parent             of type Tap
109       * @param partition          of type String
110       * @param sinkMode           of type SinkMode
111       * @param keepParentOnDelete of type boolean
112       */
113      @ConstructorProperties({"parent", "partition", "sinkMode", "keepParentOnDelete"})
114      public PartitionTap( FileTap parent, Partition partition, SinkMode sinkMode, boolean keepParentOnDelete )
115        {
116        this( parent, partition, sinkMode, keepParentOnDelete, OPEN_WRITES_THRESHOLD_DEFAULT );
117        }
118    
119      /**
120       * Constructor PartitionTap creates a new PartitionTap instance using the given parent {@link cascading.tap.local.FileTap} Tap as the
121       * base path and default {@link cascading.scheme.Scheme}, and the partition.
122       * <p/>
123       * {@code keepParentOnDelete}, when set to true, prevents the parent Tap from being deleted when {@link #deleteResource(Object)}
124       * is called, typically an issue when used inside a {@link cascading.cascade.Cascade}.
125       * <p/>
126       * {@code openWritesThreshold} limits the number of open files to be output to.
127       *
128       * @param parent              of type Tap
129       * @param partition           of type String
130       * @param sinkMode            of type SinkMode
131       * @param keepParentOnDelete  of type boolean
132       * @param openWritesThreshold of type int
133       */
134      @ConstructorProperties({"parent", "partition", "sinkMode", "keepParentOnDelete", "openWritesThreshold"})
135      public PartitionTap( FileTap parent, Partition partition, SinkMode sinkMode, boolean keepParentOnDelete, int openWritesThreshold )
136        {
137        super( parent, partition, sinkMode, keepParentOnDelete, openWritesThreshold );
138        }
139    
140      @Override
141      protected String getCurrentIdentifier( FlowProcess<Properties> flowProcess )
142        {
143        return null;
144        }
145    
146      @Override
147      public boolean deleteResource( Properties conf ) throws IOException
148        {
149        String[] childIdentifiers = ( (FileTap) parent ).getChildIdentifiers( conf, Integer.MAX_VALUE, false );
150    
151        if( childIdentifiers.length == 0 )
152          return true;
153    
154        boolean result = false;
155    
156        for( String childIdentifier : childIdentifiers )
157          result |= new File( childIdentifier ).delete();
158    
159        return result;
160        }
161    
162      @Override
163      protected TupleEntrySchemeCollector createTupleEntrySchemeCollector( FlowProcess<Properties> flowProcess, Tap parent, String path, long sequence ) throws IOException
164        {
165        TapFileOutputStream output = new TapFileOutputStream( parent, path, true ); // always append
166    
167        return new TupleEntrySchemeCollector<Properties, OutputStream>( flowProcess, parent, output );
168        }
169    
170      @Override
171      protected TupleEntrySchemeIterator createTupleEntrySchemeIterator( FlowProcess<Properties> flowProcess, Tap parent, String path, InputStream input ) throws FileNotFoundException
172        {
173        if( input == null )
174          input = new FileInputStream( path );
175    
176        return new TupleEntrySchemeIterator( flowProcess, parent.getScheme(), input, path );
177        }
178      }