001/*
002 * Copyright (c) 2017-2018 Chris K Wensel <chris@wensel.net>. All Rights Reserved.
003 *
004 * Project and contact information: http://www.cascading.org/
005 *
006 * This file is part of the Cascading project.
007 *
008 * Licensed under the Apache License, Version 2.0 (the "License");
009 * you may not use this file except in compliance with the License.
010 * You may obtain a copy of the License at
011 *
012 *     http://www.apache.org/licenses/LICENSE-2.0
013 *
014 * Unless required by applicable law or agreed to in writing, software
015 * distributed under the License is distributed on an "AS IS" BASIS,
016 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
017 * See the License for the specific language governing permissions and
018 * limitations under the License.
019 */
020
021package cascading.local.tap.aws.s3;
022
023import java.io.FilterInputStream;
024import java.io.IOException;
025import java.io.InputStream;
026import java.io.OutputStream;
027import java.io.PipedInputStream;
028import java.io.PipedOutputStream;
029import java.net.URI;
030import java.net.URISyntaxException;
031import java.util.ArrayList;
032import java.util.Iterator;
033import java.util.List;
034import java.util.Properties;
035import java.util.function.Predicate;
036import java.util.regex.Pattern;
037
038import cascading.flow.FlowProcess;
039import cascading.property.PropertyUtil;
040import cascading.scheme.FileFormat;
041import cascading.scheme.Scheme;
042import cascading.tap.SinkMode;
043import cascading.tap.Tap;
044import cascading.tap.local.PartitionTap;
045import cascading.tap.type.FileType;
046import cascading.tap.type.TapWith;
047import cascading.tuple.TupleEntryCollector;
048import cascading.tuple.TupleEntryIterator;
049import cascading.tuple.TupleEntrySchemeCollector;
050import cascading.tuple.TupleEntrySchemeIterator;
051import cascading.util.CloseableIterator;
052import cascading.util.Util;
053import com.amazonaws.ClientConfiguration;
054import com.amazonaws.client.builder.AwsClientBuilder;
055import com.amazonaws.services.s3.AmazonS3;
056import com.amazonaws.services.s3.AmazonS3ClientBuilder;
057import com.amazonaws.services.s3.model.ObjectMetadata;
058import com.amazonaws.services.s3.model.S3ObjectSummary;
059import com.amazonaws.services.s3.transfer.TransferManager;
060import com.amazonaws.services.s3.transfer.TransferManagerBuilder;
061import com.amazonaws.services.s3.transfer.Upload;
062import com.amazonaws.services.s3.transfer.model.UploadResult;
063import org.slf4j.Logger;
064import org.slf4j.LoggerFactory;
065
066import static cascading.util.Util.isEmpty;
067
068/**
069 * Class S3Tap is a Cascading local-mode {@link Tap} providing read and write access to data stored in Amazon S3 buckets.
070 * <p>
071 * This Tap is not intended to be used with any of the other Cascading planners unless they specify they are local-mode
072 * compatible.
073 * <p>
074 * S3Tap can read a single key, all objects underneath a key-prefix, or all objects under a key-prefix that match
075 * a given globbing pattern.
076 * <p>
077 * See the various constructors for the available access parametrizations. Of note are the constructors that take
078 * a {@link URI} instance. The URI should be in the following format:
079 * {@code s3://[bucket]/<key|key-prefix><?glob>}
080 * <p>
 * Where bucket is the only required value. The key references a single object, the key-prefix is used to access
 * a set of objects with a common prefix value. The glob value is used to further narrow the resulting object set.
083 * <p>
084 * The globbing pattern is specified by the {@link java.nio.file.FileSystem#getPathMatcher} method.
085 * <p>
086 * This Tap was designed to allow applications to effectively poll an S3 bucket for new keys to be processed.
087 * <p>
088 * When used with the {@link S3FileCheckpointer} class, a map of keys last consumed by each bucket will be tracked
089 * on disk, with the map surviving JVM restarts allowing for applications to exit and restart safely without
090 * retrieving duplicate data.
091 * <p>
092 * The {@link S3Checkpointer#commit()} method is only called during a graceful shutdown of the Flow or JVM, but every
093 * consumed key is passed to the S3Checkpointer, so custom implementations can choose to persist the key more
094 * frequently.
095 * <p>
096 * AWS Credentials are handled by {@link com.amazonaws.auth.DefaultAWSCredentialsProviderChain}.
097 */
098public class S3Tap extends Tap<Properties, InputStream, OutputStream> implements FileType<Properties>, TapWith<Properties, InputStream, OutputStream>
099  {
  /** Field LOG is the logger for this class. */
  private static final Logger LOG = LoggerFactory.getLogger( S3Tap.class );

  /** Field SEQUENCE_TOKEN is the literal {@code {sequence}} token; NOTE(review): its consumer is not visible in this section. */
  public static final String SEQUENCE_TOKEN = "{sequence}";
  /** Field MIME_DIRECTORY is the {@code application/x-directory} content type; presumably marks directory placeholder objects, verify against usage. */
  public static final String MIME_DIRECTORY = "application/x-directory";
  /** Field DEFAULT_DELIMITER is the initial value of the delimiter field and the delimiter most convenience constructors pass on. */
  public static final String DEFAULT_DELIMITER = "/";

  /** Field s3Client is the client given at construction, may be null. */
  AmazonS3 s3Client = null;
  /** Field bucketName is the S3 bucket name; the constructors reject a null or empty value. */
  String bucketName = null;
  /** Field key is the S3 object key or key-prefix, may be null. */
  String key = null;
  /** Field filter is the optional predicate built from a glob or supplied by the caller, may be null. */
  Predicate<String> filter;
  /** Field delimiter starts as DEFAULT_DELIMITER and is overwritten, possibly with null, by the non-URI constructors. */
  String delimiter = DEFAULT_DELIMITER;
  /** Field checkpointer is the optional S3Checkpointer given at construction, may be null. */
  S3Checkpointer checkpointer;

  // NOTE(review): presumably caches metadata of the referenced object; it is populated outside this section
  private transient ObjectMetadata objectMetadata;
124
125  /**
126   * Method makeURI creates a new S3 URI from the given parameters.
127   *
128   * @param bucketName the S3 bucket name
129   * @param keyPrefix  the S3 object key or key-prefix
130   * @return an URI instance
131   */
132  public static URI makeURI( String bucketName, String keyPrefix )
133    {
134    return makeURI( bucketName, keyPrefix, null );
135    }
136
137  /**
138   * Method makeURI creates a new S3 URI from the given parameters.
139   *
140   * @param bucketName the S3 bucket name
141   * @param keyPrefix  the S3 object key or key-prefix
142   * @param glob       the globbing pattern to apply to the keys
143   * @return an URI instance
144   */
145  public static URI makeURI( String bucketName, String keyPrefix, String glob )
146    {
147    if( bucketName == null )
148      throw new IllegalArgumentException( "bucketName may not be null" );
149
150    try
151      {
152      if( keyPrefix == null )
153        keyPrefix = "/";
154      else if( !keyPrefix.startsWith( "/" ) )
155        keyPrefix = "/" + keyPrefix;
156
157      return new URI( "s3", bucketName, keyPrefix, glob, null );
158      }
159    catch( URISyntaxException exception )
160      {
161      throw new IllegalArgumentException( exception.getMessage(), exception );
162      }
163    }
164
165  /**
166   * Constructor S3Tap creates a new S3Tap instance.
167   *
168   * @param scheme     of Scheme
169   * @param bucketName of String
170   */
171  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, String bucketName )
172    {
173    this( scheme, bucketName, null, null, null, SinkMode.KEEP );
174    }
175
176  /**
177   * Constructor S3Tap creates a new S3Tap instance.
178   *
179   * @param scheme     of Scheme
180   * @param bucketName of String
181   * @param key        of String
182   */
183  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, String bucketName, String key )
184    {
185    this( scheme, bucketName, key, DEFAULT_DELIMITER, SinkMode.KEEP );
186    }
187
188  /**
189   * Constructor S3Tap creates a new S3Tap instance.
190   *
191   * @param scheme     of Scheme
192   * @param bucketName of String
193   * @param key        of String
194   * @param delimiter  of String
195   */
196  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, String bucketName, String key, String delimiter )
197    {
198    this( scheme, null, null, bucketName, key, delimiter, SinkMode.KEEP );
199    }
200
201  /**
202   * Constructor S3Tap creates a new S3Tap instance.
203   *
204   * @param scheme     of Scheme
205   * @param bucketName of String
206   * @param filter     of Predicate
207   */
208  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, String bucketName, Predicate<String> filter )
209    {
210    this( scheme, bucketName, null, filter, SinkMode.KEEP );
211    }
212
213  /**
214   * Constructor S3Tap creates a new S3Tap instance.
215   *
216   * @param scheme     of Scheme
217   * @param bucketName of String
218   * @param key        of String
219   * @param filter     of Predicate
220   */
221  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, String bucketName, String key, Predicate<String> filter )
222    {
223    this( scheme, bucketName, key, DEFAULT_DELIMITER, filter, SinkMode.KEEP );
224    }
225
  /**
   * Constructor S3Tap creates a new S3Tap instance for the given bucket, key, delimiter, and key filter.
   * <p>
   * No client or checkpointer is set, and the sink mode is {@link SinkMode#KEEP}.
   *
   * @param scheme     the {@link Scheme} to use with this Tap
   * @param bucketName the S3 bucket name, may not be null or empty
   * @param key        the S3 object key or key-prefix
   * @param delimiter  the S3 key delimiter
   * @param filter     the {@link Predicate} used to narrow the matched keys
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, String bucketName, String key, String delimiter, Predicate<String> filter )
    {
    this( scheme, null, null, bucketName, key, delimiter, filter, SinkMode.KEEP );
    }
239
240  /**
241   * Constructor S3Tap creates a new S3Tap instance.
242   *
243   * @param scheme     of Scheme
244   * @param s3Client   of AmazonS3
245   * @param bucketName of String
246   */
247  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, String bucketName )
248    {
249    this( scheme, s3Client, bucketName, null, SinkMode.KEEP );
250    }
251
252  /**
253   * Constructor S3Tap creates a new S3Tap instance.
254   *
255   * @param scheme     of Scheme
256   * @param s3Client   of AmazonS3
257   * @param bucketName of String
258   * @param key        of String
259   */
260  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, String bucketName, String key )
261    {
262    this( scheme, s3Client, bucketName, key, DEFAULT_DELIMITER, SinkMode.KEEP );
263    }
264
265  /**
266   * Constructor S3Tap creates a new S3Tap instance.
267   *
268   * @param scheme     of Scheme
269   * @param s3Client   of AmazonS3
270   * @param bucketName of String
271   * @param key        of String
272   * @param delimiter  of String
273   */
274  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, String bucketName, String key, String delimiter )
275    {
276    this( scheme, s3Client, bucketName, key, delimiter, null, SinkMode.KEEP );
277    }
278
  /**
   * Constructor S3Tap creates a new S3Tap instance for the given bucket, key, delimiter, and key filter
   * using the given client.
   * <p>
   * No checkpointer is set, and the sink mode is {@link SinkMode#KEEP}.
   *
   * @param scheme     the {@link Scheme} to use with this Tap
   * @param s3Client   the {@link AmazonS3} client to use, may be null
   * @param bucketName the S3 bucket name, may not be null or empty
   * @param key        the S3 object key or key-prefix
   * @param delimiter  the S3 key delimiter
   * @param filter     the {@link Predicate} used to narrow the matched keys
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, String bucketName, String key, String delimiter, Predicate<String> filter )
    {
    this( scheme, s3Client, null, bucketName, key, delimiter, filter, SinkMode.KEEP );
    }
293
294  /**
295   * Constructor S3Tap creates a new S3Tap instance.
296   *
297   * @param scheme       of Scheme
298   * @param checkpointer of S3Checkpointer
299   * @param bucketName   of String
300   */
301  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, S3Checkpointer checkpointer, String bucketName )
302    {
303    this( scheme, checkpointer, bucketName, null, null, null, SinkMode.KEEP );
304    }
305
306  /**
307   * Constructor S3Tap creates a new S3Tap instance.
308   *
309   * @param scheme       of Scheme
310   * @param checkpointer of S3Checkpointer
311   * @param bucketName   of String
312   * @param key          of String
313   */
314  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, S3Checkpointer checkpointer, String bucketName, String key )
315    {
316    this( scheme, checkpointer, bucketName, key, DEFAULT_DELIMITER, SinkMode.KEEP );
317    }
318
319  /**
320   * Constructor S3Tap creates a new S3Tap instance.
321   *
322   * @param scheme       of Scheme
323   * @param checkpointer of S3Checkpointer
324   * @param bucketName   of String
325   * @param key          of String
326   * @param delimiter    of String
327   */
328  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, S3Checkpointer checkpointer, String bucketName, String key, String delimiter )
329    {
330    this( scheme, null, checkpointer, bucketName, key, delimiter, SinkMode.KEEP );
331    }
332
333  /**
334   * Constructor S3Tap creates a new S3Tap instance.
335   *
336   * @param scheme       of Scheme
337   * @param checkpointer of S3Checkpointer
338   * @param bucketName   of String
339   * @param filter       of Predicate
340   */
341  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, S3Checkpointer checkpointer, String bucketName, Predicate<String> filter )
342    {
343    this( scheme, checkpointer, bucketName, null, filter, SinkMode.KEEP );
344    }
345
346  /**
347   * Constructor S3Tap creates a new S3Tap instance.
348   *
349   * @param scheme       of Scheme
350   * @param checkpointer of S3Checkpointer
351   * @param bucketName   of String
352   * @param key          of String
353   * @param filter       of Predicate
354   */
355  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, S3Checkpointer checkpointer, String bucketName, String key, Predicate<String> filter )
356    {
357    this( scheme, checkpointer, bucketName, key, DEFAULT_DELIMITER, filter, SinkMode.KEEP );
358    }
359
  /**
   * Constructor S3Tap creates a new S3Tap instance for the given bucket, key, delimiter, and key filter
   * using the given checkpointer.
   * <p>
   * No client is set, and the sink mode is {@link SinkMode#KEEP}.
   *
   * @param scheme       the {@link Scheme} to use with this Tap
   * @param checkpointer the {@link S3Checkpointer} to use, may be null
   * @param bucketName   the S3 bucket name, may not be null or empty
   * @param key          the S3 object key or key-prefix
   * @param delimiter    the S3 key delimiter
   * @param filter       the {@link Predicate} used to narrow the matched keys
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, S3Checkpointer checkpointer, String bucketName, String key, String delimiter, Predicate<String> filter )
    {
    this( scheme, null, checkpointer, bucketName, key, delimiter, filter, SinkMode.KEEP );
    }
374
375  /**
376   * Constructor S3Tap creates a new S3Tap instance.
377   *
378   * @param scheme       of Scheme
379   * @param s3Client     of AmazonS3
380   * @param checkpointer of S3Checkpointer
381   * @param bucketName   of String
382   */
383  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, S3Checkpointer checkpointer, String bucketName )
384    {
385    this( scheme, s3Client, checkpointer, bucketName, null, SinkMode.KEEP );
386    }
387
388  /**
389   * Constructor S3Tap creates a new S3Tap instance.
390   *
391   * @param scheme       of Scheme
392   * @param s3Client     of AmazonS3
393   * @param checkpointer of S3Checkpointer
394   * @param bucketName   of String
395   * @param key          of String
396   */
397  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, S3Checkpointer checkpointer, String bucketName, String key )
398    {
399    this( scheme, s3Client, checkpointer, bucketName, key, DEFAULT_DELIMITER, SinkMode.KEEP );
400    }
401
  /**
   * Constructor S3Tap creates a new S3Tap instance for the given bucket, key, and delimiter
   * using the given client and checkpointer.
   * <p>
   * No filter is set, and the sink mode is {@link SinkMode#KEEP}.
   *
   * @param scheme       the {@link Scheme} to use with this Tap
   * @param s3Client     the {@link AmazonS3} client to use, may be null
   * @param checkpointer the {@link S3Checkpointer} to use, may be null
   * @param bucketName   the S3 bucket name, may not be null or empty
   * @param key          the S3 object key or key-prefix
   * @param delimiter    the S3 key delimiter
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, S3Checkpointer checkpointer, String bucketName, String key, String delimiter )
    {
    this( scheme, s3Client, checkpointer, bucketName, key, delimiter, null, SinkMode.KEEP );
    }
416
  /**
   * Constructor S3Tap creates a new S3Tap instance for the given bucket, key, delimiter, and key filter
   * using the given client and checkpointer.
   * <p>
   * The sink mode is {@link SinkMode#KEEP}.
   *
   * @param scheme       the {@link Scheme} to use with this Tap
   * @param s3Client     the {@link AmazonS3} client to use, may be null
   * @param checkpointer the {@link S3Checkpointer} to use, may be null
   * @param bucketName   the S3 bucket name, may not be null or empty
   * @param key          the S3 object key or key-prefix
   * @param delimiter    the S3 key delimiter
   * @param filter       the {@link Predicate} used to narrow the matched keys
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, S3Checkpointer checkpointer, String bucketName, String key, String delimiter, Predicate<String> filter )
    {
    this( scheme, s3Client, checkpointer, bucketName, key, delimiter, filter, SinkMode.KEEP );
    }
432
433  /**
434   * Constructor S3Tap creates a new S3Tap instance.
435   *
436   * @param scheme     of Scheme
437   * @param bucketName of String
438   */
439  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, String bucketName, SinkMode sinkMode )
440    {
441    this( scheme, bucketName, null, null, null, sinkMode );
442    }
443
444  /**
445   * Constructor S3Tap creates a new S3Tap instance.
446   *
447   * @param scheme     of Scheme
448   * @param bucketName of String
449   * @param key        of String
450   */
451  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, String bucketName, String key, SinkMode sinkMode )
452    {
453    this( scheme, bucketName, key, DEFAULT_DELIMITER );
454    }
455
456  /**
457   * Constructor S3Tap creates a new S3Tap instance.
458   *
459   * @param scheme     of Scheme
460   * @param bucketName of String
461   * @param key        of String
462   * @param delimiter  of String
463   */
464  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, String bucketName, String key, String delimiter, SinkMode sinkMode )
465    {
466    this( scheme, null, null, bucketName, key, delimiter, sinkMode );
467    }
468
469  /**
470   * Constructor S3Tap creates a new S3Tap instance.
471   *
472   * @param scheme     of Scheme
473   * @param bucketName of String
474   * @param filter     of Predicate
475   */
476  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, String bucketName, Predicate<String> filter, SinkMode sinkMode )
477    {
478    this( scheme, bucketName, null, filter, sinkMode );
479    }
480
481  /**
482   * Constructor S3Tap creates a new S3Tap instance.
483   *
484   * @param scheme     of Scheme
485   * @param bucketName of String
486   * @param key        of String
487   * @param filter     of Predicate
488   */
489  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, String bucketName, String key, Predicate<String> filter, SinkMode sinkMode )
490    {
491    this( scheme, bucketName, key, DEFAULT_DELIMITER, filter, sinkMode );
492    }
493
  /**
   * Constructor S3Tap creates a new S3Tap instance for the given bucket, key, delimiter, key filter,
   * and sink mode.
   * <p>
   * No client or checkpointer is set.
   *
   * @param scheme     the {@link Scheme} to use with this Tap
   * @param bucketName the S3 bucket name, may not be null or empty
   * @param key        the S3 object key or key-prefix
   * @param delimiter  the S3 key delimiter
   * @param filter     the {@link Predicate} used to narrow the matched keys
   * @param sinkMode   the {@link SinkMode} for this Tap
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, String bucketName, String key, String delimiter, Predicate<String> filter, SinkMode sinkMode )
    {
    this( scheme, null, null, bucketName, key, delimiter, filter, sinkMode );
    }
508
509  /**
510   * Constructor S3Tap creates a new S3Tap instance.
511   *
512   * @param scheme     of Scheme
513   * @param s3Client   of AmazonS3
514   * @param bucketName of String
515   * @param sinkMode   of SinkMode
516   */
517  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, String bucketName, SinkMode sinkMode )
518    {
519    this( scheme, s3Client, bucketName, null, sinkMode );
520    }
521
522  /**
523   * Constructor S3Tap creates a new S3Tap instance.
524   *
525   * @param scheme     of Scheme
526   * @param s3Client   of AmazonS3
527   * @param bucketName of String
528   * @param key        of String
529   * @param sinkMode   of SinkMode
530   */
531  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, String bucketName, String key, SinkMode sinkMode )
532    {
533    this( scheme, s3Client, bucketName, key, DEFAULT_DELIMITER, sinkMode );
534    }
535
536  /**
537   * Constructor S3Tap creates a new S3Tap instance.
538   *
539   * @param scheme     of Scheme
540   * @param s3Client   of AmazonS3
541   * @param bucketName of String
542   * @param key        of String
543   * @param delimiter  of String
544   * @param sinkMode   of SinkMode
545   */
546  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, String bucketName, String key, String delimiter, SinkMode sinkMode )
547    {
548    this( scheme, s3Client, bucketName, key, delimiter, null, sinkMode );
549    }
550
  /**
   * Constructor S3Tap creates a new S3Tap instance for the given bucket, key, delimiter, key filter,
   * and sink mode using the given client.
   * <p>
   * No checkpointer is set.
   *
   * @param scheme     the {@link Scheme} to use with this Tap
   * @param s3Client   the {@link AmazonS3} client to use, may be null
   * @param bucketName the S3 bucket name, may not be null or empty
   * @param key        the S3 object key or key-prefix
   * @param delimiter  the S3 key delimiter
   * @param filter     the {@link Predicate} used to narrow the matched keys
   * @param sinkMode   the {@link SinkMode} for this Tap
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, String bucketName, String key, String delimiter, Predicate<String> filter, SinkMode sinkMode )
    {
    this( scheme, s3Client, null, bucketName, key, delimiter, filter, sinkMode );
    }
566
567  /**
568   * Constructor S3Tap creates a new S3Tap instance.
569   *
570   * @param scheme       of Scheme
571   * @param checkpointer of S3Checkpointer
572   * @param bucketName   of String
573   * @param sinkMode     of SinkMode
574   */
575  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, S3Checkpointer checkpointer, String bucketName, SinkMode sinkMode )
576    {
577    this( scheme, checkpointer, bucketName, null, null, null, sinkMode );
578    }
579
580  /**
581   * Constructor S3Tap creates a new S3Tap instance.
582   *
583   * @param scheme       of Scheme
584   * @param checkpointer of S3Checkpointer
585   * @param bucketName   of String
586   * @param key          of String
587   * @param sinkMode     of SinkMode
588   */
589  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, S3Checkpointer checkpointer, String bucketName, String key, SinkMode sinkMode )
590    {
591    this( scheme, checkpointer, bucketName, key, DEFAULT_DELIMITER, sinkMode );
592    }
593
594  /**
595   * Constructor S3Tap creates a new S3Tap instance.
596   *
597   * @param scheme       of Scheme
598   * @param checkpointer of S3Checkpointer
599   * @param bucketName   of String
600   * @param key          of String
601   * @param delimiter    of String
602   * @param sinkMode     of SinkMode
603   */
604  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, S3Checkpointer checkpointer, String bucketName, String key, String delimiter, SinkMode sinkMode )
605    {
606    this( scheme, null, checkpointer, bucketName, key, delimiter, sinkMode );
607    }
608
609  /**
610   * Constructor S3Tap creates a new S3Tap instance.
611   *
612   * @param scheme       of Scheme
613   * @param checkpointer of S3Checkpointer
614   * @param bucketName   of String
615   * @param filter       of Predicate
616   * @param sinkMode     of SinkMode
617   */
618  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, S3Checkpointer checkpointer, String bucketName, Predicate<String> filter, SinkMode sinkMode )
619    {
620    this( scheme, checkpointer, bucketName, null, filter, sinkMode );
621    }
622
623  /**
624   * Constructor S3Tap creates a new S3Tap instance.
625   *
626   * @param scheme       of Scheme
627   * @param checkpointer of S3Checkpointer
628   * @param bucketName   of String
629   * @param key          of String
630   * @param filter       of Predicate
631   * @param sinkMode     of SinkMode
632   */
633  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, S3Checkpointer checkpointer, String bucketName, String key, Predicate<String> filter, SinkMode sinkMode )
634    {
635    this( scheme, checkpointer, bucketName, key, DEFAULT_DELIMITER, filter, sinkMode );
636    }
637
  /**
   * Constructor S3Tap creates a new S3Tap instance for the given bucket, key, delimiter, key filter,
   * and sink mode using the given checkpointer.
   * <p>
   * No client is set.
   *
   * @param scheme       the {@link Scheme} to use with this Tap
   * @param checkpointer the {@link S3Checkpointer} to use, may be null
   * @param bucketName   the S3 bucket name, may not be null or empty
   * @param key          the S3 object key or key-prefix
   * @param delimiter    the S3 key delimiter
   * @param filter       the {@link Predicate} used to narrow the matched keys
   * @param sinkMode     the {@link SinkMode} for this Tap
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, S3Checkpointer checkpointer, String bucketName, String key, String delimiter, Predicate<String> filter, SinkMode sinkMode )
    {
    this( scheme, null, checkpointer, bucketName, key, delimiter, filter, sinkMode );
    }
653
654  /**
655   * Constructor S3Tap creates a new S3Tap instance.
656   *
657   * @param scheme       of Scheme
658   * @param s3Client     of AmazonS3
659   * @param checkpointer of S3Checkpointer
660   * @param bucketName   of String
661   * @param sinkMode     of SinkMode
662   */
663  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, S3Checkpointer checkpointer, String bucketName, SinkMode sinkMode )
664    {
665    this( scheme, s3Client, checkpointer, bucketName, null, sinkMode );
666    }
667
668  /**
669   * Constructor S3Tap creates a new S3Tap instance.
670   *
671   * @param scheme       of Scheme
672   * @param s3Client     of AmazonS3
673   * @param checkpointer of S3Checkpointer
674   * @param bucketName   of String
675   * @param key          of String
676   * @param sinkMode     of SinkMode
677   */
678  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, S3Checkpointer checkpointer, String bucketName, String key, SinkMode sinkMode )
679    {
680    this( scheme, s3Client, checkpointer, bucketName, key, DEFAULT_DELIMITER, sinkMode );
681    }
682
  /**
   * Constructor S3Tap creates a new S3Tap instance for the given bucket, key, delimiter, and sink mode
   * using the given client and checkpointer.
   * <p>
   * No filter is set.
   *
   * @param scheme       the {@link Scheme} to use with this Tap
   * @param s3Client     the {@link AmazonS3} client to use, may be null
   * @param checkpointer the {@link S3Checkpointer} to use, may be null
   * @param bucketName   the S3 bucket name, may not be null or empty
   * @param key          the S3 object key or key-prefix
   * @param delimiter    the S3 key delimiter
   * @param sinkMode     the {@link SinkMode} for this Tap
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, S3Checkpointer checkpointer, String bucketName, String key, String delimiter, SinkMode sinkMode )
    {
    this( scheme, s3Client, checkpointer, bucketName, key, delimiter, null, sinkMode );
    }
698
699  /**
700   * Constructor S3Tap creates a new S3Tap instance.
701   *
702   * @param scheme       of Scheme
703   * @param s3Client     of AmazonS3
704   * @param checkpointer of S3Checkpointer
705   * @param bucketName   of String
706   * @param key          of String
707   * @param delimiter    of String
708   * @param filter       of Predicate
709   * @param sinkMode     of SinkMode
710   */
711  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, S3Checkpointer checkpointer, String bucketName, String key, String delimiter, Predicate<String> filter, SinkMode sinkMode )
712    {
713    super( scheme, sinkMode );
714    this.s3Client = s3Client;
715    this.checkpointer = checkpointer;
716    this.bucketName = bucketName;
717
718    if( isEmpty( this.bucketName ) )
719      throw new IllegalArgumentException( "bucket name may not be null or empty" );
720
721    this.key = key;
722    this.delimiter = delimiter;
723    this.filter = filter;
724    }
725
  /**
   * Constructor S3Tap creates a new S3Tap instance for the given S3 URI.
   * <p>
   * No client or checkpointer is set, and the sink mode is {@link SinkMode#KEEP}.
   *
   * @param scheme     the {@link Scheme} to use with this Tap
   * @param identifier the S3 {@link URI}, in the form {@code s3://[bucket]/<key|key-prefix><?glob>}
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, URI identifier )
    {
    this( scheme, null, null, identifier, SinkMode.KEEP );
    }
736
  /**
   * Constructor S3Tap creates a new S3Tap instance for the given S3 URI using the given client.
   * <p>
   * No checkpointer is set, and the sink mode is {@link SinkMode#KEEP}.
   *
   * @param scheme     the {@link Scheme} to use with this Tap
   * @param s3Client   the {@link AmazonS3} client to use, may be null
   * @param identifier the S3 {@link URI}, in the form {@code s3://[bucket]/<key|key-prefix><?glob>}
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, URI identifier )
    {
    this( scheme, s3Client, null, identifier, SinkMode.KEEP );
    }
748
  /**
   * Constructor S3Tap creates a new S3Tap instance for the given S3 URI using the given checkpointer.
   * <p>
   * No client is set, and the sink mode is {@link SinkMode#KEEP}.
   *
   * @param scheme       the {@link Scheme} to use with this Tap
   * @param checkpointer the {@link S3Checkpointer} to use, may be null
   * @param identifier   the S3 {@link URI}, in the form {@code s3://[bucket]/<key|key-prefix><?glob>}
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, S3Checkpointer checkpointer, URI identifier )
    {
    this( scheme, null, checkpointer, identifier, SinkMode.KEEP );
    }
760
  /**
   * Constructor S3Tap creates a new S3Tap instance for the given S3 URI using the given client
   * and checkpointer.
   * <p>
   * The sink mode is {@link SinkMode#KEEP}.
   *
   * @param scheme       the {@link Scheme} to use with this Tap
   * @param s3Client     the {@link AmazonS3} client to use, may be null
   * @param checkpointer the {@link S3Checkpointer} to use, may be null
   * @param identifier   the S3 {@link URI}, in the form {@code s3://[bucket]/<key|key-prefix><?glob>}
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, S3Checkpointer checkpointer, URI identifier )
    {
    this( scheme, s3Client, checkpointer, identifier, SinkMode.KEEP );
    }
773
  /**
   * Constructor S3Tap creates a new S3Tap instance for the given S3 URI and sink mode.
   * <p>
   * No client or checkpointer is set.
   *
   * @param scheme     the {@link Scheme} to use with this Tap
   * @param identifier the S3 {@link URI}, in the form {@code s3://[bucket]/<key|key-prefix><?glob>}
   * @param sinkMode   the {@link SinkMode} for this Tap
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, URI identifier, SinkMode sinkMode )
    {
    this( scheme, null, null, identifier, sinkMode );
    }
785
  /**
   * Constructor S3Tap creates a new S3Tap instance for the given S3 URI and sink mode using the given client.
   * <p>
   * No checkpointer is set.
   *
   * @param scheme     the {@link Scheme} to use with this Tap
   * @param s3Client   the {@link AmazonS3} client to use, may be null
   * @param identifier the S3 {@link URI}, in the form {@code s3://[bucket]/<key|key-prefix><?glob>}
   * @param sinkMode   the {@link SinkMode} for this Tap
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, URI identifier, SinkMode sinkMode )
    {
    this( scheme, s3Client, null, identifier, sinkMode );
    }
798
  /**
   * Constructor S3Tap creates a new S3Tap instance for the given S3 URI and sink mode
   * using the given checkpointer.
   * <p>
   * No client is set.
   *
   * @param scheme       the {@link Scheme} to use with this Tap
   * @param checkpointer the {@link S3Checkpointer} to use, may be null
   * @param identifier   the S3 {@link URI}, in the form {@code s3://[bucket]/<key|key-prefix><?glob>}
   * @param sinkMode     the {@link SinkMode} for this Tap
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, S3Checkpointer checkpointer, URI identifier, SinkMode sinkMode )
    {
    this( scheme, null, checkpointer, identifier, sinkMode );
    }
811
812  /**
813   * Constructor S3Tap creates a new S3Tap instance.
814   *
815   * @param scheme       of Scheme
816   * @param s3Client     of AmazonS3
817   * @param checkpointer of S3Checkpointer
818   * @param identifier   of URI
819   * @param sinkMode     of SinkMode
820   */
821  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, S3Checkpointer checkpointer, URI identifier, SinkMode sinkMode )
822    {
823    super( scheme, sinkMode );
824    this.s3Client = s3Client;
825    this.checkpointer = checkpointer;
826
827    if( identifier == null )
828      throw new IllegalArgumentException( "identifier may not be null" );
829
830    if( !identifier.getScheme().equalsIgnoreCase( "s3" ) )
831      throw new IllegalArgumentException( "identifier does not have s3 scheme" );
832
833    this.bucketName = getBucketNameFor( identifier );
834
835    if( isEmpty( this.bucketName ) )
836      throw new IllegalArgumentException( "bucket name may not be null or empty" + identifier );
837
838    this.key = cleanKey( identifier );
839
840    if( identifier.getQuery() != null )
841      filter = globPredicate( identifier.getQuery() );
842    }
843
844  protected String getBucketNameFor( URI identifier )
845    {
846    String authority = identifier.getAuthority();
847
848    if( isEmpty( authority ) )
849      throw new IllegalArgumentException( "identifier must have an authority: " + identifier );
850
851    int pos = authority.indexOf( '@' );
852
853    if( pos != -1 )
854      return authority.substring( pos + 1 );
855
856    return authority;
857    }
858
859  private static Predicate<String> globPredicate( String glob )
860    {
861    String regex = getRegexForGlob( glob );
862    Pattern pattern = Pattern.compile( regex );
863
864    return string -> pattern.matcher( string ).matches();
865    }
866
867  private static String getRegexForGlob( String glob )
868    {
869    return (String) Util.invokeStaticMethod(
870      "sun.nio.fs.Globs",
871      "toUnixRegexPattern",
872      new Object[]{glob},
873      new Class[]{String.class}
874    );
875    }
876
  /**
   * Method withScheme returns a new S3Tap that uses the given scheme and otherwise re-uses the S3 client,
   * bucket name, key, delimiter, filter and sink mode of this instance.
   * <p>
   * NOTE(review): the checkpointer is not among the values handed to the new instance — confirm this is intended.
   *
   * @param scheme of Scheme
   * @return a new S3Tap instance
   */
  @Override
  public TapWith<Properties, InputStream, OutputStream> withScheme( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme )
    {
    // don't lazily create s3Client
    return new S3Tap( scheme, s3Client, getBucketName(), getKey(), getDelimiter(), getFilter(), getSinkMode() );
    }
883
884  @Override
885  public TapWith<Properties, InputStream, OutputStream> withChildIdentifier( String identifier )
886    {
887    URI uri;
888
889    if( identifier.startsWith( "s3://" ) )
890      uri = URI.create( identifier );
891    else if( identifier.startsWith( getBucketName() ) )
892      uri = makeURI( identifier, null );
893    else
894      uri = makeURI( getBucketName(), getKey() + ( identifier.startsWith( delimiter ) ? identifier : delimiter + identifier ) );
895
896    // don't lazily create s3Client
897    return new S3Tap( getScheme(), s3Client, uri, getSinkMode() );
898    }
899
  /**
   * Method withSinkMode returns a new S3Tap that uses the given sink mode and otherwise re-uses the scheme,
   * S3 client, bucket name, key, delimiter and filter of this instance.
   *
   * @param sinkMode of SinkMode
   * @return a new S3Tap instance
   */
  @Override
  public TapWith<Properties, InputStream, OutputStream> withSinkMode( SinkMode sinkMode )
    {
    // don't lazily create s3Client
    return new S3Tap( getScheme(), s3Client, getBucketName(), getKey(), getDelimiter(), getFilter(), sinkMode );
    }
906
907  protected String cleanKey( URI identifier )
908    {
909    String path = identifier.getPath();
910
911    if( path.startsWith( "/" ) )
912      path = path.substring( 1 );
913
914    return path;
915    }
916
917  protected AmazonS3 getS3Client( Properties properties )
918    {
919    // return provided client
920    if( s3Client != null )
921      return s3Client;
922
923    AmazonS3ClientBuilder standard = AmazonS3ClientBuilder.standard();
924
925    if( properties != null )
926      {
927      String endpoint = properties.getProperty( S3TapProps.S3_ENDPOINT );
928      String region = properties.getProperty( S3TapProps.S3_REGION, "us-east-1" );
929
930      if( properties.containsKey( S3TapProps.S3_PROXY_HOST ) )
931        {
932        ClientConfiguration config = new ClientConfiguration()
933          .withProxyHost( properties.getProperty( S3TapProps.S3_PROXY_HOST ) )
934          .withProxyPort( PropertyUtil.getIntProperty( properties, S3TapProps.S3_PROXY_PORT, -1 ) );
935
936        standard.withClientConfiguration( config );
937        }
938
939      if( endpoint != null )
940        standard.withEndpointConfiguration( new AwsClientBuilder.EndpointConfiguration( endpoint, region ) );
941      else
942        standard.setRegion( region );
943
944      if( Boolean.parseBoolean( properties.getProperty( S3TapProps.S3_PATH_STYLE_ACCESS, "false" ) ) )
945        standard.enablePathStyleAccess();
946      }
947
948    return standard.build();
949    }
950
  /**
   * Method getCheckpointer returns the checkpointer of this S3Tap object.
   *
   * @return the checkpointer (type S3Checkpointer) of this S3Tap object, may be null if none was given.
   */
  public S3Checkpointer getCheckpointer()
    {
    return checkpointer;
    }
960
  /**
   * Method getBucketName returns the bucketName of this S3Tap object.
   *
   * @return the bucketName (type String) of this S3Tap object, without any scheme or key.
   */
  public String getBucketName()
    {
    return bucketName;
    }
970
  /**
   * Method getKey returns the key of this S3Tap object.
   *
   * @return the key (type String) of this S3Tap object, relative to the bucket.
   */
  public String getKey()
    {
    return key;
    }
980
981  protected String getMarker()
982    {
983    if( checkpointer != null )
984      return checkpointer.getLastKey( getBucketName() );
985
986    return null;
987    }
988
989  protected void setLastMarker( String marker )
990    {
991    if( checkpointer != null )
992      checkpointer.setLastKey( getBucketName(), marker );
993    }
994
995  protected void commitMarker()
996    {
997    if( checkpointer != null )
998      checkpointer.commit();
999    }
1000
  /**
   * Method getFilter returns the filter of this S3Tap object.
   *
   * @return the filter (type Predicate) of this S3Tap object, may be null, e.g. when the tap was created
   * from a URI without a query part.
   */
  public Predicate<String> getFilter()
    {
    return filter;
    }
1010
  /**
   * Method getDelimiter returns the delimiter of this S3Tap object.
   *
   * @return the delimiter (type String) of this S3Tap object, used when composing child keys and listing children.
   */
  public String getDelimiter()
    {
    return delimiter;
    }
1020
  /**
   * Method getIdentifier returns the identifier of this S3Tap object, composed from its bucket name and key.
   *
   * @return the identifier (type String) of this S3Tap object.
   */
  @Override
  public String getIdentifier()
    {
    return makeStringIdentifier( getBucketName(), getKey() );
    }
1026
  /**
   * Method getFullIdentifier returns the same value as {@link #getIdentifier()}.
   *
   * @param conf of Properties, not used by this implementation
   * @return the identifier (type String) of this S3Tap object.
   */
  @Override
  public String getFullIdentifier( Properties conf )
    {
    return getIdentifier();
    }
1032
1033  @Override
1034  public boolean deleteResource( Properties conf ) throws IOException
1035    {
1036    AmazonS3 s3Client = getS3Client( conf );
1037
1038    s3Client.deleteObject( getBucketName(), getKey() );
1039
1040    return true;
1041    }
1042
1043  @Override
1044  public boolean createResource( Properties conf ) throws IOException
1045    {
1046    AmazonS3 s3Client = getS3Client( conf );
1047
1048    s3Client.putObject( getBucketName(), getKey(), "" );
1049
1050    return true;
1051    }
1052
1053  protected ObjectMetadata getObjectMetadata( Properties conf )
1054    {
1055    if( objectMetadata == null )
1056      objectMetadata = getS3Client( conf ).getObjectMetadata( getBucketName(), getKey() );
1057
1058    return objectMetadata;
1059    }
1060
  /**
   * Class CheckedFilterInputStream is a FilterInputStream with a public constructor (the constructor of
   * {@link FilterInputStream} itself is protected). It adds no behavior of its own.
   */
  private class CheckedFilterInputStream extends FilterInputStream
    {
    /**
     * Constructor CheckedFilterInputStream creates a new instance wrapping the given stream.
     *
     * @param inputStream of InputStream
     */
    public CheckedFilterInputStream( InputStream inputStream )
      {
      super( inputStream );
      }
    }
1068
1069  @Override
1070  public TupleEntryIterator openForRead( FlowProcess<? extends Properties> flowProcess, InputStream input ) throws IOException
1071    {
1072    AmazonS3 s3Client = getS3Client( flowProcess.getConfig() );
1073
1074    final String[] identifier = new String[ 1 ];
1075
1076    CloseableIterator<InputStream> iterator = new CloseableIterator<InputStream>()
1077      {
1078      S3Iterable iterable = S3Iterable.iterable( s3Client, getBucketName(), getKey() )
1079        .withFilter( getFilter() )
1080        .withMarker( getMarker() );
1081
1082      Iterator<S3ObjectSummary> iterator = iterable.iterator();
1083      InputStream lastInputStream;
1084
1085      @Override
1086      public boolean hasNext()
1087        {
1088        return iterator.hasNext();
1089        }
1090
1091      @Override
1092      public InputStream next()
1093        {
1094        safeClose();
1095
1096        S3ObjectSummary objectSummary = iterator.next();
1097
1098        identifier[ 0 ] = makeStringIdentifier( objectSummary.getBucketName(), objectSummary.getKey() );
1099
1100        flowProcess.getFlowProcessContext().setSourcePath( identifier[ 0 ] );
1101
1102        if( LOG.isDebugEnabled() )
1103          LOG.debug( "s3 retrieving: {}/{}, with size: {}", objectSummary.getBucketName(), objectSummary.getKey(), objectSummary.getSize() );
1104
1105        // getObject does not seem to fill the InputStream, nor does the InputStream support marking
1106        // may make sense to wrap this iterator in a iterate ahead iterator that attempts to pre-fetch objects in a different thread
1107        lastInputStream = new CheckedFilterInputStream( s3Client.getObject( objectSummary.getBucketName(), objectSummary.getKey() ).getObjectContent() )
1108          {
1109          @Override
1110          public void close() throws IOException
1111            {
1112            setLastMarker( objectSummary.getKey() );
1113            super.close();
1114            }
1115          };
1116
1117        return lastInputStream;
1118        }
1119
1120      private void safeClose()
1121        {
1122        try
1123          {
1124          if( lastInputStream != null )
1125            lastInputStream.close();
1126
1127          lastInputStream = null;
1128          }
1129        catch( IOException exception )
1130          {
1131          // do nothing
1132          }
1133        }
1134
1135      @Override
1136      public void close()
1137        {
1138        safeClose();
1139        commitMarker();
1140        }
1141      };
1142
1143    return new TupleEntrySchemeIterator<Properties, InputStream>( flowProcess, this, getScheme(), iterator, () -> identifier[ 0 ] );
1144    }
1145
1146  @Override
1147  public TupleEntryCollector openForWrite( FlowProcess<? extends Properties> flowProcess, OutputStream outputStream ) throws IOException
1148    {
1149    AmazonS3 s3Client = getS3Client( flowProcess.getConfig() );
1150
1151    if( !s3Client.doesBucketExist( getBucketName() ) )
1152      s3Client.createBucket( getBucketName() );
1153
1154    PipedInputStream pipedInputStream = new PipedInputStream();
1155    PipedOutputStream pipedOutputStream = new PipedOutputStream( pipedInputStream );
1156
1157    TransferManager transferManager = TransferManagerBuilder.standard().withS3Client( s3Client ).build();
1158
1159    ObjectMetadata metadata = new ObjectMetadata();
1160
1161    if( LOG.isDebugEnabled() )
1162      LOG.debug( "starting upload: {}", getIdentifier() );
1163
1164    final String key = resolveKey( flowProcess, getKey() );
1165
1166    Upload upload = transferManager.upload( getBucketName(), key, pipedInputStream, metadata );
1167
1168    return new TupleEntrySchemeCollector<Properties, OutputStream>( flowProcess, this, getScheme(), pipedOutputStream, makeStringIdentifier( getBucketName(), key ) )
1169      {
1170      @Override
1171      public void close()
1172        {
1173        super.close();
1174
1175        try
1176          {
1177          UploadResult uploadResult = upload.waitForUploadResult();
1178
1179          if( uploadResult != null )
1180            {
1181            if( LOG.isDebugEnabled() )
1182              LOG.debug( "completed upload: {}, with key: {}", getIdentifier(), uploadResult.getKey() );
1183            }
1184          }
1185        catch( InterruptedException exception )
1186          {
1187          // ignore
1188          }
1189        finally
1190          {
1191          transferManager.shutdownNow( false );
1192          }
1193        }
1194      };
1195    }
1196
1197  protected String resolveKey( FlowProcess<? extends Properties> flowProcess, String key )
1198    {
1199    int partNum = flowProcess.getIntegerProperty( PartitionTap.PART_NUM_PROPERTY, 0 );
1200
1201    key = key.replace( SEQUENCE_TOKEN, String.format( "%05d", partNum ) );
1202
1203    if( getScheme() instanceof FileFormat )
1204      return key + "." + ( (FileFormat) getScheme() ).getExtension();
1205
1206    return key;
1207    }
1208
1209  @Override
1210  public boolean resourceExists( Properties conf ) throws IOException
1211    {
1212    if( getKey() == null )
1213      return getS3Client( conf ).doesBucketExist( getBucketName() );
1214
1215    return getKey().endsWith( "/" ) || getS3Client( conf ).doesObjectExist( getBucketName(), getKey() );
1216    }
1217
1218  @Override
1219  public long getModifiedTime( Properties conf ) throws IOException
1220    {
1221    return getObjectMetadata( conf ).getLastModified().getTime();
1222    }
1223
1224  @Override
1225  public boolean isDirectory( FlowProcess<? extends Properties> flowProcess ) throws IOException
1226    {
1227    return MIME_DIRECTORY.equalsIgnoreCase( getObjectMetadata( flowProcess.getConfig() ).getContentType() );
1228    }
1229
1230  @Override
1231  public boolean isDirectory( Properties conf ) throws IOException
1232    {
1233    return isDirectory( FlowProcess.nullFlowProcess() );
1234    }
1235
  /**
   * Method getChildIdentifiers returns the child identifiers of this tap, using the configuration of the
   * given flow process.
   *
   * @param flowProcess of FlowProcess
   * @return the child identifiers (type String[])
   * @throws IOException declared by the overridden method
   */
  @Override
  public String[] getChildIdentifiers( FlowProcess<? extends Properties> flowProcess ) throws IOException
    {
    return getChildIdentifiers( flowProcess.getConfig() );
    }
1241
  /**
   * Method getChildIdentifiers returns the child identifiers of this tap, one level deep and not fully qualified.
   *
   * @param conf of Properties
   * @return the child identifiers (type String[])
   * @throws IOException declared by the overridden method
   */
  @Override
  public String[] getChildIdentifiers( Properties conf ) throws IOException
    {
    return getChildIdentifiers( conf, 1, false );
    }
1247
  /**
   * Method getChildIdentifiers returns the child identifiers of this tap down to the given depth, using the
   * configuration of the given flow process.
   *
   * @param flowProcess    of FlowProcess
   * @param depth          of int
   * @param fullyQualified of boolean
   * @return the child identifiers (type String[])
   * @throws IOException declared by the overridden method
   */
  @Override
  public String[] getChildIdentifiers( FlowProcess<? extends Properties> flowProcess, int depth, boolean fullyQualified ) throws IOException
    {
    return getChildIdentifiers( flowProcess.getConfig(), depth, fullyQualified );
    }
1253
1254  @Override
1255  public String[] getChildIdentifiers( Properties conf, int depth, boolean fullyQualified ) throws IOException
1256    {
1257    if( !resourceExists( conf ) )
1258      return new String[ 0 ];
1259
1260    S3Iterable objects = S3Iterable.iterable( getS3Client( conf ), getBucketName(), getKey() )
1261      .withDelimiter( getDelimiter() )
1262      .withMaxDepth( depth )
1263      .withFilter( getFilter() )
1264      .withMarker( getMarker() );
1265
1266    Iterator<S3ObjectSummary> iterator = objects.iterator();
1267
1268    List<String> results = new ArrayList<>();
1269
1270    while( iterator.hasNext() )
1271      results.add( makePath( iterator, fullyQualified ) );
1272
1273    return results.toArray( new String[ results.size() ] );
1274    }
1275
1276  protected String makePath( Iterator<S3ObjectSummary> iterator, boolean fullyQualified )
1277    {
1278    String key = iterator.next().getKey();
1279
1280    if( fullyQualified )
1281      return makeStringIdentifier( getBucketName(), key );
1282
1283    return key.substring( getKey().length() );
1284    }
1285
  /**
   * Method getSize returns the size of the object of this tap, using the configuration of the given flow process.
   *
   * @param flowProcess of FlowProcess
   * @return the size (type long)
   * @throws IOException declared by the overridden method
   */
  @Override
  public long getSize( FlowProcess<? extends Properties> flowProcess ) throws IOException
    {
    return getSize( flowProcess.getConfig() );
    }
1291
1292  @Override
1293  public long getSize( Properties conf ) throws IOException
1294    {
1295    if( isDirectory( conf ) )
1296      return 0;
1297
1298    return getObjectMetadata( conf ).getInstanceLength();
1299    }
1300
1301  protected static String makeStringIdentifier( String bucketName, String keyPrefix )
1302    {
1303    if( isEmpty( keyPrefix ) )
1304      return String.format( "s3://%s/", bucketName );
1305
1306    return String.format( "s3://%s/%s", bucketName, keyPrefix );
1307    }
1308  }