001/*
002 * Copyright (c) 2017-2018 Chris K Wensel <chris@wensel.net>. All Rights Reserved.
003 *
004 * Project and contact information: http://www.cascading.org/
005 *
006 * This file is part of the Cascading project.
007 *
008 * Licensed under the Apache License, Version 2.0 (the "License");
009 * you may not use this file except in compliance with the License.
010 * You may obtain a copy of the License at
011 *
012 *     http://www.apache.org/licenses/LICENSE-2.0
013 *
014 * Unless required by applicable law or agreed to in writing, software
015 * distributed under the License is distributed on an "AS IS" BASIS,
016 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
017 * See the License for the specific language governing permissions and
018 * limitations under the License.
019 */
020
021package cascading.local.tap.aws.s3;
022
023import java.io.FilterInputStream;
024import java.io.IOException;
025import java.io.InputStream;
026import java.io.OutputStream;
027import java.io.PipedInputStream;
028import java.io.PipedOutputStream;
029import java.net.URI;
030import java.net.URISyntaxException;
031import java.util.ArrayList;
032import java.util.Iterator;
033import java.util.List;
034import java.util.Properties;
035import java.util.function.Predicate;
036import java.util.regex.Pattern;
037
038import cascading.flow.FlowProcess;
039import cascading.property.PropertyUtil;
040import cascading.scheme.FileFormat;
041import cascading.scheme.Scheme;
042import cascading.tap.SinkMode;
043import cascading.tap.Tap;
044import cascading.tap.local.PartitionTap;
045import cascading.tap.type.FileType;
046import cascading.tap.type.TapWith;
047import cascading.tuple.TupleEntryCollector;
048import cascading.tuple.TupleEntryIterator;
049import cascading.tuple.TupleEntrySchemeCollector;
050import cascading.tuple.TupleEntrySchemeIterator;
051import cascading.util.CloseableIterator;
052import cascading.util.Util;
053import com.amazonaws.ClientConfiguration;
054import com.amazonaws.client.builder.AwsClientBuilder;
055import com.amazonaws.services.s3.AmazonS3;
056import com.amazonaws.services.s3.AmazonS3ClientBuilder;
057import com.amazonaws.services.s3.model.AmazonS3Exception;
058import com.amazonaws.services.s3.model.ObjectMetadata;
059import com.amazonaws.services.s3.model.S3ObjectSummary;
060import com.amazonaws.services.s3.transfer.TransferManager;
061import com.amazonaws.services.s3.transfer.TransferManagerBuilder;
062import com.amazonaws.services.s3.transfer.Upload;
063import com.amazonaws.services.s3.transfer.model.UploadResult;
064import org.slf4j.Logger;
065import org.slf4j.LoggerFactory;
066
067import static cascading.util.Util.isEmpty;
068
069/**
070 * Class S3Tap is a Cascading local-mode {@link Tap} providing read and write access to data stored in Amazon S3 buckets.
071 * <p>
072 * This Tap is not intended to be used with any of the other Cascading planners unless they specify they are local-mode
073 * compatible.
074 * <p>
075 * S3Tap can read a single key, all objects underneath a key-prefix, or all objects under a key-prefix that match
076 * a given globbing pattern.
077 * <p>
078 * See the various constructors for the available access parametrizations. Of note are the constructors that take
079 * a {@link URI} instance. The URI should be in the following format:
080 * {@code s3://[bucket]/<key|key-prefix><?glob>}
081 * <p>
082 * Where bucket is the only required value. The key references a single object, the key-prefix is used to access
 * a set of objects with a common prefix value. The glob value is used to further narrow the resulting object set.
084 * <p>
085 * The globbing pattern is specified by the {@link java.nio.file.FileSystem#getPathMatcher} method.
086 * <p>
087 * This Tap was designed to allow applications to effectively poll an S3 bucket for new keys to be processed.
088 * <p>
089 * When used with the {@link S3FileCheckpointer} class, a map of keys last consumed by each bucket will be tracked
090 * on disk, with the map surviving JVM restarts allowing for applications to exit and restart safely without
091 * retrieving duplicate data.
092 * <p>
093 * The {@link S3Checkpointer#commit()} method is only called during a graceful shutdown of the Flow or JVM, but every
094 * consumed key is passed to the S3Checkpointer, so custom implementations can choose to persist the key more
095 * frequently.
096 * <p>
097 * AWS Credentials are handled by {@link com.amazonaws.auth.DefaultAWSCredentialsProviderChain}.
098 */
099public class S3Tap extends Tap<Properties, InputStream, OutputStream> implements FileType<Properties>, TapWith<Properties, InputStream, OutputStream>
100  {
  /** Field LOG is the logger for this class. */
  private static final Logger LOG = LoggerFactory.getLogger( S3Tap.class );

  /** Field SEQUENCE_TOKEN is the literal token text; NOTE(review): its use is not visible in this part of the file. */
  public static final String SEQUENCE_TOKEN = "{sequence}";
  /** Field MIME_DIRECTORY is a content-type string; NOTE(review): presumably marks directory placeholder objects, confirm against usage. */
  public static final String MIME_DIRECTORY = "application/x-directory";
  /** Field DEFAULT_DELIMITER is the delimiter used when a constructor is not given one. */
  public static final String DEFAULT_DELIMITER = "/";

  /** Field s3Client is the AmazonS3 client handed to the constructor, may be null. */
  AmazonS3 s3Client = null;
  /** Field bucketName is the S3 bucket name, the constructors reject a null or empty value. */
  String bucketName = null;
  /** Field key is the S3 object key or key-prefix, may be null. */
  String key = null;
  /** Field filter is the Predicate given to the constructor or built from the URI query glob, may be null. */
  Predicate<String> filter;
  /** Field delimiter starts as DEFAULT_DELIMITER; the bucket/key constructors overwrite it with the value they are given. */
  String delimiter = DEFAULT_DELIMITER;
  /** Field checkpointer is the S3Checkpointer handed to the constructor, may be null. */
  S3Checkpointer checkpointer;

  /** Field objectMetadata is transient and is not assigned by the constructors. */
  private transient ObjectMetadata objectMetadata;
125
126  /**
127   * Method makeURI creates a new S3 URI from the given parameters.
128   *
129   * @param bucketName the S3 bucket name
130   * @param keyPrefix  the S3 object key or key-prefix
131   * @return an URI instance
132   */
133  public static URI makeURI( String bucketName, String keyPrefix )
134    {
135    return makeURI( bucketName, keyPrefix, null );
136    }
137
138  /**
139   * Method makeURI creates a new S3 URI from the given parameters.
140   *
141   * @param bucketName the S3 bucket name
142   * @param keyPrefix  the S3 object key or key-prefix
143   * @param glob       the globbing pattern to apply to the keys
144   * @return an URI instance
145   */
146  public static URI makeURI( String bucketName, String keyPrefix, String glob )
147    {
148    if( bucketName == null )
149      throw new IllegalArgumentException( "bucketName may not be null" );
150
151    try
152      {
153      if( keyPrefix == null )
154        keyPrefix = "/";
155      else if( !keyPrefix.startsWith( "/" ) )
156        keyPrefix = "/" + keyPrefix;
157
158      return new URI( "s3", bucketName, keyPrefix, glob, null );
159      }
160    catch( URISyntaxException exception )
161      {
162      throw new IllegalArgumentException( exception.getMessage(), exception );
163      }
164    }
165
  /**
   * Constructor S3Tap creates a new S3Tap instance for the given bucket.
   * <p>
   * Uses {@link SinkMode#KEEP} and passes {@code null} for the key, delimiter, and filter.
   *
   * @param scheme     the Scheme to use
   * @param bucketName the S3 bucket name
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, String bucketName )
    {
    this( scheme, bucketName, null, null, null, SinkMode.KEEP );
    }
176
  /**
   * Constructor S3Tap creates a new S3Tap instance for the given bucket and key.
   * <p>
   * Uses {@link #DEFAULT_DELIMITER} and {@link SinkMode#KEEP}.
   *
   * @param scheme     the Scheme to use
   * @param bucketName the S3 bucket name
   * @param key        the S3 object key or key-prefix
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, String bucketName, String key )
    {
    this( scheme, bucketName, key, DEFAULT_DELIMITER, SinkMode.KEEP );
    }
188
  /**
   * Constructor S3Tap creates a new S3Tap instance for the given bucket, key, and delimiter.
   * <p>
   * Uses {@link SinkMode#KEEP} and passes {@code null} for the s3Client and checkpointer.
   *
   * @param scheme     the Scheme to use
   * @param bucketName the S3 bucket name
   * @param key        the S3 object key or key-prefix
   * @param delimiter  the key delimiter
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, String bucketName, String key, String delimiter )
    {
    this( scheme, null, null, bucketName, key, delimiter, SinkMode.KEEP );
    }
201
  /**
   * Constructor S3Tap creates a new S3Tap instance for the given bucket and filter.
   * <p>
   * Uses {@link SinkMode#KEEP} and passes {@code null} for the key.
   *
   * @param scheme     the Scheme to use
   * @param bucketName the S3 bucket name
   * @param filter     the filter Predicate
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, String bucketName, Predicate<String> filter )
    {
    this( scheme, bucketName, null, filter, SinkMode.KEEP );
    }
213
  /**
   * Constructor S3Tap creates a new S3Tap instance for the given bucket, key, and filter.
   * <p>
   * Uses {@link #DEFAULT_DELIMITER} and {@link SinkMode#KEEP}.
   *
   * @param scheme     the Scheme to use
   * @param bucketName the S3 bucket name
   * @param key        the S3 object key or key-prefix
   * @param filter     the filter Predicate
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, String bucketName, String key, Predicate<String> filter )
    {
    this( scheme, bucketName, key, DEFAULT_DELIMITER, filter, SinkMode.KEEP );
    }
226
  /**
   * Constructor S3Tap creates a new S3Tap instance for the given bucket, key, delimiter, and filter.
   * <p>
   * Uses {@link SinkMode#KEEP} and passes {@code null} for the s3Client and checkpointer.
   *
   * @param scheme     the Scheme to use
   * @param bucketName the S3 bucket name
   * @param key        the S3 object key or key-prefix
   * @param delimiter  the key delimiter
   * @param filter     the filter Predicate
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, String bucketName, String key, String delimiter, Predicate<String> filter )
    {
    this( scheme, null, null, bucketName, key, delimiter, filter, SinkMode.KEEP );
    }
240
  /**
   * Constructor S3Tap creates a new S3Tap instance for the given client and bucket.
   * <p>
   * Uses {@link SinkMode#KEEP} and passes {@code null} for the key.
   *
   * @param scheme     the Scheme to use
   * @param s3Client   the AmazonS3 client to use
   * @param bucketName the S3 bucket name
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, String bucketName )
    {
    this( scheme, s3Client, bucketName, null, SinkMode.KEEP );
    }
252
  /**
   * Constructor S3Tap creates a new S3Tap instance for the given client, bucket, and key.
   * <p>
   * Uses {@link #DEFAULT_DELIMITER} and {@link SinkMode#KEEP}.
   *
   * @param scheme     the Scheme to use
   * @param s3Client   the AmazonS3 client to use
   * @param bucketName the S3 bucket name
   * @param key        the S3 object key or key-prefix
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, String bucketName, String key )
    {
    this( scheme, s3Client, bucketName, key, DEFAULT_DELIMITER, SinkMode.KEEP );
    }
265
  /**
   * Constructor S3Tap creates a new S3Tap instance for the given client, bucket, key, and delimiter.
   * <p>
   * Uses {@link SinkMode#KEEP} and passes {@code null} for the filter.
   *
   * @param scheme     the Scheme to use
   * @param s3Client   the AmazonS3 client to use
   * @param bucketName the S3 bucket name
   * @param key        the S3 object key or key-prefix
   * @param delimiter  the key delimiter
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, String bucketName, String key, String delimiter )
    {
    this( scheme, s3Client, bucketName, key, delimiter, null, SinkMode.KEEP );
    }
279
  /**
   * Constructor S3Tap creates a new S3Tap instance for the given client, bucket, key, delimiter, and filter.
   * <p>
   * Uses {@link SinkMode#KEEP} and passes {@code null} for the checkpointer.
   *
   * @param scheme     the Scheme to use
   * @param s3Client   the AmazonS3 client to use
   * @param bucketName the S3 bucket name
   * @param key        the S3 object key or key-prefix
   * @param delimiter  the key delimiter
   * @param filter     the filter Predicate
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, String bucketName, String key, String delimiter, Predicate<String> filter )
    {
    this( scheme, s3Client, null, bucketName, key, delimiter, filter, SinkMode.KEEP );
    }
294
  /**
   * Constructor S3Tap creates a new S3Tap instance for the given checkpointer and bucket.
   * <p>
   * Uses {@link SinkMode#KEEP} and passes {@code null} for the key, delimiter, and filter.
   *
   * @param scheme       the Scheme to use
   * @param checkpointer the S3Checkpointer to use
   * @param bucketName   the S3 bucket name
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, S3Checkpointer checkpointer, String bucketName )
    {
    this( scheme, checkpointer, bucketName, null, null, null, SinkMode.KEEP );
    }
306
  /**
   * Constructor S3Tap creates a new S3Tap instance for the given checkpointer, bucket, and key.
   * <p>
   * Uses {@link #DEFAULT_DELIMITER} and {@link SinkMode#KEEP}.
   *
   * @param scheme       the Scheme to use
   * @param checkpointer the S3Checkpointer to use
   * @param bucketName   the S3 bucket name
   * @param key          the S3 object key or key-prefix
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, S3Checkpointer checkpointer, String bucketName, String key )
    {
    this( scheme, checkpointer, bucketName, key, DEFAULT_DELIMITER, SinkMode.KEEP );
    }
319
  /**
   * Constructor S3Tap creates a new S3Tap instance for the given checkpointer, bucket, key, and delimiter.
   * <p>
   * Uses {@link SinkMode#KEEP} and passes {@code null} for the s3Client.
   *
   * @param scheme       the Scheme to use
   * @param checkpointer the S3Checkpointer to use
   * @param bucketName   the S3 bucket name
   * @param key          the S3 object key or key-prefix
   * @param delimiter    the key delimiter
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, S3Checkpointer checkpointer, String bucketName, String key, String delimiter )
    {
    this( scheme, null, checkpointer, bucketName, key, delimiter, SinkMode.KEEP );
    }
333
  /**
   * Constructor S3Tap creates a new S3Tap instance for the given checkpointer, bucket, and filter.
   * <p>
   * Uses {@link SinkMode#KEEP} and passes {@code null} for the key.
   *
   * @param scheme       the Scheme to use
   * @param checkpointer the S3Checkpointer to use
   * @param bucketName   the S3 bucket name
   * @param filter       the filter Predicate
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, S3Checkpointer checkpointer, String bucketName, Predicate<String> filter )
    {
    this( scheme, checkpointer, bucketName, null, filter, SinkMode.KEEP );
    }
346
  /**
   * Constructor S3Tap creates a new S3Tap instance for the given checkpointer, bucket, key, and filter.
   * <p>
   * Uses {@link #DEFAULT_DELIMITER} and {@link SinkMode#KEEP}.
   *
   * @param scheme       the Scheme to use
   * @param checkpointer the S3Checkpointer to use
   * @param bucketName   the S3 bucket name
   * @param key          the S3 object key or key-prefix
   * @param filter       the filter Predicate
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, S3Checkpointer checkpointer, String bucketName, String key, Predicate<String> filter )
    {
    this( scheme, checkpointer, bucketName, key, DEFAULT_DELIMITER, filter, SinkMode.KEEP );
    }
360
  /**
   * Constructor S3Tap creates a new S3Tap instance for the given checkpointer, bucket, key, delimiter, and filter.
   * <p>
   * Uses {@link SinkMode#KEEP} and passes {@code null} for the s3Client.
   *
   * @param scheme       the Scheme to use
   * @param checkpointer the S3Checkpointer to use
   * @param bucketName   the S3 bucket name
   * @param key          the S3 object key or key-prefix
   * @param delimiter    the key delimiter
   * @param filter       the filter Predicate
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, S3Checkpointer checkpointer, String bucketName, String key, String delimiter, Predicate<String> filter )
    {
    this( scheme, null, checkpointer, bucketName, key, delimiter, filter, SinkMode.KEEP );
    }
375
  /**
   * Constructor S3Tap creates a new S3Tap instance for the given client, checkpointer, and bucket.
   * <p>
   * Uses {@link SinkMode#KEEP} and passes {@code null} for the key.
   *
   * @param scheme       the Scheme to use
   * @param s3Client     the AmazonS3 client to use
   * @param checkpointer the S3Checkpointer to use
   * @param bucketName   the S3 bucket name
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, S3Checkpointer checkpointer, String bucketName )
    {
    this( scheme, s3Client, checkpointer, bucketName, null, SinkMode.KEEP );
    }
388
  /**
   * Constructor S3Tap creates a new S3Tap instance for the given client, checkpointer, bucket, and key.
   * <p>
   * Uses {@link #DEFAULT_DELIMITER} and {@link SinkMode#KEEP}.
   *
   * @param scheme       the Scheme to use
   * @param s3Client     the AmazonS3 client to use
   * @param checkpointer the S3Checkpointer to use
   * @param bucketName   the S3 bucket name
   * @param key          the S3 object key or key-prefix
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, S3Checkpointer checkpointer, String bucketName, String key )
    {
    this( scheme, s3Client, checkpointer, bucketName, key, DEFAULT_DELIMITER, SinkMode.KEEP );
    }
402
  /**
   * Constructor S3Tap creates a new S3Tap instance for the given client, checkpointer, bucket, key, and delimiter.
   * <p>
   * Uses {@link SinkMode#KEEP} and passes {@code null} for the filter.
   *
   * @param scheme       the Scheme to use
   * @param s3Client     the AmazonS3 client to use
   * @param checkpointer the S3Checkpointer to use
   * @param bucketName   the S3 bucket name
   * @param key          the S3 object key or key-prefix
   * @param delimiter    the key delimiter
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, S3Checkpointer checkpointer, String bucketName, String key, String delimiter )
    {
    this( scheme, s3Client, checkpointer, bucketName, key, delimiter, null, SinkMode.KEEP );
    }
417
  /**
   * Constructor S3Tap creates a new S3Tap instance from all bucket and key parameters, using {@link SinkMode#KEEP}.
   *
   * @param scheme       the Scheme to use
   * @param s3Client     the AmazonS3 client to use
   * @param checkpointer the S3Checkpointer to use
   * @param bucketName   the S3 bucket name
   * @param key          the S3 object key or key-prefix
   * @param delimiter    the key delimiter
   * @param filter       the filter Predicate
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, S3Checkpointer checkpointer, String bucketName, String key, String delimiter, Predicate<String> filter )
    {
    this( scheme, s3Client, checkpointer, bucketName, key, delimiter, filter, SinkMode.KEEP );
    }
433
  /**
   * Constructor S3Tap creates a new S3Tap instance for the given bucket and sink mode.
   * <p>
   * Passes {@code null} for the key, delimiter, and filter.
   *
   * @param scheme     the Scheme to use
   * @param bucketName the S3 bucket name
   * @param sinkMode   the SinkMode to use
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, String bucketName, SinkMode sinkMode )
    {
    this( scheme, bucketName, null, null, null, sinkMode );
    }
444
  /**
   * Constructor S3Tap creates a new S3Tap instance for the given bucket, key, and sink mode.
   * <p>
   * Uses {@link #DEFAULT_DELIMITER}.
   *
   * @param scheme     the Scheme to use
   * @param bucketName the S3 bucket name
   * @param key        the S3 object key or key-prefix
   * @param sinkMode   the SinkMode to use
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, String bucketName, String key, SinkMode sinkMode )
    {
    this( scheme, bucketName, key, DEFAULT_DELIMITER, sinkMode );
    }
456
  /**
   * Constructor S3Tap creates a new S3Tap instance for the given bucket, key, delimiter, and sink mode.
   * <p>
   * Passes {@code null} for the s3Client and checkpointer.
   *
   * @param scheme     the Scheme to use
   * @param bucketName the S3 bucket name
   * @param key        the S3 object key or key-prefix
   * @param delimiter  the key delimiter
   * @param sinkMode   the SinkMode to use
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, String bucketName, String key, String delimiter, SinkMode sinkMode )
    {
    this( scheme, null, null, bucketName, key, delimiter, sinkMode );
    }
469
  /**
   * Constructor S3Tap creates a new S3Tap instance for the given bucket, filter, and sink mode.
   * <p>
   * Passes {@code null} for the key.
   *
   * @param scheme     the Scheme to use
   * @param bucketName the S3 bucket name
   * @param filter     the filter Predicate
   * @param sinkMode   the SinkMode to use
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, String bucketName, Predicate<String> filter, SinkMode sinkMode )
    {
    this( scheme, bucketName, null, filter, sinkMode );
    }
481
  /**
   * Constructor S3Tap creates a new S3Tap instance for the given bucket, key, filter, and sink mode.
   * <p>
   * Uses {@link #DEFAULT_DELIMITER}.
   *
   * @param scheme     the Scheme to use
   * @param bucketName the S3 bucket name
   * @param key        the S3 object key or key-prefix
   * @param filter     the filter Predicate
   * @param sinkMode   the SinkMode to use
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, String bucketName, String key, Predicate<String> filter, SinkMode sinkMode )
    {
    this( scheme, bucketName, key, DEFAULT_DELIMITER, filter, sinkMode );
    }
494
  /**
   * Constructor S3Tap creates a new S3Tap instance for the given bucket, key, delimiter, filter, and sink mode.
   * <p>
   * Passes {@code null} for the s3Client and checkpointer.
   *
   * @param scheme     the Scheme to use
   * @param bucketName the S3 bucket name
   * @param key        the S3 object key or key-prefix
   * @param delimiter  the key delimiter
   * @param filter     the filter Predicate
   * @param sinkMode   the SinkMode to use
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, String bucketName, String key, String delimiter, Predicate<String> filter, SinkMode sinkMode )
    {
    this( scheme, null, null, bucketName, key, delimiter, filter, sinkMode );
    }
509
  /**
   * Constructor S3Tap creates a new S3Tap instance for the given client, bucket, and sink mode.
   * <p>
   * Passes {@code null} for the key.
   *
   * @param scheme     the Scheme to use
   * @param s3Client   the AmazonS3 client to use
   * @param bucketName the S3 bucket name
   * @param sinkMode   the SinkMode to use
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, String bucketName, SinkMode sinkMode )
    {
    this( scheme, s3Client, bucketName, null, sinkMode );
    }
522
  /**
   * Constructor S3Tap creates a new S3Tap instance for the given client, bucket, key, and sink mode.
   * <p>
   * Uses {@link #DEFAULT_DELIMITER}.
   *
   * @param scheme     the Scheme to use
   * @param s3Client   the AmazonS3 client to use
   * @param bucketName the S3 bucket name
   * @param key        the S3 object key or key-prefix
   * @param sinkMode   the SinkMode to use
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, String bucketName, String key, SinkMode sinkMode )
    {
    this( scheme, s3Client, bucketName, key, DEFAULT_DELIMITER, sinkMode );
    }
536
  /**
   * Constructor S3Tap creates a new S3Tap instance for the given client, bucket, key, delimiter, and sink mode.
   * <p>
   * Passes {@code null} for the filter.
   *
   * @param scheme     the Scheme to use
   * @param s3Client   the AmazonS3 client to use
   * @param bucketName the S3 bucket name
   * @param key        the S3 object key or key-prefix
   * @param delimiter  the key delimiter
   * @param sinkMode   the SinkMode to use
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, String bucketName, String key, String delimiter, SinkMode sinkMode )
    {
    this( scheme, s3Client, bucketName, key, delimiter, null, sinkMode );
    }
551
  /**
   * Constructor S3Tap creates a new S3Tap instance for the given client, bucket, key, delimiter, filter, and sink mode.
   * <p>
   * Passes {@code null} for the checkpointer.
   *
   * @param scheme     the Scheme to use
   * @param s3Client   the AmazonS3 client to use
   * @param bucketName the S3 bucket name
   * @param key        the S3 object key or key-prefix
   * @param delimiter  the key delimiter
   * @param filter     the filter Predicate
   * @param sinkMode   the SinkMode to use
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, String bucketName, String key, String delimiter, Predicate<String> filter, SinkMode sinkMode )
    {
    this( scheme, s3Client, null, bucketName, key, delimiter, filter, sinkMode );
    }
567
  /**
   * Constructor S3Tap creates a new S3Tap instance for the given checkpointer, bucket, and sink mode.
   * <p>
   * Passes {@code null} for the key, delimiter, and filter.
   *
   * @param scheme       the Scheme to use
   * @param checkpointer the S3Checkpointer to use
   * @param bucketName   the S3 bucket name
   * @param sinkMode     the SinkMode to use
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, S3Checkpointer checkpointer, String bucketName, SinkMode sinkMode )
    {
    this( scheme, checkpointer, bucketName, null, null, null, sinkMode );
    }
580
  /**
   * Constructor S3Tap creates a new S3Tap instance for the given checkpointer, bucket, key, and sink mode.
   * <p>
   * Uses {@link #DEFAULT_DELIMITER}.
   *
   * @param scheme       the Scheme to use
   * @param checkpointer the S3Checkpointer to use
   * @param bucketName   the S3 bucket name
   * @param key          the S3 object key or key-prefix
   * @param sinkMode     the SinkMode to use
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, S3Checkpointer checkpointer, String bucketName, String key, SinkMode sinkMode )
    {
    this( scheme, checkpointer, bucketName, key, DEFAULT_DELIMITER, sinkMode );
    }
594
  /**
   * Constructor S3Tap creates a new S3Tap instance for the given checkpointer, bucket, key, delimiter, and sink mode.
   * <p>
   * Passes {@code null} for the s3Client.
   *
   * @param scheme       the Scheme to use
   * @param checkpointer the S3Checkpointer to use
   * @param bucketName   the S3 bucket name
   * @param key          the S3 object key or key-prefix
   * @param delimiter    the key delimiter
   * @param sinkMode     the SinkMode to use
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, S3Checkpointer checkpointer, String bucketName, String key, String delimiter, SinkMode sinkMode )
    {
    this( scheme, null, checkpointer, bucketName, key, delimiter, sinkMode );
    }
609
  /**
   * Constructor S3Tap creates a new S3Tap instance for the given checkpointer, bucket, filter, and sink mode.
   * <p>
   * Passes {@code null} for the key.
   *
   * @param scheme       the Scheme to use
   * @param checkpointer the S3Checkpointer to use
   * @param bucketName   the S3 bucket name
   * @param filter       the filter Predicate
   * @param sinkMode     the SinkMode to use
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, S3Checkpointer checkpointer, String bucketName, Predicate<String> filter, SinkMode sinkMode )
    {
    this( scheme, checkpointer, bucketName, null, filter, sinkMode );
    }
623
  /**
   * Constructor S3Tap creates a new S3Tap instance for the given checkpointer, bucket, key, filter, and sink mode.
   * <p>
   * Uses {@link #DEFAULT_DELIMITER}.
   *
   * @param scheme       the Scheme to use
   * @param checkpointer the S3Checkpointer to use
   * @param bucketName   the S3 bucket name
   * @param key          the S3 object key or key-prefix
   * @param filter       the filter Predicate
   * @param sinkMode     the SinkMode to use
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, S3Checkpointer checkpointer, String bucketName, String key, Predicate<String> filter, SinkMode sinkMode )
    {
    this( scheme, checkpointer, bucketName, key, DEFAULT_DELIMITER, filter, sinkMode );
    }
638
  /**
   * Constructor S3Tap creates a new S3Tap instance for the given checkpointer, bucket, key, delimiter, filter,
   * and sink mode.
   * <p>
   * Passes {@code null} for the s3Client.
   *
   * @param scheme       the Scheme to use
   * @param checkpointer the S3Checkpointer to use
   * @param bucketName   the S3 bucket name
   * @param key          the S3 object key or key-prefix
   * @param delimiter    the key delimiter
   * @param filter       the filter Predicate
   * @param sinkMode     the SinkMode to use
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, S3Checkpointer checkpointer, String bucketName, String key, String delimiter, Predicate<String> filter, SinkMode sinkMode )
    {
    this( scheme, null, checkpointer, bucketName, key, delimiter, filter, sinkMode );
    }
654
  /**
   * Constructor S3Tap creates a new S3Tap instance for the given client, checkpointer, bucket, and sink mode.
   * <p>
   * Passes {@code null} for the key.
   *
   * @param scheme       the Scheme to use
   * @param s3Client     the AmazonS3 client to use
   * @param checkpointer the S3Checkpointer to use
   * @param bucketName   the S3 bucket name
   * @param sinkMode     the SinkMode to use
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, S3Checkpointer checkpointer, String bucketName, SinkMode sinkMode )
    {
    this( scheme, s3Client, checkpointer, bucketName, null, sinkMode );
    }
668
  /**
   * Constructor S3Tap creates a new S3Tap instance for the given client, checkpointer, bucket, key, and sink mode.
   * <p>
   * Uses {@link #DEFAULT_DELIMITER}.
   *
   * @param scheme       the Scheme to use
   * @param s3Client     the AmazonS3 client to use
   * @param checkpointer the S3Checkpointer to use
   * @param bucketName   the S3 bucket name
   * @param key          the S3 object key or key-prefix
   * @param sinkMode     the SinkMode to use
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, S3Checkpointer checkpointer, String bucketName, String key, SinkMode sinkMode )
    {
    this( scheme, s3Client, checkpointer, bucketName, key, DEFAULT_DELIMITER, sinkMode );
    }
683
  /**
   * Constructor S3Tap creates a new S3Tap instance for the given client, checkpointer, bucket, key, delimiter,
   * and sink mode.
   * <p>
   * Passes {@code null} for the filter.
   *
   * @param scheme       the Scheme to use
   * @param s3Client     the AmazonS3 client to use
   * @param checkpointer the S3Checkpointer to use
   * @param bucketName   the S3 bucket name
   * @param key          the S3 object key or key-prefix
   * @param delimiter    the key delimiter
   * @param sinkMode     the SinkMode to use
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, S3Checkpointer checkpointer, String bucketName, String key, String delimiter, SinkMode sinkMode )
    {
    this( scheme, s3Client, checkpointer, bucketName, key, delimiter, null, sinkMode );
    }
699
700  /**
701   * Constructor S3Tap creates a new S3Tap instance.
702   *
703   * @param scheme       of Scheme
704   * @param s3Client     of AmazonS3
705   * @param checkpointer of S3Checkpointer
706   * @param bucketName   of String
707   * @param key          of String
708   * @param delimiter    of String
709   * @param filter       of Predicate
710   * @param sinkMode     of SinkMode
711   */
712  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, S3Checkpointer checkpointer, String bucketName, String key, String delimiter, Predicate<String> filter, SinkMode sinkMode )
713    {
714    super( scheme, sinkMode );
715    this.s3Client = s3Client;
716    this.checkpointer = checkpointer;
717    this.bucketName = bucketName;
718
719    if( isEmpty( this.bucketName ) )
720      throw new IllegalArgumentException( "bucket name may not be null or empty" );
721
722    this.key = key;
723    this.delimiter = delimiter;
724    this.filter = filter;
725    }
726
  /**
   * Constructor S3Tap creates a new S3Tap instance for the given URI.
   * <p>
   * Uses {@link SinkMode#KEEP} and passes {@code null} for the s3Client and checkpointer.
   *
   * @param scheme     the Scheme to use
   * @param identifier the s3 URI naming the bucket, key, and optional glob
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, URI identifier )
    {
    this( scheme, null, null, identifier, SinkMode.KEEP );
    }
737
  /**
   * Constructor S3Tap creates a new S3Tap instance for the given client and URI.
   * <p>
   * Uses {@link SinkMode#KEEP} and passes {@code null} for the checkpointer.
   *
   * @param scheme     the Scheme to use
   * @param s3Client   the AmazonS3 client to use
   * @param identifier the s3 URI naming the bucket, key, and optional glob
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, URI identifier )
    {
    this( scheme, s3Client, null, identifier, SinkMode.KEEP );
    }
749
  /**
   * Constructor S3Tap creates a new S3Tap instance for the given checkpointer and URI.
   * <p>
   * Uses {@link SinkMode#KEEP} and passes {@code null} for the s3Client.
   *
   * @param scheme       the Scheme to use
   * @param checkpointer the S3Checkpointer to use
   * @param identifier   the s3 URI naming the bucket, key, and optional glob
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, S3Checkpointer checkpointer, URI identifier )
    {
    this( scheme, null, checkpointer, identifier, SinkMode.KEEP );
    }
761
  /**
   * Constructor S3Tap creates a new S3Tap instance for the given client, checkpointer, and URI,
   * using {@link SinkMode#KEEP}.
   *
   * @param scheme       the Scheme to use
   * @param s3Client     the AmazonS3 client to use
   * @param checkpointer the S3Checkpointer to use
   * @param identifier   the s3 URI naming the bucket, key, and optional glob
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, S3Checkpointer checkpointer, URI identifier )
    {
    this( scheme, s3Client, checkpointer, identifier, SinkMode.KEEP );
    }
774
  /**
   * Constructor S3Tap creates a new S3Tap instance for the given URI and sink mode.
   * <p>
   * Passes {@code null} for the s3Client and checkpointer.
   *
   * @param scheme     the Scheme to use
   * @param identifier the s3 URI naming the bucket, key, and optional glob
   * @param sinkMode   the SinkMode to use
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, URI identifier, SinkMode sinkMode )
    {
    this( scheme, null, null, identifier, sinkMode );
    }
786
  /**
   * Constructor S3Tap creates a new S3Tap instance for the given client, URI, and sink mode.
   * <p>
   * Passes {@code null} for the checkpointer.
   *
   * @param scheme     the Scheme to use
   * @param s3Client   the AmazonS3 client to use
   * @param identifier the s3 URI naming the bucket, key, and optional glob
   * @param sinkMode   the SinkMode to use
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, URI identifier, SinkMode sinkMode )
    {
    this( scheme, s3Client, null, identifier, sinkMode );
    }
799
  /**
   * Constructor S3Tap creates a new S3Tap instance for the given checkpointer, URI, and sink mode.
   * <p>
   * Passes {@code null} for the s3Client.
   *
   * @param scheme       the Scheme to use
   * @param checkpointer the S3Checkpointer to use
   * @param identifier   the s3 URI naming the bucket, key, and optional glob
   * @param sinkMode     the SinkMode to use
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, S3Checkpointer checkpointer, URI identifier, SinkMode sinkMode )
    {
    this( scheme, null, checkpointer, identifier, sinkMode );
    }
812
813  /**
814   * Constructor S3Tap creates a new S3Tap instance.
815   *
816   * @param scheme       of Scheme
817   * @param s3Client     of AmazonS3
818   * @param checkpointer of S3Checkpointer
819   * @param identifier   of URI
820   * @param sinkMode     of SinkMode
821   */
822  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, S3Checkpointer checkpointer, URI identifier, SinkMode sinkMode )
823    {
824    super( scheme, sinkMode );
825    this.s3Client = s3Client;
826    this.checkpointer = checkpointer;
827
828    if( identifier == null )
829      throw new IllegalArgumentException( "identifier may not be null" );
830
831    if( !identifier.getScheme().equalsIgnoreCase( "s3" ) )
832      throw new IllegalArgumentException( "identifier does not have s3 scheme" );
833
834    this.bucketName = getBucketNameFor( identifier );
835
836    if( isEmpty( this.bucketName ) )
837      throw new IllegalArgumentException( "bucket name may not be null or empty" + identifier );
838
839    this.key = cleanKey( identifier );
840
841    if( identifier.getQuery() != null )
842      filter = globPredicate( identifier.getQuery() );
843    }
844
845  protected String getBucketNameFor( URI identifier )
846    {
847    String authority = identifier.getAuthority();
848
849    if( isEmpty( authority ) )
850      throw new IllegalArgumentException( "identifier must have an authority: " + identifier );
851
852    int pos = authority.indexOf( '@' );
853
854    if( pos != -1 )
855      return authority.substring( pos + 1 );
856
857    return authority;
858    }
859
860  private static Predicate<String> globPredicate( String glob )
861    {
862    String regex = getRegexForGlob( glob );
863    Pattern pattern = Pattern.compile( regex );
864
865    return string -> pattern.matcher( string ).matches();
866    }
867
868  private static String getRegexForGlob( String glob )
869    {
870    return (String) Util.invokeStaticMethod(
871      "sun.nio.fs.Globs",
872      "toUnixRegexPattern",
873      new Object[]{glob},
874      new Class[]{String.class}
875    );
876    }
877
878  @Override
879  public TapWith<Properties, InputStream, OutputStream> withScheme( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme )
880    {
881    // don't lazily create s3Client
882    return new S3Tap( scheme, s3Client, getBucketName(), getKey(), getDelimiter(), getFilter(), getSinkMode() );
883    }
884
885  @Override
886  public TapWith<Properties, InputStream, OutputStream> withChildIdentifier( String identifier )
887    {
888    URI uri;
889
890    if( identifier.startsWith( "s3://" ) )
891      uri = URI.create( identifier );
892    else if( identifier.startsWith( getBucketName() ) )
893      uri = makeURI( identifier, null );
894    else
895      uri = makeURI( getBucketName(), getKey() + ( identifier.startsWith( delimiter ) ? identifier : delimiter + identifier ) );
896
897    // don't lazily create s3Client
898    return new S3Tap( getScheme(), s3Client, uri, getSinkMode() );
899    }
900
901  @Override
902  public TapWith<Properties, InputStream, OutputStream> withSinkMode( SinkMode sinkMode )
903    {
904    // don't lazily create s3Client
905    return new S3Tap( getScheme(), s3Client, getBucketName(), getKey(), getDelimiter(), getFilter(), sinkMode );
906    }
907
908  protected String cleanKey( URI identifier )
909    {
910    String path = identifier.getPath();
911
912    if( path.startsWith( "/" ) )
913      path = path.substring( 1 );
914
915    return path;
916    }
917
918  protected AmazonS3 getS3Client( Properties properties )
919    {
920    // return provided client
921    if( s3Client != null )
922      return s3Client;
923
924    AmazonS3ClientBuilder standard = AmazonS3ClientBuilder.standard();
925
926    if( properties != null )
927      {
928      String endpoint = properties.getProperty( S3TapProps.S3_ENDPOINT );
929      String region = properties.getProperty( S3TapProps.S3_REGION, "us-east-1" );
930
931      if( properties.containsKey( S3TapProps.S3_PROXY_HOST ) )
932        {
933        ClientConfiguration config = new ClientConfiguration()
934          .withProxyHost( properties.getProperty( S3TapProps.S3_PROXY_HOST ) )
935          .withProxyPort( PropertyUtil.getIntProperty( properties, S3TapProps.S3_PROXY_PORT, -1 ) );
936
937        standard.withClientConfiguration( config );
938        }
939
940      if( endpoint != null )
941        standard.withEndpointConfiguration( new AwsClientBuilder.EndpointConfiguration( endpoint, region ) );
942      else
943        standard.setRegion( region );
944
945      if( Boolean.parseBoolean( properties.getProperty( S3TapProps.S3_PATH_STYLE_ACCESS, "false" ) ) )
946        standard.enablePathStyleAccess();
947      }
948
949    return standard.build();
950    }
951
952  /**
953   * Method getCheckpointer returns the checkpointer of this S3Tap object.
954   *
955   * @return the checkpointer (type S3Checkpointer) of this S3Tap object.
956   */
957  public S3Checkpointer getCheckpointer()
958    {
959    return checkpointer;
960    }
961
962  /**
963   * Method getBucketName returns the bucketName of this S3Tap object.
964   *
965   * @return the bucketName (type String) of this S3Tap object.
966   */
967  public String getBucketName()
968    {
969    return bucketName;
970    }
971
972  /**
973   * Method getKey returns the key of this S3Tap object.
974   *
975   * @return the key (type String) of this S3Tap object.
976   */
977  public String getKey()
978    {
979    return key;
980    }
981
982  protected String getMarker()
983    {
984    if( checkpointer != null )
985      return checkpointer.getLastKey( getBucketName() );
986
987    return null;
988    }
989
990  protected void setLastMarker( String marker )
991    {
992    if( checkpointer != null )
993      checkpointer.setLastKey( getBucketName(), marker );
994    }
995
996  protected void commitMarker()
997    {
998    if( checkpointer != null )
999      checkpointer.commit();
1000    }
1001
1002  /**
1003   * Method getFilter returns the filter of this S3Tap object.
1004   *
1005   * @return the filter (type Predicate) of this S3Tap object.
1006   */
1007  public Predicate<String> getFilter()
1008    {
1009    return filter;
1010    }
1011
1012  /**
1013   * Method getDelimiter returns the delimiter of this S3Tap object.
1014   *
1015   * @return the delimiter (type String) of this S3Tap object.
1016   */
1017  public String getDelimiter()
1018    {
1019    return delimiter;
1020    }
1021
1022  @Override
1023  public String getIdentifier()
1024    {
1025    return makeStringIdentifier( getBucketName(), getKey() );
1026    }
1027
1028  @Override
1029  public String getFullIdentifier( Properties conf )
1030    {
1031    return getIdentifier();
1032    }
1033
1034  @Override
1035  public boolean deleteResource( Properties conf ) throws IOException
1036    {
1037    AmazonS3 s3Client = getS3Client( conf );
1038
1039    try
1040      {
1041      s3Client.deleteObject( getBucketName(), getKey() );
1042      }
1043    catch( AmazonS3Exception exception )
1044      {
1045      throw handleException( s3Client, exception );
1046      }
1047
1048    return true;
1049    }
1050
1051  @Override
1052  public boolean createResource( Properties conf ) throws IOException
1053    {
1054    AmazonS3 s3Client = getS3Client( conf );
1055
1056    try
1057      {
1058      s3Client.putObject( getBucketName(), getKey(), "" );
1059      }
1060    catch( AmazonS3Exception exception )
1061      {
1062      throw handleException( s3Client, exception );
1063      }
1064
1065    return true;
1066    }
1067
1068  protected ObjectMetadata getObjectMetadata( Properties conf )
1069    {
1070    try
1071      {
1072      if( objectMetadata == null )
1073        objectMetadata = getS3Client( conf ).getObjectMetadata( getBucketName(), getKey() );
1074
1075      return objectMetadata;
1076      }
1077    catch( AmazonS3Exception exception )
1078      {
1079      throw handleException( getS3Client( conf ), exception );
1080      }
1081    }
1082
  /**
   * Class CheckedFilterInputStream is a {@link FilterInputStream} with a public constructor,
   * it adds no behavior of its own.
   */
  private class CheckedFilterInputStream extends FilterInputStream
    {
    /**
     * Constructor CheckedFilterInputStream creates a new instance wrapping the given stream.
     *
     * @param inputStream of InputStream
     */
    public CheckedFilterInputStream( InputStream inputStream )
      {
      super( inputStream );
      }
    }
1090
1091  @Override
1092  public TupleEntryIterator openForRead( FlowProcess<? extends Properties> flowProcess, InputStream input ) throws IOException
1093    {
1094    AmazonS3 s3Client = getS3Client( flowProcess.getConfig() );
1095
1096    final String[] identifier = new String[ 1 ];
1097
1098    CloseableIterator<InputStream> iterator = new CloseableIterator<InputStream>()
1099      {
1100      S3Iterable iterable = S3Iterable.iterable( s3Client, getBucketName(), getKey() )
1101        .withFilter( getFilter() )
1102        .withMarker( getMarker() );
1103
1104      Iterator<S3ObjectSummary> iterator = iterable.iterator();
1105      InputStream lastInputStream;
1106
1107      @Override
1108      public boolean hasNext()
1109        {
1110        return iterator.hasNext();
1111        }
1112
1113      @Override
1114      public InputStream next()
1115        {
1116        safeClose();
1117
1118        S3ObjectSummary objectSummary = iterator.next();
1119
1120        identifier[ 0 ] = makeStringIdentifier( objectSummary.getBucketName(), objectSummary.getKey() );
1121
1122        flowProcess.getFlowProcessContext().setSourcePath( identifier[ 0 ] );
1123
1124        if( LOG.isDebugEnabled() )
1125          LOG.debug( "s3 retrieving: {}/{}, with size: {}", objectSummary.getBucketName(), objectSummary.getKey(), objectSummary.getSize() );
1126
1127        // getObject does not seem to fill the InputStream, nor does the InputStream support marking
1128        // may make sense to wrap this iterator in a iterate ahead iterator that attempts to pre-fetch objects in a different thread
1129        lastInputStream = new CheckedFilterInputStream( s3Client.getObject( objectSummary.getBucketName(), objectSummary.getKey() ).getObjectContent() )
1130          {
1131          @Override
1132          public void close() throws IOException
1133            {
1134            setLastMarker( objectSummary.getKey() );
1135            super.close();
1136            }
1137          };
1138
1139        return lastInputStream;
1140        }
1141
1142      private void safeClose()
1143        {
1144        try
1145          {
1146          if( lastInputStream != null )
1147            lastInputStream.close();
1148
1149          lastInputStream = null;
1150          }
1151        catch( IOException exception )
1152          {
1153          // do nothing
1154          }
1155        }
1156
1157      @Override
1158      public void close()
1159        {
1160        safeClose();
1161        commitMarker();
1162        }
1163      };
1164
1165    return new TupleEntrySchemeIterator<Properties, InputStream>( flowProcess, this, getScheme(), iterator, () -> identifier[ 0 ] );
1166    }
1167
1168  @Override
1169  public TupleEntryCollector openForWrite( FlowProcess<? extends Properties> flowProcess, OutputStream outputStream ) throws IOException
1170    {
1171    AmazonS3 s3Client = getS3Client( flowProcess.getConfig() );
1172
1173    if( !s3Client.doesBucketExistV2( getBucketName() ) )
1174      s3Client.createBucket( getBucketName() );
1175
1176    PipedInputStream pipedInputStream = new PipedInputStream();
1177    PipedOutputStream pipedOutputStream = new PipedOutputStream( pipedInputStream );
1178
1179    TransferManager transferManager = TransferManagerBuilder.standard().withS3Client( s3Client ).build();
1180
1181    ObjectMetadata metadata = new ObjectMetadata();
1182
1183    if( LOG.isDebugEnabled() )
1184      LOG.debug( "starting upload: {}", getIdentifier() );
1185
1186    final String key = resolveKey( flowProcess, getKey() );
1187
1188    Upload upload = transferManager.upload( getBucketName(), key, pipedInputStream, metadata );
1189
1190    return new TupleEntrySchemeCollector<Properties, OutputStream>( flowProcess, this, getScheme(), pipedOutputStream, makeStringIdentifier( getBucketName(), key ) )
1191      {
1192      @Override
1193      public void close()
1194        {
1195        super.close();
1196
1197        try
1198          {
1199          UploadResult uploadResult = upload.waitForUploadResult();
1200
1201          if( uploadResult != null )
1202            {
1203            if( LOG.isDebugEnabled() )
1204              LOG.debug( "completed upload: {}, with key: {}", getIdentifier(), uploadResult.getKey() );
1205            }
1206          }
1207        catch( InterruptedException exception )
1208          {
1209          // ignore
1210          }
1211        finally
1212          {
1213          transferManager.shutdownNow( false );
1214          }
1215        }
1216      };
1217    }
1218
1219  protected String resolveKey( FlowProcess<? extends Properties> flowProcess, String key )
1220    {
1221    int partNum = flowProcess.getIntegerProperty( PartitionTap.PART_NUM_PROPERTY, 0 );
1222
1223    key = key.replace( SEQUENCE_TOKEN, String.format( "%05d", partNum ) );
1224
1225    if( getScheme() instanceof FileFormat )
1226      return key + "." + ( (FileFormat) getScheme() ).getExtension();
1227
1228    return key;
1229    }
1230
1231  @Override
1232  public boolean resourceExists( Properties conf ) throws IOException
1233    {
1234    AmazonS3 s3Client = getS3Client( conf );
1235
1236    try
1237      {
1238      if( getKey() == null )
1239        return s3Client.doesBucketExistV2( getBucketName() );
1240
1241      return s3Client.doesObjectExist( getBucketName(), getKey() );
1242      }
1243    catch( AmazonS3Exception exception )
1244      {
1245      throw handleException( s3Client, exception );
1246      }
1247    }
1248
1249  protected AmazonS3Exception handleException( AmazonS3 s3Client, AmazonS3Exception exception )
1250    {
1251    if( exception.getStatusCode() == 400 )
1252      {
1253      LOG.error( "s3 request failed, try changing the AWS Region from: {}, using property: {}", s3Client.getRegionName(), S3TapProps.S3_REGION, exception );
1254      }
1255
1256    return exception;
1257    }
1258
1259  @Override
1260  public long getModifiedTime( Properties conf ) throws IOException
1261    {
1262    return getObjectMetadata( conf ).getLastModified().getTime();
1263    }
1264
1265  @Override
1266  public boolean isDirectory( FlowProcess<? extends Properties> flowProcess ) throws IOException
1267    {
1268    return MIME_DIRECTORY.equalsIgnoreCase( getObjectMetadata( flowProcess.getConfig() ).getContentType() );
1269    }
1270
1271  @Override
1272  public boolean isDirectory( Properties conf ) throws IOException
1273    {
1274    return isDirectory( FlowProcess.nullFlowProcess() );
1275    }
1276
1277  @Override
1278  public String[] getChildIdentifiers( FlowProcess<? extends Properties> flowProcess ) throws IOException
1279    {
1280    return getChildIdentifiers( flowProcess.getConfig() );
1281    }
1282
1283  @Override
1284  public String[] getChildIdentifiers( Properties conf ) throws IOException
1285    {
1286    return getChildIdentifiers( conf, 1, false );
1287    }
1288
1289  @Override
1290  public String[] getChildIdentifiers( FlowProcess<? extends Properties> flowProcess, int depth, boolean fullyQualified ) throws IOException
1291    {
1292    return getChildIdentifiers( flowProcess.getConfig(), depth, fullyQualified );
1293    }
1294
1295  @Override
1296  public String[] getChildIdentifiers( Properties conf, int depth, boolean fullyQualified ) throws IOException
1297    {
1298    if( !resourceExists( conf ) )
1299      return new String[ 0 ];
1300
1301    S3Iterable objects = S3Iterable.iterable( getS3Client( conf ), getBucketName(), getKey() )
1302      .withDelimiter( getDelimiter() )
1303      .withMaxDepth( depth )
1304      .withFilter( getFilter() )
1305      .withMarker( getMarker() );
1306
1307    Iterator<S3ObjectSummary> iterator = objects.iterator();
1308
1309    List<String> results = new ArrayList<>();
1310
1311    while( iterator.hasNext() )
1312      results.add( makePath( iterator, fullyQualified ) );
1313
1314    return results.toArray( new String[ results.size() ] );
1315    }
1316
1317  protected String makePath( Iterator<S3ObjectSummary> iterator, boolean fullyQualified )
1318    {
1319    String key = iterator.next().getKey();
1320
1321    if( fullyQualified )
1322      return makeStringIdentifier( getBucketName(), key );
1323
1324    return key.substring( getKey().length() );
1325    }
1326
1327  @Override
1328  public long getSize( FlowProcess<? extends Properties> flowProcess ) throws IOException
1329    {
1330    return getSize( flowProcess.getConfig() );
1331    }
1332
1333  @Override
1334  public long getSize( Properties conf ) throws IOException
1335    {
1336    if( isDirectory( conf ) )
1337      return 0;
1338
1339    return getObjectMetadata( conf ).getInstanceLength();
1340    }
1341
1342  protected static String makeStringIdentifier( String bucketName, String keyPrefix )
1343    {
1344    if( isEmpty( keyPrefix ) )
1345      return String.format( "s3://%s/", bucketName );
1346
1347    return String.format( "s3://%s/%s", bucketName, keyPrefix );
1348    }
1349  }