001/* 002 * Copyright (c) 2017-2018 Chris K Wensel <chris@wensel.net>. All Rights Reserved. 003 * 004 * Project and contact information: http://www.cascading.org/ 005 * 006 * This file is part of the Cascading project. 007 * 008 * Licensed under the Apache License, Version 2.0 (the "License"); 009 * you may not use this file except in compliance with the License. 010 * You may obtain a copy of the License at 011 * 012 * http://www.apache.org/licenses/LICENSE-2.0 013 * 014 * Unless required by applicable law or agreed to in writing, software 015 * distributed under the License is distributed on an "AS IS" BASIS, 016 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 017 * See the License for the specific language governing permissions and 018 * limitations under the License. 019 */ 020 021package cascading.local.tap.aws.s3; 022 023import java.io.FilterInputStream; 024import java.io.IOException; 025import java.io.InputStream; 026import java.io.OutputStream; 027import java.io.PipedInputStream; 028import java.io.PipedOutputStream; 029import java.net.URI; 030import java.net.URISyntaxException; 031import java.util.ArrayList; 032import java.util.Iterator; 033import java.util.List; 034import java.util.Properties; 035import java.util.function.Predicate; 036import java.util.regex.Pattern; 037 038import cascading.flow.FlowProcess; 039import cascading.property.PropertyUtil; 040import cascading.scheme.FileFormat; 041import cascading.scheme.Scheme; 042import cascading.tap.SinkMode; 043import cascading.tap.Tap; 044import cascading.tap.local.PartitionTap; 045import cascading.tap.type.FileType; 046import cascading.tap.type.TapWith; 047import cascading.tuple.TupleEntryCollector; 048import cascading.tuple.TupleEntryIterator; 049import cascading.tuple.TupleEntrySchemeCollector; 050import cascading.tuple.TupleEntrySchemeIterator; 051import cascading.util.CloseableIterator; 052import cascading.util.Util; 053import com.amazonaws.ClientConfiguration; 054import com.amazonaws.client.builder.AwsClientBuilder; 055import com.amazonaws.services.s3.AmazonS3; 056import com.amazonaws.services.s3.AmazonS3ClientBuilder; 057import com.amazonaws.services.s3.model.ObjectMetadata; 058import com.amazonaws.services.s3.model.S3ObjectSummary; 059import com.amazonaws.services.s3.transfer.TransferManager; 060import com.amazonaws.services.s3.transfer.TransferManagerBuilder; 061import com.amazonaws.services.s3.transfer.Upload; 062import com.amazonaws.services.s3.transfer.model.UploadResult; 063import org.slf4j.Logger; 064import org.slf4j.LoggerFactory; 065 066import static cascading.util.Util.isEmpty; 067 068/** 069 * Class S3Tap is a Cascading local-mode {@link Tap} providing read and write access to data stored in Amazon S3 buckets. 070 * <p> 071 * This Tap is not intended to be used with any of the other Cascading planners unless they specify they are local-mode 072 * compatible. 073 * <p> 074 * S3Tap can read a single key, all objects underneath a key-prefix, or all objects under a key-prefix that match 075 * a given globbing pattern. 076 * <p> 077 * See the various constructors for the available access parametrizations. Of note are the constructors that take 078 * a {@link URI} instance. The URI should be in the following format: 079 * {@code s3://[bucket]/<key|key-prefix><?glob>} 080 * <p> 081 * Where bucket is the only required value. The key references a single object, the key-prefix is used to access 082 * a set of objects with a common prefix value. The glob value is use to further narrow the resulting object set. 083 * <p> 084 * The globbing pattern is specified by the {@link java.nio.file.FileSystem#getPathMatcher} method. 085 * <p> 086 * This Tap was designed to allow applications to effectively poll an S3 bucket for new keys to be processed. 087 * <p> 088 * When used with the {@link S3FileCheckpointer} class, a map of keys last consumed by each bucket will be tracked 089 * on disk, with the map surviving JVM restarts allowing for applications to exit and restart safely without 090 * retrieving duplicate data. 091 * <p> 092 * The {@link S3Checkpointer#commit()} method is only called during a graceful shutdown of the Flow or JVM, but every 093 * consumed key is passed to the S3Checkpointer, so custom implementations can choose to persist the key more 094 * frequently. 095 * <p> 096 * AWS Credentials are handled by {@link com.amazonaws.auth.DefaultAWSCredentialsProviderChain}. 097 */ 098public class S3Tap extends Tap<Properties, InputStream, OutputStream> implements FileType<Properties>, TapWith<Properties, InputStream, OutputStream> 099 { 100 /** Field LOG */ 101 private static final Logger LOG = LoggerFactory.getLogger( S3Tap.class ); 102 103 /** Field SEQUENCE_TOKEN */ 104 public static final String SEQUENCE_TOKEN = "{sequence}"; 105 /** Field MIME_DIRECTORY */ 106 public static final String MIME_DIRECTORY = "application/x-directory"; 107 /** Field DEFAULT_DELIMITER */ 108 public static final String DEFAULT_DELIMITER = "/"; 109 110 /** Field s3Client */ 111 AmazonS3 s3Client = null; 112 /** Field bucketName */ 113 String bucketName = null; 114 /** Field key */ 115 String key = null; 116 /** Field filter */ 117 Predicate<String> filter; 118 /** Field delimiter */ 119 String delimiter = DEFAULT_DELIMITER; 120 /** Field checkpointer */ 121 S3Checkpointer checkpointer; 122 123 private transient ObjectMetadata objectMetadata; 124 125 /** 126 * Method makeURI creates a new S3 URI from the given parameters. 127 * 128 * @param bucketName the S3 bucket name 129 * @param keyPrefix the S3 object key or key-prefix 130 * @return an URI instance 131 */ 132 public static URI makeURI( String bucketName, String keyPrefix ) 133 { 134 return makeURI( bucketName, keyPrefix, null ); 135 } 136 137 /** 138 * Method makeURI creates a new S3 URI from the given parameters. 139 * 140 * @param bucketName the S3 bucket name 141 * @param keyPrefix the S3 object key or key-prefix 142 * @param glob the globbing pattern to apply to the keys 143 * @return an URI instance 144 */ 145 public static URI makeURI( String bucketName, String keyPrefix, String glob ) 146 { 147 if( bucketName == null ) 148 throw new IllegalArgumentException( "bucketName may not be null" ); 149 150 try 151 { 152 if( keyPrefix == null ) 153 keyPrefix = "/"; 154 else if( !keyPrefix.startsWith( "/" ) ) 155 keyPrefix = "/" + keyPrefix; 156 157 return new URI( "s3", bucketName, keyPrefix, glob, null ); 158 } 159 catch( URISyntaxException exception ) 160 { 161 throw new IllegalArgumentException( exception.getMessage(), exception ); 162 } 163 } 164 165 /** 166 * Constructor S3Tap creates a new S3Tap instance. 167 * 168 * @param scheme of Scheme 169 * @param bucketName of String 170 */ 171 public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, String bucketName ) 172 { 173 this( scheme, bucketName, null, null, null, SinkMode.KEEP ); 174 } 175 176 /** 177 * Constructor S3Tap creates a new S3Tap instance. 178 * 179 * @param scheme of Scheme 180 * @param bucketName of String 181 * @param key of String 182 */ 183 public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, String bucketName, String key ) 184 { 185 this( scheme, bucketName, key, DEFAULT_DELIMITER, SinkMode.KEEP ); 186 } 187 188 /** 189 * Constructor S3Tap creates a new S3Tap instance. 190 * 191 * @param scheme of Scheme 192 * @param bucketName of String 193 * @param key of String 194 * @param delimiter of String 195 */ 196 public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, String bucketName, String key, String delimiter ) 197 { 198 this( scheme, null, null, bucketName, key, delimiter, SinkMode.KEEP ); 199 } 200 201 /** 202 * Constructor S3Tap creates a new S3Tap instance. 203 * 204 * @param scheme of Scheme 205 * @param bucketName of String 206 * @param filter of Predicate 207 */ 208 public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, String bucketName, Predicate<String> filter ) 209 { 210 this( scheme, bucketName, null, filter, SinkMode.KEEP ); 211 } 212 213 /** 214 * Constructor S3Tap creates a new S3Tap instance. 215 * 216 * @param scheme of Scheme 217 * @param bucketName of String 218 * @param key of String 219 * @param filter of Predicate 220 */ 221 public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, String bucketName, String key, Predicate<String> filter ) 222 { 223 this( scheme, bucketName, key, DEFAULT_DELIMITER, filter, SinkMode.KEEP ); 224 } 225 226 /** 227 * Constructor S3Tap creates a new S3Tap instance. 228 * 229 * @param scheme of Scheme 230 * @param bucketName of String 231 * @param key of String 232 * @param delimiter of String 233 * @param filter of Predicate 234 */ 235 public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, String bucketName, String key, String delimiter, Predicate<String> filter ) 236 { 237 this( scheme, null, null, bucketName, key, delimiter, filter, SinkMode.KEEP ); 238 } 239 240 /** 241 * Constructor S3Tap creates a new S3Tap instance. 242 * 243 * @param scheme of Scheme 244 * @param s3Client of AmazonS3 245 * @param bucketName of String 246 */ 247 public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, String bucketName ) 248 { 249 this( scheme, s3Client, bucketName, null, SinkMode.KEEP ); 250 } 251 252 /** 253 * Constructor S3Tap creates a new S3Tap instance. 254 * 255 * @param scheme of Scheme 256 * @param s3Client of AmazonS3 257 * @param bucketName of String 258 * @param key of String 259 */ 260 public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, String bucketName, String key ) 261 { 262 this( scheme, s3Client, bucketName, key, DEFAULT_DELIMITER, SinkMode.KEEP ); 263 } 264 265 /** 266 * Constructor S3Tap creates a new S3Tap instance. 267 * 268 * @param scheme of Scheme 269 * @param s3Client of AmazonS3 270 * @param bucketName of String 271 * @param key of String 272 * @param delimiter of String 273 */ 274 public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, String bucketName, String key, String delimiter ) 275 { 276 this( scheme, s3Client, bucketName, key, delimiter, null, SinkMode.KEEP ); 277 } 278 279 /** 280 * Constructor S3Tap creates a new S3Tap instance. 281 * 282 * @param scheme of Scheme 283 * @param s3Client of AmazonS3 284 * @param bucketName of String 285 * @param key of String 286 * @param delimiter of String 287 * @param filter of Predicate 288 */ 289 public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, String bucketName, String key, String delimiter, Predicate<String> filter ) 290 { 291 this( scheme, s3Client, null, bucketName, key, delimiter, filter, SinkMode.KEEP ); 292 } 293 294 /** 295 * Constructor S3Tap creates a new S3Tap instance. 296 * 297 * @param scheme of Scheme 298 * @param checkpointer of S3Checkpointer 299 * @param bucketName of String 300 */ 301 public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, S3Checkpointer checkpointer, String bucketName ) 302 { 303 this( scheme, checkpointer, bucketName, null, null, null, SinkMode.KEEP ); 304 } 305 306 /** 307 * Constructor S3Tap creates a new S3Tap instance. 308 * 309 * @param scheme of Scheme 310 * @param checkpointer of S3Checkpointer 311 * @param bucketName of String 312 * @param key of String 313 */ 314 public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, S3Checkpointer checkpointer, String bucketName, String key ) 315 { 316 this( scheme, checkpointer, bucketName, key, DEFAULT_DELIMITER, SinkMode.KEEP ); 317 } 318 319 /** 320 * Constructor S3Tap creates a new S3Tap instance. 321 * 322 * @param scheme of Scheme 323 * @param checkpointer of S3Checkpointer 324 * @param bucketName of String 325 * @param key of String 326 * @param delimiter of String 327 */ 328 public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, S3Checkpointer checkpointer, String bucketName, String key, String delimiter ) 329 { 330 this( scheme, null, checkpointer, bucketName, key, delimiter, SinkMode.KEEP ); 331 } 332 333 /** 334 * Constructor S3Tap creates a new S3Tap instance. 335 * 336 * @param scheme of Scheme 337 * @param checkpointer of S3Checkpointer 338 * @param bucketName of String 339 * @param filter of Predicate 340 */ 341 public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, S3Checkpointer checkpointer, String bucketName, Predicate<String> filter ) 342 { 343 this( scheme, checkpointer, bucketName, null, filter, SinkMode.KEEP ); 344 } 345 346 /** 347 * Constructor S3Tap creates a new S3Tap instance. 348 * 349 * @param scheme of Scheme 350 * @param checkpointer of S3Checkpointer 351 * @param bucketName of String 352 * @param key of String 353 * @param filter of Predicate 354 */ 355 public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, S3Checkpointer checkpointer, String bucketName, String key, Predicate<String> filter ) 356 { 357 this( scheme, checkpointer, bucketName, key, DEFAULT_DELIMITER, filter, SinkMode.KEEP ); 358 } 359 360 /** 361 * Constructor S3Tap creates a new S3Tap instance. 362 * 363 * @param scheme of Scheme 364 * @param checkpointer of S3Checkpointer 365 * @param bucketName of String 366 * @param key of String 367 * @param delimiter of String 368 * @param filter of Predicate 369 */ 370 public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, S3Checkpointer checkpointer, String bucketName, String key, String delimiter, Predicate<String> filter ) 371 { 372 this( scheme, null, checkpointer, bucketName, key, delimiter, filter, SinkMode.KEEP ); 373 } 374 375 /** 376 * Constructor S3Tap creates a new S3Tap instance. 377 * 378 * @param scheme of Scheme 379 * @param s3Client of AmazonS3 380 * @param checkpointer of S3Checkpointer 381 * @param bucketName of String 382 */ 383 public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, S3Checkpointer checkpointer, String bucketName ) 384 { 385 this( scheme, s3Client, checkpointer, bucketName, null, SinkMode.KEEP ); 386 } 387 388 /** 389 * Constructor S3Tap creates a new S3Tap instance. 390 * 391 * @param scheme of Scheme 392 * @param s3Client of AmazonS3 393 * @param checkpointer of S3Checkpointer 394 * @param bucketName of String 395 * @param key of String 396 */ 397 public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, S3Checkpointer checkpointer, String bucketName, String key ) 398 { 399 this( scheme, s3Client, checkpointer, bucketName, key, DEFAULT_DELIMITER, SinkMode.KEEP ); 400 } 401 402 /** 403 * Constructor S3Tap creates a new S3Tap instance. 404 * 405 * @param scheme of Scheme 406 * @param s3Client of AmazonS3 407 * @param checkpointer of S3Checkpointer 408 * @param bucketName of String 409 * @param key of String 410 * @param delimiter of String 411 */ 412 public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, S3Checkpointer checkpointer, String bucketName, String key, String delimiter ) 413 { 414 this( scheme, s3Client, checkpointer, bucketName, key, delimiter, null, SinkMode.KEEP ); 415 } 416 417 /** 418 * Constructor S3Tap creates a new S3Tap instance. 419 * 420 * @param scheme of Scheme 421 * @param s3Client of AmazonS3 422 * @param checkpointer of S3Checkpointer 423 * @param bucketName of String 424 * @param key of String 425 * @param delimiter of String 426 * @param filter of Predicate 427 */ 428 public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, S3Checkpointer checkpointer, String bucketName, String key, String delimiter, Predicate<String> filter ) 429 { 430 this( scheme, s3Client, checkpointer, bucketName, key, delimiter, filter, SinkMode.KEEP ); 431 } 432 433 /** 434 * Constructor S3Tap creates a new S3Tap instance. 435 * 436 * @param scheme of Scheme 437 * @param bucketName of String 438 */ 439 public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, String bucketName, SinkMode sinkMode ) 440 { 441 this( scheme, bucketName, null, null, null, sinkMode ); 442 } 443 444 /** 445 * Constructor S3Tap creates a new S3Tap instance. 446 * 447 * @param scheme of Scheme 448 * @param bucketName of String 449 * @param key of String 450 */ 451 public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, String bucketName, String key, SinkMode sinkMode ) 452 { 453 this( scheme, bucketName, key, DEFAULT_DELIMITER ); 454 } 455 456 /** 457 * Constructor S3Tap creates a new S3Tap instance. 458 * 459 * @param scheme of Scheme 460 * @param bucketName of String 461 * @param key of String 462 * @param delimiter of String 463 */ 464 public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, String bucketName, String key, String delimiter, SinkMode sinkMode ) 465 { 466 this( scheme, null, null, bucketName, key, delimiter, sinkMode ); 467 } 468 469 /** 470 * Constructor S3Tap creates a new S3Tap instance. 471 * 472 * @param scheme of Scheme 473 * @param bucketName of String 474 * @param filter of Predicate 475 */ 476 public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, String bucketName, Predicate<String> filter, SinkMode sinkMode ) 477 { 478 this( scheme, bucketName, null, filter, sinkMode ); 479 } 480 481 /** 482 * Constructor S3Tap creates a new S3Tap instance. 483 * 484 * @param scheme of Scheme 485 * @param bucketName of String 486 * @param key of String 487 * @param filter of Predicate 488 */ 489 public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, String bucketName, String key, Predicate<String> filter, SinkMode sinkMode ) 490 { 491 this( scheme, bucketName, key, DEFAULT_DELIMITER, filter, sinkMode ); 492 } 493 494 /** 495 * Constructor S3Tap creates a new S3Tap instance. 496 * 497 * @param scheme of Scheme 498 * @param bucketName of String 499 * @param key of String 500 * @param delimiter of String 501 * @param filter of Predicate 502 * @param sinkMode of SinkMode 503 */ 504 public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, String bucketName, String key, String delimiter, Predicate<String> filter, SinkMode sinkMode ) 505 { 506 this( scheme, null, null, bucketName, key, delimiter, filter, sinkMode ); 507 } 508 509 /** 510 * Constructor S3Tap creates a new S3Tap instance. 511 * 512 * @param scheme of Scheme 513 * @param s3Client of AmazonS3 514 * @param bucketName of String 515 * @param sinkMode of SinkMode 516 */ 517 public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, String bucketName, SinkMode sinkMode ) 518 { 519 this( scheme, s3Client, bucketName, null, sinkMode ); 520 } 521 522 /** 523 * Constructor S3Tap creates a new S3Tap instance. 524 * 525 * @param scheme of Scheme 526 * @param s3Client of AmazonS3 527 * @param bucketName of String 528 * @param key of String 529 * @param sinkMode of SinkMode 530 */ 531 public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, String bucketName, String key, SinkMode sinkMode ) 532 { 533 this( scheme, s3Client, bucketName, key, DEFAULT_DELIMITER, sinkMode ); 534 } 535 536 /** 537 * Constructor S3Tap creates a new S3Tap instance. 538 * 539 * @param scheme of Scheme 540 * @param s3Client of AmazonS3 541 * @param bucketName of String 542 * @param key of String 543 * @param delimiter of String 544 * @param sinkMode of SinkMode 545 */ 546 public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, String bucketName, String key, String delimiter, SinkMode sinkMode ) 547 { 548 this( scheme, s3Client, bucketName, key, delimiter, null, sinkMode ); 549 } 550 551 /** 552 * Constructor S3Tap creates a new S3Tap instance. 553 * 554 * @param scheme of Scheme 555 * @param s3Client of AmazonS3 556 * @param bucketName of String 557 * @param key of String 558 * @param delimiter of String 559 * @param filter of Predicate 560 * @param sinkMode of SinkMode 561 */ 562 public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, String bucketName, String key, String delimiter, Predicate<String> filter, SinkMode sinkMode ) 563 { 564 this( scheme, s3Client, null, bucketName, key, delimiter, filter, sinkMode ); 565 } 566 567 /** 568 * Constructor S3Tap creates a new S3Tap instance. 569 * 570 * @param scheme of Scheme 571 * @param checkpointer of S3Checkpointer 572 * @param bucketName of String 573 * @param sinkMode of SinkMode 574 */ 575 public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, S3Checkpointer checkpointer, String bucketName, SinkMode sinkMode ) 576 { 577 this( scheme, checkpointer, bucketName, null, null, null, sinkMode ); 578 } 579 580 /** 581 * Constructor S3Tap creates a new S3Tap instance. 582 * 583 * @param scheme of Scheme 584 * @param checkpointer of S3Checkpointer 585 * @param bucketName of String 586 * @param key of String 587 * @param sinkMode of SinkMode 588 */ 589 public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, S3Checkpointer checkpointer, String bucketName, String key, SinkMode sinkMode ) 590 { 591 this( scheme, checkpointer, bucketName, key, DEFAULT_DELIMITER, sinkMode ); 592 } 593 594 /** 595 * Constructor S3Tap creates a new S3Tap instance. 596 * 597 * @param scheme of Scheme 598 * @param checkpointer of S3Checkpointer 599 * @param bucketName of String 600 * @param key of String 601 * @param delimiter of String 602 * @param sinkMode of SinkMode 603 */ 604 public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, S3Checkpointer checkpointer, String bucketName, String key, String delimiter, SinkMode sinkMode ) 605 { 606 this( scheme, null, checkpointer, bucketName, key, delimiter, sinkMode ); 607 } 608 609 /** 610 * Constructor S3Tap creates a new S3Tap instance. 611 * 612 * @param scheme of Scheme 613 * @param checkpointer of S3Checkpointer 614 * @param bucketName of String 615 * @param filter of Predicate 616 * @param sinkMode of SinkMode 617 */ 618 public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, S3Checkpointer checkpointer, String bucketName, Predicate<String> filter, SinkMode sinkMode ) 619 { 620 this( scheme, checkpointer, bucketName, null, filter, sinkMode ); 621 } 622 623 /** 624 * Constructor S3Tap creates a new S3Tap instance. 625 * 626 * @param scheme of Scheme 627 * @param checkpointer of S3Checkpointer 628 * @param bucketName of String 629 * @param key of String 630 * @param filter of Predicate 631 * @param sinkMode of SinkMode 632 */ 633 public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, S3Checkpointer checkpointer, String bucketName, String key, Predicate<String> filter, SinkMode sinkMode ) 634 { 635 this( scheme, checkpointer, bucketName, key, DEFAULT_DELIMITER, filter, sinkMode ); 636 } 637 638 /** 639 * Constructor S3Tap creates a new S3Tap instance. 640 * 641 * @param scheme of Scheme 642 * @param checkpointer of S3Checkpointer 643 * @param bucketName of String 644 * @param key of String 645 * @param delimiter of String 646 * @param filter of Predicate 647 * @param sinkMode of SinkMode 648 */ 649 public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, S3Checkpointer checkpointer, String bucketName, String key, String delimiter, Predicate<String> filter, SinkMode sinkMode ) 650 { 651 this( scheme, null, checkpointer, bucketName, key, delimiter, filter, sinkMode ); 652 } 653 654 /** 655 * Constructor S3Tap creates a new S3Tap instance. 656 * 657 * @param scheme of Scheme 658 * @param s3Client of AmazonS3 659 * @param checkpointer of S3Checkpointer 660 * @param bucketName of String 661 * @param sinkMode of SinkMode 662 */ 663 public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, S3Checkpointer checkpointer, String bucketName, SinkMode sinkMode ) 664 { 665 this( scheme, s3Client, checkpointer, bucketName, null, sinkMode ); 666 } 667 668 /** 669 * Constructor S3Tap creates a new S3Tap instance. 670 * 671 * @param scheme of Scheme 672 * @param s3Client of AmazonS3 673 * @param checkpointer of S3Checkpointer 674 * @param bucketName of String 675 * @param key of String 676 * @param sinkMode of SinkMode 677 */ 678 public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, S3Checkpointer checkpointer, String bucketName, String key, SinkMode sinkMode ) 679 { 680 this( scheme, s3Client, checkpointer, bucketName, key, DEFAULT_DELIMITER, sinkMode ); 681 } 682 683 /** 684 * Constructor S3Tap creates a new S3Tap instance. 685 * 686 * @param scheme of Scheme 687 * @param s3Client of AmazonS3 688 * @param checkpointer of S3Checkpointer 689 * @param bucketName of String 690 * @param key of String 691 * @param delimiter of String 692 * @param sinkMode of SinkMode 693 */ 694 public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, S3Checkpointer checkpointer, String bucketName, String key, String delimiter, SinkMode sinkMode ) 695 { 696 this( scheme, s3Client, checkpointer, bucketName, key, delimiter, null, sinkMode ); 697 } 698 699 /** 700 * Constructor S3Tap creates a new S3Tap instance. 701 * 702 * @param scheme of Scheme 703 * @param s3Client of AmazonS3 704 * @param checkpointer of S3Checkpointer 705 * @param bucketName of String 706 * @param key of String 707 * @param delimiter of String 708 * @param filter of Predicate 709 * @param sinkMode of SinkMode 710 */ 711 public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, S3Checkpointer checkpointer, String bucketName, String key, String delimiter, Predicate<String> filter, SinkMode sinkMode ) 712 { 713 super( scheme, sinkMode ); 714 this.s3Client = s3Client; 715 this.checkpointer = checkpointer; 716 this.bucketName = bucketName; 717 718 if( isEmpty( this.bucketName ) ) 719 throw new IllegalArgumentException( "bucket name may not be null or empty" ); 720 721 this.key = key; 722 this.delimiter = delimiter; 723 this.filter = filter; 724 } 725 726 /** 727 * Constructor S3Tap creates a new S3Tap instance. 728 * 729 * @param scheme of Scheme 730 * @param identifier of URI 731 */ 732 public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, URI identifier ) 733 { 734 this( scheme, null, null, identifier, SinkMode.KEEP ); 735 } 736 737 /** 738 * Constructor S3Tap creates a new S3Tap instance. 739 * 740 * @param scheme of Scheme 741 * @param s3Client of AmazonS3 742 * @param identifier of URI 743 */ 744 public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, URI identifier ) 745 { 746 this( scheme, s3Client, null, identifier, SinkMode.KEEP ); 747 } 748 749 /** 750 * Constructor S3Tap creates a new S3Tap instance. 751 * 752 * @param scheme of Scheme 753 * @param checkpointer of S3Checkpointer 754 * @param identifier of URI 755 */ 756 public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, S3Checkpointer checkpointer, URI identifier ) 757 { 758 this( scheme, null, checkpointer, identifier, SinkMode.KEEP ); 759 } 760 761 /** 762 * Constructor S3Tap creates a new S3Tap instance. 763 * 764 * @param scheme of Scheme 765 * @param s3Client of AmazonS3 766 * @param checkpointer of S3Checkpointer 767 * @param identifier of URI 768 */ 769 public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, S3Checkpointer checkpointer, URI identifier ) 770 { 771 this( scheme, s3Client, checkpointer, identifier, SinkMode.KEEP ); 772 } 773 774 /** 775 * Constructor S3Tap creates a new S3Tap instance. 776 * 777 * @param scheme of Scheme 778 * @param identifier of URI 779 * @param sinkMode of SinkMode 780 */ 781 public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, URI identifier, SinkMode sinkMode ) 782 { 783 this( scheme, null, null, identifier, sinkMode ); 784 } 785 786 /** 787 * Constructor S3Tap creates a new S3Tap instance. 788 * 789 * @param scheme of Scheme 790 * @param s3Client of AmazonS3 791 * @param identifier of URI 792 * @param sinkMode of SinkMode 793 */ 794 public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, URI identifier, SinkMode sinkMode ) 795 { 796 this( scheme, s3Client, null, identifier, sinkMode ); 797 } 798 799 /** 800 * Constructor S3Tap creates a new S3Tap instance. 801 * 802 * @param scheme of Scheme 803 * @param checkpointer of S3Checkpointer 804 * @param identifier of URI 805 * @param sinkMode of SinkMode 806 */ 807 public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, S3Checkpointer checkpointer, URI identifier, SinkMode sinkMode ) 808 { 809 this( scheme, null, checkpointer, identifier, sinkMode ); 810 } 811 812 /** 813 * Constructor S3Tap creates a new S3Tap instance. 814 * 815 * @param scheme of Scheme 816 * @param s3Client of AmazonS3 817 * @param checkpointer of S3Checkpointer 818 * @param identifier of URI 819 * @param sinkMode of SinkMode 820 */ 821 public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, S3Checkpointer checkpointer, URI identifier, SinkMode sinkMode ) 822 { 823 super( scheme, sinkMode ); 824 this.s3Client = s3Client; 825 this.checkpointer = checkpointer; 826 827 if( identifier == null ) 828 throw new IllegalArgumentException( "identifier may not be null" ); 829 830 if( !identifier.getScheme().equalsIgnoreCase( "s3" ) ) 831 throw new IllegalArgumentException( "identifier does not have s3 scheme" ); 832 833 this.bucketName = getBucketNameFor( identifier ); 834 835 if( isEmpty( this.bucketName ) ) 836 throw new IllegalArgumentException( "bucket name may not be null or empty" + identifier ); 837 838 this.key = cleanKey( identifier ); 839 840 if( identifier.getQuery() != null ) 841 filter = globPredicate( identifier.getQuery() ); 842 } 843 844 protected String getBucketNameFor( URI identifier ) 845 { 846 String authority = identifier.getAuthority(); 847 848 if( isEmpty( authority ) ) 849 throw new IllegalArgumentException( "identifier must have an authority: " + identifier ); 850 851 int pos = authority.indexOf( '@' ); 852 853 if( pos != -1 ) 854 return authority.substring( pos + 1 ); 855 856 return authority; 857 } 858 859 private static Predicate<String> globPredicate( String glob ) 860 { 861 String regex = getRegexForGlob( glob ); 862 Pattern pattern = Pattern.compile( regex ); 863 864 return string -> pattern.matcher( string ).matches(); 865 } 866 867 private static String getRegexForGlob( String glob ) 868 { 869 return (String) Util.invokeStaticMethod( 870 "sun.nio.fs.Globs", 871 "toUnixRegexPattern", 872 new Object[]{glob}, 873 new Class[]{String.class} 874 ); 875 } 876 877 @Override 878 public TapWith<Properties, InputStream, OutputStream> withScheme( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme ) 879 { 880 // don't lazily create s3Client 881 return new S3Tap( scheme, s3Client, getBucketName(), getKey(), getDelimiter(), getFilter(), getSinkMode() ); 882 } 883 884 @Override 885 public TapWith<Properties, InputStream, OutputStream> withChildIdentifier( String identifier ) 886 { 887 URI uri; 888 889 if( identifier.startsWith( "s3://" ) ) 890 uri = URI.create( identifier ); 891 else if( identifier.startsWith( getBucketName() ) ) 892 uri = makeURI( identifier, null ); 893 else 894 uri = makeURI( getBucketName(), getKey() + ( identifier.startsWith( delimiter ) ? identifier : delimiter + identifier ) ); 895 896 // don't lazily create s3Client 897 return new S3Tap( getScheme(), s3Client, uri, getSinkMode() ); 898 } 899 900 @Override 901 public TapWith<Properties, InputStream, OutputStream> withSinkMode( SinkMode sinkMode ) 902 { 903 // don't lazily create s3Client 904 return new S3Tap( getScheme(), s3Client, getBucketName(), getKey(), getDelimiter(), getFilter(), sinkMode ); 905 } 906 907 protected String cleanKey( URI identifier ) 908 { 909 String path = identifier.getPath(); 910 911 if( path.startsWith( "/" ) ) 912 path = path.substring( 1 ); 913 914 return path; 915 } 916 917 protected AmazonS3 getS3Client( Properties properties ) 918 { 919 // return provided client 920 if( s3Client != null ) 921 return s3Client; 922 923 AmazonS3ClientBuilder standard = AmazonS3ClientBuilder.standard(); 924 925 if( properties != null ) 926 { 927 String endpoint = properties.getProperty( S3TapProps.S3_ENDPOINT ); 928 String region = properties.getProperty( S3TapProps.S3_REGION, "us-east-1" ); 929 930 if( properties.containsKey( S3TapProps.S3_PROXY_HOST ) ) 931 { 932 ClientConfiguration config = new ClientConfiguration() 933 .withProxyHost( properties.getProperty( S3TapProps.S3_PROXY_HOST ) ) 934 .withProxyPort( PropertyUtil.getIntProperty( properties, S3TapProps.S3_PROXY_PORT, -1 ) ); 935 936 standard.withClientConfiguration( config ); 937 } 938 939 if( endpoint != null ) 940 standard.withEndpointConfiguration( new AwsClientBuilder.EndpointConfiguration( endpoint, region ) ); 941 else 942 standard.setRegion( region ); 943 944 if( Boolean.parseBoolean( properties.getProperty( S3TapProps.S3_PATH_STYLE_ACCESS, "false" ) ) ) 945 standard.enablePathStyleAccess(); 946 } 947 948 return standard.build(); 949 } 950 951 /** 952 * Method getCheckpointer returns the checkpointer of this S3Tap object. 953 * 954 * @return the checkpointer (type S3Checkpointer) of this S3Tap object. 955 */ 956 public S3Checkpointer getCheckpointer() 957 { 958 return checkpointer; 959 } 960 961 /** 962 * Method getBucketName returns the bucketName of this S3Tap object. 963 * 964 * @return the bucketName (type String) of this S3Tap object. 965 */ 966 public String getBucketName() 967 { 968 return bucketName; 969 } 970 971 /** 972 * Method getKey returns the key of this S3Tap object. 973 * 974 * @return the key (type String) of this S3Tap object. 975 */ 976 public String getKey() 977 { 978 return key; 979 } 980 981 protected String getMarker() 982 { 983 if( checkpointer != null ) 984 return checkpointer.getLastKey( getBucketName() ); 985 986 return null; 987 } 988 989 protected void setLastMarker( String marker ) 990 { 991 if( checkpointer != null ) 992 checkpointer.setLastKey( getBucketName(), marker ); 993 } 994 995 protected void commitMarker() 996 { 997 if( checkpointer != null ) 998 checkpointer.commit(); 999 } 1000 1001 /** 1002 * Method getFilter returns the filter of this S3Tap object. 1003 * 1004 * @return the filter (type Predicate) of this S3Tap object. 1005 */ 1006 public Predicate<String> getFilter() 1007 { 1008 return filter; 1009 } 1010 1011 /** 1012 * Method getDelimiter returns the delimiter of this S3Tap object. 1013 * 1014 * @return the delimiter (type String) of this S3Tap object. 1015 */ 1016 public String getDelimiter() 1017 { 1018 return delimiter; 1019 } 1020 1021 @Override 1022 public String getIdentifier() 1023 { 1024 return makeStringIdentifier( getBucketName(), getKey() ); 1025 } 1026 1027 @Override 1028 public String getFullIdentifier( Properties conf ) 1029 { 1030 return getIdentifier(); 1031 } 1032 1033 @Override 1034 public boolean deleteResource( Properties conf ) throws IOException 1035 { 1036 AmazonS3 s3Client = getS3Client( conf ); 1037 1038 s3Client.deleteObject( getBucketName(), getKey() ); 1039 1040 return true; 1041 } 1042 1043 @Override 1044 public boolean createResource( Properties conf ) throws IOException 1045 { 1046 AmazonS3 s3Client = getS3Client( conf ); 1047 1048 s3Client.putObject( getBucketName(), getKey(), "" ); 1049 1050 return true; 1051 } 1052 1053 protected ObjectMetadata getObjectMetadata( Properties conf ) 1054 { 1055 if( objectMetadata == null ) 1056 objectMetadata = getS3Client( conf ).getObjectMetadata( getBucketName(), getKey() ); 1057 1058 return objectMetadata; 1059 } 1060 1061 private class CheckedFilterInputStream extends FilterInputStream 1062 { 1063 public CheckedFilterInputStream( InputStream inputStream ) 1064 { 1065 super( inputStream ); 1066 } 1067 } 1068 1069 @Override 1070 public TupleEntryIterator openForRead( FlowProcess<? extends Properties> flowProcess, InputStream input ) throws IOException 1071 { 1072 AmazonS3 s3Client = getS3Client( flowProcess.getConfig() ); 1073 1074 final String[] identifier = new String[ 1 ]; 1075 1076 CloseableIterator<InputStream> iterator = new CloseableIterator<InputStream>() 1077 { 1078 S3Iterable iterable = S3Iterable.iterable( s3Client, getBucketName(), getKey() ) 1079 .withFilter( getFilter() ) 1080 .withMarker( getMarker() ); 1081 1082 Iterator<S3ObjectSummary> iterator = iterable.iterator(); 1083 InputStream lastInputStream; 1084 1085 @Override 1086 public boolean hasNext() 1087 { 1088 return iterator.hasNext(); 1089 } 1090 1091 @Override 1092 public InputStream next() 1093 { 1094 safeClose(); 1095 1096 S3ObjectSummary objectSummary = iterator.next(); 1097 1098 identifier[ 0 ] = makeStringIdentifier( objectSummary.getBucketName(), objectSummary.getKey() ); 1099 1100 flowProcess.getFlowProcessContext().setSourcePath( identifier[ 0 ] ); 1101 1102 if( LOG.isDebugEnabled() ) 1103 LOG.debug( "s3 retrieving: {}/{}, with size: {}", objectSummary.getBucketName(), objectSummary.getKey(), objectSummary.getSize() ); 1104 1105 // getObject does not seem to fill the InputStream, nor does the InputStream support marking 1106 // may make sense to wrap this iterator in a iterate ahead iterator that attempts to pre-fetch objects in a different thread 1107 lastInputStream = new CheckedFilterInputStream( s3Client.getObject( objectSummary.getBucketName(), objectSummary.getKey() ).getObjectContent() ) 1108 { 1109 @Override 1110 public void close() throws IOException 1111 { 1112 setLastMarker( objectSummary.getKey() ); 1113 super.close(); 1114 } 1115 }; 1116 1117 return lastInputStream; 1118 } 1119 1120 private void safeClose() 1121 { 1122 try 1123 { 1124 if( lastInputStream != null ) 1125 lastInputStream.close(); 1126 1127 lastInputStream = null; 1128 } 1129 catch( IOException exception ) 1130 { 1131 // do nothing 1132 } 1133 } 1134 1135 @Override 1136 public void close() 1137 { 1138 safeClose(); 1139 commitMarker(); 1140 } 1141 }; 1142 1143 return new TupleEntrySchemeIterator<Properties, InputStream>( flowProcess, this, getScheme(), iterator, () -> identifier[ 0 ] ); 1144 } 1145 1146 @Override 1147 public TupleEntryCollector openForWrite( FlowProcess<? extends Properties> flowProcess, OutputStream outputStream ) throws IOException 1148 { 1149 AmazonS3 s3Client = getS3Client( flowProcess.getConfig() ); 1150 1151 if( !s3Client.doesBucketExist( getBucketName() ) ) 1152 s3Client.createBucket( getBucketName() ); 1153 1154 PipedInputStream pipedInputStream = new PipedInputStream(); 1155 PipedOutputStream pipedOutputStream = new PipedOutputStream( pipedInputStream ); 1156 1157 TransferManager transferManager = TransferManagerBuilder.standard().withS3Client( s3Client ).build(); 1158 1159 ObjectMetadata metadata = new ObjectMetadata(); 1160 1161 if( LOG.isDebugEnabled() ) 1162 LOG.debug( "starting upload: {}", getIdentifier() ); 1163 1164 final String key = resolveKey( flowProcess, getKey() ); 1165 1166 Upload upload = transferManager.upload( getBucketName(), key, pipedInputStream, metadata ); 1167 1168 return new TupleEntrySchemeCollector<Properties, OutputStream>( flowProcess, this, getScheme(), pipedOutputStream, makeStringIdentifier( getBucketName(), key ) ) 1169 { 1170 @Override 1171 public void close() 1172 { 1173 super.close(); 1174 1175 try 1176 { 1177 UploadResult uploadResult = upload.waitForUploadResult(); 1178 1179 if( uploadResult != null ) 1180 { 1181 if( LOG.isDebugEnabled() ) 1182 LOG.debug( "completed upload: {}, with key: {}", getIdentifier(), uploadResult.getKey() ); 1183 } 1184 } 1185 catch( InterruptedException exception ) 1186 { 1187 // ignore 1188 } 1189 finally 1190 { 1191 transferManager.shutdownNow( false ); 1192 } 1193 } 1194 }; 1195 } 1196 1197 protected String resolveKey( FlowProcess<? extends Properties> flowProcess, String key ) 1198 { 1199 int partNum = flowProcess.getIntegerProperty( PartitionTap.PART_NUM_PROPERTY, 0 ); 1200 1201 key = key.replace( SEQUENCE_TOKEN, String.format( "%05d", partNum ) ); 1202 1203 if( getScheme() instanceof FileFormat ) 1204 return key + "." + ( (FileFormat) getScheme() ).getExtension(); 1205 1206 return key; 1207 } 1208 1209 @Override 1210 public boolean resourceExists( Properties conf ) throws IOException 1211 { 1212 if( getKey() == null ) 1213 return getS3Client( conf ).doesBucketExist( getBucketName() ); 1214 1215 return getKey().endsWith( "/" ) || getS3Client( conf ).doesObjectExist( getBucketName(), getKey() ); 1216 } 1217 1218 @Override 1219 public long getModifiedTime( Properties conf ) throws IOException 1220 { 1221 return getObjectMetadata( conf ).getLastModified().getTime(); 1222 } 1223 1224 @Override 1225 public boolean isDirectory( FlowProcess<? extends Properties> flowProcess ) throws IOException 1226 { 1227 return MIME_DIRECTORY.equalsIgnoreCase( getObjectMetadata( flowProcess.getConfig() ).getContentType() ); 1228 } 1229 1230 @Override 1231 public boolean isDirectory( Properties conf ) throws IOException 1232 { 1233 return isDirectory( FlowProcess.nullFlowProcess() ); 1234 } 1235 1236 @Override 1237 public String[] getChildIdentifiers( FlowProcess<? extends Properties> flowProcess ) throws IOException 1238 { 1239 return getChildIdentifiers( flowProcess.getConfig() ); 1240 } 1241 1242 @Override 1243 public String[] getChildIdentifiers( Properties conf ) throws IOException 1244 { 1245 return getChildIdentifiers( conf, 1, false ); 1246 } 1247 1248 @Override 1249 public String[] getChildIdentifiers( FlowProcess<? extends Properties> flowProcess, int depth, boolean fullyQualified ) throws IOException 1250 { 1251 return getChildIdentifiers( flowProcess.getConfig(), depth, fullyQualified ); 1252 } 1253 1254 @Override 1255 public String[] getChildIdentifiers( Properties conf, int depth, boolean fullyQualified ) throws IOException 1256 { 1257 if( !resourceExists( conf ) ) 1258 return new String[ 0 ]; 1259 1260 S3Iterable objects = S3Iterable.iterable( getS3Client( conf ), getBucketName(), getKey() ) 1261 .withDelimiter( getDelimiter() ) 1262 .withMaxDepth( depth ) 1263 .withFilter( getFilter() ) 1264 .withMarker( getMarker() ); 1265 1266 Iterator<S3ObjectSummary> iterator = objects.iterator(); 1267 1268 List<String> results = new ArrayList<>(); 1269 1270 while( iterator.hasNext() ) 1271 results.add( makePath( iterator, fullyQualified ) ); 1272 1273 return results.toArray( new String[ results.size() ] ); 1274 } 1275 1276 protected String makePath( Iterator<S3ObjectSummary> iterator, boolean fullyQualified ) 1277 { 1278 String key = iterator.next().getKey(); 1279 1280 if( fullyQualified ) 1281 return makeStringIdentifier( getBucketName(), key ); 1282 1283 return key.substring( getKey().length() ); 1284 } 1285 1286 @Override 1287 public long getSize( FlowProcess<? extends Properties> flowProcess ) throws IOException 1288 { 1289 return getSize( flowProcess.getConfig() ); 1290 } 1291 1292 @Override 1293 public long getSize( Properties conf ) throws IOException 1294 { 1295 if( isDirectory( conf ) ) 1296 return 0; 1297 1298 return getObjectMetadata( conf ).getInstanceLength(); 1299 } 1300 1301 protected static String makeStringIdentifier( String bucketName, String keyPrefix ) 1302 { 1303 if( isEmpty( keyPrefix ) ) 1304 return String.format( "s3://%s/", bucketName ); 1305 1306 return String.format( "s3://%s/%s", bucketName, keyPrefix ); 1307 } 1308 }