001/*
002 * Copyright (c) 2017 Chris K Wensel <chris@wensel.net>. All Rights Reserved.
003 *
004 * Project and contact information: http://www.cascading.org/
005 *
006 * This file is part of the Cascading project.
007 *
008 * Licensed under the Apache License, Version 2.0 (the "License");
009 * you may not use this file except in compliance with the License.
010 * You may obtain a copy of the License at
011 *
012 *     http://www.apache.org/licenses/LICENSE-2.0
013 *
014 * Unless required by applicable law or agreed to in writing, software
015 * distributed under the License is distributed on an "AS IS" BASIS,
016 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
017 * See the License for the specific language governing permissions and
018 * limitations under the License.
019 */
020
021package cascading.aws.s3.logs;
022
023import java.io.IOException;
024import java.net.URI;
025import java.util.TimeZone;
026
027import cascading.flow.Flow;
028import cascading.flow.local.LocalFlowConnector;
029import cascading.local.tap.aws.s3.S3FileCheckpointer;
030import cascading.local.tap.aws.s3.S3Tap;
031import cascading.local.tap.kafka.KafkaTap;
032import cascading.local.tap.kafka.TextKafkaScheme;
033import cascading.operation.Debug;
034import cascading.operation.regex.RegexParser;
035import cascading.operation.text.DateFormatter;
036import cascading.pipe.Each;
037import cascading.pipe.Pipe;
038import cascading.scheme.local.TextDelimited;
039import cascading.scheme.local.TextLine;
040import cascading.tap.SinkMode;
041import cascading.tap.Tap;
042import cascading.tap.local.DirTap;
043import cascading.tap.local.PartitionTap;
044import cascading.tap.partition.DelimitedPartition;
045import cascading.tuple.Fields;
046import cascading.tuple.type.DateType;
047
048import static cascading.flow.FlowDef.flowDef;
049import static cascading.local.tap.kafka.TextKafkaScheme.OFFSET_FIELDS;
050import static cascading.local.tap.kafka.TextKafkaScheme.TOPIC_FIELDS;
051
052/**
053 * A trivial application that can read S3 logs from a S3 bucket, place them into a Kafka topic,
054 * read the logs from the topic, parse them, and write them to directories partitioned on log values.
055 */
056public class Main
057  {
058  public static final String DD_MMM_YYYY = "dd-MMM-yyyy";
059  public static final TimeZone UTC = TimeZone.getTimeZone( "UTC" );
060  public static final DateType DMY = new DateType( DD_MMM_YYYY, UTC );
061  public static final Fields KEY = new Fields( "date", DMY );
062  public static final Fields LINE = new Fields( "line", String.class );
063  public static final Fields KEY_LINE = KEY.append( LINE );
064
065  public static void main( String[] args ) throws IOException
066    {
067    if( args.length < 3 )
068      return;
069
070    System.out.println( "source s3 uri = " + args[ 0 ] );
071    System.out.println( "kafka host = " + args[ 1 ] );
072    System.out.println( "sink file path = " + args[ 2 ] );
073
074    if( args.length == 4 )
075      System.out.println( "checkpoint file path = " + args[ 3 ] );
076
077    // read from an S3 bucket
078    // optionally restart where a previous run left off
079    S3FileCheckpointer checkpointer = args.length == 4 ? new S3FileCheckpointer() : new S3FileCheckpointer( args[ 3 ] );
080    Tap inputTap = new S3Tap( new TextLine(), checkpointer, URI.create( args[ 0 ] ) );
081
082    // write and read from a Kafka queue
083    Tap queueTap = new KafkaTap<>( new TextKafkaScheme( TOPIC_FIELDS.append( OFFSET_FIELDS ).append( KEY_LINE ) ), args[ 1 ], "parsers", "logs" );
084
085    // write to disk, using log data to create the directory structure
086    // if file exists, append to it -- we aren't duplicating s3 reads so this is safe
087    DelimitedPartition partitioner = new DelimitedPartition( KEY.append( S3Logs.OPERATION ), "/", "logs.csv" );
088    Tap outputTap = new PartitionTap(
089      new DirTap( new TextDelimited( true, ",", "\"" ), args[ 2 ], SinkMode.UPDATE ), partitioner
090    );
091
092    Pipe ingress = new Pipe( "head" );
093
094    // extract the log timestamp and reduce to day/month/year for use as the queue key
095    ingress = new Each( ingress, new Fields( "line" ), new RegexParser( S3Logs.TIME, S3Logs.REGEX, 3 ), new Fields( "time", "line" ) );
096    ingress = new Each( ingress, S3Logs.TIME, new DateFormatter( KEY, DD_MMM_YYYY, UTC ), KEY_LINE );
097
098    // watch the progress on the console
099    ingress = new Each( ingress, new Debug( true ) );
100
101    Flow ingressFlow = new LocalFlowConnector().connect( flowDef()
102      .setName( "ingress" )
103      .addSource( ingress, inputTap )
104      .addSink( ingress, queueTap )
105      .addTail( ingress )
106    );
107
108    // start reading from S3 and writing to a Kafka queue
109    ingressFlow.start();
110
111    Pipe egress = new Pipe( "head" );
112
113    // parse the full log into its fields and primitive values -- S3Logs.FIELDS declard field names and field types
114    egress = new Each( egress, new Fields( "line" ), new RegexParser( S3Logs.FIELDS, S3Logs.REGEX ), KEY.append( S3Logs.FIELDS ) );
115
116    // watch the progress on the console
117    egress = new Each( egress, new Debug( true ) );
118
119    Flow egressFlow = new LocalFlowConnector().connect( flowDef()
120      .setName( "egress" )
121      .addSource( egress, queueTap )
122      .addSink( egress, outputTap )
123      .addTail( egress )
124    );
125
126    // start reading from the Kafka queue and writing to the directory as ./[dd-MMM-yyyy]/[S3 operation]/logs.csv
127    egressFlow.start();
128
129    egressFlow.complete();
130    System.out.println( "completed egress" );
131    ingressFlow.complete();
132    System.out.println( "completed ingress" );
133    }
134  }