/*
 * Copyright (c) 2017 Chris K Wensel <chris@wensel.net>. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cascading.aws.s3.logs;

import java.io.IOException;
import java.net.URI;
import java.util.TimeZone;

import cascading.flow.Flow;
import cascading.flow.local.LocalFlowConnector;
import cascading.local.tap.aws.s3.S3FileCheckpointer;
import cascading.local.tap.aws.s3.S3Tap;
import cascading.local.tap.kafka.KafkaTap;
import cascading.local.tap.kafka.TextKafkaScheme;
import cascading.operation.Debug;
import cascading.operation.regex.RegexParser;
import cascading.operation.text.DateFormatter;
import cascading.pipe.Each;
import cascading.pipe.Pipe;
import cascading.scheme.local.TextDelimited;
import cascading.scheme.local.TextLine;
import cascading.tap.SinkMode;
import cascading.tap.Tap;
import cascading.tap.local.DirTap;
import cascading.tap.local.PartitionTap;
import cascading.tap.partition.DelimitedPartition;
import cascading.tuple.Fields;
import cascading.tuple.type.DateType;

import static cascading.flow.FlowDef.flowDef;
import static cascading.local.tap.kafka.TextKafkaScheme.OFFSET_FIELDS;
import static cascading.local.tap.kafka.TextKafkaScheme.TOPIC_FIELDS;

/**
 * A trivial application that reads S3 logs from an S3 bucket, places them into a Kafka topic,
 * then reads the logs back from the topic, parses them, and writes them to directories
 * partitioned on log values.
 * <p>
 * Arguments: {@code <source s3 uri> <kafka host> <sink file path> [checkpoint file path]}
 */
public class Main
  {
  /** Date pattern used both for parsing the queue key and for naming output partitions. */
  public static final String DD_MMM_YYYY = "dd-MMM-yyyy";
  /** S3 access-log timestamps are UTC; pin the zone so results don't depend on the host locale. */
  public static final TimeZone UTC = TimeZone.getTimeZone( "UTC" );
  public static final DateType DMY = new DateType( DD_MMM_YYYY, UTC );
  public static final Fields KEY = new Fields( "date", DMY );
  public static final Fields LINE = new Fields( "line", String.class );
  public static final Fields KEY_LINE = KEY.append( LINE );

  /**
   * Wires up and runs two local-mode flows: an ingress flow that copies raw S3 log lines
   * into a Kafka topic keyed by log date, and an egress flow that parses those lines and
   * writes them to partitioned CSV directories.
   *
   * @param args source S3 URI, Kafka host, sink directory path, and an optional checkpoint file path
   * @throws IOException if a tap fails during flow construction
   */
  public static void main( String[] args ) throws IOException
    {
    if( args.length < 3 )
      {
      // print usage instead of exiting silently so a misinvocation is diagnosable
      System.err.println( "usage: <source s3 uri> <kafka host> <sink file path> [checkpoint file path]" );
      return;
      }

    System.out.println( "source s3 uri = " + args[ 0 ] );
    System.out.println( "kafka host = " + args[ 1 ] );
    System.out.println( "sink file path = " + args[ 2 ] );

    if( args.length == 4 )
      System.out.println( "checkpoint file path = " + args[ 3 ] );

    // read from an S3 bucket
    // optionally restart where a previous run left off
    // fix: the ternary branches were inverted -- when the 4th arg (checkpoint path) is given,
    // use it; otherwise fall back to the default checkpointer. The original read args[ 3 ]
    // when only 3 args were supplied, throwing ArrayIndexOutOfBoundsException.
    S3FileCheckpointer checkpointer = args.length == 4 ? new S3FileCheckpointer( args[ 3 ] ) : new S3FileCheckpointer();
    Tap inputTap = new S3Tap( new TextLine(), checkpointer, URI.create( args[ 0 ] ) );

    // write and read from a Kafka queue
    Tap queueTap = new KafkaTap<>( new TextKafkaScheme( TOPIC_FIELDS.append( OFFSET_FIELDS ).append( KEY_LINE ) ), args[ 1 ], "parsers", "logs" );

    // write to disk, using log data to create the directory structure
    // if file exists, append to it -- we aren't duplicating s3 reads so this is safe
    DelimitedPartition partitioner = new DelimitedPartition( KEY.append( S3Logs.OPERATION ), "/", "logs.csv" );
    Tap outputTap = new PartitionTap(
      new DirTap( new TextDelimited( true, ",", "\"" ), args[ 2 ], SinkMode.UPDATE ), partitioner
    );

    Pipe ingress = new Pipe( "head" );

    // extract the log timestamp and reduce to day/month/year for use as the queue key
    ingress = new Each( ingress, new Fields( "line" ), new RegexParser( S3Logs.TIME, S3Logs.REGEX, 3 ), new Fields( "time", "line" ) );
    ingress = new Each( ingress, S3Logs.TIME, new DateFormatter( KEY, DD_MMM_YYYY, UTC ), KEY_LINE );

    // watch the progress on the console
    ingress = new Each( ingress, new Debug( true ) );

    Flow ingressFlow = new LocalFlowConnector().connect( flowDef()
      .setName( "ingress" )
      .addSource( ingress, inputTap )
      .addSink( ingress, queueTap )
      .addTail( ingress )
    );

    // start reading from S3 and writing to a Kafka queue
    ingressFlow.start();

    Pipe egress = new Pipe( "head" );

    // parse the full log into its fields and primitive values -- S3Logs.FIELDS declared field names and field types
    egress = new Each( egress, new Fields( "line" ), new RegexParser( S3Logs.FIELDS, S3Logs.REGEX ), KEY.append( S3Logs.FIELDS ) );

    // watch the progress on the console
    egress = new Each( egress, new Debug( true ) );

    Flow egressFlow = new LocalFlowConnector().connect( flowDef()
      .setName( "egress" )
      .addSource( egress, queueTap )
      .addSink( egress, outputTap )
      .addTail( egress )
    );

    // start reading from the Kafka queue and writing to the directory as ./[dd-MMM-yyyy]/[S3 operation]/logs.csv
    egressFlow.start();

    // block until each flow drains; egress first since it consumes what ingress produces
    egressFlow.complete();
    System.out.println( "completed egress" );
    ingressFlow.complete();
    System.out.println( "completed ingress" );
    }
  }