set hive.map.aggr=true
set hive.groupby.skewindata=false
set mapred.reduce.tasks=31


CREATE TABLE dest1(key STRING, c1 INT, c2 STRING, C3 INT, c4 INT) STORED AS TEXTFILE


EXPLAIN EXTENDED
FROM srcpart src
INSERT OVERWRITE TABLE dest1 
SELECT substr(src.key,1,1), count(DISTINCT substr(src.value,5)), concat(substr(src.key,1,1),sum(substr(src.value,5))), sum(DISTINCT substr(src.value, 5)), count(DISTINCT src.value) 
WHERE src.ds = '2008-04-08'
GROUP BY substr(src.key,1,1)
ABSTRACT SYNTAX TREE:
  (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME srcpart) src)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME dest1))) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) key) 1 1)) (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION concat (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) key) 1 1) (TOK_FUNCTION sum (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5)))) (TOK_SELEXPR (TOK_FUNCTIONDI sum (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTIONDI count (. (TOK_TABLE_OR_COL src) value)))) (TOK_WHERE (= (. (TOK_TABLE_OR_COL src) ds) '2008-04-08')) (TOK_GROUPBY (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) key) 1 1))))

STAGE DEPENDENCIES:
  Stage-1 is a root stage
  Stage-0 depends on stages: Stage-1
  Stage-2 depends on stages: Stage-0

STAGE PLANS:
  Stage: Stage-1
    Map Reduce
      Alias -> Map Operator Tree:
        src 
          TableScan
            alias: src
            GatherStats: false
            Select Operator
              expressions:
                    expr: key
                    type: string
                    expr: value
                    type: string
              outputColumnNames: key, value
              Group By Operator
                aggregations:
                      expr: count(DISTINCT substr(value, 5))
                      expr: sum(substr(value, 5))
                      expr: sum(DISTINCT substr(value, 5))
                      expr: count(DISTINCT value)
                bucketGroup: false
                keys:
                      expr: substr(key, 1, 1)
                      type: string
                      expr: substr(value, 5)
                      type: string
                      expr: value
                      type: string
                mode: hash
                outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6
                Reduce Output Operator
                  key expressions:
                        expr: _col0
                        type: string
                        expr: _col1
                        type: string
                        expr: _col2
                        type: string
                  sort order: +++
                  Map-reduce partition columns:
                        expr: _col0
                        type: string
                  tag: -1
                  value expressions:
                        expr: _col3
                        type: bigint
                        expr: _col4
                        type: double
                        expr: _col5
                        type: double
                        expr: _col6
                        type: bigint
      Needs Tagging: false
      Path -> Alias:
        hdfs://mgrover-bigtop-centos-1.ent.cloudera.com:17020/user/hive/warehouse/srcpart/ds=2008-04-08/hr=11 [src]
        hdfs://mgrover-bigtop-centos-1.ent.cloudera.com:17020/user/hive/warehouse/srcpart/ds=2008-04-08/hr=12 [src]
      Path -> Partition:
        hdfs://mgrover-bigtop-centos-1.ent.cloudera.com:17020/user/hive/warehouse/srcpart/ds=2008-04-08/hr=11 
          Partition
            base file name: hr=11
            input format: org.apache.hadoop.mapred.TextInputFormat
            output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
            partition values:
              ds 2008-04-08
              hr 11
            properties:
              bucket_count -1
              columns key,value
              columns.types string:string
              file.inputformat org.apache.hadoop.mapred.TextInputFormat
              file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
              location hdfs://mgrover-bigtop-centos-1.ent.cloudera.com:17020/user/hive/warehouse/srcpart/ds=2008-04-08/hr=11
              name default.srcpart
              numFiles 1
              numPartitions 4
              numRows 0
              partition_columns ds/hr
              rawDataSize 0
              serialization.ddl struct srcpart { string key, string value}
              serialization.format 1
              serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
              totalSize 5812
              transient_lastDdlTime 1366737715
            serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
          
              input format: org.apache.hadoop.mapred.TextInputFormat
              output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
              properties:
                bucket_count -1
                columns key,value
                columns.types string:string
                file.inputformat org.apache.hadoop.mapred.TextInputFormat
                file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
                location hdfs://mgrover-bigtop-centos-1.ent.cloudera.com:17020/user/hive/warehouse/srcpart
                name default.srcpart
                numFiles 4
                numPartitions 4
                numRows 0
                partition_columns ds/hr
                rawDataSize 0
                serialization.ddl struct srcpart { string key, string value}
                serialization.format 1
                serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
                totalSize 23248
                transient_lastDdlTime 1366737718
              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
              name: default.srcpart
            name: default.srcpart
        hdfs://mgrover-bigtop-centos-1.ent.cloudera.com:17020/user/hive/warehouse/srcpart/ds=2008-04-08/hr=12 
          Partition
            base file name: hr=12
            input format: org.apache.hadoop.mapred.TextInputFormat
            output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
            partition values:
              ds 2008-04-08
              hr 12
            properties:
              bucket_count -1
              columns key,value
              columns.types string:string
              file.inputformat org.apache.hadoop.mapred.TextInputFormat
              file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
              location hdfs://mgrover-bigtop-centos-1.ent.cloudera.com:17020/user/hive/warehouse/srcpart/ds=2008-04-08/hr=12
              name default.srcpart
              numFiles 1
              numPartitions 4
              numRows 0
              partition_columns ds/hr
              rawDataSize 0
              serialization.ddl struct srcpart { string key, string value}
              serialization.format 1
              serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
              totalSize 5812
              transient_lastDdlTime 1366737716
            serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
          
              input format: org.apache.hadoop.mapred.TextInputFormat
              output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
              properties:
                bucket_count -1
                columns key,value
                columns.types string:string
                file.inputformat org.apache.hadoop.mapred.TextInputFormat
                file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
                location hdfs://mgrover-bigtop-centos-1.ent.cloudera.com:17020/user/hive/warehouse/srcpart
                name default.srcpart
                numFiles 4
                numPartitions 4
                numRows 0
                partition_columns ds/hr
                rawDataSize 0
                serialization.ddl struct srcpart { string key, string value}
                serialization.format 1
                serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
                totalSize 23248
                transient_lastDdlTime 1366737718
              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
              name: default.srcpart
            name: default.srcpart
      Reduce Operator Tree:
        Group By Operator
          aggregations:
                expr: count(DISTINCT KEY._col1:0._col0)
                expr: sum(VALUE._col1)
                expr: sum(DISTINCT KEY._col1:1._col0)
                expr: count(DISTINCT KEY._col1:2._col0)
          bucketGroup: false
          keys:
                expr: KEY._col0
                type: string
          mode: mergepartial
          outputColumnNames: _col0, _col1, _col2, _col3, _col4
          Select Operator
            expressions:
                  expr: _col0
                  type: string
                  expr: _col1
                  type: bigint
                  expr: concat(_col0, _col2)
                  type: string
                  expr: _col3
                  type: double
                  expr: _col4
                  type: bigint
            outputColumnNames: _col0, _col1, _col2, _col3, _col4
            Select Operator
              expressions:
                    expr: _col0
                    type: string
                    expr: UDFToInteger(_col1)
                    type: int
                    expr: _col2
                    type: string
                    expr: UDFToInteger(_col3)
                    type: int
                    expr: UDFToInteger(_col4)
                    type: int
              outputColumnNames: _col0, _col1, _col2, _col3, _col4
              File Output Operator
                compressed: false
                GlobalTableId: 1
                directory: hdfs://mgrover-bigtop-centos-1.ent.cloudera.com:17020/tmp/hive-root/hive_2013-04-23_11-34-17_553_86036101665952261/-ext-10000
                NumFilesPerFileSink: 1
                Stats Publishing Key Prefix: hdfs://mgrover-bigtop-centos-1.ent.cloudera.com:17020/tmp/hive-root/hive_2013-04-23_11-34-17_553_86036101665952261/-ext-10000/
                table:
                    input format: org.apache.hadoop.mapred.TextInputFormat
                    output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
                    properties:
                      bucket_count -1
                      columns key,c1,c2,c3,c4
                      columns.types string:int:string:int:int
                      file.inputformat org.apache.hadoop.mapred.TextInputFormat
                      file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
                      location hdfs://mgrover-bigtop-centos-1.ent.cloudera.com:17020/user/hive/warehouse/dest1
                      name default.dest1
                      serialization.ddl struct dest1 { string key, i32 c1, string c2, i32 c3, i32 c4}
                      serialization.format 1
                      serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
                      transient_lastDdlTime 1366742057
                    serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
                    name: default.dest1
                TotalFiles: 1
                GatherStats: true
                MultiFileSpray: false
      Truncated Path -> Alias:
        /srcpart/ds=2008-04-08/hr=11 [src]
        /srcpart/ds=2008-04-08/hr=12 [src]

  Stage: Stage-0
    Move Operator
      tables:
          replace: true
          source: hdfs://mgrover-bigtop-centos-1.ent.cloudera.com:17020/tmp/hive-root/hive_2013-04-23_11-34-17_553_86036101665952261/-ext-10000
          table:
              input format: org.apache.hadoop.mapred.TextInputFormat
              output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
              properties:
                bucket_count -1
                columns key,c1,c2,c3,c4
                columns.types string:int:string:int:int
                file.inputformat org.apache.hadoop.mapred.TextInputFormat
                file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
                location hdfs://mgrover-bigtop-centos-1.ent.cloudera.com:17020/user/hive/warehouse/dest1
                name default.dest1
                serialization.ddl struct dest1 { string key, i32 c1, string c2, i32 c3, i32 c4}
                serialization.format 1
                serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
                transient_lastDdlTime 1366742057
              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
              name: default.dest1
          tmp directory: hdfs://mgrover-bigtop-centos-1.ent.cloudera.com:17020/tmp/hive-root/hive_2013-04-23_11-34-17_553_86036101665952261/-ext-10001

  Stage: Stage-2
    Stats-Aggr Operator
      Stats Aggregation Key Prefix: hdfs://mgrover-bigtop-centos-1.ent.cloudera.com:17020/tmp/hive-root/hive_2013-04-23_11-34-17_553_86036101665952261/-ext-10000/




FROM srcpart src
INSERT OVERWRITE TABLE dest1 
SELECT substr(src.key,1,1), count(DISTINCT substr(src.value,5)), concat(substr(src.key,1,1),sum(substr(src.value,5))), sum(DISTINCT substr(src.value, 5)), count(DISTINCT src.value) 
WHERE src.ds = '2008-04-08'
GROUP BY substr(src.key,1,1)
Deleted /user/hive/warehouse/dest1


SELECT dest1.* FROM dest1
0	1	00.0	0	1
1	71	132828.0	10044	71
2	69	251142.0	15780	69
3	62	364008.0	20119	62
4	74	4105526.0	30965	74
5	6	5794.0	278	6
6	5	6796.0	331	5
7	6	71470.0	447	6
8	8	81524.0	595	8
9	7	92094.0	577	7
