Thursday 18 February 2016

Create an identity (auto-increment) column in Hive for dumping data into HBase


  • We need to write a UDF for this.

package org.apache.hadoop.hive.contrib.udf;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.hive.ql.udf.UDFType;
import org.apache.hadoop.io.LongWritable;

@Description(name = "row_sequence",
    value = "_FUNC_() - Returns a generated row sequence number starting from 1")
@UDFType(deterministic = false, stateful = true)

public class UDFRowSequence extends UDF
{
  private LongWritable result = new LongWritable();

  public UDFRowSequence() {
    result.set(0);
  }

  public LongWritable evaluate() {
    result.set(result.get() + 1);
    return result;
  }
}


  • Create a jar for this.
  • Create a Hive-managed HBase table with an id column to hold the auto-increment key.

-- Hive-managed table backed by HBase. The first Hive column (id) maps to
-- the HBase row key (:key); every other column is stored in column family cf1.
create table ga_tblagetest(id String,userAgeBracket String, date String,sessions String,percentNewSessions String,newUsers String,bounceRate String,pageviewsPerSession String,avgSessionDuration String)
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
-- Mapping is positional: entries here correspond 1:1, in order, to the Hive columns above.
WITH SERDEPROPERTIES ("hbase.columns.mapping" = ":key,cf1:userAgeBracket,cf1:date,cf1:sessions,cf1:percentNewSessions,cf1:newUsers,cf1:bounceRate,cf1:pageviewsPerSession,cf1:avgSessionDuration")
-- Name of the underlying table created in HBase.
TBLPROPERTIES ("hbase.table.name" = "ga_tblagetest");


  • In hive, add the jar : ADD JAR /path/customhiveudf.jar
  • create temporary function incrementrow as 'org.apache.hadoop.hive.contrib.udf.UDFRowSequence';
  • CREATE TEMPORARY TABLE default.tmp2 AS SELECT incrementrow(),* FROM socialmedia.tblage;
  • INSERT OVERWRITE TABLE ga_tblagetest SELECT * FROM default.tmp2;
The data, keyed by the generated row key, can now be queried in HBase.
  • scan 'ga_tblagetest'

No comments:

Post a Comment