- We need to write a UDF for this.
package org.apache.hadoop.hive.contrib.udf;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.hive.ql.udf.UDFType;
import org.apache.hadoop.io.LongWritable;
@Description(name = "row_sequence",
value = "_FUNC_() - Returns a generated row sequence number starting from 1")
@UDFType(deterministic = false, stateful = true)
public class UDFRowSequence extends UDF
{
private LongWritable result = new LongWritable();
public UDFRowSequence() {
result.set(0);
}
public LongWritable evaluate() {
result.set(result.get() + 1);
return result;
}
}
- Create a jar for this.
- Create a Hive managed Hbase table with id as column for auto increment.
create table ga_tblagetest(id String,userAgeBracket String, date String,sessions String,percentNewSessions String,newUsers String,bounceRate String,pageviewsPerSession String,avgSessionDuration String)
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES ("hbase.columns.mapping" = ":key,cf1:userAgeBracket,cf1:date,cf1:sessions,cf1:percentNewSessions,cf1:newUsers,cf1:bounceRate,cf1:pageviewsPerSession,cf1:avgSessionDuration")
TBLPROPERTIES ("hbase.table.name" = "ga_tblagetest");
- In hive, add the jar : ADD JAR /path/customhiveudf.jar
- create temporary function incrementrow as 'org.apache.hadoop.hive.contrib.udf.UDFRowSequence';
- CREATE TEMPORARY TABLE default.tmp2 AS SELECT incrementrow(),* FROM socialmedia.tblage;
- INSERT OVERWRITE TABLE ga_tblagetest SELECT * FROM default.tmp2;
- scan 'ga_tblagetest'