Tuesday 20 December 2016

bash Script to get Hue link in amazon EMR AWS on cluster startup

#!/bin/bash

set -xe
# =========================================================== #
# Created By: Lokesh Nanda
# Date: 21/12/2016
# Sample call: /home/hadoop/HueScripts/hueurl.sh
# Getting the DNS name of EMR on cluster startup and dumping it to /home/hadoop/HueScripts/hueurl.properties
# BDP UI to read /home/hadoop/HueScripts/hueurl.properties file to get the latest link
# =========================================================== #

InstanceIDDev=$(aws emr list-clusters | grep -m 1 \"Id\" | awk '{print $2}')
InstanceIDEMR=$(echo "$InstanceIDDev" | sed 's/\"//g'| sed 's/\,//g')
DescribeEMR=$(aws emr describe-cluster --cluster-id "${InstanceIDEMR}" | python -c "import sys, json; print json.load(sys.stdin)['Cluster']['MasterPublicDnsName']")
echo "${DescribeEMR}"> /home/hadoop/HueScripts/hueurl.properties
sudo sed -i s/ec2/huelink=ec2/ /home/hadoop/HueScripts/hueurl.properties
sudo sed -i s/compute.amazonaws.com/compute.amazonaws.com:8888/ /home/hadoop/HueScripts/hueurl.properties
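
A quick way to verify the result; the hostname below is hypothetical, but the format follows from the two sed substitutions above:

cat /home/hadoop/HueScripts/hueurl.properties
# huelink=ec2-xx-xx-xx-xx.us-west-2.compute.amazonaws.com:8888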

Tuesday 25 October 2016

Logging in Bash Script

Logger program to be used in scripts. (Create one logger.sh and source it from all the .sh files that need logging.)

#!/bin/bash

# =========================================================== #
# Created By: Lokesh Nanda
# Date: 24/10/2016
# Accepts a logfile name as parameter.
# Sample call: /home/admin123/Logging/logger.sh ${logfilename}
# used in shell script for logging.
# Supports 8 functions
# =========================================================== #


SCRIPT_FILE=$1
SCRIPT_LOG=/home/admin123/Logging/${SCRIPT_FILE}.log
touch "$SCRIPT_LOG"

function SUCCESS(){
    local msg="$1"
    timeAndDate=$(date)
    echo "[$timeAndDate] [BDP_SUCCESS]  $msg" >> "$SCRIPT_LOG"
    echo "[$timeAndDate] [BDP_SUCCESS]  $msg"
}

function PROPERTYFILEERROR(){
 timeAndDate=$(date)
 script_name=$(basename "$0")
 local msg="$1"
 script_name="${script_name%.*}"
 echo "[$timeAndDate] [BDP_DEBUG]  > $script_name $msg" >> "$SCRIPT_LOG"
 echo "[$timeAndDate] [BDP_DEBUG]  > $script_name $msg"
}

function SCRIPTENTRY(){
 timeAndDate=$(date)
 script_name=$(basename "$0")
 script_name="${script_name%.*}"
 echo "[$timeAndDate] [BDP_DEBUG]  > $script_name $FUNCNAME" >> "$SCRIPT_LOG"
 echo "[$timeAndDate] [BDP_DEBUG]  > $script_name $FUNCNAME"
}

function SCRIPTEXIT(){
 timeAndDate=$(date)
 script_name=$(basename "$0")
 script_name="${script_name%.*}"
 echo "[$timeAndDate] [BDP_DEBUG]  < $script_name $FUNCNAME" >> "$SCRIPT_LOG"
 echo "[$timeAndDate] [BDP_DEBUG]  < $script_name $FUNCNAME"
}

function ENTRY(){
 local cfn="${FUNCNAME[1]}"
 timeAndDate=$(date)
 echo "[$timeAndDate] [BDP_DEBUG]  > $cfn $FUNCNAME" >> "$SCRIPT_LOG"
 echo "[$timeAndDate] [BDP_DEBUG]  > $cfn $FUNCNAME"
}

function EXIT(){
 local cfn="${FUNCNAME[1]}"
 timeAndDate=$(date)
 echo "[$timeAndDate] [BDP_DEBUG]  < $cfn $FUNCNAME" >> "$SCRIPT_LOG"
 echo "[$timeAndDate] [BDP_DEBUG]  < $cfn $FUNCNAME"
}


function INFO(){
    local msg="$1"
    timeAndDate=$(date)
    echo "[$timeAndDate] [BDP_INFO]  $msg" >> "$SCRIPT_LOG"
    echo "[$timeAndDate] [BDP_INFO]  $msg"
}


function DEBUG(){
    local msg="$1"
    timeAndDate=$(date)
 echo "[$timeAndDate] [BDP_DEBUG]  $msg" >> "$SCRIPT_LOG"
 echo "[$timeAndDate] [BDP_DEBUG]  $msg"
}

function ERROR(){
    local msg="$1"
    timeAndDate=$(date)
    echo "[$timeAndDate] [BDP_ERROR]  $msg" >> "$SCRIPT_LOG"
    echo "[$timeAndDate] [BDP_ERROR]  $msg"
}


testlogger.sh shows how to call logger.sh and implement logging:

#!/bin/bash
#set -e

# =========================================================== #
# Created By: Lokesh Nanda
# Date: 24/10/2016
# Calls logger.sh for logging framework
# Sample call: /home/admin123/Logging/testlogger.sh
# CHANGES IN SCRIPT: 
#      logfilename : file name of log (ex - testloginfo)
#
# Supports 8 functions
# =========================================================== #


logfilename=testloginfo
source /home/admin123/Logging/logger.sh ${logfilename}
SCRIPTENTRY

. /home/admin123/sqoop-jobs/connections/68.properties || PROPERTYFILEERROR "401: Failed to read property file"

jobName=mysqltest123
table=twitter
hiveDatabase=testdb
hiveTable=testtable26
stagingDir="${STORAGEURI}/stagingDir/${hiveDatabase}/${hiveTable}"
targetDir="${STORAGEURI}/${hiveDatabase}/${hiveTable}"

triggerSqoopJob(){
    ENTRY
    DEBUG "Triggering sqoop job import from Database $1 and Table $2"
    sqoop import --driver com.mysql.jdbc.Driver --connect jdbc:mysql://${SERVERNAME}:${PORT}/${DATABASENAME} --username ${USERNAME} --password ${PASSWORD} --enclosed-by '\"' --table ${table} --num-mappers ${MAXMAPPER} --null-string \'\\\\N\' --null-non-string \'\\\\N\' --hive-drop-import-delims  --map-column-java Token=String,AccountName=String,ConsumerKey=String,ConsumerSecret=String,HashTags=String,HiveTable=String,isActive=String,Secret=String --target-dir ${stagingDir} >> /home/admin123/Logging/${logfilename}.log 2>&1 && SUCCESS "sqoop import successful Database $1 and Table $2" || ERROR "404: Failed to run sqoop import for Database $1 and Table $2" 
    INFO "SqoopJob details updated for Database $1 and Table $2"
    EXIT
}

triggerHiveJob(){
    ENTRY
    DEBUG "Triggering Hive Job"
    hive -e "CREATE DATABASE IF NOT EXISTS ${hiveDatabase};

DROP TABLE IF EXISTS ${hiveDatabase}.stage_${hiveTable};

CREATE EXTERNAL table if not exists ${hiveDatabase}.stage_${hiveTable} (
\`ID\`  INT,
\`Token\`  VARCHAR(255),
\`AccountName\`  VARCHAR(255),
\`ActiveEndDate\`  TIMESTAMP,
\`ActiveStartDate\`  TIMESTAMP,
\`ConsumerKey\`  VARCHAR(255),
\`ConsumerSecret\`  VARCHAR(255),
\`CreatedDate\`  TIMESTAMP,
\`HashTags\`  VARCHAR(255),
\`HiveTable\`  VARCHAR(255),
\`isActive\`  VARCHAR(255),
\`Secret\`  VARCHAR(255),
\`UserId\`  INT)
ROW FORMAT SERDE 'com.bizo.hive.serde.csv.CSVSerde'
Location '${stagingDir}';" >> /home/admin123/Logging/${logfilename}.log 2>&1 && SUCCESS "Hive job was successful for Database $1 and Table $2" || ERROR "402: Failed to run Hive job for Database $1 and Table $2" 

INFO "HiveJob details updated for Database $1 and Table $2"
EXIT

}


triggerSqoopJob ${DATABASENAME} ${table}
triggerHiveJob ${hiveDatabase} stage_${hiveTable}

SCRIPTEXIT


Sample Log:

[Tue Oct 25 08:11:31 UTC 2016] [BDP_DEBUG]  > testlogger SCRIPTENTRY
[Tue Oct 25 08:11:31 UTC 2016] [BDP_DEBUG]  > triggerSqoopJob ENTRY
[Tue Oct 25 08:11:31 UTC 2016] [BDP_DEBUG]  Triggering sqoop job import from Database oxigen and Table twitter
Warning: /usr/hdp/2.4.2.4-5/accumulo does not exist! Accumulo imports will fail.
Please set $ACCUMULO_HOME to the root of your Accumulo installation.
16/10/25 08:11:32 INFO sqoop.Sqoop: Running Sqoop version: 1.4.6.2.4.2.4-5
16/10/25 08:11:33 WARN tool.BaseSqoopTool: Setting your password on the command-line is insecure. Consider using -P instead.
16/10/25 08:11:33 WARN sqoop.ConnFactory: Parameter --driver is set to an explicit driver however appropriate connection manager is not being set (via --connection-manager). Sqoop is going to fall back to org.apache.sqoop.manager.GenericJdbcManager. Please specify explicitly which connection manager should be used next time.
16/10/25 08:11:33 INFO manager.SqlManager: Using default fetchSize of 1000
16/10/25 08:11:33 INFO tool.CodeGenTool: Beginning code generation
SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/usr/hdp/2.4.2.4-5/hadoop/lib/slf4j-log4j12-1.7.10.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/usr/hdp/2.4.2.4-5/zookeeper/lib/slf4j-log4j12-1.6.1.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory]
16/10/25 08:11:34 INFO manager.SqlManager: Executing SQL statement: SELECT t.* FROM twitter AS t WHERE 1=0
16/10/25 08:11:34 INFO manager.SqlManager: Executing SQL statement: SELECT t.* FROM twitter AS t WHERE 1=0
16/10/25 08:11:34 INFO orm.CompilationManager: HADOOP_MAPRED_HOME is /usr/hdp/2.4.2.4-5/hadoop-mapreduce
Note: /tmp/sqoop-admin123/compile/9ba3c1ba3ed2d158a63dab4a5f950ebd/twitter.java uses or overrides a deprecated API.
Note: Recompile with -Xlint:deprecation for details.
16/10/25 08:11:36 INFO orm.CompilationManager: Writing jar file: /tmp/sqoop-admin123/compile/9ba3c1ba3ed2d158a63dab4a5f950ebd/twitter.jar
16/10/25 08:11:37 INFO mapreduce.ImportJobBase: Beginning import of twitter
16/10/25 08:11:37 INFO manager.SqlManager: Executing SQL statement: SELECT t.* FROM twitter AS t WHERE 1=0
16/10/25 08:11:39 INFO impl.TimelineClientImpl: Timeline service address: http://headnodehost:8188/ws/v1/timeline/
16/10/25 08:11:40 ERROR tool.ImportTool: Encountered IOException running import job: org.apache.hadoop.mapred.FileAlreadyExistsException: Output directory wasb://oxigencentoscluster@sacentos.blob.core.windows.net/sqoop-import-data/stagingDir/testdb/testtable26 already exists
at org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.checkOutputSpecs(FileOutputFormat.java:146)
at org.apache.hadoop.mapreduce.JobSubmitter.checkSpecs(JobSubmitter.java:266)
at org.apache.hadoop.mapreduce.JobSubmitter.submitJobInternal(JobSubmitter.java:139)
at org.apache.hadoop.mapreduce.Job$10.run(Job.java:1290)
at org.apache.hadoop.mapreduce.Job$10.run(Job.java:1287)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:415)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1709)
at org.apache.hadoop.mapreduce.Job.submit(Job.java:1287)
at org.apache.hadoop.mapreduce.Job.waitForCompletion(Job.java:1308)
at org.apache.sqoop.mapreduce.ImportJobBase.doSubmitJob(ImportJobBase.java:196)
at org.apache.sqoop.mapreduce.ImportJobBase.runJob(ImportJobBase.java:169)
at org.apache.sqoop.mapreduce.ImportJobBase.runImport(ImportJobBase.java:266)
at org.apache.sqoop.manager.SqlManager.importTable(SqlManager.java:673)
at org.apache.sqoop.tool.ImportTool.importTable(ImportTool.java:497)
at org.apache.sqoop.tool.ImportTool.run(ImportTool.java:605)
at org.apache.sqoop.Sqoop.run(Sqoop.java:148)
at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:76)
at org.apache.sqoop.Sqoop.runSqoop(Sqoop.java:184)
at org.apache.sqoop.Sqoop.runTool(Sqoop.java:226)
at org.apache.sqoop.Sqoop.runTool(Sqoop.java:235)
at org.apache.sqoop.Sqoop.main(Sqoop.java:244)

[Tue Oct 25 08:11:40 UTC 2016] [BDP_ERROR]  404: Failed to run sqoop import for Database oxigen and Table twitter
[Tue Oct 25 08:11:40 UTC 2016] [BDP_INFO]  SqoopJob details updated for Database oxigen and Table twitter
[Tue Oct 25 08:11:40 UTC 2016] [BDP_DEBUG]  < triggerSqoopJob EXIT
[Tue Oct 25 08:11:40 UTC 2016] [BDP_DEBUG]  > triggerHiveJob ENTRY
[Tue Oct 25 08:11:40 UTC 2016] [BDP_DEBUG]  Triggering Hive Job
WARNING: Use "yarn jar" to launch YARN applications.

Logging initialized using configuration in file:/etc/hive/2.4.2.4-5/0/hive-log4j.properties
Exception in thread "main" java.lang.RuntimeException: org.apache.hadoop.ipc.RemoteException(org.apache.hadoop.ipc.RetriableException): org.apache.hadoop.hdfs.server.namenode.SafeModeException: Cannot create directory /tmp/hive/admin123/4bd585e6-03c5-4f26-8da0-4b54da07b738. Name node is in safe mode.
The reported blocks 0 needs additional 22 blocks to reach the threshold 0.9900 of total blocks 22.
The number of live datanodes 0 has reached the minimum number 0. Safe mode will be turned off automatically once the thresholds have been reached.
at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.checkNameNodeSafeMode(FSNamesystem.java:1331)
at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.mkdirs(FSNamesystem.java:3971)
at org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer.mkdirs(NameNodeRpcServer.java:1081)
at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolServerSideTranslatorPB.mkdirs(ClientNamenodeProtocolServerSideTranslatorPB.java:630)
at org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos$ClientNamenodeProtocol$2.callBlockingMethod(ClientNamenodeProtocolProtos.java)
at org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:616)
at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:969)
at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:2206)
at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:2202)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:415)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1709)
at org.apache.hadoop.ipc.Server$Handler.run(Server.java:2200)
Caused by: org.apache.hadoop.hdfs.server.namenode.SafeModeException: Cannot create directory /tmp/hive/admin123/4bd585e6-03c5-4f26-8da0-4b54da07b738. Name node is in safe mode.
The reported blocks 0 needs additional 22 blocks to reach the threshold 0.9900 of total blocks 22.
The number of live datanodes 0 has reached the minimum number 0. Safe mode will be turned off automatically once the thresholds have been reached.
at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.checkNameNodeSafeMode(FSNamesystem.java:1327)
... 12 more

at org.apache.hadoop.hive.ql.session.SessionState.start(SessionState.java:503)
at org.apache.hadoop.hive.cli.CliDriver.run(CliDriver.java:680)
at org.apache.hadoop.hive.cli.CliDriver.main(CliDriver.java:624)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at org.apache.hadoop.util.RunJar.run(RunJar.java:221)
at org.apache.hadoop.util.RunJar.main(RunJar.java:136)
Caused by: org.apache.hadoop.ipc.RemoteException(org.apache.hadoop.ipc.RetriableException): org.apache.hadoop.hdfs.server.namenode.SafeModeException: Cannot create directory /tmp/hive/admin123/4bd585e6-03c5-4f26-8da0-4b54da07b738. Name node is in safe mode.
The reported blocks 0 needs additional 22 blocks to reach the threshold 0.9900 of total blocks 22.
The number of live datanodes 0 has reached the minimum number 0. Safe mode will be turned off automatically once the thresholds have been reached.
at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.checkNameNodeSafeMode(FSNamesystem.java:1331)
at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.mkdirs(FSNamesystem.java:3971)
at org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer.mkdirs(NameNodeRpcServer.java:1081)
at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolServerSideTranslatorPB.mkdirs(ClientNamenodeProtocolServerSideTranslatorPB.java:630)
at org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos$ClientNamenodeProtocol$2.callBlockingMethod(ClientNamenodeProtocolProtos.java)
at org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:616)
at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:969)
at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:2206)
at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:2202)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:415)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1709)
at org.apache.hadoop.ipc.Server$Handler.run(Server.java:2200)
Caused by: org.apache.hadoop.hdfs.server.namenode.SafeModeException: Cannot create directory /tmp/hive/admin123/4bd585e6-03c5-4f26-8da0-4b54da07b738. Name node is in safe mode.
The reported blocks 0 needs additional 22 blocks to reach the threshold 0.9900 of total blocks 22.
The number of live datanodes 0 has reached the minimum number 0. Safe mode will be turned off automatically once the thresholds have been reached.
at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.checkNameNodeSafeMode(FSNamesystem.java:1327)
... 12 more

at org.apache.hadoop.ipc.Client.call(Client.java:1426)
at org.apache.hadoop.ipc.Client.call(Client.java:1363)
at org.apache.hadoop.ipc.ProtobufRpcEngine$Invoker.invoke(ProtobufRpcEngine.java:229)
at com.sun.proxy.$Proxy14.mkdirs(Unknown Source)
at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolTranslatorPB.mkdirs(ClientNamenodeProtocolTranslatorPB.java:560)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at org.apache.hadoop.io.retry.RetryInvocationHandler.invokeMethod(RetryInvocationHandler.java:256)
at org.apache.hadoop.io.retry.RetryInvocationHandler.invoke(RetryInvocationHandler.java:104)
at com.sun.proxy.$Proxy15.mkdirs(Unknown Source)
at org.apache.hadoop.hdfs.DFSClient.primitiveMkdir(DFSClient.java:3064)
at org.apache.hadoop.hdfs.DFSClient.mkdirs(DFSClient.java:3034)
at org.apache.hadoop.hdfs.DistributedFileSystem$23.doCall(DistributedFileSystem.java:1105)
at org.apache.hadoop.hdfs.DistributedFileSystem$23.doCall(DistributedFileSystem.java:1101)
at org.apache.hadoop.fs.FileSystemLinkResolver.resolve(FileSystemLinkResolver.java:81)
at org.apache.hadoop.hdfs.DistributedFileSystem.mkdirsInternal(DistributedFileSystem.java:1101)
at org.apache.hadoop.hdfs.DistributedFileSystem.mkdirs(DistributedFileSystem.java:1094)
at org.apache.hadoop.hive.ql.session.SessionState.createPath(SessionState.java:631)
at org.apache.hadoop.hive.ql.session.SessionState.createSessionDirs(SessionState.java:556)
at org.apache.hadoop.hive.ql.session.SessionState.start(SessionState.java:489)
... 8 more
[Tue Oct 25 08:13:43 UTC 2016] [BDP_ERROR]  402: Failed to run Hive job for Database testdb and Table stage_testtable26
[Tue Oct 25 08:13:43 UTC 2016] [BDP_INFO]  HiveJob details updated for Database testdb and Table stage_testtable26
[Tue Oct 25 08:13:43 UTC 2016] [BDP_DEBUG]  < triggerHiveJob EXIT
[Tue Oct 25 08:13:43 UTC 2016] [BDP_DEBUG]  < testlogger SCRIPTEXIT

Monday 26 September 2016

Install NiFi in Spark HDInsight Cluster

sudo mkdir tars

cd tars/

sudo wget http://public-repo-1.hortonworks.com/HDF/ubuntu14/1.x/updates/1.2.0.1/HDF-1.2.0.1-1.tar.gz

sudo chmod -R 777 /home/admin123/tars/HDF-1.2.0.1-1.tar.gz

sudo tar -xzvf /home/admin123/tars/HDF-1.2.0.1-1.tar.gz

sudo chmod -R 777 /home/admin123/tars/HDF-1.2.0.1-1

sudo vi /home/admin123/tars/HDF-1.2.0.1-1/nifi/conf/nifi.properties
   set nifi.web.http.port=8095 (web properties section)
   set the sensitive properties key (security section)
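
If you prefer to script this edit instead of using vi, a minimal sed sketch (nifi.web.http.port is the standard property name; verify it in your HDF version before relying on this):

sudo sed -i 's/^nifi.web.http.port=.*/nifi.web.http.port=8095/' /home/admin123/tars/HDF-1.2.0.1-1/nifi/conf/nifi.properties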

cd /home/admin123/tars/HDF-1.2.0.1-1/nifi/bin/

sudo ./nifi.sh install

sudo service nifi status

sudo service nifi start

hostname -f

Copy the url
http://hn0-linuxh.xxxxxxxxxxxxxxxxxymrxnfih0c.ix.internal.cloudapp.net:8095/nifi/

Thursday 18 August 2016

Change Timezone to IST in CENTOS

sudo mv /etc/localtime /etc/localtime.bak
sudo ln -s /usr/share/zoneinfo/Asia/Kolkata /etc/localtime

date
Thu Aug 18 12:58:33 IST 2016

Friday 12 August 2016

Hbase Table backup/Restore

These are the steps for backup/restore:


1. Export the table to an HDFS directory:


hbase org.apache.hadoop.hbase.mapreduce.Export \
   <tablename> <outputdir> [<versions> [<starttime> [<endtime>]]]
   
Example:

hbase org.apache.hadoop.hbase.mapreduce.Export  test1  /hbase_backup/test1

Note: To import into a different cluster (or if the table does not yet exist in HBase), the table must be created in that cluster before running the import command.
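
For example, a minimal sketch in the HBase shell (assuming the exported table is test1; the column family names, e.g. f1, must match those of the source table):

hbase shell <<EOF
create 'test1','f1'
EOF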



2. Import/restore data into the HBase cluster from HDFS

Once the table exists in the HBase cluster, we can import (restore) the data.

Command:

hbase org.apache.hadoop.hbase.mapreduce.Import <tablename> <inputdir>

Example:

hbase org.apache.hadoop.hbase.mapreduce.Import test1 /hbase_backup/test1


Wednesday 10 August 2016

Submit Spark job on yarn cluster

 Error: Could not find or load main class org.apache.spark.deploy.yarn.ApplicationMaster
Solution : 


If you are using Spark with HDP/HDInsight, do the following:
  1. Add these entries to $SPARK_HOME/conf/spark-defaults.conf:
    spark.driver.extraJavaOptions -Dhdp.version=2.2.9.1-19 (your installed HDP version)
    spark.yarn.am.extraJavaOptions -Dhdp.version=2.2.9.1-19 (your installed HDP version)
  2. Create a java-opts file in $SPARK_HOME/conf containing the installed HDP version, like
-Dhdp.version=2.2.9.1-19 (your installed HDP version)
To find the HDP version, run hdp-select status hadoop-client on the cluster (see the sketch below).
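
A minimal sketch of both steps (the awk parsing of the hdp-select output is an assumption; confirm the version string it prints on your cluster):

HDP_VERSION=$(hdp-select status hadoop-client | awk '{print $NF}')
echo "-Dhdp.version=${HDP_VERSION}" > "$SPARK_HOME/conf/java-opts"
echo "spark.driver.extraJavaOptions -Dhdp.version=${HDP_VERSION}" >> "$SPARK_HOME/conf/spark-defaults.conf"
echo "spark.yarn.am.extraJavaOptions -Dhdp.version=${HDP_VERSION}" >> "$SPARK_HOME/conf/spark-defaults.conf"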

Example command :
spark-submit --class org.apache.spark.examples.SparkPi \
    --master yarn \
    --deploy-mode cluster \
    --driver-memory 1g \
    --executor-memory 2g \
    --executor-cores 1 \
    --queue default \
    /usr/hdp/current/spark/lib/spark-examples*.jar \
    10

Error: jar changed on src filesystem (Spark on YARN, cluster mode)

If you are getting this error, it means you are uploading assembly jars.
Solution: In yarn-cluster mode, spark-submit automatically uploads the assembly jar to a distributed cache that all executor containers read from, so there is no need to manually copy the assembly jar to all nodes (or pass it through --jars).
It seems there are two versions of the same jar in your HDFS. Try removing all old jars from your .sparkStaging directory and try again; it should work.
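
A minimal cleanup sketch (the staging directory normally sits under the submitting user's HDFS home directory; adjust the path if your layout differs):

hadoop fs -rm -r -skipTrash /user/$USER/.sparkStaging/*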

Thursday 21 July 2016

Spark Memory Consumption Optimization

Changing spark.executor.memory and spark.executor.instances will bring down memory consumption.

By default the value for spark.executor.memory is 4608m and spark.executor.instances is 2




When I run Spark-Shell after SSH, below is the memory consumption footprint.






Change spark.executor.memory to 1608m and run spark-shell. Below is the memory consumption footprint.


Now change spark.executor.instances to 1 and run spark-shell. Below is the memory consumption footprint.
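
A minimal sketch of trying both values for a single spark-shell session, without editing the cluster-wide defaults:

spark-shell --conf spark.executor.memory=1608m --conf spark.executor.instances=1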


Install Python Libraries to use in Pyspark (HDINSIGHT SPARK CLUSTER)

Install the libraries using commands below:
cd /usr/bin/anaconda/bin/

export PATH=/usr/bin/anaconda/bin:$PATH

conda update matplotlib

conda install Theano

pip install scikit-neuralnetwork

pip install vaderSentiment

Tuesday 19 July 2016

Get List of Hosts in HDINSIGHT Cluster

sudo apt-get install jq

curl -u admin:PASSWORD -G "https://CLUSTERNAME.azurehdinsight.net/api/v1/clusters/CLUSTERNAME/hosts" | jq '.items[].Hosts.host_name'

OUTPUT:

% Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  2090  100  2090    0     0   2557      0 --:--:-- --:--:-- --:--:--  2555
"hn0-linuxh.bpa0iysj5klejpk0fzlmcicoke.ix.internal.cloudapp.net"
"hn1-linuxh.bpa0iysj5klejpk0fzlmcicoke.ix.internal.cloudapp.net"
"wn1-linuxh.bpa0iysj5klejpk0fzlmcicoke.ix.internal.cloudapp.net"
"zk0-linuxh.bpa0iysj5klejpk0fzlmcicoke.ix.internal.cloudapp.net"
"zk1-linuxh.bpa0iysj5klejpk0fzlmcicoke.ix.internal.cloudapp.net"
"zk6-linuxh.bpa0iysj5klejpk0fzlmcicoke.ix.internal.cloudapp.net"


Parse the output and dump the list to a text file.
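
A minimal sketch of that step (jq -r emits raw strings without quotes, curl -s hides the progress meter; hosts.txt is a hypothetical output file):

curl -s -u admin:PASSWORD -G "https://CLUSTERNAME.azurehdinsight.net/api/v1/clusters/CLUSTERNAME/hosts" | jq -r '.items[].Hosts.host_name' > hosts.txt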

Monday 27 June 2016

Command to find a directory in Linux

sudo find / -name "dir-name-here"

Create Cluster Authentication without Organization ID

Download .ps1 file : https://gallery.technet.microsoft.com/scriptcenter/Self-signed-certificate-5920a7c6

PS C:\> Import-Module -Name F:\New-SelfSignedCertificateEx.ps1
PS C:\> New-SelfSignedCertificateEx -Subject "CN=exampleapp" -KeySpec "Exchange" -FriendlyName "exampleapp"
PS C:\> $cert = Get-ChildItem -Path cert:\CurrentUser\My\* -DnsName exampleapp
PS C:\> $keyValue = [System.Convert]::ToBase64String($cert.GetRawCertData())
PS C:\> Add-AzureRmAccount

PS C:\> $azureAdApplication = New-AzureRmADApplication -DisplayName "exampleapp" -HomePage "https://www.contoso.org" -IdentifierUris "https://www.contoso.org/example" -KeyValue $keyValue -KeyType AsymmetricX509Cert -EndDate $cert.NotAfter -StartDate $cert.NotBefore

PS C:\> $azureAdApplication

Note: ApplicationId           : 934fcbb6-66xxxxxxxxxxxxxxxxxxxxx

PS C:\> New-AzureRmADServicePrincipal -ApplicationId $azureAdApplication.ApplicationId

PS C:\> (Get-AzureRmSubscription).TenantId

Note: TenantId

PS C:\> (Get-ChildItem -Path cert:\CurrentUser\My\* -DnsName exampleapp).Thumbprint

Note: Thumbprint

Add-AzureRmAccount -ServicePrincipal -CertificateThumbprint CFC1DC609A547xxxxxxxxxxxxxxxxxxxxxxxx -ApplicationId 934fcbb6-66xxxxxxxxxxxxxxxxxxxxxxxxxxxx -TenantId edc9c7e2-1bxxxxxxxxxxxxxxxxxxxxxxxxxxx

Next, proceed with the cluster creation script.

Wednesday 15 June 2016

Hbase and Hive Support

*********************************************************************************
CREATE table in hbase using hive:
*********************************************************************************
create table hbase_wallets(Id BIGINT, MDN VARCHAR(20), Password VARCHAR(1000), eWalletId VARCHAR(20), Date TIMESTAMP, TransactionNumber VARCHAR(25), Requester VARCHAR(20), Instrument TINYINT, Status TINYINT, WalletType TINYINT, Opt1 VARCHAR(50), Opt2 VARCHAR(50), Opt3 VARCHAR(50), IsB2B BOOLEAN, UserId VARCHAR(50), Banks INT, WalletOwner INT, Currency TINYINT, INSERT_DT TIMESTAMP, UPDATE_DT TIMESTAMP, ACTIVE_FLAG CHAR(1))
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES ("hbase.columns.mapping" = ":key,cf1:MDN,cf1:Password,cf1:eWalletId,cf1:Date,cf1:TransactionNumber,cf1:Requester,cf1:Instrument,cf1:Status,cf1:WalletType,cf2:Opt1,cf2:Opt2,cf2:Opt3,cf2:IsB2B,cf2:UserId,cf2:Banks,cf2:WalletOwner,cf2:Currency,cf2:INSERT_DT,cf2:UPDATE_DT,cf2:ACTIVE_FLAG")
TBLPROPERTIES ("hbase.table.name" = "hive_wallets");

*********************************************************************************
To move data from hive to hbase:
*********************************************************************************
INSERT OVERWRITE TABLE hbase_wallets SELECT * FROM oxi_consolidated_orc.Wallets;

*********************************************************************************
CREATE table in hbase using hive:
*********************************************************************************
use default;
create EXTERNAL table hbase_actions(Id INT, description VARCHAR(80), insert_dt timestamp, update_dt timestamp, active_flag char(1))
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES ("hbase.columns.mapping" = ":key,cf1:description,cf1:insert_dt,cf1:update_dt,cf1:active_flag")
LOCATION 'wasb://linuxxxxxxhdinsightcluster@oxigxxxxxxcompute.blob.core.windows.net/test/actionshbasetest'
TBLPROPERTIES ("hbase.table.name" = "hbase_actions");

*********************************************************************************
To move data from hive to hbase:
*********************************************************************************
INSERT OVERWRITE TABLE hbase_wallets_test SELECT Id FROM oxi_consolidated.Wallets;

Get CSV data from SFTP and create Hive Table using Python script

Content of Expect Script

#!/usr/bin/expect
cd /home/admin123/SFTP/sftpdata
spawn sftp root@ftp.xxxxx.com
expect "password:"
send "ihCtwxxxxxxxxxxxxq\n"
expect "sftp>"
send "get /root/FRED-GGGDTPBRA188N.csv\n"
expect "sftp>"
send "exit\n"
expect ":~$"
spawn hadoop fs -mkdir -p /ftpdata/testdirdump
expect ":~$"
spawn hadoop fs -copyFromLocal -f /home/admin123/SFTP/sftpdata/FRED-GGGDTPBRA188N.csv /ftpdata/testdirdump
expect ":~$"
spawn python /home/admin123/scripts/GenerateHiveScript.py /home/admin123/SFTP/sftpdata/FRED-GGGDTPBRA188N.csv sftptestdata sftpFRED /ftpdata/testdirdump /home/admin123/SFTP/sftphivequery/hivequery
expect ":~$"
spawn sudo chmod -R 777 /home/admin123/SFTP/sftphivequery/hivequerysftpFRED.hql
expect ":~$"
spawn hive -f /home/admin123/SFTP/sftphivequery/hivequerysftpFRED.hql
expect eof
catch wait result


Contents of GenerateHiveScript.py 

'''
Created on 09-May-2016

@author: Lokesh.Nanda
'''

#import string
import re
#import time
import sys
import csv


source_path = sys.argv[1]
#F:\StockStreamingData\NSEStockData.20160428-131159.csv
database_name = sys.argv[2]
database_name_mod = re.sub('[^a-zA-Z0-9 \n\.]', '_', database_name)
#ftpdata
table_name = sys.argv[3]
table_name_mod = re.sub('[^a-zA-Z0-9 \n\.]', '_', table_name)
#testtable
hdfs_file_loc = sys.argv[4]
#/datapull/stocks
hive_query_path = sys.argv[5]
#F:\StockStreamingData\hivequery

with open(source_path, 'r') as f:
    first_line = f.readline()

# Flag the header row if it contains special characters; generic column names are used in that case.
flag = False
symbol = "~`!@#$%^&*()-+={}[]:>;'</?*-+"
for i in first_line:
    if i in symbol:
        flag = True
        break

# Count the number of fields in the header row.
for item in csv.reader([first_line], skipinitialspace=True):
    count_comma = len(item)

finaldump = "create database if not exists " + database_name_mod + "; use " + database_name_mod + "; create external table " + table_name_mod + "("
if flag:
    # One generic column per field when the header cannot be used for names.
    i = 0
    while i < count_comma:
        finaldump = finaldump + "col" + str(i) + " String,"
        i = i + 1
else:
    columns = first_line.strip().split(',')
    for col in columns:
        finaldump = finaldump + "`" + col + "` String,"
finaldump = finaldump[:-1] + ") ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LOCATION '" + hdfs_file_loc + "';"

with open(hive_query_path + table_name + '.hql', 'w') as the_file:
    the_file.write(finaldump)

HdInsight Cluster Delete Script - PART 4

$resourceGroupName = "xxxxRG"
$clusterName = "linuxhdinsightclusterxxxx"
$azUserName = "xxxx@outlook.com"
$azUserPW = ConvertTo-SecureString -String "xxxxxxxxxx" -AsPlainText -Force
$cred = New-Object System.Management.Automation.PSCredential($azUserName, $azUserPW)
Login-AzureRmAccount -Credential $cred
Invoke-Expression "C:\Cluster\eodscript.ps1"
Remove-AzureRmHDInsightCluster -ResourceGroupName $resourceGroupName -ClusterName $clusterName

HdInsight Cluster Create Script - PART 3

Import-Module "C:\Program Files (x86)\Microsoft SDKs\Azure\PowerShell\ServiceManagement\Azure\Azure.psd1"
$resourceGroupName = "xxxxRG"
$storageAccountName = "saxxxx"
$containerName = "linuxhdinsightclusterxxxx"
$storageAccountKey = "nNoxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx90A=="
$subscriptionName = "xxxxxxxxxxxx"
$storageAccountLocation = "North Central US"
$clusterName = "linuxhdinsightclusterxxxx"
$clusterNodeCount = "8"
$azUserName = "xxxxxxxx@outlook.com"
$azUserPW = ConvertTo-SecureString -String "xxxxxxxxxx" -AsPlainText -Force
$cred = New-Object System.Management.Automation.PSCredential($azUserName, $azUserPW)
Login-AzureRmAccount -Credential $cred
Select-AzureRmSubscription -SubscriptionName $subscriptionName
$blobStorageName = "saxxxx.blob.core.windows.net"
$sshUserName = "admin123"
$sshUserPwd = "Test@45678"
$sshUserPW = ConvertTo-SecureString -String $sshUserPwd -AsPlainText -Force
$sshCredentials = New-Object System.Management.Automation.PSCredential($sshUserName,$sshUserPW)
$computerName = "linuxhdinsightclusterxxxx-ssh.azurehdinsight.net"
$htpUserName = "admin"
$htpUserPwd = "Test@45678"
$htpUserPW = ConvertTo-SecureString -String $htpUserPwd -AsPlainText -Force
$credentials = New-Object System.Management.Automation.PSCredential($htpUserName,$htpUserPW)
$oozieUserName = "rootx"
$oozieUserPwd = "Test@45678"
$oozieUserPW = ConvertTo-SecureString -String $oozieUserPwd -AsPlainText -Force
$oozieCred = New-Object System.Management.Automation.PSCredential($oozieUserName,$oozieUserPW)
$config = New-AzureRmHDInsightClusterConfig
$config = Add-AzureRmHDInsightScriptAction -Config $config -Name "Install Hue"  -NodeType HeadNode  -Parameters "Test@45678" -Uri https://hdiconfigactions.blob.core.windows.net/linuxhueconfigactionv01/install-hue-uber-v01.sh
$config = Add-AzureRmHDInsightScriptAction -Config $config -Name "Install R"  -NodeType HeadNode -Uri https://hdiconfigactions.blob.core.windows.net/linuxrconfigactionv01/r-installer-v01.sh
$config = Add-AzureRmHDInsightScriptAction -Config $config -Name "Install R"  -NodeType WorkerNode -Uri https://hdiconfigactions.blob.core.windows.net/linuxrconfigactionv01/r-installer-v01.sh
$config = Add-AzureRmHDInsightScriptAction -Config $config -Name "Install Solr"  -NodeType HeadNode  -Uri https://hdiconfigactions.blob.core.windows.net/linuxsolrconfigactionv01/solr-installer-v01.sh
$config = Add-AzureRmHDInsightScriptAction -Config $config -Name "Install Solr"  -NodeType WorkerNode -Uri https://hdiconfigactions.blob.core.windows.net/linuxsolrconfigactionv01/solr-installer-v01.sh

New-AzureRmHDInsightCluster -ClusterName $clusterName -config $config -ResourceGroupName $resourceGroupName -HttpCredential $credentials -Location $storageAccountLocation -DefaultStorageAccountName "$storageAccountName.blob.core.windows.net" -DefaultStorageAccountKey $storageAccountKey -DefaultStorageContainer $containerName  -ClusterSizeInNodes $clusterNodeCount -ClusterType "Spark" -OSType "Linux" -Version "3.4" -SshCredential $sshCredentials
Invoke-Expression "C:\Cluster\bootup.ps1"

HdInsight Cluster Backup Strategy - PART 2 eod.ps1

$servername = "xxxxxxxxxxxxxx-ssh.azurehdinsight.net"
$username = "admin123"
$password = "Test@45678"
$port = 22


$command1 = "hadoop fs -rm -R wasb://container-name@storageacc-name.blob.core.windows.net/backup/*"
$command2 = "hadoop fs -copyFromLocal /home/admin123/* wasb://container-namebackup@storageacc-name.blob.core.windows.net/backup/ "
$commandname = @($command1,$command2)

Import-Module SSH-Sessions
New-SshSession -ComputerName $servername -Username $username -Password $password -Port $port

foreach ($c in $commandname) {
$SshResults = Invoke-SshCommand -InvokeOnAll -Command $c
}
Write-Host "Done!!!"

Remove-SshSession -RemoveAll

HdInsight Cluster Backup Strategy - PART 1 bootup.ps1

Create a PowerShell Script as shown below:

$servername = "xxxxxxxx-ssh.azurehdinsight.net"
$username = "admin123"
$password = "Test@45678"
$port = 22
$command2 = "hadoop fs -copyToLocal wasb://container-name@storage-name.blob.core.windows.net/backup/* /home/admin123/"
$command3 = "sudo chmod -R 777 /home/admin123/*"
$command4 = "hadoop fs -rm -R wasb://conatinerarchive@storage-name.blob.core.windows.net/archive/*"
$command5 = "hadoop fs -copyFromLocal /home/admin123/* wasb://conatinerarchive@storage-name.blob.core.windows.net/archive/ "
$command6 = "/home/admin123/scripts/executehivebootup.sh"
$command7 = "/home/admin123/HueScripts/hueurl.sh"
$command8 = "sudo /home/admin123/scripts/add_configuration.sh"
$command9 = "sudo apt-get install expect"
$command10 = "/home/admin123/scripts/SFTPrun.sh"

$commandname = @($command2,$command3,$command4,$command5)

Import-Module SSH-Sessions
New-SshSession -ComputerName $servername -Username $username -Password $password -Port $port

Write-Host "Started bootup data pull..!!!"

foreach ($c in $commandname) {
$SshResults = Invoke-SshCommand -InvokeOnAll -Command $c
}
Write-Host "Step 1 : Done With Copying..!!!"

Write-Host "Step 2 : Creating Schemas..."

$SshResults2 = Invoke-SshCommand -InvokeOnAll -Command $command6

Write-Host "Step 3 : Generating Hue Link...."

$SshResults3 = Invoke-SshCommand -InvokeOnAll -Command $command7

Write-Host "Step 4 : Completed...."

$SshResults4 = Invoke-SshCommand -InvokeOnAll -Command $command8

Write-Host "Step 5 : Completed core site changes..."

$SshResults5 = Invoke-SshCommand -InvokeOnAll -Command $command9

Write-Host "Step 6 : Installed dependent libraries..."

$SshResults5 = Invoke-SshCommand -InvokeOnAll -Command $command10

Write-Host "Step 7 : Installed  SFTP dependencies..."

Remove-SshSession -RemoveAll


***************************************
Contents of shell scripts used:
***************************************

executehivebootup.sh

oxi_array=(`find /home/admin123/HiveStartup/ -type f -follow` )
for element in "${oxi_array[@]}"
do
  hive -f "${element}"
done

Content of HiveStartup
Create a folder HiveStartup and add a file xxxdata.hql
xxxdata.hql:
CREATE DATABASE IF NOT EXISTS xxxdata;
USE xxxdata;
CREATE EXTERNAL TABLE IF NOT EXISTS xxxdata( Bill_account string,Reading_type string, Reading_datetime Timestamp,Reading_value string ,`Interval` string )
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '|'
LOCATION 'wasb://container-name@storageacc-name.blob.core.windows.net/xxxdata/';


hueurl.sh

hostname -f> /home/admin123/HueScripts/hueurl.properties
sudo sed -i s/hn0/huelink=hn0/ /home/admin123/HueScripts/hueurl.properties

Content of hueurl.properties

huelink=hn0-linuxxxxxxxxxxxxxxxjqd33z3g.ex.internal.cloudapp.net


add_configuration.sh

chmod -R 777 /usr/hdp/2.2.*/hadoop/lib
cp /home/admin123/jar/com.mysql.jdbc_5.1.5.jar /usr/hdp/2.2.*/hadoop/lib/
cp /etc/hadoop/conf/core-site.xml /home/admin123/scripts/
awk '/<configuration>/ {print;print "<property>\n<name>fs.azure.account.key.storageaccname.blob.core.windows.net</name>\n<value>Fajj9v2xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxetg==</value>\n</property>\n<property>\n<name>fs.azure.account.key.storageaccname.blob.core.windows.net</name>\n<value>vNcoxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxrUw==</value>\n</property>\n<property>\n<name>fs.azure.account.key.storageaccname.blob.core.windows.net</name>\n<value>PUWxxxxxxxxxxxxxxxxxxHq5iQ==</value>\n</property>"; next }1' /home/admin123/scripts/core-site.xml > /etc/hadoop/conf/core-site.xml


SFTPrun.sh

sftp_array=(`find /home/admin123/SFTP/sftpfirsttime/ -type f -follow` )
for element in "${sftp_array[@]}"
do
  "${element}"
done

Contents of sftpfirsttime: ftp_test_com.sh

#!/usr/bin/expect

spawn sftp demo@test.rebex.net
expect "yes/no)?"
send "yes\n"
expect "password:"
send "password\n"
expect "sftp>"
send "exit\n"
interact


Partition Table after restart of HDInsight Cluster

CREATE EXTERNAL TABLE amerendata_partitioned (Bill_account string,Reading_type string, Reading_datetime Timestamp,Reading_value string ,`Interval` string)
PARTITIONED BY (Reading_month String)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'
STORED AS ORC Location '/amerenORCpartitioned/';
ALTER TABLE amerendata_partitioned ADD PARTITION(Reading_month='1');
ALTER TABLE amerendata_partitioned ADD PARTITION(Reading_month='2');
ALTER TABLE amerendata_partitioned ADD PARTITION(Reading_month='3');
ALTER TABLE amerendata_partitioned ADD PARTITION(Reading_month='4');
ALTER TABLE amerendata_partitioned ADD PARTITION(Reading_month='5');
ALTER TABLE amerendata_partitioned ADD PARTITION(Reading_month='6');
ALTER TABLE amerendata_partitioned ADD PARTITION(Reading_month='7');
ALTER TABLE amerendata_partitioned ADD PARTITION(Reading_month='8');
ALTER TABLE amerendata_partitioned ADD PARTITION(Reading_month='9');
ALTER TABLE amerendata_partitioned ADD PARTITION(Reading_month='10');
ALTER TABLE amerendata_partitioned ADD PARTITION(Reading_month='11');
ALTER TABLE amerendata_partitioned ADD PARTITION(Reading_month='12');
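
The twelve ALTER statements above can also be generated in a loop and run in one Hive session; a minimal sketch:

stmts=""
for m in $(seq 1 12); do
  stmts="${stmts} ALTER TABLE amerendata_partitioned ADD IF NOT EXISTS PARTITION(Reading_month='${m}');"
done
hive -e "${stmts}"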

Sunday 6 March 2016

Run shell script at start of System

This can be accomplished using cron.

crontab -e
then select an editor (vim, gedit, nano)

Add the following line in the crontab editor:

@reboot  /path/to/script

Then save the crontab. Now each time your system boots up, this script will be invoked.

Thursday 3 March 2016

Create Table in HBase using Linux Script



#!/bin/bash

TABLE="sample"

exec hbase shell <<EOF
     create "${TABLE}",'f1'
EOF

Save this script as a .sh file and run it. It will create the table in HBase.

Thursday 18 February 2016

Create Identity column in Hive for dump in Hbase


  • We need to write a UDF for this.

package org.apache.hadoop.hive.contrib.udf;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.hive.ql.udf.UDFType;
import org.apache.hadoop.io.LongWritable;

@Description(name = "row_sequence",
    value = "_FUNC_() - Returns a generated row sequence number starting from 1")
@UDFType(deterministic = false, stateful = true)

public class UDFRowSequence extends UDF
{
  private LongWritable result = new LongWritable();

  public UDFRowSequence() {
    result.set(0);
  }

  public LongWritable evaluate() {
    result.set(result.get() + 1);
    return result;
  }
}


  • Create a jar for this.
  • Create a Hive managed Hbase table with id as column for auto increment.

create table ga_tblagetest(id String,userAgeBracket String, date String,sessions String,percentNewSessions String,newUsers String,bounceRate String,pageviewsPerSession String,avgSessionDuration String)
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES ("hbase.columns.mapping" = ":key,cf1:userAgeBracket,cf1:date,cf1:sessions,cf1:percentNewSessions,cf1:newUsers,cf1:bounceRate,cf1:pageviewsPerSession,cf1:avgSessionDuration")
TBLPROPERTIES ("hbase.table.name" = "ga_tblagetest");


  • In hive, add the jar : ADD JAR /path/customhiveudf.jar
  • create temporary function incrementrow as 'org.apache.hadoop.hive.contrib.udf.UDFRowSequence';
  • CREATE TEMPORARY TABLE default.tmp2 AS SELECT incrementrow(),* FROM socialmedia.tblage;
  • INSERT OVERWRITE TABLE ga_tblagetest SELECT * FROM default.tmp2;
Data with row key can be queried in Hbase.
  • scan 'ga_tblagetest'

Friday 12 February 2016

Script for getting count and name of table from Hive in a text file

OUTPUT="$(hive -e 'use oxi_consolidated; show tables;')"
hive -e "select count(*) from ${OUTPUT}" > countoxi.txt

Execute all .hql files inside a directory:

oxi_array=(`find /home/oxigen/HiveStartup/ -type f -follow` )
for element in "${oxi_array[@]}"
do
  hive -f "${element}"
done

Sunday 7 February 2016

Connect Linux HDInsight cluster with PowerBi/Tableau


  1. Download and install the Microsoft Hive ODBC Driver: https://www.microsoft.com/en-us/download/confirmation.aspx?id=40886
  2. Open the ODBC Data Source Administrator (configuration screenshots omitted).
  3. Connected!



Thursday 4 February 2016

Shell Script for automating GA

todaysdate=`date +"%Y-%m-%d"`
path="/home/brillio/GoogleAnalytics/CSVData/"
dir="/home/brillio/GoogleAnalytics"
jarname="googleanalytics"
hdfspath="/socialmedia/googleanalytics/"
if test -d "$path"; then
    sudo rm -r "$path"*
else
    echo "FileNotExist"
fi

java -jar "$jarname.jar"
sudo chmod -R 777 "$dir"

tablename=( "tblAge" "tblBehaviourAndNetwork" "tblGender" "tblGeo" "tblTechnologyBrowser" "tblTechnologyOperatingSystem" "tblLanguage" )
for table in "${tablename[@]}"
do
    if hadoop fs -test -f "$hdfspath$table/${table}_$todaysdate.csv"; then
        hadoop fs -rm -r "$hdfspath$table/${table}_$todaysdate.csv"
    else
        echo "FileNotExist"
    fi
    hadoop fs -copyFromLocal "$path${table}"_* "$hdfspath$table/"
done

Tuesday 2 February 2016

Monitor Spark Processes (List spark processes running)

If you are running Spark over a YARN cluster, there is a way to list the running Spark processes:

yarn application -list

Spark Streaming Process

To kill a running process :

 yarn application -kill application_id


Copy Startup Files from Azure Blob to Local everytime the cluster is created Using Powershell

$servername = "xxxxxxxxxxxxxxxxxxxxx"
$username = "xxxxxxxxx"
$password = "xxxxxxxxxxxxx"
$port = 22
$foldername = @("HueScripts","completereload")
Import-Module SSH-Sessions
New-SshSession -ComputerName $servername -Username $username -Password $password -Port $port

foreach ($f in $foldername) {
   $command1 = "test -d `"/home/brillio/$f`" && sudo rm -R /home/brillio/$f || echo Does Not Exist Proceeding..."
   $SshResults1 = Invoke-SshCommand -InvokeOnAll -Command $command1
   $command2 = "sudo mkdir /home/brillio/$f"
   $SshResults2 = Invoke-SshCommand -InvokeOnAll -Command $command2
   $command3 = "sudo chmod -R 777 /home/brillio/$f"
   $SshResults3 = Invoke-SshCommand -InvokeOnAll -Command $command3
   $command4 = "hadoop fs -copyToLocal wasb://xxxxxxx@xxxxxxxxxxx/$f/* /home/brillio/$f/"
   $SshResults4 = Invoke-SshCommand -InvokeOnAll -Command $command4
   $command5 = "sudo chmod -R 777 /home/brillio/$f"
   $SshResults5 = Invoke-SshCommand -InvokeOnAll -Command $command5
   Write-Host "Done $f"
 }

 $command6 = "/home/brillio/hueurl.sh"
 $SshResults6 = Invoke-SshCommand -InvokeOnAll -Command $command6

Remove-SshSession -RemoveAll

How to pass Parameter in HIVE hql



hive -e 'create database ${db}' --define db='twitter_stream'


OR

If you are calling the Hive HQL from a shell script:

hive -e 'create database ${db}' --define db=$1
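
The same idea works with a script file; a minimal sketch, where create_db.hql is a hypothetical file containing the single line CREATE DATABASE IF NOT EXISTS ${db};

hive --hivevar db=twitter_stream -f create_db.hql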

Monday 1 February 2016

Catch Logs In Sqoop


To output only in log file and not in UI:

/home/brillio/sqoop-jobs/test.sh > /home/brillio/logsqoop.txt 2>&1

To output in UI and log file:

/home/brillio/sqoop-jobs/test.sh 2>&1 | tee /home/brillio/logsqoop.txt

Friday 22 January 2016

Cannot change directory in Windows cmd

cd /d F:\Python3\Scripts

This should work ! :)

NUMPY installation Error using PIP:

https://www.microsoft.com/en-us/download/confirmation.aspx?id=44266
http://www.lfd.uci.edu/~gohlke/pythonlibs/#numpy


Wednesday 13 January 2016

Delete Script HDInsight Cluster

Login-AzureRmAccount
Import-AzurePublishSettingsFile "F:\Microsoft Azure Sponsorship-12-16-2015-credentials (2).publishsettings"
Get-AzureRmSubscription  # list your subscriptions and get your subscription ID
$subscriptionName="Microsoft Azure Sponsorship"
#Select-AzureRmSubscription -SubscriptionId "<Your Azure Subscription ID>"
Select-AzureRmSubscription -SubscriptionName $subscriptionName

$resourceGroupName = "hdinsight_southeastasia"

$storageAccountName = "XXXXXXXXXX"
$containerName = "XXXXXXXXXXX"

$clusterName = "XXXXXXXXXXXXX"
$location = "Southeast Asia"
$clusterNodes = "4"
$clusterType="HBase"
$ClusterVersion="3.2.7.757"

# Get the Storage account key
#$storageAccountKey = Get-AzureRmStorageAccountKey -ResourceGroupName $resourceGroupName -Name $storageAccountName | %{ $_.Key1 }
echo "Authentication Done ..!!!"
echo "Cluster deletion in Progress"
# Delete the HDInsight cluster
Remove-AzureRmHDInsightCluster -ClusterName $clusterName
echo "Cluster deleted"

Create script HDInsight Cluster

Login-AzureRmAccount
Import-AzurePublishSettingsFile "F:\Microsoft Azure Sponsorship-12-16-2015-credentials (2).publishsettings"
Get-AzureRmSubscription  # list your subscriptions and get your subscription ID
$subscriptionName="Microsoft Azure Sponsorship"
#Select-AzureRmSubscription -SubscriptionId "<Your Azure Subscription ID>"
Select-AzureRmSubscription -SubscriptionName $subscriptionName

$resourceGroupName = "hdinsight_southeastasia"

$storageAccountName = "XXXXXXXXXXXXXX"
$containerName = "XXXXXXXXXXXXX"

$clusterName = "XXXXXXXXXXXXX"
$location = "Southeast Asia"
$clusterNodes = "4"
$clusterType="HBase"
$ClusterVersion="3.2.7.757"

# Get the Storage account key
$storageAccountKey = Get-AzureRmStorageAccountKey -ResourceGroupName $resourceGroupName -Name $storageAccountName | %{ $_.Key1 }
echo "Authentication Done ..!!!"
echo "Cluster creation in Progress"
# Create a new HDInsight cluster
New-AzureRmHDInsightCluster -ResourceGroupName $resourceGroupName `
    -ClusterName $clusterName `
    -Location $location `
    -DefaultStorageAccountName "$storageAccountName.blob.core.windows.net" `
    -DefaultStorageAccountKey $storageAccountKey `
    -DefaultStorageContainer $containerName  `
    -ClusterSizeInNodes $clusterNodes `
-ClusterType $clusterType `
    -Version $ClusterVersion

echo "Cluster Created.."

Sunday 10 January 2016

Contract Between HashCode and equals() Method in Java

Contract between hashCode() and equals():

If two objects are considered equal by the equals() method, then they must have identical hashCode() values.
So if you override equals(), override hashCode() as well.

Saturday 9 January 2016

How to transfer data from Hadoop cluster to Azure Blob

Add the following property to core-site.xml:

<property>
    <name>fs.azure.account.key.<storage account name>.blob.core.windows.net</name>
    <value><account key value></value>
</property>

and make the blob container accessible in the Azure Portal.

Now we can transfer data from the Hadoop cluster to the Azure blob container:

hadoop fs -copyFromLocal <local path> <azure blob path>

For transferring HDFS data to an Azure blob:

hadoop distcp hdfs://hostname:8020/filename <azure blob path>

The Azure blob path has the form:

wasb://<containername>@<storageAccountName>.blob.core.windows.net
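
For example, a minimal sketch with hypothetical names (mycontainer, myaccount, namenode-host):

hadoop fs -copyFromLocal /tmp/sample.csv wasb://mycontainer@myaccount.blob.core.windows.net/data/
hadoop distcp hdfs://namenode-host:8020/data wasb://mycontainer@myaccount.blob.core.windows.net/data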