Tuesday, 15 May 2018

Simple threaded execution to download JSON data

import json
import time
from datetime import datetime
from multiprocessing.pool import ThreadPool

import requests
# Turn off urllib3's warnings for the whole process; getComment() below sends
# its requests with verify=False.
# NOTE(review): presumably this targets the InsecureRequestWarning emitted for
# unverified HTTPS requests -- confirm that skipping verification is intended.
requests.packages.urllib3.disable_warnings() 
# File locations and credentials used by the download script.
#
# The paths are raw strings: the earlier literals contained '\\\i', i.e. an
# escaped backslash followed by the invalid escape sequence '\i'. Python warns
# about invalid escapes, and the result was a doubled separator in the path.

# Master list of comment ids to download, one id per line.
#comment_list_path = r'C:\Users\lnanda\Desktop\Lokesh\inmoment\inmoment_main\inmoment_weekly_surveyids\commentlist'
comment_list_path = r'C:\Users\lnanda\Desktop\Lokesh\inmoment\inmoment_main\inmoment_weekly_surveyids\commentlist_nov2017'

# Ids that have already been downloaded; update_comment_delta() appends to it.
commentidsdeltapath = r'C:\Users\lnanda\Desktop\Lokesh\inmoment\inmoment_main\inmoment_weekly_surveyids\commentdeltalist_nov2017'
#commentidsdeltapath = r'C:\Users\lnanda\Desktop\Lokesh\inmoment\inmoment_main\inmoment_weekly_surveyids\commentdeltalist'

# Output file; writeComment() appends one JSON document per line.
#raw_comment_path = r'C:\Users\lnanda\Desktop\Lokesh\inmoment\inmoment_main\inmoment_weekly_data\comment_2017_2018'
raw_comment_path = r'C:\Users\lnanda\Desktop\Lokesh\inmoment\inmoment_main\inmoment_weekly_data\comment_nov2017'

# Token sent by getComment() in the 'Authorization: OAuth <token>' header.
# NOTE(review): the value looks like a placeholder -- supply the real token
# at deployment time rather than committing it.
access_token = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxx'

def read_comment_ids(comment_list_path):
    """Return the comment ids stored in a text file, one id per line.

    Args:
        comment_list_path: Path of the file to read.

    Returns:
        A list with one string per line of the file, line endings removed.
        An empty file yields an empty list.

    Raises:
        OSError: If the file cannot be opened. The script relies on this to
            detect a missing delta file on the first load.
    """
    # The context manager closes the handle as soon as the read finishes;
    # the previous bare open() never closed it explicitly.
    with open(comment_list_path) as id_file:
        return id_file.read().splitlines()
    
def get_updated_comment_list(comment_ids_list,comment_ids_delta_list):
    """Return the ids that still need to be downloaded.

    Args:
        comment_ids_list: Iterable of all comment ids (hashable values).
        comment_ids_delta_list: Iterable of ids that are already done.

    Returns:
        A list of the ids in ``comment_ids_list`` that do not occur in
        ``comment_ids_delta_list``. Each id appears once, in the order of its
        first occurrence in ``comment_ids_list``; the earlier set-difference
        version returned the same ids in arbitrary order. Neither input is
        modified.
    """
    # Seeding the "seen" set with the finished ids lets one membership test
    # cover both "already downloaded" and "duplicate in the input".
    seen = set(comment_ids_delta_list)
    pending = []
    for comment_id in comment_ids_list:
        if comment_id not in seen:
            seen.add(comment_id)
            pending.append(comment_id)
    return pending

def update_comment_delta(commentidsdeltapath, commentId):
    """Record a finished comment id by appending it to the delta file.

    Args:
        commentidsdeltapath: Path of the file to append to; it is opened in
            append mode, so it is created when it does not exist yet.
        commentId: The id to record; written as ``str(commentId)`` on a line
            of its own.
    """
    with open(commentidsdeltapath, "a") as delta_file:
        delta_file.write(str(commentId) + '\n')

def getDateTime():
    """Return the current local date and time as a string without spaces.

    The value is ``str(datetime.now())`` with the space between the date and
    the time removed.
    """
    pieces = str(datetime.now()).split(' ')
    return ''.join(pieces)

def writeComment(text, commentId):
    """Persist one downloaded comment and mark its id as done.

    Appends ``text`` as a single JSON document followed by a newline to the
    file at the module-level ``raw_comment_path``, then records ``commentId``
    in the delta file through ``update_comment_delta``.

    Args:
        text: JSON-serialisable payload of the comment.
        commentId: Id of the comment that was just written.
    """
    with open(raw_comment_path, "a") as comment_file:
        json.dump(text, comment_file)
        comment_file.write("\n")
    # The id is recorded only after the payload has been written and the
    # output file closed.
    update_comment_delta(commentidsdeltapath, commentId)
            
def getComment(commentId):
    """Fetch one comment from the REST API.

    Sends a GET request for ``commentId`` with the module-level
    ``access_token`` in an ``Authorization: OAuth <token>`` header. If the
    request raises, the error is printed and the request is retried once
    after a 30 second pause; an exception from the retry propagates to the
    caller.

    Args:
        commentId: Id of the comment; appended to the endpoint URL via str().

    Returns:
        A ``(payload, commentId)`` tuple, where ``payload`` is the decoded
        JSON body, when the response status is 200. ``None`` for any other
        status, so callers must check for ``None`` before unpacking.
    """
    headers = {
        'content-type': 'application/json',
        'Authorization': 'OAuth '+access_token,
    }
    get_comment = "https://www.xxxxxxxxxxxxt.com/api/rest/1.0/comment/"+str(commentId)
    # NOTE(review): verify=False skips TLS certificate verification -- confirm
    # this is intended for this endpoint.
    try:
        r = requests.get(get_comment, headers=headers, verify=False)
    except Exception as err:
        print(str(err))
        # Announce the pause before sleeping; previously the message only
        # appeared after the 30 seconds had already passed.
        print('Sleeping for 30 sec..........')
        time.sleep(30)
        r = requests.get(get_comment, headers=headers, verify=False)
    if r.status_code != 200:
        # Make the failure visible and the None return explicit instead of
        # silently falling off the end of the function.
        print('Comment ' + str(commentId) + ' returned HTTP status ' + str(r.status_code))
        return None
    return json.loads(r.text), commentId
            
# Step 1: load the master list of comment ids.
try:
    comment_ids_list = read_comment_ids(comment_list_path)
except Exception:
    print('There is an error reading from ' + str(comment_list_path))
    # Nothing can be downloaded without the master list. Re-raise so the run
    # stops here with the real cause instead of failing later with a
    # NameError on comment_ids_list.
    raise

# Step 2: load the ids that earlier runs already downloaded. A failure here
# is treated as the first load, i.e. nothing has been downloaded yet.
try:
    comment_ids_delta_list = read_comment_ids(commentidsdeltapath)
except Exception:
    print('There is an error reading from ' + str(commentidsdeltapath)+ ' This is first load')
    comment_ids_delta_list = []

# Step 3: download every id that is not in the delta file yet.
diff_commentid_list_raw = get_updated_comment_list(comment_ids_list,comment_ids_delta_list)

count_diff_id = len(diff_commentid_list_raw)
if count_diff_id == 0:
    print("All comment id's done")
else:
    # 20 worker threads fetch comments; results are written from this thread
    # only, as they arrive. The with-block releases the pool's threads once
    # all results are consumed (or an error aborts the loop).
    with ThreadPool(20) as pool:
        results = pool.imap_unordered(getComment, diff_commentid_list_raw)
        for result in results:
            if result is None:
                # getComment() returns None for a non-200 response. Skip it
                # so the id stays out of the delta file and is picked up by
                # the next run, instead of crashing on tuple unpacking.
                continue
            text, commentId = result
            writeComment(text, commentId)

# Step 4: re-read the delta file and report whether anything is still missing.
try:
    comment_ids_delta_list = read_comment_ids(commentidsdeltapath)
    diff_commentid_list_raw = get_updated_comment_list(comment_ids_list,comment_ids_delta_list)
    count_diff_id = len(diff_commentid_list_raw)
    if count_diff_id == 0:
        print('DOne !!!!!!')
    else:
        print('[main] - Count of diff id is not equal to 0, NEED TO RERUN')
except Exception as e:
    print(str(e))