# Crawling the historical viewing records of all videos uploaded by a YouTube user

# -*- coding: utf-8 -*-
"""
Created on Tue Mar 06 02:41:22 2012

@author: Wu Lingfei
"""
import re
import csv
import time
import numpy as np
import win32com.client
import gdata.youtube
import gdata.youtube.service

# Output directory (with trailing slash); getUserViews writes
# "<address><username>.csv" into it.
address = "D:/research/dissertation/youtubeData/"
def getVideoList(username):
    """Return the watch-page URLs of the videos in a user's uploads feed.

    The uploads feed of *username* is requested through the GData YouTube
    service. For every feed entry the video id is cut out of the entry's
    SWF URL and appended to ``http://www.youtube.com/watch?v=``.

    Args:
        username: YouTube user name inserted into the feed URL.

    Returns:
        A numpy array of watch-page URLs. If anything fails (service call,
        missing or unexpected SWF URL, ...) the single-element array
        ``np.array(['non'])`` is returned instead of raising.
    """
    try:
        url = 'http://gdata.youtube.com/feeds/api/users/' + username + '/uploads'
        yt_service = gdata.youtube.service.YouTubeService()
        feed = yt_service.GetYouTubeVideoFeed(url)
        urls = []
        for entry in feed.entry:
            swf_url = entry.GetSwfUrl()
            # Keep the text between "/v/" and "version"; [:-1] drops the one
            # character that precedes "version" (presumably the "?" that
            # starts the query string -- confirm against a real feed).
            video_id = swf_url.split("/v/")[1].split("version")[0][:-1]
            urls.append('http://www.youtube.com/watch?v=' + video_id)
        return np.array(urls)
    except Exception:
        # Best effort: callers receive a sentinel list instead of an error.
        # Unlike a bare ``except:``, this lets Ctrl-C / SystemExit through.
        return np.array(['non'])
def _fetch_insight_html(url):
    """Load *url* in Internet Explorer, click the statistics button and
    return the page body HTML as an ASCII-only byte string."""
    browser = win32com.client.Dispatch("InternetExplorer.Application")
    try:
        browser.Navigate(url)
        # browser.Visible = 1  # enable to watch the browser window
        while browser.Busy:
            time.sleep(2)
        document = browser.Document
        # Click the "view statistics" button, then wait for the chart.
        document.getElementById("watch-insight-button").click()
        time.sleep(2)
        html = unicode(document.body.innerHTML)
        return html.encode('ascii', 'ignore')
    finally:
        # Always close the browser, even when a step above failed;
        # otherwise every failed video leaves an IE process behind.
        browser.Quit()


def _parse_insight_chart(html):
    """Extract start date, end date and the scaled view counts from the
    chart image URL embedded in *html*."""
    chart_url = re.findall('src="http://chart.apis.google.com/.+', html)[0]

    # "chd=t:" carries the comma-separated data points.
    raw_points = chart_url.split('chd=t:')[1].split('&')[0].split(',')
    points = [float(value) for value in raw_points]

    # "chxr=0,0,<max>|..." carries the maximum of the y axis.
    max_y = float(chart_url.split('chxr=0,0,')[1].split('|')[0])

    # "chxl=1:|<first>|...|<last>" carries the date labels of the x axis.
    labels = chart_url.split('chxl=1:')[1].split('&')[0].split('|')
    start_date = labels[1]
    end_date = labels[-1]

    # The data points are divided by 100 and multiplied by the axis maximum.
    data = max_y * np.array(points) / 100
    data = np.append(np.array([start_date, end_date]), data)
    if len(data) < 102:
        # Pad with "0" so every row has at least 2 dates + 100 values.
        data = np.append(data, np.tile("0", 102 - len(data)))
    return data


def getVideoViews(url):
    """Return the viewing history of the video at *url*.

    The page is opened in Internet Explorer through COM, the statistics
    button is clicked, and the resulting chart URL is parsed.

    Args:
        url: watch-page URL of the video.

    Returns:
        A numpy string array ``[start_date, end_date, v1, v2, ...]`` with at
        least 102 elements (short series are padded with ``"0"``). If any
        step fails, the placeholder ``["start", "end", "0" * 100]`` row is
        returned instead of raising.
    """
    try:
        return _parse_insight_chart(_fetch_insight_html(url))
    except Exception:
        # Best effort: a video without statistics yields a placeholder row.
        return np.append(np.array(["start", "end"]), np.tile("0", 100))

def getUserViews(username):
    """Crawl all videos of *username* and store their view records as CSV.

    One row is written per video: the watch URL followed by the array
    returned by ``getVideoViews`` for that URL. The file is written to
    ``address + username + '.csv'``.

    Args:
        username: YouTube user name; also used as the CSV file name.

    Returns:
        None. Failures are reported on stderr and do not raise, so a batch
        run over many users keeps going.
    """
    import sys

    try:
        video_urls = getVideoList(username)
        rows = [np.append(video_url, getVideoViews(video_url))
                for video_url in video_urls]
        out_path = address + username + '.csv'
        # 'wb': the Python 2 csv module expects a binary-mode file.
        with open(out_path, 'wb') as out_file:
            writer = csv.writer(out_file, delimiter=',', quotechar='|',
                                quoting=csv.QUOTE_MINIMAL)
            writer.writerows(rows)
    except Exception as exc:
        # Still best effort, but no longer silent.
        sys.stderr.write("getUserViews failed for %s: %s\n" % (username, exc))
# Read the user names from the first row of the name list; the first three
# columns of that row are skipped.
with open("D:/research/dissertation/youtubeData/namelist2.csv", "rb") as namelist_file:
    myfile = csv.reader(namelist_file)
    # An empty file yields no users instead of an IndexError.
    us = next(myfile, [])
us = us[3:]

# Crawl every user in turn and report the elapsed time.
start = time.clock()
for user in us:
    getUserViews(user)
print(time.clock() - start)

Leave a Reply