# Random sampling of YouTube video uploaders using the YouTube Data API

# -*- coding: utf-8 -*-
"""
Created on Thu Mar 15 10:15:02 2012

@author: Wu Lingfei
"""
import re
import csv
import time
import urllib
import gdata.youtube
from random import choice
import numpy as np
import gdata.youtube.service
from BeautifulSoup import BeautifulSoup

 

# ---------- Preparing data for random video generator ----------
# Each list below is a pool that randomVideoGenerator draws from with
# random.choice to build one query parameter of the feed URL.

# ---- language ----
# Two-letter language codes for the `lr` parameter.  The raw table repeats
# several codes (e.g. "cs", "fr", "ro", "sk") and has one capitalised entry
# ("Ga").  Repeated entries would be picked more often by a uniform
# `choice`, so the table is lower-cased and de-duplicated before use.
_raw_languagedata = [
    "aa", "ab", "af", "ak", "sq", "am", "ar", "an", "hy", "as", "av",
    "ae", "ay", "az", "ba", "bm", "eu", "be", "bn", "bh", "bi", "bo",
    "bs", "br", "bg", "my", "ca", "cs", "ch", "ce", "zh", "cu", "cv",
    "kw", "co", "cr", "cy", "cs", "da", "de", "dv", "nl", "dz", "el",
    "en", "eo", "et", "eu", "ee", "fo", "fa", "fj", "fi", "fr", "fr",
    "fy", "ff", "Ga", "ka", "de", "gd", "ga", "gl", "gv", "el", "gn",
    "gu", "ht", "ha", "he", "hz", "hi", "ho", "hr", "hu", "hy", "ig",
    "is", "io", "ii", "iu", "ie", "ia", "id", "ik", "is", "it", "jv",
    "ja", "kl", "kn", "ks", "ka", "kr", "kk", "km", "ki", "rw", "ky",
    "kv", "kg", "ko", "kj", "ku", "lo", "la", "lv", "li", "ln", "lt",
    "lb", "lu", "lg", "mk", "mh", "ml", "mi", "mr", "ms", "mk", "mg",
    "mt", "mn", "mi", "ms", "my", "na", "nv", "nr", "nd", "ng", "ne",
    "nl", "nn", "nb", "no", "ny", "oc", "oj", "or", "om", "os", "pa",
    "fa", "pi", "pl", "pt", "ps", "qu", "rm", "ro", "ro", "rn", "ru",
    "sg", "sa", "si", "sk", "sk", "sl", "se", "sm", "sn", "sd", "so",
    "st", "es", "sq", "sc", "sr", "ss", "su", "sw", "sv", "ty", "ta",
    "tt", "te", "tg", "tl", "th", "bo", "ti", "to", "tn", "ts", "tk",
    "tr", "tw", "ug", "uk", "ur", "uz", "ve", "vi", "vo", "cy", "wa",
    "wo", "xh", "yi", "yo", "za", "zh", "zu",
]
languagedata = sorted(set(code.lower() for code in _raw_languagedata))

# ---- query term ----
#qt = choice(letters)+choice(letters)+choice(letters)

# ---- orderby ----
orderdata = ['relevance', 'published', 'viewCount', 'rating']

# ---- duration ----
durationdata = ['short', 'medium', 'long']

# ---- category ----
# One string per category.  'Gaming' / 'Howto & Style' and
# 'Music' / 'News & Politics' were previously fused into single strings by
# misplaced quotes, which made those four categories impossible to draw.
# NOTE(review): these look like display labels; confirm the feed accepts
# them as `category` values.
categorydata = [
    'Autos & Vehicles',
    'Comedy',
    'Education',
    'Entertainment',
    'Film & Animation',
    'Gaming',
    'Howto & Style',
    'Music',
    'News & Politics',
    'Nonprofits & Activism',
    'People & Blogs',
    'Pets & Animals',
    'Science & Technology',
    'Sports',
    'Travel & Events',
]
# ---------- End of preparing data for random video generator ----------

# Fixed feed URLs covering a 2 (language) x 2 (duration) x 4 (category)
# grid on top of one base query.  The fragments are kept as separate names
# so individual combinations stay easy to read and reuse.
p1 = 'http://gdata.youtube.com/feeds/api/videos?q=a&orderby=relevance&v=2&max-results=50'
# language restriction
p2a = '&lr=en'
p2b = '&lr=zh'
# duration filter ('%2D' is a percent-encoded '-')
p3a = '&duration=short'
p3b = '&duration=%2Dshort'
# category filter ('%26' is a percent-encoded '&')
p4a = '&category=Sports'
p4b = '&category=Gaming'
p4c = '&category=Pets%26Animals'
p4d = '&category=Travel%26Events'

# Every combination, category varying fastest, then duration, then
# language -- the same order the names u1..u16 were assigned by hand.
feed_urls = [
    p1 + _lang + _dur + _cat
    for _lang in (p2a, p2b)
    for _dur in (p3a, p3b)
    for _cat in (p4a, p4b, p4c, p4d)
]
(u1, u2, u3, u4,
 u5, u6, u7, u8,
 u9, u10, u11, u12,
 u13, u14, u15, u16) = feed_urls

def getusers(uu):
    """Return the uploader username of every video in a YouTube feed.

    Fetches the feed at `uu`, then downloads each entry's watch page and
    reads the username from the page text that contains "VIDEO_USERNAME".

    Parameters:
        uu: feed URL handed to YouTubeService.GetYouTubeVideoFeed.

    Returns:
        A list of usernames, one per feed entry, in feed order.

    Raises:
        IndexError: if a watch page has no "VIDEO_USERNAME" text or the
            value after it cannot be split out.
    """
    yt_service = gdata.youtube.service.YouTubeService()
    feed = yt_service.GetYouTubeVideoFeed(uu)

    def name_from_entry(entry):
        # Keep only the part of the player URL before the first '&'.
        watch_url = entry.media.player.url.split('&')[0]
        response = urllib.urlopen(watch_url)
        try:
            html = response.read()
        finally:
            # Release the connection even if the read fails.
            response.close()
        parser = BeautifulSoup(html)
        fragments = parser.findAll(text=re.compile("VIDEO_USERNAME"))
        if not fragments:
            raise IndexError("VIDEO_USERNAME not found on %s" % watch_url)
        tail = re.findall('VIDEO_USERNAME.+', fragments[0])
        # The username is the first double-quoted value after the marker.
        return tail[0].split('"')[1]

    return [name_from_entry(entry) for entry in feed.entry]

# Time one getusers() call on the u7 feed and print the elapsed
# time.clock() difference.
start = time.clock()
user77 = getusers(u7)
print time.clock()-start

# NOTE(review): everything below reads user1..user14 and user66, none of
# which is assigned anywhere in the visible source (only user77 is).  It
# looks like leftovers from an interactive session -- confirm where these
# lists come from before running this file top to bottom.

# Merge a second batch into user6, then inspect raw and de-duplicated sizes.
user6.extend(user66)
len(user6)
len(sorted(set(user6)))

# Keep the first 30 unique names in sorted order.
user6=sorted(set(user6))[:30]

# Trim some lists to 30 entries; the bare names are expression statements
# that have no effect when run as a script.
user1 = user1[:30]
user2
user3
user4
user5 = user5[:30]
user6
user7
user8
user9 = user9[:30]
# Self-assignments: no effect.
user13 = user13
user14 = user14

 

def randomVideoGenerator(number):
    """Return the watch URL of the first video from a randomised feed query.

    The query searches for 'the' with a randomly chosen language, ordering,
    duration and category (drawn from languagedata, orderdata, durationdata
    and categorydata) and asks for at most 10 results.

    Parameters:
        number: not used; it only lets the function be driven by map()
            over a sequence.

    Returns:
        The first entry's player URL up to its first '&', or the string
        'na' when the feed has no usable first entry.
    """
    feedurl = (
        'http://gdata.youtube.com/feeds/api/videos?'
        + 'q=' + 'the'
        + '&lr=' + choice(languagedata)
        + '&orderby=' + choice(orderdata)
        # The '=' was missing, which produced e.g. '&durationshort'.
        + '&duration=' + choice(durationdata)
        # Category labels contain '&' and spaces; unescaped, the '&' would
        # end the parameter early and start a bogus one.
        + '&category=' + urllib.quote(choice(categorydata))
        + '&v=2&max-results=10'
    )
    yt_service = gdata.youtube.service.YouTubeService()
    feed = yt_service.GetYouTubeVideoFeed(feedurl)
    try:
        first_url = feed.entry[0].media.player.url
        return re.split('&', first_url)[0]
    except (IndexError, AttributeError, TypeError):
        # Empty feed, entry without media/player data, or a missing URL.
        return 'na'

def getUserFromVideo(url):
    """Return the uploader username found on a video watch page.

    Downloads `url`, finds the page text containing "VIDEO_USERNAME" and
    returns the first double-quoted value that follows the marker.

    Parameters:
        url: address of the watch page to download.

    Returns:
        The username as a string.

    Raises:
        IndexError: if the page has no "VIDEO_USERNAME" text or the value
            after it cannot be split out.
    """
    response = urllib.urlopen(url)
    try:
        html = response.read()
    finally:
        # Release the connection even if the read fails.
        response.close()
    parser = BeautifulSoup(html)
    fragments = parser.findAll(text=re.compile("VIDEO_USERNAME"))
    if not fragments:
        raise IndexError("VIDEO_USERNAME not found on %s" % url)
    tail = re.findall('VIDEO_USERNAME.+', fragments[0])
    return tail[0].split('"')[1]

# Draw random videos, resolve their uploaders, save the unique usernames to
# CSV, then read the file back.
#start = time.clock()
# np.arange(1, 50) yields 1..49, so 49 random feed queries are issued.
videos = np.array(map(randomVideoGenerator, np.arange(1, 50)))
# Drop the 'na' placeholders returned for queries without a usable video.
videos = videos[np.where(videos != 'na')]
users = map(getUserFromVideo, videos)
#print time.clock()-start

# Collect the sampled uploaders.  Previously the list was written out while
# still empty, so the CSV never contained any usernames.
finalusers = []
finalusers.extend(users)
finalusers = sorted(set(finalusers))

# One path for both the write and the read-back; the read previously used
# an elided placeholder path ('D:/.../MyRandomUsers2.csv').
csv_path = 'D:/research/dissertation/youtubeData/MyRandomUsers2.csv'

# 'wb' / 'rb' are the modes the Python 2 csv module expects.
with open(csv_path, 'wb') as myfile:
    wr = csv.writer(myfile, delimiter=',', quoting=csv.QUOTE_ALL)
    # All usernames go into a single CSV row.
    wr.writerow(finalusers)

with open(csv_path, 'rb') as testfile:
    # NOTE(review): recfromcsv appears to treat the first row as field
    # names by default; with a single-row file that would leave no data
    # rows -- confirm this read-back does what is intended.
    test = np.recfromcsv(testfile)

# First field of every record that was read back.
names = [row[0] for row in test]

Leave a Reply