Archive for the ‘Python’ Category

NetworkX experiment

Thursday, December 27th, 2012

# -*- coding: utf-8 -*-
"""
Created on Wed Aug 31 23:39:02 2011

@author: Wu Lingfei
"""

import networkx as nx
import matplotlib.pyplot as plt

# read the weighted edge list (source,target,weight per line) into a graph
G = nx.read_weighted_edgelist("D:/research/www traffic network/flowweb980.csv",
                              comments='#', delimiter=',', create_using=None,
                              nodetype=None, encoding='utf-8')
pos = nx.spring_layout(G)
n = G.number_of_edges()
colors = range(n)
#nx.draw_random(G, alpha=0.5, node_color="blue", with_labels=False)
nx.draw(G, pos, node_size=10, node_color='#A0CBE2', edge_color=colors, width=1,
        edge_cmap=plt.cm.Blues, with_labels=False)
#plt.savefig("edge_colormap.png") # save as png
plt.show() # display
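
The flowweb980.csv file above lives on my disk, so as a self-contained check here is a minimal sketch with a toy weighted graph (the node names are made up); read_weighted_edgelist expects the same kind of "source,target,weight" triples, one per line:

import networkx as nx
import matplotlib.pyplot as plt

# toy edges in the same (source, target, weight) form the CSV holds
G = nx.Graph()
G.add_weighted_edges_from([("a", "b", 3.0), ("b", "c", 1.0), ("a", "c", 2.0)])

pos = nx.spring_layout(G)
colors = range(G.number_of_edges())
nx.draw(G, pos, node_size=10, node_color='#A0CBE2', edge_color=colors, width=1,
        edge_cmap=plt.cm.Blues, with_labels=False)
plt.show()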

Crawling the historical viewing records of all videos uploaded by a YouTube user

Thursday, December 27th, 2012

# -*- coding: utf-8 -*-
"""
Created on Tue Mar 06 02:41:22 2012

@author: Wu Lingfei
"""
import re
import csv
import time
import numpy as np
import win32com.client
import gdata.youtube
import gdata.youtube.service

address = "D:/research/dissertation/youtubeData/"

def getVideoList(username):
    try:
        url = 'http://gdata.youtube.com/feeds/api/users/' + username + '/uploads'
        yt_service = gdata.youtube.service.YouTubeService()
        #yt_service.ssl = True
        feed = yt_service.GetYouTubeVideoFeed(url)
        urls = []
        for entry in feed.entry:
            tu = entry.GetSwfUrl()
            tv = re.split("version", re.split("/v/", tu)[1])[0][:-1]
            tt = 'http://www.youtube.com/watch?v=' + tv
            urls.append(tt)
        return np.array(urls)
    except:
        return np.array(['non'])

def getVideoViews(url):
    try:
        # open IE
        ie6 = win32com.client.Dispatch("InternetExplorer.Application")
        ie6.Navigate(url)
        #ie6.Visible = 1  # set to 1 if you want to see the browser window
        while ie6.Busy:
            time.sleep(2)
        # click the view-statistics button
        document = ie6.Document
        document.getElementById("watch-insight-button").click()
        time.sleep(2)
        # obtain the body of the html, convert it to a string and close IE
        tt = document.body.innerHTML
        tt = unicode(tt)
        tt = tt.encode('ascii', 'ignore')
        ie6.Quit()
        p1 = re.findall('src="http://chart.apis.google.com/.+', tt)
        p2 = p1[0]
        vs = re.split('&', re.split('chd=t:', p2)[1])[0]
        vs = re.split(",", vs)
        vs = map(float, vs)
        # get the max value of the y axis
        maxy = re.split('\|', re.split('chxr=0,0,', p2)[1])[0]
        maxy = float(maxy)
        # get the starting and ending dates of the records
        dates = re.split('&', re.split('chxl=1:', p2)[1])[0]
        dates = re.split('\|', dates)
        startdate = dates[1]
        enddate = dates[-1]
        # get the 100 data points provided by YouTube
        data = maxy * np.array(vs) / 100
        data = np.append(np.array([startdate, enddate]), data)
        if len(data) < 102:
            data = np.append(data, np.tile("0", 102 - len(data)))
            return data
        else:
            return data
    except:
        return np.append(np.array(["start", "end"]), np.tile("0", 100))

def getUserViews(username):
    try:
        tu = getVideoList(username)
        td = map(getVideoViews, tu)
        for i in np.arange(len(tu)):
            td[i] = np.append(tu[i], td[i])
        add = address + username + '.csv'
        file = open(add, 'wb')
        w = csv.writer(file, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        w.writerows(td)
        file.close()
    except:
        pass

myfile = csv.reader(open("D:/research/dissertation/youtubeData/namelist2.csv", "rb"))
us = []
us.extend(myfile)
us = us[0]
us = us[3:]

start = time.clock()
map(getUserViews, us)
print time.clock() - start
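
The parsing above leans on the Google Chart API URL that the old view-statistics panel embeds: chd=t: carries the 100 data points rescaled to 0-100, chxr=0,0,<max> gives the y-axis maximum, and chxl=1: lists the axis dates. A small sketch of the same arithmetic on a made-up URL fragment (not a real YouTube response):

import re
import numpy as np

# illustrative fragment only; the real chart urls are much longer
p2 = 'chxr=0,0,5000|&chxl=1:|03/06/11|03/06/12&chd=t:1.0,2.5,50.0,100.0&chs=foo'

vs = map(float, re.split(',', re.split('&', re.split('chd=t:', p2)[1])[0]))
maxy = float(re.split('\|', re.split('chxr=0,0,', p2)[1])[0])
print maxy * np.array(vs) / 100   # [   50.   125.  2500.  5000.]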

Crawling the historical viewing records of a video using browser automation in Python

Thursday, December 27th, 2012

# -*- coding: utf-8 -*-
"""
Created on Tue Mar 06 02:41:22 2012

@author: Wu Lingfei
"""
import re
import csv
import time
import numpy as np
import win32com.client
import gdata.youtube
import gdata.youtube.service

address = "D:/research/dissertation/youtubeData/"

def getVideoList(username):
    try:
        url = 'http://gdata.youtube.com/feeds/api/users/' + username + '/uploads'
        yt_service = gdata.youtube.service.YouTubeService()
        #yt_service.ssl = True
        feed = yt_service.GetYouTubeVideoFeed(url)
        urls = []
        for entry in feed.entry:
            tu = entry.GetSwfUrl()
            tv = re.split("version", re.split("/v/", tu)[1])[0][:-1]
            tt = 'http://www.youtube.com/watch?v=' + tv
            urls.append(tt)
        return np.array(urls)
    except:
        return np.array(['non'])

def getVideoViews(url):
    try:
        # open IE
        ie6 = win32com.client.Dispatch("InternetExplorer.Application")
        ie6.Navigate(url)
        #ie6.Visible = 1  # set to 1 if you want to see the browser window
        while ie6.Busy:
            time.sleep(2)
        # click the view-statistics button
        document = ie6.Document
        document.getElementById("watch-insight-button").click()
        time.sleep(2)
        # obtain the body of the html, convert it to a string and close IE
        tt = document.body.innerHTML
        tt = unicode(tt)
        tt = tt.encode('ascii', 'ignore')
        ie6.Quit()
        p1 = re.findall('src="http://chart.apis.google.com/.+', tt)
        # get the total view count of the video
        p2 = re.findall('Total views.*H4', tt)
        vs = re.split('\<', re.split('Total views:', p2[0])[1])[0]
        vs = vs.replace(',', '')
        vs = float(vs.strip())
        # get the starting and ending dates of the records
        v = p1[0]
        t1 = re.findall('chxl.*chxp', v)
        t2 = re.split('&amp', re.split('chxl=1:|', t1[0])[1])[0]
        startdate = t2[1:9]
        enddate = t2[-8:]
        # get the 100 data points provided by YouTube and rescale them so that
        # the last point equals the total view count
        d1 = re.findall('chd.*amp', v)
        d2 = re.split('&amp', re.split('chd=t:', d1[0])[1])[0]
        d3 = re.split(",", d2)
        d4 = np.array(map(float, d3)) / 100
        maxvl = vs / d4[-1]
        data = d4 * maxvl
        data = np.append(np.array([startdate, enddate]), data)
        if len(data) < 102:
            data = np.append(data, np.tile("0", 102 - len(data)))
            return data
        else:
            return data
    except:
        return np.append(np.array(["start", "end"]), np.tile("0", 100))

def getUserViews(username):
    try:
        tu = getVideoList(username)
        td = map(getVideoViews, tu)
        for i in np.arange(len(tu)):
            td[i] = np.append(tu[i], td[i])
        add = address + username + '.csv'
        file = open(add, 'wb')
        w = csv.writer(file, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        w.writerows(td)
        file.close()
    except:
        pass

myfile = csv.reader(open("D:/research/dissertation/youtubeData/MyRandomUsers2.csv", "rb"))
us = []
us.extend(myfile)
us = us[0]

start = time.clock()
map(getUserViews, us[197:200])
print time.clock() - start
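
The difference from the previous post is the rescaling: instead of trusting the chart's y-axis maximum, this version reads the absolute "Total views" count from the page and stretches the normalized curve so that its last point equals that total. In toy numbers (made up, just to show the arithmetic):

import numpy as np

vs = 80000.0                          # "Total views" scraped from the page (made-up figure)
d3 = ['1.0', '2.5', '50.0', '100.0']  # the chd=t: points, as strings
d4 = np.array(map(float, d3)) / 100   # fractions of the chart maximum
maxvl = vs / d4[-1]                   # chart maximum implied by the total view count
print d4 * maxvl                      # [   800.   2000.  40000.  80000.]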

Random sampling YouTube video uploaders using YouTube Data API

Thursday, December 27th, 2012

# -*- coding: utf-8 -*-
"""
Created on Thu Mar 15 10:15:02 2012

@author: Wu Lingfei
"""
import re
import csv
import time
import urllib
import gdata.youtube
from random import choice
import numpy as np
import gdata.youtube.service
from BeautifulSoup import BeautifulSoup


#---------- Preparing data for the random video generator ----------
#---- language ----
languagedata = ["aa", "ab", "af", "ak", "sq", "am", "ar", "an", "hy", "as", "av", \
                "ae", "ay", "az", "ba", "bm", "eu", "be", "bn", "bh", "bi", "bo", \
                "bs", "br", "bg", "my", "ca", "cs", "ch", "ce", "zh", "cu", "cv", \
                "kw", "co", "cr", "cy", "cs", "da", "de", "dv", "nl", "dz", "el", \
                "en", "eo", "et", "eu", "ee", "fo", "fa", "fj", "fi", "fr", "fr", \
                "fy", "ff", "Ga", "ka", "de", "gd", "ga", "gl", "gv", "el", "gn", \
                "gu", "ht", "ha", "he", "hz", "hi", "ho", "hr", "hu", "hy", "ig", \
                "is", "io", "ii", "iu", "ie", "ia", "id", "ik", "is", "it", "jv", \
                "ja", "kl", "kn", "ks", "ka", "kr", "kk", "km", "ki", "rw", "ky", \
                "kv", "kg", "ko", "kj", "ku", "lo", "la", "lv", "li", "ln", "lt", \
                "lb", "lu", "lg", "mk", "mh", "ml", "mi", "mr", "ms", "mk", "mg", \
                "mt", "mn", "mi", "ms", "my", "na", "nv", "nr", "nd", "ng", "ne", \
                "nl", "nn", "nb", "no", "ny", "oc", "oj", "or", "om", "os", "pa", \
                "fa", "pi", "pl", "pt", "ps", "qu", "rm", "ro", "ro", "rn", "ru", \
                "sg", "sa", "si", "sk", "sk", "sl", "se", "sm", "sn", "sd", "so", \
                "st", "es", "sq", "sc", "sr", "ss", "su", "sw", "sv", "ty", "ta", \
                "tt", "te", "tg", "tl", "th", "bo", "ti", "to", "tn", "ts", "tk", \
                "tr", "tw", "ug", "uk", "ur", "uz", "ve", "vi", "vo", "cy", "wa", \
                "wo", "xh", "yi", "yo", "za", "zh", "zu"]
#---- query term ----
#qt = choice(letters)+choice(letters)+choice(letters)
#---- orderby ----
orderdata = ['relevance', 'published', 'viewCount', 'rating']
#---- duration ----
durationdata = ['short', 'medium', 'long']
#---- category ----
categorydata = ['Autos & Vehicles', 'Comedy', 'Education', 'Entertainment', \
                'Film & Animation', 'Gaming', 'Howto & Style', 'Music', 'News & Politics', \
                'Nonprofits & Activism', 'People & Blogs', 'Pets & Animals', \
                'Science & Technology', 'Sports', 'Travel & Events']
#---------- End of preparing data for the random video generator ----------

p1 = 'http://gdata.youtube.com/feeds/api/videos?q=a&orderby=relevance&v=2&max-results=50'
p2a = '&lr=en'
p2b = '&lr=zh'
p3a = '&duration=short'
p3b = '&duration=%2Dshort'
p4a = '&category=Sports'
p4b = '&category=Gaming'
p4c = '&category=Pets%26Animals'
p4d = '&category=Travel%26Events'
u1 = p1+p2a+p3a+p4a
u2 = p1+p2a+p3a+p4b
u3 = p1+p2a+p3a+p4c
u4 = p1+p2a+p3a+p4d
u5 = p1+p2a+p3b+p4a
u6 = p1+p2a+p3b+p4b
u7 = p1+p2a+p3b+p4c
u8 = p1+p2a+p3b+p4d
u9 = p1+p2b+p3a+p4a
u10 = p1+p2b+p3a+p4b
u11 = p1+p2b+p3a+p4c
u12 = p1+p2b+p3a+p4d
u13 = p1+p2b+p3b+p4a
u14 = p1+p2b+p3b+p4b
u15 = p1+p2b+p3b+p4c
u16 = p1+p2b+p3b+p4d
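
# The sixteen urls above are all combinations of two languages, two duration
# filters and four categories; as a sketch, the same list can be generated with
# itertools.product (feed_urls[0] == u1, ..., feed_urls[15] == u16):
from itertools import product
feed_urls = [p1 + lr + du + cat
             for lr, du, cat in product([p2a, p2b], [p3a, p3b], [p4a, p4b, p4c, p4d])]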

def getusers(uu):
    yt_service = gdata.youtube.service.YouTubeService()
    feed = yt_service.GetYouTubeVideoFeed(uu)
    def nameFromfeedentry(number):
        # open the watch page of the video and pull the uploader's username out of the html
        urla = feed.entry[number].media.player.url
        urlb = re.split('&', urla)[0]
        html = urllib.urlopen(urlb).read()
        parser = BeautifulSoup(html)
        partHtml = parser.findAll(text=re.compile("VIDEO_USERNAME"))
        pp = re.findall('VIDEO_USERNAME.+', partHtml[0])
        username = re.split('\"', pp[0])[1]
        return username
    l = len(feed.entry)
    users = map(nameFromfeedentry, range(l))
    return users

start = time.clock()
user77 = getusers(u7)
print time.clock()-start

# Bookkeeping from the interactive runs: getusers was called once per feed url
# (u1 ... u16), repeated runs were merged, de-duplicated, and each list was
# trimmed to 30 users.
user6.extend(user66)
len(user6)
len(sorted(set(user6)))

user6 = sorted(set(user6))[:30]

user1 = user1[:30]
user2
user3
user4
user5 = user5[:30]
user6
user7
user8
user9 = user9[:30]
user13 = user13
user14 = user14


def randomVideoGenerator(number):
    p1 = 'http://gdata.youtube.com/feeds/api/videos?'
    p2 = 'q=' + 'the'
    p3 = '&lr=' + choice(languagedata)
    p4 = '&orderby=' + choice(orderdata)
    p5 = '&duration=' + choice(durationdata)
    p6 = '&category=' + choice(categorydata)
    p7 = '&v=2&max-results=10'
    feedurl = p1+p2+p3+p4+p5+p6+p7
    yt_service = gdata.youtube.service.YouTubeService()
    feed = yt_service.GetYouTubeVideoFeed(feedurl)
    try:
        Firstentry = feed.entry[0]
        Firsturl = Firstentry.media.player.url
        url = re.split('&', Firsturl)[0]
        return url
    except:
        return 'na'

def getUserFromVideo(url):
    html = urllib.urlopen(url).read()
    parser = BeautifulSoup(html)
    partHtml = parser.findAll(text=re.compile("VIDEO_USERNAME"))
    pp = re.findall('VIDEO_USERNAME.+', partHtml[0])
    username = re.split('\"', pp[0])[1]
    return username

#start = time.clock()
videos = map(randomVideoGenerator, np.arange(1, 50))
videos = np.array(videos)
videos = videos[np.where(videos != 'na')]
users = map(getUserFromVideo, videos)
users
#print time.clock()-start

finalusers = []

#finalusers.extend(users)
#finalusers = sorted(set(finalusers))
#len(finalusers)

myfile = open('D:/research/dissertation/youtubeData/MyRandomUsers2.csv', 'wb')
wr = csv.writer(myfile, delimiter=',', quoting=csv.QUOTE_ALL)
wr.writerow(finalusers)
myfile.close()

testfile = open('D:/…/MyRandomUsers2.csv', 'rb')
test = np.recfromcsv(testfile)
testfile.close()
names = range(0, len(test))
for i in names:
    names[i] = test[i][0]

A DIY website language detector using NLTK in Python

Thursday, December 27th, 2012

# -*- coding: utf-8 -*-
"""
Created on Sat Apr 14 14:53:28 2012

@author: Wu Lingfei
"""
import re
import csv
import urllib
import numpy as np
from BeautifulSoup import BeautifulSoup
#import nltk
#nltk.download()
from nltk.util import trigrams as nltk_trigrams
from nltk.tokenize import word_tokenize as nltk_word_tokenize
from nltk.probability import FreqDist
from nltk.corpus.util import LazyCorpusLoader
from nltk.corpus.reader.api import CorpusReader
from nltk.corpus.reader.util import StreamBackedCorpusView, concat

class LangIdCorpusReader(CorpusReader):
    '''
    LangID corpus reader
    '''
    CorpusView = StreamBackedCorpusView

    def _get_trigram_weight(self, line):
        '''
        Split a line into a trigram and its frequency count
        '''
        data = line.strip().split(' ')
        if len(data) == 2:
            return (data[1], int(data[0]))

    def _read_trigram_block(self, stream):
        '''
        Read a block of trigram frequencies
        '''
        freqs = []
        for i in range(20):
            freqs.append(self._get_trigram_weight(stream.readline()))
        return filter(lambda x: x != None, freqs)

    def freqs(self, fileids=None):
        '''
        Return trigram frequencies for a language from the corpus
        '''
        return concat([self.CorpusView(path, self._read_trigram_block)
                       for path in self.abspaths(fileids=fileids)])

class LangDetect(object):
    language_trigrams = {}
    langid = LazyCorpusLoader('langid', LangIdCorpusReader, r'(?!\.).*\.txt')

    def __init__(self, languages=['nl', 'en', 'fr', 'de', 'es']):
        for lang in languages:
            self.language_trigrams[lang] = FreqDist()
            for f in self.langid.freqs(fileids=lang + "-3grams.txt"):
                self.language_trigrams[lang].inc(f[0], f[1])

    def detect(self, text):
        '''
        Detect the text's language
        '''
        words = nltk_word_tokenize(text.lower())
        trigrams = {}
        scores = dict([(lang, 0) for lang in self.language_trigrams.keys()])
        for match in words:
            for trigram in self.get_word_trigrams(match):
                if not trigram in trigrams.keys():
                    trigrams[trigram] = 0
                trigrams[trigram] += 1
        total = sum(trigrams.values())
        for trigram, count in trigrams.items():
            for lang, frequencies in self.language_trigrams.items():
                # normalize and add to the total score
                scores[lang] += (float(frequencies[trigram]) / float(frequencies.N())) * (float(count) / float(total))
        return sorted(scores.items(), key=lambda x: x[1], reverse=True)[0][0]

    def get_word_trigrams(self, match):
        return [''.join(trigram) for trigram in nltk_trigrams(match) if trigram != None]
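
# get_word_trigrams works on characters: nltk_trigrams over a token yields
# overlapping three-character slices, which are joined back into strings and
# looked up in the per-language trigram tables, e.g.:
print [''.join(t) for t in nltk_trigrams("jumps")]   # ['jum', 'ump', 'mps']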
texts = [
    "De snelle bruine vos springt over de luie hond",
    "The quick brown fox jumps over the lazy dog",
    "Le renard brun rapide saute par-dessus le chien paresseux",
    "Der schnelle braune Fuchs springt über den faulen Hund",
    "El rápido zorro marrón salta sobre el perro perezoso"
]

ld = LangDetect()

for text in texts:
    print text, "=>", ld.detect(text)

seedAddress = "D:/research/The flow structure on the WWW/data/list3.csv"

def importFromCSV(fileAdress):
    testfile = open(fileAdress, 'rb')
    test = np.recfromcsv(testfile)
    testfile.close()
    names = range(0, len(test))
    for i in names:
        names[i] = test[i][0]
    traffics = range(0, len(test))
    for i in traffics:
        traffics[i] = test[i][1]
    a = np.concatenate((names, traffics), axis=1)
    a = a.reshape(2, len(traffics))
    a = np.transpose(a)
    return a
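
# importFromCSV above rebuilds the name and traffic columns element by element;
# a rough equivalent with the csv module and list comprehensions (illustrative
# helper name; recfromcsv treats the first row as a header, hence the [1:]):
def importFromCSV_lc(fileAdress):
    rows = list(csv.reader(open(fileAdress, 'rb')))[1:]
    return np.transpose(np.array([[r[0] for r in rows], [r[1] for r in rows]]))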

nt = importFromCSV(seedAddress)
names = nt[:,0]

def get_title(aName):
    url = "http://www." + aName
    html = urllib.urlopen(url).read()
    html_lowered = html.lower()
    begin = html_lowered.find('<title>')
    end = html_lowered.find('</title>')
    if begin == -1 or end == -1:
        return None
    else:
        # find the title in the original html
        return html[begin + len('<title>'):end].strip()

testtitle=get_title(names[2])
u=unicode(testtitle)

ld.detect("Шеин сообщил о вводе бронетехники в Астрахань выведут на чистую воду")


Collect the top 25 sites in each of the 127 countries from Alexa

Thursday, December 27th, 2012

# -*- coding: utf-8 -*-
"""
Created on Tue Apr 03 11:36:19 2012

@author: Wu Lingfei
"""

import re
import csv
import urllib
import numpy as np
import time as time
from BeautifulSoup import BeautifulSoup

# collect the country codes from the Alexa "top sites by country" page
url = "http://www.alexa.com/topsites/countries"
html = urllib.urlopen(url).read()
parser = BeautifulSoup(html)
tt = unicode(parser)
tt = tt.encode('ascii', 'ignore')
p = re.findall("/topsites/countries/.+\>", tt)
p0 = re.split("href=", p[0])
countries = []
for i in range(len(p0)):
    country = re.split('">', re.split('countries/', p0[i])[1])[0]
    countries.append(country)
def getTop25ReginalSites(region):
    url = "http://www.alexa.com/topsites/countries/" + region
    try:
        html = urllib.urlopen(url).read()
        parser = BeautifulSoup(html)
        tt = unicode(parser)
        tt = tt.encode('ascii', 'ignore')
        p = np.array(re.findall('/siteinfo/.+\</a', tt))
        condlist = np.array(map(len, p)) < 60
        p0 = p[condlist]
        sites = []
        for i in range(len(p0)):
            site = re.split('">', re.split('info/', p0[i])[1])[0]
            sites.append(site)
        final = np.array(sites)
        return final
    except:
        return np.array(["na", "na"])
start = time.clock()
websites = map(getTop25ReginalSites, countries)
print time.clock() - start

websites = np.concatenate(websites)
websites = sorted(set(websites))

file = open('D:/research/dissertation/newtopwebsitelist.csv', 'wb')
w = csv.writer(file, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
w.writerows(websites)
file.close()

url = 'http://www.alexa.com/siteinfo/capitalfm.co.ke'

def get3MonthTraffiofWebsite(name):
    url = 'http://www.alexa.com/siteinfo/' + name
    try:
        html = urllib.urlopen(url).read()
        parser = BeautifulSoup(html)
        tt = unicode(parser)
        tt = tt.encode('ascii', 'ignore')
        # pick the 3-month average cells out of the traffic table
        p = np.array(re.findall('td class="avg ".+\</td', tt))
        t1 = re.split('<', re.split('>', p[8])[1])[0]
        t2 = re.split('<', re.split('>', p[11])[1])[0]
        if float(t2) < 1:
            return t2
        else:
            return t1
    except:
        return 'na'

start = time.clock()
traffics = map(get3MonthTraffiofWebsite, websites)
print time.clock() - start

a = np.concatenate((websites, traffics), axis=1)
a = a.reshape(2, len(traffics))
a = np.transpose(a)

file = open('D:/research/dissertation/newsitestraffic120304.csv', 'wb')
w = csv.writer(file, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
w.writerows(a)
file.close()

Collect weighted clickstreams between sites from Alexa

Thursday, December 27th, 2012


# -*- coding: utf-8 -*-
"""
Created on Wed Apr 04 19:32:51 2012

@author: Wu Lingfei
"""

import re
import csv
import urllib
import numpy as np
from BeautifulSoup import BeautifulSoup

seedAddress = "D:/research/The flow structure on the WWW/data/list3.csv"
saveAddress = "D:/research/The flow structure on the WWW/data/raw clickstream network/rawflowweb20120404.csv"

def importFromCSV(fileAdress):
    testfile = open(fileAdress, 'rb')
    test = np.recfromcsv(testfile)
    testfile.close()
    names = range(0, len(test))
    for i in names:
        names[i] = test[i][0]
    traffics = range(0, len(test))
    for i in traffics:
        traffics[i] = test[i][1]
    a = np.concatenate((names, traffics), axis=1)
    a = a.reshape(2, len(traffics))
    a = np.transpose(a)
    return a

nt = importFromCSV(seedAddress)
names = nt[:,0]
traffics = nt[:,1]

def getEdgelistofASite(aName):
    def clean(st): return st.string.encode('ascii', 'ignore')
    def cleanNum(nu): return nu[:-1].encode('ascii', 'ignore')
    l = 31
    url = "http://www.alexa.com/siteinfo/" + aName
    try:
        html = urllib.urlopen(url).read()
        parser = BeautifulSoup(html)
        partHtml = parser.findAll(text=re.compile("clickstreamSnippet"))
        partDataUrl = re.findall("clickstream.+", partHtml[0])
        dataUrl = 'http://www.alexa.com' + re.split("=", partDataUrl[0])[1][2:-2]
        # ^ Get the url of the clickstream data of a given website
        html2 = urllib.urlopen(dataUrl).read()
        parser2 = BeautifulSoup(html2)
        numbers = parser2.findAll(text=re.compile("%"))
        sitestext = parser2.findAll('a', href=re.compile(".siteinfo."))
        # ^ Parse clickstreams and sites from the url of the clickstream data
        newsites = map(clean, sitestext)
        newnumbers = parser2.findAll(text=re.compile(".%"))
        newnumbers2 = map(cleanNum, newnumbers)
        tra = float(traffics[names == aName][0]) / 100
        newnumbers3 = np.array(map(float, newnumbers2)) * tra
        lenAllsites = len(newsites)
        lenN = map(len, numbers)
        lenUpstreamSites = lenN[1:].index(l)
        lenDownstreamSites = lenAllsites - lenUpstreamSites
        newUpstreamSites = newsites[0:lenUpstreamSites] + [aName] * lenDownstreamSites
        newDownstreamSites = [aName] * lenUpstreamSites + newsites[lenUpstreamSites:]
        matrix = [newUpstreamSites, newDownstreamSites, newnumbers3]
        weightedgelist = np.transpose(matrix)
        # ^ Clean the data and construct the weighted edge list
        return weightedgelist
    except:
        return np.array([["a", "a", "1"]])

def getEdgelistofSites(Names):
    es = map(getEdgelistofASite, Names)
    es = np.concatenate(es)
    file = open(saveAddress, 'wb')
    # ^ Or wherever else you want to save the data.
    w = csv.writer(file, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
    w.writerows(es)
    file.close()
    pass

import time as time
start = time.clock()
getEdgelistofSites(names)
print time.clock()-start
# ^ Running time test
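
# Toy illustration (made-up sites and numbers) of the edge-list construction in
# getEdgelistofASite: upstream sites point at the crawled site, the crawled site
# points at its downstream sites, and each clickstream percentage is scaled by
# the crawled site's own traffic before the three columns are transposed.
toy_up, toy_down = ['x.com'], ['y.com', 'z.com']
toy_pct = np.array([10.0, 5.0, 2.0]) * (200.0 / 100)   # percentages times traffic/100
toy_sources = toy_up + ['site.com'] * len(toy_down)
toy_targets = ['site.com'] * len(toy_up) + toy_down
print np.transpose([toy_sources, toy_targets, toy_pct])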

#def plotNetwork(address1, address2):
#    G = nx.read_weighted_edgelist(address1, comments='#', delimiter=',',
#                                  create_using=None, nodetype=None, encoding='utf-8')
#    pos = nx.spring_layout(G)
#    n = G.number_of_edges()
#    colors = range(n)
#    nx.draw(G, pos, node_size=10, node_color='#A0CBE2', edge_color=colors, width=1,
#            edge_cmap=plt.cm.Blues, with_labels=True)
#    plt.savefig(address2)
#    #plt.show() # display
#    pass

#plotNetwork(addressforSavingClickstreamNetworkData, addressforSavingClickstreamNetworkFigure)

Detect the language of websites using AlchemyAPI

Thursday, December 27th, 2012

import csv
import random
import urllib
import numpy as np
import time as time
from BeautifulSoup import BeautifulSoup

def detectLanguageByAlchemyapi(url):
    try:
        adress = "http://access.alchemyapi.com/calls/url/URLGetLanguage?apikey="
        key = "e810a732569fcea6ab6e1848711aa6763fcbebdd"
        apiAddress = adress + key + "&url=http://" + url
        c = urllib.urlopen(apiAddress).read()
        time.sleep(random.random())
        s = BeautifulSoup(c)
        lang = s.find('language').contents
        return lang[0]
    except:
        return "na"

def importNamesFromCSV(fileAdress):
    testfile = open(fileAdress, 'rb')
    test = np.recfromcsv(testfile)
    testfile.close()
    names = range(0, len(test))
    for i in names:
        names[i] = "www." + test[i][0]
    return names

ad = "D:/list1.csv"
seedNames = importNamesFromCSV(ad)
langs = []

start = time.clock()
for i in seedNames[:10]:
    langs.append(detectLanguageByAlchemyapi(i))
print time.clock() - start

dictionary = dict(zip(seedNames, langs))

file = open("D:/newlist1.csv", 'wb')
w = csv.writer(file, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
w.writerows(dictionary.items())
file.close()
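
The API answers with a small XML document and the scraper only needs the <language> element. A sketch of that parsing step on a canned response (the XML below is simplified and made up; only the <language> tag matters), which also lets the function be exercised without spending API calls:

from BeautifulSoup import BeautifulSoup

canned = "<results><status>OK</status><language>english</language></results>"
s = BeautifulSoup(canned)
print s.find('language').contents[0]   # english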