Posts Tagged ‘Python’

Understanding Basic Epidemic Models with Python

Published by chengjun on March 14th, 2013

Numerical simulation is one way to understand analytic epidemic models, and Python supplies many tools for doing so.

SIR

First, the Python script for the SIR model:
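
The script integrates the standard SIR equations without births or deaths: dS/dt = -beta*S*I, dI/dt = beta*S*I - gamma*I, dR/dt = gamma*I, where S, I and R are the susceptible, infectious and recovered fractions of the population, beta is the transmission rate and gamma the recovery rate.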

# -*- coding: utf-8 -*-

###################################
### Written by Ilias Soumpasis    #
### ilias.soumpasis@ucd.ie (work) #
### ilias.soumpasis@gmail.com	  #
###################################

import scipy.integrate as spi
import numpy as np
import pylab as pl

beta=1.4247
gamma=0.14286
TS=1.0
ND=70.0
S0=1-1e-6
I0=1e-6
INPUT = (S0, I0, 0.0)

def diff_eqs(INP,t):
	'''The main set of equations'''
	Y=np.zeros((3))
	V = INP
	Y[0] = - beta * V[0] * V[1]
	Y[1] = beta * V[0] * V[1] - gamma * V[1]
	Y[2] = gamma * V[1]
	return Y   # For odeint

t_start = 0.0; t_end = ND; t_inc = TS
t_range = np.arange(t_start, t_end+t_inc, t_inc)
RES = spi.odeint(diff_eqs,INPUT,t_range)

print RES

# Plotting
pl.plot(RES[:,0], '-bs', label='Susceptibles')  # original style was '-g'
pl.plot(RES[:,2], '-g^', label='Recovereds')    # original style was '-k'
pl.plot(RES[:,1], '-ro', label='Infectious')
pl.legend(loc=0)
pl.title('SIR epidemic without births or deaths')
pl.xlabel('Time')
pl.ylabel('Susceptibles, Recovereds, and Infectious')
pl.savefig('2.1-SIR-high.png', dpi=900) # dpi=900 increases the resolution of the saved figure
pl.show()
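
With beta = 1.4247 and gamma = 0.14286 the basic reproduction number is R0 = beta/gamma, roughly 10, which is why nearly the whole population ends up in the recovered class. A minimal sketch of that check, reusing beta and gamma from the script above:

R0 = beta / gamma       # basic reproduction number of the SIR model
print R0                # about 9.97, far above the epidemic threshold R0 = 1
print 1.0 - 1.0 / R0    # herd-immunity threshold: roughly 90% of the population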

Second, the Python script for the SIS model:
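
The only change from the SIR script is that there is no recovered class: infected individuals return to the susceptible class at rate gamma, so dS/dt = -beta*S*I + gamma*I and dI/dt = beta*S*I - gamma*I.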

# -*- coding: utf-8 -*-

import scipy.integrate as spi
import numpy as np
import pylab as pl

beta=1.4247
gamma=0.14286
I0=1e-6
ND=70
TS=1.0
INPUT = (1.0-I0, I0)

def diff_eqs(INP,t):
	'''The main set of equations'''
	Y=np.zeros((2))
	V = INP
	Y[0] = - beta * V[0] * V[1] + gamma * V[1]
	Y[1] = beta * V[0] * V[1] - gamma * V[1]
	return Y   # For odeint

t_start = 0.0; t_end = ND; t_inc = TS
t_range = np.arange(t_start, t_end+t_inc, t_inc)
RES = spi.odeint(diff_eqs,INPUT,t_range)

print RES

# Plotting
pl.plot(RES[:,0], '-bs', label='Susceptibles')
pl.plot(RES[:,1], '-ro', label='Infectious')
pl.legend(loc=0)
pl.title('SIS epidemic without births or deaths')
pl.xlabel('Time')
pl.ylabel('Susceptibles and Infectious')
pl.savefig('2.5-SIS-high.png', dpi=900) # This does increase the resolution.
pl.show()
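
Because beta/gamma is well above 1, the SIS epidemic does not die out; it settles at an endemic equilibrium with infectious fraction I* = 1 - gamma/beta. A minimal sketch comparing this analytic value with the end of the numerical run, reusing beta, gamma and RES from the script above:

I_star = 1.0 - gamma / beta   # analytic endemic equilibrium of the SIS model
print I_star                  # about 0.90
print RES[-1, 1]              # infectious fraction at t = 70; should be close to I_star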

Randomly sampling tweets with stream API

Published by chengjun on January 20th, 2013


I want to randomly sample tweets from the Twitter stream, so I turned to Twitter's streaming API.
With the help of the tweepy package for Python, I tried the following script. So far it works pretty well.
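Note that the statuses/sample endpoint used below returns a small random sample of all public tweets rather than the full firehose, which is exactly what random sampling calls for.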


# Twitter API Crawler
# -*- coding: utf-8 -*-

'''
Author: chengjun wang
Email: wangchj04@gmail.com
Hong Kong, 2013/01/20
'''
import sys
import tweepy
import codecs
import time             # needed for time.sleep() and time.clock() below
from time import clock

'''OAuth Authentication'''
consumer_key="xcEI4sb...fi6AzBQ"
consumer_secret="5nfeG8...jUX8nU2pafr4hU"
access_token="37595783-Fazh...8fPaH5IaTlz7y"
access_token_secret="fyqUf5...YijKwvQe3I"

auth1 = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth1.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth1)

'''
# Note: Had you wanted to perform the full OAuth dance instead of using
# an access key and access secret, you could have used the following
# four lines of code instead of the previous line that manually set the
# access token via auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET).
# auth_url = auth.get_authorization_url(signin_with_twitter=True)
# webbrowser.open(auth_url)
# verifier = raw_input('PIN: ').strip()
# auth.get_access_token(verifier)
'''

file = open("C:/Python27/twitter/mydata6.csv",'wb') # save to csv file

print api.me().name # api.update_status('Updating using OAuth authentication via Tweepy!')

start = clock()
print start

'''Specify the stream'''
class StreamListenerChengjun(tweepy.StreamListener):
	def on_status(self, status):
		try:
			tweet = status.text.encode('utf-8')
			tweet = tweet.replace('\n', '\\n')
			user = status.author.screen_name.encode('utf-8')
			userid = status.author.id
			time = status.created_at
			source = status.source
			tweetid = status.id
			timePass = clock()-start
			if timePass%60==0:
				print "I have been working for", timePass, "seconds."
			if not ('RT @' in tweet) :	# Exclude re-tweets
				print >>file, "%s,%s,%s,%s,|%s|,%s" % (userid, user, time, tweetid, tweet, source)

		except Exception, e:
			print >> sys.stderr, 'Encountered Exception:', e
			pass
	def on_error(self, status_code):
		print 'Error: ' + repr(status_code)
		return True # False to stop
	def on_delete(self, status_id, user_id):
		"""Called when a delete notice arrives for a status"""
		print "Delete notice for %s. %s" % (status_id, user_id)
		return
	def on_limit(self, track):
		"""Called when a limitation notice arrvies"""
		print "!!! Limitation notice received: %s" % str(track)
		return
	def on_timeout(self):
		print >> sys.stderr, 'Timeout...'
		time.sleep(10)
		return True

'''Link the tube with tweet stream'''
streamTube = tweepy.Stream(auth=auth1, listener=StreamListenerChengjun(), timeout=300)  # https://github.com/tweepy/tweepy/issues/83
streamTube.sample()

file.close()
pass

timePass = time.clock()-start
print timePass
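
If you want tweets about particular topics rather than a random sample, the same listener can be attached to Twitter's filter endpoint instead. A minimal sketch, assuming the same auth1 and StreamListenerChengjun defined above:

# Track specific keywords instead of taking a random sample
setTerms = ['good', 'goodbye', 'goodnight', 'good morning']
streamTube = tweepy.Stream(auth=auth1, listener=StreamListenerChengjun(), timeout=300)
streamTube.filter(track=setTerms)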


Scraping New York Times & The Guardian using Python

Published by admin on April 23rd, 2012

I have read the blog post about Scraping New York Times Articles with R. It’s great. I want to reproduce the work with Python.
First, we should learn about the New York Times Article Search API.

http://developer.nytimes.com/docs/article_search_api/

Second, we need to register and get the API key, which will be used in the Python script.

http://developer.nytimes.com/apps/register

# !/usr/bin/env python
# -*- coding: UTF-8  -*-
# Scraping New York Times using python
# 20120421@ Canberra
# chengjun wang

import json
import urllib2

'''
About the api and the key, see the links above.
'''

'''step 1: input query information'''
apiUrl='http://api.nytimes.com/svc/search/v1/article?format=json'
query='query=occupy+wall+street'                            # set the query word here
apiDate='begin_date=20110901&end_date=20120214'             # set the date here
fields='fields=body%2Curl%2Ctitle%2Cdate%2Cdes_facet%2Cdesk_facet%2Cbyline'
offset='offset=0'
key='api-key=c2c5b91680.......2811165'  # input your key here

'''step 2: get the number of offset/pages'''
link=[apiUrl, query, apiDate, fields, offset, key]
ReqUrl='&'.join(link)
jstr = urllib2.urlopen(ReqUrl).read()  # t = jstr.strip('()')
ts = json.loads( jstr )
number=ts['total'] #  the number of queries  # query=ts['tokens'] # result=ts['results']
print number
seq=range(number/9)  # rough estimate of the number of result pages (about 10 results per page); not a good way
print seq

'''step 3: crawl the data and dump into csv'''
import csv
addressForSavingData= "D:/Research/Dropbox/tweets/wapor_assessing online opinion/News coverage of ows/nyt.csv"
file = open(addressForSavingData,'wb') # save to csv file
for i in seq:
    nums=str(i)
    offsets=''.join(['offset=', nums]) # I made an error here at first; printing the request URL is a good way to test
    links=[apiUrl, query, apiDate, fields, offsets, key]
    ReqUrls='&'.join(links)
    print "*_____________*", ReqUrls
    jstrs = urllib2.urlopen(ReqUrls).read()
    t = jstrs.strip('()')
    tss= json.loads( t )  # stripping '()' above avoids a "no JSON object could be decoded" error
    result = tss['results']
    for ob in result:
        title=ob['title']  # body=ob['body']   # body,url,title,date,des_facet,desk_facet,byline
        print title
        url=ob['url']
        date=ob['date'] # desk_facet=ob['desk_facet']  # byline=ob['byline'] # some author names don't exist
        w = csv.writer(file,delimiter=',',quotechar='|', quoting=csv.QUOTE_MINIMAL)
        w.writerow((date, title, url)) # write it out
file.close()
pass

See the result in the saved nyt.csv file.
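
Because the rows were written with delimiter=',' and quotechar='|', the same settings are needed to read them back. A minimal sketch of checking the output, assuming the same file path used above:

import csv
f = open("D:/Research/Dropbox/tweets/wapor_assessing online opinion/News coverage of ows/nyt.csv", 'rb')
for date, title, url in csv.reader(f, delimiter=',', quotechar='|'):
    print date, title
f.close()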

Similarly, you can crawl the article data from The Guardian. See the link below.

http://explorer.content.guardianapis.com/#/?format=json&order-by=newest

After you have registered your app and obtained the key, we can work on the Python script.


# !/usr/bin/env python
# -*- coding: UTF-8  -*-
# Scraping The Guardian using Python
# 20120421@ Canberra
# chengjun wang

import json
import urllib2

'''
http://content.guardianapis.com/search?q=occupy+wall+street&from-date=2011-09-01&to-date=2012-02-14&page=2
&page-size=3&format=json&show-fields=all&use-date=newspaper-edition&api-key=m....g33gzq
'''

'''step 1: input query information'''
apiUrl='http://content.guardianapis.com/search?q=occupy+wall+street' # set the query word here
apiDate='from-date=2011-09-01&to-date=2011-10-14'                     # set the date here
apiPage='page=2'      # set the page
apiNum=10             # set the number of articles in one page
apiPageSize=''.join(['page-size=',str(apiNum)])
fields='format=json&show-fields=all&use-date=newspaper-edition'
key='api-key=mudfuj...g33gzq'  # input your key here

'''step 2: get the number of offset/pages'''
link=[apiUrl, apiDate, apiPage, apiPageSize, fields, key]
ReqUrl='&'.join(link)
jstr = urllib2.urlopen(ReqUrl).read()  # t = jstr.strip('()')
ts = json.loads( jstr )
number=ts['response']['total'] #  the number of queries  # query=ts['tokens'] # result=ts['results']
print number
seq=range(number/(apiNum-1))  # rough estimate of the number of result pages; not a good way
print seq

'''step 3: crawl the data and dump into csv'''
import csv
addressForSavingData= "D:/Research/Dropbox/tweets/wapor_assessing online opinion/News coverage of ows/guardian.csv"
file = open(addressForSavingData,'wb') # save to csv file
for i in seq:
    nums=str(i+1)
    apiPages=''.join(['page=', nums]) # I made an error here at first; printing the request URL is a good way to test
    links= [apiUrl, apiDate, apiPages, apiPageSize, fields, key]
    ReqUrls='&'.join(links)
    print "*_____________*", ReqUrls
    jstrs = urllib2.urlopen(ReqUrls).read()
    t = jstrs.strip('()')
    tss= json.loads( t )
    result = tss['response']['results']
    for ob in result:
        title=ob['webTitle'].encode('utf-8')  # body=ob['body']
        print title
        section=ob["sectionName"].encode('utf-8')
        url=ob['webUrl']
        date=ob['fields']['newspaperEditionDate'] # date=ob['webPublicationDate']  # byline=ob['fields']['byline']
        w = csv.writer(file,delimiter=',',quotechar='|', quoting=csv.QUOTE_MINIMAL)
        w.writerow((date, title, section, url)) # write it out
file.close()
pass
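
Joining hand-formatted 'key=value' pieces with '&' works, but it is easy to mistype a parameter that way. A minimal sketch of building the same Guardian request URL with urllib.urlencode from the standard library; the parameter values mirror the ones above, and 'YOUR-API-KEY' is a placeholder for your own key:

import urllib
params = {'q': 'occupy wall street',
          'from-date': '2011-09-01',
          'to-date': '2011-10-14',
          'page': 1,
          'page-size': 10,
          'format': 'json',
          'show-fields': 'all',
          'use-date': 'newspaper-edition',
          'api-key': 'YOUR-API-KEY'}
ReqUrl = 'http://content.guardianapis.com/search?' + urllib.urlencode(params)
print ReqUrl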


Learning To Do Sentiment Analysis Using Python & NLTK

Published by admin on March 18th, 2012

This is my first try at learning sentiment analysis using Python. I am glad to know NLTK can distinguish ‘like’ from ‘not like’. It’s great. I wonder how it compares with R.

The method below follows the procedure illustrated in the figure from the original post by Laurent Luce (linked in the script header below).

The model uses a naive Bayes classifier.
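Naive Bayes scores each label by multiplying the prior probability of the label by the probability of each observed feature given that label, assuming the features are independent: P(label | features) is proportional to P(label) times the product of P(feature | label) over all features, and the label with the highest score is returned.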

# nltkTest.py
# Twitter sentiment analysis using Python and NLTK
# original author: Laurent Luce
# Reproduced by chengjun wang to test the validity
# 20120319@Canberra

# find the original post by Laurent Luce following the link below:
# http://www.laurentluce.com/posts/twitter-sentiment-analysis-using-python-and-nltk/

import nltk

pos_tweets = [('I love this car', 'positive'),
	('This view is amazing', 'positive'),
	('I feel great this morning', 'positive'),
	('I am so excited about the concert', 'positive'),
	('He is my best friend', 'positive')]

neg_tweets = [('I do not like this car', 'negative'),
	('This view is horrible', 'negative'),
	('I feel tired this morning', 'negative'),
	('I am not looking forward to the concert', 'negative'),
	('He is my enemy', 'negative')]

tweets = []
for (words, sentiment) in pos_tweets + neg_tweets:
	words_filtered = [e.lower() for e in words.split() if len(e) >= 3]
	tweets.append((words_filtered, sentiment))

# print tweets
# print to see the result

test_tweets = [
	(['feel', 'happy', 'this', 'morning'], 'positive'),
	(['larry', 'friend'], 'positive'),
	(['not', 'like', 'that', 'man'], 'negative'),
	(['house', 'not', 'great'], 'negative'),
	(['your', 'song', 'annoying'], 'negative')]

# print test_tweets

# The list of word features needs to be extracted from the tweets. It is a list of every distinct word
# ordered by frequency of appearance. We use the following function to get the list, plus the two helper
# functions.

def get_words_in_tweets(tweets):
	all_words = []
	for (words, sentiment) in tweets:
		all_words.extend(words)
	return all_words
def get_word_features(wordlist):
	wordlist = nltk.FreqDist(wordlist)
	word_features = wordlist.keys()
	return word_features
# what does word_features do?
word_features = get_word_features(get_words_in_tweets(tweets))
# print word_features

# To create a classifier, we need to decide what features are relevant. To do that, we first need a
# feature extractor. The one we are going to use returns a dictionary indicating what words are
# contained in the input passed. Here, the input is the tweet. We use the word features list defined
# above along with the input to create the dictionary.

def extract_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
      features['contains(%s)' % word] = (word in document_words)
    return features

# call the feature extractor with the document ['love', 'this', 'car']
# document=['love', 'this', 'car']
# features = extract_features(document)
# print features

training_set = nltk.classify.util.apply_features(extract_features, tweets)
# print training_set
# be careful here, it should be nltk.classify.util.apply_features rather than nltk.classify.apply_features
# apply the features to our classifier using the method apply_features.
# We pass the feature extractor along with the tweets list defined above.

# The variable training_set contains the labeled feature sets. It is a list of tuples, each containing
# the feature dictionary and the sentiment string for a tweet. The sentiment string is also called
# the label.

classifier = nltk.NaiveBayesClassifier.train(training_set)
# A look inside the classifier's train method in the NLTK source code. The def below is an abbreviated
# sketch: label_freqdist and the full feature_probdist are built from labeled_featuresets in the real
# implementation and are omitted here, so it is not runnable as-is.

def train(labeled_featuresets, estimator=nltk.probability.ELEProbDist):
    # Create the P(label) distribution
    label_probdist = estimator(label_freqdist)
    # Create the P(fval|label, fname) distribution
    feature_probdist = {}
    return NaiveBayesClassifier(label_probdist, feature_probdist)

# print label_probdist.prob('positive')
# print label_probdist.prob('negative')

# print feature_probdist
# print feature_probdist[('negative', 'contains(best)')].prob(True)

# print classifier.show_most_informative_features(32)
# show_most_informative_features

tweet = 'Larry is not my friend'
# print classifier.classify(extract_features(tweet.split()))

# take a look at how the classify method works internally in the NLTK library. What we pass to the classify method is the feature set of
# the tweet we want to analyze. The feature set dictionary indicates that the tweet contains the word "friend".
print extract_features(tweet.split()), '\n'

# def classify(self, featureset):
    # Discard any feature names that we've never seen before.
    # Find the log probability of each label, given the features.
	# {'positive': -1.0, 'negative': -1.0}
	# Then add in the log probability of features given labels.
	# {'positive': -5.4785441837188511, 'negative': -14.784261334886439}
    # Generate a probability distribution dictionary using the dict logprod
	# DictionaryProbDist(logprob, normalize=True, log=True)
    # Return the sample with the greatest probability from the probability
    # distribution dictionary

'''Take the following test tweet: 'Your song is annoying'. The classifier thinks it is positive.
The reason is that we don't have any information on the feature word 'annoying'.
The larger the set of training tweets, the better the classifier will be.'''

tweet = 'Your song is annoying'
print classifier.classify(extract_features(tweet.split()))
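
To get a rough sense of how the tiny classifier generalizes, the held-out test_tweets defined above can be converted to feature sets in the same way and scored with NLTK's accuracy helper. A minimal sketch, reusing extract_features, test_tweets and classifier from above:

# Evaluate the classifier on the small held-out test set
test_set = nltk.classify.util.apply_features(extract_features, test_tweets)
print nltk.classify.util.accuracy(classifier, test_set)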

'''find the original post by Laurent Luce following the link below:
http://www.laurentluce.com/posts/twitter-sentiment-analysis-using-python-and-nltk/'''