Randomly sampling tweets with stream API

Published by chengjun on January 20th, 2013

information ocean

I want to randomly sample the Twitter stream, so I turned to Twitter's streaming API.
With the help of the tweepy package for Python, I tried the following script. So far it works pretty well.

# Twitter API Crawler
# -*- coding: utf-8 -*-

# Author: chengjun wang
# Email: wangchj04@gmail.com
# Hong Kong, 2013/01/20
import sys
import tweepy
import codecs
from time import clock

'''OAuth Authentication'''

auth1 = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth1.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth1)

# Note: Had you wanted to perform the full OAuth dance instead of using
# an access key and access secret, you could have uses the following
# four lines of code instead of the previous line that manually set the
# access token via auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET).
# auth_url = auth.get_authorization_url(signin_with_twitter=True)
# webbrowser.open(auth_url)
# verifier = raw_input('PIN: ').strip()
# auth.get_access_token(verifier)

file = open("C:/Python27/twitter/mydata6.csv",'wb') # save to csv file

print api.me().name # api.update_status('Updating using OAuth authentication via Tweepy!')

start = clock()
print start

'''Specify the stream'''
class StreamListenerChengjun(tweepy.StreamListener):
	def on_status(self, status):
			tweet = status.text.encode('utf-8')
			tweet = tweet.replace('\n', '\\n')
			user = status.author.screen_name.encode('utf-8')
			userid = status.author.id
			time = status.created_at
			source = status.source
			tweetid = status.id
			timePass = clock()-start
			if timePass%60==0:
				print "I have been working for", timePass, "seconds."
			if not ('RT @' in tweet) :	# Exclude re-tweets
				print >>file, "%s,%s,%s,%s,|%s|,%s" % (userid, user, time, tweetid, tweet, source)

		except Exception, e:
			print >> sys.stderr, 'Encountered Exception:', e
	def on_error(self, status_code):
		print 'Error: ' + repr(status_code)
		return True # False to stop
	def on_delete(self, status_id, user_id):
		"""Called when a delete notice arrives for a status"""
		print "Delete notice for %s. %s" % (status_id, user_id)
	def on_limit(self, track):
		"""Called when a limitation notice arrvies"""
		print "!!! Limitation notice received: %s" % str(track)
	def on_timeout(self):
		print >> sys.stderr, 'Timeout...'
		return True

'''Link the tube with tweet stream'''
streamTube = tweepy.Stream(auth=auth1, listener=StreamListenerChengjun(), timeout= 300)  # https://github.com/tweepy/tweepy/issues/83 # setTerms = ['good', 'goodbye', 'goodnight', 'good morning'] # streamer.filter(track = setTerms)


timePass = time.clock()-start
print timePass


Leave a Reply

Your email address will not be published. Required fields are marked *



0 visitors online now
0 guests, 0 bots, 0 members