#! /usr/bin/python -t
# _*_ coding: UTF-8 _*_
# Last edited on 2011-09-07 00:09:46 by stolfilocal

# Converts a raw tweet listing, copy-pasted from the Twitter browser
# interface, into a compact "cooked" text format written to stdout.
# This is Python 2 code: it relies on {unicode} and on byte-string stdout.

PROG_NAME = "cleanup-raw-tweets.py"
PROG_DESC = "Converts raw tweets (copy-pasted from browser) to cooked format"
PROG_VERS = "1.0"

import sys
import re
import os

# Make the private libraries importable before importing {argparser}.
sys.path[1:0] = [
    sys.path[0] + '/../lib',
    os.path.expandvars('${STOLFIHOME}/lib'),
    '.',
]
import argparser; from argparser import ArgParser
from decimal import *

PROG_COPYRIGHT = "Copyright © 2011-09-06 by the State University of Campinas (UNICAMP)"

# NOTE(review): the leading blanks inside the help strings below look like
# they were collapsed to a single space in this copy of the file; confirm
# the intended indentation against the original before release.
PROG_HELP = (
    PROG_NAME + " \\\n"
    " -grabDate {DATE} \\\n"
    + argparser.help_info_HELP + " \\\n"
    " [<] INFILE \\\n"
    " > OUTFILE"
)

PROG_INFO = (
    "NAME\n"
    " " + PROG_NAME + " - " + PROG_DESC + ".\n"
    "\n"
    "SYNOPSIS\n"
    " " + PROG_HELP + "\n"
    "\n"
    "DESCRIPTION\n"
    " Reads a from {INFILE} (or {stdin} if ommitted) a tweet file"
    " obtained by copy-paste from the Twitter"
    " browser interface. Outputs a reformatted tweet list.\n"
    "\n"
    "OPTIONS\n"
    "\n"
    " -grabDate {DATE}\n"
    " Specifies the date at which the tweets were grabbed, in the format \"{yyyy}-{mm}-{dd}-{HH}{MM}{SS}\".\n"
    "\n"
    "DOCUMENTATION OPTIONS\n"
    + argparser.help_info_INFO + "\n"
    "\n"
    "SEE ALSO\n"
    " Fermat's last theorem.\n"
    "\n"
    "AUTHOR\n"
    " Created 2011-09-06 by Jorge Stolfi, IC-UNICAMP.\n"
    "\n"
    "MODIFICATION HISTORY\n"
    " 2011-09-06 by J. Stolfi, IC-UNICAMP: created.\n"
    "\n"
    "WARRANTY\n"
    " " + argparser.help_info_NO_WARRANTY + "\n"
    "\n"
    "RIGHTS\n"
    " " + PROG_COPYRIGHT + ".\n"
    "\n"
    " " + argparser.help_info_STANDARD_RIGHTS
)


class State:
    "Parser state: the current phase plus the fields of the half-parsed tweet."

    debug = False  # If true, a "###" line is written before each tweet.
    n_tweets_written = 0  # Number of tweets processed.

    # Parsing phase:
    #   -1 = between tweets;
    #    0 = after "»" line
    #    1 = after first header line (sender name)
    #    2 = after second header line (sender ID and name again)
    #    3 = after optional "retweeted by" line
    #    4 = after optional isolated "@" line
    #    5 = after tweet text line
    #    6 = after time and buttons line
    phase = 0

    # Half-parsed tweet:
    sender_uid = None    # User ID of sender (with "@").
    sender_name = None   # Name of sender.
    when_sent = None     # Time when sent (may be relative or incomplete).
    tweet_text = None    # Text of the tweet.
    retweeted_by = None  # User ID of retweeter (with "@").


def parse_args(pp):
    """Parses command line arguments.

    Expects an {ArgParser} instance containing the arguments,
    still unparsed.  Returns {grabDate,inFile,err} where
    {grabDate} and {inFile} may be {None}.  In this version
    {err} is always {None}."""
    # Being optimistic:
    err = None

    if pp.keyword_present("-grabDate"):
        grabDate = pp.get_next()
    else:
        grabDate = None

    pp.skip_parsed()

    # An optional positional argument names the input file.
    if (pp.next < pp.argc) and (not pp.parsed[pp.next]):
        inFile = pp.get_next()
    else:
        inFile = None

    pp.finish()
    return grabDate, inFile, err


def data_error(msg):
    "Prints the error message {msg} about the current input line, and aborts."
    sys.stderr.write("%s:%d: ** %s\n" % (inFile, n_lines_read, msg))
    sys.exit(1)


def prog_error(msg):
    "Prints the error message {msg} about a programming error, and aborts."
    sys.stderr.write("%s:%d: ** prog error - %s\n" % (inFile, n_lines_read, msg))
    sys.exit(1)


def arg_error(msg):
    "Prints the error message {msg} about the command line arguments, and aborts."
    sys.stderr.write("** %s\n" % msg)
    sys.stderr.write("usage: %s\n" % PROG_HELP)
    sys.exit(1)


def clear_tweet(state):
    "Clears the current tweet (all five tweet fields of {state} become {None})."
    state.sender_uid = None
    state.sender_name = None
    state.when_sent = None
    state.tweet_text = None
    state.retweeted_by = None


# ----------------------------------------------------------------------

def output_tweet(state):
    """Outputs to {stdout} the current tweet.

    Requires {state.phase == 6}; aborts through {data_error} otherwise,
    or if the tweet has no text.  Increments {state.n_tweets_written}.
    Reads the module-level {grabDate} and, for error messages only,
    the current input line {lin}."""
    if state.phase != 6:
        data_error("bad phase for output = %d" % state.phase)
    state.n_tweets_written += 1
    if state.debug:
        sys.stdout.write("###\n")
    if state.tweet_text is None:
        data_error("missing text line in phase %d\n «[%s]»" % (state.phase, lin.encode("utf_8")))
    # Line 1: time sent and grab date.
    sys.stdout.write("%s at %s\n" % (state.when_sent.encode("utf_8"), grabDate.encode("utf_8")))
    # Line 2: sender, and retweeter if any.
    sys.stdout.write("%s [%s]" % (state.sender_uid.encode("utf_8"), state.sender_name.encode("utf_8")))
    if state.retweeted_by is not None:
        sys.stdout.write(" RT by %s" % state.retweeted_by.encode("utf_8"))
    sys.stdout.write("\n")
    # Line 3: the text, followed by a separator rule.
    sys.stdout.write("%s\n" % state.tweet_text.encode("utf_8"))
    sys.stdout.write("----------------------------------------------------------------------\n")


# ----------------------------------------------------------------------
# COMMAND ARGUMENT PARSING

pp = ArgParser(sys.argv, sys.stderr, PROG_HELP, PROG_INFO)
grabDate, inFile, err = parse_args(pp)

# {output_tweet} needs the grab date for every tweet; fail early with a
# usage message instead of crashing on {None} at the first tweet.
if grabDate is None:
    arg_error("missing required option \"-grabDate {DATE}\"")

if inFile is None:
    rd = sys.stdin
    inFile = "/dev/stdin"
else:
    rd = open(inFile)

sys.stderr.write("reading from %s ...\n" % inFile)

n_lines_read = 0
state = State()

# ----------------------------------------------------------------------
# LOOP ON INPUT LINES

clear_tweet(state)
state.phase = -1

# {readline} returns "" only at end of file (a blank line is "\n").
for raw_line in iter(rd.readline, ""):
    lin = unicode(raw_line.strip("\n"), "utf_8")
    n_lines_read += 1

    if re.search(u"^[/][/]", lin) and (state.n_tweets_written == 0) and (state.phase == -1):
        # Comment (allowed only at beginning of file, since tweets can begin with "//"):
        pass
    elif (lin == u"»") and (state.phase == -1):
        # The "»" line that precedes the header of a tweet:
        state.phase = 0
    elif re.search(u"^[a-zA-Z0-9_]", lin) and (state.phase == 0):
        # First header line - user name:
        state.sender_name = lin
        state.phase = 1
    elif re.search(u"^[a-zA-Z0-9_]+[ ]+[a-zA-Z0-9_]", lin) and (state.phase == 1):
        # Second header line - user id and user name:
        flds = re.split(u"[ ]+", lin, 1)
        if len(flds) != 2:
            prog_error("unexpected split count %d\n «[%s]»" % (len(flds), lin.encode("utf_8")))
        state.sender_uid = "@" + flds[0]
        # The name must repeat the one seen on the first header line.
        if state.sender_name != flds[1]:
            data_error("inconsistent user name «%s» «%s»" % (state.sender_name.encode("utf_8"), flds[1].encode("utf_8")))
        state.phase = 2
    elif re.search(u"^by [*a-zA-Z0-9_]+$", lin) and (state.phase == 2):
        # Retweeted-by line; drop the leading "by ":
        state.retweeted_by = "@" + lin[3:]
        state.phase = 3
    elif re.search(u"^[@]+[ ]*$", lin) and ((state.phase == 2) or (state.phase == 3)):
        # Isolated "@", ignore it:
        state.phase = 4
    elif re.search(u"[ ](Favorite|Unfavorite)([ ](Reply|Undo|Delete|Retweet))+[ ]*$", lin) and (state.phase == 5):
        # Time and buttons line.  The pattern has one capturing group, so a
        # single split yields three fields: time, matched word, remainder.
        flds = re.split(u"[ ](Favorite|Unfavorite)[ ]", lin, 1)
        if len(flds) != 3:
            prog_error("unexpected split count %d\n «[%s]»" % (len(flds), lin.encode("utf_8")))
        state.when_sent = flds[0]
        state.phase = 6
        # Output tweet and reset state:
        output_tweet(state)
        clear_tweet(state)
        state.phase = -1
    elif (state.phase == 2) or (state.phase == 3) or (state.phase == 4):
        # Assume it is a text line:
        state.tweet_text = lin
        state.phase = 5
    elif re.search(u"^[ ]*$", lin):
        # Blank line, ignore:
        pass
    else:
        data_error("invalid line format for phase %d\n «[%s]»" % (state.phase, lin.encode("utf_8")))

# Release the input file (but never close the shared {stdin}).
if rd is not sys.stdin:
    rd.close()

if state.phase != -1:
    data_error("last tweet is incomplete, phase = %d" % state.phase)

sys.stderr.write("%d lines read.\n" % n_lines_read)
sys.stderr.write("%d tweets written.\n" % state.n_tweets_written)
sys.stderr.write("done.\n")