#! /usr/bin/python -t
# _*_ coding: UTF-8 _*_
# Last edited on 2011-09-07 00:09:46 by stolfilocal

# Program identification.  {PROG_NAME} and {PROG_DESC} are spliced into
# the usage and manual texts ({PROG_HELP}, {PROG_INFO}) defined further down;
# {PROG_VERS} is not referenced in the visible part of this file.
PROG_NAME = "cleanup-raw-tweets.py"
PROG_DESC = "Converts raw tweets (copy-pasted from browser) to cooked format"
PROG_VERS = "1.0"

import sys
import re
import os
sys.path[1:0] = [ sys.path[0] + '/../lib', os.path.expandvars('${STOLFIHOME}/lib'), '.' ] 
import argparser; from argparser import ArgParser

from decimal import *

# Copyright notice; inserted in the "RIGHTS" section of {PROG_INFO}.
PROG_COPYRIGHT = "Copyright © 2011-09-06 by the State University of Campinas (UNICAMP)"

# Command line synopsis.  It is handed to the {ArgParser} constructor,
# embedded in the "SYNOPSIS" section of {PROG_INFO}, and printed by
# {arg_error} after "usage: ".  The literal backslash-newline pairs in
# the text make the synopsis display as a multi-line shell command.
PROG_HELP = \
  PROG_NAME+ " \\\n" \
  "    -grabDate {DATE} \\\n" \
  +argparser.help_info_HELP+ " \\\n" \
  "  [<] INFILE \\\n" \
  "  > OUTFILE"
  
# Full manual-page style description of the program, also handed to the
# {ArgParser} constructor (presumably displayed by the documentation
# options described by {argparser.help_info_INFO} -- confirm in {argparser}).
# NOTE(review): the DESCRIPTION text contains typos ("Reads a from",
# "ommitted"); they are left untouched here because they are part of
# the text shown to the user.
PROG_INFO = \
  "NAME\n" \
  "  " +PROG_NAME+ " - " +PROG_DESC+  ".\n" \
  "\n" \
  "SYNOPSIS\n" \
  "  " +PROG_HELP+ "\n" \
  "\n" \
  "DESCRIPTION\n" \
  "  Reads a from {INFILE} (or {stdin} if ommitted) a tweet file" \
  " obtained by copy-paste from the Twitter" \
  " browser interface.  Outputs a reformatted tweet list.\n" \
  "\n" \
  "OPTIONS\n" \
  "\n" \
  "  -grabDate {DATE}\n" \
  "    Specifies the date at which the tweets were grabbed, in the format \"{yyyy}-{mm}-{dd}-{HH}{MM}{SS}\".\n" \
  "\n" \
  "DOCUMENTATION OPTIONS\n" \
  +argparser.help_info_INFO+ "\n" \
  "\n" \
  "SEE ALSO\n" \
  "  Fermat's last theorem.\n" \
  "\n" \
  "AUTHOR\n" \
  "  Created 2011-09-06 by Jorge Stolfi, IC-UNICAMP.\n" \
  "\n" \
  "MODIFICATION HISTORY\n" \
  "  2011-09-06 by J. Stolfi, IC-UNICAMP: created.\n" \
  "\n" \
  "WARRANTY\n" \
  "  " +argparser.help_info_NO_WARRANTY+ "\n" \
  "\n" \
  "RIGHTS\n" \
  "  " +PROG_COPYRIGHT+ ".\n" \
  "\n" \
  "  " +argparser.help_info_STANDARD_RIGHTS

# TESTS:
# sys.stderr.write(txtable.insert_thsep_int("123456789","/") + "\n")

# FOR REAL:
  
class State :
  "Parsing state of the tweet reader: a debug flag, the output counter, the current phase, and the fields of the tweet being assembled."

  # When true, {output_tweet} writes a "###" marker line before each tweet:
  debug = False

  # How many tweets have been written to the output so far:
  n_tweets_written = 0

  # Where the parser stands inside a tweet:
  #  -1 = between tweets;
  #   0 = the "»" line has been read;
  #   1 = the first header line (sender name) has been read;
  #   2 = the second header line (sender ID and name again) has been read;
  #   3 = the optional "retweeted by" line has been read;
  #   4 = the optional isolated "@" line has been read;
  #   5 = the tweet text line has been read;
  #   6 = the time and buttons line has been read.
  # The class default is 0; the main program resets it to -1 before
  # it starts reading the input.
  phase = 0

  # Fields of the tweet under construction, all unset initially:
  #   {sender_uid}    user ID of the sender (with "@");
  #   {sender_name}   name of the sender;
  #   {when_sent}     time when sent (may be relative or incomplete);
  #   {tweet_text}    text of the tweet;
  #   {retweeted_by}  user ID of the retweeter (with "@").
  sender_uid = sender_name = when_sent = tweet_text = retweeted_by = None

def parse_args(pp) :
  """Extracts this program's options from the command line.

  Takes an {ArgParser} instance {pp} whose arguments are still
  unparsed.  Returns the triple {grabDate,inFile,err}: {grabDate} is
  the value following "-grabDate", or {None} if that keyword is
  absent; {inFile} is the next argument not yet parsed, or {None} if
  there is none; {err} is always {None}."""

  # Optional keyword followed by its value:
  grabDate = pp.get_next() if pp.keyword_present("-grabDate") else None

  # Presumably moves {pp.next} past the arguments already consumed --
  # confirm against the {argparser} module:
  pp.skip_parsed()

  # An argument that is still unparsed is taken to be the input file name:
  has_leftover = (pp.next < pp.argc) and (not pp.parsed[pp.next])
  inFile = pp.get_next() if has_leftover else None

  pp.finish()

  # No error is ever detected here; {err} only completes the result triple.
  err = None
  return grabDate, inFile, err
  # ----------------------------------------------------------------------

# COMMAND ARGUMENT PARSING AND INPUT SETUP
pp = ArgParser(sys.argv, sys.stderr, PROG_HELP, PROG_INFO)
(grabDate, inFile, err) = parse_args(pp)

# Choose the input stream.  Without an input file argument the program
# reads {stdin}, and {inFile} gets a stand-in name so that the messages
# below and the error reports still name a source.
if inFile is not None :
  rd = open(inFile)
else :
  inFile = "/dev/stdin"
  rd = sys.stdin
sys.stderr.write("reading from %s ...\n" % inFile)

# Parser state, and count of input lines read (used in error messages):
state = State()
n_lines_read = 0

def data_error(msg) :
  "Reports {msg} on {stderr} as an error in the current input line (prefixed by file name and line count), then exits with status 1."
  report = "%s:%d: ** %s\n" % (inFile, n_lines_read, msg)
  sys.stderr.write(report)
  sys.exit(1)

def prog_error(msg) :
  "Reports {msg} on {stderr} as a programming error (prefixed by file name and line count), then exits with status 1."
  report = "%s:%d: ** prog error - %s\n" % (inFile, n_lines_read, msg)
  sys.stderr.write(report)
  sys.exit(1)
 
def arg_error(msg):
  "Reports {msg} on {stderr} as a command line error, followed by the usage synopsis {PROG_HELP}; then exits with status 1."
  complaint = "** %s\n" % msg
  synopsis = "usage: %s\n" % PROG_HELP
  for text in (complaint, synopsis) :
    sys.stderr.write(text)
  sys.exit(1)

def clear_tweet(state) :
  "Forgets the half-parsed tweet in {state} by setting all its tweet fields to {None}.  The phase, counter and debug flag are left alone."
  tweet_fields = ("sender_uid", "sender_name", "when_sent", "tweet_text", "retweeted_by")
  for field in tweet_fields :
    setattr(state, field, None)
  # ----------------------------------------------------------------------
  
  
def output_tweet(state) :
  """Writes the tweet stored in {state} to {stdout} and counts it.

  The output consists of a line "{when_sent} at {grabDate}", a line
  "{sender_uid} [{sender_name}]" (with " RT by {retweeted_by}" appended
  when the tweet was retweeted), the tweet text, and a separator line
  of dashes.  If {state.debug} is set, a "###" line precedes them.
  All text is encoded as UTF-8.

  Uses the module-level {grabDate} (the "-grabDate" argument) and, in
  one error message, the module-level current input line {lin}.

  Aborts through {data_error} if {state.phase} is not 6 or the tweet
  has no text, and through {arg_error} if "-grabDate" was not given."""

  # Validate everything before counting or writing anything:
  if state.phase != 6 :
    data_error("bad phase for output = %d" % state.phase)

  if state.tweet_text is None :
    data_error("missing text line in phase %d\n  «[%s]»" % (state.phase, lin.encode("utf_8")))

  # {parse_args} yields {None} when "-grabDate" is omitted; report that
  # as a usage error instead of failing on {grabDate.encode} below.
  if grabDate is None :
    arg_error("missing \"-grabDate\" argument")

  state.n_tweets_written += 1

  if state.debug :
    sys.stdout.write("###\n")

  # Time line:
  sys.stdout.write("%s at %s\n"% (state.when_sent.encode("utf_8"), grabDate.encode("utf_8")))

  # Sender line, with the retweeter if any:
  sys.stdout.write("%s [%s]"% (state.sender_uid.encode("utf_8"), state.sender_name.encode("utf_8")))
  if state.retweeted_by is not None :
    sys.stdout.write(" RT by %s" % state.retweeted_by.encode("utf_8"))
  sys.stdout.write("\n")

  # Text and separator:
  sys.stdout.write("%s\n" % state.tweet_text.encode("utf_8"))
  sys.stdout.write("----------------------------------------------------------------------\n")
  # ----------------------------------------------------------------------

# LOOP ON INPUT LINES
#
# Reads the input one line at a time and runs a small state machine
# driven by {state.phase} (see the phase table in {State}).  Each
# complete tweet is written by {output_tweet}; any line that does not
# fit the current phase aborts the program through {data_error}.

# Line patterns, compiled once rather than looked up for every line:
pat_comment = re.compile(u"^[/][/]")
pat_name = re.compile(u"^[a-zA-Z0-9_]")
pat_uid_and_name = re.compile(u"^[a-zA-Z0-9_]+[ ]+[a-zA-Z0-9_]")
pat_blanks = re.compile(u"[ ]+")
pat_retweeted_by = re.compile(u"^by [*a-zA-Z0-9_]+$")
pat_lone_at = re.compile(u"^[@]+[ ]*$")
pat_buttons = re.compile(u"[ ](Favorite|Unfavorite)([ ](Reply|Undo|Delete|Retweet))+[ ]*$")
pat_favorite = re.compile(u"[ ](Favorite|Unfavorite)[ ]")
pat_blank_line = re.compile(u"^[ ]*$")

clear_tweet(state)
state.phase = -1
while True :
  lin = rd.readline()
  if lin == "" :
    # End of input:
    break
  lin = unicode(lin.strip("\n"),"utf_8")
  n_lines_read += 1
  if pat_comment.search(lin) and (state.n_tweets_written == 0) and (state.phase == -1) :
    # Comment (allowed only at beginning of file, since tweets can begin with "//"):
    pass
  elif (lin == u"»") and (state.phase == -1) :
    # Marker line "»": the header lines of a tweet are expected next:
    state.phase = 0
  elif pat_name.search(lin) and (state.phase == 0) :
    # First header line - user name:
    state.sender_name = lin
    state.phase = 1
  elif pat_uid_and_name.search(lin) and (state.phase == 1) :
    # Second header line - user id and user name:
    flds = pat_blanks.split(lin,1)
    if len(flds) != 2 :
      prog_error("unexpected split count %d\n  «[%s]»" % (len(flds), lin.encode("utf_8")))
    state.sender_uid = "@" + flds[0]
    # The name must repeat the one given on the first header line:
    if state.sender_name != flds[1] :
      data_error("inconsistent user name «%s» «%s»" % (state.sender_name.encode("utf_8"), flds[1].encode("utf_8")))
    state.phase = 2
  elif pat_retweeted_by.search(lin) and (state.phase == 2) :
    # Retweeted-by line; drop the leading "by ":
    state.retweeted_by = "@" + lin[3:]
    state.phase = 3
  elif pat_lone_at.search(lin) and ((state.phase == 2) or (state.phase == 3)) :
    # Isolated "@", ignore it:
    state.phase = 4
  elif pat_buttons.search(lin) and (state.phase == 5) :
    # Time and buttons line; the time is whatever precedes the first button.
    # The split pattern has one group, so a single split yields three fields:
    flds = pat_favorite.split(lin,1)
    if len(flds) != 3 :
      prog_error("unexpected split count %d\n  «[%s]»" % (len(flds), lin.encode("utf_8")))
    state.when_sent = flds[0]
    state.phase = 6
    # Output tweet and reset state:
    output_tweet(state)
    clear_tweet(state)
    state.phase = -1
  elif (state.phase == 2) or (state.phase == 3) or (state.phase == 4) :
    # Assume it is a text line:
    state.tweet_text = lin
    state.phase = 5
  elif pat_blank_line.search(lin) :
    # Blank line, ignore:
    pass
  else:
    data_error("invalid line format for phase %d\n  «[%s]»" % (state.phase, lin.encode("utf_8")))

# Release the input file (but leave {stdin} alone):
if rd is not sys.stdin :
  rd.close()

# The input must end between tweets.  The message needs the "%d":
# without a conversion specifier the "%" operator raises {TypeError}
# instead of reporting the problem.
if state.phase != -1 :
  data_error("last tweet is incomplete, phase = %d" % state.phase)

sys.stderr.write("%d lines read.\n" % n_lines_read)
sys.stderr.write("%d tweets written.\n" % state.n_tweets_written)
sys.stderr.write("done.\n")