#!/usr/bin/perl -w
#
# Twitter-Timeline in Dateien oder in einer Datenbank (SQLite) sichern.
# Copyright (c) 2009 Oliver Lau <oliver@von-und-fuer-lau.de>
# Alle Rechte vorbehalten.
#
# $Id: twitterbak.pl 4cc17f2f5053 2009/08/12 07:54:35 Oliver Lau <oliver@von-und-fuer-lau.de> $

use utf8;
use strict;
use warnings;
use XML::Simple;
use Getopt::Long;
use LWP;
use DBI;
use Config::IniFiles;
use Cwd;
use HTML::Entities qw(decode_entities);

# Directory searched for the configuration file and used as the base of the
# default output path: %USERPROFILE% on native Windows, $HOME elsewhere.
my $homedir;
if ($^O eq 'MSWin32') {
    $homedir = $ENV{'USERPROFILE'};
}
else {
    $homedir = $ENV{'HOME'};
}

# Configuration file path (--config); looked up later when left undefined.
my $config_file;

# authorization data for twitter.com (filled from the configuration file)
my $username;
my $password;
# Restrict the backup to the user's own timeline (--only-user).
my $only_user;

# Paging parameters appended to the request URLs (--page, --count).
my $tweet_page;
my $tweet_count;

# database settings
my $db_name;                # --db-name; derived from the username when unset
my $db_user;                # handed to DBI->connect; never assigned in this file
my $db_pass;                # handed to DBI->connect; never assigned in this file
my $db_driver = 'SQLite';

# miscellaneous settings
my $initial_backup = 0;                 # --init
my $force_write    = 0;                 # --force-write
my $no_bom         = 0;                 # --no-bom
my $output_path;                        # --path
my $output_string  = 'plain,xml,db';    # --outputs

# don't touch the following code ;-)
my $VERSION = '1.1.2';
my $VERBOSE = 1;
my $DEBUG   = 0;
my $HELP    = 0;
my $QUIET   = 0;

# Forward declarations of the subroutines defined further down, so they can
# be referenced before their definitions are compiled.
# The former "disclaimer" entry was dropped: no such sub is defined or called
# anywhere in this file; the banner routine that is actually used is GREETING.
# Keep each prototype here in sync with the one on the definition; perl -w
# reports "Prototype mismatch" when they differ.
sub GREETING;
sub USAGE;
sub fetch_timeline;
sub fetch_dm;
sub fetch_dm_sent;
sub fetched($);
sub fetch($);
sub init_db;
sub mk_date($);

# Parse the command line into the settings declared above.
# "=s" / "=i" options require a string / integer argument; everything else
# is a plain flag. 'db-name' must take a string ("=s"): declared as a bare
# flag, "--db-name=foo" is rejected by Getopt::Long and $db_name could only
# ever become 1.
# NOTE(review): the return value (false on unknown/invalid options) is still
# ignored, as before; Getopt::Long itself prints a warning in that case.
GetOptions(
    'config=s'    => \$config_file,
    'page=i'      => \$tweet_page,
    'count=i'     => \$tweet_count,
    'only-user'   => \$only_user,
    'no-bom'      => \$no_bom,
    'force-write' => \$force_write,
    'outputs=s'   => \$output_string,
    'path=s'      => \$output_path,
    'init'        => \$initial_backup,
    'db-name=s'   => \$db_name,
    'verbose'     => \$VERBOSE,
    'debug'       => \$DEBUG,
    'quiet'       => \$QUIET,
    'help'        => \$HELP
);

# Remember the starting directory; the script changes into the output
# directory later and returns here before exiting.
my $cwd = cwd();

GREETING() unless $QUIET;
USAGE() if $HELP;

# Locate the configuration file: unless one was given with --config, take the
# first of the well-known names found in the home directory, falling back to
# "twitterbak.ini" in the current directory.
if (!defined $config_file) {
    # $homedir is undefined when HOME/USERPROFILE is not set; skip the search
    # then instead of probing "/<name>" with an uninitialized-value warning.
    if (defined $homedir) {
        foreach my $candidate ('twitterbak.ini', '.twitterbak', 'twitterbak.cfg') {
            next unless -f "$homedir/$candidate";
            $config_file = "$homedir/$candidate";
            last;
        }
    }
    $config_file = 'twitterbak.ini' unless $config_file;
}
-f $config_file or die "Konfigurationsdatei fehlt oder kann nicht geöffnet werden.";

# Config::IniFiles->new returns undef for an unparsable file and records the
# reasons in @Config::IniFiles::errors; fail here with that information
# rather than later with "Can't call method val on an undefined value".
my $cfg = Config::IniFiles->new(-file => $config_file)
    or die "Konfigurationsdatei $config_file ist fehlerhaft: ",
           join('; ', @Config::IniFiles::errors), "\n";

# Pull the remaining settings from the configuration file. For only-user and
# the output path a value given on the command line takes precedence.
my $user_agent_string = $cfg->val('http', 'useragent', "TwitterBak/$VERSION");
unless (defined $only_user) {
    $only_user = ($cfg->val('twitter', 'onlyuser', 0) =~ /(true|1|yes)/);
}
unless (defined $output_path) {
    $output_path = $cfg->val('output', 'path', "$homedir/MyTweets");
}

# Output formats to produce (any of plain, xml, db).
# NOTE(review): an "output" key in the configuration file overrides
# --outputs here; the command-line value is only the fallback default.
my @outputs = split /,/, $cfg->val('output', 'output', $output_string);

$username = $cfg->val('twitter', 'username', undef);
$password = $cfg->val('twitter', 'password', undef);

# Credentials are mandatory. An empty value is as useless as a missing one,
# so both are rejected (the messages no longer end in a stray ")").
die 'Bitte "username" in Konfigurationsdatei eintragen.'
    unless defined $username && length $username;
die 'Bitte "password" in Konfigurationsdatei eintragen.'
    unless defined $password && length $password;

# Default database file: "<username>.sqlite" for an only-user backup,
# "<username>+friends.sqlite" otherwise.
if (!defined $db_name) {
    $db_name = $username;
    $db_name .= "+friends" unless $only_user;
    $db_name .= '.sqlite';
}

# Create <output_path> and <output_path>/<username> if necessary and make the
# latter the working directory: all message files and the relative database
# file name are resolved against it. Every step is checked -- a silently
# failed mkdir/chdir would scatter the backup over the wrong directory.
foreach my $dir ($output_path, "$output_path/$username") {
    next if -d $dir;
    mkdir $dir
        or die "Verzeichnis $dir kann nicht angelegt werden: $!\n";
}
$output_path .= "/$username";
print "Wechseln ins Verzeichnis $output_path ...\n" if $DEBUG;
chdir $output_path
    or die "Wechsel ins Verzeichnis $output_path fehlgeschlagen: $!\n";

# Database handle; stays undefined unless the "db" output is requested.
my $dbh;
if (scalar grep { /db/ } @outputs) {
    print "Verbinden mit Datenbank ...\n" if $VERBOSE && !$QUIET;
    $dbh = DBI->connect("dbi:$db_driver:dbname=$db_name", $db_user, $db_pass);
    die $DBI::errstr unless $dbh;

    # A database without any table is new: create the schema first.
    my $table_query = $dbh->table_info('%', '%', '%', 'TABLE');
    my $known_tables = $table_query->fetchall_arrayref;
    init_db() unless @{$known_tables};
}

# HTTP client shared by all download routines.
my $ua = LWP::UserAgent->new(agent => $user_agent_string);

# Back up the timeline, the received and the sent direct messages in turn;
# each call returns the number of messages it processed.
my $timeline_count = fetch(\&fetch_timeline);
my $dm_count       = fetch(\&fetch_dm);
my $dm_sent_count  = fetch(\&fetch_dm_sent);

# The summary honours --quiet like every other progress message (it used to
# be printed even when all screen output was supposed to be suppressed).
print "\n$timeline_count/$dm_count/$dm_sent_count Tweets/DMs/SentDMs heruntergeladen.\n"
    if $VERBOSE && !$QUIET;
chdir $cwd;
# NOTE(review): the exit status is the total number of messages handled, so
# it is non-zero after a successful run that fetched something -- kept as is
# in case wrapper scripts rely on it.
exit $timeline_count + $dm_count + $dm_sent_count;


##############################################################
#
# Unterroutinen
#
##############################################################
# Download a URL with the shared user agent and return the response body.
#
# Parameter: the complete URL (may carry "user:password@" credentials).
# Returns:   the raw response content.
# Dies with the HTTP status line when the request is not successful.
sub fetch_from_url($) {
    my $url = shift;
    if ($DEBUG) {
        # Never echo the password embedded in the URL: mask everything
        # between the first ':' of the userinfo part and the last '@' that
        # precedes the path.
        (my $shown_url = $url) =~ s{(://[^/:]*):[^/]*\@}{$1:***\@};
        print "URL=$shown_url\n";
    }
    my $req = HTTP::Request->new(GET => $url);
    my $res = $ua->request($req);
    $res->is_success or die $res->status_line, "\n";
    return $res->content;
}

# Build the query string for the paging options.
# Returns "?page=P", "?count=C", "?page=P&count=C" or the empty string,
# depending on which of $tweet_page / $tweet_count are defined.
sub url_param() {
    my @pairs;
    push @pairs, "page=$tweet_page"   if defined $tweet_page;
    push @pairs, "count=$tweet_count" if defined $tweet_count;
    return '' unless @pairs;
    return '?' . join('&', @pairs);
}

# Download one page of the timeline as XML: the user's own timeline when
# $only_user is set, the user+friends timeline otherwise.
# Returns the raw XML document; dies (in fetch_from_url) on HTTP errors.
#
# Defined without a prototype so that the definition agrees with the forward
# declaration "sub fetch_timeline;" near the top of the file; the former "()"
# prototype triggered a "Prototype mismatch" warning under -w. The sub is
# only ever invoked through a code reference, where prototypes are ignored.
# NOTE(review): username and password are interpolated into the URL
# unescaped; credentials containing '@', ':' or '/' will produce a broken URL.
sub fetch_timeline {
    print 'Herunterladen der ', ($only_user? 'user' : 'user+friends'),
          "-Timeline ...\n" if $VERBOSE && !$QUIET;
    my $timeline = $only_user ? 'user' : 'friends';
    my $url = "https://$username:$password\@twitter.com/statuses/"
            . $timeline . '_timeline.xml' . url_param();
    return fetch_from_url($url);
}

# Download one page of received direct messages as XML.
# Returns the raw XML document; dies (in fetch_from_url) on HTTP errors.
#
# Defined without a prototype so that the definition agrees with the forward
# declaration "sub fetch_dm;" near the top of the file; the former "()"
# prototype triggered a "Prototype mismatch" warning under -w. The sub is
# only ever invoked through a code reference, where prototypes are ignored.
# NOTE(review): username and password are interpolated into the URL
# unescaped; credentials containing '@', ':' or '/' will produce a broken URL.
sub fetch_dm {
    print "Herunterladen der Direktnachrichten ...\n" if $VERBOSE && !$QUIET;
    my $url = "https://$username:$password\@twitter.com/direct_messages.xml" . url_param();
    return fetch_from_url($url);
}

# Download one page of sent direct messages as XML.
# Returns the raw XML document; dies (in fetch_from_url) on HTTP errors.
#
# Defined without a prototype so that the definition agrees with the forward
# declaration "sub fetch_dm_sent;" near the top of the file; the former "()"
# prototype triggered a "Prototype mismatch" warning under -w. The sub is
# only ever invoked through a code reference, where prototypes are ignored.
# NOTE(review): username and password are interpolated into the URL
# unescaped; credentials containing '@', ':' or '/' will produce a broken URL.
sub fetch_dm_sent {
    print "Herunterladen der versendeten Direktnachrichten ...\n" if $VERBOSE && !$QUIET;
    my $url = "https://$username:$password\@twitter.com/direct_messages/sent.xml" . url_param();
    return fetch_from_url($url);
}

# Turn an XML document into a Perl data structure with XML::Simple.
# KeyAttr => [] disables the folding of repeated elements into hashes keyed
# by an attribute, so repeated elements stay plain arrays.
# Parameter: the XML text. Returns the parsed structure; dies when XMLin()
# yields a false value.
sub parse_xml($) {
    my ($document) = @_;
    print "Parsen der XML-Daten ...\n" if $VERBOSE && !$QUIET;
    my $tree = XMLin($document, KeyAttr => []);
    die "XMLin() fehlgeschlagen." unless $tree;
    return $tree;
}

# Download one page with the given fetcher and store every message in the
# configured outputs (plain text file, XML file, database row).
#
# Parameter: code reference to one of fetch_timeline / fetch_dm /
#            fetch_dm_sent; it must return the XML document as a string.
# Returns:   the number of messages found on the page (0 for an empty page).
# Dies when a file cannot be written or a database operation fails.
sub fetched($) {
    my $fetcher = shift;
    my $xml = parse_xml($fetcher->());

    # Timeline entries carry a single {user}; direct messages carry a
    # {sender} and a {recipient} instead.
    my $is_timeline = ($fetcher == \&fetch_timeline);

    # Prepare the insert statement only when a database is in use. (The
    # original "my $sql = ... if defined $dbh" form is documented in perlsyn
    # as having undefined behaviour.)
    my ($sql, $sth);
    if (defined $dbh) {
        $sql = 'INSERT OR REPLACE INTO tweets ' .
               '(id, user_id, user_screen_name, user_name, created_at, tweet) ' .
               'VALUES (?, ?, ?, ?, ?, ?)';
        $sth = $dbh->prepare($sql) or die $dbh->errstr;
    }

    # Without ForceArray, XML::Simple delivers a lone child element as a hash
    # reference instead of a one-element array; normalise so that a page with
    # exactly one message does not die with "Not an ARRAY reference".
    my $messages = $is_timeline ? $xml->{'status'} : $xml->{'direct_message'};
    $messages = []          unless defined $messages;
    $messages = [$messages] if ref $messages eq 'HASH';

    # The list of requested outputs does not change while looping.
    my $want_plain = grep(/plain/, @outputs);
    my $want_xml   = grep(/xml/, @outputs);

    my $msg_count = 0;
    foreach my $msg (@{$messages}) {
        my $created_at = mk_date($msg->{'created_at'});
        # Display name: "Name" for tweets, "Sender -> Recipient" for DMs.
        my $who = $is_timeline
            ? $msg->{'user'}->{'name'}
            : $msg->{'sender'}->{'name'} . ' -> ' . $msg->{'recipient'}->{'name'};
        print "#$msg->{'id'} $who: $msg->{'text'} ($created_at)\n" if $VERBOSE && !$QUIET;

        # Plain text: one UTF-8 encoded file per message, optionally with BOM.
        # Existing files are kept unless --force-write was given.
        my $plain_file = $msg->{'id'} . '.txt';
        if ($want_plain && (!-f $plain_file || $force_write)) {
            my $userlong = $is_timeline
                ? $msg->{'user'}->{'name'} . ' (' . $msg->{'user'}->{'screen_name'} . ')'
                : $msg->{'sender'}->{'name'} . ' (' . $msg->{'sender'}->{'screen_name'} . ') -> ' .
                  $msg->{'recipient'}->{'name'} . ' (' . $msg->{'recipient'}->{'screen_name'} . ')';
            my $data = "$userlong: " . decode_entities($msg->{'text'}) . " ($created_at)";
            utf8::encode($data);
            # Three-argument open with a lexical handle; close is checked
            # because buffered write errors only surface there.
            open my $out, '>', $plain_file
                or die "Schreiben der Textdatei $plain_file fehlgeschlagen: $!\n";
            print {$out} "\x{ef}\x{bb}\x{bf}" unless $no_bom;
            print {$out} "$data\n";
            close $out
                or die "Schreiben der Textdatei $plain_file fehlgeschlagen: $!\n";
        }

        # XML: the message structure as parsed, one file per message.
        my $xml_file = "$msg->{'id'}.xml";
        if ($want_xml && (!-f $xml_file || $force_write)) {
            XMLout($msg,
                   NoAttr => 1,
                   OutputFile => $xml_file,
                   SuppressEmpty => 1);
        }

        # Database row.
        if (defined $sth) {
            my $userscreen = $is_timeline
                ? $msg->{'user'}->{'screen_name'}
                : $msg->{'sender'}->{'screen_name'} . ' -> ' .
                  $msg->{'recipient'}->{'screen_name'};
            # user_name now always receives the display name. It used to be
            # read from {user}{name}, which does not exist for direct
            # messages (NULL in the database, plus an autovivified {user}).
            # NOTE(review): the user_id column still receives the display
            # name rather than a numeric id, unchanged from before so that
            # existing databases stay consistent -- confirm the intent.
            $sth->execute($msg->{'id'}, $who, $userscreen,
                          $who,
                          $created_at, $msg->{'text'})
                or die $sth->errstr . " ($sql)";
        }

        ++$msg_count;
    }
    return $msg_count;
}

# Run a download routine and return the number of messages it delivered.
# In normal mode exactly one page is fetched (using the current $tweet_page /
# $tweet_count settings). With --init the pages are walked from 1 upwards
# until a page comes back empty; $tweet_page is left on that empty page.
#
# Parameter: code reference to the fetch routine, passed on to fetched().
sub fetch($) {
    my ($fetcher) = @_;

    return fetched($fetcher) unless $initial_backup;

    my $grand_total = 0;
    for ($tweet_page = 1; ; ++$tweet_page) {
        my $on_this_page = fetched($fetcher);
        last unless $on_this_page > 0;
        $grand_total += $on_this_page;
    }
    return $grand_total;
}

# Create the database schema: read the SQL text that follows the __DATA__
# marker, cut it into statements at every ';' and execute the non-empty ones
# in order. Dies with the database error and the offending statement.
sub init_db {
    print "Initialisieren der Datenbank ...\n" if $VERBOSE && !$QUIET;
    my $schema = join '', <DATA>;
    my @statements = split ';', $schema;
    for my $statement (@statements) {
        # strip surrounding whitespace, including the line breaks
        $statement =~ s/\A\s+|\s+\z//g;
        next unless $statement;
        $dbh->do($statement) or die $dbh->errstr . " ($statement)";
    }
}

# Convert a Twitter time stamp such as "Wed Aug 05 10:06:41 +0000 2009" into
# the form "2009-08-05 10:06:41".
#
# Parameter: the time stamp string.
# Returns:   "YYYY-MM-DD hh:mm:ss". The UTC offset in the input is not
#            applied, only dropped. When the input cannot be parsed (wrong
#            layout or unknown month name) a warning is issued and the input
#            is returned unchanged ('' for undef) instead of a fabricated
#            date -- previously an unknown month silently became December and
#            garbage input became "0000-12-00 00:00:00".
sub mk_date($) {
    my $stamp = shift;

    my %month_number;
    @month_number{qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec)} = (1 .. 12);

    # weekday and offset are matched but not needed
    my ($month, $day, $hours, $mins, $secs, $yr) = defined $stamp
        ? ($stamp =~ /[a-zA-Z]{3} ([a-zA-Z]{3}) (\d+) (\d{2}):(\d{2}):(\d{2}) [+-]\d+ (\d+)/)
        : ();

    unless (defined $month && exists $month_number{$month}) {
        my $shown = defined $stamp ? $stamp : '';
        warn "Unbekanntes Datumsformat: '$shown'\n";
        return $shown;
    }

    return sprintf '%04d-%02d-%02d %02d:%02d:%02d',
        $yr, $month_number{$month}, $day, $hours, $mins, $secs;
}

# Print the program banner (name, version, purpose, copyright) to STDOUT.
sub GREETING {
    print <<"END_OF_BANNER";

twitterbak.pl v$VERSION -
Twitter-Timeline in Dateien oder in einer Datenbank sichern.

Copyright (c) 2009 Oliver Lau <oliver\@von-und-fuer-lau.de>
Alle Rechte vorbehalten.

END_OF_BANNER
}


# Print the command-line help to STDOUT and terminate the program.
# The text now names the real option "--only-user" (it used to refer to a
# non-existent "--onlyuser") and also lists --debug and --help, which
# GetOptions accepts but the help did not mention.
sub USAGE {
    print "Aufrufen mit: twitterbak.pl <Optionen>\n",
    "\nOptionen:\n",
    "  --config=file\n",
    "      Pfad und Name zur Konfigurationsdatei\n",
    "  --init\n",
    "      Versuche, beim Einsammeln der Tweets so weit wie möglich in die\n",
    "      Vergangenheit zu gehen. (--page wird ignoriert)\n",
    "  --count=n\n",
    "      n Tweets abholen (max. 200, Default: 20)\n",
    "  --db-name=name\n",
    "      Datenbanknamen vorgeben. (Default: <username>.sqlite, wenn\n",
    "      --only-user gewaehlt, sonst <username>+friends.sqlite)\n",
    "  --page=n\n",
    "      n-te Seite abrufen\n",
    "  --path=path\n",
    "      Absoluter Pfad zu dem Verzeichnis, in dem die Tweets abgelegt werden\n",
    "      sollen.\n",
    "  --only-user\n",
    "      nur die User-Timeline abrufen, nicht User+Friends\n",
    "  --force-write\n",
    "      Eventuell bereits vorhandene Dateien überschreiben\n",
    "  --no-bom\n",
    "      Unterbindet das Schreiben des Unicode-BOM in die Textdatei\n",
    "  --outputs=(plain|xml|db)\n",
    "      kommaseparierte Auflistung der Formate, in denen die heruntergeladenen\n",
    "      Tweets abgespeichert werden sollen. Zum Beispiel, um die Tweets als\n",
    "      Textdatei und in der Datenbank abzuspeichern: --outputs=plain,db\n",
    "      Vorgabe: $output_string\n",
    "  --verbose\n",
    "      Detaillierte Informationen über Verarbeitungsschritte ausgeben\n",
    "  --quiet\n",
    "      Sämtliche Bildschirmausgaben unterdrücken\n",
    "  --debug\n",
    "      Zusätzliche Diagnoseausgaben (z. B. abgerufene URLs) anzeigen\n",
    "  --help\n",
    "      Diese Hilfe anzeigen und beenden\n",
    "\n";
    exit;
}


# Everything after the __DATA__ marker is SQL, not Perl. init_db() reads it
# through the DATA file handle, splits it at every ';' and executes the
# resulting statements in order, so a ';' anywhere inside the text below
# (even in an SQL comment) would be treated as a statement separator.
# The statements create the "tweets" table that the INSERT in fetched()
# fills, then drop and re-create the index "tweetidx" on that table.
__DATA__
CREATE TABLE IF NOT EXISTS tweets
(
 id INTEGER PRIMARY KEY,
 user_id INTEGER NOT NULL,
 user_screen_name TEXT,
 user_name TEXT,
 created_at DATETIME NOT NULL,
 tweet TEXT
 );

DROP INDEX IF EXISTS tweetidx;

CREATE INDEX tweetidx ON tweets
(
 user_screen_name,
 user_name,
 tweet
 );
