#!/usr/bin/perl
#
# csv2elastic.pl – Import a CSV file into elastic search.
# Copyright © 2014 by Heise Zeitschriften Verlag GmbH & Co. KG
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
# Author: Kai Wasserbäch <kaw@heise.de>
#
use strict;
use warnings;
use utf8;
binmode STDOUT, ':encoding(utf-8)';
binmode STDERR, ':encoding(utf-8)';

use JSON;
use Text::CSV;
use LWP::UserAgent;
use HTTP::Request;
use Getopt::Long::Descriptive;
use Regexp::Common qw/URI/;

# print the boiler plate
print <<BOILERPLATE;
csv2elastic.pl  © 2014 Heise Zeitschriften Verlag GmbH & Co. KG
Author: Kai Wasserbäch <kaw\@heise.de>

A small Perl script to import a CSV data set, as exported from a data base for
example, into a Elastic Search instance.

This is free software, you may do what you like with it, as long, as you adhere
to the GPLv3, see <http://www.gnu.org/licenses/>

BOILERPLATE


# make this global and save us some passing around
our ($oUsage, $oOpt);

# dumb method to print the usage and die with a custom error message
sub showUsage {
	my $sDie = shift;
	print($oUsage->text);
	printf(STDERR "\nERROR: %s\n", $sDie)  if($sDie);
	exit 1;
}

sub doSingleIndex {
	my ($sTarget, $sIdxKey, $oCvs, $oFh, $iLimit, ) = @_;

	print "Sending data in multiple requests... ";

	my $iOrigLimit = $iLimit;
	my $oData = ${$oCvs};
	my $oInputH = ${$oFh};
	my $oUa = LWP::UserAgent->new;
	while(my $hLine = $oData->getline_hr($oInputH)) {
		print "\n";
		if(defined $iLimit && $iLimit-- == 0) {
			warn(sprintf("Stopping execution after %d lines",
			  $iOrigLimit));
			last;
		}
		unless(exists $hLine->{$sIdxKey}) {
			warn("\tIndex value missing, skipping...");
			next;
		}

		my $sId = $hLine->{$sIdxKey};
		printf("\tProcessing ID %s... ", $sId);

		my $sData;
		# Replace undef with an empty string, because FacetView can't handle NULL
		my %hCur = %{$hLine};
		if($oOpt->null_as_empty) {
			defined $hCur{$_} or $hCur{$_} = '' for keys %hCur;
		}
		eval { $sData = encode_json(\%hCur); };

		showUsage(sprintf("Failed to parse data into JSON (id=%s): %s", $sId, $@))
		  if($@);

		my $oReq = HTTP::Request->new(
			'PUT',
			sprintf('%s%s%s', $sTarget, substr($sTarget, -1) ne '/' ? '/' : '',
			  $sId),
			undef, $sData
		);
		my $oResp = $oUa->request($oReq);

		showUsage(sprintf("PUT for id %s failed: %s", $sId,
		  $oResp->status_line))  if($oResp->is_error);
		print 'done.';
	}

	showUsage(sprintf("CSV error: %s", $oData->error_diag))  if(!$oData->eof &&
		defined $iLimit && $iLimit != -1);
}

sub doBulkIndex {
	my ($sTarget, $sIdxKey, $oCvs, $oFh, $iLimit, $bUndefAsEmpty) = @_;

	my $sData = '';
	print "Sending data in one request... ";

	my $iOrigLimit	= $iLimit;
	my @aTgtParts	= split('/', $sTarget);
	my $iLastPart	= $#aTgtParts;
	my $sType		= $aTgtParts[$iLastPart--];
	my $sIdx		= $aTgtParts[$iLastPart];
	$sTarget		=~ s/\/$sIdx\/$sType\/?$/\/_bulk/g;

	unless(scalar(@aTgtParts) >= 5) {
		showUsage("\n\tURL construction failed: URL doesn't seem to contain a type and index.");
	}

	# Maybe we should try to validate the extracted URL parts harder
	unless($sType) {
		showUsage("\n\tURL construction failed: couldn't extract _type from given URL");
	}
	unless($sIdx) {
		showUsage("\n\tURL construction failed: couldn't extract _index from given URL");
	}

	# construct base hash for bulk operations
	my $hBaseOp = {
		'index' => {
			'_index'	=> $sIdx,
			'_type'		=> $sType
		}
	};

	# construct the output
	my $oData = ${$oCvs};
	my $oInputH = ${$oFh};
	while(my $hLine = $oData->getline_hr($oInputH)) {
		if(defined $iLimit && $iLimit-- == 0) {
			printf("\nStopped execution after %d lines.", $iOrigLimit);
			last;
		}
		unless(exists $hLine->{$sIdxKey}) {
			warn("\n\tIndex value missing, skipping...");
			next;
		}

		$hBaseOp->{index}->{_id} = $hLine->{$sIdxKey};
		# Replace undef with an empty string, because FacetView can't handle NULL
		my %hCur = %{$hLine};
		if($oOpt->null_as_empty) {
			defined $hCur{$_} or $hCur{$_} = '' for keys %hCur;
		}
		eval {
			$sData .= sprintf("%s\n%s\n", encode_json($hBaseOp),
			  encode_json(\%hCur));
		};

		showUsage(sprintf("Failed to parse data into JSON (id=%s): %s",
		  $hLine->{$sIdxKey}, $@))  if($@);
		$hBaseOp->{index}->{_id} = undef;
	}
	showUsage(sprintf("CSV error: %s", $oData->error_diag))  if(!$oData->eof &&
		defined $iLimit && $iLimit != -1);

	my $oReq = HTTP::Request->new('PUT', $sTarget, undef, $sData);
	my $oUa = LWP::UserAgent->new;
	my $oResp = $oUa->request($oReq);

	showUsage(sprintf("Bulk PUT failed: %s\n\t%s", $oResp->status_line,
	  $oResp->decoded_content))  if($oResp->is_error);

	print " done.\n";
}

($oOpt, $oUsage) = describe_options(
	'%c %o <CSV-FILE> <PUT URL>',
	['field-separator|f=s',	'Define the CSV field separator (default: \t)',
	  {default => "\t"}],
	['bulk|b',		'Send all data in one request, cuts execution time '
	  . "down. But you don't see, which part of your data caused the failure."],
	['lines|n=i',		'Limit the number of lines parsed to this value.'],
	['null-as-empty',	'Replace NULL values in the JSON with an empty string.'],
	['help|h',		'Show this help'],
	[],
	["<CSV-FILE>\t\tCSV file, separated by the field separator, containing\n"
	 . "\t\t\t\tthe data and the field names in the first line.\n\t"
	 . "<PUT URL>\t\tTarget URL, which shall be used in the PUT request.\n\n"
	 . "\tThe CSV files must be UTF-8 encoded!"
	],
	[],
	["This script needs the followin Perl modules:\n"
	 . "\t  - JSON\n\t  - Text::CSV\n\t  - LWP::UserAgent\n\t  - HTTP::Request"
	 . "\n\t  - Getopt::Long::Descriptive\n\t  - Regexp::Common"
	]
);

# was the usage requested?
print($oUsage->text), exit 0  if($oOpt->help);

# check we got two input files, one detailing the headers and one containing
# the data
unless(scalar(@ARGV) == 2) {
	showUsage(
	  "Expected two input files containing the field definitions and data and a PUT URL."
	);
}
if(!-r $ARGV[0]) {
	showUsage('Input file is unreadable.')
}

# validate the URL parameter, allow HTTPS as well
# base definition from Regexp::Common, which defines $RE.
my $oValidUrl = qr($RE{URI}{HTTP}{-scheme=>qr/https?/});
unless($ARGV[1] =~ m/$oValidUrl/) {
	showUsage("Didn't get a valid PUT URL.");
}

my $oData = Text::CSV->new({
	empty_is_undef  => 1,
	sep_char        => $oOpt->field_separator
}) or showUsage("Couldn't create CSV object!");
my $oFh;
open $oFh, '<:encoding(utf-8)', $ARGV[0]
	or showUsage(sprintf("Failed to open header definition file: %s", $ARGV[0]));
my $aHeaders = $oData->getline($oFh);

if(!$aHeaders || ref $aHeaders ne 'ARRAY') {
	showUsage(sprintf("Couldn't parse %s and extract header definitions: %s",
		$ARGV[0], $oData->error_diag));
}

if(scalar(@{$aHeaders}) == 1) {
	showUsage(sprintf('Field separator »%s« seems to be wrong, found just one '
	  . 'field (%s).', $oOpt->field_separator, $aHeaders->[0]));
}

$oData->column_names($aHeaders);
my $sIdxKey = $aHeaders->[0];
printf("Using %s as index field.\n", $sIdxKey);

if($oOpt->bulk) {
	doBulkIndex($ARGV[1], $sIdxKey, \$oData, \$oFh, $oOpt->lines);
} else {
	doSingleIndex($ARGV[1], $sIdxKey, \$oData, \$oFh, $oOpt->lines);
}

close $oFh;
print "\n";
