#!/usr/bin/perl
#
# selectRowByCol - select a row or set of rows in file1 based on a match to a 
# column in file2
# 
# Usage: selectRowByCol [opts] --col <#> <file1> <file2>
# 
#    --col <#>       Column in the row of file1 to match on ('eq'), default 1
#    -v              Invert like grep -v, print lines that don't match
#    --matchcol <#>  Column in the row of file2 to match on (default 1)
#    --verbose       Print some status messages to STDERR
# 
# Note: - Column numbering starts at 1, and delimiter is whitespace (\s+)
#       - Both <file1> and <file2> may be gzipped (filename ending in .gz)
#         or one may be '-' to indicate STDIN
#       - Reads file 2 into memory, so be careful
#
use strict;
use warnings;
use Getopt::Long;
# use GTB::Run qw(comment_usage);
# use GTB::File qw(Open);
# replacing the above two modules requires Open() and comment_usage()
# as well as the following:
use Carp qw(carp croak confess);
our $Gzip = 'gzip';
our $GzipIn;
our $GzipOut;

comment_usage() if (@ARGV < 2);

my $col1    = 1;
my $col2    = 1;
my $verbose = 0;
my $print_not_matching = 0;
my $rv = GetOptions ("column=i"     => \$col1,
                     "matchcol=i"   => \$col2,
                     "verbose"      => \$verbose,
                     "v"            => \$print_not_matching,
                 );

$col1--;
$col2--;
if ($col1 < 0 or $col2 < 0) {
    comment_usage("columns must be positive integers");
}
if (@ARGV < 2) {
    comment_usage("Both <file1> and <file2> are required");
}
if ($ARGV[0] eq '-' && $ARGV[1] eq '-') {
    comment_usage("Only one of <file1> or <file2> can be stdin ('-')");
}
my $filename1 = shift(@ARGV);
my $filename2 = shift(@ARGV);
my $fh1 = Open($filename1);
my $fh2 = Open($filename2);

# get the strings to search for
print STDERR "Now reading in $filename2 column ", ($col2 + 1), "\n" 
    if ($verbose);
my %match;
while(defined(my $line = <$fh2>)) {
    chomp($line);
    my ($to_match) = (split(/\s+/, $line))[$col2];
    $match{$to_match}++;
}

print STDERR "Done reading in $filename2, now printing lines from $filename1 where column ", 
        ($col1+1)," matches\n"
    if ($verbose);

if ($print_not_matching) {
    while(defined(my $line = <$fh1>)) {
        chomp($line);
        my @F = split(/\s+/, $line);
        if ($#F < $col1) {
            warn "Line $. of $filename1 has insufficient fields\n" if ($verbose);
            print $line, "\n";
            next;
        }
        if (!defined($match{$F[$col1]})) {
            print $line, "\n";
        }
    }
} else {
    while(defined(my $line = <$fh1>)) {
        next if ($line =~ /^\s*$/); # just short-cut
        chomp($line);
        my @F = split(/\s+/, $line);
        if ($#F < $col1) {
            warn "Line $. of $filename1 has insufficient fields\n" if ($verbose);
            next;
        }
        if (defined($match{$F[$col1]})) {
            print $line, "\n";
        }
    }
}

sub Open {
    my ($file, $mode) = @_;
    if (!$file && $file ne '0') {
        croak "Open: no filename provided";
    }
    if ($mode) {
        $mode = lc $mode;
    }
    elsif ($file =~ /^\s*\|/) {
        $mode = 'w';
    }
    else {
        $mode = 'r';
    }
    my $fh;
    if ($file =~ /\|/) {
        if ($mode eq 'r') {
            if ($file =~ /\|\s*$/) {
                open $fh, $file or die "Can't open pipe '$file', $!\n";
            }
            else {
                croak "To open pipe for reading, pipe character must "
                    . "appear at end of command";
            }
        }
        elsif ($mode eq 'w') {
            if ($file =~ /^\s*\|/) {
                open $fh, $file or die "Can't open pipe '$file', $!\n";
            }
            else {
                croak "To open pipe for writing, pipe character must "
                    . "appear at beginning of command";
            }
        }
        else { # pipe, but not first or last in sequence
            croak << "END_MSG";
If a pipe character is present in the open string, there must be a pipe at
the beginning or end of the string, depending upon whether you plan to
write or read to the filehandle; '$file' is not valid.  If you need to read
and write to a program, try IPC::Open2 or IPC::Open3.
END_MSG
        }
    }
    elsif ($file =~ /\.(b?gz|bz2|zip|Z)$/) {
        if ($mode eq 'r') {
            my $prog = $1 eq 'bz2' ? 'bzip2' : ($GzipIn || $Gzip);
            croak "File ($file) not found" unless (-e $file);
            croak "File ($file) was not readable" unless (-r $file);
            open $fh, "$prog -dc $file |"
                or die "Can't read $file with $prog, $!\n";
        }
        elsif ($mode eq 'w') {
            my $prog = $1 eq 'bz2' ? 'bzip2' : ($GzipOut || $Gzip);
            open $fh, "| $prog > $file"
                or die "Can't create $prog file $file, $!\n";
        }
        elsif ($mode eq 'a') {
            if ($1 eq 'bz2') {
                croak "Open: mode 'a' not supported for bzip2 file $file";
            }
            my $prog = $GzipOut || $Gzip;
            open $fh, "| $prog >> $file"
                or die "Can't append $prog output to $file, $!\n";
        }
        else {
            croak "Open: mode '$mode' not supported; use 'r', 'w' or 'a'";
        }
    }
    elsif ($file eq '-') {
        if ($mode eq 'r') {
            open $fh, '-' or die "Can't read from STDIN, $!\n";
        }
        elsif ($mode eq 'w' || $mode eq 'a') {
            open $fh, '>-' or die "Can't write to STDOUT, $!\n";
        }
        else {
            croak "Open: mode '$mode' not supported; use 'r', 'w' or 'a'";
        }
    }
    elsif ($mode eq 'r') {
        open $fh, '<', $file or die "Can't open $file, $!\n";
    }
    elsif ($mode eq 'w') {
        open $fh, '>', $file or die "Can't create $file, $!\n";
    }
    elsif ($mode eq 'a') {
        open $fh, '>>', $file or die "Can't append to $file, $!\n";
    }
    else {
        croak "Open: mode '$mode' not supported; use 'r', 'w' or 'a'";
    }
    return $fh;
}

######################################################################
# comment_usage - print the comments at the beginning of the executable
#  as a usage message and die
# INPUTS:  Any strings passed in will be printed as error messages
#          following the usage message.
# OUTPUTS: Prints usage message and exit(1);
######################################################################
sub comment_usage {
    my @errors = @_;

    open IN, $0 or confess "Couldn't read source ($0): $!";
    $_ = <IN>;

    # skip frst 3 lines if the program starts with an eval statement
    # added by make install
    my $line2pos = tell(IN);
    $_ = <IN>; $_ .= <IN>; $_ .= <IN>;
    if ($_ !~ /\neval 'exec .*\n\s+if 0;[^\n]+\n/) {
        seek(IN, $line2pos, 0);
    }

    while (<IN>) {
        if(s/^#[ \t]?//) {
           print STDERR $_;
        } else {
            close (IN);
            last;
        }
    }
    if (@errors) {
        foreach my $error (@errors) {
            chomp $error;
            $error .= "\n";
        }
        print STDERR "ERROR:\n";
        print STDERR @errors;
        print STDERR "Died", Carp::shortmess();
        exit 1;
    }
    else {
        exit 1;
    }
}
