#!  -w
# $Id: fa_coords.pl.in,v 1.15 2006/04/21 16:37:38 twu Exp $

#$package_version = "2006-04-21";

use IO::File;
$opt_l = 1000000;		# Don't report contigs smaller than this length
undef($opt_o);			# Output file
undef($opt_S);			# Treat each sequence as a separate chromosome
undef($opt_E);			# Interpret argument as a command
use Getopt::Std;
getopts("o:CE");

# Usage: fa_coords [-o <output>] [-S] [-E] <fastafiles or command>

if (!defined($outfile = $opt_o)) {
  $outfile = "coords.txt";
}

$flags = "";
$flags .= "-o $outfile";

#print STDERR "Printing processing messages only for those contigs longer than\n";
#print STDERR "  $opt_l nt (but processing all of them, of course).\n";
#print STDERR "  You can change this value with the -l flag to fa_coords.\n";

if ($#ARGV < 0) {
  @streams = ();
  push @streams,"<&STDIN";
  ($output,$skipped) = parse_fa_files(\@streams);
} else {
  ($output,$skipped) = parse_fa_files(\@ARGV);
}
if ($#{$output} < 0) {
  printf STDOUT "Error: No contigs were read in.\n";
  exit(9);
} else {
  $OUT = new IO::File(">$outfile") or die "Cannot write to file $outfile";
  print $OUT "# To exclude a contig, place a '#' sign at the beginning of the line\n";
  print $OUT "#contig" . "\t" . "coordinates" . "\t" . "altstrain\n";
  foreach $string (@ {$output}) {
    print $OUT $string;
  }
  close($OUT);
}

if ($#$skipped >= 0) {
  printf "\n";
  printf STDOUT "********************************************************************************\n";
  printf STDOUT ("    A total of %d contigs had no recognizable chromosomal assignment, and\n",$#$skipped + 1);
  print <<NAMSG;
    were therefore concatenated into a chromosome called "NA".

    GMAP can handle this chromosome without any problems.  However,
    if you do not wish to include these contigs, please remove them from $outfile,
    or comment them out with the # character at the beginning of each line.
NAMSG
  printf STDOUT "********************************************************************************\n";
  print "\n";
}

@errors = ();
foreach $chr (keys %lowest) {
  if ($lowest{$chr} > 1) {
    push @errors,"  First contig in chromosome $chr starts at position $lowest{$chr}";
  }
}

if ($#errors >= 0) {
  print STDOUT "\n";
  print STDOUT "*** Possible errors: ***\n";
  foreach $error (@errors) {
    print $error . "\n";
  }
  print STDOUT "\n";
  print STDOUT "  Some the errors above may be addressed by specifying the contigs to be on\n";
  print STDOUT "  alternate strains of existing chromosomes, rather than on independent\n";
  print STDOUT "  alternate chromosomes.\n";
  print STDOUT "  You may make the appropriate changes in $outfile, by adding an alternate\n";
  print STDOUT "  strain in column 3, and specifying an existing chromosome in column 2\n";
  print STDOUT "\n";
}

print STDOUT "\n";
print STDOUT "============================================================\n";
print STDOUT "Contig mapping information has been written to file $outfile.\n";
if ($#errors >= 0) {
  printf STDOUT ("%d possible errors were found (listed above)\n",$#errors+1);
}

print STDOUT "You should look at this file, and edit it if necessary\n";
print STDOUT "If everything is okay, you should proceed by running\n";
if ($outfile =~ /coords\.(\S+)/) {
  $genome = $1;

  print STDOUT "    make -f Makefile.$genome gmapdb\n";
} else {
  print STDOUT "    make gmapdb\n";
}
print STDOUT "============================================================\n";

exit;



sub handle_consecutive_errors {
  my ($consec_errors) = @_;

  if ($consec_errors > 0) {
    print STDOUT "\n\n";
    print STDOUT "***  Note: For a total of $consec_errors consecutive contigs, ";
    print STDOUT "the chromosome could not be parsed from the header, \n";
    print STDOUT "     and were therefore assigned to chromosome NA.\n\n";
    $consec_errors = 0;
  }
  return $consec_errors;
}



sub parse_fa_files {
  my ($argv) = @_;
  my ($FP, $line, $strain);
  my @output = ();
  my @skipped = ();
  my $seglength;

  foreach $arg (@ {$argv}) {
    if (defined($opt_E)) {
      printf STDOUT "Executing command $arg\n";
      $FP = new IO::File("$arg |") or die "Can't execute $arg";
    } else {
      printf STDOUT "Opening file $arg\n";
      $FP = new IO::File("$arg") or die "Can't open file $arg";
    }
    $seglength = 0;
    undef($orientation);
    $consec_errors = 0;
    $shortp = 0;
    while (defined($line = <$FP>)) {
      chop $line;
      if ($line !~ /\S/) {
	# Skip blank lines

      } elsif ($line !~ /^>/) {
	if (defined($chr)) {
	  $seglength += length($line);
	}

      } else {
	# Handle previous contig
	if ($seglength > 0) {
	  if ($seglength > $opt_l) {
	    if ($shortp == 1) {
	      print STDOUT "\n";
	      $shortp = 0;
	    }
	    printf STDOUT ("  Processed contig %s at chromosomal coordinates %s:%d..",
			   $contig,$chr,$chrpos{$chr});
	    printf STDOUT ("%d (length = %d nt)",$chrpos{$chr}+$seglength-1,$seglength);
	    if (defined($orientation) && $orientation eq "rev") {
	      printf STDOUT (" (revcomp => %s:%d..%d)",$chr,$chrpos{$chr}+$seglength-1,$chrpos{$chr});
	    }
	    print STDOUT "\n";
	  } else {
	    if ($shortp == 0) {
	      printf STDOUT "  Processed short contigs (<$opt_l nt): ";
	      $shortp = 1;
	    }
	    printf STDOUT ".";
	  }

	  if (defined($orientation) && $orientation eq "rev") {
	    $string = sprintf("%s\t%s:%d..%d\n",$contig,$chr,$chrpos{$chr}+$seglength-1,$chrpos{$chr});
	  } else {
	    $string = sprintf("%s\t%s:%d..%d\n",$contig,$chr,$chrpos{$chr},$chrpos{$chr}+$seglength-1);
	  }
	  push @output,$string;
	  if (!defined($lowest{$chr})) {
	    $lowest{$chr} = $chrpos{$chr};
	  } elsif ($chrpos{$chr} < $lowest{$chr}) {
	    $lowest{$chr} = $chrpos{$chr};
	  }
	  $chrpos{$chr} += $seglength; # Used only when a header doesn't have a chrpos for this chr
	}

	# Handle current header
	# print STDOUT "  Header line: $line\n";
	$seglength = 0;
	($contig) = $line =~ /^>(\S+)/;
	undef $orientation;
	if (defined($opt_S)) {
	  $chr = $contig;

	} elsif ($line =~ /[Cc]hr\s*=?\s*(\S+):(\d+)\D+\d+/) {
	  $chr = $1;
	  $chrpos{$chr} = $2;
	  $consec_errors = handle_consecutive_errors($consec_errors);

	} elsif ($line =~ /[Cc]hr\s*=?\s*(\S+)/) {
	  $chr = $1;
	  $consec_errors = handle_consecutive_errors($consec_errors);

	} elsif ($line =~ /[Cc]hromosome\s*(\S+)/) {
	  # NCBI .mfa format
	  $chr = $1;
	  $consec_errors = handle_consecutive_errors($consec_errors);

	} elsif ($line =~ /[Cc]hromosome:[^:]+:([^:]+):(\d+)/) {
	  # Ensembl format: chromosome:NCBI35:22:1:49554710:1
	  $chr = $1;
	  $chrpos{$chr} = $2;

	  if ($chr =~ /(\S+?)_N\D_\d+/) {
	    # Ensembl notation for unmapped contig
	    $chr = $1 . "U";
	  }
	  $consec_errors = handle_consecutive_errors($consec_errors);

	} elsif ($line =~ /\/[Cc]hromosome=\S+/) {
	  # Celera format
	  ($chr) = $line =~ /\/[Cc]hromosome=(\S+)/;
	  if ($line =~ /\/alignment=\((\d+)-\d+\)/) {
	    ($chrpos{$chr}) = $1;
	    $chrpos{$chr} += 1;	# Because Celera uses 0-based coordinates
	  }
	  if ($line =~ /\/orientation=rev/) {
	    $orientation = "rev";
	  }
	  $consec_errors = handle_consecutive_errors($consec_errors);

	} elsif ($line =~ /[Cc]hr\s*(\S+):(\d+)\D+\d+/) {
	  $chr = $1;
	  $chrpos{$chr} = $2;
	  $consec_errors = handle_consecutive_errors($consec_errors);

	} elsif ($line =~ /[Cc]hr\s*(\S+)/ && $1 ne "omosome") {
	  $chr = $1;
	  $consec_errors = handle_consecutive_errors($consec_errors);

	} else {
	  if ($consec_errors == 0) {
	    print STDOUT "\n\n***  Note: Can't find chromosome in header $line.  Assigning to chromosome NA instead.\n\n";
	  }
	  $chr = "NA";
	  push @skipped,$contig;
	  $consec_errors += 1;
	}

	if (!defined($chrpos{$chr})) {
	  $chrpos{$chr} = 1;	# Start this contig at beginning of chromosome
	}
      }
    }

    # Handle last contig in the file
    if ($seglength > 0) {
      if ($seglength > $opt_l) {
	if ($shortp == 1) {
	  print STDOUT "\n";
	  $shortp = 0;
	}
	printf STDOUT ("  Processed contig %s at chromosomal coordinates %s:%d..",
		       $contig,$chr,$chrpos{$chr});
	printf STDOUT ("%d (length = %d nt)",$chrpos{$chr}+$seglength-1,$seglength);
	if (defined($orientation) && $orientation eq "rev") {
	  printf STDOUT (" (revcomp => %s:%d..%d)",$chr,$chrpos{$chr}+$seglength-1,$chrpos{$chr});
	}
	print STDOUT "\n";
	$shortp = 0;
      } else {
	if ($shortp == 0) {
	  printf STDOUT "  Processed short contigs (<$opt_l nt): ";
	  $shortp = 1;
	}
	printf STDOUT ".";
      }

      if (defined($orientation) && $orientation eq "rev") {
	$string = sprintf("%s\t%s:%d..%d\n",$contig,$chr,$chrpos{$chr}+$seglength-1,$chrpos{$chr});
      } else {
	$string = sprintf("%s\t%s:%d..%d\n",$contig,$chr,$chrpos{$chr},$chrpos{$chr}+$seglength-1);
      }
      push @output,$string;
      if (!defined($lowest{$chr})) {
	$lowest{$chr} = $chrpos{$chr};
      } elsif ($chrpos{$chr} < $lowest{$chr}) {
	$lowest{$chr} = $chrpos{$chr};
      }
    }
    close($FP);
    handle_consecutive_errors($consec_errors);
  }

  return (\@output,\@skipped);
}



