#!/pkg/bin/perl -w
#9/15/2000 I think this is the correct version to use. Not extract2.pl
#This reads in a file of the haplotype data generated by
#the Hudson program, and seperates out the replicates
#into distinct haplotype files. It also generates a genotype by
#combinging two lines at a time to create a genotype line.
#An exception to this is if the resulting genotype only has
#a single hetrozygous site, then it leaves the two haplotypes
#alone, and essentially declares them to be two homozygous genotypes.
#
#This just shortcuts what haplo.c and clark.c would have done
#if given a single-locus hetrozygote.
#The genotypes are written to a file which is used as input
#to haplo.c and to clark.c or clark2 or clarkX (some version above 2).
#
#The program also outputs a file (often called hapsavEXT.REP, where
#EXT is an extension that identifies the type of data, and REP is
# the replicate number, with 
#the genotype followed by the two haplotypes
#that created it, so that we can later compare the results of
#clark and of the LP. In the later two files, the homozygotes
#come first, followed by any hetrozygotes. 

$infile = "$ARGV[0]";
open OUT, ">exlog";
open IN, $infile;
@lines = <IN>;
close(IN);

$infile =~ m/data(.*)\.txt/;
$ext= $1;
$i = 1;

print "What is the prefix for the haplotype outfile? \n";
#$houtprefix = <STDIN>;
$houtprefix = "hap";
chomp ($houtprefix);

print "What is the prefix for the genotype outfile? \n";
#$goutprefix = <STDIN>;
$goutprefix = "gen";
chomp ($goutprefix);

print "What is the prefix for the haplosave outfile? \n";
#$hsoutprefix = <STDIN>;
$hsoutprefix = "hapsav";
chomp ($hsoutprefix);

$houtfile = $houtprefix . $ext. '.' . $i;
print $houtfile;
open (OUTH, ">$houtfile");

$goutfile = $goutprefix . $ext. '.' . $i;
print $goutfile;
open (OUTG, ">$goutfile");

$hsoutfile = $hsoutprefix . $ext. '.' . $i;
print $hsoutfile;
print "\n";
open (OUTHS, ">$hsoutfile");

$eo = 0;
$numlines = 0;
$numtwos = 0;
$totaltwos = 0;
$nhom = $nhet = $thet = 0;

foreach $line (@lines) {
chomp $line;
if ($line =~ m/nsam: *(\d+)/) {
  $nsam = $1;
  print OUTH "nsam: $nsam\n";
  print OUTH "1/1\n";
  $tnsam = $nsam = ($1)/2; 
}
if ($line =~ m/segsites: *(\d+)/) {
  $segsites= $1; 
}

if ($line =~ /\d+\/(\d+)/) {
  $totalreps = $1;
}

if ($line =~ /^[01]+$/) {
$numlines++;
$linelength = length($line);
 print OUTH "$line\n";
 print "$line\n";

 if ($eo == 1) {
     $numtwos = 0;
     $genoline = "";
     @arr1 =  split(//, $lastline);
     @arr2 =  split(//, $line);
       for ($j = 0; $j < $linelength; $j++) {
        if ($arr1[$j] eq $arr2[$j]) { 
          $char = $arr1[$j];
        }
        else {$char = '2';
        $numtwos++;
        $totaltwos++;
        $reptwos++;
        }
       $genoline = $genoline . "$char "; 
       }
 print "$numtwos\n";

 if ($numtwos == 1) {
   print OUT "found a partial-homozygote\n";
   $genoone = $genoline;
   $genoline =~ tr/2/0/;
   for ($k = 0; $k < 3; $k++) {
   $homarray[$nhom++] = $genoline;
   }

   $genoone =~ tr/2/1/;
   for ($k = 0; $k < 3; $k++) {
   $homarray[$nhom++] = $genoone;
   }
   $nsam++;
 }

elsif ($numtwos == 0) {
   print OUT "found a true homozygote\n";
   for ($k = 0; $k < 3; $k++) {
   $homarray[$nhom++] = $genoline;
   }
}
else {
   $hetarray[$nhet++] = $genoline;
   $hetarray[$nhet++] = $lastline;
   $hetarray[$nhet++] = $line;
   $thet++;
 }

 $eo = 0;
}
 else {
  $eo = 1;
 }

$lastline = $line;
}

if ($line =~ /bgnseed/) {
print "hit end of a replicate, and i is now $i\n";
print OUT "hit end of a replicate, and i is now $i\n";

   $numtwos = 0;
   if ($i > 0) {
   $avgtwo = $reptwos/$nsam;
   print "The avg. number of twos generated in replication $i is $avgtwo \n";
   $avgtwo = $reptwos/$tnsam;
   print "The avg. number of mismatches in a pair of haplotypes is $avgtwo \n";
   #$stopit = <STDIN>;

   #print OUTG "$ext $i\n";
   print OUTG "$nsam $segsites\n";
   $hom = $nhom/3;
   $het = $nhet/3;
   $tot = $hom + $het;
   print OUTHS "$hom  $het $tot\n";

    $j = 0;
    until ($j == $nhom) {
     print OUTG "$homarray[$j]\n";
     for ($k = 0; $k < 3; $k++) {
     $line = $homarray[$j++];
     $line =~ tr/ //d;
     print OUTHS "$line\n"; 
     }
   }
   
    $j = 0;
      until ($j == $nhet) {
       print OUTG "$hetarray[$j]\n";
       for ($k = 0; $k < 3; $k++) {
       $line = $hetarray[$j++];
       $line =~ tr/ //d;
       print OUTHS "$line\n"; 
       }
     }
   }

if ($i < $totalreps) { 
close OUTH;
close OUTG;
close OUTHS;

$i++;
$eo = 0;
$reptwos = 0;


$houtfile = $houtprefix . $ext. '.' . $i;
print $houtfile;
open (OUTH, ">$houtfile");

$goutfile = $goutprefix . $ext. '.' . $i;
print $goutfile;
open (OUTG, ">$goutfile");

$hsoutfile = $hsoutprefix . $ext. '.' . $i;
print $hsoutfile;

print "\n";
open (OUTHS, ">$hsoutfile");
}

$nhom = $nhet = 0;
}

}


 $avgtwo =  $totaltwos/$thet;
 print "there are $totaltwos two's generated.
 This is an average of $avgtwo per ambiguous genotype generated\n";
 $avgtwo = $totaltwos/($i * $tnsam);
 print "Avg. number of mismatches in a mated pair of haplotypes is $avgtwo\n";

print OUTH "There are $numlines lines detected\n";
close (OUTH);
close (OUTG);
close (OUTHS);
