#!/usr/local/bin/perl

use strict;
use warnings;
no warnings 'uninitialized';

# Turn on autoflush for the currently selected output handle (STDOUT unless
# something has select()ed another one).
$| = 1;

# Constants
my $EOL               = "\015\012";    # Windows line ending (CR LF)
my $MAX_LOCI_PER_FILE = 100;           # cap on loci written to one output file

# Input records are delimited by the Windows line ending as well.
$/ = $EOL;

# Dataset-wide tables, filled while the input is read.
my (
    %population,         # complete list of populations
    %locus,              # complete list of SNP loci, with their names
    %sampled_alleles,    # main data structure:
                         # alleles per locus per population per individual
                         # sampled
);

# Read the SNP genotype file(s) named on the command line.  Record format:
#  SNP_ID   New_SNP_ID  Sample_ID (Unique)  Population_No   Allele_code
#  <tab delimited>
# Build the main data structure (%sampled_alleles, keyed sample -> population
# -> locus), and compile a list (via hash keys) of the populations and loci
# present in the dataset.

my $max_pop = 0;  # keep track of largest population number - this is
                  # required for the output header
@ARGV or die <<USAGE;
    Usage: $0 snpfile

    Converts a file of SNP genotypes into an FSTAT input file.
    SNP file should be tab-separated fields:
    SNP_ID  New_SNP_ID  Sample_ID (Unique)  Population_No  Allele_code

USAGE

{
    # Read LF-terminated records and strip an optional CR by hand, so that
    # both Windows (CR LF) and Unix (LF) files are parsed line by line.  With
    # a fixed CR LF separator an LF-only file would arrive as one huge record.
    # "local" restores the previous separator when this block is left.
    local $/ = "\012";

    while ( my $line = <> ) {
        $line =~ s/\015?\012\z//;

        # A blank record (for example a trailing empty line) would otherwise
        # register an empty-string population, locus and sample.
        next unless $line =~ /\S/;

        my ( $locus_name, $locus_id, $sample_id, $pop_id, $allele )
            = split /\t/, $line;

        $population{$pop_id}++;
        $max_pop = $pop_id if $pop_id > $max_pop;

        $locus{$locus_id} = $locus_name;
        $sampled_alleles{$sample_id}->{$pop_id}->{$locus_id} = $allele;
    }
}

# Write the FSTAT input file(s).
#
# The output file(s) contain every individual (sample), listed under the
# population it was sampled in, at all the loci, with 0 for missing
# datapoints.
#
# Format is:
# Header of:
# 6  3  2  1
# locus1
# locus2
# locus3
#
# i.e.
# largest population number, number of loci in this file, max allele number,
# digits in allele
# then one line per locus giving its ID (the New_SNP_ID input column)
# NOTE(review): the original note says locus names are limited to 6 chars;
# nothing here enforces that.
#
# followed by a row for each individual of each population of:
# 1  12  12  0
# (fields separated by two spaces)
#
# i.e. population number, then alleles at locus 1, alleles at locus2, etc
#
# At most $MAX_LOCI_PER_FILE loci go into one file, so the data is chunked by
# locus into <input name>.1.DAT, <input name>.2.DAT, ...

my @populations = sort { $a <=> $b } keys %population;
my @loci        = sort { $a <=> $b } keys %locus;
my @samples     = sort keys %sampled_alleles;

my $filecounter = 0;

while (@loci) {
    my @current_loci = splice( @loci, 0, $MAX_LOCI_PER_FILE );
    $filecounter++;

    # $ARGV still holds the name of the last input file read through <>.
    # Three-argument open keeps that name from being parsed as part of the
    # open mode.
    my $outfile = "$ARGV.$filecounter.DAT";
    open( my $fstat, '>', $outfile )
        or die "Can't open file $outfile: $!\n";

    # header
    print {$fstat} join( "  ", $max_pop, scalar(@current_loci), 2, 1 ), $EOL;

    # header loci names
    print {$fstat} $_, $EOL for @current_loci;

    # main data - a row of alleles for each individual of each population
    for my $pop (@populations) {
        for my $sample (@samples) {

            # Only individuals actually recorded in this population get a
            # row; otherwise every sample would show up as an all-zero
            # phantom row under every other population.
            next unless exists $sampled_alleles{$sample}{$pop};

            my $allele_at = $sampled_alleles{$sample}{$pop};
            print {$fstat}
                join( "  ", $pop, map { $allele_at->{$_} || '0' } @current_loci ),
                $EOL;
        }
    }

    # Buffered write errors (disk full, ...) only surface at close.
    close($fstat) or die "Can't close file $outfile: $!\n";
}

