#!/usr/local/bin/perl -w

# Maureen Liu, Sanger Institute, Oct/2008
# CGI_compare_1.pl

# Use Ensembl API to: 
# For human, mouse, and opossum
# 1. Count the number of X-linked protein-coding genes
# 2. Count the number of genes with 5' CpG island
# 3. Extract relevant gene and CpG information
# 4. Find the orthologous genes in the other species

# Run programme once for each species, comment and uncomment relevant lines

# Output1: tab-delimited gene and island information: 
#   Gene info: External name, Ensembl ID, position, orthologous genes
#   Island info: position, size, distance to gene, %GC, %CpG, CpG obs/exp

# Output2: CGI sequences in FASTA

# Current database version: Ensembl v50, Jul/2008


use Bio::Seq;
use Bio::SeqIO;
use Bio::EnsEMBL::Registry;


# Output files
#$output1="Gene_CGI_human.txt"; $output2="CGI_seq_human.seq";   # human
$output1="Gene_CGI_mouse.txt"; $output2="CGI_seq_mouse.seq";   # mouse
#$output1="Gene_CGI_opossum.txt"; $output2="CGI_seq_opossum.seq";   # opossum
open (OUTPUT, ">$output1") or die "Can't open output:$output1\n";
my $seqout = Bio::SeqIO->new(-file => ">$output2", 
                             -format => "fasta" );


my $registry = 'Bio::EnsEMBL::Registry';

$registry->load_registry_from_db(
    -host => 'ensembldb.ensembl.org',
    -user => 'anonymous'
);

my @db_adaptors = @{ $registry->get_all_DBAdaptors() };

while (my $db_adaptor = shift @db_adaptors) {
    my $db_connection = $db_adaptor->dbc();
#    printf(
#        "species/group\t%s/%s\ndatabase\t%s\nhost:port\t%s:%s\n\n",
#        $db_adaptor->species(),   $db_adaptor->group(),
#        $db_connection->dbname(), $db_connection->host(),
#        $db_connection->port()
#    );
}


# fetch all cpgs and genes on the human/mouse XCR or opossum X

#my $slice_adaptor = $registry->get_adaptor( 'human', 'Core', 'Slice' );   # human
my $slice_adaptor = $registry->get_adaptor( 'mouse', 'Core', 'Slice' );   # mouse
#my $slice_adaptor = $registry->get_adaptor( 'opossum', 'Core', 'Slice' );   # opossum

#my $slice = $slice_adaptor->fetch_by_region( 'chromosome', 'X', '46846600');   # human XCR
my $slice = $slice_adaptor->fetch_by_region( 'chromosome', 'X');   # mouse X
#my $slice = $slice_adaptor->fetch_by_region( 'chromosome', 'X' );   # opossum X

#my @genes = @{$slice->get_all_Genes_by_type('protein_coding')};   # human/opossum

## mouse XCR: 5528005-7913274,146944902-149834200,92243232-144030513,73030083-73433137
##    20977991-23384329,33381706-58715089,58730773-72823991
my @mouse_XCR = qw(5528005 7913274 146944902 149834200 92243232 144030513 73030083 73433137 20977991 23384329 33381706 58715089 58730773 72823991);
my @genes;   
for (my $i=0; $i<7; $i++) {
  my $subslice = $slice->sub_Slice($mouse_XCR[$i*2], $mouse_XCR[$i*2+1]);
  my @subgenes = @{$subslice->get_all_Genes_by_type('protein_coding')};
  push (@genes, @subgenes);
}
## mouse----------------------

@genes = sort {$a->start <=> $b->start } @genes;
print "Have " . scalar(@genes) . " protein-coding genes on XCR\n\n";   # human/mouse
#print "Have " . scalar(@genes) . " protein-coding genes on X\n";   # opossum

my @cpgs = @{$slice->get_all_SimpleFeatures('CpG')};
print "Have " . scalar(@cpgs) . " CGIs on XCR\n\n";   # human/mouse
#print "Have " . scalar(@cpgs) . " CGIs on X\n\n";   # opossum


# Method for finding genes with cpg within 5000 bp:
# for each gene, expand the slice 5000 bp 5' direction
# fetch all CGIs on that slice
# if a CGI start is within 5100 bp of slice start 
# (i.e. no more than 100 bp downstream of gene start)
# then it is a 5' cpg

my $j=0;  # CGI-associated gene counter

#my @heading = qw(Species Gene EnsemblID Pos Mouse_ortho Opossum_ortho CGIpos size Distance %GC %CpG CpGo/e);   # human
my @heading = qw(Species Gene EnsemblID Pos Human_ortho Opossum_ortho CGIpos size Distance %GC %CpG CpGo/e);   # mouse
#my @heading = qw(Species Gene EnsemblID Pos Human_ortho Mouse_ortho CGIpos size Distance %GC %CpG CpGo/e);   # opossum
my $heading = join("\t",@heading);
print OUTPUT "$heading\n";

while (my $gene = shift @genes) {

  # extract gene information
  my $genename = $geneID = $gene->stable_id;        
  if ($gene->external_name) { $genename = $gene->external_name; }
  my $genepos = $gene->feature_Slice->name;

  
  # collect all X-linked orthologues in the other two species
  my $homolrefs = $gene->get_all_homologous_Genes;
  my (@hgene1, @hgene2) = ();
    
  while (my $href = shift @$homolrefs) {
      
    my ($hgene, $homol, $species) = @$href;
     
#    my $ortho1 = 'Mus'; my $ortho2 = 'Monodel';   # human
    my $ortho1 = 'Homo'; my $ortho2 = 'Monodel';   # mouse
#    my $ortho1 = 'Homo'; my $ortho2 = 'Mus';   # opossum
      
    # find first orthologue
    if ($species =~ /$ortho1/ && $homol->description =~ /ortholog/ && $hgene->feature_Slice->seq_region_name eq 'X') {
      my $hgenename = $hgene->stable_id;
      if ($hgene->external_name) { $hgenename = $hgene->external_name; }
      push (@hgene1,$hgenename);
    }
      
    # find second orthologue
    if ($species =~ /$ortho2/ && $homol->description =~ /ortholog/ && $hgene->feature_Slice->seq_region_name eq 'X') {
      my $hgenename = $hgene->stable_id;
      if ($hgene->external_name) { $hgenename = $hgene->external_name; }
      push (@hgene2,$hgenename);
    }
    
  }
	  
        
  # print gene, orthologues, and CGI info to table file
#  my $species = 'human';
  my $species = 'mouse';
#  my $species = 'opossum';
  
  my $hgene1 = join(",",@hgene1);
  my $hgene2 = join(",",@hgene2);
  
  # check for 5' CGI
  my $cpgpos = $cpgsize = $cpgdistance = $gc = $cpg_freq = $cpg_oe = '';
  my $slice = $gene->feature_Slice->expand(5000);
  my @cpgs = @{$slice->get_all_SimpleFeatures('CpG')};

  while (my $cpg = shift @cpgs) {
    if ($cpg->start < 5100) {

      if ($cpgpos) {print "extra CGI: "}
      
      # extract island information
      $cpgpos = $cpg->feature_Slice->name;
      $cpgsize = $cpg->end - $cpg->start;
      $cpgdistance = 5000 - $cpg->end;
      my $cpgseq = $slice->subseq($cpg->start, $cpg->end);
      	
      # calculate %GC, %CpG, CpG obs/exp
      $gc = &gc($cpgseq);
      $cpg_freq = &cpgobs($cpgseq);
      $cpg_oe = &nearest($cpg_freq/&cpgexp($cpgseq), 0.01);
      $cpg_freq = &nearest($cpg_freq*100, 0.1);
      
      # write CGI sequence to seq file
      my $seq = Bio::Seq->new (-seq=>$cpgseq,
                               -display_id=>"$genename:CGI",
	                       -alphabet=>"dna");
      $seqout->write_seq($seq);
      
      print "\nGene $genename has island $cpgpos $cpgdistance bp upstream\n";
      print OUTPUT "$species\t$genename\t$geneID\t$genepos\t$hgene1\t$hgene2\t$cpgpos\t$cpgsize\t$cpgdistance\t$gc\t$cpg_freq\t$cpg_oe\n";

    }
  }

  if ($cpgpos) {$j++;}
    else {print OUTPUT "$species\t$genename\t$geneID\t$genepos\t$hgene1\t$hgene2\t$cpgpos\t$cpgsize\t$cpgdistance\t$gc\t$cpg_freq\t$cpg_oe\n";}
    
}

print "\nDone. Have $j cpg-associated genes.\n";


#=================SUBROUTINES==================

sub gc {   
  # calculate %GC by calculating length after deleting all g and c
  my ($seq) = shift;
  $seq =~ s/\s//g;
  my $fulllength = length($seq);
  $seq =~s/(c|g)//gi;
  my $gc = &round( (1 - length($seq)/$fulllength) * 100 );
  return $gc;  
}

sub cpgobs {  
  # calculate frequency of CpG by calculating length after deleting all cg
  my ($seq) = shift;
  $seq =~ s/\s//g;
  my $fulllength = length($seq);
  $seq =~s/cg//gi;
  my $cpg = (1 - length($seq)/$fulllength)/2;
  return $cpg;  
}

sub cpgexp {
  # calculate expected freq of CpG based on c and g freq
  my ($seq) = shift;
  $seq =~ s/\s//g;
  my $fulllength = length($seq);
  $seq =~s/c//gi;
  my $c = length($seq);
  my $c_freq = 1 - $c/$fulllength;
  $seq =~s/g//gi;
  my $g_freq = ($c - length($seq))/$fulllength;
  my $cpgexp = $c_freq * $g_freq;
  return $cpgexp;
}

sub round {   # round numbers
  my $number = shift;
  return int($number + 0.5);
}

sub nearest {   # round to nearest 0.1 etc.
  my ($number,$near) = @_;
  return (int($number/$near + 0.5)) * $near;
}


