#! /usr/local/bin/perl
#
################################################################################
# Requirements                                                                 #
################################################################################
#
require 5.002;
#require "GUSI.ph";   #MacOS
#
# External References
#
use Carp;
use FileHandle;
use File::Path;
use File::Basename;
use Getopt::Std;    #UNIX

use vars qw($opt_d $opt_n $opt_h);
#
################################################################################
# Main program default variables                                               #
################################################################################
#
my $max_width = 300; #number of bases scanned up- and downstream of each 5mC
#
################################################################################
# Main program global variables                                                #
################################################################################
#
my $seq          ;
my $filelist     ;
my $file         ;
my @files        ;
my $seqname      ;
my $umgeb_C      ;
my @umgeb_block  ;
my @r_umgeb_block;
my $umgeb_feld   ;
my $z         = 0;
my $pos       = 0;
my $counter   = 0;
my $next      = 0;
my $frame     = 0;
my $laenge    = 0;
my $position  = 0;
my $summe     = 0;
my $Hsumme    = 0;
my $max_H     = 0;
my $C_anzahl  = 0;
my $c_anzahl  = 0;
my $a_anzahl  = 0;
my $t_anzahl  = 0;
my $g_anzahl  = 0;
my $n_anzahl  = 0;
my $f_a       = 0;
my $f_t       = 0;
my $f_c       = 0;
my $f_C       = 0;
my $f_g       = 0;
my $e         = 0;
$|            = 1;            #print immediately
#
# MAN page
#
my $MAN = <<MAN;
NAME

     InfoScan: determines information content around 5mC

AVAILABILITY

     Requires perl 5.002

DESCRIPTION

     In order to identify conserved sequence patterns around methylated 
     cytosines the program "InfoScan" determines the frequency of each 
     nucleotide for positions up to $max_width (default = 300) bp up- and 
     downstream of these sites. The information content R of each position
     (Schneider, 1986) is calculated. e is a correction factor, H(max.) 
     the maximum uncertainty 2.32 (for equal frequency of each base) and 
     H the uncertainty at each position.If the number of nucleotides
     per position is greater than 50, the correction factor e(pos.) can 
     be approximated with a simplified formula (Schneider, 1990; 
     Schneider, 1986) To incorporate the overall distribution of the five 
     nuclotides in the investigated region the relative entropy H'(Durbin, 
     1998) is calculated.
     
SYNOPSIS

     InfoScan.pl [-d directory] [-n] output file [-h]
     (replaced by file selector box under MacOS)

OPTIONS

    -d string dircetory containing input seq1-files
    -h string output file name
    -h        Prints this message
 
AUTHOR(S)

     Christoph Grunau (cgrunau\@imb-jena.de)

CHANGE LOG

     Nov.   1st, 1998 CG   Initial version
     Oct.   1st, 1999 CG   minor revisions
MAN
#
# Command line parsing 
#
getopts('d:n:h') || croak($MAN); #UNIX
#$opt_d = &GetFolder("Choose input folder");    #MacOS
#$opt_n = &PutFile("Choose output info file"); #MacOS

#
# Check command line arguments
#
croak($MAN) if ( $opt_h  || ! defined($opt_d) || $opt_d eq "" );



#----------------------------------------------------------------
#INFORMATION CONTENT UPSTREAM OF 5MC-SITES:

for ($z=0; $z<=$max_width+1; $z++)
    {
    $umgeb_feld[$z][1] = 0;  # set variables for a,t,g,c,C and n to zero
    $umgeb_feld[$z][2] = 0;
    $umgeb_feld[$z][3] = 0;
    $umgeb_feld[$z][4] = 0;
    $umgeb_feld[$z][5] = 0;
    $umgeb_feld[$z][6] = 0;
    
    $H[$z][1] = 0;
    $H[$z][2] = 0;
    $H[$z][3] = 0;
    $H[$z][4] = 0;
    $H[$z][5] = 0;
    $H[$z][6] = 0;
    
    $Hr[$z][1] = 0;
    $Hr[$z][2] = 0;
    $Hr[$z][3] = 0;
    $Hr[$z][4] = 0;
    $Hr[$z][5] = 0;
    $Hr[$z][6] = 0;
    
    $f[$z][1] = 0;
    $f[$z][2] = 0;
    $f[$z][3] = 0;
    $f[$z][4] = 0;
    $f[$z][5] = 0;
    $f[$z][6] = 0;
    }

chdir $opt_d;
@files = <*.seq1>;    #write all .seq1-files into a list
if ($#files <0) {die "Sorry! No files available. Please try option -h.\n"};
print "Scanning information content upstream of 5mC\n";
foreach $filelist(@files)
          {
	         $seq = "-";
          #consequtive transfer of sequences
	         open (SEQUENCE, "<$filelist") || die ("Cannot open file '$filelist'.");
	         print (".");                    #show program is running
	         while (<SEQUENCE>)
	              {
	             	#
             		#remove Newline, make sequences from lines, ignore FASTA-name
             		#
             		$file = $_;
             		chomp $file;
             		unless ($file =~ /^>/){$seq = $seq . $file;}
             		$seq =~ s/-//g; #replace "-"
             		}
          close SEQUENCE || die ("Cannot close open file '$filelist'.");
          $laenge = length($seq);
          $next = $laenge;
          #
          # determine global base number
          #
          while ($seq =~ /C/g)   {$C_anzahl++;}
          while ($seq =~ /c/g)   {$c_anzahl++;}
          while ($seq =~ /a/g)   {$a_anzahl++;}
          while ($seq =~ /t/g)   {$t_anzahl++;}
          while ($seq =~ /g/g)   {$g_anzahl++;}
          while ($seq =~ /n/g)   {$n_anzahl++;}
	         #
	         # find 5mC positions 
	         #
	         while ($seq)
	              {
            	  $position = rindex ($seq,"C",$next);
            	  $next = $position - 1;
	              if ($position == -1) {last;}
	  
	              $umgeb_C = substr($seq,0,$position+1);
            	  #
            	  #transform surrounding from string to list
            	  #
            	  @r_umgeb_block = split (//, $umgeb_C); 
            	  @umgeb_block = reverse(@r_umgeb_block); #reverse list
            	  $i = 0;
            	  for (
	                  $i=0;
	                  $i<($position);  #from current 5mC until end  
	                  $i++
	                  )
	                  {
	                   if ($umgeb_block[$i] eq "a") {$umgeb_feld[$i][1]++;} 
	                elsif ($umgeb_block[$i] eq "t") {$umgeb_feld[$i][2]++;}
	                elsif ($umgeb_block[$i] eq "g") {$umgeb_feld[$i][3]++;}
	                elsif ($umgeb_block[$i] eq "c") {$umgeb_feld[$i][4]++;}
	                elsif ($umgeb_block[$i] eq "C") {$umgeb_feld[$i][5]++;}
	                elsif ($umgeb_block[$i] eq "n") {$umgeb_feld[$i][6]++;} 
	                  }
	               }
          }
print "\n";
#
# Write results to table file.
#
$path = `pwd`;
chomp $path;

if (!defined $opt_n)
   {
    ($out_file = $path) =~ s#.*[/:]##;    #from path name end to last / or :
    }

else {
     $opt_n =~ tr/a-zA-Z0-9_\.\///cd;  #UNIX
     $out_file  =  $opt_n;
     }

if ($out_file !~ /\//) {$out_file =  substr ($out_file,0,28);}        #UNIX
unless ($out_file =~ /\.inf$/i) {$out_file = $out_file.".inf";}
open (INFO, ">$out_file") || die ("Cannot open '$out_file'.");

# determine globale frequencies of C (represents 5mC),c,t,g,a

 $f_a = $a_anzahl/($a_anzahl+$t_anzahl+$c_anzahl+$C_anzahl+$g_anzahl);
 $f_t = $t_anzahl/($a_anzahl+$t_anzahl+$c_anzahl+$C_anzahl+$g_anzahl);
 $f_c = $c_anzahl/($a_anzahl+$t_anzahl+$c_anzahl+$C_anzahl+$g_anzahl);
 $f_C = $C_anzahl/($a_anzahl+$t_anzahl+$c_anzahl+$C_anzahl+$g_anzahl);
 $f_g = $g_anzahl/($a_anzahl+$t_anzahl+$c_anzahl+$C_anzahl+$g_anzahl);

printf INFO
"%6s %6s %6s %6s %6s %6s %6s %6s %6s %6s\n",
"Pos.","f(a)","f(t)","f(g)","f(c)","f(C)","f(n)","R","H'","Cov.";
$z = 0; 
for    
   (
   $z=$max_width;
   $z>0;
   $z=$z-1
   )
   {
   $summe = $umgeb_feld[$z][1] +
            $umgeb_feld[$z][2] +
       	    $umgeb_feld[$z][3] +
       	    $umgeb_feld[$z][4] +
       	    $umgeb_feld[$z][5] +
       	    $umgeb_feld[$z][6];
   if ($summe == 0) {last};
   
   $f[$z][1] = $umgeb_feld[$z][1]/$summe;
   $f[$z][2] = $umgeb_feld[$z][2]/$summe;
   $f[$z][3] = $umgeb_feld[$z][3]/$summe;
   $f[$z][4] = $umgeb_feld[$z][4]/$summe;
   $f[$z][5] = $umgeb_feld[$z][5]/$summe;
   $f[$z][6] = $umgeb_feld[$z][6]/$summe;
  
   if ($umgeb_feld[$z][1] > 0 && $f_a > 0)
   {$H[$z][1] = $umgeb_feld[$z][1]/$summe*(log($umgeb_feld[$z][1]/$summe)/log(2));
   $Hr[$z][1] = $umgeb_feld[$z][1]/$summe*(log(($umgeb_feld[$z][1]/$summe)/$f_a)/log(2))};
   if ($umgeb_feld[$z][2] > 0 && $f_t > 0)
   {$H[$z][2] = $umgeb_feld[$z][2]/$summe*(log($umgeb_feld[$z][2]/$summe)/log(2));
   $Hr[$z][2] = $umgeb_feld[$z][2]/$summe*(log(($umgeb_feld[$z][2]/$summe)/$f_t)/log(2))};
   if ($umgeb_feld[$z][3] > 0 && $f_g > 0 )
   {$H[$z][3] = $umgeb_feld[$z][3]/$summe*(log($umgeb_feld[$z][3]/$summe)/log(2));
   $Hr[$z][3] = $umgeb_feld[$z][3]/$summe*(log(($umgeb_feld[$z][3]/$summe)/$f_g)/log(2))};
   if ($umgeb_feld[$z][4] > 0 && $f_c > 0)
   {$H[$z][4] = $umgeb_feld[$z][4]/$summe*(log($umgeb_feld[$z][4]/$summe)/log(2));
   $Hr[$z][4] = $umgeb_feld[$z][4]/$summe*(log(($umgeb_feld[$z][4]/$summe)/$f_c)/log(2))};
   if ($umgeb_feld[$z][5] > 0 && $f_C > 0)
   {$H[$z][5] = $umgeb_feld[$z][5]/$summe*(log($umgeb_feld[$z][5]/$summe)/log(2));
   $Hr[$z][5] = $umgeb_feld[$z][5]/$summe*(log(($umgeb_feld[$z][5]/$summe)/$f_C)/log(2))};
   if ($umgeb_feld[$z][6] > 0 && $f_n > 0)
   {$H[$z][6] = $umgeb_feld[$z][6]/$summe*(log($umgeb_feld[$z][6]/$summe)/log(2));
   $Hr[$z][6] = $umgeb_feld[$z][6]/$summe*(log(($umgeb_feld[$z][6]/$summe)/$f_n)/log(2))};
   $Hsumme = $H[$z][1] +
             $H[$z][2] +
	            $H[$z][3] +
	            $H[$z][4] +
	            $H[$z][5];
   
   $rel_H  = $Hr[$z][1] +
             $Hr[$z][2] +
	            $Hr[$z][3] +
	            $Hr[$z][4] +
	            $Hr[$z][5];
	     

#
# "n" for max. entropy not considered:-> binaery logarythm of 1/5
#
$max_H = -(log(1/5))/(log(2));
#
# since it's difficult to determine exact error, estimation is used:
#
$e = ((5-1)/2*log(2)/$summe);
   	 
printf INFO
"%6d %6.2f %6.2f %6.2f %6.2f %6.2f %6.2f %6.2f %6.2f %6d\n",
"-".$z,
$f[$z][1],
$f[$z][2],
$f[$z][3],
$f[$z][4],
$f[$z][5],
$f[$z][6],
$max_H + $Hsumme - $e,
$rel_H,
$summe;
}

#----------------------------------------------------------------
#INFORMATION CONTENT DOWNSTREAM OF 5MC-SITES:

for ($z=0; $z<=$max_width+1; $z++)
    {
    $umgeb_feld[$z][1] = 0;  #set variables for a,t,g,c,C and n back to zero
    $umgeb_feld[$z][2] = 0;
    $umgeb_feld[$z][3] = 0;
    $umgeb_feld[$z][4] = 0;
    $umgeb_feld[$z][5] = 0;
    $umgeb_feld[$z][6] = 0;
    
    $H[$z][1] = 0;
    $H[$z][2] = 0;
    $H[$z][3] = 0;
    $H[$z][4] = 0;
    $H[$z][5] = 0;
    $H[$z][6] = 0;
    
    $Hr[$z][1] = 0;
    $Hr[$z][2] = 0;
    $Hr[$z][3] = 0;
    $Hr[$z][4] = 0;
    $Hr[$z][5] = 0;
    $Hr[$z][6] = 0;
    
    $f[$z][1] = 0;
    $f[$z][2] = 0;
    $f[$z][3] = 0;
    $f[$z][4] = 0;
    $f[$z][5] = 0;
    $f[$z][6] = 0;
    }
    
    @files = <*.seq1>;         #write all .seq1-files into a list
    print "Scanning information content downstream of 5mC\n";
    foreach $filelist(@files)
          {
       	  $seq = "-";
       	  open (SEQUENCE, "<$filelist") || die ("Cannot open file '$filelist'.");
       	  print (".");                    #show program is running
	         while (<SEQUENCE>)
	               {
		              #
	              	# read line by line, cut Newline, make sequences from lines
	              	# ignore FASTA-names, remove all "-"
              		$file = $_;
              		chomp $file;
              		unless ($file =~ /^>/){$seq = $seq . $file}
              		$seq =~ s/-//g;
              		}
	        close SEQUENCE || die ("Cannot close open file '$filelist'.");
	        $laenge = length($seq);
	  
	        while ($seq)
	              {
	              #
	              # determine stepwise positions of 5mCs
	              $next = $position + 1;
	              $position = index ($seq,"C",$next);
	              if ($position == -1) {last};
	              $umgeb_C = substr($seq,$position,$laenge);
	              #
            	  # transform surrounding from string to list
            	  #
            	  @umgeb_block = split (//, $umgeb_C); 
	              $i = 0;
	              for (
	                  $i=0;
	                  $i<($laenge-$position);  #from 5mC position to the end
	                  $i++
	                  )
	                  {
                   if ($umgeb_block[$i] eq "a") {$umgeb_feld[$i][1]++;} 
	               elsif ($umgeb_block[$i] eq "t") {$umgeb_feld[$i][2]++;}
	               elsif ($umgeb_block[$i] eq "g") {$umgeb_feld[$i][3]++;}
	               elsif ($umgeb_block[$i] eq "c") {$umgeb_feld[$i][4]++;}
         	      elsif ($umgeb_block[$i] eq "C") {$umgeb_feld[$i][5]++;}
         	      elsif ($umgeb_block[$i] eq "n") {$umgeb_feld[$i][6]++;} 
              	   }
	               }
          }
print "\n";
$z = 0; 
for    
   (
   $z=0;
   $z<$max_width;
   $z++
   )
   {
   $summe = $umgeb_feld[$z][1] +
            $umgeb_feld[$z][2] +
	           $umgeb_feld[$z][3] +
	           $umgeb_feld[$z][4] +
	           $umgeb_feld[$z][5] +
	           $umgeb_feld[$z][6];
   if ($summe == 0) {last};
   
   $f[$z][1] = $umgeb_feld[$z][1]/$summe;
   $f[$z][2] = $umgeb_feld[$z][2]/$summe;
   $f[$z][3] = $umgeb_feld[$z][3]/$summe;
   $f[$z][4] = $umgeb_feld[$z][4]/$summe;
   $f[$z][5] = $umgeb_feld[$z][5]/$summe;
   $f[$z][6] = $umgeb_feld[$z][6]/$summe;
  
   if ($umgeb_feld[$z][1] > 0 && $f_a > 0)
   {$H[$z][1] = $umgeb_feld[$z][1]/$summe*(log($umgeb_feld[$z][1]/$summe)/log(2));
    $Hr[$z][1] = $umgeb_feld[$z][1]/$summe*(log(($umgeb_feld[$z][1]/$summe)/$f_a)/log(2))};
   if ($umgeb_feld[$z][2] > 0 && $f_t > 0)
   {$H[$z][2] = $umgeb_feld[$z][2]/$summe*(log($umgeb_feld[$z][2]/$summe)/log(2));
    $Hr[$z][2] = $umgeb_feld[$z][2]/$summe*(log(($umgeb_feld[$z][2]/$summe)/$f_t)/log(2))};
   if ($umgeb_feld[$z][3] > 0 && $f_g > 0)
   {$H[$z][3] = $umgeb_feld[$z][3]/$summe*(log($umgeb_feld[$z][3]/$summe)/log(2));
    $Hr[$z][3] = $umgeb_feld[$z][3]/$summe*(log(($umgeb_feld[$z][3]/$summe)/$f_g)/log(2))};
   if ($umgeb_feld[$z][4] > 0 && $f_c > 0)
   {$H[$z][4] = $umgeb_feld[$z][4]/$summe*(log($umgeb_feld[$z][4]/$summe)/log(2));
    $Hr[$z][4] = $umgeb_feld[$z][4]/$summe*(log(($umgeb_feld[$z][4]/$summe)/$f_c)/log(2))};
   if ($umgeb_feld[$z][5] > 0 && $f_C > 0)
   {$H[$z][5] = $umgeb_feld[$z][5]/$summe*(log($umgeb_feld[$z][5]/$summe)/log(2));
    $Hr[$z][5] = $umgeb_feld[$z][5]/$summe*(log(($umgeb_feld[$z][5]/$summe)/$f_C)/log(2))};
   if ($umgeb_feld[$z][6] > 0 && $f_n > 0)
   {$H[$z][6] = $umgeb_feld[$z][6]/$summe*(log($umgeb_feld[$z][6]/$summe)/log(2));
    $Hr[$z][6] = $umgeb_feld[$z][6]/$summe*(log(($umgeb_feld[$z][6]/$summe)/$f_n)/log(2))};
   $Hsumme = $H[$z][1] +
             $H[$z][2] +
	            $H[$z][3] +
	            $H[$z][4] +
	            $H[$z][5];
   $rel_H  = $Hr[$z][1] +
             $Hr[$z][2] +
	            $Hr[$z][3] +
	            $Hr[$z][4] +
	            $Hr[$z][5];     


#
# "n" for max. entropy not considered:-> binaery logarythm of 1/5
#
$max_H = -(log(1/5))/(log(2));

#
# since it's difficult to determine exact error estimation is used:
#
$e = ((5-1)/2*log(2)/$summe);

#
#write results into table:
# 
printf INFO
"%6d %6.2f %6.2f %6.2f %6.2f %6.2f %6.2f %6.2f %6.2f %6d\n",
$z,
$f[$z][1],
$f[$z][2],
$f[$z][3],
$f[$z][4],
$f[$z][5],
$f[$z][6],
$max_H + $Hsumme - $e,
$rel_H,
$summe;

}

print INFO "\n";

print INFO "Sequenzes analyzed in \n",`pwd`,"\n"; #path of analyzed sequences

print  INFO "F(a) = ",$f_a,"\n"; # global nucleotide frequencies
print  INFO "F(t) = ",$f_t,"\n";
print  INFO "F(g) = ",$f_g,"\n";
print  INFO "F(c) = ",$f_c,"\n";
print  INFO "F(C) = ",$f_C,"\n";
print  INFO "\n";


print INFO <<'Ende';
-------------------

F(x) = global frequency of a base
f(x) = frequency of a base at position Pos.
R    = decrease in uncertainty at position Pos. in bits (1)
       - correction factor approximated according to (2)
       Maximum uncertainty for 5 bases = -(log(1/5))/(log(2))
       = aprx. -2.32
H'   = relative entropy (3)
       Takes global base frequency into account. Will be 0 when
       bases occure at a given position exactly with their global frequency.
Cov. = number of bases analysed per position ("Coverage").
       This number should be >50

Ref.:
(1) Schneider TD, Stephens RM: 
    Nucl.Acid.Res.,16: 6097-6100 (1990)
(2) Schneider TD, Stormo, GD, Gold L, Ehrenfeucht A: 
    J.Mol.Biol., 188:415-431 (1986)
(3) Durbin R, et.al:
    Biological sequence analysis: 308ff. (1998)
Ende

close (INFO) || die ("Cannot close open file '$out_file'");
print "\nResults written into file '$out_file'.\n";
##############################################################################
# SUBROUTINES                                                                #
# part of the Standard File Package Utility for MacPerl 4.1.1                #
#                                                                            #
# 1994.01.05 v4.1.1 Matthias Neeracher <neeri@iis.ee.ethz.ch>                #
#  Minor changes to reflect future plans for standard file support.          #
#                                                                            #
# 1993.10.27 v1.2	wm                                                         #
#	Change the calling syntax to adopt the 4.1.0 release.                      #
#                                                                            #
# 1993.10.19 v1.1	wm                                                         #
#	convert for 4.1b6                                                          #
#                                                                            #
# 1993.8.10  V1.0                                                            #
#     Watanabe Maki (Watanabe.Maki@tko.dec.com)                              #
#                                                                            #
##############################################################################
# Name
#    PutFile/GetNewFile
# Syntax
#    $filename = &PutFile($prompt [, $default]);
#    $filename = &GetNewFile($prompt [, $default]);
# Description
#    Query a new file name to user by Standard File Dialog Box.
#    $prompt is a prompt sting on the dialog box.
#    $default is a default file name.
#
#  sub PutFile {
#      local($prompt, $default) = @_;
#      
#      &MacPerl'Choose(
#          &GUSI'AF_FILE,         # domain
#          0,                     # type
#          $prompt,               # prompt
#          "",                    # constraint
#          &GUSI'CHOOSE_NEW + ($default ?
#  &GUSI'CHOOSE_DEFAULT : 0),    
#  		  								 # flag 
#          $default               # default filename
#          );
#  }
######
# Name
#    GetFolder
# Syntax
#    $foldername = &GetFolder($prompt [, $default]);
# Description
# Query a folder name to user by Standard File Dialog Box.
# $default is the default dialog
#
#  sub GetFolder {
#  	local($prompt, $default) = @_;
#  	
#      &MacPerl'Choose(
#          &GUSI'AF_FILE,          # domain
#          0,                      # type
#          $prompt,                # prompt
#          "",                     # constraint
#          &GUSI'CHOOSE_DIR + ($default ?
#  &GUSI'CHOOSE_DEFAULT : 0),
#  		                          # flag
#  		  $default
#          );
#  }
#  
