├── geneSCF-master-v1.0
    ├── test
    │   ├── output
    │   │   └── .geneSCF
    │   ├── sample_gene_list_id
    │   └── sample_gene_list_sym
    ├── test_geneSCF
    ├── class
    │   ├── lib
    │   │   ├── Statistics
    │   │   │   └── Multtest.pm
    │   │   ├── List
    │   │   │   └── Vectorize
    │   │   │   │   └── lib
    │   │   │   │       ├── Statistic.pl
    │   │   │   │       ├── Datatype.pl
    │   │   │   │       ├── Set.pl
    │   │   │   │       ├── Apply.pl
    │   │   │   │       └── IO.pl
    │   │   ├── Tie-IxHash-1.23
    │   │   │   ├── t
    │   │   │   │   ├── pod.t
    │   │   │   │   ├── each-delete.t
    │   │   │   │   └── ixhash.t
    │   │   │   ├── MANIFEST
    │   │   │   ├── Build.PL
    │   │   │   ├── META.yml
    │   │   │   ├── Changes
    │   │   │   ├── META.json
    │   │   │   ├── README
    │   │   │   └── Makefile.PL
    │   │   ├── Text
    │   │   │   ├── NSP.pm
    │   │   │   └── NSP
    │   │   │   │   └── Measures
    │   │   │   │       ├── 2D
    │   │   │   │           ├── Dice
    │   │   │   │           │   ├── jaccard.pm
    │   │   │   │           │   └── dice.pm
    │   │   │   │           ├── Dice.pm
    │   │   │   │           ├── CHI
    │   │   │   │           │   ├── tscore.pm
    │   │   │   │           │   ├── x2.pm
    │   │   │   │           │   └── phi.pm
    │   │   │   │           ├── odds.pm
    │   │   │   │           ├── MI
    │   │   │   │           │   ├── ps.pm
    │   │   │   │           │   ├── tmi.pm
    │   │   │   │           │   ├── pmi.pm
    │   │   │   │           │   └── ll.pm
    │   │   │   │           ├── Fisher
    │   │   │   │           │   ├── twotailed.pm
    │   │   │   │           │   ├── left.pm
    │   │   │   │           │   └── right.pm
    │   │   │   │           ├── Fisher2
    │   │   │   │           │   ├── twotailed.pm
    │   │   │   │           │   ├── left.pm
    │   │   │   │           │   └── right.pm
    │   │   │   │           ├── CHI.pm
    │   │   │   │           └── MI.pm
    │   │   │   │       ├── 3D
    │   │   │   │           └── MI
    │   │   │   │           │   ├── pmi.pm
    │   │   │   │           │   ├── ps.pm
    │   │   │   │           │   ├── tmi.pm
    │   │   │   │           │   └── ll.pm
    │   │   │   │       └── 4D
    │   │   │   │           └── MI
    │   │   │   │               └── ll.pm
    │   │   └── Number
    │   │   │   └── FormatEng.pm
    │   └── functional_class.pl
    ├── README.txt
    └── geneSCF
├── geneSCF-master-v1.1
    └── README.txt
└── README.md


/geneSCF-master-v1.0/test/output/.geneSCF:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/geneSCF-master-v1.0/test_geneSCF:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Smoke test: change to this script's directory first so the relative paths below resolve from anywhere.
3 | cd "$(dirname "${BASH_SOURCE[0]}")" && ./geneSCF -i=./test/sample_gene_list_sym -db=GO_BP -o=./test/output -t=sym


--------------------------------------------------------------------------------
/geneSCF-master-v1.0/class/lib/Statistics/Multtest.pm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/decodebiology/geneSCF_inactive/HEAD/geneSCF-master-v1.0/class/lib/Statistics/Multtest.pm


--------------------------------------------------------------------------------
/geneSCF-master-v1.0/class/lib/List/Vectorize/lib/Statistic.pl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/decodebiology/geneSCF_inactive/HEAD/geneSCF-master-v1.0/class/lib/List/Vectorize/lib/Statistic.pl


--------------------------------------------------------------------------------
/geneSCF-master-v1.0/class/lib/Tie-IxHash-1.23/t/pod.t:
--------------------------------------------------------------------------------
1 | #!perl -T
2 | # Author test: checks the POD of the distribution's files via Test::Pod.
3 | use strict;
4 | use warnings;
5 | use Test::More;
6 | # Test::Pod is optional: load it at run time and skip the whole file when it is missing.
7 | eval "use Test::Pod 1.14";
8 | plan skip_all => "Test::Pod 1.14 required for testing POD" if $@;
9 | all_pod_files_ok();


--------------------------------------------------------------------------------
/geneSCF-master-v1.0/class/lib/Tie-IxHash-1.23/MANIFEST:
--------------------------------------------------------------------------------
 1 | MANIFEST
 2 | README
 3 | Changes
 4 | Makefile.PL
 5 | Build.PL
 6 | lib/Tie/IxHash.pm
 7 | t/ixhash.t
 8 | t/each-delete.t
 9 | t/pod.t
10 | META.yml
11 | META.json
12 | 


--------------------------------------------------------------------------------
/geneSCF-master-v1.0/class/lib/Tie-IxHash-1.23/t/each-delete.t:
--------------------------------------------------------------------------------
 1 | # Regression test for RT#82248: deleting the current key inside an each()
 2 | # loop over a tied Tie::IxHash must not disturb the iteration.
 3 | use strict;
 4 | use warnings;
 5 | use Test::More tests=>2;
 6 | use Tie::IxHash;
 7 | 
 8 | tie my %h, 'Tie::IxHash';
 9 | $h{a} = 1; $h{b} = 2; $h{c} = 3; $h{d} = 4; $h{e} = 5;
10 | 
11 | while (my ($k) = each %h) { 
12 |   if ($k =~ /b|d|e/) { delete $h{$k}; } 
13 | }
14 | 
15 | is(scalar(keys(%h)), 2, 'exactly two keys survive') or diag explain(\%h);
16 | is(join(',',keys(%h)), 'a,c', 'survivors keep insertion order') or diag explain(\%h);


--------------------------------------------------------------------------------
/geneSCF-master-v1.0/class/lib/Tie-IxHash-1.23/Build.PL:
--------------------------------------------------------------------------------
 1 | use strict;
 2 | use Module::Build;
 3 | #created by eumm-migrate.pl
 4 | # Declares the Tie::IxHash distribution (name, prerequisites, licence, metadata) to Module::Build.
 5 | my $build = Module::Build->new(
 6 |   'auto_configure_requires' => 0,
 7 |   'module_name' => 'Tie::IxHash',
 8 |   'requires' => {
 9 |     'perl' => '5.005',
10 |   },
11 |   'build_requires' => {
12 |     'Test::More' => 0,
13 |   },
14 |   'meta_merge' => {
15 |     'resources' => {
16 |       'repository' => 'git://github.com/chorny/Tie-IxHash.git'
17 |     }
18 |   },
19 |   'license' => 'perl',
20 |   'dist_version_from' => 'lib/Tie/IxHash.pm',
21 | );
22 | # Write out the Build script for the configuration above.
23 | $build->create_build_script();
24 | 


--------------------------------------------------------------------------------
/geneSCF-master-v1.0/class/lib/Tie-IxHash-1.23/META.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | abstract: ordered associative arrays for Perl
 3 | author:
 4 |   - Gurusamy Sarathy        gsar@umich.edu
 5 | build_requires:
 6 |   Test::More: 0
 7 | dynamic_config: 1
 8 | generated_by: 'Module::Build version 0.4003, CPAN::Meta::Converter version 2.112621'
 9 | license: perl
10 | meta-spec:
11 |   url: http://module-build.sourceforge.net/META-spec-v1.4.html
12 |   version: 1.4
13 | name: Tie-IxHash
14 | provides:
15 |   Tie::IxHash:
16 |     file: lib/Tie/IxHash.pm
17 |     version: 1.23
18 | requires:
19 |   perl: 5.005
20 | resources:
21 |   license: http://dev.perl.org/licenses/
22 |   repository: git://github.com/chorny/Tie-IxHash.git
23 | version: 1.23
24 | 


--------------------------------------------------------------------------------
/geneSCF-master-v1.1/README.txt:
--------------------------------------------------------------------------------
 1 | Gene Set Clustering based on Functional annotation v1.1
 2 | ----------------------------------------------------------------------------
 3 | GeneSCF v1.1 supports all organisms from KEGG and also from Gene Ontology. This update will have a real-time functional enrichment feature.
 4 | For new updates about GeneSCF, visit hosting page: https://github.com/genescf/GeneSCF
 5 | 
 6 | 
 7 | 
 8 | --------------------------
 9 | Cite any version of GeneSCF using: 
10 | 
11 | Subhash S and Kanduri C. GeneSCF: a real-time based functional enrichment tool with support for multiple organisms. 
12 | BMC Bioinformatics 2016, 17:365, http://www.biomedcentral.com/1471-2105/17/365
13 | 
14 | --------------------------
15 | Author: Santhilal Subhash
16 | santhilal.subhash@gu.se
17 | Last Updated: 2020/10/03
18 | https://github.com/genescf/GeneSCF
19 | 


--------------------------------------------------------------------------------
/geneSCF-master-v1.0/test/sample_gene_list_id:
--------------------------------------------------------------------------------
  1 | 1
  2 | 2
  3 | 3
  4 | 4
  5 | 5
  6 | 6
  7 | 7
  8 | 8
  9 | 9
 10 | 10
 11 | 11
 12 | 12
 13 | 13
 14 | 14
 15 | 15
 16 | 16
 17 | 17
 18 | 18
 19 | 19
 20 | 20
 21 | 21
 22 | 22
 23 | 23
 24 | 24
 25 | 25
 26 | 26
 27 | 27
 28 | 28
 29 | 29
 30 | 30
 31 | 31
 32 | 32
 33 | 33
 34 | 34
 35 | 35
 36 | 36
 37 | 37
 38 | 38
 39 | 39
 40 | 40
 41 | 41
 42 | 42
 43 | 43
 44 | 44
 45 | 45
 46 | 46
 47 | 47
 48 | 48
 49 | 49
 50 | 50
 51 | 51
 52 | 52
 53 | 53
 54 | 54
 55 | 55
 56 | 56
 57 | 57
 58 | 58
 59 | 59
 60 | 60
 61 | 61
 62 | 62
 63 | 63
 64 | 64
 65 | 65
 66 | 66
 67 | 67
 68 | 68
 69 | 69
 70 | 70
 71 | 71
 72 | 72
 73 | 73
 74 | 74
 75 | 75
 76 | 76
 77 | 77
 78 | 78
 79 | 79
 80 | 80
 81 | 81
 82 | 82
 83 | 83
 84 | 84
 85 | 85
 86 | 86
 87 | 87
 88 | 88
 89 | 89
 90 | 90
 91 | 91
 92 | 92
 93 | 93
 94 | 94
 95 | 95
 96 | 96
 97 | 97
 98 | 98
 99 | 99
100 | 100
101 | 
102 | 


--------------------------------------------------------------------------------
/geneSCF-master-v1.0/class/lib/Tie-IxHash-1.23/Changes:
--------------------------------------------------------------------------------
 1 | =head1 NAME
 2 | 
 3 | HISTORY - release history for Tie::IxHash
 4 | 
 5 | =head1 DESCRIPTION
 6 | 
 7 | =over 8
 8 | 
 9 | =item 1.23  (24 February 2013)
10 | 
11 | New method Clear()
12 | 
13 | Deleting current element when doing cycle using each will work (test by OLEG, RT#82248)
14 | 
15 | =item 1.22  (27 February 2010)
16 | 
17 | Build.PL added
18 | 
19 | Better META.yml
20 | 
21 | Distribution upgrade
22 | 
23 | =item 1.21  (5 January 1998)
24 | 
25 | Key()/Values()/Indices() now return a single value when called with single
26 | argument (makes them useful in scalar contexts)
27 | 
28 | =item 1.2   (18 February 1997)
29 | 
30 | Repackaged into a tarball.
31 | 
32 | Added a testsuite.
33 | 
34 | C<s/TieHash/Tie::Hash/g> suggested by Michael De La Rue <miked@ed.ac.uk>.
35 | 
36 | =item 1.1
37 | 
38 | Initial release (ancient).
39 | 
40 | =back
41 | 
42 | =cut
43 | 
44 | 


--------------------------------------------------------------------------------
/geneSCF-master-v1.0/class/lib/Tie-IxHash-1.23/META.json:
--------------------------------------------------------------------------------
 1 | {
 2 |    "abstract" : "ordered associative arrays for Perl",
 3 |    "author" : [
 4 |       "Gurusamy Sarathy        gsar@umich.edu"
 5 |    ],
 6 |    "dynamic_config" : 1,
 7 |    "generated_by" : "Module::Build version 0.4003, CPAN::Meta::Converter version 2.112621",
 8 |    "license" : [
 9 |       "perl_5"
10 |    ],
11 |    "meta-spec" : {
12 |       "url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec",
13 |       "version" : "2"
14 |    },
15 |    "name" : "Tie-IxHash",
16 |    "prereqs" : {
17 |       "build" : {
18 |          "requires" : {
19 |             "Test::More" : 0
20 |          }
21 |       },
22 |       "runtime" : {
23 |          "requires" : {
24 |             "perl" : "5.005"
25 |          }
26 |       }
27 |    },
28 |    "provides" : {
29 |       "Tie::IxHash" : {
30 |          "file" : "lib/Tie/IxHash.pm",
31 |          "version" : "1.23"
32 |       }
33 |    },
34 |    "release_status" : "stable",
35 |    "resources" : {
36 |       "license" : [
37 |          "http://dev.perl.org/licenses/"
38 |       ],
39 |       "repository" : {
40 |          "url" : "git://github.com/chorny/Tie-IxHash.git"
41 |       }
42 |    },
43 |    "version" : "1.23"
44 | }
45 | 


--------------------------------------------------------------------------------
/geneSCF-master-v1.0/test/sample_gene_list_sym:
--------------------------------------------------------------------------------
  1 | SGIP1
  2 | SLC45A1
  3 | NECAP2
  4 | CLIC4
  5 | ADC
  6 | AGBL4
  7 | DAB1
  8 | TGFBR3
  9 | DBT
 10 | PRUNE
 11 | RP11-550P17.5
 12 | RFWD2
 13 | C1orf21
 14 | RP1-272L16.1
 15 | LIN9
 16 | C1orf159
 17 | PRKCZ
 18 | PRDM16
 19 | ICMT
 20 | CAMTA1
 21 | RP5-1056L3.1
 22 | PINK1
 23 | PINK1-AS
 24 | USP48
 25 | EPHB2
 26 | STMN1
 27 | NUDC
 28 | EYA3
 29 | EPB41
 30 | PUM1
 31 | KHDRBS1
 32 | CSMD2
 33 | SFPQ
 34 | THRAP3
 35 | RP5-1180C18.1
 36 | MACF1
 37 | CCDC30
 38 | PTPRF
 39 | RNF220
 40 | GPBP1L1
 41 | TRABD2B
 42 | FAF1
 43 | RAB3B
 44 | SCP2
 45 | TCEANC2
 46 | USP24
 47 | FGGY
 48 | NFIA
 49 | USP1
 50 | ITGB3BP
 51 | CACHD1
 52 | LEPR
 53 | LRRC7
 54 | NEGR1
 55 | ST6GALNAC3
 56 | PIGK
 57 | LPHN2
 58 | DDAH1
 59 | CLCA4
 60 | RP11-76N22.2
 61 | LRRC8C
 62 | FAM69A
 63 | GCLM
 64 | RP4-639F20.1
 65 | RP11-147C23.1
 66 | RP11-202K23.1
 67 | NTNG1
 68 | GNAI3
 69 | KCNA2
 70 | RAP1A
 71 | RHOC
 72 | PPM1J
 73 | PHTF1
 74 | WDR3
 75 | NOTCH2
 76 | RP6-206I17.1
 77 | PDZK1
 78 | RP11-495P10.2
 79 | OTUD7B
 80 | SPRR2B
 81 | TPM3
 82 | GBAP1
 83 | SMG5
 84 | FCGR2A
 85 | PBX1
 86 | RP11-466F5.6
 87 | MROH9
 88 | DNM3
 89 | RP1-15D23.2
 90 | TNFSF18
 91 | TNN
 92 | FAM5B
 93 | RASAL2
 94 | SOAT1
 95 | ACBD6
 96 | CACNA1E
 97 | RGSL1
 98 | SMG7
 99 | PTGS2
100 | RP11-541F9.2
101 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # GeneSCF
 2 | 
 3 | 
 4 | ``Update``: 2020/10/03
 5 | 
 6 | New GitHub page is up and running for GeneSCF, https://github.com/genescf
 7 | 
 8 | ``Update``: 2020/09
 9 | 
10 | New GitHub page is releasing soon for GeneSCF, https://github.com/genescf
11 | 
12 | 
13 | # Gene Set Clustering based on Functional annotation 
14 | ----------------------------------------------------------------------------
15 | GeneSCF from v1.1 supports all organisms from KEGG and also from Gene Ontology. This update will have a real-time functional enrichment feature.
16 | For new updates about GeneSCF, visit hosting website: https://github.com/genescf/GeneSCF
17 | 
18 | 
19 | --------------------------
20 | # Cite any version of GeneSCF using
21 | 
22 | Subhash S and Kanduri C. GeneSCF: a real-time based functional enrichment tool with support for multiple organisms. 
23 | BMC Bioinformatics 2016, 17:365, http://www.biomedcentral.com/1471-2105/17/365
24 | 
25 | 
26 | --------------------------
27 | # Report issues
28 | 
29 | BioStars: https://www.biostars.org/p/108669/
30 | 
31 | GitHub: https://github.com/genescf/GeneSCF/issues
32 | 
33 | Email: santhilalsubhash@gmail.com
34 | 
35 | 
36 | 
37 | --------------------------
38 | Correspondence: Santhilal Subhash
39 | 
40 | santhilalsubhash@gmail.com
41 | 
42 | Last Updated: 2020/10/03
43 | 
44 | https://github.com/genescf/GeneSCF
45 | 


--------------------------------------------------------------------------------
/geneSCF-master-v1.0/class/lib/Tie-IxHash-1.23/README:
--------------------------------------------------------------------------------
 1 | This is the README file for Tie::IxHash, the Perl module that 
 2 | implements ordered in-memory associative arrays.
 3 | 
 4 | It requires:
 5 |    Perl Version 5.005 or later.
 6 | 
 7 | If you have been led to believe that associative arrays in perl
 8 | don't preserve order, and if you have ever craved for that feature,
 9 | this module is for you.  Simply declare a "tie" for the hash variable
10 | that you want to be order-preserving, and forget that limitation
11 | ever existed.  You can do other nifty things with the tied hash object
12 | that you may be used to doing with arrays, like Push(), Pop() and 
13 | Splice().
14 | 
15 | If you don't know what "tie" means, you should look at the
16 | perltie(1) manpage in a recent perl distribution, or in the
17 | index of one of the numerous books on perl.
18 | 
19 | If you don't know what "perl" is, you don't need this software.
20 |  
21 | See the embedded documentation in the module file for details.
22 | 
23 | Don't forget to send your comments!
24 | 
25 |  - Sarathy.
26 |    gsar@umich.edu
27 | 
28 | -----------
29 | 
30 | Installation:
31 | 
32 |    perl Makefile.PL
33 |    make install
34 | 
35 | If you run into problems due to whatever reason in running the above,
36 | simply move the file IxHash.pm over into $PERL5LIB/Tie/IxHash.pm (where
37 | $PERL5LIB stands for the place where your standard perl library files
38 | are located) and you'll be okay.
39 | 
40 | 


--------------------------------------------------------------------------------
/geneSCF-master-v1.0/class/lib/Tie-IxHash-1.23/Makefile.PL:
--------------------------------------------------------------------------------
 1 | use ExtUtils::MakeMaker;
 2 | 
 3 | # Distribution metadata. WriteMakefile1() (below) drops the options that the
 4 | # installed ExtUtils::MakeMaker is too old to accept before delegating.
 5 | WriteMakefile1(
 6 |   NAME => "Tie::IxHash",
 7 |   VERSION_FROM => 'lib/Tie/IxHash.pm',
 8 |   LICENSE => 'perl',
 9 |   MIN_PERL_VERSION => '5.005',
10 |   META_MERGE => {
11 |     resources => {
12 |       repository => 'git://github.com/chorny/Tie-IxHash.git',
13 |     },
14 |   },
15 |   PL_FILES => {},
16 |   #SKIP => [qw(static dynamic)],
17 |   #'linkext' => {LINKTYPE => '' },
18 |   #'dist' => {COMPRESS=>'gzip -9f', SUFFIX => 'gz'},
19 | );
20 | 
21 | # Compatibility wrapper around WriteMakefile().
22 | # Written by Alexandr Ciornii, version 0.21. Added by eumm-upgrade.
23 | sub WriteMakefile1 {
24 |   my %params = @_;
25 | 
26 |   # String-eval the version; presumably this numifies values such as "6.57_01".
27 |   my $eumm_version = eval $ExtUtils::MakeMaker::VERSION;
28 | 
29 |   die "EXTRA_META is deprecated" if exists $params{EXTRA_META};
30 |   die "License not specified" unless exists $params{LICENSE};
31 | 
32 |   if ($params{BUILD_REQUIRES} and $eumm_version < 6.5503) {
33 |     #EUMM 6.5502 has problems with BUILD_REQUIRES
34 |     my $build_requires = delete $params{BUILD_REQUIRES};
35 |     $params{PREREQ_PM} = { %{ $params{PREREQ_PM} || {} }, %{$build_requires} };
36 |   }
37 | 
38 |   # Version threshold below which each option is removed.
39 |   my %min_eumm_for = (
40 |     CONFIGURE_REQUIRES => 6.52,
41 |     MIN_PERL_VERSION   => 6.48,
42 |     META_MERGE         => 6.46,
43 |     META_ADD           => 6.46,
44 |     LICENSE            => 6.31,
45 |   );
46 |   for my $option (keys %min_eumm_for) {
47 |     delete $params{$option} if $eumm_version < $min_eumm_for{$option};
48 |   }
49 | 
50 |   # These options are removed when perl itself is older than 5.005.
51 |   delete @params{qw(AUTHOR ABSTRACT_FROM BINARY_LOCATION)} if $] < 5.005;
52 | 
53 |   WriteMakefile(%params);
54 | }


--------------------------------------------------------------------------------
/geneSCF-master-v1.0/class/lib/Tie-IxHash-1.23/t/ixhash.t:
--------------------------------------------------------------------------------
 1 | #!../perl -w
 2 | use Tie::IxHash;
 3 | # Emits TAP by hand; the plan printed below must equal the number of T() calls.
 4 | my $TNUM = 0;
 5 | print "1..26\n";
 6 | # T(cond) prints the next numbered "ok" / "not ok" line for a boolean result.
 7 | sub T { print $_[0] ? "ok " : "not ok ", ++$TNUM, "\n" }
 8 | my %bar;
 9 | my $ixh = tie (%bar, 'Tie::IxHash', 'a' => 1, 'q' => 2, 'm' => 'X', 'n' => 'Y');
10 | #$ixh = Tie::IxHash->new('a' => 1, 'q' => 2, 'm' => 'X', n => 'Y');
11 | $ixh->Push(e => 5, f => 6);
12 | T 'a|1|q|2|m|X|n|Y|e|5|f|6' eq join('|', %bar);
13 | $ixh->Delete('e', 'a');
14 | T 'q|2|m|X|n|Y|f|6' eq join '|', %bar;
15 | T 'q|m|n|f' eq join '|', $ixh->Keys;
16 | T '2|X|Y|6' eq join '|', $ixh->Values;
17 | T 'm|n|f' eq join '|', $ixh->Keys(1, 2, 3);
18 | T 'X|Y|6' eq join '|', $ixh->Values(1, 2, 3);
19 | $ixh->Replace(1, 9);
20 | T 'q|2|m|9|n|Y|f|6' eq join '|', %bar;
21 | $ixh->Replace(0, 8, 'f');
22 | T 'f|8|m|9|n|Y' eq join '|', %bar;
23 | T '2|1' eq join '|', $ixh->Indices('n', 'm');
24 | $ixh->Push(z => 1);
25 | $ixh->SortByValue;
26 | T 'z|f|m|n' eq join '|', $ixh->Keys;
27 | $ixh->SortByKey;
28 | T 'f|m|n|z' eq join '|', $ixh->Keys;
29 | # Called with a single argument, Keys/Values/Indices are compared as one scalar.
30 | T 'm' eq $ixh->Keys(1);
31 | T 'Y' eq $ixh->Values(2);
32 | T 3 == $ixh->Indices('z');
33 | # From here on the hash is also driven through the plain tied-hash interface.
34 | %bar = ('a' => 9, 'c' => 6, 'z' => 7, 'f' => 1);
35 | delete $bar{'z'};
36 | $bar{'a'} = 10;
37 | T 'a|10|c|6|f|1' eq join '|', %bar;
38 | T 'a|c|f' eq join '|', keys %bar;
39 | T '10|6|1' eq join '|', values %bar;
40 | $ixh->Reorder(sort { $bar{$a} <=> $bar{$b} } keys %bar);
41 | T 'f|c|a' eq join '|', keys %bar;
42 | $ixh->Reorder('c', 'a', 'z');
43 | T 'c|6|a|10' eq join '|', %bar;
44 | # Splice/Push/Pop checks; @tmp is an undeclared package variable (the file does not use strict).
45 | @tmp = $ixh->Splice(0, 3, 'z' => 7, 'm' => 4); 
46 | T 'c|6|a|10' eq join '|', @tmp;
47 | T 'z|7|m|4' eq join '|', %bar;
48 | $ixh->Push('m' => 8);
49 | @tmp = $ixh->Pop;
50 | T 'm|8' eq join '|', @tmp;
51 | $ixh->Push('o' => 2, 'r' => 8);
52 | T 'z|7|o|2|r|8' eq join '|', %bar;
53 | $ixh->Pop;
54 | T 'z|7|o|2' eq join '|', %bar;
55 | $ixh->Splice($ixh->Length,0,$ixh->Pop);
56 | T 'z|7|o|2' eq join '|', %bar;
57 | # After Clear the reported length must be zero.
58 | $ixh->Clear;
59 | T $ixh->Length == 0;
60 | 
61 | 


--------------------------------------------------------------------------------
/geneSCF-master-v1.0/README.txt:
--------------------------------------------------------------------------------
 1 | Gene Set Clustering based on Functional annotation
 2 | ----------------------------------------------------------------------------
 3 | 
 4 | INSTALL:
 5 | 
 6 | No installation required
 7 | 
 8 | TEST DATASETS:
 9 | 
10 | Run command
11 | 
12 | ./test_geneSCF
13 | 
14 | You will get output in './test/output/' directory.
15 | 
16 | 
17 | USAGE: 
18 | 
19 | geneSCF <OPTIONS> -i=<INPUT FILE> -o=<OUTPUT PATH/FOLDER> -db=<GO_all|GO_BP|GO_MF|GO_CC|KEGG|REACTOME|NCG>
20 | 
21 | ==========
22 | Options:
23 | ==========
24 | 
25 | [-i= | --infile=]	Input file contains list of Entrez GeneIDs or OFFICIAL GENE SYMBOLS.
26 | 			The genes must be newline-separated (one gene per line).
27 | 
28 | [-t= | --gtype=]	Type of input in the provided list either Entrez GeneIDs 'gid'
29 | 			or OFFICIAL GENE SYMBOLS 'sym' (Without quotes, default: sym).
30 | 
31 | [-db= | --database=]	Database you want to find gene enrichment which is either 
32 | 			geneontology 'GO_all' or geneontology-biological_process 
33 | 			'GO_BP' or geneontology-molecular_function 'GO_MF' or 
34 | 			geneontology-cellular_components 'GO_CC' or kegg 'KEGG' or 
35 | 			reactome 'REACTOME' or Network of Cancer Genes 'NCG' (Without quotes).
36 | 
37 | [-o= | --outpath=]	Path to save the output file. The output will be saved in the 
38 | 			provided existing location as 
39 | 			{INPUT_FILE_NAME}_{database}_functional_classification.tsv 
40 | 			(tab-separated file). Note: This tool will not create the output directory; 
41 | 			it only writes to an existing location.
42 | 
43 | [-bg= | --background=]	Total background genes to consider (default : 30000).
44 | 
45 | [-h | --help]		For displaying this help page.
46 | 
47 | 
48 | 
49 | --------------------------
50 | Cite using: 
51 | 
52 | Subhash S and Kanduri C. GeneSCF: a real-time based functional enrichment tool with support for multiple organisms. 
53 | BMC Bioinformatics 2016, 17:365, http://www.biomedcentral.com/1471-2105/17/365
54 | 
55 | 
56 | --------------------------
57 | Author: Santhilal Subhash
58 | santhilal.subhash@gu.se
59 | Last Updated: 2015 June 05
60 | 


--------------------------------------------------------------------------------
/geneSCF-master-v1.0/class/lib/List/Vectorize/lib/Datatype.pl:
--------------------------------------------------------------------------------
 1 | 
 2 | # description: is the scalar a number (optional leading minus, digits,
 3 | # optional decimal part; exponent notation and a leading "+" are rejected)
 4 | # usage: is_numberic( [SCALAR] )
 5 | # return: 1|0
 6 | sub is_numberic {
 7 |     my ($candidate) = @_;
 8 | 
 9 |     return ($candidate =~ /^-?\d+\.?\d*$/) ? 1 : 0;
10 | }
11 | 
12 | # description: is the scalar an array reference
13 | # usage: is_array_ref( [SCALAR] )
14 | # return: 1|0
15 | sub is_array_ref {
16 |     # anything false or not a reference at all is rejected first
17 |     return 0 unless($_[0] and ref($_[0]));
18 | 
19 |     # blessed objects report their class name from ref(), so they yield 0
20 |     return (ref($_[0]) eq "ARRAY") ? 1 : 0;
21 | }
22 | 
23 | # description: is the scalar a hash reference
24 | # usage: is_hash_ref( [SCALAR] )
25 | # return: 1|0
26 | sub is_hash_ref {
27 |     # anything false or not a reference at all is rejected first
28 |     return 0 unless($_[0] and ref($_[0]));
29 | 
30 |     # blessed objects report their class name from ref(), so they yield 0
31 |     return (ref($_[0]) eq "HASH") ? 1 : 0;
32 | }
33 | 
34 | # description: is the scalar a scalar reference
35 | # usage: is_scalar_ref( [SCALAR] )
36 | # return: 1|0
37 | sub is_scalar_ref {
38 |     # anything false or not a reference at all is rejected first
39 |     return 0 unless($_[0] and ref($_[0]));
40 | 
41 |     # a reference to another reference is "REF", not "SCALAR"
42 |     return (ref($_[0]) eq "SCALAR") ? 1 : 0;
43 | }
44 | 
45 | # description: is the scalar a subroutine reference
46 | # usage: is_code_ref( [SCALAR] )
47 | # return: 1|0
48 | sub is_code_ref {
49 |     # anything false or not a reference at all is rejected first
50 |     return 0 unless($_[0] and ref($_[0]));
51 | 
52 |     # blessed objects report their class name from ref(), so they yield 0
53 |     return (ref($_[0]) eq "CODE") ? 1 : 0;
54 | }
55 | 
56 | # description: is the scalar a typeglob reference
57 | # usage: is_glob_ref( [SCALAR] )
58 | # return: 1|0
59 | sub is_glob_ref {
60 |     # anything false or not a reference at all is rejected first
61 |     return 0 unless($_[0] and ref($_[0]));
62 | 
63 |     # a bare typeglob such as *STDOUT is not a reference and yields 0
64 |     return (ref($_[0]) eq "GLOB") ? 1 : 0;
65 | }
66 | 
67 | # description: is the scalar a reference to another reference
68 | # usage: is_ref_ref( [SCALAR] )
69 | # return: 1|0
70 | sub is_ref_ref {
71 |     # anything false or not a reference at all is rejected first
72 |     return 0 unless($_[0] and ref($_[0]));
73 | 
74 |     # ref() reports "REF" only when the referent is itself a reference
75 |     return (ref($_[0]) eq "REF") ? 1 : 0;
76 | }
77 | 
78 | # description: the type of a scalar
79 | # usage: type_of( [SCALAR] )
80 | # return: "<ref type>_REF" for a reference (e.g. "ARRAY_REF"), "GLOB" for a
81 | #         typeglob, otherwise "SCALAR"
82 | sub type_of {
83 |     my $ref_type = ref($_[0]);
84 | 
85 |     # references: ref() already names the type (or the class, for objects)
86 |     return $ref_type."_REF" if $ref_type;
87 | 
88 |     # a reference to a bare typeglob (e.g. *STDOUT) reports "GLOB"
89 |     return (ref(\$_[0]) eq "GLOB") ? "GLOB" : "SCALAR";
90 | }
91 | 
92 | 1;
93 | 


--------------------------------------------------------------------------------
/geneSCF-master-v1.0/geneSCF:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # geneSCF: collects the -key=value options, fills in defaults and runs
 3 | # class/functional_class.pl from the directory this script lives in.
 4 | for i in "$@"
 5 | do
 6 | case $i in
 7 |     -t=*|--gtype=*)
 8 |     GTYPE="${i#*=}"
 9 |     ;;
10 |     -i=*|--infile=*)
11 |     INFILE="${i#*=}"
12 |     ;;
13 |     -o=*|--outpath=*)
14 |     OUTPATH="${i#*=}"
15 |     ;;
16 |     -db=*|--database=*)
17 |     database="${i#*=}"
18 |     ;;
19 |     -bg=*|--background=*)
20 |     background="${i#*=}"
21 |     ;;
22 |     -h*|--help)
23 |     echo -e "\n\nUSAGE: \n\ngeneSCF <OPTIONS> -i=<INPUT FILE> -o=<OUTPUT PATH/FOLDER> -db=<GO_all|GO_BP|GO_MF|GO_CC|KEGG|REACTOME|NCG>\n\n==========\nOptions:\n==========\n\n[-i= | --infile=]\tInput file contains list of Entrez GeneIDs or OFFICIAL GENE SYMBOLS.\n\t\t\tThe genes must be new lines seperated (One gene per line).\n\n[-t= | --gtype=]\tType of input in the provided list either Entrez GeneIDs 'gid'\n\t\t\tor OFFICIAL GENE SYMBOLS 'sym' (Without quotes, default: sym).\n\n[-db= | --database=]\tDatabase you want to find gene enrichment which is either \n\t\t\tgeneontology 'GO_all' or geneontology-biological_process \n\t\t\t'GO_BP' or geneontology-molecular_function 'GO_MF' or \n\t\t\tgeneontology-cellular_components 'GO_CC' or kegg 'KEGG' or \n\t\t\treactome 'REACTOME' or Network of Cancer Genes 'NCG' (Without quotes).\n\n[-o= | --outpath=]\tExisting directory to save output file. The output will be with saved in the \n\t\t\tprovided location as {INPUT_FILE_NAME}_{database}_functional_classification.tsv \n\t\t\t(tab-seperated file).\n\n[-bg= | --background=]\tTotal background genes to consider (default : 30000).\n\n[-h | --help]\t\tFor displaying this help page.\n";
24 |     exit 1;
25 |     ;;
26 |     *)
27 |     # unknown options are ignored
28 |     ;;
29 | esac
30 | done
31 | 
32 | if [ $# -eq 0 ]; then
33 |     echo -e "\n\nPlease use:\n\ngeneSCF -h \n\n(or)\n\ngeneSCF --help \n\n for help\n";
34 |     exit 1;
35 | fi
36 | 
37 | # Expansions are quoted: an unquoted value containing spaces breaks "[ -z ... ]".
38 | if [ -z "$database" ]; then
39 |     echo "Please specify one of these databses GO_all,GO_BP,GO_MF,GO_CC,KEGG,NCG,REACTOME";
40 |     exit 1;
41 | fi
42 | 
43 | # Defaults: gene symbols as input type, 30000 background genes.
44 | GTYPE="${GTYPE:-sym}"
45 | background="${background:-30000}"
46 | 
47 | if [ -z "$INFILE" ]; then
48 |     echo "Input file missing";
49 |     exit 1;
50 | fi
51 | 
52 | if [ -z "$OUTPATH" ]; then
53 |     echo "Please specify out put path";
54 |     exit 1;
55 | fi
56 | 
57 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
58 | DT=$(/bin/date)
59 | echo "processing started....$DT" 
60 | 
61 | # Quoted so that paths containing spaces reach the Perl script as single arguments.
62 | perl "${DIR}/class/functional_class.pl" "${GTYPE}" "${INFILE}" "${OUTPATH}" "${database}" "${background}" "${DIR}"
63 | 
64 | DT=$(/bin/date)
65 | echo "$DT finished processing" 


--------------------------------------------------------------------------------
/geneSCF-master-v1.0/class/lib/List/Vectorize/lib/Set.pl:
--------------------------------------------------------------------------------
  1 | 
  2 | # usage: intersect( [ARRAY REF], [ARRAY REF], ... )
  3 | # return: ARRAY REF
  4 | # description: the elements shared by every given set; a single argument
  5 | # is handed back unchanged
  6 | sub intersect {
  7 | 
  8 |     check_prototype(@_, '(\@)+');
  9 | 
 10 |     # a lone set is returned as it is
 11 |     return $_[0] if(scalar(@_) < 2);
 12 | 
 13 |     my ($first, $second, @others) = @_;
 14 | 
 15 |     # an empty operand empties the whole intersection
 16 |     return [] if(is_empty($first) or is_empty($second));
 17 | 
 18 |     # membership table for the second set
 19 |     my %in_second;
 20 |     $in_second{$_} = 1 for @$second;
 21 | 
 22 |     # keep each element of unique($first) that the second set also has
 23 |     my @common;
 24 |     for my $element (@{ unique($first) }) {
 25 |         push(@common, $element) if($in_second{$element});
 26 |     }
 27 | 
 28 |     # nothing shared so far, so the remaining sets cannot change the result
 29 |     return [] unless @common;
 30 | 
 31 |     # fold the remaining sets in, one per recursive call
 32 |     return intersect(\@common, @others);
 33 | }
 42 | 
 43 | # usage: union( [ARRAY REF], [ARRAY REF], ... )
 44 | # return: ARRAY REF
 45 | # description: the elements found in any of the given sets, each listed
 46 | # once (presumably de-duplicated by unique()); a single argument is handed
 47 | # back as it is
 48 | sub union {
 49 | 
 50 |     check_prototype(@_, '(\@)+');
 51 | 
 52 |     # a lone set is returned untouched
 53 |     return $_[0] if(scalar(@_) < 2);
 54 | 
 55 |     my ($first, $second, @others) = @_;
 56 | 
 57 |     # NOTE(review): assumes unique() returns a new array; if it returned its
 58 |     # argument, the push below would grow the caller's first set - confirm
 59 |     my $merged       = unique($first);
 60 |     my $second_dedup = unique($second);
 61 | 
 62 |     # membership table for what is already in the result
 63 |     my %in_first;
 64 |     $in_first{$_} = 1 for @$merged;
 65 | 
 66 |     # append whatever the second set adds
 67 |     for my $element (@$second_dedup) {
 68 |         push(@$merged, $element) if(! $in_first{$element});
 69 |     }
 70 | 
 71 |     # fold the remaining sets in, one per recursive call
 72 |     return union($merged, @others);
 73 | }
 74 | 
 75 | # usage: setdiff( [ARRAY REF], [ARRAY REF] )
 76 | # return: ARRAY REF (empty when nothing is left, never undef)
 77 | # description: set1 - set2, i.e. the elements of set1 that do not occur in
 78 | # set2; the order and any duplicates of set1 are kept
 79 | sub setdiff {
 80 | 
 81 |     check_prototype(@_, '\@\@');
 82 | 
 83 |     my ($set1, $set2) = @_;
 84 | 
 85 |     my %in_set2;
 86 |     $in_set2{$_} = 1 for @$set2;
 87 | 
 88 |     # start from [] so an empty difference is still an ARRAY REF, matching
 89 |     # what intersect() returns for an empty result
 90 |     my $diff = [];
 91 |     for my $element (@$set1) {
 92 |         push(@$diff, $element) unless $in_set2{$element};
 93 |     }
 94 |     return $diff;
 95 | }
 96 | 
 97 | # usage: setequal( [ARRAY REF], [ARRAY REF] )
 98 | # return: 1|0
 99 | # description: the sets are equal when each has as many distinct elements
100 | # as their union
101 | sub setequal {
102 | 
103 |     check_prototype(@_, '\@\@');
104 | 
105 |     my ($set1, $set2) = @_;
106 | 
107 |     my $distinct1 = unique($set1);
108 |     my $distinct2 = unique($set2);
109 |     my $combined  = union($set1, $set2);
110 | 
111 |     # differing distinct counts can never describe the same set
112 |     return 0 unless len($distinct1) == len($distinct2);
113 | 
114 |     # equal counts plus a union of that same size means neither set has
115 |     # an element the other lacks
116 |     return (len($distinct1) == len($combined)) ? 1 : 0;
117 | }
118 | 
119 | # usage: is_element( [SCALAR], [ARRAY REF])
120 | # return 0|1
121 | # description: numbers match within EPS, anything else by string equality
122 | sub is_element {
123 | 
124 |     check_prototype(@_, '$\@');
125 | 
126 |     my ($item, $set) = @_;
127 | 
128 |     for my $i (0..(len($set) - 1)) {
129 |         my $candidate = $set->[$i];
130 |         my $both_numeric = (is_numberic($candidate) and is_numberic($item));
131 | 
132 |         # numeric pairs tolerate a difference below EPS; the rest use eq
133 |         return 1 if($both_numeric and abs($candidate - $item) < EPS);
134 |         return 1 if($candidate eq $item);
135 |     }
136 | 
137 |     return 0;
138 | }
139 | 
140 | 1;
141 | 


--------------------------------------------------------------------------------
/geneSCF-master-v1.0/class/lib/List/Vectorize/lib/Apply.pl:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | # usage: sapply( [ARRAY REF], [CODE REF] )
  4 | # return: ARRAY REF
  5 | # description: calls the function once per element (passed as the first
  6 | # argument, with $_ set to it as well) and collects one result per element
  7 | sub sapply {
  8 | 
  9 |     check_prototype(@_, '\@\&');
 10 | 
 11 |     my ($array, $function) = @_;
 12 | 
 13 |     # scalar() keeps a single value even when the callback returns a list
 14 |     my @results = map { scalar $function->($_) } @$array;
 15 |     return \@results;
 16 | }
 17 | 
 18 | 
 19 | sub mapply {
 20 |     
 21 | 	check_prototype(@_, '(\@|$)+\&');
 22 | 
 23 | 	my $function = pop; # the last argument
 24 |     my @array = @_;
 25 | 	
 26 |     for (0..$#array) {
 27 |         if(! is_array_ref($array[$_])) {
 28 |             $array[$_] = [$array[$_]];
 29 |         }
 30 |     }
 31 | 
 32 |     my $length = sapply(\@array, \&len);
 33 |     my $max_length = max($length);
 34 | 	
 35 | 	my $check_length = sapply($length, sub {$max_length % $_[0] != 0});
 36 | 	if(sum($check_length)) {
 37 | 		croak "ERROR: Longer object length is not a multiple of shorter object length.";
 38 | 	}
 39 | 	
 40 |     @array = @{ sapply(\@array, sub{_cycle($_[0], $max_length)}) };
 41 | 	
 42 |     my $mapply = [];
 43 |     for my $i (0..($max_length-1)) {
 44 |         my $param = sapply(\@array, sub {$_[0]->[$i]});
 45 |         $mapply->[$i] = do { my $scalar = $function->(@$param);
 46 | 			                 $scalar; };
 47 |     }
 48 | 
 49 |     return $mapply;
 50 | }
 51 | 
 52 | 
 53 | sub _cycle {
 54 |     my $array = shift;
 55 |     my $size = shift || len($array);
 56 |     my $scalar = len($array);
 57 | 
 58 |     if($size == $scalar) {
 59 |         return $array;
 60 |     }
 61 |     elsif($size < $scalar) {
 62 |         $size --;
 63 |         return subset($array, [0..$size]);
 64 |     }
 65 |     else {
 66 |         $size --;
 67 |         my $index = sapply([0..$size], sub {$_ % $scalar});
 68 |         return subset($array, $index);
 69 |     }
 70 | }
 71 | 
 72 | 
 73 | sub happly {
 74 | 
 75 |     check_prototype(@_, '\%\&');
 76 | 
 77 | 	my $hash = shift;
 78 |     my $function = shift;
 79 | 	
 80 |     my $happly = {};
 81 |     foreach (keys %$hash) {
 82 |         $happly->{$_} = do { my $scalar = $function->($hash->{$_});
 83 | 			                 $scalar; };
 84 |     }
 85 |     return $happly;
 86 | }
 87 | 
 88 | 
 89 | sub tapply {
 90 |     
 91 | 	check_prototype(@_, '\@(\@)+\&');
 92 | 	
 93 | 	my $array = shift;
 94 | 	my $function = pop;
 95 |     my @category = @_;
 96 | 	
 97 | 	my $length = sapply(\@category, \&len);
 98 | 	push(@$length, len($array));
 99 | 	if(max($length) != min($length)) {
100 | 		croak "ERROR: Length of the vector must be equal to the length of all categories.\n";
101 | 	}
102 | 	
103 | 	my $category = paste(@category, "|");
104 | 
105 |     my $label = unique($category);
106 |     my $tapply = {};
107 |     for (0..$#$label) {
108 |         my $current_label = $label->[$_];
109 |         my $index = test($category, sub {$_[0] eq $current_label});
110 |         $index = which($index);
111 | 		my @data = @{subset($array, $index)};
112 |         $tapply->{$current_label} = do { my $scalar = $function->(@data);
113 | 			                             $scalar; };
114 |     }
115 |     return $tapply;
116 | }
117 | 
118 | 1;
119 | 


--------------------------------------------------------------------------------
/geneSCF-master-v1.0/class/lib/List/Vectorize/lib/IO.pl:
--------------------------------------------------------------------------------
  1 | 
  2 | # ============================= IO subroutine ==============================================
  3 | # usage: print_ref( [TYPEGLOB], [SCALAR] )
  4 | # description: print the data structure of a reference
  5 | sub print_ref {
  6 | 	
  7 | 	check_prototype(@_, '*?($|\$|\@|\%|\&)+');
  8 | 	
  9 | 	local $handle = *STDOUT;
 10 | 	if(is_glob_ref(\$_[0])) {
 11 | 		$handle = shift(@_);
 12 | 	}
 13 | 	my $ref = shift;
 14 |     
 15 |     if(is_array_ref($ref)) {
 16 |         print $handle "Reference of ARRAY.\n";
 17 | 		for (0..$#$ref) {
 18 | 			print $handle "[$_] $ref->[$_]\n";
 19 | 		}
 20 |         print $handle "\n";
 21 |     } elsif(is_hash_ref($ref)) {
 22 |         print $handle "Reference of HASH.\n";
 23 |         foreach (keys %$ref) {
 24 |             print $handle "$_\t$ref->{$_}\n";
 25 |         }
 26 |         print $handle "\n";
 27 |     } elsif(is_scalar_ref($ref)) {
 28 |         print $handle "Reference of SCALAR.\n";
 29 |         print $handle $$ref;
 30 |         print $handle "\n";
 31 |     } elsif(is_ref_ref($ref)) {
 32 |         print $handle "Reference of REF.\n";
 33 |         print $handle $$ref;
 34 |         print $handle "\n";
 35 |     } elsif(is_code_ref($ref)) {
 36 |         print $handle "Reference of CODE.\n";
 37 |     } else {
 38 |         print $handle "@_\n";
 39 |     }
 40 |     return $ref;
 41 | }
 42 | 
 43 | # usage: print_matrix( [TYPEGLOB], [SCALAR] )
 44 | # description: print the matrix
 45 | sub print_matrix {
 46 | 	
 47 | 	check_prototype(@_, '*?\@');
 48 | 	
 49 | 	local $handle = *STDOUT;
 50 | 	if(is_glob_ref(\$_[0])) {
 51 | 		$handle = shift(@_);
 52 | 	}
 53 | 	my $mat = $_[0];
 54 | 	my $sep = "\t";
 55 | 	
 56 | 	my ($nrow, $ncol) = dim($mat);
 57 | 	print "$nrow x $ncol matrix:\n\n";
 58 | 	
 59 | 	for(my $i = 0; $i < len($mat); $i ++) {
 60 | 		print $handle join $sep, @{$mat->[$i]};
 61 | 		print $handle "\n";
 62 | 	}
 63 | 	print "\n";
 64 | }
 65 | 
 66 | # usage: read_table( [SCALAR], %setup )
 67 | sub read_table {
 68 | 	
 69 | 	check_prototype(@_, '$($|\@){0,}');
 70 | 	
 71 | 	my $file = shift;
 72 | 	
 73 | 	my %setup = @_;
 74 | 	my $quote = $setup{"quote"} || "";
 75 | 	my $sep = $setup{"sep"} || "\t";
 76 | 	my $whether_rownames = $setup{"row.names"} || 0;       # if set true, first item will be key
 77 | 	my $whether_colnames = $setup{"col.names"} || 0;       # if set true, first item will be key
 78 | 	
 79 | 	open F, $file or croak "ERROR: cannot open $file.\n";
 80 | 	my $data;
 81 | 	my $rownames;
 82 | 	my $colnames;
 83 | 	my $i_line = 0;
 84 | 	my $i_array = 0;
 85 | 	my $flag = 0;
 86 | 	while( my $line = <F>) {
 87 | 		$i_line ++;
 88 | 		
 89 | 		# read the column names
 90 | 		if($flag == 0 and $whether_colnames) {
 91 | 			chomp $line;
 92 | 			$line =~s/^$quote|$quote$//g;
 93 | 			@$colnames = split "$quote$sep$quote", $line; 
 94 | 			if($whether_rownames) {
 95 | 				shift(@$colnames);
 96 | 			}
 97 | 			$flag = 1;
 98 | 			$i_line --;
 99 | 			next;
100 | 		}
101 | 		
102 | 		$i_array ++;
103 | 		
104 | 		chomp $line;
105 | 		$line =~s/^$quote|$quote$//g;
106 | 		my @tmp = split "$quote$sep$quote", $line; 
107 | 		
108 | 		# read rownames
109 | 		if($whether_rownames) {
110 | 			push(@$rownames, shift(@tmp));
111 | 		}
112 | 		
113 | 		push(@{$data->[$i_array - 1]}, @tmp);
114 | 		
115 | 	}
116 | 	close F;
117 | 
118 | 	wantarray ? ($data, $colnames, $rownames) : $data;
119 | }
120 | 
# usage: write_table( [MATRIX], %setup )
# description: write a matrix (array of rows) to a delimited text file
# setup keys:
#   file       name of the file to create (required)
#   quote      string put around every field (default: empty)
#   sep        field separator (default: tab)
#   col.names  array reference with the column names, written as first line
#   row.names  array reference with the row names, written as first field;
#              the header line then starts with an empty quoted field
# Croaks when the number of names does not fit the size reported by dim(),
# when the file cannot be created or when closing it reports an error.
sub write_table {

    check_prototype(@_, '\@($|\@){2,}');

    my $matrix = shift;

    my %setup = @_;
    my $quote = $setup{"quote"} || "";
    my $sep = $setup{"sep"} || "\t";
    my $colnames = $setup{"col.names"};   # column names
    my $rownames = $setup{"row.names"};   # row names
    my $file = $setup{"file"};

    my ($nrow, $ncol) = dim($matrix);
    if($rownames and $nrow != len($rownames)) {
        croak "ERROR: Length of rownames should be equal to the length of rows in matrix\n";
    }
    if($colnames and $ncol != len($colnames)) {
        croak "ERROR: Length of colnames should be equal to the length of columns in matrix\n";
    }

    # a three-argument open with an undefined name would silently create an
    # anonymous temporary file, so a missing name is rejected first
    if(!defined($file) or $file eq "") {
        croak "ERROR: Cannot create file:\n";
    }
    open my $out, '>', $file or croak "ERROR: Cannot create file:$file\n";

    # sapply() hands the element over as first argument
    my $quoted = sub { "$quote$_[0]$quote" };

    if($colnames) {
        # empty corner cell above the row names
        print $out "$quote$quote$sep" if($rownames);
        print $out join($sep, @{sapply($colnames, $quoted)}) . "\n";
    }
    for(my $i = 0; $i < len($matrix); $i ++) {
        print $out "$quote$rownames->[$i]$quote$sep" if($rownames);
        print $out join($sep, @{sapply($matrix->[$i], $quoted)}) . "\n";
    }

    # buffered write errors only show up when the handle is closed
    close $out or croak "ERROR: Cannot write file:$file\n";
}
169 | 
170 | 
171 | 
172 | 1;
173 | 


--------------------------------------------------------------------------------
/geneSCF-master-v1.0/class/lib/Text/NSP.pm:
--------------------------------------------------------------------------------
  1 | =head1 NAME
  2 | 
  3 | Text::NSP - Extract collocations and Ngrams from text
  4 | 
  5 | =head1 SYNOPSIS
  6 | 
  7 | =head2 Basic Usage
  8 | 
  9 |   use Text::NSP::Measures::2D::MI::ll;
 10 | 
 11 |   my $npp = 60; my $n1p = 20; my $np1 = 20;  my $n11 = 10;
 12 | 
 13 |   $ll_value = calculateStatistic( n11=>$n11,
 14 |                                       n1p=>$n1p,
 15 |                                       np1=>$np1,
 16 |                                       npp=>$npp);
 17 | 
 18 |   if( ($errorCode = getErrorCode()))
 19 |   {
 20 |     print STDERR $errorCode." - ".getErrorMessage()."\n";
 21 |   }
 22 |   else
 23 |   {
 24 |     print getStatisticName()." value for bigram is ".$ll_value."\n";
 25 |   }
 26 | 
 27 | =head1 DESCRIPTION
 28 | 
 29 | The Ngram Statistics Package (NSP) is a collection of perl modules
 30 | that aid in analyzing Ngrams in text files. We define an Ngram as a
 31 | sequence of 'n' tokens that occur within a window of at least 'n'
 32 | tokens in the text; what constitutes a "token" can be defined by the
 33 | user.
 34 | 
 35 | NSP.pm is a stub that doesn't have any real functionality. It serves
 36 | as a top level module in the hierarchy and allows us to group the
 37 | Text::NSP::Count and Text::NSP::Measures modules.
 38 | 
 39 | The modules under Text::NSP::Measures implement measures of
 40 | association that are used to evaluate whether the co-occurrence of the
 41 | words in a Ngram is purely by chance or statistically significant.
 42 | These measures compute a numerical score for Ngrams. This score can be
 43 | used to decide whether or not there is enough evidence to reject the
 44 | null hypothesis (that the Ngram is not statistically significant) for
 45 | that Ngram.
 46 | 
 47 | To use one of the measures you can either use the program statistic.pl
 48 | provided under the utils directory, or write your own driver program.
 49 | Program statistic.pl takes as input a list of Ngrams with their
 50 | frequencies (in the format output by count.pl) and runs a
 51 | user-selected statistical measure of association to compute the score
 52 | for each Ngram. The Ngrams, along with their scores, are output in
 53 | descending order of this score. For help on using utils/statistic.pl
 54 | please refer to its perldoc (perldoc utils/statistic.pl).
 55 | 
 56 | If you are writing your own driver program, a basic usage example is
 57 | provided above under SYNOPSIS. For further clarification please refer
 58 | to the documentation of Text::NSP::Measures (perldoc
 59 | Text::NSP::Measures).
 60 | 
 61 | 
 62 | =head2 Error Codes
 63 | 
 64 | The following table describes the error codes used in the
 65 | implementation.
 66 | 
 67 | Error codes common to all the association measures.
 68 | 
 69 |  100 - Trying to create an object of an abstract class.
 70 | 
 71 |  200 - one of the required values is missing.
 72 | 
 73 |  201 - one of the observed frequencies is negative.
 74 | 
 75 |  202 - one of the frequency values(n11) exceeds the total no of
 76 |        bigrams(npp) or a marginal total(n1p, np1).
 77 | 
 78 |  203 - one of the marginal totals(n1p, np1) exceeds the total bigram
 79 |        count(npp).
 80 | 
 81 |  204 - one of the marginal totals is -ve.
 82 | 
 83 | Error Codes required by the mutual information measures
 84 | 
 85 |  211 - one of the expected values is zero.
 86 | 
 87 |  212 - one of the expected values is -ve.
 88 | 
 89 | 
 90 | Error codes required by the CHI measures.
 91 | 
 92 |  221 - one of the expected values is zero.
 93 | 
 94 | =head2 Methods
 95 | 
 96 | =over
 97 | 
 98 | =cut
 99 | 
100 | package Text::NSP;
101 | 
102 | use strict;
103 | use Carp;
104 | use warnings;
105 | 
106 | our ($VERSION, @ISA);
107 | 
108 | @ISA  = qw(Exporter);
109 | 
110 | $VERSION = '1.25';
111 | 
112 | 1;
113 | 
114 | __END__
115 | 
116 | 
117 | =back
118 | 
119 | =head1 AUTHORS
120 | 
121 | Ted Pedersen,                University of Minnesota Duluth
122 |                              E<lt>tpederse at d.umn.eduE<gt>
123 | 
124 | Satanjeev Banerjee,          Carnegie Mellon University
125 | 
126 | Amruta Purandare,            University of Pittsburgh
127 | 
128 | Bridget Thomson-McInnes,     University of Minnesota Twin Cities
129 | 
130 | Saiyam Kohli,                University of Minnesota Duluth
131 | 
132 | =head1 HISTORY
133 | 
134 | Last updated: $Id: NSP.pm,v 1.41 2012/01/15 17:14:55 tpederse Exp $
135 | 
136 | =head1 BUGS
137 | 
138 | =head1 SEE ALSO
139 | 
140 | L<http://groups.yahoo.com/group/ngram/>
141 | 
142 | L<http://ngram.sourceforge.net>
143 | 
144 | =head1 COPYRIGHT
145 | 
146 | Copyright (C) 2000-2008, Ted Pedersen, Satanjeev Banerjee,
147 | Amruta Purandare, Bridget Thomson-McInnes and Saiyam Kohli
148 | 
149 | This program is free software; you can redistribute it and/or modify
150 | it under the terms of the GNU General Public License as published by
151 | the Free Software Foundation; either version 2 of the License, or (at
152 | your option) any later version.
153 | 
154 | This program is distributed in the hope that it will be useful, but
155 | WITHOUT ANY WARRANTY; without even the implied warranty of
156 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
157 | General Public License for more details.
158 | 
159 | You should have received a copy of the GNU General Public License
160 | along with this program; if not, write to
161 | 
162 |     The Free Software Foundation, Inc.,
163 |     59 Temple Place - Suite 330,
164 |     Boston, MA  02111-1307, USA.
165 | 
166 | Note: a copy of the GNU General Public License is available on the web
167 | at L<http://www.gnu.org/licenses/gpl.txt> and is included in this
168 | distribution as GPL.txt.
169 | 
170 | =cut
171 | 


--------------------------------------------------------------------------------
/geneSCF-master-v1.0/class/lib/Text/NSP/Measures/2D/Dice/jaccard.pm:
--------------------------------------------------------------------------------
  1 | =head1 NAME
  2 | 
  3 | Text::NSP::Measures::2D::Dice::jaccard - Perl module that implements
  4 |                                          the jaccard coefficient.
  5 | 
  6 | =head1 SYNOPSIS
  7 | 
  8 | =head3 Basic Usage
  9 | 
 10 |   use Text::NSP::Measures::2D::Dice::jaccard;
 11 | 
 12 |   my $npp = 60; my $n1p = 20; my $np1 = 20;  my $n11 = 10;
 13 | 
 14 |   $jaccard_value = calculateStatistic( n11=>$n11,
 15 |                                       n1p=>$n1p,
 16 |                                       np1=>$np1,
 17 |                                       npp=>$npp);
 18 | 
 19 |   if( ($errorCode = getErrorCode()))
 20 |   {
 21 |     print STDERR $errorCode." - ".getErrorMessage()."\n";
 22 |   }
 23 |   else
 24 |   {
 25 |     print getStatisticName()." value for bigram is ".$jaccard_value."\n";
 26 |   }
 27 | 
 28 | 
 29 | =head1 DESCRIPTION
 30 | 
 31 | Assume that the frequency count data associated with a bigram
 32 | <word1><word2> is stored in a 2x2 contingency table:
 33 | 
 34 |           word2   ~word2
 35 |   word1    n11      n12 | n1p
 36 |  ~word1    n21      n22 | n2p
 37 |            --------------
 38 |            np1      np2   npp
 39 | 
 40 | where n11 is the number of times <word1><word2> occur together, and
 41 | n12 is the number of times <word1> occurs with some word other than
 42 | word2, and n1p is the number of times in total that word1 occurs as
 43 | the first word in a bigram.
 44 | 
 45 | The Jaccard Coefficient is the ratio of the number of times the words
 46 | occur together to the number of times at least one of the words
 47 | occurs. It is defined as:
 48 | 
 49 |           n11
 50 |     ---------------
 51 |     n11 + n12 + n21
 52 | 
 53 | The Jaccard coefficient can also be computed by applying a
 54 | transformation to the dice coefficient:
 55 | 
 56 | $jaccard = $dice/(2-$dice)
 57 | 
 58 | We use this computation of jaccard in our implementation.
 59 | 
 60 | =head2 Methods
 61 | 
 62 | =over
 63 | 
 64 | =cut
 65 | 
 66 | 
 67 | package Text::NSP::Measures::2D::Dice::jaccard;
 68 | 
 69 | 
 70 | use Text::NSP::Measures::2D::Dice;
 71 | use strict;
 72 | use Carp;
 73 | use warnings;
 74 | no warnings 'redefine';
 75 | require Exporter;
 76 | 
 77 | our ($VERSION, @EXPORT, @ISA);
 78 | 
 79 | @ISA  = qw(Exporter);
 80 | 
 81 | @EXPORT = qw(initializeStatistic calculateStatistic
 82 |              getErrorCode getErrorMessage getStatisticName);
 83 | 
 84 | $VERSION = '0.97';
 85 | 
 86 | 
 87 | =item calculateStatistic() - method to calculate the jaccard coefficient value
 88 | 
 89 | INPUT PARAMS  : %count_values       .. Hash (passed as a flat key => value
 90 |                                        list, see SYNOPSIS) containing the count
 91 |                                        values computed by the count.pl program.
 92 | 
 93 | RETURN VALUES : $jaccard            .. Jaccard Coefficient value for this bigram.
 94 | 
 95 | =cut
 96 | 
 97 | sub calculateStatistic
 98 | {
 99 |   my %values = @_;
100 |   my $dice;
101 |   my $jaccard;
102 | 
103 |   #compute the dice coefficient
104 |   if( !($dice = Text::NSP::Measures::2D::Dice::computeVal(\%values)) )
105 |   {
106 |     return;
107 |   }
108 | 
109 |   #compute the jaccard coefficient from the dice coefficient
110 |   $jaccard = $dice/(2-$dice);
111 | 
112 |   return ($jaccard);
113 | }
114 | 
115 | 
116 | 
117 | =item getStatisticName() - Returns the name of this statistic
118 | 
119 | INPUT PARAMS  : none
120 | 
121 | RETURN VALUES : $name      .. Name of the measure.
122 | 
123 | =cut
124 | 
sub getStatisticName
{
  # fixed, human readable label of this measure; takes no arguments
  my $name = "Jaccard Coefficient";
  return $name;
}
129 | 
130 | 
131 | 
132 | 1;
133 | __END__
134 | 
135 | 
136 | =back
137 | 
138 | =head1 AUTHOR
139 | 
140 | Ted Pedersen,                University of Minnesota Duluth
141 |                              E<lt>tpederse@d.umn.eduE<gt>
142 | 
143 | Satanjeev Banerjee,          Carnegie Mellon University
144 |                              E<lt>satanjeev@cmu.eduE<gt>
145 | 
146 | Amruta Purandare,            University of Pittsburgh
147 |                              E<lt>amruta@cs.pitt.eduE<gt>
148 | 
149 | Bridget Thomson-McInnes,     University of Minnesota Twin Cities
150 |                              E<lt>bthompson@d.umn.eduE<gt>
151 | 
152 | Saiyam Kohli,                University of Minnesota Duluth
153 |                              E<lt>kohli003@d.umn.eduE<gt>
154 | 
155 | =head1 HISTORY
156 | 
157 | Last updated: $Id: jaccard.pm,v 1.8 2006/06/21 11:10:52 saiyam_kohli Exp $
158 | 
159 | =head1 BUGS
160 | 
161 | 
162 | =head1 SEE ALSO
163 | 
164 | L<http://groups.yahoo.com/group/ngram/>
165 | 
166 | L<http://www.d.umn.edu/~tpederse/nsp.html>
167 | 
168 | 
169 | =head1 COPYRIGHT
170 | 
171 | Copyright (C) 2000-2006, Ted Pedersen, Satanjeev Banerjee, Amruta
172 | Purandare, Bridget Thomson-McInnes and Saiyam Kohli
173 | 
174 | This program is free software; you can redistribute it and/or modify it
175 | under the terms of the GNU General Public License as published by the Free
176 | Software Foundation; either version 2 of the License, or (at your option)
177 | any later version.
178 | 
179 | This program is distributed in the hope that it will be useful, but
180 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
181 | or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
182 | for more details.
183 | 
184 | You should have received a copy of the GNU General Public License along
185 | with this program; if not, write to
186 | 
187 |     The Free Software Foundation, Inc.,
188 |     59 Temple Place - Suite 330,
189 |     Boston, MA  02111-1307, USA.
190 | 
191 | Note: a copy of the GNU General Public License is available on the web
192 | at L<http://www.gnu.org/licenses/gpl.txt> and is included in this
193 | distribution as GPL.txt.
194 | 
195 | =cut


--------------------------------------------------------------------------------
/geneSCF-master-v1.0/class/lib/Text/NSP/Measures/2D/Dice/dice.pm:
--------------------------------------------------------------------------------
  1 | =head1 NAME
  2 | 
  3 | Text::NSP::Measures::2D::Dice::dice - Perl module to compute Dice coefficient
  4 |                                       for bigrams.
  5 | 
  6 | =head1 SYNOPSIS
  7 | 
  8 | =head3 Basic Usage
  9 | 
 10 |   use Text::NSP::Measures::2D::Dice::dice;
 11 | 
 12 |   my $npp = 60; my $n1p = 20; my $np1 = 20;  my $n11 = 10;
 13 | 
 14 |   $dice_value = calculateStatistic( n11=>$n11,
 15 |                                       n1p=>$n1p,
 16 |                                       np1=>$np1,
 17 |                                       npp=>$npp);
 18 | 
 19 |   if( ($errorCode = getErrorCode()))
 20 |   {
 21 |     print STDERR $errorCode." - ".getErrorMessage()."\n";
 22 |   }
 23 |   else
 24 |   {
 25 |     print getStatisticName()." value for bigram is ".$dice_value."\n";
 26 |   }
 27 | 
 28 | 
 29 | =head1 DESCRIPTION
 30 | 
 31 | Assume that the frequency count data associated with a bigram
 32 | <word1><word2> is stored in a 2x2 contingency table:
 33 | 
 34 |           word2   ~word2
 35 |   word1    n11      n12 | n1p
 36 |  ~word1    n21      n22 | n2p
 37 |            --------------
 38 |            np1      np2   npp
 39 | 
 40 | where n11 is the number of times <word1><word2> occur together, and
 41 | n12 is the number of times <word1> occurs with some word other than
 42 | word2, and n1p is the number of times in total that word1 occurs as
 43 | the first word in a bigram.
 44 | 
 45 | The Dice Coefficient is defined as :
 46 | 
 47 |      2 * n11
 48 |     ---------
 49 |     np1 + n1p
 50 | 
 51 | The Jaccard coefficient can also be computed by applying a
 52 | transformation to the dice coefficient:
 53 | 
 54 | $jaccard = $dice/(2-$dice)
 55 | 
 56 | =head2 Methods
 57 | 
 58 | =over
 59 | 
 60 | =cut
 61 | 
 62 | 
 63 | package Text::NSP::Measures::2D::Dice::dice;
 64 | 
 65 | 
 66 | use Text::NSP::Measures::2D::Dice;
 67 | use strict;
 68 | use Carp;
 69 | use warnings;
 70 | no warnings 'redefine';
 71 | require Exporter;
 72 | 
 73 | our ($VERSION, @EXPORT, @ISA);
 74 | 
 75 | @ISA  = qw(Exporter);
 76 | 
 77 | @EXPORT = qw(initializeStatistic calculateStatistic
 78 |              getErrorCode getErrorMessage getStatisticName);
 79 | 
 80 | $VERSION = '0.97';
 81 | 
 82 | 
 83 | =item calculateStatistic() - method to calculate the dice coefficient value
 84 | 
 85 | INPUT PARAMS  : %count_values       .. Hash (passed as a flat key => value
 86 |                                        list, see SYNOPSIS) containing the count
 87 |                                        values computed by the count.pl program.
 88 | 
 89 | RETURN VALUES : $dice               .. Dice Coefficient value for this bigram.
 90 | 
 91 | =cut
 92 | 
 93 | sub calculateStatistic
 94 | {
 95 |   my %values = @_;
 96 | 
 97 |   #compute and return the dice coefficient.
 98 |   return Text::NSP::Measures::2D::Dice::computeVal(\%values);
 99 | }
100 | 
101 | 
102 | =item getStatisticName() - Returns the name of this statistic
103 | 
104 | INPUT PARAMS  : none
105 | 
106 | RETURN VALUES : $name      .. Name of the measure.
107 | 
108 | =cut
109 | 
sub getStatisticName
{
  # fixed, human readable label of this measure; arguments are ignored
  my $name = "Dice Coefficient";
  return $name;
}
115 | 
116 | 
117 | 
118 | 1;
119 | __END__
120 | 
121 | 
122 | =back
123 | 
124 | =head1 AUTHOR
125 | 
126 | Ted Pedersen,                University of Minnesota Duluth
127 |                              E<lt>tpederse@d.umn.eduE<gt>
128 | 
129 | Satanjeev Banerjee,          Carnegie Mellon University
130 |                              E<lt>satanjeev@cmu.eduE<gt>
131 | 
132 | Amruta Purandare,            University of Pittsburgh
133 |                              E<lt>amruta@cs.pitt.eduE<gt>
134 | 
135 | Bridget Thomson-McInnes,     University of Minnesota Twin Cities
136 |                              E<lt>bthompson@d.umn.eduE<gt>
137 | 
138 | Saiyam Kohli,                University of Minnesota Duluth
139 |                              E<lt>kohli003@d.umn.eduE<gt>
140 | 
141 | =head1 HISTORY
142 | 
143 | Last updated: $Id: dice.pm,v 1.6 2006/06/21 11:10:52 saiyam_kohli Exp $
144 | 
145 | =head1 BUGS
146 | 
147 | 
148 | =head1 SEE ALSO
149 | 
150 |   @article{SmadjaMH96,
151 |           author = {Smadja, F. and McKeown, K. and Hatzivassiloglou, V.},
152 |           title = {Translating Collocations for Bilingual Lexicons: A
153 |                   Statistical Approach},
154 |           journal = {Computational Linguistics},
155 |           volume = {22},
156 |           number = {1},
157 |           year = {1996},
158 |           pages = {1-38}
159 |           url = L<http://www.cs.mu.oz.au/acl/J/J96/J96-1001.pdf>}
160 | 
161 | L<http://groups.yahoo.com/group/ngram/>
162 | 
163 | L<http://www.d.umn.edu/~tpederse/nsp.html>
164 | 
165 | 
166 | =head1 COPYRIGHT
167 | 
168 | Copyright (C) 2000-2006, Ted Pedersen, Satanjeev Banerjee, Amruta
169 | Purandare, Bridget Thomson-McInnes and Saiyam Kohli
170 | 
171 | This program is free software; you can redistribute it and/or modify it
172 | under the terms of the GNU General Public License as published by the Free
173 | Software Foundation; either version 2 of the License, or (at your option)
174 | any later version.
175 | 
176 | This program is distributed in the hope that it will be useful, but
177 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
178 | or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
179 | for more details.
180 | 
181 | You should have received a copy of the GNU General Public License along
182 | with this program; if not, write to
183 | 
184 |     The Free Software Foundation, Inc.,
185 |     59 Temple Place - Suite 330,
186 |     Boston, MA  02111-1307, USA.
187 | 
188 | Note: a copy of the GNU General Public License is available on the web
189 | at L<http://www.gnu.org/licenses/gpl.txt> and is included in this
190 | distribution as GPL.txt.
191 | 
192 | =cut


--------------------------------------------------------------------------------
/geneSCF-master-v1.0/class/lib/Text/NSP/Measures/2D/Dice.pm:
--------------------------------------------------------------------------------
  1 | =head1 NAME
  2 | 
  3 | Text::NSP::Measures::2D::Dice  - Perl module that provides the
  4 |                                 framework to implement the Dice and
  5 |                                 Jaccard coefficients.
  6 | 
  7 | =head1 SYNOPSIS
  8 | 
  9 | =head3 Basic Usage
 10 | 
 11 |   use Text::NSP::Measures::2D::Dice::dice;
 12 | 
 13 |   my $npp = 60; my $n1p = 20; my $np1 = 20;  my $n11 = 10;
 14 | 
 15 |   $dice_value = calculateStatistic( n11=>$n11,
 16 |                                       n1p=>$n1p,
 17 |                                       np1=>$np1,
 18 |                                       npp=>$npp);
 19 | 
 20 |   if( ($errorCode = getErrorCode()))
 21 |   {
 22 |     print STDERR $errorCode." - ".getErrorMessage()."\n";
 23 |   }
 24 |   else
 25 |   {
 26 |     print getStatisticName()." value for bigram is ".$dice_value."\n";
 27 |   }
 28 | 
 29 | 
 30 | =head1 DESCRIPTION
 31 | 
 32 | Assume that the frequency count data associated with a bigram
 33 | <word1><word2> is stored in a 2x2 contingency table:
 34 | 
 35 |           word2   ~word2
 36 |   word1    n11      n12 | n1p
 37 |  ~word1    n21      n22 | n2p
 38 |            --------------
 39 |            np1      np2   npp
 40 | 
 41 | where n11 is the number of times <word1><word2> occur together, and
 42 | n12 is the number of times <word1> occurs with some word other than
 43 | word2, and n1p is the number of times in total that word1 occurs as
 44 | the first word in a bigram.
 45 | 
 46 | =over
 47 | 
 48 | =item The Dice Coefficient is defined as :
 49 | 
 50 |      2 * n11
 51 |     ---------
 52 |     np1 + n1p
 53 | 
 54 | =item The Jaccard coefficient is defined as:
 55 | 
 56 |           n11
 57 |     ---------------
 58 |     n11 + n12 + n21
 59 | 
 60 | =back
 61 | 
 62 | =head2 Methods
 63 | 
 64 | =over
 65 | 
 66 | =cut
 67 | 
 68 | 
 69 | package Text::NSP::Measures::2D::Dice;
 70 | 
 71 | 
 72 | use Text::NSP::Measures::2D;
 73 | use strict;
 74 | use Carp;
 75 | use warnings;
 76 | # use subs(calculateStatistic);
 77 | require Exporter;
 78 | 
 79 | our ($VERSION, @EXPORT, @ISA);
 80 | 
 81 | @ISA  = qw(Exporter);
 82 | 
 83 | @EXPORT = qw(initializeStatistic calculateStatistic
 84 |              getErrorCode getErrorMessage getStatisticName);
 85 | 
 86 | $VERSION = '0.97';
 87 | 
 88 | =item computeVal() - method to calculate the dice coefficient value
 89 | 
 90 | INPUT PARAMS  : $count_values       .. Reference to a hash containing
 91 |                                        the count values computed by the
 92 |                                        count.pl program.
 93 | 
 94 | RETURN VALUES : $dice               .. Dice Coefficient value for this bigram.
 95 | 
 96 | =cut
 97 | 
 98 | sub computeVal
 99 | {
100 |   my $values = shift;
101 | 
102 |   # computes and returns the marginal totals from the frequency
103 |   # combination values. returns undef if there is an error in
104 |   # the computation or the values are inconsistent.
105 |   if(!(Text::NSP::Measures::2D::computeMarginalTotals($values)) ){
106 |     return;
107 |   }
108 | 
109 |   # computes and returns the observed from the frequency
110 |   # combination values. returns undef if there is an error in
111 |   # the computation or the values are inconsistent.
112 |   if( !(Text::NSP::Measures::2D::computeObservedValues($values)) ) {
113 |       return;
114 |   }
115 | 
116 |   my $dice = 2 * $n11 / ($n1p + $np1);
117 | 
118 |   return ($dice);
119 | }
120 | 
121 | 
122 | 
123 | 
124 | 1;
125 | __END__
126 | 
127 | 
128 | =back
129 | 
130 | =head1 AUTHOR
131 | 
132 | Ted Pedersen,                University of Minnesota Duluth
133 |                              E<lt>tpederse@d.umn.eduE<gt>
134 | 
135 | Satanjeev Banerjee,          Carnegie Mellon University
136 |                              E<lt>satanjeev@cmu.eduE<gt>
137 | 
138 | Amruta Purandare,            University of Pittsburgh
139 |                              E<lt>amruta@cs.pitt.eduE<gt>
140 | 
141 | Bridget Thomson-McInnes,     University of Minnesota Twin Cities
142 |                              E<lt>bthompson@d.umn.eduE<gt>
143 | 
144 | Saiyam Kohli,                University of Minnesota Duluth
145 |                              E<lt>kohli003@d.umn.eduE<gt>
146 | 
147 | =head1 HISTORY
148 | 
149 | Last updated: $Id: Dice.pm,v 1.6 2006/06/21 11:10:52 saiyam_kohli Exp $
150 | 
151 | =head1 BUGS
152 | 
153 | 
154 | =head1 SEE ALSO
155 | 
156 | @article{SmadjaMH96,
157 |         author = {Smadja, F. and McKeown, K. and Hatzivassiloglou, V.},
158 |         title = {Translating Collocations for Bilingual Lexicons: A
159 |                  Statistical Approach},
160 |         journal = {Computational Linguistics},
161 |         volume = {22},
162 |         number = {1},
163 |         year = {1996},
164 |         pages = {1-38}
165 |         url = L<http://www.cs.mu.oz.au/acl/J/J96/J96-1001.pdf>}
166 | 
167 | L<http://groups.yahoo.com/group/ngram/>
168 | 
169 | L<http://www.d.umn.edu/~tpederse/nsp.html>
170 | 
171 | 
172 | =head1 COPYRIGHT
173 | 
174 | Copyright (C) 2000-2006, Ted Pedersen, Satanjeev Banerjee, Amruta
175 | Purandare, Bridget Thomson-McInnes and Saiyam Kohli
176 | 
177 | This program is free software; you can redistribute it and/or modify it
178 | under the terms of the GNU General Public License as published by the Free
179 | Software Foundation; either version 2 of the License, or (at your option)
180 | any later version.
181 | 
182 | This program is distributed in the hope that it will be useful, but
183 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
184 | or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
185 | for more details.
186 | 
187 | You should have received a copy of the GNU General Public License along
188 | with this program; if not, write to
189 | 
190 |     The Free Software Foundation, Inc.,
191 |     59 Temple Place - Suite 330,
192 |     Boston, MA  02111-1307, USA.
193 | 
194 | Note: a copy of the GNU General Public License is available on the web
195 | at L<http://www.gnu.org/licenses/gpl.txt> and is included in this
196 | distribution as GPL.txt.
197 | 
198 | =cut


--------------------------------------------------------------------------------
/geneSCF-master-v1.0/class/lib/Text/NSP/Measures/3D/MI/pmi.pm:
--------------------------------------------------------------------------------
  1 | =head1 NAME
  2 | 
  3 | Text::NSP::Measures::3D::MI::pmi - Perl module that implements Pointwise
  4 |                                    Mutual Information for trigrams.
  5 | 
  6 | =head1 SYNOPSIS
  7 | 
  8 | =head3 Basic Usage
  9 | 
 10 |   use Text::NSP::Measures::3D::MI::pmi;
 11 | 
 12 |   $pmi_value = calculateStatistic( n111=>10,
 13 |                                   n1pp=>40,
 14 |                                   np1p=>45,
 15 |                                   npp1=>42,
 16 |                                   n11p=>20,
 17 |                                   n1p1=>23,
 18 |                                   np11=>21,
 19 |                                   nppp=>100);
 20 | 
 21 |   if( ($errorCode = getErrorCode()))
 22 |   {
 23 |     print STDERR $errorCode." - ".getErrorMessage()."\n";
 24 |   }
 25 |   else
 26 |   {
 27 |     print getStatisticName." value for trigram is ".$pmi_value."\n";
 28 |   }
 29 | 
 30 | 
 31 | =head1 DESCRIPTION
 32 | 
 33 | The expected values for the internal cells are calculated by taking the
 34 | product of their associated marginals and dividing by the sample size,
 35 | for example:
 36 | 
 37 |             n1pp * np1p * npp1
 38 |    m111=   --------------------
 39 |               nppp * nppp
 40 | 
 41 | Pointwise Mutual Information (pmi) is defined as the log of the deviation
 42 | between the observed frequency of a trigram (n111) and the probability of
 43 | that trigram if it were independent (m111).
 44 | 
 45 |  PMI =   log (n111/m111)
 46 | 
 47 | =head2 Methods
 48 | 
 49 | =over
 50 | 
 51 | =cut
 52 | 
 53 | 
 54 | package Text::NSP::Measures::3D::MI::pmi;
 55 | 
 56 | 
 57 | use Text::NSP::Measures::3D::MI;
 58 | use strict;
 59 | use Carp;
 60 | use warnings;
 61 | no warnings 'redefine';
 62 | require Exporter;
 63 | 
 64 | our ($VERSION, @EXPORT, @ISA, $exp);
 65 | # Exponent applied to n111 in calculateStatistic(); initializeStatistic() replaces it.
 66 | $exp=1;
 67 | 
 68 | @ISA  = qw(Exporter);
 69 | 
 70 | @EXPORT = qw(initializeStatistic calculateStatistic
 71 |              getErrorCode getErrorMessage getStatisticName);
 72 | 
 73 | $VERSION = '0.97';
 74 | 
 75 | 
 76 | =item initializeStatistic() -Initialization of the pmi_exp parameter if required
 77 | 
 78 | INPUT PARAMS  : $exp                .. exponent applied to n111 (pmi_exp).
 79 | 
 80 | RETURN VALUES : none
 81 | 
 82 | =cut
 83 | 
 84 | sub initializeStatistic
 85 | {
 86 |   # No argument (or undef) means the default exponent of 1, never an undef $exp.
 87 |   my ($pmi_exp) = @_;
 88 |   $exp = defined($pmi_exp) ? $pmi_exp : 1;
 89 | }
 90 | 
 91 | =item calculateStatistic() - This method calculates the pmi value
 92 | 
 93 | INPUT PARAMS  : %values             .. Hash (key => value list) containing
 94 |                                        the count values computed by the
 95 |                                        count.pl program.
 96 | 
 97 | RETURN VALUES : $pmi                .. PMI value for this trigram.
 98 | 
 99 | =cut
100 | 
101 | sub calculateStatistic
102 | {
103 |   my %counts = @_;
104 | 
105 |   # getValues() is handed a reference to the counts; a false result
106 |   # means nothing can be scored, and this measure then returns 0.
107 |   Text::NSP::Measures::3D::MI::getValues(\%counts)
108 |     or return 0;
109 | 
110 |   # Raise the observed count n111 to the configured exponent before
111 |   # comparing it with the expected count m111.
112 |   my $powered_n111 = $n111 ** $exp;
113 |   my $pmi_raw = Text::NSP::Measures::3D::MI::computePMI($powered_n111, $m111);
114 | 
115 |   # NOTE(review): dividing by log(2) presumably rescales a natural-log
116 |   # result to base 2 -- confirm against computePMI().
117 |   return $pmi_raw / log(2);
118 | }
119 | 
120 | =item getStatisticName() - Returns the name of this statistic
121 | 
122 | INPUT PARAMS  : none
123 | 
124 | RETURN VALUES : $name      .. Name of the measure.
125 | 
126 | =cut
127 | 
128 | sub getStatisticName
129 | {
130 |     # Display name under which this measure is reported.
131 |     my $name = "Pointwise Mutual Information";
132 |     return $name;
133 | }
134 | 
135 | 1;
136 | __END__
137 | 
138 | 
139 | =back
140 | 
141 | =head1 AUTHOR
142 | 
143 | Ted Pedersen,                University of Minnesota Duluth
144 |                              E<lt>tpederse@d.umn.eduE<gt>
145 | 
146 | Satanjeev Banerjee,          Carnegie Mellon University
147 |                              E<lt>satanjeev@cmu.eduE<gt>
148 | 
149 | Amruta Purandare,            University of Pittsburgh
150 |                              E<lt>amruta@cs.pitt.eduE<gt>
151 | 
152 | Bridget Thomson-McInnes,     University of Minnesota Twin Cities
153 |                              E<lt>bthompson@d.umn.eduE<gt>
154 | 
155 | Saiyam Kohli,                University of Minnesota Duluth
156 |                              E<lt>kohli003@d.umn.eduE<gt>
157 | 
158 | =head1 HISTORY
159 | 
160 | Last updated: $Id: pmi.pm,v 1.9 2009/11/03 14:53:55 tpederse Exp $
161 | 
162 | =head1 BUGS
163 | 
164 | 
165 | =head1 SEE ALSO
166 | 
167 |   @inproceedings{ church89word,
168 |       author = {Kenneth W. Church and Patrick Hanks},
169 |       title = {Word association norms, mutual information, and Lexicography},
170 |       booktitle = {Proceedings of the 27th. Annual Meeting of the Association for Computational Linguistics},
171 |       publisher = {Association for Computational Linguistics},
172 |       address = {Vancouver, B.C.},
173 |       pages = {76--83},
174 |       year = {1989},
175 |       url = L<http://acl.ldc.upenn.edu/J/J90/J90-1003.pdf> }
176 | 
177 | 
178 | L<http://groups.yahoo.com/group/ngram/>
179 | 
180 | L<http://www.d.umn.edu/~tpederse/nsp.html>
181 | 
182 | 
183 | =head1 COPYRIGHT
184 | 
185 | Copyright (C) 2000-2006, Ted Pedersen, Satanjeev Banerjee, Amruta
186 | Purandare, Bridget Thomson-McInnes and Saiyam Kohli
187 | 
188 | This program is free software; you can redistribute it and/or modify it
189 | under the terms of the GNU General Public License as published by the Free
190 | Software Foundation; either version 2 of the License, or (at your option)
191 | any later version.
192 | 
193 | This program is distributed in the hope that it will be useful, but
194 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
195 | or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
196 | for more details.
197 | 
198 | You should have received a copy of the GNU General Public License along
199 | with this program; if not, write to
200 | 
201 |     The Free Software Foundation, Inc.,
202 |     59 Temple Place - Suite 330,
203 |     Boston, MA  02111-1307, USA.
204 | 
205 | Note: a copy of the GNU General Public License is available on the web
206 | at L<http://www.gnu.org/licenses/gpl.txt> and is included in this
207 | distribution as GPL.txt.
208 | 
209 | =cut
210 | 


--------------------------------------------------------------------------------
/geneSCF-master-v1.0/class/lib/Text/NSP/Measures/2D/CHI/tscore.pm:
--------------------------------------------------------------------------------
  1 | =head1 NAME
  2 | 
  3 | Text::NSP::Measures::2D::CHI::tscore  - Perl module that implements T-score
  4 |                                         measure of association for bigrams.
  5 | 
  6 | 
  7 | =head1 SYNOPSIS
  8 | 
  9 | =head3 Basic Usage
 10 | 
 11 |   use Text::NSP::Measures::2D::CHI::tscore;
 12 | 
 13 |   my $npp = 60; my $n1p = 20; my $np1 = 20;  my $n11 = 10;
 14 | 
 15 |   $tscore_value = calculateStatistic( n11=>$n11,
 16 |                                       n1p=>$n1p,
 17 |                                       np1=>$np1,
 18 |                                       npp=>$npp);
 19 | 
 20 |   if( ($errorCode = getErrorCode()))
 21 |   {
 22 |     print STDERR $errorCode." - ".getErrorMessage()."\n";
 23 |   }
 24 |   else
 25 |   {
 26 |     print getStatisticName." value for bigram is ".$tscore_value."\n";
 27 |   }
 28 | 
 29 | =head1 DESCRIPTION
 30 | 
 31 | Assume that the frequency count data associated with a bigram
 32 | <word1><word2> is stored in a 2x2 contingency table:
 33 | 
 34 |           word2   ~word2
 35 |   word1    n11      n12 | n1p
 36 |  ~word1    n21      n22 | n2p
 37 |            --------------
 38 |            np1      np2   npp
 39 | 
 40 | where n11 is the number of times <word1><word2> occur together, and
 41 | n12 is the number of times <word1> occurs with some word other than
 42 | word2, and n1p is the number of times in total that word1 occurs as
 43 | the first word in a bigram.
 44 | 
 45 | The T-score is defined as a ratio of difference between the observed
 46 | and the expected mean to the variance of the sample. Note that this
 47 | is a variant of the standard t-test that was proposed for use in the
 48 | identification of collocations in large samples of text.
 49 | 
 50 | Thus, the T-score is defined as follows:
 51 | 
 52 |     m11 = n1p * np1 / npp
 53 | 
 54 |     T-score = (n11 - m11)/sqrt(n11)
 55 | 
 56 | =over
 57 | 
 58 | =cut
 59 | 
 60 | 
 61 | package Text::NSP::Measures::2D::CHI::tscore;
 62 | 
 63 | 
 64 | use Text::NSP::Measures::2D::CHI;
 65 | use strict;
 66 | use Carp;
 67 | use warnings;
 68 | no warnings 'redefine';
 69 | require Exporter;
 70 | 
 71 | our ($VERSION, @EXPORT, @ISA);
 72 | 
 73 | @ISA  = qw(Exporter);
 74 | 
 75 | @EXPORT = qw(initializeStatistic calculateStatistic
 76 |              getErrorCode getErrorMessage getStatisticName);
 77 | 
 78 | $VERSION = '0.97';
 79 | 
 80 | 
 81 | =item calculateStatistic() - method to calculate the tscore Coefficient
 82 | 
 83 | INPUT PARAMS  : %values             .. Hash (key => value list) containing
 84 |                                        the count values computed by the
 85 |                                        count.pl program.
 86 | 
 87 | RETURN VALUES : $tscore             .. tscore value for this bigram.
 88 | 
 89 | =cut
 90 | 
 91 | sub calculateStatistic
 92 | {
 93 |   my %values = @_;
 94 | 
 95 |   # getValues() is handed a reference to the counts; on a false
 96 |   # result nothing can be scored, so return undef (an empty list
 97 |   # in list context).
 98 |   if( !(Text::NSP::Measures::2D::CHI::getValues(\%values)) ) {
 99 |     return;
100 |   }
101 | 
102 |   # T-score = (n11 - m11)/sqrt(n11) has no value unless n11 is positive;
103 |   # with n11 == 0 the division would die ("Illegal division by zero").
104 |   return if $n11 <= 0;
105 | 
106 |   my $tscore = ($n11 - $m11) / ($n11 ** 0.5);
107 |   return ( $tscore );
108 | }
109 | 
110 | =item getStatisticName() - Returns the name of this statistic
111 | 
112 | INPUT PARAMS  : none
113 | 
114 | RETURN VALUES : $name      .. Name of the measure.
115 | 
116 | =cut
117 | 
118 | sub getStatisticName
119 | {
120 |   # Display name under which this measure is reported.
121 |   my $name = "T-score";
122 |   return $name;
123 | }
124 | 
125 | 1;
126 | __END__
127 | 
128 | 
129 | =back
130 | 
131 | =head1 AUTHOR
132 | 
133 | Ted Pedersen,                University of Minnesota Duluth
134 |                              E<lt>tpederse@d.umn.eduE<gt>
135 | 
136 | Satanjeev Banerjee,          Carnegie Mellon University
137 |                              E<lt>satanjeev@cmu.eduE<gt>
138 | 
139 | Amruta Purandare,            University of Pittsburgh
140 |                              E<lt>amruta@cs.pitt.eduE<gt>
141 | 
142 | Bridget Thomson-McInnes,     University of Minnesota Twin Cities
143 |                              E<lt>bthompson@d.umn.eduE<gt>
144 | 
145 | Saiyam Kohli,                University of Minnesota Duluth
146 |                              E<lt>kohli003@d.umn.eduE<gt>
147 | 
148 | =head1 HISTORY
149 | 
150 | Last updated: $Id: tscore.pm,v 1.11 2006/06/21 11:10:52 saiyam_kohli Exp $
151 | 
152 | =head1 BUGS
153 | 
154 | 
155 | =head1 SEE ALSO
156 | 
157 |   @incollection {ChurchGHH91,
158 |           author={Church, K. and Gale, W. and Hanks, P. and Hindle, D. },
159 |           title={Using Statistics in Lexical Analysis},
160 |           booktitle={Lexical Acquisition: Exploiting On-Line Resources
161 |                       to Build a Lexicon},
162 |           editor={Zernik, U.},
163 |           year={1991},
164 |           address={Hillsdale, NJ},
165 |           publisher={Lawrence Erlbaum Associates}
166 |           url = L<http://www.patrickhanks.com/papers/usingStats.pdf>}
167 | 
168 | L<http://groups.yahoo.com/group/ngram/>
169 | 
170 | L<http://www.d.umn.edu/~tpederse/nsp.html>
171 | 
172 | 
173 | =head1 COPYRIGHT
174 | 
175 | Copyright (C) 2000-2006, Ted Pedersen, Satanjeev Banerjee, Amruta
176 | Purandare, Bridget Thomson-McInnes and Saiyam Kohli
177 | 
178 | This program is free software; you can redistribute it and/or modify it
179 | under the terms of the GNU General Public License as published by the Free
180 | Software Foundation; either version 2 of the License, or (at your option)
181 | any later version.
182 | 
183 | This program is distributed in the hope that it will be useful, but
184 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
185 | or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
186 | for more details.
187 | 
188 | You should have received a copy of the GNU General Public License along
189 | with this program; if not, write to
190 | 
191 |     The Free Software Foundation, Inc.,
192 |     59 Temple Place - Suite 330,
193 |     Boston, MA  02111-1307, USA.
194 | 
195 | Note: a copy of the GNU General Public License is available on the web
196 | at L<http://www.gnu.org/licenses/gpl.txt> and is included in this
197 | distribution as GPL.txt.
198 | 
199 | =cut


--------------------------------------------------------------------------------
/geneSCF-master-v1.0/class/lib/Text/NSP/Measures/3D/MI/ps.pm:
--------------------------------------------------------------------------------
  1 | =head1 NAME
  2 | 
  3 | Text::NSP::Measures::3D::MI::ps - Perl module that implements
  4 |                                   Poisson Stirling Measure for trigrams.
  5 | 
  6 | =head1 SYNOPSIS
  7 | 
  8 | =head3 Basic Usage
  9 | 
 10 |   use Text::NSP::Measures::3D::MI::ps;
 11 | 
 12 |   $ps_value = calculateStatistic( n111=>10,
 13 |                                   n1pp=>40,
 14 |                                   np1p=>45,
 15 |                                   npp1=>42,
 16 |                                   n11p=>20,
 17 |                                   n1p1=>23,
 18 |                                   np11=>21,
 19 |                                   nppp=>100);
 20 | 
 21 |   if( ($errorCode = getErrorCode()))
 22 |   {
 23 |     print STDERR $errorCode." - ".getErrorMessage()."\n";
 24 |   }
 25 |   else
 26 |   {
 27 |     print getStatisticName." value for trigram is ".$ps_value."\n";
 28 |   }
 29 | 
 30 | 
 31 | =head1 DESCRIPTION
 32 | 
 33 | The log-likelihood ratio measures the deviation between the observed data
 34 | and what would be expected if <word1>, <word2> and <word3> were independent.
 35 | The higher the score, the less evidence there is in favor of concluding that
 36 | the words are independent.
 37 | 
 38 | The expected values for the internal cells are calculated by taking the
 39 | product of their associated marginals and dividing by the sample size,
 40 | for example:
 41 | 
 42 |             n1pp * np1p * npp1
 43 |    m111=   --------------------
 44 |               nppp * nppp
 45 | 
 46 | The Poisson-Stirling measure is a negative logarithmic approximation
 47 | of the Poisson-likelihood measure. It uses Stirling's formula to
 48 | approximate the factorial in the Poisson-likelihood measure. It is
 49 | computed as follows:
 50 | 
 51 | Poisson-Stirling = n111 * ( log(n111) - log(m111) - 1)
 52 | 
 53 | =head2 Methods
 54 | 
 55 | =over
 56 | 
 57 | =cut
 58 | 
 59 | package Text::NSP::Measures::3D::MI::ps;
 60 | 
 61 | 
 62 | use Text::NSP::Measures::3D::MI;
 63 | use strict;
 64 | use Carp;
 65 | use warnings;
 66 | no warnings 'redefine';
 67 | require Exporter;
 68 | 
 69 | our ($VERSION, @EXPORT, @ISA);
 70 | 
 71 | @ISA  = qw(Exporter);
 72 | 
 73 | @EXPORT = qw(initializeStatistic calculateStatistic
 74 |              getErrorCode getErrorMessage getStatisticName);
 75 | 
 76 | $VERSION = '0.97';
 77 | 
 78 | =item calculateStatistic() - This method calculates the ps value
 79 | 
 80 | INPUT PARAMS  : %values             .. Hash (key => value list) containing
 81 |                                        the count values computed by the
 82 |                                        count.pl program.
 83 | 
 84 | RETURN VALUES : $poissonStirling      .. Poisson-Stirling value for this trigram.
 85 | 
 86 | =cut
 87 | 
 88 | sub calculateStatistic
 89 | {
 90 |   my %counts = @_;
 91 | 
 92 |   # getValues() is handed a reference to the counts; on a false
 93 |   # result nothing can be scored, so return undef (an empty list
 94 |   # in list context).
 95 |   Text::NSP::Measures::3D::MI::getValues(\%counts)
 96 |     or return;
 97 | 
 98 |   # Documented formula: n111 * (log(n111) - log(m111) - 1).
 99 |   # NOTE(review): computePMI() presumably supplies the
100 |   # log(n111) - log(m111) term and is defined elsewhere -- confirm
101 |   # how it behaves when n111 or m111 is not positive.
102 |   my $log_ratio =
103 |     Text::NSP::Measures::3D::MI::computePMI($n111, $m111);
104 | 
105 |   return $n111 * ($log_ratio - 1);
106 | }
107 | 
108 | 
109 | =item getStatisticName() - Returns the name of this statistic
110 | 
111 | INPUT PARAMS  : none
112 | 
113 | RETURN VALUES : $name      .. Name of the measure.
114 | 
115 | =cut
116 | 
117 | sub getStatisticName
118 | {
119 |     # Display name under which this measure is reported.
120 |     my $name = "Poisson-Stirling Measure";
121 |     return $name;
122 | }
123 | 
124 | 1;
125 | __END__
126 | 
127 | 
128 | =back
129 | 
130 | =head1 AUTHOR
131 | 
132 | Ted Pedersen,                University of Minnesota Duluth
133 |                              E<lt>tpederse@d.umn.eduE<gt>
134 | 
135 | Satanjeev Banerjee,          Carnegie Mellon University
136 |                              E<lt>satanjeev@cmu.eduE<gt>
137 | 
138 | Amruta Purandare,            University of Pittsburgh
139 |                              E<lt>amruta@cs.pitt.eduE<gt>
140 | 
141 | Bridget Thomson-McInnes,     University of Minnesota Twin Cities
142 |                              E<lt>bthompson@d.umn.eduE<gt>
143 | 
144 | Saiyam Kohli,                University of Minnesota Duluth
145 |                              E<lt>kohli003@d.umn.eduE<gt>
146 | 
147 | =head1 HISTORY
148 | 
149 | Last updated: $Id: ps.pm,v 1.7 2006/06/21 11:10:53 saiyam_kohli Exp $
150 | 
151 | =head1 BUGS
152 | 
153 | 
154 | =head1 SEE ALSO
155 | 
156 |   @inproceedings{ church89word,
157 |       author = {Kenneth W. Church and Patrick Hanks},
158 |       title = {Word association norms, mutual information, and Lexicography},
159 |       booktitle = {Proceedings of the 27th. Annual Meeting of the Association for Computational Linguistics},
160 |       publisher = {Association for Computational Linguistics},
161 |       address = {Vancouver, B.C.},
162 |       pages = {76--83},
163 |       year = {1989},
164 |       url = L<http://acl.ldc.upenn.edu/J/J90/J90-1003.pdf> }
165 | 
166 | 
167 | L<http://groups.yahoo.com/group/ngram/>
168 | 
169 | L<http://www.d.umn.edu/~tpederse/nsp.html>
170 | 
171 | 
172 | =head1 COPYRIGHT
173 | 
174 | Copyright (C) 2000-2006, Ted Pedersen, Satanjeev Banerjee, Amruta
175 | Purandare, Bridget Thomson-McInnes and Saiyam Kohli
176 | 
177 | This program is free software; you can redistribute it and/or modify it
178 | under the terms of the GNU General Public License as published by the Free
179 | Software Foundation; either version 2 of the License, or (at your option)
180 | any later version.
181 | 
182 | This program is distributed in the hope that it will be useful, but
183 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
184 | or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
185 | for more details.
186 | 
187 | You should have received a copy of the GNU General Public License along
188 | with this program; if not, write to
189 | 
190 |     The Free Software Foundation, Inc.,
191 |     59 Temple Place - Suite 330,
192 |     Boston, MA  02111-1307, USA.
193 | 
194 | Note: a copy of the GNU General Public License is available on the web
195 | at L<http://www.gnu.org/licenses/gpl.txt> and is included in this
196 | distribution as GPL.txt.
197 | 
198 | =cut


--------------------------------------------------------------------------------
/geneSCF-master-v1.0/class/lib/Text/NSP/Measures/2D/CHI/x2.pm:
--------------------------------------------------------------------------------
  1 | =head1 NAME
  2 | 
  3 | Text::NSP::Measures::2D::CHI::x2  - Perl module that implements Pearson's
  4 |                                     chi squared measure of association for
  5 |                                     bigrams.
  6 | 
  7 | =head1 SYNOPSIS
  8 | 
  9 | =head3 Basic Usage
 10 | 
 11 |   use Text::NSP::Measures::2D::CHI::x2;
 12 | 
 13 |   my $npp = 60; my $n1p = 20; my $np1 = 20;  my $n11 = 10;
 14 | 
 15 |   $x2_value = calculateStatistic( n11=>$n11,
 16 |                                       n1p=>$n1p,
 17 |                                       np1=>$np1,
 18 |                                       npp=>$npp);
 19 | 
 20 |   if( ($errorCode = getErrorCode()))
 21 |   {
 22 |     print STDERR $errorCode." - ".getErrorMessage()."\n";
 23 |   }
 24 |   else
 25 |   {
 26 |     print getStatisticName." value for bigram is ".$x2_value."\n";
 27 |   }
 28 | 
 29 | =head1 DESCRIPTION
 30 | 
 31 | Pearson's Chi-squared test measures the deviation between the observed
 32 | data and what would be expected if <word1> and <word2> were independent.
 33 | The higher the score, the less evidence there is in favor of concluding
 34 | that the words are independent.
 35 | 
 36 | 
 37 | Assume that the frequency count data associated with a bigram
 38 | <word1><word2> is stored in a 2x2 contingency table:
 39 | 
 40 |           word2   ~word2
 41 |   word1    n11      n12 | n1p
 42 |  ~word1    n21      n22 | n2p
 43 |            --------------
 44 |            np1      np2   npp
 45 | 
 46 | where n11 is the number of times <word1><word2> occur together, and
 47 | n12 is the number of times <word1> occurs with some word other than
 48 | word2, and n1p is the number of times in total that word1 occurs as
 49 | the first word in a bigram.
 50 | 
 51 | The expected values for the internal cells are calculated by taking the
 52 | product of their associated marginals and dividing by the sample size,
 53 | for example:
 54 | 
 55 |           np1 * n1p
 56 |    m11=   ---------
 57 |             npp
 58 | 
 59 | Then the deviation between observed and expected values for each internal
 60 | cell is computed to arrive at the Pearson's Chi-Squared test value:
 61 | 
 62 |  Pearson's Chi-Squared = (n11 - m11)^2/m11 + (n12 - m12)^2/m12 +
 63 |                          (n21 - m21)^2/m21 + (n22 - m22)^2/m22
 64 | 
 65 | 
 66 | =over
 67 | 
 68 | =cut
 69 | 
 70 | 
 71 | package Text::NSP::Measures::2D::CHI::x2;
 72 | 
 73 | 
 74 | use Text::NSP::Measures::2D::CHI;
 75 | use strict;
 76 | use Carp;
 77 | use warnings;
 78 | no warnings 'redefine';
 79 | require Exporter;
 80 | 
 81 | our ($VERSION, @EXPORT, @ISA);
 82 | 
 83 | @ISA  = qw(Exporter);
 84 | 
 85 | @EXPORT = qw(initializeStatistic calculateStatistic
 86 |              getErrorCode getErrorMessage getStatisticName);
 87 | 
 88 | $VERSION = '0.97';
 89 | 
 90 | 
 91 | =item calculateStatistic() - method to calculate the Chi-squared value.
 92 | 
 93 | INPUT PARAMS  : %values             .. Hash (key => value list) containing
 94 |                                        the count values computed by the
 95 |                                        count.pl program.
 96 | 
 97 | RETURN VALUES : $x2                .. x2 value for this bigram.
 98 | 
 99 | =cut
100 | 
101 | sub calculateStatistic
102 | {
103 |   my %counts = @_;
104 | 
105 |   # getValues() is given a reference to the counts; when it reports
106 |   # failure nothing can be scored, so return undef (or an empty list
107 |   # in list context).
108 |   Text::NSP::Measures::2D::CHI::getValues(\%counts)
109 |     or return;
110 | 
111 |   # The statistic is the sum of computeVal(observed, expected) over
112 |   # the four cells of the 2x2 table, taken in the order n11, n12,
113 |   # n21, n22.
114 |   my $chi_squared =
115 |       Text::NSP::Measures::2D::CHI::computeVal($n11, $m11)
116 |     + Text::NSP::Measures::2D::CHI::computeVal($n12, $m12)
117 |     + Text::NSP::Measures::2D::CHI::computeVal($n21, $m21)
118 |     + Text::NSP::Measures::2D::CHI::computeVal($n22, $m22);
119 |   return $chi_squared;
120 | }
121 | 
122 | 
123 | 
124 | =item getStatisticName() - Returns the name of this statistic
125 | 
126 | INPUT PARAMS  : none
127 | 
128 | RETURN VALUES : $name      .. Name of the measure.
129 | 
130 | =cut
131 | 
132 | sub getStatisticName
133 | {
134 |   # Display name under which this measure is reported.
135 |   my $name = "Chi-squared test";
136 |   return $name;
137 | }
138 | 
139 | 1;
140 | __END__
141 | 
142 | 
143 | =back
144 | 
145 | =head1 AUTHOR
146 | 
147 | Ted Pedersen,                University of Minnesota Duluth
148 |                              E<lt>tpederse@d.umn.eduE<gt>
149 | 
150 | Satanjeev Banerjee,          Carnegie Mellon University
151 |                              E<lt>satanjeev@cmu.eduE<gt>
152 | 
153 | Amruta Purandare,            University of Pittsburgh
154 |                              E<lt>amruta@cs.pitt.eduE<gt>
155 | 
156 | Bridget Thomson-McInnes,     University of Minnesota Twin Cities
157 |                              E<lt>bthompson@d.umn.eduE<gt>
158 | 
159 | Saiyam Kohli,                University of Minnesota Duluth
160 |                              E<lt>kohli003@d.umn.eduE<gt>
161 | 
162 | =head1 HISTORY
163 | 
164 | Last updated: $Id: x2.pm,v 1.10 2006/06/21 11:10:52 saiyam_kohli Exp $
165 | 
166 | =head1 BUGS
167 | 
168 | 
169 | =head1 SEE ALSO
170 | 
171 | L<http://groups.yahoo.com/group/ngram/>
172 | 
173 | L<http://www.d.umn.edu/~tpederse/nsp.html>
174 | 
175 | 
176 | =head1 COPYRIGHT
177 | 
178 | Copyright (C) 2000-2006, Ted Pedersen, Satanjeev Banerjee, Amruta
179 | Purandare, Bridget Thomson-McInnes and Saiyam Kohli
180 | 
181 | This program is free software; you can redistribute it and/or modify it
182 | under the terms of the GNU General Public License as published by the Free
183 | Software Foundation; either version 2 of the License, or (at your option)
184 | any later version.
185 | 
186 | This program is distributed in the hope that it will be useful, but
187 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
188 | or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
189 | for more details.
190 | 
191 | You should have received a copy of the GNU General Public License along
192 | with this program; if not, write to
193 | 
194 |     The Free Software Foundation, Inc.,
195 |     59 Temple Place - Suite 330,
196 |     Boston, MA  02111-1307, USA.
197 | 
198 | Note: a copy of the GNU General Public License is available on the web
199 | at L<http://www.gnu.org/licenses/gpl.txt> and is included in this
200 | distribution as GPL.txt.
201 | 
202 | =cut


--------------------------------------------------------------------------------
/geneSCF-master-v1.0/class/lib/Text/NSP/Measures/2D/odds.pm:
--------------------------------------------------------------------------------
  1 | =head1 NAME
  2 | 
  3 | Text::NSP::Measures::2D::odds - Perl module to compute the Odds
  4 |                                 ratio for bigrams.
  5 | 
  6 | =head1 SYNOPSIS
  7 | 
  8 | =head3 Basic Usage
  9 | 
 10 |  use Text::NSP::Measures::2D::odds;
 11 | 
 12 |   my $npp = 60; my $n1p = 20; my $np1 = 20;  my $n11 = 10;
 13 | 
 14 |   $odds_value = calculateStatistic( n11=>$n11,
 15 |                                       n1p=>$n1p,
 16 |                                       np1=>$np1,
 17 |                                       npp=>$npp);
 18 | 
 19 |   if( ($errorCode = getErrorCode()))
 20 |   {
 21 |     print STDERR $errorCode." - ".getErrorMessage()."\n";
 22 |   }
 23 |   else
 24 |   {
 25 |     print getStatisticName." value for bigram is ".$odds_value."\n";
 26 |   }
 27 | 
 28 | 
 29 | 
 30 | =head1 DESCRIPTION
 31 | 
 32 | Assume that the frequency count data associated with a bigram
 33 | <word1><word2> is stored in a 2x2 contingency table:
 34 | 
 35 |           word2   ~word2
 36 |   word1    n11      n12 | n1p
 37 |  ~word1    n21      n22 | n2p
 38 |            --------------
 39 |            np1      np2   npp
 40 | 
 41 | where n11 is the number of times <word1><word2> occur together, and
 42 | n12 is the number of times <word1> occurs with some word other than
 43 | word2, and n1p is the number of times in total that word1 occurs as
 44 | the first word in a bigram.
 45 | 
 46 | The odds ratio computes the ratio of the number of times that
 47 | the words in a bigram occur together (or not at all) to the
 48 | number of times the words occur individually. It is the cross
 49 | product of the diagonal and the off-diagonal.
 50 | 
 51 | Thus, ODDS RATIO = (n11*n22)/(n21*n12)
 52 | 
 53 | if n21 and/or n12 is 0, then each zero value is "smoothed" to one to
 54 | avoid a zero in the denominator.
 55 | 
 56 | =over
 57 | 
 58 | =cut
 59 | 
 60 | 
 61 | package Text::NSP::Measures::2D::odds;
 62 | 
 63 | 
 64 | use Text::NSP::Measures::2D;
 65 | use strict;
 66 | use Carp;
 67 | use warnings;
 68 | no warnings 'redefine';
 69 | require Exporter;
 70 | 
 71 | our ($VERSION, @EXPORT, @ISA);
 72 | 
 73 | @ISA  = qw(Exporter);
 74 | 
 75 | @EXPORT = qw(initializeStatistic calculateStatistic
 76 |              getErrorCode getErrorMessage getStatisticName);
 77 | 
 78 | $VERSION = '0.97';
 79 | 
 80 | 
 81 | =item calculateStatistic() - method to calculate the odds ratio value!
 82 | 
 83 | INPUT PARAMS  : %values             .. Hash (key => value list) containing
 84 |                                        the count values computed by the
 85 |                                        count.pl program.
 86 | 
 87 | RETURN VALUES : $odds               .. Odds ratio for this bigram.
 88 | 
 89 | =cut
 90 | 
 91 | sub calculateStatistic
 92 | {
 93 |   # Arguments arrive as a flat key => value list of counts.
 94 |   my %counts = @_;
 95 | 
 96 |   # Both helpers are handed a reference to the counts and signal
 97 |   # failure with a false result.  The two failure paths differ in
 98 |   # what the caller gets back: a marginal-totals failure yields
 99 |   # undef (an empty list in list context), an observed-values
100 |   # failure yields 0.
101 |   Text::NSP::Measures::2D::computeMarginalTotals(\%counts)
102 |     or return;
103 | 
104 |   Text::NSP::Measures::2D::computeObservedValues(\%counts)
105 |     or return 0;
106 | 
107 |   # Smoothing: a zero in n21 or n12 is replaced by one so that the
108 |   # denominator below cannot be zero.  The assignments target $n21
109 |   # and $n12 themselves (neither is a lexical declared in this
110 |   # sub), so the replacement value remains in those variables
111 |   # after the call returns.
112 |   $n21 = 1 if $n21 == 0;
113 |   $n12 = 1 if $n12 == 0;
114 | 
115 |   # Odds ratio: product of the diagonal cells divided by the
116 |   # product of the (smoothed) off-diagonal cells.
117 |   my $diagonal     = $n11 * $n22;
118 |   my $off_diagonal = $n12 * $n21;
119 | 
120 |   my $odds = $diagonal / $off_diagonal;
121 | 
122 |   return $odds;
123 | }
124 | 
125 | 
126 | 
127 | =item getStatisticName() - Reports the display name of this measure
128 | 
129 | INPUT PARAMS  : none
130 | 
131 | RETURN VALUES : $name      .. The string "Odds Ratio".
132 | 
133 | =cut
134 | 
135 | sub getStatisticName {
136 |   my $name = "Odds Ratio";
137 |   return $name;
138 | }
139 | 
140 | 
141 | 
142 | 1;
143 | __END__
144 | 
145 | 
146 | =back
147 | 
148 | =head1 AUTHOR
149 | 
150 | Ted Pedersen,                University of Minnesota Duluth
151 |                              E<lt>tpederse@d.umn.eduE<gt>
152 | 
153 | Satanjeev Banerjee,          Carnegie Mellon University
154 |                              E<lt>satanjeev@cmu.eduE<gt>
155 | 
156 | Amruta Purandare,            University of Pittsburgh
157 |                              E<lt>amruta@cs.pitt.eduE<gt>
158 | 
159 | Bridget Thomson-McInnes,     University of Minnesota Twin Cities
160 |                              E<lt>bthompson@d.umn.eduE<gt>
161 | 
162 | Saiyam Kohli,                University of Minnesota Duluth
163 |                              E<lt>kohli003@d.umn.eduE<gt>
164 | 
165 | =head1 HISTORY
166 | 
167 | Last updated: $Id: odds.pm,v 1.18 2006/06/21 11:10:52 saiyam_kohli Exp $
168 | 
169 | =head1 BUGS
170 | 
171 | 
172 | =head1 SEE ALSO
173 | 
174 |   @inproceedings{ blaheta01unsupervised,
175 |                   author = {D. BLAHETA and M. JOHNSON},
176 |                   title = {Unsupervised learning of multi-word verbs},
177 |                   booktitle = {Proceedings of the 39th Annual Meeting of the ACL},
178 |                   year = {2001},
179 |                   pages =  {54-60},
180 |                   url = L<http://www.cog.brown.edu/~mj/papers/2001/dpb-colloc01.pdf> }
181 | 
182 | L<http://groups.yahoo.com/group/ngram/>
183 | 
184 | L<http://www.d.umn.edu/~tpederse/nsp.html>
185 | 
186 | 
187 | =head1 COPYRIGHT
188 | 
189 | Copyright (C) 2000-2006, Ted Pedersen, Satanjeev Banerjee, Amruta
190 | Purandare, Bridget Thomson-McInnes and Saiyam Kohli
191 | 
192 | This program is free software; you can redistribute it and/or modify it
193 | under the terms of the GNU General Public License as published by the Free
194 | Software Foundation; either version 2 of the License, or (at your option)
195 | any later version.
196 | 
197 | This program is distributed in the hope that it will be useful, but
198 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
199 | or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
200 | for more details.
201 | 
202 | You should have received a copy of the GNU General Public License along
203 | with this program; if not, write to
204 | 
205 |     The Free Software Foundation, Inc.,
206 |     59 Temple Place - Suite 330,
207 |     Boston, MA  02111-1307, USA.
208 | 
209 | Note: a copy of the GNU General Public License is available on the web
210 | at L<http://www.gnu.org/licenses/gpl.txt> and is included in this
211 | distribution as GPL.txt.
212 | 
213 | =cut


--------------------------------------------------------------------------------
/geneSCF-master-v1.0/class/lib/Text/NSP/Measures/2D/CHI/phi.pm:
--------------------------------------------------------------------------------
  1 | =head1 NAME
  2 | 
  3 | Text::NSP::Measures::2D::CHI::phi - Perl module that implements Phi coefficient
  4 |                                     measure for bigrams.
  5 | 
  6 | =head1 SYNOPSIS
  7 | 
  8 | =head3 Basic Usage
  9 | 
 10 |   use Text::NSP::Measures::2D::CHI::phi;
 11 | 
 12 |   my $npp = 60; my $n1p = 20; my $np1 = 20;  my $n11 = 10;
 13 | 
 14 |   $phi_value = calculateStatistic( n11=>$n11,
 15 |                                       n1p=>$n1p,
 16 |                                       np1=>$np1,
 17 |                                       npp=>$npp);
 18 | 
 19 |   if( ($errorCode = getErrorCode()))
 20 |   {
 21 |     print STDERR $errorCode." - ".getErrorMessage()."\n";
 22 |   }
 23 |   else
 24 |   {
 25 |     print getStatisticName()." value for bigram is ".$phi_value."\n";
 26 |   }
 27 | 
 28 | =head1 DESCRIPTION
 29 | 
 30 | This function computes the square of the traditional formulation of
 31 | the Phi Coefficient.
 32 | 
 33 | Assume that the frequency count data associated with a bigram
 34 | <word1><word2> is stored in a 2x2 contingency table:
 35 | 
 36 |           word2   ~word2
 37 |   word1    n11      n12 | n1p
 38 |  ~word1    n21      n22 | n2p
 39 |            --------------
 40 |            np1      np2   npp
 41 | 
 42 | where n11 is the number of times <word1><word2> occur together, and
 43 | n12 is the number of times <word1> occurs with some word other than
 44 | word2, and n1p is the number of times in total that word1 occurs as
 45 | the first word in a bigram.
 46 | 
 47 |  PHI^2 = ((n11 * n22) - (n12 * n21))^2/(n1p * np1 * np2 * n2p)
 48 | 
 49 | Note that the value of PHI^2 is equivalent to
 50 | Pearson's Chi-Squared statistic divided by the sample size, that is:
 51 | 
 52 |  Chi-Squared = npp * PHI^2
 53 | 
 54 | We use PHI^2 rather than PHI since PHI^2 was employed for collocation
 55 | identification in:
 56 | 
 57 | Church, K. (1991) Concordances for Parallel Text, Seventh Annual
 58 | Conference of the UW Centre for the New OED and Text Research, Oxford,
 59 | England.
 60 | 
 61 | =over
 62 | 
 63 | =cut
 64 | 
 65 | 
 66 | package Text::NSP::Measures::2D::CHI::phi;
 67 | 
 68 | 
 69 | use Text::NSP::Measures::2D::CHI;
 70 | use strict;
 71 | use Carp;
 72 | use warnings;
 73 | no warnings 'redefine';
 74 | require Exporter;
 75 | 
 76 | our ($VERSION, @EXPORT, @ISA);
 77 | 
 78 | @ISA  = qw(Exporter);
 79 | 
 80 | @EXPORT = qw(initializeStatistic calculateStatistic
 81 |              getErrorCode getErrorMessage getStatisticName);
 82 | 
 83 | $VERSION = '0.97';
 84 | 
 85 | 
 86 | =item calculateStatistic() - method to calculate the Phi Coefficient
 87 | 
 88 | INPUT PARAMS  : %values             .. Hash (flat key => value list) of
 89 |                                        count values: n11, n1p, np1, npp.
 90 | 
 91 | RETURN VALUES : $phi                .. PHI^2 value for this bigram;
 92 |                                        nothing (undef) on error.
 93 | 
 94 | =cut
 95 | 
 96 | sub calculateStatistic
 97 | {
 98 |   my %values = @_;
 99 | 
100 |   # Load the observed (n..) and expected (m..) cell values; give up
101 |   # with an empty return when getValues() reports a failure.
102 |   Text::NSP::Measures::2D::CHI::getValues(\%values) or return;
103 | 
104 |   # Sum one computeVal() term per cell of the 2x2 table, in the order
105 |   # n11, n12, n21, n22.
106 |   my $cell_sum =
107 |       Text::NSP::Measures::2D::CHI::computeVal($n11, $m11)
108 |     + Text::NSP::Measures::2D::CHI::computeVal($n12, $m12)
109 |     + Text::NSP::Measures::2D::CHI::computeVal($n21, $m21)
110 |     + Text::NSP::Measures::2D::CHI::computeVal($n22, $m22);
111 | 
112 |   # Scale the sum by the sample size the caller passed in as npp.
113 |   my $sample_size = $values{npp};
114 | 
115 |   return $cell_sum/$sample_size;
116 | }
117 | 
118 | 
119 | 
120 | =item getStatisticName() - Reports the display name of this measure
121 | 
122 | INPUT PARAMS  : none
123 | 
124 | RETURN VALUES : $name      .. The string "Phi Coefficient".
125 | 
126 | =cut
127 | 
128 | sub getStatisticName {
129 |   my $name = "Phi Coefficient";
130 |   return $name;
131 | }
132 | 
133 | 
134 | 
135 | 1;
136 | __END__
137 | 
138 | 
139 | =back
140 | 
141 | =head1 AUTHOR
142 | 
143 | Ted Pedersen,                University of Minnesota Duluth
144 |                              E<lt>tpederse@d.umn.eduE<gt>
145 | 
146 | Satanjeev Banerjee,          Carnegie Mellon University
147 |                              E<lt>satanjeev@cmu.eduE<gt>
148 | 
149 | Amruta Purandare,            University of Pittsburgh
150 |                              E<lt>amruta@cs.pitt.eduE<gt>
151 | 
152 | Bridget Thomson-McInnes,     University of Minnesota Twin Cities
153 |                              E<lt>bthompson@d.umn.eduE<gt>
154 | 
155 | Saiyam Kohli,                University of Minnesota Duluth
156 |                              E<lt>kohli003@d.umn.eduE<gt>
157 | 
158 | =head1 HISTORY
159 | 
160 | Last updated: $Id: phi.pm,v 1.12 2006/06/21 11:10:52 saiyam_kohli Exp $
161 | 
162 | =head1 BUGS
163 | 
164 | 
165 | =head1 SEE ALSO
166 | 
167 |   @inproceedings{GaleC91,
168 |           author = {Gale, W. and Church, K.},
169 |           title = {A Program for Aligning Sentences in Bilingual Corpora},
170 |           booktitle = {Proceedings of the 29th Annual Meeting of the
171 |                       Association for Computational Linguistics},
172 |           address = {Berkeley, CA},
173 |           year = {1991}
174 |           url = L<http://www.cs.mu.oz.au/acl/J/J93/J93-1004.pdf>}
175 | 
176 | 
177 | L<http://groups.yahoo.com/group/ngram/>
178 | 
179 | L<http://www.d.umn.edu/~tpederse/nsp.html>
180 | 
181 | 
182 | =head1 COPYRIGHT
183 | 
184 | Copyright (C) 2000-2006, Ted Pedersen, Satanjeev Banerjee, Amruta
185 | Purandare, Bridget Thomson-McInnes and Saiyam Kohli
186 | 
187 | This program is free software; you can redistribute it and/or modify it
188 | under the terms of the GNU General Public License as published by the Free
189 | Software Foundation; either version 2 of the License, or (at your option)
190 | any later version.
191 | 
192 | This program is distributed in the hope that it will be useful, but
193 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
194 | or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
195 | for more details.
196 | 
197 | You should have received a copy of the GNU General Public License along
198 | with this program; if not, write to
199 | 
200 |     The Free Software Foundation, Inc.,
201 |     59 Temple Place - Suite 330,
202 |     Boston, MA  02111-1307, USA.
203 | 
204 | Note: a copy of the GNU General Public License is available on the web
205 | at L<http://www.gnu.org/licenses/gpl.txt> and is included in this
206 | distribution as GPL.txt.
207 | 
208 | =cut


--------------------------------------------------------------------------------
/geneSCF-master-v1.0/class/lib/Text/NSP/Measures/2D/MI/ps.pm:
--------------------------------------------------------------------------------
  1 | =head1 NAME
  2 | 
  3 | Text::NSP::Measures::2D::MI::ps - Perl module that implements Poisson-Stirling
  4 |                                   measure of association for bigrams.
  5 | 
  6 | =head1 SYNOPSIS
  7 | 
  8 | =head3 Basic Usage
  9 | 
 10 |   use Text::NSP::Measures::2D::MI::ps;
 11 | 
 12 |   my $npp = 60; my $n1p = 20; my $np1 = 20;  my $n11 = 10;
 13 | 
 14 |   $ps_value = calculateStatistic( n11=>$n11,
 15 |                                       n1p=>$n1p,
 16 |                                       np1=>$np1,
 17 |                                       npp=>$npp);
 18 | 
 19 |   if( ($errorCode = getErrorCode()))
 20 |   {
 21 |     print STDERR $errorCode." - ".getErrorMessage()."\n";
 22 |   }
 23 |   else
 24 |   {
 25 |     print getStatisticName()." value for bigram is ".$ps_value."\n";
 26 |   }
 27 | 
 28 | =head1 DESCRIPTION
 29 | 
 30 | The log-likelihood ratio measures the deviation between the observed data
 31 | and what would be expected if <word1> and <word2> were independent. The
 32 | higher the score, the less evidence there is in favor of concluding that
 33 | the words are independent.
 34 | 
 35 | Assume that the frequency count data associated with a bigram
 36 | <word1><word2> is stored in a 2x2 contingency table:
 37 | 
 38 |           word2   ~word2
 39 |   word1    n11      n12 | n1p
 40 |  ~word1    n21      n22 | n2p
 41 |            --------------
 42 |            np1      np2   npp
 43 | 
 44 | where n11 is the number of times <word1><word2> occur together, and
 45 | n12 is the number of times <word1> occurs with some word other than
 46 | word2, and n1p is the number of times in total that word1 occurs as
 47 | the first word in a bigram.
 48 | 
 49 | The expected values for the internal cells are calculated by taking the
 50 | product of their associated marginals and dividing by the sample size,
 51 | for example:
 52 | 
 53 |           np1 * n1p
 54 |    m11=   ---------
 55 |             npp
 56 | 
 57 | The Poisson Stirling measure is a negative logarithmic approximation
 58 | of the Poisson-likelihood measure. It uses Stirling's formula to
 59 | approximate the factorial in the Poisson-likelihood measure.
 60 | 
 61 | Poisson-Stirling = n11 * ( log(n11) - log(m11) - 1)
 62 | 
 63 | which is same as
 64 | 
 65 | Poisson-Stirling = n11 * ( log(n11/m11) - 1)
 66 | 
 67 | 
 68 | =head2 Methods
 69 | 
 70 | =over
 71 | 
 72 | =cut
 73 | 
 74 | 
 75 | package Text::NSP::Measures::2D::MI::ps;
 76 | 
 77 | 
 78 | use Text::NSP::Measures::2D::MI;
 79 | use strict;
 80 | use Carp;
 81 | use warnings;
 82 | no warnings 'redefine';
 83 | require Exporter;
 84 | 
 85 | our ($VERSION, @EXPORT, @ISA);
 86 | 
 87 | @ISA  = qw(Exporter);
 88 | 
 89 | @EXPORT = qw(initializeStatistic calculateStatistic
 90 |              getErrorCode getErrorMessage getStatisticName);
 91 | 
 92 | $VERSION = '0.97';
 93 | 
 94 | =item calculateStatistic() - This method calculates the ps value
 95 | 
 96 | INPUT PARAMS  : %values             .. Hash (flat key => value list) of
 97 |                                        count values: n11, n1p, np1, npp.
 98 | 
 99 | RETURN VALUES : $poissonStirling    .. Poisson-Stirling value for this
100 |                                        bigram; nothing (undef) on error.
101 | 
102 | =cut
103 | 
104 | sub calculateStatistic
105 | {
106 |   my %values = @_;
107 | 
108 |   # Load the observed and expected values for this bigram; give up with
109 |   # an empty return when getValues() reports a failure.
110 |   Text::NSP::Measures::2D::MI::getValues(\%values) or return;
111 | 
112 |   # Poisson-Stirling = n11 * ( log(n11/m11) - 1 )
113 |   #
114 |   # NOTE(review): computePMI() is presumed to supply the log(n11/m11)
115 |   # term, and to flag an error if n11/m11 is not positive - confirm in
116 |   # Text::NSP::Measures::2D::MI.
117 |   my $log_ratio = Text::NSP::Measures::2D::MI::computePMI($n11,$m11);
118 | 
119 |   my $poissonStirling = $n11 * ($log_ratio - 1);
120 | 
121 |   return $poissonStirling;
122 | }
123 | 
124 | 
125 | =item getStatisticName() - Reports the display name of this measure
126 | 
127 | INPUT PARAMS  : none
128 | 
129 | RETURN VALUES : $name      .. The string "Poisson-Stirling Measure".
130 | 
131 | =cut
132 | 
133 | sub getStatisticName {
134 |   my $name = "Poisson-Stirling Measure";
135 |   return $name;
136 | }
137 | 
138 | 
139 | 
140 | 1;
141 | __END__
142 | 
143 | 
144 | =back
145 | 
146 | =head1 AUTHOR
147 | 
148 | Ted Pedersen,                University of Minnesota Duluth
149 |                              E<lt>tpederse@d.umn.eduE<gt>
150 | 
151 | Satanjeev Banerjee,          Carnegie Mellon University
152 |                              E<lt>satanjeev@cmu.eduE<gt>
153 | 
154 | Amruta Purandare,            University of Pittsburgh
155 |                              E<lt>amruta@cs.pitt.eduE<gt>
156 | 
157 | Bridget Thomson-McInnes,     University of Minnesota Twin Cities
158 |                              E<lt>bthompson@d.umn.eduE<gt>
159 | 
160 | Saiyam Kohli,                University of Minnesota Duluth
161 |                              E<lt>kohli003@d.umn.eduE<gt>
162 | 
163 | =head1 HISTORY
164 | 
165 | Last updated: $Id: ps.pm,v 1.9 2008/03/26 17:20:28 tpederse Exp $
166 | 
167 | =head1 BUGS
168 | 
169 | 
170 | =head1 SEE ALSO
171 | 
172 | L<http://groups.yahoo.com/group/ngram/>
173 | 
174 | L<http://www.d.umn.edu/~tpederse/nsp.html>
175 | 
176 |   @article{SmadjaMH96,
177 |           author = {Quasthoff, Uwe and Wolff, Christian},
178 |           title = {The Poisson collocation measure and its application},
179 |           journal = {Workshop on Computational Approaches to Collocations},
180 |           year = {2002},
181 |           url = L<http://www.ofai.at/~brigitte.krenn/colloc02/PoissonCollocationMeasureQuasthoffWolff_final.pdf>}
182 | 
183 | =head1 COPYRIGHT
184 | 
185 | Copyright (C) 2000-2006, Ted Pedersen, Satanjeev Banerjee, Amruta
186 | Purandare, Bridget Thomson-McInnes and Saiyam Kohli
187 | 
188 | This program is free software; you can redistribute it and/or modify it
189 | under the terms of the GNU General Public License as published by the Free
190 | Software Foundation; either version 2 of the License, or (at your option)
191 | any later version.
192 | 
193 | This program is distributed in the hope that it will be useful, but
194 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
195 | or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
196 | for more details.
197 | 
198 | You should have received a copy of the GNU General Public License along
199 | with this program; if not, write to
200 | 
201 |     The Free Software Foundation, Inc.,
202 |     59 Temple Place - Suite 330,
203 |     Boston, MA  02111-1307, USA.
204 | 
205 | Note: a copy of the GNU General Public License is available on the web
206 | at L<http://www.gnu.org/licenses/gpl.txt> and is included in this
207 | distribution as GPL.txt.
208 | 
209 | =cut


--------------------------------------------------------------------------------
/geneSCF-master-v1.0/class/lib/Text/NSP/Measures/2D/Fisher/twotailed.pm:
--------------------------------------------------------------------------------
  1 | =head1 NAME
  2 | 
  3 | Text::NSP::Measures::2D::Fisher::twotailed - Perl module implementation of the two-sided
  4 |                                              Fisher's exact test.
  5 | 
  6 | =head1 SYNOPSIS
  7 | 
  8 | =head3 Basic Usage
  9 | 
 10 |   use Text::NSP::Measures::2D::Fisher::twotailed;
 11 | 
 12 |   my $npp = 60; my $n1p = 20; my $np1 = 20;  my $n11 = 10;
 13 | 
 14 |   $twotailed_value = calculateStatistic( n11=>$n11,
 15 |                                       n1p=>$n1p,
 16 |                                       np1=>$np1,
 17 |                                       npp=>$npp);
 18 | 
 19 |   if( ($errorCode = getErrorCode()))
 20 |   {
 21 |     print STDERR $errorCode." - ".getErrorMessage();
 22 |   }
 23 |   else
 24 |   {
 25 |     print getStatisticName."value for bigram is ".$twotailed_value;
 26 |   }
 27 | 
 28 | =head1 DESCRIPTION
 29 | 
 30 | Assume that the frequency count data associated with a bigram
 31 | <word1><word2> is stored in a 2x2 contingency table:
 32 | 
 33 |           word2   ~word2
 34 |   word1    n11      n12 | n1p
 35 |  ~word1    n21      n22 | n2p
 36 |            --------------
 37 |            np1      np2   npp
 38 | 
 39 | where n11 is the number of times <word1><word2> occur together, and
 40 | n12 is the number of times <word1> occurs with some word other than
 41 | word2, and n1p is the number of times in total that word1 occurs as
 42 | the first word in a bigram.
 43 | 
 44 | The Fisher's exact tests are calculated by fixing the marginal totals
 45 | and computing the hypergeometric probabilities for all the possible
 46 | contingency tables.
 47 | 
 48 | A twotailed Fisher's test is calculated by adding the probabilities of
 49 | all the contingency tables whose probability is less than or equal to
 50 | the probability of the observed table. The twotailed Fisher's test
 51 | tells us how likely it would be to observe a contingency table that is
 52 | no more probable than the current table.
 53 | 
 54 | =head2 Methods
 55 | 
 56 | =over
 57 | 
 58 | =cut
 59 | 
 60 | package Text::NSP::Measures::2D::Fisher::twotailed;
 61 | 
 62 | 
 63 | use Text::NSP::Measures::2D::Fisher;
 64 | use strict;
 65 | use Carp;
 66 | use warnings;
 67 | no warnings 'redefine';
 68 | require Exporter;
 69 | 
 70 | our ($VERSION, @EXPORT, @ISA);
 71 | 
 72 | @ISA  = qw(Exporter);
 73 | 
 74 | @EXPORT = qw(initializeStatistic calculateStatistic
 75 |              getErrorCode getErrorMessage getStatisticName);
 76 | 
 77 | $VERSION = '0.97';
 78 | 
 79 | 
 80 | =item calculateStatistic() - This method calculates the twotailed
 81 | Fisher value
 82 | 
 83 | INPUT PARAMS  : %values             .. Hash (flat key => value list) of
 84 |                                        count values: n11, n1p, np1, npp.
 85 | 
 86 | RETURN VALUES : $twotailed          .. Twotailed Fisher value; nothing
 87 |                                        (undef) on error.
 88 | 
 89 | =cut
 90 | 
 91 | sub calculateStatistic
 92 | {
 93 |   my %values = @_;
 94 | 
 95 |   # Load the observed and marginal values for this bigram; give up with
 96 |   # an empty return when getValues() reports a failure.
 97 |   Text::NSP::Measures::2D::Fisher::getValues(\%values) or return;
 98 | 
 99 |   # Second argument for computeDistribution(): the smaller of the two
100 |   # marginal totals n1p and np1.
101 |   my $final_limit = $np1;
102 |   $final_limit = $n1p if $n1p < $np1;
103 | 
104 |   # Keep a copy of the observed n11 before the distribution is computed.
105 |   # NOTE(review): presumably computeDistribution() overwrites $n11 -
106 |   # confirm in Text::NSP::Measures::2D::Fisher.
107 |   my $n11_org = $n11;
108 | 
109 |   # First argument for computeDistribution(): n1p + np1 - npp, raised
110 |   # to 0 when that difference is negative.
111 |   my $n11_start = $n1p + $np1 - $npp;
112 |   $n11_start = 0 if $n11_start < 0;
113 | 
114 |   my $probabilities =
115 |     Text::NSP::Measures::2D::Fisher::computeDistribution($n11_start,
116 |                                                          $final_limit)
117 |     or return;
118 | 
119 |   # The values of %$probabilities are passed through exp() before being
120 |   # added, i.e. they are treated as log-probabilities. Visit them in
121 |   # ascending order and add every one that does not exceed the entry
122 |   # stored for the observed n11.
123 |   my $observed = $probabilities->{$n11_org};
124 |   my $ttfisher = 0;
125 | 
126 |   foreach my $log_prob (sort { $a <=> $b } values %$probabilities)
127 |   {
128 |     next if $log_prob > $observed;
129 |     $ttfisher += exp($log_prob);
130 |   }
131 | 
132 |   return $ttfisher;
133 | }
134 | 
135 | 
136 | =item getStatisticName() - Reports the display name of this measure
137 | 
138 | INPUT PARAMS  : none
139 | 
140 | RETURN VALUES : $name      .. The string "Two Tailed Fisher".
141 | 
142 | =cut
143 | 
144 | sub getStatisticName {
145 |   my $name = "Two Tailed Fisher";
146 |   return $name;
147 | }
148 | 
149 | 
150 | 
151 | 1;
152 | __END__
153 | 
154 | =back
155 | 
156 | =head1 AUTHOR
157 | 
158 | Ted Pedersen,                University of Minnesota Duluth
159 |                              E<lt>tpederse@d.umn.eduE<gt>
160 | 
161 | Satanjeev Banerjee,          Carnegie Mellon University
162 |                              E<lt>satanjeev@cmu.eduE<gt>
163 | 
164 | Amruta Purandare,            University of Pittsburgh
165 |                              E<lt>amruta@cs.pitt.eduE<gt>
166 | 
167 | Bridget Thomson-McInnes,     University of Minnesota Twin Cities
168 |                              E<lt>bthompson@d.umn.eduE<gt>
169 | 
170 | Saiyam Kohli,                University of Minnesota Duluth
171 |                              E<lt>kohli003@d.umn.eduE<gt>
172 | 
173 | =head1 HISTORY
174 | 
175 | Last updated: $Id: twotailed.pm,v 1.13 2008/03/26 17:21:19 tpederse Exp $
176 | 
177 | =head1 BUGS
178 | 
179 | 
180 | =head1 SEE ALSO
181 | 
182 |   @inproceedings{Pedersen96,
183 |           author = {Pedersen, T.},
184 |           title = {Fishing For Exactness},
185 |           booktitle = {Proceedings of the South Central SAS User's
186 |                       Group (SCSUG-96) Conference},
187 |           year = {1996},
188 |           pages = {188--200},
189 |           month ={October},
190 |           address = {Austin, TX}
191 |           url = L<http://www.d.umn.edu/~tpederse/pubs.html>}
192 | 
193 | L<http://groups.yahoo.com/group/ngram/>
194 | 
195 | L<http://www.d.umn.edu/~tpederse/nsp.html>
196 | 
197 | 
198 | =head1 COPYRIGHT
199 | 
200 | Copyright (C) 2000-2006, Ted Pedersen, Satanjeev Banerjee, Amruta
201 | Purandare, Bridget Thomson-McInnes and Saiyam Kohli
202 | 
203 | This program is free software; you can redistribute it and/or modify it
204 | under the terms of the GNU General Public License as published by the Free
205 | Software Foundation; either version 2 of the License, or (at your option)
206 | any later version.
207 | 
208 | This program is distributed in the hope that it will be useful, but
209 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
210 | or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
211 | for more details.
212 | 
213 | You should have received a copy of the GNU General Public License along
214 | with this program; if not, write to
215 | 
216 |     The Free Software Foundation, Inc.,
217 |     59 Temple Place - Suite 330,
218 |     Boston, MA  02111-1307, USA.
219 | 
220 | Note: a copy of the GNU General Public License is available on the web
221 | at L<http://www.gnu.org/licenses/gpl.txt> and is included in this
222 | distribution as GPL.txt.
223 | 
224 | =cut


--------------------------------------------------------------------------------
/geneSCF-master-v1.0/class/lib/Text/NSP/Measures/2D/MI/tmi.pm:
--------------------------------------------------------------------------------
  1 | =head1 NAME
  2 | 
  3 | Text::NSP::Measures::2D::MI::tmi - Perl module that implements True Mutual
  4 |                                    Information.
  5 | 
  6 | =head1 SYNOPSIS
  7 | 
  8 | =head3 Basic Usage
  9 | 
 10 |   use Text::NSP::Measures::2D::MI::tmi;
 11 | 
 12 |   my $npp = 60; my $n1p = 20; my $np1 = 20;  my $n11 = 10;
 13 | 
 14 |   $tmi_value = calculateStatistic( n11=>$n11,
 15 |                                       n1p=>$n1p,
 16 |                                       np1=>$np1,
 17 |                                       npp=>$npp);
 18 | 
 19 |   if( ($errorCode = getErrorCode()))
 20 |   {
 21 |     print STDERR $errorCode." - ".getErrorMessage()."\n";
 22 |   }
 23 |   else
 24 |   {
 25 |     print getStatisticName()." value for bigram is ".$tmi_value."\n";
 26 |   }
 27 | 
 28 | =head1 DESCRIPTION
 29 | 
 30 | Assume that the frequency count data associated with a bigram
 31 | <word1><word2> is stored in a 2x2 contingency table:
 32 | 
 33 |           word2   ~word2
 34 |   word1    n11      n12 | n1p
 35 |  ~word1    n21      n22 | n2p
 36 |            --------------
 37 |            np1      np2   npp
 38 | 
 39 | where n11 is the number of times <word1><word2> occur together, and
 40 | n12 is the number of times <word1> occurs with some word other than
 41 | word2, and n1p is the number of times in total that word1 occurs as
 42 | the first word in a bigram.
 43 | 
 44 | The expected values for the internal cells are calculated by taking the
 45 | product of their associated marginals and dividing by the sample size,
 46 | for example:
 47 | 
 48 |           np1 * n1p
 49 |    m11=   ---------
 50 |             npp
 51 | 
 52 | True Mutual Information (tmi) is defined as the weighted average of the
 53 | Pointwise mutual informations for all the observed and expected value pairs.
 54 | 
 55 |  tmi = [n11/npp * log(n11/m11) + n12/npp * log(n12/m12) +
 56 |         n21/npp * log(n21/m21) + n22/npp * log(n22/m22)]
 57 | 
 58 | 
 59 |  PMI =   log (n11/m11)
 60 | 
 61 | =head2 Methods
 62 | 
 63 | =over
 64 | 
 65 | =cut
 66 | 
 67 | package Text::NSP::Measures::2D::MI::tmi;
 68 | 
 69 | 
 70 | use Text::NSP::Measures::2D::MI;
 71 | use strict;
 72 | use Carp;
 73 | use warnings;
 74 | no warnings 'redefine';
 75 | require Exporter;
 76 | 
 77 | our ($VERSION, @EXPORT, @ISA);
 78 | 
 79 | @ISA  = qw(Exporter);
 80 | 
 81 | @EXPORT = qw(initializeStatistic calculateStatistic
 82 |              getErrorCode getErrorMessage getStatisticName);
 83 | 
 84 | $VERSION = '0.97';
 85 | 
 86 | 
 87 | =item calculateStatistic() - This method calculates the tmi value
 88 | 
 89 | INPUT PARAMS  : %values             .. Hash (flat key => value list) of
 90 |                                        count values: n11, n1p, np1, npp.
 91 | 
 92 | RETURN VALUES : $tmi                .. TMI value for this bigram;
 93 |                                        nothing (undef) on error.
 94 | 
 95 | =cut
 96 | 
 97 | sub calculateStatistic
 98 | {
 99 |   my %values = @_;
100 | 
101 |   # Load the observed and expected values for this bigram. On failure
102 |   # return nothing (undef in scalar context) rather than 0: 0 is a
103 |   # legitimate TMI value, and the other measures in this distribution
104 |   # (phi, ps, twotailed) already signal failure with a bare return.
105 |   if( !(Text::NSP::Measures::2D::MI::getValues(\%values)) ) {
106 |     return;
107 |   }
108 | 
109 |   # Weighted sum over the four cells of the 2x2 table. Each term is
110 |   # divided by log 2, which expresses it in base-2 logarithm units.
111 |   # NOTE(review): computePMI() is presumed to return log(n/m) and to
112 |   # flag an error if n/m is not positive - confirm in ...::2D::MI.
113 |   my $tmi = 0;
114 |   $tmi += $n11/$npp * Text::NSP::Measures::2D::MI::computePMI( $n11, $m11 )/ log 2;
115 |   $tmi += $n12/$npp * Text::NSP::Measures::2D::MI::computePMI( $n12, $m12 )/ log 2;
116 |   $tmi += $n21/$npp * Text::NSP::Measures::2D::MI::computePMI( $n21, $m21 )/ log 2;
117 |   $tmi += $n22/$npp * Text::NSP::Measures::2D::MI::computePMI( $n22, $m22 )/ log 2;
118 | 
119 |   return ($tmi);
120 | }
121 | 
122 | 
123 | =item getStatisticName() - Reports the display name of this measure
124 | 
125 | INPUT PARAMS  : none
126 | 
127 | RETURN VALUES : $name      .. The string "True Mutual Information".
128 | 
129 | =cut
130 | 
131 | sub getStatisticName {
132 |   my $name = "True Mutual Information";
133 |   return $name;
134 | }
135 | 
136 | 
137 | 
138 | 1;
139 | __END__
140 | 
141 | 
142 | =back
143 | 
144 | =head1 AUTHOR
145 | 
146 | Ted Pedersen,                University of Minnesota Duluth
147 |                              E<lt>tpederse@d.umn.eduE<gt>
148 | 
149 | Satanjeev Banerjee,          Carnegie Mellon University
150 |                              E<lt>satanjeev@cmu.eduE<gt>
151 | 
152 | Amruta Purandare,            University of Pittsburgh
153 |                              E<lt>amruta@cs.pitt.eduE<gt>
154 | 
155 | Bridget Thomson-McInnes,     University of Minnesota Twin Cities
156 |                              E<lt>bthompson@d.umn.eduE<gt>
157 | 
158 | Saiyam Kohli,                University of Minnesota Duluth
159 |                              E<lt>kohli003@d.umn.eduE<gt>
160 | 
161 | =head1 HISTORY
162 | 
163 | Last updated: $Id: tmi.pm,v 1.23 2008/03/26 17:20:28 tpederse Exp $
164 | 
165 | =head1 BUGS
166 | 
167 | 
168 | =head1 SEE ALSO
169 | 
170 |   @inproceedings{moore:2004:EMNLP,
171 |                 author    = {Moore, Robert C.},
172 |                 title     = {On Log-Likelihood-Ratios and the Significance of Rare
173 |                              Events },
174 |                 booktitle = {Proceedings of EMNLP 2004},
175 |                 editor = {Dekang Lin and Dekai Wu},
176 |                 year      = 2004,
177 |                 month     = {July},
178 |                 address   = {Barcelona, Spain},
179 |                 publisher = {Association for Computational Linguistics},
180 |                 pages     = {333--340}
181 |                 url = L<http://acl.ldc.upenn.edu/acl2004/emnlp/pdf/Moore.pdf>}
182 | 
183 | L<http://groups.yahoo.com/group/ngram/>
184 | 
185 | L<http://www.d.umn.edu/~tpederse/nsp.html>
186 | 
187 | 
188 | =head1 COPYRIGHT
189 | 
190 | Copyright (C) 2000-2006, Ted Pedersen, Satanjeev Banerjee, Amruta
191 | Purandare, Bridget Thomson-McInnes and Saiyam Kohli
192 | 
193 | This program is free software; you can redistribute it and/or modify it
194 | under the terms of the GNU General Public License as published by the Free
195 | Software Foundation; either version 2 of the License, or (at your option)
196 | any later version.
197 | 
198 | This program is distributed in the hope that it will be useful, but
199 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
200 | or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
201 | for more details.
202 | 
203 | You should have received a copy of the GNU General Public License along
204 | with this program; if not, write to
205 | 
206 |     The Free Software Foundation, Inc.,
207 |     59 Temple Place - Suite 330,
208 |     Boston, MA  02111-1307, USA.
209 | 
210 | Note: a copy of the GNU General Public License is available on the web
211 | at L<http://www.gnu.org/licenses/gpl.txt> and is included in this
212 | distribution as GPL.txt.
213 | 
214 | =cut


--------------------------------------------------------------------------------
/geneSCF-master-v1.0/class/lib/Text/NSP/Measures/2D/Fisher/left.pm:
--------------------------------------------------------------------------------
  1 | =head1 NAME
  2 | 
  3 | Text::NSP::Measures::2D::Fisher::left - Perl module implementation of the left sided
  4 |                                         Fisher's exact test.
  5 | 
  6 | =head1 SYNOPSIS
  7 | 
  8 | =head3 Basic Usage
  9 | 
 10 |   use Text::NSP::Measures::2D::Fisher::left;
 11 | 
 12 |   my $npp = 60; my $n1p = 20; my $np1 = 20;  my $n11 = 10;
 13 | 
 14 |   $left_value = calculateStatistic( n11=>$n11,
 15 |                                       n1p=>$n1p,
 16 |                                       np1=>$np1,
 17 |                                       npp=>$npp);
 18 | 
 19 |   if( ($errorCode = getErrorCode()))
 20 |   {
 21 |     print STDERR $errorCode." - ".getErrorMessage();
 22 |   }
 23 |   else
 24 |   {
 25 |     print getStatisticName."value for bigram is ".$left_value;
 26 |   }
 27 | 
 28 | 
 29 | =head1 DESCRIPTION
 30 | 
 31 | Assume that the frequency count data associated with a bigram
 32 | <word1><word2> is stored in a 2x2 contingency table:
 33 | 
 34 |           word2   ~word2
 35 |   word1    n11      n12 | n1p
 36 |  ~word1    n21      n22 | n2p
 37 |            --------------
 38 |            np1      np2   npp
 39 | 
 40 | where n11 is the number of times <word1><word2> occur together, and
 41 | n12 is the number of times <word1> occurs with some word other than
 42 | word2, and n1p is the number of times in total that word1 occurs as
 43 | the first word in a bigram.
 44 | 
 45 | The fishers exact tests are calculated by fixing the marginal totals
 46 | and computing the hypergeometric probabilities for all the possible
 47 | contingency tables,
 48 | 
 49 | A left sided test is calculated by adding the probabilities of all
 50 | the possible two by two contingency tables formed by fixing the
 51 | marginal totals and changing the value of n11 to less than or equal to
 52 | the given value. A left sided Fisher's Exact Test tells us how likely it
 53 | is to randomly sample a table where n11 is no greater than observed. In
 54 | other words, it tells us how likely it is to sample an observation where
 55 | the two words are no more dependent than currently observed.
 56 | 
 57 | =head2 Methods
 58 | 
 59 | =over
 60 | 
 61 | =cut
 62 | 
 63 | 
 64 | package Text::NSP::Measures::2D::Fisher::left;
 65 | 
 66 | 
 67 | use Text::NSP::Measures::2D::Fisher;
 68 | use strict;
 69 | use Carp;
 70 | use warnings;
 71 | no warnings 'redefine';
 72 | require Exporter;
 73 | 
 74 | our ($VERSION, @EXPORT, @ISA);
 75 | 
 76 | @ISA  = qw(Exporter);
 77 | 
 78 | @EXPORT = qw(initializeStatistic calculateStatistic
 79 |              getErrorCode getErrorMessage getStatisticName);
 80 | 
 81 | $VERSION = '0.97';
 82 | 
 83 | 
 84 | =item calculateStatistic() - This method computes the left sided Fishers
 85 |                              exact test.
 86 | 
 87 | INPUT PARAMS  : $count_values       .. Reference of an array containing
 88 |                                        the count values computed by the
 89 |                                        count.pl program.
 90 | 
 91 | RETURN VALUES : $left               .. Left Fisher value.
 92 | 
 93 | =cut
 94 | 
 95 | sub calculateStatistic
 96 | {
 97 |   # The counts arrive as a flat list of name => value pairs
 98 |   # (n11, n1p, np1, npp in the module synopsis).
 99 |   my %counts = @_;
100 | 
101 |   # Let the Fisher base module check the counts. A false result means
102 |   # the values were missing or inconsistent, so nothing is returned.
103 |   # On success $n11, $n1p, $np1 and $npp are presumably the package
104 |   # variables set up by getValues() -- confirm in Fisher.pm.
105 |   Text::NSP::Measures::2D::Fisher::getValues(\%counts)
106 |     or return;
107 | 
108 |   # The observed n11 is the upper end of the left tail.
109 |   my $upper = $n11;
110 | 
111 |   # Smallest n11 the fixed marginals allow, floored at zero.
112 |   my $lower = $n1p + $np1 - $npp;
113 |   $lower = 0 if $lower < 0;
114 | 
115 |   # Table probabilities keyed by n11; a false result aborts the call.
116 |   my $dist =
117 |     Text::NSP::Measures::2D::Fisher::computeDistribution($lower, $upper)
118 |     or return;
119 | 
120 |   # Add up the tail in increasing n11 order. Each stored value is passed
121 |   # through exp() before summing, i.e. it is treated as a logarithm.
122 |   my $left_tail = 0;
123 | 
124 |   for my $k (sort { $a <=> $b } keys %$dist)
125 |   {
126 |     # Keys are sorted, so the first one beyond the observed n11 ends
127 |     # the tail.
128 |     last if $k > $upper;
129 | 
130 |     $left_tail += exp($dist->{$k});
131 |   }
132 | 
133 |   # Probability of a table with n11 no larger than the observed one.
134 |   return $left_tail;
135 | }
136 | 
137 | 
138 | =item getStatisticName()
139 | 
140 | Returns the name of this statistic
141 | 
142 | INPUT PARAMS  : none
143 | 
144 | RETURN VALUES : $name      .. Name of the measure.
145 | 
146 | =cut
147 | 
148 | # Human-readable label under which this measure is reported.
149 | sub getStatisticName {
150 |     return "Left Fisher";
151 | }
152 | 
153 | 
154 | 
155 | 1;
156 | __END__
157 | 
158 | =back
159 | 
160 | =head1 AUTHOR
161 | 
162 | Ted Pedersen,                University of Minnesota Duluth
163 |                              E<lt>tpederse@d.umn.eduE<gt>
164 | 
165 | Satanjeev Banerjee,          Carnegie Mellon University
166 |                              E<lt>satanjeev@cmu.eduE<gt>
167 | 
168 | Amruta Purandare,            University of Pittsburgh
169 |                              E<lt>amruta@cs.pitt.eduE<gt>
170 | 
171 | Bridget Thomson-McInnes,     University of Minnesota Twin Cities
172 |                              E<lt>bthompson@d.umn.eduE<gt>
173 | 
174 | Saiyam Kohli,                University of Minnesota Duluth
175 |                              E<lt>kohli003@d.umn.eduE<gt>
176 | 
177 | =head1 HISTORY
178 | 
179 | Last updated: $Id: left.pm,v 1.12 2006/06/21 11:10:52 saiyam_kohli Exp $
180 | 
181 | =head1 BUGS
182 | 
183 | 
184 | =head1 SEE ALSO
185 | 
186 |   @inproceedings{Pedersen96,
187 |           author = {Pedersen, T.},
188 |           title = {Fishing For Exactness},
189 |           booktitle = {Proceedings of the South Central SAS User's
190 |                       Group (SCSUG-96) Conference},
191 |           year = {1996},
192 |           pages = {188--200},
193 |           month ={October},
194 |           address = {Austin, TX}
195 |           url = L<http://www.d.umn.edu/~tpederse/pubs.html>}
196 | 
197 | L<http://groups.yahoo.com/group/ngram/>
198 | 
199 | L<http://www.d.umn.edu/~tpederse/nsp.html>
200 | 
201 | 
202 | =head1 COPYRIGHT
203 | 
204 | Copyright (C) 2000-2006, Ted Pedersen, Satanjeev Banerjee, Amruta
205 | Purandare, Bridget Thomson-McInnes and Saiyam Kohli
206 | 
207 | This program is free software; you can redistribute it and/or modify it
208 | under the terms of the GNU General Public License as published by the Free
209 | Software Foundation; either version 2 of the License, or (at your option)
210 | any later version.
211 | 
212 | This program is distributed in the hope that it will be useful, but
213 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
214 | or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
215 | for more details.
216 | 
217 | You should have received a copy of the GNU General Public License along
218 | with this program; if not, write to
219 | 
220 |     The Free Software Foundation, Inc.,
221 |     59 Temple Place - Suite 330,
222 |     Boston, MA  02111-1307, USA.
223 | 
224 | Note: a copy of the GNU General Public License is available on the web
225 | at L<http://www.gnu.org/licenses/gpl.txt> and is included in this
226 | distribution as GPL.txt.
227 | 
228 | =cut


--------------------------------------------------------------------------------
/geneSCF-master-v1.0/class/lib/Text/NSP/Measures/2D/Fisher2/twotailed.pm:
--------------------------------------------------------------------------------
  1 | =head1 NAME
  2 | 
  3 | Text::NSP::Measures::2D::Fisher2::twotailed - Perl module implementation of the two-sided
  4 |                                              Fisher's exact test (Deprecated).
  5 | 
  6 | =head1 SYNOPSIS
  7 | 
  8 | =head3 Basic Usage
  9 | 
 10 |   use Text::NSP::Measures::2D::Fisher2::twotailed;
 11 | 
 12 |   my $npp = 60; my $n1p = 20; my $np1 = 20;  my $n11 = 10;
 13 | 
 14 |   $twotailed_value = calculateStatistic( n11=>$n11,
 15 |                                       n1p=>$n1p,
 16 |                                       np1=>$np1,
 17 |                                       npp=>$npp);
 18 | 
 19 |   if( ($errorCode = getErrorCode()))
 20 |   {
 21 |     print STDERR $errorCode." - ".getErrorMessage();
 22 |   }
 23 |   else
 24 |   {
 25 |     print getStatisticName."value for bigram is ".$twotailed_value;
 26 |   }
 27 | 
 28 | 
 29 | =head1 DESCRIPTION
 30 | 
 31 | This module provides a naive implementation of the fishers twotailed
 32 | exact tests. That is the implementation does not have any
 33 | optimizations for performance. This will compute the factorials and
 34 | the hypergeometric measures using direct multiplications.
 35 | 
 36 | This measure should be used if you need exact values without any
 37 | rounding errors, and you are not worried about the performance of
 38 | the measure, otherwise use the implementations under the
 39 | Text::NSP::Measures::2D::Fisher module. To use this implementation,
 40 | you will have to specify the entire module name. Usage:
 41 | 
 42 | statistic.pl Text::NSP::Measures::Fisher2::twotailed dest.txt source.cnt
 43 | 
 44 | Assume that the frequency count data associated with a bigram
 45 | <word1><word2> is stored in a 2x2 contingency table:
 46 | 
 47 |           word2   ~word2
 48 |   word1    n11      n12 | n1p
 49 |  ~word1    n21      n22 | n2p
 50 |            --------------
 51 |            np1      np2   npp
 52 | 
 53 | where n11 is the number of times <word1><word2> occur together, and
 54 | n12 is the number of times <word1> occurs with some word other than
 55 | word2, and n1p is the number of times in total that word1 occurs as
 56 | the first word in a bigram.
 57 | 
 58 | The fishers exact tests are calculated by fixing the marginal totals
 59 | and computing the hypergeometric probabilities for all the possible
 60 | contingency tables,
 61 | 
 62 | A twotailed fishers test is calculated by adding the probabilities of
 63 | all the contingency tables with probabilities less than or equal to the
 64 | probability of the observed table. The twotailed fishers test tells us
 65 | how likely it would be to observe a contingency table which is no more
 66 | probable than the current table.
 67 | 
 68 | =head2 Methods
 69 | 
 70 | =over
 71 | 
 72 | =cut
 73 | 
 74 | package Text::NSP::Measures::2D::Fisher2::twotailed;
 75 | 
 76 | 
 77 | use Text::NSP::Measures::2D::Fisher2;
 78 | use strict;
 79 | use Carp;
 80 | use warnings;
 81 | no warnings 'redefine';
 82 | require Exporter;
 83 | 
 84 | our ($VERSION, @EXPORT, @ISA);
 85 | 
 86 | @ISA  = qw(Exporter);
 87 | 
 88 | @EXPORT = qw(initializeStatistic calculateStatistic
 89 |              getErrorCode getErrorMessage getStatisticName);
 90 | 
 91 | $VERSION = '0.97';
 92 | 
 93 | 
 94 | =item calculateStatistic() - This method computes the two-tailed Fishers
 95 |                              exact test.
 96 | 
 97 | INPUT PARAMS  : $count_values       .. Reference of an array containing
 98 |                                        the count values computed by the
 99 |                                        count.pl program.
100 | 
101 | RETURN VALUES : $twotailed          .. Twotailed Fisher value.
102 | 
103 | =cut
104 | 
105 | sub calculateStatistic
106 | {
107 |   # The counts arrive as a flat list of name => value pairs
108 |   # (n11, n1p, np1, npp in the module synopsis).
109 |   my %counts = @_;
110 | 
111 |   # Let the Fisher2 base module check the counts. A false result means
112 |   # the values were missing or inconsistent, so nothing is returned.
113 |   # On success $n11, $n1p, $np1 and $npp are presumably the package
114 |   # variables set up by getValues() -- confirm in Fisher2.pm.
115 |   Text::NSP::Measures::2D::Fisher2::getValues(\%counts)
116 |     or return;
117 | 
118 |   # n11 can never exceed the smaller of the two marginal totals.
119 |   my $upper = ($np1 > $n1p) ? $n1p : $np1;
120 | 
121 |   # Smallest n11 the fixed marginals allow, floored at zero.
122 |   my $lower = $n1p + $np1 - $npp;
123 |   $lower = 0 if $lower < 0;
124 | 
125 |   # Remember which table was actually observed.
126 |   my $observed_n11 = $n11;
127 | 
128 |   # Table probabilities keyed by n11; a false result aborts the call.
129 |   my $dist =
130 |     Text::NSP::Measures::2D::Fisher2::computeDistribution($lower, $upper)
131 |     or return;
132 | 
133 |   my $two_tailed = 0;
134 | 
135 |   # Visit the probabilities in ascending order so the smallest are
136 |   # added first.
137 |   for my $p (sort { $a <=> $b } values %$dist)
138 |   {
139 |     # Only tables no more probable than the observed one count; the
140 |     # observed table's own probability is therefore included.
141 |     next if $p > $dist->{$observed_n11};
142 | 
143 |     $two_tailed += $p;
144 |   }
145 | 
146 |   # Total probability of tables at most as likely as the observed one.
147 |   return $two_tailed;
148 | }
149 | 
150 | 
151 | =item getStatisticName()
152 | 
153 | Returns the name of this statistic
154 | 
155 | INPUT PARAMS  : none
156 | 
157 | RETURN VALUES : $name      .. Name of the measure.
158 | 
159 | =cut
160 | 
161 | # Human-readable label under which this measure is reported.
162 | sub getStatisticName {
163 |     return "Two Tailed Fisher";
164 | }
165 | 
166 | 
167 | 
168 | 1;
169 | __END__
170 | 
171 | =back
172 | 
173 | =head1 AUTHOR
174 | 
175 | Ted Pedersen,                University of Minnesota Duluth
176 |                              E<lt>tpederse@d.umn.eduE<gt>
177 | 
178 | Satanjeev Banerjee,          Carnegie Mellon University
179 |                              E<lt>satanjeev@cmu.eduE<gt>
180 | 
181 | Amruta Purandare,            University of Pittsburgh
182 |                              E<lt>amruta@cs.pitt.eduE<gt>
183 | 
184 | Bridget Thomson-McInnes,     University of Minnesota Twin Cities
185 |                              E<lt>bthompson@d.umn.eduE<gt>
186 | 
187 | Saiyam Kohli,                University of Minnesota Duluth
188 |                              E<lt>kohli003@d.umn.eduE<gt>
189 | 
190 | =head1 HISTORY
191 | 
192 | Last updated: $Id: twotailed.pm,v 1.10 2008/03/26 17:24:15 tpederse Exp $
193 | 
194 | =head1 BUGS
195 | 
196 | 
197 | =head1 SEE ALSO
198 | 
199 | L<http://groups.yahoo.com/group/ngram/>
200 | 
201 | L<http://www.d.umn.edu/~tpederse/nsp.html>
202 | 
203 | 
204 | =head1 COPYRIGHT
205 | 
206 | Copyright (C) 2000-2006, Ted Pedersen, Satanjeev Banerjee, Amruta
207 | Purandare, Bridget Thomson-McInnes and Saiyam Kohli
208 | 
209 | This program is free software; you can redistribute it and/or modify it
210 | under the terms of the GNU General Public License as published by the Free
211 | Software Foundation; either version 2 of the License, or (at your option)
212 | any later version.
213 | 
214 | This program is distributed in the hope that it will be useful, but
215 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
216 | or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
217 | for more details.
218 | 
219 | You should have received a copy of the GNU General Public License along
220 | with this program; if not, write to
221 | 
222 |     The Free Software Foundation, Inc.,
223 |     59 Temple Place - Suite 330,
224 |     Boston, MA  02111-1307, USA.
225 | 
226 | Note: a copy of the GNU General Public License is available on the web
227 | at L<http://www.gnu.org/licenses/gpl.txt> and is included in this
228 | distribution as GPL.txt.
229 | 
230 | =cut
231 | 


--------------------------------------------------------------------------------
/geneSCF-master-v1.0/class/lib/Text/NSP/Measures/3D/MI/tmi.pm:
--------------------------------------------------------------------------------
  1 | =head1 NAME
  2 | 
  3 | Text::NSP::Measures::3D::MI::tmi - Perl implementation for True Mutual
  4 |                                    Information for trigrams.
  5 | 
  6 | =head1 SYNOPSIS
  7 | 
  8 | =head3 Basic Usage
  9 | 
 10 |   use Text::NSP::Measures::3D::MI::tmi;
 11 | 
 12 |   $tmi_value = calculateStatistic( n111=>10,
 13 |                                   n1pp=>40,
 14 |                                   np1p=>45,
 15 |                                   npp1=>42,
 16 |                                   n11p=>20,
 17 |                                   n1p1=>23,
 18 |                                   np11=>21,
 19 |                                   nppp=>100);
 20 | 
 21 |   if( ($errorCode = getErrorCode()))
 22 |   {
 23 |     print STDERR $errorCode." - ".getErrorMessage()."\n";
 24 |   }
 25 |   else
 26 |   {
 27 |     print getStatisticName." value for trigram is ".$tmi_value."\n";
 28 |   }
 29 | 
 30 | =head1 DESCRIPTION
 31 | 
 32 | True Mutual Information (tmi) is defined as the weighted average of the
 33 | pointwise mutual informations for all the observed and expected value pairs.
 34 | 
 35 |  tmi = [n111/nppp * log(n111/m111) + n112/nppp * log(n112/m112) +
 36 |         n121/nppp * log(n121/m121) + n122/nppp * log(n122/m122) +
 37 |         n211/nppp * log(n211/m211) + n212/nppp * log(n212/m212) +
 38 |         n221/nppp * log(n221/m221) + n222/nppp * log(n222/m222)]
 39 | 
 40 |  PMI =   log (n111/m111)
 41 | 
 42 | Here n111 represents the observed value for the cell (1,1,1) and m111
 43 | represents the expected value for that cell. The expected values for
 44 | the internal cells are calculated by taking the product of their
 45 | associated marginals and dividing by the sample size, for example:
 46 | 
 47 |             n1pp * np1p * npp1
 48 |    m111=   --------------------
 49 |                    nppp
 50 | 
 51 | =head2 Methods
 52 | 
 53 | =over
 54 | 
 55 | =cut
 56 | 
 57 | 
 58 | package Text::NSP::Measures::3D::MI::tmi;
 59 | 
 60 | 
 61 | use Text::NSP::Measures::3D::MI;
 62 | use strict;
 63 | use Carp;
 64 | use warnings;
 65 | no warnings 'redefine';
 66 | require Exporter;
 67 | 
 68 | our ($VERSION, @EXPORT, @ISA);
 69 | 
 70 | @ISA  = qw(Exporter);
 71 | 
 72 | @EXPORT = qw(initializeStatistic calculateStatistic
 73 |              getErrorCode getErrorMessage getStatisticName);
 74 | 
 75 | $VERSION = '0.97';
 76 | 
 77 | 
 78 | =item calculateStatistic($count_values) - This method calculates
 79 | the tmi value
 80 | 
 81 | INPUT PARAMS  : $count_values   .. Reference of an hash containing
 82 |                                    the count values computed by the
 83 |                                    count.pl program.
 84 | 
 85 | RETURN VALUES : $tmi            .. TMI value for this trigram.
 86 | 
 87 | =cut
 88 | 
 89 | sub calculateStatistic
 90 | {
 91 |   my %counts = @_;
 92 | 
 93 |   # The MI base module checks the counts; on a false result this
 94 |   # measure reports 0. The $nXYZ / $mXYZ cell values and $nppp used
 95 |   # below are presumably set by getValues() -- confirm in MI.pm.
 96 |   Text::NSP::Measures::3D::MI::getValues(\%counts)
 97 |     or return 0;
 98 | 
 99 |   # Dividing each term by log 2 rescales it to base 2 (assuming
100 |   # computePMI() works with natural logs -- confirm in MI.pm).
101 |   my $ln2 = log 2;
102 | 
103 |   # Observed/expected pairs for all eight cells, in the original order.
104 |   my @cells = ( [$n111, $m111], [$n112, $m112],
105 |                 [$n121, $m121], [$n122, $m122],
106 |                 [$n211, $m211], [$n212, $m212],
107 |                 [$n221, $m221], [$n222, $m222] );
108 | 
109 |   my $tmi = 0;
110 |   for my $cell (@cells) {
111 |     my ($observed, $expected) = @$cell;
112 |     $tmi += $observed/$nppp * Text::NSP::Measures::3D::MI::computePMI($observed, $expected) / $ln2;
113 |   }
114 | 
115 |   return $tmi;
116 | }
117 | 
118 | 
119 | =item getStatisticName() - Returns the name of this statistic
120 | 
121 | INPUT PARAMS  : none
122 | 
123 | RETURN VALUES : $name      .. Name of the measure.
124 | 
125 | =cut
126 | 
127 | # Human-readable label under which this measure is reported.
128 | sub getStatisticName {
129 |     return "True Mutual Information";
130 | }
131 | 
132 | 
133 | 
134 | 1;
135 | __END__
136 | 
137 | =back
138 | 
139 | =head1 AUTHOR
140 | 
141 | Ted Pedersen,                University of Minnesota Duluth
142 |                              E<lt>tpederse@d.umn.eduE<gt>
143 | 
144 | Satanjeev Banerjee,          Carnegie Mellon University
145 |                              E<lt>satanjeev@cmu.eduE<gt>
146 | 
147 | Amruta Purandare,            University of Pittsburgh
148 |                              E<lt>amruta@cs.pitt.eduE<gt>
149 | 
150 | Bridget Thomson-McInnes,     University of Minnesota Twin Cities
151 |                              E<lt>bthompson@d.umn.eduE<gt>
152 | 
153 | Saiyam Kohli,                University of Minnesota Duluth
154 |                              E<lt>kohli003@d.umn.eduE<gt>
155 | 
156 | =head1 HISTORY
157 | 
158 | Last updated: $Id: tmi.pm,v 1.10 2006/06/21 11:10:53 saiyam_kohli Exp $
159 | 
160 | =head1 BUGS
161 | 
162 | 
163 | =head1 SEE ALSO
164 | 
165 |   @inproceedings{moore:2004:EMNLP,
166 |                 author    = {Moore, Robert C.},
167 |                 title     = {On Log-Likelihood-Ratios and the Significance of Rare
168 |                              Events },
169 |                 booktitle = {Proceedings of EMNLP 2004},
170 |                 editor = {Dekang Lin and Dekai Wu},
171 |                 year      = 2004,
172 |                 month     = {July},
173 |                 address   = {Barcelona, Spain},
174 |                 publisher = {Association for Computational Linguistics},
175 |                 pages     = {333--340}
176 |                 url = L<http://acl.ldc.upenn.edu/acl2004/emnlp/pdf/Moore.pdf>}
177 | 
178 | L<http://groups.yahoo.com/group/ngram/>
179 | 
180 | L<http://www.d.umn.edu/~tpederse/nsp.html>
181 | 
182 | 
183 | =head1 COPYRIGHT
184 | 
185 | Copyright (C) 2000-2006, Ted Pedersen, Satanjeev Banerjee, Amruta
186 | Purandare, Bridget Thomson-McInnes and Saiyam Kohli
187 | 
188 | This program is free software; you can redistribute it and/or modify it
189 | under the terms of the GNU General Public License as published by the Free
190 | Software Foundation; either version 2 of the License, or (at your option)
191 | any later version.
192 | 
193 | This program is distributed in the hope that it will be useful, but
194 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
195 | or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
196 | for more details.
197 | 
198 | You should have received a copy of the GNU General Public License along
199 | with this program; if not, write to
200 | 
201 |     The Free Software Foundation, Inc.,
202 |     59 Temple Place - Suite 330,
203 |     Boston, MA  02111-1307, USA.
204 | 
205 | Note: a copy of the GNU General Public License is available on the web
206 | at L<http://www.gnu.org/licenses/gpl.txt> and is included in this
207 | distribution as GPL.txt.
208 | 
209 | =cut


--------------------------------------------------------------------------------
/geneSCF-master-v1.0/class/lib/Text/NSP/Measures/2D/Fisher2/left.pm:
--------------------------------------------------------------------------------
  1 | =head1 NAME
  2 | 
  3 | Text::NSP::Measures::2D::Fisher2::left - Perl module implementation of the left sided
  4 |                                         Fisher's exact test (Deprecated).
  5 | 
  6 | =head1 SYNOPSIS
  7 | 
  8 | =head3 Basic Usage
  9 | 
 10 |   use Text::NSP::Measures::2D::Fisher2::left;
 11 | 
 12 |   my $leftFisher = Text::NSP::Measures::2D::Fisher2::left->new();
 13 | 
 14 |   my $npp = 60; my $n1p = 20; my $np1 = 20;  my $n11 = 10;
 15 | 
 16 |   $leftFisher_value = $leftFisher->calculateStatistic( n11=>$n11,
 17 |                                                        n1p=>$n1p,
 18 |                                                        np1=>$np1,
 19 |                                                        npp=>$npp);
 20 | 
 21 |   if( ($errorCode = $leftFisher->getErrorCode()))
 22 |   {
 23 |     print STDERR $errorCode." - ".$leftFisher->getErrorMessage();
 24 |   }
 25 |   else
 26 |   {
 27 |     print $leftFisher->getStatisticName."value for bigram is ".$leftFisher_value;
 28 |   }
 29 | 
 30 | 
 31 | =head1 DESCRIPTION
 32 | 
 33 | This module provides a naive implementation of the fishers left
 34 | sided exact tests. That is the implementation does not have any
 35 | optimizations for performance. This will compute the factorials and
 36 | the hypergeometric measures using direct multiplications.
 37 | 
 38 | This measure should be used if you need exact values without any
 39 | rounding errors, and you are not worried about the performance of
 40 | the measure, otherwise use the implementations under the
 41 | Text::NSP::Measures::2D::Fisher module. To use this implementation,
 42 | you will have to specify the entire module name. Usage:
 43 | 
 44 | statistic.pl Text::NSP::Measures::Fisher2::left dest.txt source.cnt
 45 | 
 46 | Assume that the frequency count data associated with a bigram
 47 | <word1><word2> is stored in a 2x2 contingency table:
 48 | 
 49 |           word2   ~word2
 50 |   word1    n11      n12 | n1p
 51 |  ~word1    n21      n22 | n2p
 52 |            --------------
 53 |            np1      np2   npp
 54 | 
 55 | where n11 is the number of times <word1><word2> occur together, and
 56 | n12 is the number of times <word1> occurs with some word other than
 57 | word2, and n1p is the number of times in total that word1 occurs as
 58 | the first word in a bigram.
 59 | 
 60 | The fishers exact tests are calculated by fixing the marginal totals
 61 | and computing the hypergeometric probabilities for all the possible
 62 | contingency tables,
 63 | 
 64 | A left sided test is calculated by adding the probabilities of all
 65 | the possible two by two contingency tables formed by fixing the
 66 | marginal totals and changing the value of n11 to less than or equal to
 67 | the given value. A left sided Fisher's Exact Test tells us how likely it
 68 | is to randomly sample a table where n11 is no greater than observed. In
 69 | other words, it tells us how likely it is to sample an observation where
 70 | the two words are no more dependent than currently observed.
 71 | 
 72 | =head2 Methods
 73 | 
 74 | =over
 75 | 
 76 | =cut
 77 | 
 78 | 
 79 | package Text::NSP::Measures::2D::Fisher2::left;
 80 | 
 81 | 
 82 | use Text::NSP::Measures::2D::Fisher2;
 83 | use strict;
 84 | use Carp;
 85 | use warnings;
 86 | no warnings 'redefine';
 87 | require Exporter;
 88 | 
 89 | our ($VERSION, @EXPORT, @ISA);
 90 | 
 91 | @ISA  = qw(Exporter);
 92 | 
 93 | @EXPORT = qw(initializeStatistic calculateStatistic
 94 |              getErrorCode getErrorMessage getStatisticName);
 95 | 
 96 | $VERSION = '0.97';
 97 | 
 98 | 
 99 | =item calculateStatistic() - This method computes the left sided Fishers
100 |                              exact test.
101 | 
102 | INPUT PARAMS  : $count_values       .. Reference of an array containing
103 |                                        the count values computed by the
104 |                                        count.pl program.
105 | 
106 | RETURN VALUES : $left               .. Left Fisher value.
107 | 
108 | =cut
109 | 
110 | sub calculateStatistic
111 | {
112 |   # The counts arrive as a flat list of name => value pairs
113 |   # (n11, n1p, np1, npp in the module synopsis).
114 |   my %values = @_;
115 |   my $probabilities;
116 | 
117 |   # Let the Fisher2 base module check the counts. A false result means
118 |   # the values were missing or inconsistent, so nothing is returned.
119 |   if( !(Text::NSP::Measures::2D::Fisher2::getValues(\%values)) )
120 |   {
121 |     return;
122 |   }
123 | 
124 |   # The observed n11 is the upper end of the left tail.
125 |   my $final_limit = $n11;
126 | 
127 |   # Smallest n11 the fixed marginals allow, floored at zero. Kept in its
128 |   # own variable so the observed $n11 is not hidden by a same-named
129 |   # lexical.
130 |   my $n11_start = $n1p + $np1 - $npp;
131 |   if($n11_start<0)
132 |   {
133 |     $n11_start = 0;
134 |   }
135 | 
136 |   if( !($probabilities = Text::NSP::Measures::2D::Fisher2::computeDistribution($n11_start, $final_limit)))
137 |   {
138 |       return;
139 |   }
140 | 
141 |   # Sum the table probabilities from the smallest n11 up to and
142 |   # including the observed one; keys are visited in numeric order.
143 |   my $leftfisher=0;
144 |   foreach my $key_n11 (sort { $a <=> $b } keys %$probabilities)
145 |   {
146 |     last if $key_n11>$final_limit;
147 |     $leftfisher += $probabilities->{$key_n11};
148 |   }
149 | 
150 |   return $leftfisher;
151 | }
152 | 
153 | 
154 | =item getStatisticName() - Returns the name of this statistic
155 | 
156 | INPUT PARAMS  : none
157 | 
158 | RETURN VALUES : $name      .. Name of the measure.
159 | 
160 | =cut
161 | 
162 | # Human-readable label under which this measure is reported.
163 | sub getStatisticName {
164 |     return "Left Fisher";
165 | }
166 | 
167 | 
168 | 
169 | 1;
170 | __END__
171 | 
172 | =back
173 | 
174 | =head1 AUTHOR
175 | 
176 | Ted Pedersen,                University of Minnesota Duluth
177 |                              E<lt>tpederse@d.umn.eduE<gt>
178 | 
179 | Satanjeev Banerjee,          Carnegie Mellon University
180 |                              E<lt>satanjeev@cmu.eduE<gt>
181 | 
182 | Amruta Purandare,            University of Pittsburgh
183 |                              E<lt>amruta@cs.pitt.eduE<gt>
184 | 
185 | Bridget Thomson-McInnes,     University of Minnesota Twin Cities
186 |                              E<lt>bthompson@d.umn.eduE<gt>
187 | 
188 | Saiyam Kohli,                University of Minnesota Duluth
189 |                              E<lt>kohli003@d.umn.eduE<gt>
190 | 
191 | =head1 HISTORY
192 | 
193 | Last updated: $Id: left.pm,v 1.10 2008/03/26 17:24:15 tpederse Exp $
194 | 
195 | =head1 BUGS
196 | 
197 | 
198 | =head1 SEE ALSO
199 | 
200 | L<http://groups.yahoo.com/group/ngram/>
201 | 
202 | L<http://www.d.umn.edu/~tpederse/nsp.html>
203 | 
204 | 
205 | =head1 COPYRIGHT
206 | 
207 | Copyright (C) 2000-2006, Ted Pedersen, Satanjeev Banerjee, Amruta
208 | Purandare, Bridget Thomson-McInnes and Saiyam Kohli
209 | 
210 | This program is free software; you can redistribute it and/or modify it
211 | under the terms of the GNU General Public License as published by the Free
212 | Software Foundation; either version 2 of the License, or (at your option)
213 | any later version.
214 | 
215 | This program is distributed in the hope that it will be useful, but
216 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
217 | or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
218 | for more details.
219 | 
220 | You should have received a copy of the GNU General Public License along
221 | with this program; if not, write to
222 | 
223 |     The Free Software Foundation, Inc.,
224 |     59 Temple Place - Suite 330,
225 |     Boston, MA  02111-1307, USA.
226 | 
227 | Note: a copy of the GNU General Public License is available on the web
228 | at L<http://www.gnu.org/licenses/gpl.txt> and is included in this
229 | distribution as GPL.txt.
230 | 
231 | =cut
232 | 


--------------------------------------------------------------------------------
/geneSCF-master-v1.0/class/lib/Text/NSP/Measures/2D/CHI.pm:
--------------------------------------------------------------------------------
  1 | =head1 NAME
  2 | 
  3 | Text::NSP::Measures::2D::CHI - Perl module that provides error checks
  4 |                                for the Pearson's chi squared, phi coefficient
  5 |                                and the Tscore measures.
  6 | 
  7 | =head1 SYNOPSIS
  8 | 
  9 | =head3 Basic Usage
 10 | 
 11 |   use Text::NSP::Measures::2D::CHI::x2;
 12 | 
 13 |   my $npp = 60; my $n1p = 20; my $np1 = 20;  my $n11 = 10;
 14 | 
 15 |   $x2_value = calculateStatistic( n11=>$n11,
 16 |                                       n1p=>$n1p,
 17 |                                       np1=>$np1,
 18 |                                       npp=>$npp);
 19 | 
 20 |   if( ($errorCode = getErrorCode()))
 21 |   {
 22 |     print STDERR $errorCode." - ".getErrorMessage()."\n";
 23 |   }
 24 |   else
 25 |   {
 26 |     print getStatisticName."value for bigram is ".$x2_value."\n";
 27 |   }
 28 | 
 29 | =head1 DESCRIPTION
 30 | 
 31 | This module is the base class for the Chi-squared and Phi coefficient
 32 | measures. This module provides error checks specific for these measures,
 33 | it also implements the computations that are common to these measures.
 34 | 
 35 | =over
 36 | 
 37 | =item Pearson's Chi-Squared
 38 | 
 39 |   x2 = (n11 - m11)^2/m11 + (n12 - m12)^2/m12 +
 40 |        (n21 - m21)^2/m21 + (n22 - m22)^2/m22
 41 | 
 42 | =item Phi Coefficient
 43 | 
 44 |  PHI^2 = ((n11 * n22) - (n12 * n21))^2/(n1p * np1 * np2 * n2p)
 45 | 
 46 | =item T-Score
 47 | 
 48 |  tscore = (n11 - m11)/sqrt(n11)
 49 | 
 50 | =back
 51 | 
 52 | Note that the value of PHI^2 is equivalent to
 53 | Pearson's Chi-Squared test divided by the sample size, that is:
 54 | 
 55 |  Chi-Squared = npp * PHI^2
 56 | 
 57 |  Although T-score seems quite different from the other two measures we
 58 |  have put it in the CHI family because like the other two measures it
 59 |  uses the difference between the observed and expected values and is also
 60 |  quite similar in ranking the bigrams.
 61 | 
 62 | =over
 63 | 
 64 | =cut
 65 | 
 66 | 
 67 | package Text::NSP::Measures::2D::CHI;
 68 | 
 69 | 
 70 | use Text::NSP::Measures::2D;
 71 | use strict;
 72 | use Carp;
 73 | use warnings;
 74 | # use subs(calculateStatistic);
 75 | require Exporter;
 76 | 
 77 | our ($VERSION, @EXPORT, @ISA);
 78 | 
 79 | @ISA  = qw(Exporter);
 80 | 
 81 | @EXPORT = qw(initializeStatistic calculateStatistic
 82 |              getErrorCode getErrorMessage getStatisticName
 83 |              $n11 $n12 $n21 $n22 $m11 $m12 $m21 $m22
 84 |              $npp $np1 $np2 $n2p $n1p $errorCodeNumber
 85 |              $errorMessage);
 86 | 
 87 | $VERSION = '1.03';
 88 | 
 89 | =item getValues() - This method calls the computeMarginalTotals(),
 90 | computeObservedValues() and the computeExpectedValues() methods to
 91 | compute the observed and expected values. It checks these values for
 92 | any errors that might cause the PHI and x2 measures to fail.
 93 | 
 94 | INPUT PARAMS  : $count_values           .. Reference of an hash containing
 95 |                                            the count values computed by the
 96 |                                            count.pl program.
 97 | 
 98 | RETURN VALUES : 1/undef           ..returns '1' to indicate success
 99 |                                     and an undefined(NULL) value to indicate
100 |                                     failure.
101 | 
102 | =cut
103 | 
# getValues($values) - fill in the marginal, observed and expected values
# for a 2x2 table and verify that no cell with a non-zero observed count has
# an expected value of zero.
#
# INPUT PARAMS  : $values .. hash reference with the count values; it is
#                            handed unchanged to the three
#                            Text::NSP::Measures::2D helpers.
#
# RETURN VALUES : 1 on success; a bare return (undef / empty list) when a
#                 helper reports failure or a check fails. On a failed check
#                 $errorMessage and $errorCodeNumber (221) are set.
sub getValues
{
  my ($counts) = @_;

  # Each helper signals failure with a false value; stop at the first one.
  Text::NSP::Measures::2D::computeMarginalTotals($counts) or return;
  Text::NSP::Measures::2D::computeObservedValues($counts) or return;
  Text::NSP::Measures::2D::computeExpectedValues($counts) or return;

  # (cell label, observed count, expected count), checked in table order.
  my @cells = (
    [ '1,1', $n11, $m11 ],
    [ '1,2', $n12, $m12 ],
    [ '2,1', $n21, $m21 ],
    [ '2,2', $n22, $m22 ],
  );

  foreach my $cell (@cells)
  {
    my ($label, $observed, $expected) = @{$cell};

    # A cell with no observations is never rejected here.
    next unless $observed;
    next unless $expected == 0;

    $errorMessage = "Expected value in cell ($label) must not be zero";
    $errorCodeNumber = 221;
    return;
  }

  # All populated cells have a usable expected value.
  return 1;
}
160 | 
161 | 
162 | 
163 | 
164 | =item computeVal() - Computes the deviation in observed value with respect
165 | to the expected values
166 | 
167 | INPUT PARAMS  : $n         ..Observed value
168 |                 $m         ..Expected value
169 | 
170 | RETURN VALUES : (n-m)^2/m  ..the squared deviation of the
171 |                              observed value from the expected
172 |                              value, divided by the expected value (0 if m is 0).
173 | 
174 | =cut
175 | 
# computeVal($n, $m) - contribution of one cell: the squared difference
# between the observed value $n and the expected value $m, divided by $m.
#
# RETURN VALUES : (n-m)^2/m, or 0 when $m is false (zero, empty or undef),
#                 which avoids dividing by zero.
sub computeVal
{
  my ($observed, $expected) = @_;

  return 0 unless $expected;

  my $difference = $observed - $expected;
  return ($difference ** 2) / $expected;
}
189 | 
190 | 
191 | 
192 | 1;
193 | __END__
194 | 
195 | 
196 | =back
197 | 
198 | =head1 AUTHOR
199 | 
200 | Ted Pedersen,                University of Minnesota Duluth
201 |                              E<lt>tpederse@d.umn.eduE<gt>
202 | 
203 | Satanjeev Banerjee,          Carnegie Mellon University
204 |                              E<lt>satanjeev@cmu.eduE<gt>
205 | 
206 | Amruta Purandare,            University of Pittsburgh
207 |                              E<lt>amruta@cs.pitt.eduE<gt>
208 | 
209 | Bridget Thomson-McInnes,     University of Minnesota Twin Cities
210 |                              E<lt>bthompson@d.umn.eduE<gt>
211 | 
212 | Saiyam Kohli,                University of Minnesota Duluth
213 |                              E<lt>kohli003@d.umn.eduE<gt>
214 | 
215 | =head1 HISTORY
216 | 
217 | Last updated: $Id: CHI.pm,v 1.14 2008/03/26 17:18:26 tpederse Exp $
218 | 
219 | =head1 BUGS
220 | 
221 | 
222 | =head1 SEE ALSO
223 | 
224 | L<http://groups.yahoo.com/group/ngram/>
225 | 
226 | L<http://www.d.umn.edu/~tpederse/nsp.html>
227 | 
228 | 
229 | =head1 COPYRIGHT
230 | 
231 | Copyright (C) 2000-2006, Ted Pedersen, Satanjeev Banerjee, Amruta
232 | Purandare, Bridget Thomson-McInnes and Saiyam Kohli
233 | 
234 | This program is free software; you can redistribute it and/or modify it
235 | under the terms of the GNU General Public License as published by the Free
236 | Software Foundation; either version 2 of the License, or (at your option)
237 | any later version.
238 | 
239 | This program is distributed in the hope that it will be useful, but
240 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
241 | or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
242 | for more details.
243 | 
244 | You should have received a copy of the GNU General Public License along
245 | with this program; if not, write to
246 | 
247 |     The Free Software Foundation, Inc.,
248 |     59 Temple Place - Suite 330,
249 |     Boston, MA  02111-1307, USA.
250 | 
251 | Note: a copy of the GNU General Public License is available on the web
252 | at L<http://www.gnu.org/licenses/gpl.txt> and is included in this
253 | distribution as GPL.txt.
254 | 
255 | =cut


--------------------------------------------------------------------------------
/geneSCF-master-v1.0/class/lib/Text/NSP/Measures/2D/MI/pmi.pm:
--------------------------------------------------------------------------------
  1 | =head1 NAME
  2 | 
  3 | Text::NSP::Measures::2D::MI::pmi - Perl module that implements Pointwise
  4 |                                    Mutual Information.
  5 | 
  6 | =head1 SYNOPSIS
  7 | 
  8 | =head3 Basic Usage
  9 | 
 10 |   use Text::NSP::Measures::2D::MI::pmi;
 11 | 
 12 |   my $npp = 60; my $n1p = 20; my $np1 = 20;  my $n11 = 10;
 13 | 
 14 |   $pmi_value = calculateStatistic( n11=>$n11,
 15 |                                       n1p=>$n1p,
 16 |                                       np1=>$np1,
 17 |                                       npp=>$npp);
 18 | 
 19 |   if( ($errorCode = getErrorCode()))
 20 |   {
 21 |     print STDERR $errorCode." - ".getErrorMessage()."\n";
 22 |   }
 23 |   else
 24 |   {
 25 |     print getStatisticName."value for bigram is ".$pmi_value."\n";
 26 |   }
 27 | 
 28 | =head1 DESCRIPTION
 29 | 
 30 | Assume that the frequency count data associated with a bigram
 31 | <word1><word2> is stored in a 2x2 contingency table:
 32 | 
 33 |           word2   ~word2
 34 |   word1    n11      n12 | n1p
 35 |  ~word1    n21      n22 | n2p
 36 |            --------------
 37 |            np1      np2   npp
 38 | 
 39 | where n11 is the number of times <word1><word2> occur together, and
 40 | n12 is the number of times <word1> occurs with some word other than
 41 | word2, and n1p is the number of times in total that word1 occurs as
 42 | the first word in a bigram.
 43 | 
 44 | The expected values for the internal cells are calculated by taking the
 45 | product of their associated marginals and dividing by the sample size,
 46 | for example:
 47 | 
 48 |           np1 * n1p
 49 |    m11=   ---------
 50 |             npp
 51 | 
 52 | Pointwise Mutual Information (pmi) is defined as the log of the ratio
 53 | between the observed frequency of a bigram (n11) and the frequency
 54 | expected for that bigram if its words were independent (m11).
 55 | 
 56 |  PMI =   log (n11/m11)
 57 | 
 58 |  The Pointwise Mutual Information tends to overestimate bigrams with low
 59 |  observed frequency counts. To prevent this sometimes a variation of pmi
 60 |  is used which increases the influence of the observed frequency.
 61 | 
 62 |  PMI = log((n11^$exp)/m11)
 63 | 
 64 |  The $exp is 1 by default, so by default the measure will compute the
 65 |  Pointwise Mutual Information for the given bigram. To use a variation of
 66 |  the measure, users can pass the $exp parameter using the --pmi_exp
 67 |  command line option in statistic.pl or by passing the $exp to the
 68 |  initializeStatistic() method from their program.
 69 | 
 70 |  The usage for statistic.pl is
 71 | 
 72 |  statistic.pl pmi out_pmi.stt out.cnt    - for Point Wise Mutual Information
 73 |                                            $exp is 1 in this case.
 74 | 
 75 |  statistic.pl --pmi_exp 2 pmi out_pmi2.stt out.cnt   - for the variant with
 76 |                                                        $exp set to 2.
 77 | 
 78 | =head2 Methods
 79 | 
 80 | =over
 81 | 
 82 | =cut
 83 | 
 84 | 
 85 | package Text::NSP::Measures::2D::MI::pmi;
 86 | 
 87 | 
 88 | use Text::NSP::Measures::2D::MI;
 89 | use strict;
 90 | use Carp;
 91 | use warnings;
 92 | no warnings 'redefine';
 93 | require Exporter;
 94 | 
 95 | our ($VERSION, @EXPORT, @ISA, $exp);
 96 | 
 97 | $exp = 1;
 98 | 
 99 | @ISA  = qw(Exporter);
100 | 
101 | @EXPORT = qw(initializeStatistic calculateStatistic
102 |              getErrorCode getErrorMessage getStatisticName);
103 | 
104 | $VERSION = '0.97';
105 | 
106 | 
107 | =item initializeStatistic() -Initialization of the pmi_exp parameter if required
108 | 
109 | INPUT PARAMS  : $exp       .. exponent applied to n11 (the pmi_exp parameter)
110 | 
111 | RETURN VALUES : none
112 | 
113 | =cut
114 | 
# initializeStatistic($exp) - set the exponent that calculateStatistic()
# applies to n11.
#
# INPUT PARAMS  : $exp .. optional exponent. When it is omitted or undef the
#                         exponent is reset to 1 (plain PMI) instead of being
#                         overwritten with undef, which would turn n11**$exp
#                         into n11**0.
#
# RETURN VALUES : the exponent now in effect.
sub initializeStatistic
{
  my $requested = shift;

  # defined() rather than truth, so an explicit exponent of 0 is honoured.
  $exp = defined $requested ? $requested : 1;

  return $exp;
}
119 | 
120 | 
121 | 
122 | =item calculateStatistic() - This method calculates the pmi value
123 | 
124 | INPUT PARAMS  : $count_values       .. Reference of a hash containing
125 |                                        the count values computed by the
126 |                                        count.pl program.
127 | 
128 | RETURN VALUES : $pmi                .. PMI value for this bigram.
129 | 
130 | =cut
131 | 
# calculateStatistic(%counts) - PMI score for one bigram.
#
# INPUT PARAMS  : key/value list of count values (n11, n1p, np1, npp in the
#                 synopsis); a reference to it is passed on to
#                 Text::NSP::Measures::2D::MI::getValues().
#
# RETURN VALUES : computePMI(n11**$exp, m11) divided by log(2), or a bare
#                 return (undef / empty list) when getValues() fails.
sub calculateStatistic
{
  my %counts = @_;

  # getValues() sets the observed and expected values; a false result
  # means the counts could not be used, so there is nothing to compute.
  Text::NSP::Measures::2D::MI::getValues(\%counts)
    or return;

  # $exp raises the weight of the observed frequency (1 = plain PMI).
  my $weighted_n11 = $n11 ** $exp;
  my $score = Text::NSP::Measures::2D::MI::computePMI($weighted_n11, $m11);

  # NOTE(review): dividing by log(2) presumably rescales a natural-log
  # result to base 2 - confirm against computePMI().
  return $score / log(2);
}
148 | 
149 | 
150 | 
151 | =item getStatisticName() - Returns the name of this statistic
152 | 
153 | INPUT PARAMS  : none
154 | 
155 | RETURN VALUES : $name      .. Name of the measure.
156 | 
157 | =cut
158 | 
# getStatisticName() - human-readable name of this measure.
#
# INPUT PARAMS  : none
# RETURN VALUES : the name string.
sub getStatisticName
{
  my $name = "Pointwise Mutual Information";
  return $name;
}
163 | 
164 | 
165 | 
166 | 1;
167 | __END__
168 | 
169 | 
170 | =back
171 | 
172 | =head1 AUTHOR
173 | 
174 | Ted Pedersen,                University of Minnesota Duluth
175 |                              E<lt>tpederse@d.umn.eduE<gt>
176 | 
177 | Satanjeev Banerjee,          Carnegie Mellon University
178 |                              E<lt>satanjeev@cmu.eduE<gt>
179 | 
180 | Amruta Purandare,            University of Pittsburgh
181 |                              E<lt>amruta@cs.pitt.eduE<gt>
182 | 
183 | Bridget Thomson-McInnes,     University of Minnesota Twin Cities
184 |                              E<lt>bthompson@d.umn.eduE<gt>
185 | 
186 | Saiyam Kohli,                University of Minnesota Duluth
187 |                              E<lt>kohli003@d.umn.eduE<gt>
188 | 
189 | =head1 HISTORY
190 | 
191 | Last updated: $Id: pmi.pm,v 1.24 2008/03/26 17:20:28 tpederse Exp $
192 | 
193 | =head1 BUGS
194 | 
195 | 
196 | =head1 SEE ALSO
197 | 
198 |   @inproceedings{ church89word,
199 |       author = {Kenneth W. Church and Patrick Hanks},
200 |       title = {Word association norms, mutual information, and Lexicography},
201 |       booktitle = {Proceedings of the 27th. Annual Meeting of the Association for Computational Linguistics},
202 |       publisher = {Association for Computational Linguistics},
203 |       address = {Vancouver, B.C.},
204 |       pages = {76--83},
205 |       year = {1989},
206 |       url = L<http://acl.ldc.upenn.edu/J/J90/J90-1003.pdf> }
207 | 
208 | 
209 | L<http://groups.yahoo.com/group/ngram/>
210 | 
211 | L<http://www.d.umn.edu/~tpederse/nsp.html>
212 | 
213 | 
214 | =head1 COPYRIGHT
215 | 
216 | Copyright (C) 2000-2006, Ted Pedersen, Satanjeev Banerjee, Amruta
217 | Purandare, Bridget Thomson-McInnes and Saiyam Kohli
218 | 
219 | This program is free software; you can redistribute it and/or modify it
220 | under the terms of the GNU General Public License as published by the Free
221 | Software Foundation; either version 2 of the License, or (at your option)
222 | any later version.
223 | 
224 | This program is distributed in the hope that it will be useful, but
225 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
226 | or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
227 | for more details.
228 | 
229 | You should have received a copy of the GNU General Public License along
230 | with this program; if not, write to
231 | 
232 |     The Free Software Foundation, Inc.,
233 |     59 Temple Place - Suite 330,
234 |     Boston, MA  02111-1307, USA.
235 | 
236 | Note: a copy of the GNU General Public License is available on the web
237 | at L<http://www.gnu.org/licenses/gpl.txt> and is included in this
238 | distribution as GPL.txt.
239 | 
240 | =cut


--------------------------------------------------------------------------------
/geneSCF-master-v1.0/class/lib/Text/NSP/Measures/2D/MI/ll.pm:
--------------------------------------------------------------------------------
  1 | =head1 NAME
  2 | 
  3 | Text::NSP::Measures::2D::MI::ll - Perl module that implements Loglikelihood
  4 |                                   measure of association for bigrams.
  5 | 
  6 | =head1 SYNOPSIS
  7 | 
  8 | =head3 Basic Usage
  9 | 
 10 |   use Text::NSP::Measures::2D::MI::ll;
 11 | 
 12 |   my $npp = 60; my $n1p = 20; my $np1 = 20;  my $n11 = 10;
 13 | 
 14 |   $ll_value = calculateStatistic( n11=>$n11,
 15 |                                       n1p=>$n1p,
 16 |                                       np1=>$np1,
 17 |                                       npp=>$npp);
 18 | 
 19 |   if( ($errorCode = getErrorCode()))
 20 |   {
 21 |     print STDERR $errorCode." - ".getErrorMessage();
 22 |   }
 23 |   else
 24 |   {
 25 |     print getStatisticName."value for bigram is ".$ll_value;
 26 |   }
 27 | 
 28 | =head1 DESCRIPTION
 29 | 
 30 | The log-likelihood ratio measures the deviation between the observed data
 31 | and what would be expected if <word1> and <word2> were independent. The
 32 | higher the score, the less evidence there is in favor of concluding that
 33 | the words are independent.
 34 | 
 35 | Assume that the frequency count data associated with a bigram
 36 | <word1><word2> as shown by a 2x2 contingency table:
 37 | 
 38 |           word2   ~word2
 39 |   word1    n11      n12 | n1p
 40 |  ~word1    n21      n22 | n2p
 41 |            --------------
 42 |            np1      np2   npp
 43 | 
 44 | where n11 is the number of times <word1><word2> occur together, and
 45 | n12 is the number of times <word1> occurs with some word other than
 46 | word2, and n1p is the number of times in total that word1 occurs as
 47 | the first word in a bigram.
 48 | 
 49 | The expected values for the internal cells are calculated by taking the
 50 | product of their associated marginals and dividing by the sample size,
 51 | for example:
 52 | 
 53 |           np1 * n1p
 54 |    m11=   ---------
 55 |             npp
 56 | 
 57 | Then the deviation between observed and expected values for each internal
 58 | cell is computed to arrive at the log-likelihood value.
 59 | 
 60 |  Log-Likelihood = 2 * [n11 * log(n11/m11) + n12 * log(n12/m12) +
 61 |            n21 * log(n21/m21) + n22 * log(n22/m22)]
 62 | 
 63 | =head2 Methods
 64 | 
 65 | =over
 66 | 
 67 | =cut
 68 | 
 69 | 
 70 | package Text::NSP::Measures::2D::MI::ll;
 71 | 
 72 | 
 73 | use Text::NSP::Measures::2D::MI;
 74 | use strict;
 75 | use Carp;
 76 | use warnings;
 77 | no warnings 'redefine';
 78 | require Exporter;
 79 | 
 80 | our ($VERSION, @EXPORT, @ISA);
 81 | 
 82 | @ISA  = qw(Exporter);
 83 | 
 84 | @EXPORT = qw(initializeStatistic calculateStatistic
 85 |              getErrorCode getErrorMessage getStatisticName);
 86 | 
 87 | $VERSION = '0.97';
 88 | 
 89 | =item calculateStatistic() - This method calculates the ll value
 90 | 
 91 | INPUT PARAMS  : $count_values       .. Reference of an hash containing
 92 |                                        the count values computed by the
 93 |                                        count.pl program.
 94 | 
 95 | RETURN VALUES : $loglikelihood      .. Loglikelihood value for this bigram.
 96 | 
 97 | =cut
 98 | 
 99 | sub calculateStatistic
100 | {
101 |   my %values = @_;
102 | 
103 |   # computes and sets the observed and expected values from
104 |   # the frequency combination values. returns 0 if there is an
105 |   # error in the computation or the values are inconsistent.
106 |   if( !Text::NSP::Measures::2D::MI::getValues(\%values) )
107 |   {
108 |     return;
109 |   }
110 | 
111 |   #  Now for the actual calculation of Loglikelihood!
112 |   my $logLikelihood = 0;
113 | 
114 |   # dont want ($nxy / $mxy) to be 0 or less! flag error if so!
115 |   $logLikelihood += $n11 * Text::NSP::Measures::2D::MI::computePMI( $n11, $m11 );
116 |   $logLikelihood += $n12 * Text::NSP::Measures::2D::MI::computePMI( $n12, $m12 );
117 |   $logLikelihood += $n21 * Text::NSP::Measures::2D::MI::computePMI( $n21, $m21 );
118 |   $logLikelihood += $n22 * Text::NSP::Measures::2D::MI::computePMI( $n22, $m22 );
119 | 
120 |   return ( 2 * $logLikelihood );
121 | }
122 | 
123 | 
124 | =item getStatisticName() - Returns the name of this statistic
125 | 
126 | INPUT PARAMS  : none
127 | 
128 | RETURN VALUES : $name      .. Name of the measure.
129 | 
130 | =cut
131 | 
# getStatisticName() - human-readable name of this measure.
#
# INPUT PARAMS  : none
# RETURN VALUES : the name string.
sub getStatisticName
{
  my $name = "Log-likelihood";
  return $name;
}
136 | 
137 | 
138 | 
139 | 1;
140 | __END__
141 | 
142 | 
143 | =back
144 | 
145 | =head1 AUTHOR
146 | 
147 | Ted Pedersen,                University of Minnesota Duluth
148 |                              E<lt>tpederse@d.umn.eduE<gt>
149 | 
150 | Satanjeev Banerjee,          Carnegie Mellon University
151 |                              E<lt>satanjeev@cmu.eduE<gt>
152 | 
153 | Amruta Purandare,            University of Pittsburgh
154 |                              E<lt>amruta@cs.pitt.eduE<gt>
155 | 
156 | Bridget Thomson-McInnes,     University of Minnesota Twin Cities
157 |                              E<lt>bthompson@d.umn.eduE<gt>
158 | 
159 | Saiyam Kohli,                University of Minnesota Duluth
160 |                              E<lt>kohli003@d.umn.eduE<gt>
161 | 
162 | =head1 HISTORY
163 | 
164 | Last updated: $Id: ll.pm,v 1.23 2008/03/26 17:20:27 tpederse Exp $
165 | 
166 | =head1 BUGS
167 | 
168 | 
169 | =head1 SEE ALSO
170 | 
171 |   @article{Dunning93,
172 |             author = {Dunning, T.},
173 |             title = {Accurate Methods for the Statistics of
174 |                     Surprise and Coincidence},
175 |             journal = {Computational Linguistics},
176 |             volume = {19},
177 |             number = {1},
178 |             year = {1993},
179 |             pages = {61-74}
180 |             url = L<http://www.comp.lancs.ac.uk/ucrel/papers/tedstats.pdf>}
181 | 
182 |   @inproceedings{moore:2004:EMNLP,
183 |                 author    = {Moore, Robert C.},
184 |                 title     = {On Log-Likelihood-Ratios and the Significance of Rare
185 |                             Events },
186 |                 booktitle = {Proceedings of EMNLP 2004},
187 |                 editor = {Dekang Lin and Dekai Wu},
188 |                 year      = 2004,
189 |                 month     = {July},
190 |                 address   = {Barcelona, Spain},
191 |                 publisher = {Association for Computational Linguistics},
192 |                 pages     = {333--340}
193 |                 url = L<http://acl.ldc.upenn.edu/acl2004/emnlp/pdf/Moore.pdf>}
194 | 
195 | L<http://groups.yahoo.com/group/ngram/>
196 | 
197 | L<http://www.d.umn.edu/~tpederse/nsp.html>
198 | 
199 | 
200 | =head1 COPYRIGHT
201 | 
202 | Copyright (C) 2000-2006, Ted Pedersen, Satanjeev Banerjee, Amruta
203 | Purandare, Bridget Thomson-McInnes and Saiyam Kohli
204 | 
205 | This program is free software; you can redistribute it and/or modify it
206 | under the terms of the GNU General Public License as published by the Free
207 | Software Foundation; either version 2 of the License, or (at your option)
208 | any later version.
209 | 
210 | This program is distributed in the hope that it will be useful, but
211 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
212 | or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
213 | for more details.
214 | 
215 | You should have received a copy of the GNU General Public License along
216 | with this program; if not, write to
217 | 
218 |     The Free Software Foundation, Inc.,
219 |     59 Temple Place - Suite 330,
220 |     Boston, MA  02111-1307, USA.
221 | 
222 | Note: a copy of the GNU General Public License is available on the web
223 | at L<http://www.gnu.org/licenses/gpl.txt> and is included in this
224 | distribution as GPL.txt.
225 | 
226 | =cut


--------------------------------------------------------------------------------
/geneSCF-master-v1.0/class/functional_class.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | 
  3 | $/=undef;
  4 | use Cwd;
  5 | $pwd=cwd();
  6 | use lib "$ARGV[5]/class/lib";
  7 | use lib "$ARGV[5]/class//lib/Tie-IxHash-1.23";
  8 | use Tie::IxHash;
  9 | use Statistics::Multtest qw(bonferroni holm hommel hochberg BH BY qvalue);
 10 | use Statistics::Multtest qw(:all);
 11 | tie %kegg, 'Tie::IxHash';
 12 | use Text::NSP::Measures::2D::Fisher::right;
 13 | use Number::FormatEng qw(:all);
 14 | use Data::Dumper;
 15 | @myout=split("/",$ARGV[1]);
 16 | open(RESULT,">@ARGV[2]/@{myout[$#myout]}_${ARGV[3]}_functional_classification.tsv");
 17 | ##if(! defined @ARGV[0] && ! defined @ARGV[1] && ! defined @ARGV[2] || $ARGV[0] eq "-h")
 18 | ##{
 19 | #print"Gene Ontology/Functional Classification for set of genes\n--------------------------------------------------------\nUsage: OntologyClass [gene_identifier] [list]\n\nidentifier: Gene identifier for given input ['sym' - without quotes for gene symbols and 'gid' - without quotes for Gene ID ].\nlist: Gene list for analysis.\n\nAuthor: Santhilal Subhash\nsanthilal.subhash\@gu.se\nLast Updated: 2013 July 14\n"
 20 | ##print"Gene Ontology/Functional Classification for set of genes\n--------------------------------------------------------\nUsage: OntologyClass [identifier] [list] [OUTPATH]\n\nidentifier: Gene identifier for given input ['sym' - without quotes for gene symbols and 'gid' - without quotes for Gene ID ].\nlist: Gene list for analysis.\nOUTPATH: The path where output file should be stored (This script generates output named YOUR_INPUT_FILE_functional_classification.tsv in the mentioned path).\n\nAuthor: Santhilal Subhash\nsanthilal.subhash\@gu.se\nLast Updated: 2014 February 6\n"
 21 | ##}
 22 | 
 23 | #### GeneOntology DB ####
 24 | if($ARGV[0] eq "sym" && $ARGV[3] eq "GO_all")
 25 | {
 26 | $mytype="Gene Symbol";
 27 | open(IN1,"$ARGV[5]/annotation/gene_association.grouped.annotated140122_new.txt") or die "Error opening in file";
 28 | }
 29 | if($ARGV[0] eq "gid" && $ARGV[3] eq "GO_all")
 30 | {
 31 | $mytype="Entrez GeneID";
 32 | open(IN1,"$ARGV[5]/annotation/gene_association.grouped.annotated_RplcdIDs140122_new.txt") or die "Error opening in file";
 33 | }
 34 | 
 35 | 
 36 | if($ARGV[0] eq "sym" && $ARGV[3] eq "GO_BP")
 37 | {
 38 | $mytype="Gene Symbol";
 39 | open(IN1,"$ARGV[5]/annotation/gene_association.grouped.annotated140122_new_bp.txt") or die "Error opening in file";
 40 | }
 41 | if($ARGV[0] eq "gid" && $ARGV[3] eq "GO_BP")
 42 | {
 43 | $mytype="Entrez GeneID";
 44 | open(IN1,"$ARGV[5]/annotation/gene_association.grouped.annotated_RplcdIDs140122_new_bp.txt") or die "Error opening in file";
 45 | }
 46 | 
 47 | if($ARGV[0] eq "sym" && $ARGV[3] eq "GO_MF")
 48 | {
 49 | $mytype="Gene Symbol";
 50 | open(IN1,"$ARGV[5]/annotation/gene_association.grouped.annotated140122_new_mf.txt") or die "Error opening in file";
 51 | }
 52 | if($ARGV[0] eq "gid" && $ARGV[3] eq "GO_MF")
 53 | {
 54 | $mytype="Entrez GeneID";
 55 | open(IN1,"$ARGV[5]/annotation/gene_association.grouped.annotated_RplcdIDs140122_new_mf.txt") or die "Error opening in file";
 56 | }
 57 | 
 58 | if($ARGV[0] eq "sym" && $ARGV[3] eq "GO_CC")
 59 | {
 60 | $mytype="Gene Symbol";
 61 | open(IN1,"$ARGV[5]/annotation/gene_association.grouped.annotated140122_new_cc.txt") or die "Error opening in file";
 62 | }
 63 | if($ARGV[0] eq "gid" && $ARGV[3] eq "GO_CC")
 64 | {
 65 | $mytype="Entrez GeneID";
 66 | open(IN1,"$ARGV[5]/annotation/gene_association.grouped.annotated_RplcdIDs140122_new_cc.txt") or die "Error opening in file";
 67 | }
 68 | 
 69 | #### KEGG DB ####
 70 | if($ARGV[0] eq "sym" && $ARGV[3] eq "KEGG")
 71 | {
 72 | $mytype="Gene Symbol";
 73 | open(IN1,"$ARGV[5]/annotation/KEGG_pathway_updated130711_geneSym.txt") or die "Error opening in file";
 74 | }
 75 | if($ARGV[0] eq "gid" && $ARGV[3] eq "KEGG")
 76 | {
 77 | $mytype="Entrez GeneID";
 78 | open(IN1,"$ARGV[5]/annotation/KEGG_pathway_updated130711_geneID.txt") or die "Error opening in file";
 79 | }
 80 | 
 81 | #### REACTOME DB ####
 82 | if($ARGV[0] eq "sym" && $ARGV[3] eq "REACTOME")
 83 | {
 84 | $mytype="Gene Symbol";
 85 | open(IN1,"$ARGV[5]/annotation/ReactomePathways_updated150605_geneSym.txt") or die "Error opening in file";
 86 | }
 87 | if($ARGV[0] eq "gid" && $ARGV[3] eq "REACTOME")
 88 | {
 89 | $mytype="Entrez GeneID";
 90 | open(IN1,"$ARGV[5]/annotation/ReactomePathways_updated150605_RplcdIDs.txt") or die "Error opening in file";
 91 | }
 92 | 
 93 | if($ARGV[0] eq "sym" && $ARGV[3] eq "NCG")
 94 | {
 95 | $mytype="Gene Symbol";
 96 | open(IN1,"$ARGV[5]/annotation/NCG4.0_annotation_Updated150605_geneSym.txt") or die "Error opening in file";
 97 | }
 98 | 
 99 | if($ARGV[0] eq "gid" && $ARGV[3] eq "NCG")
100 | {
101 | $mytype="Gene Symbol";
102 | open(IN1,"$ARGV[5]/annotation/NCG4.0_annotation_Updated150605_RplcdIDs.txt") or die "Error opening in file";
103 | }
104 | 
105 | 
106 | 
107 | open(IN2,$ARGV[1]) or print "\n***\nError opening input file: $ARGV[1]\n***\n\n";
108 | 
109 | print RESULT "Genes\tProcess\tGO:Class\tnum_of_Genes\tgene_group\tpercentage%\tP-value\tEASE (http://david.abcc.ncifcrf.gov/content.jsp?file=functional_annotation.html#fisher) \tBenjamini and Hochberg (FDR)\t Hommel singlewise process\tBonferroni single-step process\tHommel singlewise process\tHochberg step-up process\tBenjamini and Yekutieli\n";
110 | 
111 | while(<IN1>){
112 | 	chomp;
113 | 	@temp1=split("\n",$_);
114 | 	foreach $temp1(@temp1)
115 | 	{
116 | 		@ar1 = split("\t",$temp1);
117 | 		$kegg{$ar1[0]} = ["$ar1[1]","$ar1[2]"];
118 | 	}
119 | }
120 | 
121 | 
122 | 
123 | while(<IN2>){
124 | 	@temp2=split("\n",$_);
125 | 	foreach $temp2(@temp2)
126 | 	{
127 | 		@ar2 = split("\t",$temp2);
128 | 		$bp{$ar2[0]} = $ar2[0];
129 | 	}
130 | }
131 | $gene_list=@temp2;
132 | 
133 | @mykeys=();	
134 | foreach my $keykg ( keys %kegg )
135 | {
136 | 	
137 | 	@kgene=split(",",$kegg{$keykg}[1]);
138 | 	$kcount=1;
139 | 	$gcount=1;
140 | 	$gnum=0;
141 | 	@gset=();
142 | 	foreach $kgene(@kgene)
143 | 	{$knum=$kcount++;
144 | 				if(exists $bp{$kgene})
145 | 				{
146 | 					$gnum=$gcount++;
147 | 					$indgene=$bp{$kgene}.";";
148 | 					push(@gset,$indgene);
149 | 					##print RESULT $bp{$kgene}.";";
150 | 				}
151 | 				
152 | 					
153 | 	}
154 | 
155 | 
156 | if($gnum>0)
157 | {
158 | $x=$gnum;
159 | $n=$gene_list;
160 | $M=$knum; ## total genes in process
161 | $N=$ARGV[4];
162 | 
163 | 
164 | 
165 | 
166 | $fisher_value = calculateStatistic( n11=>$x,n1p=>$n,np1=>$x+$M,npp=>$N+$n);
167 | $ease_value= calculateStatistic( n11=>$x-1,n1p=>$n,np1=>$x+$M,npp=>$N+$n);
168 | 
169 | push(@new,$fisher_value);
170 | 
171 | ##print RESULT "\t$keykg\t$kegg{$keykg}[0]\t$gnum\t$knum\t$fisher_value\t$ease_value\n";
172 | $percent=(($gnum/$knum)*100);
173 | push(@finres,"@gset\t$keykg\t$kegg{$keykg}[0]\t$gnum\t$knum\t$percent\t$fisher_value\t$ease_value\t");
174 | 
175 | }
176 | 
177 | 
178 | }
179 | 
180 | $p=\@new;
181 | $bhres = BH($p);
182 | $holmres = holm($p);
183 | $bfres=bonferroni($p);
184 | $hommel=hommel($p);
185 | $hochberg=hochberg($p);
186 | $byres=BY($p);
187 | 
188 | 
189 | for($i=0;$i<=$#finres;$i++)
190 | {
191 | print RESULT $finres[$i];
192 | print RESULT @$bhres[$i]."\t".@$holmres[$i]."\t".@$bfres[$i]."\t".@$hommel[$i]."\t".@$hochberg[$i]."\t".@$byres[$i]."\n";
193 | 
194 | }
195 | 
196 | 
197 | 
198 | 
199 | 
200 | close(RESULT);
201 | print"=================\nRun successful. Check your output directory $ARGV[2] \n=================\n\nParameters used:\n\nbackground genes:\t$ARGV[4]\nIdentitiy:\t\t$mytype\nDatabase used:\t\t$ARGV[3]\nOutput file:\t\t@ARGV[2]@{myout[$#myout]}_${ARGV[3]}_functional_classification.tsv\n\t\tWARNING: Your output is not sorted with P-val/FDR.\n\n\n---------------------\n\nAuthor: Santhilal Subhash\nsanthilal.subhash\@gu.se\nLast Updated: 2015 June 05\n"
202 | 


--------------------------------------------------------------------------------
/geneSCF-master-v1.0/class/lib/Text/NSP/Measures/3D/MI/ll.pm:
--------------------------------------------------------------------------------
  1 | =head1 NAME
  2 | 
  3 | Text::NSP::Measures::3D::MI::ll - Perl module that implements Loglikelihood
  4 |                                   measure of association for trigrams.
  5 | 
  6 | =head1 SYNOPSIS
  7 | 
  8 | =head3 Basic Usage
  9 | 
 10 |   use Text::NSP::Measures::3D::MI::ll;
 11 | 
 12 |   $ll_value = calculateStatistic( n111=>10,
 13 |                                   n1pp=>40,
 14 |                                   np1p=>45,
 15 |                                   npp1=>42,
 16 |                                   n11p=>20,
 17 |                                   n1p1=>23,
 18 |                                   np11=>21,
 19 |                                   nppp=>100);
 20 | 
 21 |   if( ($errorCode = getErrorCode()))
 22 |   {
 23 |     print STDERR $errorCode." - ".getErrorMessage()."\n";
 24 |   }
 25 |   else
 26 |   {
 27 |     print getStatisticName."value for trigram is ".$ll_value."\n";
 28 |   }
 29 | 
 30 | 
 31 | =head1 DESCRIPTION
 32 | 
 33 | The log-likelihood ratio measures the deviation between the observed data
 34 | and what would be expected if <word1>, <word2> and <word3> were independent.
 35 | The higher the score, the less evidence there is in favor of concluding that
 36 | the words are independent.
 37 | 
 38 | The expected values for the internal cells are calculated by taking the
 39 | product of their associated marginals and dividing by the sample size,
 40 | for example:
 41 | 
 42 |             n1pp * np1p * npp1
 43 |    m111=   --------------------
 44 |                    nppp
 45 | 
 46 | Then the deviation between observed and expected values for each internal
 47 | cell is computed to arrive at the log-likelihood value.
 48 | 
 49 |  Log-Likelihood = 2 * [n111 * log(n111/m111) + n112 * log(n112/m112) +
 50 |            n121 * log(n121/m121) + n122 * log(n122/m122) +
 51 |            n211 * log(n211/m211) + n212 * log(n212/m212) +
 52 |            n221 * log(n221/m221) + n222 * log(n222/m222)]
 53 | 
 54 | =over
 55 | 
 56 | =cut
 57 | 
 58 | 
 59 | package Text::NSP::Measures::3D::MI::ll;
 60 | 
 61 | 
 62 | use Text::NSP::Measures::3D::MI;
 63 | use strict;
 64 | use Carp;
 65 | use warnings;
 66 | no warnings 'redefine';
 67 | require Exporter;
 68 | 
 69 | our ($VERSION, @EXPORT, @ISA);
 70 | 
 71 | @ISA  = qw(Exporter);
 72 | 
 73 | @EXPORT = qw(initializeStatistic calculateStatistic
 74 |              getErrorCode getErrorMessage getStatisticName);
 75 | 
 76 | $VERSION = '0.97';
 77 | 
 78 | =item calculateStatistic($count_values) - This method calculates
 79 | the ll value
 80 | 
 81 | INPUT PARAMS  : $count_values       .. Reference of an hash containing
 82 |                                        the count values computed by the
 83 |                                        count.pl program.
 84 | 
 85 | RETURN VALUES : $loglikelihood      .. Loglikelihood value for this trigram.
 86 | 
 87 | =cut
 88 | 
 89 | sub calculateStatistic
 90 | {
 91 |   my %values = @_;
 92 | 
 93 |   # computes and sets the observed and expected values from
 94 |   # the frequency combination values. returns 0 if there is an
 95 |   # error in the computation or the values are inconsistent.
 96 |   if( !(Text::NSP::Measures::3D::MI::getValues(\%values)) ) {
 97 |     return;
 98 |   }
 99 | 
100 |   #  Now for the actual calculation of Loglikelihood!
101 |   my $logLikelihood = 0;
102 | 
103 |   # dont want ($nxy / $mxy) to be 0 or less! flag error if so!
104 |   $logLikelihood += $n111 * Text::NSP::Measures::3D::MI::computePMI( $n111, $m111 );
105 |   $logLikelihood += $n112 * Text::NSP::Measures::3D::MI::computePMI( $n112, $m112 );
106 |   $logLikelihood += $n121 * Text::NSP::Measures::3D::MI::computePMI( $n121, $m121 );
107 |   $logLikelihood += $n122 * Text::NSP::Measures::3D::MI::computePMI( $n122, $m122 );
108 |   $logLikelihood += $n211 * Text::NSP::Measures::3D::MI::computePMI( $n211, $m211 );
109 |   $logLikelihood += $n212 * Text::NSP::Measures::3D::MI::computePMI( $n212, $m212 );
110 |   $logLikelihood += $n221 * Text::NSP::Measures::3D::MI::computePMI( $n221, $m221 );
111 |   $logLikelihood += $n222 * Text::NSP::Measures::3D::MI::computePMI( $n222, $m222 );
112 | 
113 |   return ( 2 * $logLikelihood );
114 | }
115 | 
116 | 
117 | =item getStatisticName() - Returns the name of this statistic
118 | 
119 | INPUT PARAMS  : none
120 | 
121 | RETURN VALUES : $name      .. Name of the measure.
122 | 
123 | =cut
124 | 
sub getStatisticName
{
    # Human-readable name of the measure implemented by this module.
    my $name = "Loglikelihood";
    return $name;
}
129 | 
130 | 
131 | 
132 | 1;
133 | __END__
134 | 
135 | 
136 | =back
137 | 
138 | =head1 AUTHOR
139 | 
140 | Ted Pedersen,                University of Minnesota Duluth
141 |                              E<lt>tpederse@d.umn.eduE<gt>
142 | 
143 | Satanjeev Banerjee,          Carnegie Mellon University
144 |                              E<lt>satanjeev@cmu.eduE<gt>
145 | 
146 | Amruta Purandare,            University of Pittsburgh
147 |                              E<lt>amruta@cs.pitt.eduE<gt>
148 | 
149 | Bridget Thomson-McInnes,     University of Minnesota Twin Cities
150 |                              E<lt>bthomson@d.umn.eduE<gt>
151 | 
152 | Saiyam Kohli,                University of Minnesota Duluth
153 |                              E<lt>kohli003@d.umn.eduE<gt>
154 | 
155 | =head1 HISTORY
156 | 
157 | Last updated: $Id: ll.pm,v 1.10 2011/12/23 21:59:33 btmcinnes Exp $
158 | 
159 | =head1 BUGS
160 | 
161 | 
162 | =head1 SEE ALSO
163 | 
164 |   @article{Dunning93,
165 |             author = {Dunning, T.},
166 |             title = {Accurate Methods for the Statistics of
167 |           Surprise and Coincidence},
168 |             journal = {Computational Linguistics},
169 |             volume = {19},
170 |             number = {1},
171 |             year = {1993},
172 |             pages = {61-74}
173 |             url = L<http://www.comp.lancs.ac.uk/ucrel/papers/tedstats.pdf>}
174 | 
175 |   @inproceedings{moore:2004:EMNLP,
176 |                 author    = {Moore, Robert C.},
177 |                 title     = {On Log-Likelihood-Ratios and the Significance of Rare
178 |               Events },
179 |                 booktitle = {Proceedings of EMNLP 2004},
180 |                 editor = {Dekang Lin and Dekai Wu},
181 |                 year      = 2004,
182 |                 month     = {July},
183 |                 address   = {Barcelona, Spain},
184 |                 publisher = {Association for Computational Linguistics},
185 |                 pages     = {333--340}
186 |                 url = L<http://acl.ldc.upenn.edu/acl2004/emnlp/pdf/Moore.pdf>}
187 | 
188 | L<http://groups.yahoo.com/group/ngram/>
189 | 
190 | L<http://www.d.umn.edu/~tpederse/nsp.html>
191 | 
192 | 
193 | =head1 COPYRIGHT
194 | 
195 | Copyright (C) 2000-2011, Ted Pedersen, Satanjeev Banerjee, Amruta
196 | Purandare, Bridget Thomson-McInnes and Saiyam Kohli
197 | 
198 | This program is free software; you can redistribute it and/or modify it
199 | under the terms of the GNU General Public License as published by the Free
200 | Software Foundation; either version 2 of the License, or (at your option)
201 | any later version.
202 | 
203 | This program is distributed in the hope that it will be useful, but
204 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
205 | or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
206 | for more details.
207 | 
208 | You should have received a copy of the GNU General Public License along
209 | with this program; if not, write to
210 | 
211 |     The Free Software Foundation, Inc.,
212 |     59 Temple Place - Suite 330,
213 |     Boston, MA  02111-1307, USA.
214 | 
215 | Note: a copy of the GNU General Public License is available on the web
216 | at L<http://www.gnu.org/licenses/gpl.txt> and is included in this
217 | distribution as GPL.txt.
218 | 
219 | =cut
220 | 


--------------------------------------------------------------------------------
/geneSCF-master-v1.0/class/lib/Number/FormatEng.pm:
--------------------------------------------------------------------------------
  1 | package Number::FormatEng;
  2 | 
  3 | use warnings;
  4 | use strict;
  5 | use Carp;
  6 | use POSIX;
  7 | use Scalar::Util qw(looks_like_number);
  8 | 
  9 | require Exporter;
 10 | our @ISA         = qw(Exporter);
 11 | our @EXPORT_OK   = qw(format_eng format_pref unformat_pref use_e_zero no_e_zero);
 12 | our %EXPORT_TAGS = (all => \@EXPORT_OK);
 13 | 
 14 | our $VERSION = '0.01';
 15 | 
# SI prefix for each supported power of 1000. The key is the exponent N in
# 1000**N (format_num looks prefixes up by that exponent) and the value is
# the prefix letter appended to the number. An exponent of 0 maps to the
# empty string, i.e. no prefix.
my %prefix = (
    '-8' => 'y',    '8' => 'Y',
    '-7' => 'z',    '7' => 'Z',
    '-6' => 'a',    '6' => 'E',
    '-5' => 'f',    '5' => 'P',
    '-4' => 'p',    '4' => 'T',
    '-3' => 'n',    '3' => 'G',
    '-2' => 'u',    '2' => 'M',
    '-1' => 'm',    '1' => 'k',
     '0' => '' ,
);
# Inverse lookup (prefix letter => exponent of 1000), used by unformat_pref.
my %exponent = reverse %prefix;

# When true, format_num leaves off the exponent suffix for a zero exponent
# in exponent mode ("55" instead of "55e0", "0" instead of "0e0").
# Toggled by use_e_zero() and no_e_zero().
my $no_e_zero = 1;
 30 | 
 31 | sub use_e_zero {
 32 |     $no_e_zero = 0;
 33 | }
 34 | 
 35 | sub no_e_zero {
 36 |     $no_e_zero = 1;
 37 | }
 38 | 
 39 | sub format_pref {
 40 |     return format_num(1, @_);
 41 | }
 42 | 
 43 | sub format_eng {
 44 |     return format_num(0, @_);
 45 | }
 46 | 
 47 | sub format_num {
 48 |     my $prefix_mode = shift;
 49 |     my $num         = shift;
 50 | 
 51 |     my $name = ($prefix_mode) ? 'format_pref' : 'format_eng';
 52 | 
 53 |     # Check validity of input
 54 |     unless (defined $num) {
 55 |         croak("Error: $name requires numeric input. ",
 56 |               'It seems like no input was provided or input was undefined');
 57 |     }
 58 |     unless (looks_like_number($num)) {
 59 |         croak("Error: $name requires numeric input. '$num' is not numeric");
 60 |     }
 61 | 
 62 |     if ($num == 0) {
 63 |         if ($prefix_mode or $no_e_zero) {
 64 |             return '0';
 65 |         }
 66 |         else {
 67 |             return '0e0';
 68 |         }
 69 |     }
 70 | 
 71 |     my $sign = ($num < 0) ? '-' : '';
 72 |     $num = abs $num;
 73 | 
 74 |     if ($prefix_mode) {
 75 |         if ( ($num >= 1e27) or ($num <= 1e-25) ) {
 76 |             # switch to number exponent mode
 77 |             $prefix_mode = 0;
 78 |         }
 79 |     }
 80 | 
 81 |     my $e    = floor( log($num) / log(1000) );
 82 |     my $mult = 1000**$e;
 83 |     $num = adjust($num / $mult);
 84 |     if ($prefix_mode) {
 85 |         return $sign . $num . $prefix{$e};
 86 |     }
 87 |     else {
 88 |         if ($no_e_zero and ($e == 0)) {
 89 |             return $sign . $num;
 90 |         }
 91 |         else {
 92 |             return $sign . $num . 'e' . 3*$e;
 93 |         }
 94 |     }
 95 | }
 96 | 
 97 | sub adjust {
 98 |     my $num = shift;
 99 |     if ($num < 1) {
100 |         return 1;
101 |     }
102 |     elsif (($num < 10) and ($num > 9.999_999_999)) {
103 |         return 10;
104 |     }
105 |     elsif (($num < 100) and ($num > 99.999_999_99)) {
106 |         return 100;
107 |     }
108 |     else {
109 |         return $num;
110 |     }
111 | }
112 | 
# Convert a string such as '1.23T' (a number followed by an SI prefix
# letter) into a numeric value.
#
# Input whose last character is not a known prefix is returned as given
# (after whitespace trimming), provided it looks like a number.
# Croaks on undefined, empty or non-numeric input.
sub unformat_pref {
    my ($num) = @_;

    # Check validity of input
    croak('Error: unformat_pref requires input. ',
          'It seems like no input was provided or input was undefined')
        unless defined $num;

    # Trim leading and trailing whitespace
    $num =~ s/^\s+//;
    $num =~ s/\s+$//;

    croak('Error: unformat_pref requires input. ',
          'It seems like no input was provided')
        unless length $num;

    # The final character decides whether a prefix is present.
    my $prefix = substr $num, -1;

    # No recognised prefix: the whole string must already be numeric.
    unless (exists $exponent{$prefix}) {
        croak("Error: unformat_pref input '$num' is not numeric")
            unless looks_like_number($num);
        return $num;
    }

    # Drop the prefix letter and scale what remains by its power of 1000.
    chop $num;
    croak("Error: unformat_pref input '$num' is not numeric before prefix '$prefix'")
        unless looks_like_number($num);

    return $num * (1000**$exponent{$prefix});
}
147 | 
148 | 
149 | =head1 NAME
150 | 
151 | Number::FormatEng - Format a number using engineering notation
152 | 
153 | =head1 VERSION
154 | 
155 | This document refers to Number::FormatEng version 0.01.
156 | 
157 | =head1 SYNOPSIS
158 | 
159 |     use Number::FormatEng qw(:all);
160 |     print format_eng(1234);     # prints 1.234e3
161 |     print format_pref(-0.035);  # prints -35m
162 |     unformat_pref('1.23T');     # returns 1.23e+12
163 | 
164 | =head1 DESCRIPTION
165 | 
166 | Format a number for printing using engineering notation.
167 | Engineering notation is similar to scientific notation except that
168 | the power of ten must be a multiple of three.
169 | Alternately, the number can be formatted using an International
170 | System of Units (SI) prefix representing a factor of a thousand.
171 | 
172 | =head1 SUBROUTINES
173 | 
174 | =over 4
175 | 
176 | =item format_eng($number)
177 | 
178 | Format a numeric value using engineering notation.  This function
179 | returns a string whose exponent is a multiple of 3.  Here are some examples:
180 | 
181 |     format_eng(1234);   # returns 1.234e3
182 |     format_eng(-0.03);  # returns -30e-3
183 |     format_eng(7.8e7);  # returns 78e6
184 | 
185 | In most cases, the precision is preserved.  However, rounding will occur
186 | if the number of digits is too large (system-dependent).  Keep this in
187 | mind if C<$number> is a numeric expression.  For example, the following
188 | may return a different number of digits from system to system:
189 | 
190 |     format_eng(1/3);
191 | 
192 | =item format_pref($number)
193 | 
194 | Format a numeric value using engineering notation.  This function
195 | returns a string using one of the following SI prefixes (representing a
196 | power of a thousand):
197 | 
198 |     m u n p f a z y
199 |     k M G T P E Z Y
200 | 
201 | Notice that lower-case C<u> is used instead of the Greek letter Mu.
202 | 
203 | If the number is beyond the prefix ranges (y and Y), then C<format_pref>
204 | returns the same formatted string as C<format_eng>.  In other words, it
205 | does not use an SI prefix.
206 | 
207 | Here are some examples:
208 | 
209 |     format_pref(1234);      # returns 1.234k
210 |     format_pref(-0.0004);   # returns -400u
211 |     format_pref(1.27e13);   # returns 12.7T
212 |     format_pref(7.5e60);    # returns 7.5e60
213 | 
214 | =item unformat_pref($string)
215 | 
216 | Convert a string formatted using C<format_pref> into a numeric value.
217 | Here are some examples:
218 | 
219 |     unformat_pref('1.23T'); # returns 1.23e+12
220 |     unformat_pref('-400u'); # returns -4e-4
221 |     unformat_pref(37.5);    # returns 37.5
222 | 
223 | =item use_e_zero() and no_e_zero()
224 | 
225 | By default, if the exponent is zero, C<e0> is not displayed by
226 | C<format_eng>.  To explicitly display C<e0>, use the C<use_e_zero> method.
227 | Use the C<no_e_zero> method to return to the default behavior.
228 | 
229 |     format_eng(55);     # returns 55
230 |     Number::FormatEng::use_e_zero();
231 |     format_eng(55);     # now returns 55e0
232 |     Number::FormatEng::no_e_zero();
233 |     format_eng(55);     # back to 55
234 | 
235 | =back
236 | 
237 | =head1 EXPORT
238 | 
239 | Nothing is exported by default.  Functions may be exported individually, or
240 | all functions may be exported at once, using the special tag C<:all>.
241 | 
242 | =head1 DIAGNOSTICS
243 | 
244 | Error conditions cause the program to die using C<croak> from the
245 | C<Carp.pm> Core module.
246 | 
247 | =head1 BUGS AND LIMITATIONS
248 | 
249 | There are no known bugs in this module.
250 | 
251 | =head1 SEE ALSO
252 | 
253 | Refer to the following website:
254 | 
255 | L<http://en.wikipedia.org/wiki/Engineering_notation>
256 | 
257 | =head1 AUTHOR
258 | 
259 | Gene Sullivan (gsullivan@cpan.org)
260 | 
261 | =head1 ACKNOWLEDGEMENTS
262 | 
263 | Influenced by the following PerlMonks: BrowserUk, GrandFather and repellent.
264 | 
265 | =head1 COPYRIGHT AND LICENSE
266 | 
267 | Copyright (c) 2009 Gene Sullivan.  All rights reserved.
268 | 
269 | This module is free software; you can redistribute it and/or modify
270 | it under the same terms as Perl itself.  See L<perlartistic>.
271 | 
272 | =cut
273 | 
274 | 1;
275 | 
276 | 


--------------------------------------------------------------------------------
/geneSCF-master-v1.0/class/lib/Text/NSP/Measures/2D/MI.pm:
--------------------------------------------------------------------------------
  1 | =head1 NAME
  2 | 
  3 | Text::NSP::Measures::2D::MI - Perl module that provides error checks
  4 |                               for Loglikelihood, Total Mutual
  5 |                               Information, Pointwise Mutual Information
  6 |                               and Poisson-Stirling Measure.
  7 | 
  8 | =head1 SYNOPSIS
  9 | 
 10 | =head3 Basic Usage
 11 | 
 12 |   use Text::NSP::Measures::2D::MI::ll;
 13 | 
 14 |   my $npp = 60; my $n1p = 20; my $np1 = 20;  my $n11 = 10;
 15 | 
 16 |   $ll_value = calculateStatistic( n11=>$n11,
 17 |                                       n1p=>$n1p,
 18 |                                       np1=>$np1,
 19 |                                       npp=>$npp);
 20 | 
 21 |   if( ($errorCode = getErrorCode()))
 22 |   {
 23 |     print STDERR $errorCode." - ".getErrorMessage()."\n";
 24 |   }
 25 |   else
 26 |   {
 27 |     print getStatisticName."value for bigram is ".$ll_value."\n";
 28 |   }
 29 | 
 30 | =head1 DESCRIPTION
 31 | 
 32 | This module is the base class for the Loglikelihood, Total Mutual
 33 | Information and the Pointwise Mutual Information measures. All these
 34 | measures are similar. This module provides error checks specific to
 35 | these measures; it also implements the computations that are common
 36 | to these measures.
 37 | 
 38 | =over
 39 | 
 40 | =item Log-Likelihood measure is computed as
 41 | 
 42 | Log-Likelihood = 2 * [n11 * log(n11/m11) + n12 * log(n12/m12) +
 43 |                  n21 * log(n21/m21) + n22 * log(n22/m22)]
 44 | 
 45 | =item Total Mutual Information
 46 | 
 47 | TMI =   (1/npp)*[n11 * log(n11/m11)/log 2 + n12 * log(n12/m12)/log 2 +
 48 |                  n21 * log(n21/m21)/log 2 + n22 * log(n22/m22)/log 2]
 49 | 
 50 | =item Pointwise Mutual Information
 51 | 
 52 | PMI =   log (n11/m11)/log 2
 53 | 
 54 | =item Poisson Stirling Measures
 55 | 
 56 | PS =   n11*(log (n11/m11)-1)
 57 | 
 58 | =back
 59 | 
 60 | All these methods use the ratio of the observed values to expected values,
 61 | for computations, and thus have common error checks, so they have been grouped
 62 | together.
 63 | 
 64 | =head2 Methods
 65 | 
 66 | =over
 67 | 
 68 | =cut
 69 | 
 70 | 
 71 | package Text::NSP::Measures::2D::MI;
 72 | 
 73 | 
 74 | use Text::NSP::Measures::2D;
 75 | use strict;
 76 | use Carp;
 77 | use warnings;
 78 | # use subs(calculateStatistic);
 79 | require Exporter;
 80 | 
 81 | our ($VERSION, @EXPORT, @ISA);
 82 | 
 83 | @ISA  = qw(Exporter);
 84 | 
 85 | @EXPORT = qw(initializeStatistic calculateStatistic
 86 |              getErrorCode getErrorMessage getStatisticName
 87 |              $errorCodeNumber $errorMessage
 88 |              $n11 $n12 $n21 $n22 $m11 $m12 $m21 $m22
 89 |              $npp $np1 $np2 $n2p $n1p);
 90 | 
 91 | $VERSION = '1.03';
 92 | 
 93 | 
 94 | =item getValues() - This method calls the computeMarginalTotals(),
 95 | computeObservedValues() and the computeExpectedValues() methods to
 96 | compute the observed and expected values. It checks these values for
 97 | any errors that might cause the Loglikelihood, TMI & PMI measures to
 98 | fail.
 99 | 
100 | 
101 | INPUT PARAMS  : $count_values           .. Reference of an hash containing
102 |                                            the count values computed by the
103 |                                            count.pl program.
104 | 
105 | 
106 | RETURN VALUES : 1/undef           ..returns '1' to indicate success
107 |                                     and an undefined(NULL) value to indicate
108 |                                     failure.
109 | 
110 | =cut
111 | 
sub getValues
{
  my ($values)=@_;

  # Compute the marginal totals, then the observed values, then the expected
  # values. A false result from any step aborts with an empty return.
  return unless Text::NSP::Measures::2D::computeMarginalTotals($values);
  return unless Text::NSP::Measures::2D::computeObservedValues($values);
  return unless Text::NSP::Measures::2D::computeExpectedValues($values);

  # Cell position, observed value and expected value for each cell of the
  # 2x2 table, in the order in which the cells are validated.
  my @cells = (
    [ '1,1', $n11, $m11 ],
    [ '1,2', $n12, $m12 ],
    [ '2,1', $n21, $m21 ],
    [ '2,2', $n22, $m22 ],
  );

  # A non-zero observed value with a zero expected value would make the
  # ratio observed/expected undefined, so it is reported as error 211.
  foreach my $cell (@cells)
  {
    my ($position, $observed, $expected) = @$cell;
    next unless $observed;
    next unless $expected == 0;

    $errorMessage = "Expected value in cell ($position) must not be zero";
    $errorCodeNumber = 211;
    return;
  }

  # Negative expected values are reported as error 212. These checks run
  # only after all of the zero checks above have passed.
  foreach my $cell (@cells)
  {
    my ($position, undef, $expected) = @$cell;
    next unless $expected < 0;

    $errorMessage = "Expected value for cell ($position) should not be negative";
    $errorCodeNumber = 212;
    return;
  }

  #  Everything looks good so we can return 1
  return 1;
}
193 | 
194 | 
195 | 
196 | 
197 | =item computePMI() - Computes the pmi of a given observed and expected
198 | value pair.
199 | 
200 | INPUT PARAMS  : $n         ..Observed value
201 |                 $m         ..Expected value
202 | 
203 | RETURN VALUES : log(n/m)   ..the log of the ratio of
204 |                              observed value to expected
205 |                              value.
206 | 
207 | =cut
208 | 
sub computePMI
{
  my ($observed, $expected) = @_;

  # A false observed value (zero, empty or undefined) contributes nothing;
  # returning 0 here also avoids evaluating the ratio at all.
  return 0 unless $observed;

  # Natural logarithm of the observed/expected ratio.
  return log($observed / $expected);
}
223 | 
224 | 
225 | 
226 | 1;
227 | __END__
228 | 
229 | 
230 | =back
231 | 
232 | =head1 AUTHOR
233 | 
234 | Ted Pedersen,                University of Minnesota Duluth
235 |                              E<lt>tpederse@d.umn.eduE<gt>
236 | 
237 | Satanjeev Banerjee,          Carnegie Mellon University
238 |                              E<lt>satanjeev@cmu.eduE<gt>
239 | 
240 | Amruta Purandare,            University of Pittsburgh
241 |                              E<lt>amruta@cs.pitt.eduE<gt>
242 | 
243 | Bridget Thomson-McInnes,     University of Minnesota Twin Cities
244 |                              E<lt>bthompson@d.umn.eduE<gt>
245 | 
246 | Saiyam Kohli,                University of Minnesota Duluth
247 |                              E<lt>kohli003@d.umn.eduE<gt>
248 | 
249 | =head1 HISTORY
250 | 
251 | Last updated: $Id: MI.pm,v 1.27 2008/03/26 17:18:26 tpederse Exp $
252 | 
253 | =head1 BUGS
254 | 
255 | 
256 | =head1 SEE ALSO
257 | 
258 | L<http://groups.yahoo.com/group/ngram/>
259 | 
260 | L<http://www.d.umn.edu/~tpederse/nsp.html>
261 | 
262 | 
263 | =head1 COPYRIGHT
264 | 
265 | Copyright (C) 2000-2006, Ted Pedersen, Satanjeev Banerjee, Amruta
266 | Purandare, Bridget Thomson-McInnes and Saiyam Kohli
267 | 
268 | This program is free software; you can redistribute it and/or modify it
269 | under the terms of the GNU General Public License as published by the Free
270 | Software Foundation; either version 2 of the License, or (at your option)
271 | any later version.
272 | 
273 | This program is distributed in the hope that it will be useful, but
274 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
275 | or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
276 | for more details.
277 | 
278 | You should have received a copy of the GNU General Public License along
279 | with this program; if not, write to
280 | 
281 |     The Free Software Foundation, Inc.,
282 |     59 Temple Place - Suite 330,
283 |     Boston, MA  02111-1307, USA.
284 | 
285 | Note: a copy of the GNU General Public License is available on the web
286 | at L<http://www.gnu.org/licenses/gpl.txt> and is included in this
287 | distribution as GPL.txt.
288 | 
289 | =cut


--------------------------------------------------------------------------------
/geneSCF-master-v1.0/class/lib/Text/NSP/Measures/2D/Fisher/right.pm:
--------------------------------------------------------------------------------
  1 | =head1 NAME
  2 | 
  3 | Text::NSP::Measures::2D::Fisher::right - Perl module implementation of the right sided
  4 |                                          Fisher's exact test.
  5 | 
  6 | =head1 SYNOPSIS
  7 | 
  8 | =head3 Basic Usage
  9 | 
 10 |   use Text::NSP::Measures::2D::Fisher::right;
 11 | 
 12 |   my $npp = 60; my $n1p = 20; my $np1 = 20;  my $n11 = 10;
 13 | 
 14 |   $right_value = calculateStatistic( n11=>$n11,
 15 |                                       n1p=>$n1p,
 16 |                                       np1=>$np1,
 17 |                                       npp=>$npp);
 18 | 
 19 |   if( ($errorCode = getErrorCode()))
 20 |   {
 21 |     print STDERR $errorCode." - ".getErrorMessage();
 22 |   }
 23 |   else
 24 |   {
 25 |     print getStatisticName."value for bigram is ".$right_value;
 26 |   }
 27 | 
 28 | 
 29 | =head1 DESCRIPTION
 30 | 
 31 | Assume that the frequency count data associated with a bigram
 32 | <word1><word2> is stored in a 2x2 contingency table:
 33 | 
 34 |           word2   ~word2
 35 |   word1    n11      n12 | n1p
 36 |  ~word1    n21      n22 | n2p
 37 |            --------------
 38 |            np1      np2   npp
 39 | 
 40 | where n11 is the number of times <word1><word2> occur together, and
 41 | n12 is the number of times <word1> occurs with some word other than
 42 | word2, and n1p is the number of times in total that word1 occurs as
 43 | the first word in a bigram.
 44 | 
 45 | Fisher's exact tests are calculated by fixing the marginal totals
 46 | and computing the hypergeometric probabilities for all the possible
 47 | contingency tables.
 48 | 
 49 | A right sided test is calculated by adding the probabilities of all
 50 | the possible two by two contingency tables formed by fixing the
 51 | marginal totals and changing the value of n11 to greater than or
 52 | equal to the given value. A right sided Fisher's Exact Test tells us
 53 | how likely it is to randomly sample a table where n11 is greater
 54 | than or equal to the observed value. In other words, it tells us how
 55 | likely it is to sample an observation where the two words are at least
 56 | as dependent as currently observed.
 57 | 
 58 | =head2 Methods
 59 | 
 60 | =over
 61 | 
 62 | =cut
 63 | 
 64 | package Text::NSP::Measures::2D::Fisher::right;
 65 | 
 66 | 
 67 | use Text::NSP::Measures::2D::Fisher;
 68 | use strict;
 69 | use Carp;
 70 | use warnings;
 71 | no warnings 'redefine';
 72 | require Exporter;
 73 | 
 74 | our ($VERSION, @EXPORT, @ISA);
 75 | 
 76 | @ISA  = qw(Exporter);
 77 | 
 78 | @EXPORT = qw(initializeStatistic calculateStatistic
 79 |              getErrorCode getErrorMessage getStatisticName);
 80 | 
 81 | $VERSION = '0.97';
 82 | 
 83 | 
 84 | =item calculateStatistic() - This method calculates the right Fisher value
 85 | 
 86 | INPUT PARAMS  : $count_values       .. Reference of an hash containing
 87 |                                        the count values computed by the
 88 |                                        count.pl program.
 89 | 
 90 | RETURN VALUES : $right              .. Right Fisher value.
 91 | 
 92 | =cut
 93 | 
 94 | sub calculateStatistic
 95 | {
 96 |   my %values = @_;
 97 | 
 98 |   my $probabilities;
 99 |   my $left_flag = 0;
100 | 
101 |   # computes and returns the observed and marginal values from
102 |   # the frequency combination values. returns 0 if there is an
103 |   # error in the computation or the values are inconsistent.
104 |   if( !(Text::NSP::Measures::2D::Fisher::getValues(\%values)) )
105 |   {
106 |     return;
107 |   }
108 | 
109 |   my $final_limit = ($n1p < $np1) ? $n1p : $np1;
110 |   my $n11_org = $n11;
111 | 
112 |   my $n11_start = $n1p + $np1 - $npp;
113 |   if($n11_start < $n11)
114 |   {
115 |     $n11_start = $n11;
116 |   }
117 | 
118 | 
119 |   # to make the computations faster, we check which would require less computations
120 |   # computing the leftfisher value and subtracting it from 1 or directly computing
121 |   # the right fisher value. We do this since, generally for bigrams n11 is quite small
122 |   # so its much faster to compute the left Fisher value.
123 |   my $left_final_limit = $n11-1;
124 |   my $left_n11 = $n1p + $np1 - $npp;
125 |   if($left_n11<0)
126 |   {
127 |     $left_n11 = 0;
128 |   }
129 | 
130 |   # if computing the left fisher values first will take lesser amount of time them
131 |   # we set a flag for later reference and then compute the leftfisher score for
132 |   # n11-1 and then subtract the total score from one to get the right fisher value.
133 |   if(($left_final_limit - $left_n11) < ($final_limit - $n11_start))
134 |   {
135 |     $left_flag = 1;
136 |     if( !($probabilities = Text::NSP::Measures::2D::Fisher::computeDistribution($left_n11, $left_final_limit)))
137 |     {
138 |         return;
139 |     }
140 |   }
141 | 
142 |   #else we compute the value normally and simply sum to get the rightfisher value.
143 |   else
144 |   {
145 |     if( !($probabilities = Text::NSP::Measures::2D::Fisher::computeDistribution($n11_start, $final_limit)))
146 |     {
147 |         return;
148 |     }
149 |   }
150 | 
151 |   my $key_n11;
152 | 
153 |   my $rightfisher=0;
154 | 
155 |   foreach $key_n11 (sort { $b <=> $a } keys %$probabilities)
156 |   {
157 |     if($left_flag)
158 |     {
159 |       if($key_n11 >= $n11_org)
160 |       {
161 |         last;
162 |       }
163 |     }
164 |     else
165 |     {
166 |       if($key_n11 < $n11_org)
167 |       {
168 |         last;
169 |       }
170 |     }
171 |     $rightfisher += exp($probabilities->{$key_n11});
172 |   }
173 | 
174 |   # if we computed the leftfisher value to get the right fisher value, we subtract
175 |   # the sum of the probabilities for the tables from one to get the right fisher score.
176 |   if($left_flag)
177 |   {
178 |     if ($rightfisher > 1)
179 |     {
180 |       $rightfisher = 0;
181 |     }
182 |     else
183 |     {
184 |       $rightfisher = 1 - $rightfisher;
185 |     }
186 |   }
187 | 
188 |   return $rightfisher;
189 | }
190 | 
191 | 
192 | =item getStatisticName() - Returns the name of this statistic
193 | 
194 | INPUT PARAMS  : none
195 | 
196 | RETURN VALUES : $name      .. Name of the measure.
197 | 
198 | =cut
199 | 
sub getStatisticName
{
    # Human-readable name of the measure implemented by this module.
    my $name = "Right Fisher";
    return $name;
}
204 | 
205 | 
206 | 
207 | 1;
208 | __END__
209 | 
210 | =back
211 | 
212 | =head1 AUTHOR
213 | 
214 | Ted Pedersen,                University of Minnesota Duluth
215 |                              E<lt>tpederse@d.umn.eduE<gt>
216 | 
217 | Satanjeev Banerjee,          Carnegie Mellon University
218 |                              E<lt>satanjeev@cmu.eduE<gt>
219 | 
220 | Amruta Purandare,            University of Pittsburgh
221 |                              E<lt>amruta@cs.pitt.eduE<gt>
222 | 
223 | Bridget Thomson-McInnes,     University of Minnesota Twin Cities
224 |                              E<lt>bthompson@d.umn.eduE<gt>
225 | 
226 | Saiyam Kohli,                University of Minnesota Duluth
227 |                              E<lt>kohli003@d.umn.eduE<gt>
228 | 
229 | =head1 HISTORY
230 | 
231 | Last updated: $Id: right.pm,v 1.12 2006/06/21 11:10:52 saiyam_kohli Exp $
232 | 
233 | =head1 BUGS
234 | 
235 | 
236 | =head1 SEE ALSO
237 | 
238 |   @inproceedings{Pedersen96,
239 |           author = {Pedersen, T.},
240 |           title = {Fishing For Exactness},
241 |           booktitle = {Proceedings of the South Central SAS User's
242 |                       Group (SCSUG-96) Conference},
243 |           year = {1996},
244 |           pages = {188--200},
245 |           month ={October},
246 |           address = {Austin, TX}
247 |           url = L<http://www.d.umn.edu/~tpederse/pubs.html>}
248 | 
249 | L<http://groups.yahoo.com/group/ngram/>
250 | 
251 | L<http://www.d.umn.edu/~tpederse/nsp.html>
252 | 
253 | 
254 | =head1 COPYRIGHT
255 | 
256 | Copyright (C) 2000-2006, Ted Pedersen, Satanjeev Banerjee, Amruta
257 | Purandare, Bridget Thomson-McInnes and Saiyam Kohli
258 | 
259 | This program is free software; you can redistribute it and/or modify it
260 | under the terms of the GNU General Public License as published by the Free
261 | Software Foundation; either version 2 of the License, or (at your option)
262 | any later version.
263 | 
264 | This program is distributed in the hope that it will be useful, but
265 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
266 | or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
267 | for more details.
268 | 
269 | You should have received a copy of the GNU General Public License along
270 | with this program; if not, write to
271 | 
272 |     The Free Software Foundation, Inc.,
273 |     59 Temple Place - Suite 330,
274 |     Boston, MA  02111-1307, USA.
275 | 
276 | Note: a copy of the GNU General Public License is available on the web
277 | at L<http://www.gnu.org/licenses/gpl.txt> and is included in this
278 | distribution as GPL.txt.
279 | 
280 | =cut


--------------------------------------------------------------------------------
/geneSCF-master-v1.0/class/lib/Text/NSP/Measures/2D/Fisher2/right.pm:
--------------------------------------------------------------------------------
  1 | =head1 NAME
  2 | 
  3 | Text::NSP::Measures::2D::Fisher2::right - Perl module implementation of the right sided
  4 |                                           Fisher's exact test (Deprecated).
  5 | 
  6 | =head1 SYNOPSIS
  7 | 
  8 | =head3 Basic Usage
  9 | 
 10 |   use Text::NSP::Measures::2D::Fisher2::right;
 11 | 
 12 |   my $npp = 60; my $n1p = 20; my $np1 = 20;  my $n11 = 10;
 13 | 
 14 |   $right_value = calculateStatistic( n11=>$n11,
 15 |                                       n1p=>$n1p,
 16 |                                       np1=>$np1,
 17 |                                       npp=>$npp);
 18 | 
 19 |   if( ($errorCode = getErrorCode()))
 20 |   {
 21 |     print STDERR $errorCode." - ".getErrorMessage();
 22 |   }
 23 |   else
 24 |   {
 25 |     print getStatisticName()." value for bigram is ".$right_value;
 26 |   }
 27 | 
 28 | 
 29 | =head1 DESCRIPTION
 30 | 
 31 | This module provides a naive implementation of Fisher's right
 32 | sided exact test. That is, the implementation does not have any
 33 | optimizations for performance. It computes the factorials and
 34 | the hypergeometric measures using direct multiplications.
 35 | 
 36 | This measure should be used if you need exact values without any
 37 | rounding errors, and you are not worried about the performance of
 38 | the measure, otherwise use the implementations under the
 39 | Text::NSP::Measures::2D::Fisher module. To use this implementation,
 40 | you will have to specify the entire module name. Usage:
 41 | 
 42 | statistic.pl Text::NSP::Measures::2D::Fisher2::right dest.txt source.cnt
 43 | 
 44 | Assume that the frequency count data associated with a bigram
 45 | <word1><word2> is stored in a 2x2 contingency table:
 46 | 
 47 |           word2   ~word2
 48 |   word1    n11      n12 | n1p
 49 |  ~word1    n21      n22 | n2p
 50 |            --------------
 51 |            np1      np2   npp
 52 | 
 53 | where n11 is the number of times <word1><word2> occur together, and
 54 | n12 is the number of times <word1> occurs with some word other than
 55 | word2, and n1p is the number of times in total that word1 occurs as
 56 | the first word in a bigram.
 57 | 
 58 | The fishers exact tests are calculated by fixing the marginal totals
 59 | and computing the hypergeometric probabilities for all the possible
 60 | contingency tables.
 61 | 
 62 | A right sided test is calculated by adding the probabilities of all
 63 | the possible two by two contingency tables formed by fixing the
 64 | marginal totals and changing the value of n11 to greater than or
 65 | equal to the given value. A right sided Fisher's Exact Test tells us
 66 | how likely it is to randomly sample a table where n11 is at least as
 67 | large as observed. In other words, it tells us how likely it is to
 68 | sample an observation where the two words are at least as dependent
 69 | as currently observed.
 70 | 
 71 | =head2 Methods
 72 | 
 73 | =over
 74 | 
 75 | =cut
 76 | 
 77 | package Text::NSP::Measures::2D::Fisher2::right;
 78 | 
 79 | 
 80 | use Text::NSP::Measures::2D::Fisher2;
 81 | use strict;
 82 | use Carp;
 83 | use warnings;
 84 | no warnings 'redefine';
 85 | require Exporter;
 86 | 
 87 | our ($VERSION, @EXPORT, @ISA);
 88 | 
 89 | @ISA  = qw(Exporter);
 90 | 
 91 | @EXPORT = qw(initializeStatistic calculateStatistic
 92 |              getErrorCode getErrorMessage getStatisticName);
 93 | 
 94 | $VERSION = '0.97';
 95 | 
 96 | 
 97 | =item calculateStatistic() - This method computes the right sided Fishers
 98 |                              exact test.
 99 | 
100 | INPUT PARAMS  : %values             .. Hash of named count values
101 |                                        (n11, n1p, np1, npp) computed
102 |                                        by the count.pl program.
103 | 
104 | RETURN VALUES : $right              .. Right Fisher value.
105 | 
106 | =cut
107 | 
108 | sub calculateStatistic
109 | {
110 |   # Returns the right-sided Fisher's exact test value for the given
111 |   # counts, or nothing if the counts are rejected or the distribution
112 |   # cannot be computed. Counts are passed by name: n11, n1p, np1, npp.
113 |   my %values = @_;
114 | 
115 |   # getValues() checks the supplied counts and computes the observed and
116 |   # marginal values; it returns false on an error or when the counts are
117 |   # inconsistent, in which case there is nothing to report.
118 |   # NOTE(review): $n11, $n1p, $np1 and $npp are not declared in this file;
119 |   # presumably Fisher2 exports them and getValues() fills them in - confirm.
120 |   if( !(Text::NSP::Measures::2D::Fisher2::getValues(\%values)) )
121 |   {
122 |     return;
123 |   }
124 | 
125 |   # Keep a private copy of the observed count; the comparisons in the
126 |   # summing loop below are made against this copy, not against $n11.
127 |   my $n11_org = $n11;
128 | 
129 |   # With the marginal totals fixed, n11 cannot exceed the smaller of n1p
130 |   # and np1, and cannot drop below n1p + np1 - npp (nor below zero).
131 |   my $upper_bound = ($n1p < $np1) ? $n1p : $np1;
132 |   my $lower_bound = $n1p + $np1 - $npp;
133 | 
134 |   # Right tail: from the observed n11 (or the lower bound, if that is
135 |   # larger) up to the upper bound.
136 |   my $right_first = ($lower_bound < $n11) ? $n11 : $lower_bound;
137 |   my $right_last  = $upper_bound;
138 | 
139 |   # Left tail: from the lower bound (never negative) up to n11 - 1.
140 |   my $left_first = ($lower_bound < 0) ? 0 : $lower_bound;
141 |   my $left_last  = $n11 - 1;
142 | 
143 |   # Sum whichever tail covers fewer tables. When the left tail is the
144 |   # shorter one, its total is subtracted from one at the end to obtain
145 |   # the right-sided value.
146 |   my $left_flag = 0;
147 |   my ($range_first, $range_last) = ($right_first, $right_last);
148 |   if( ($left_last - $left_first) < ($right_last - $right_first) )
149 |   {
150 |     $left_flag = 1;
151 |     ($range_first, $range_last) = ($left_first, $left_last);
152 |   }
153 | 
154 |   my $probabilities =
155 |     Text::NSP::Measures::2D::Fisher2::computeDistribution($range_first,
156 |                                                           $range_last);
157 |   if( !$probabilities )
158 |   {
159 |     return;
160 |   }
161 | 
162 |   # Add up the chosen tail, largest n11 first. An entry on the wrong side
163 |   # of the observed count is skipped with "next" rather than ending the
164 |   # loop with "last": in this descending order a "last" in the left-tail
165 |   # case would stop at the first (largest) key and discard every valid
166 |   # entry that follows it.
167 |   my $rightfisher = 0;
168 |   foreach my $key_n11 (sort { $b <=> $a } keys %$probabilities)
169 |   {
170 |     if($left_flag)
171 |     {
172 |       next if $key_n11 >= $n11_org;
173 |     }
174 |     else
175 |     {
176 |       next if $key_n11 < $n11_org;
177 |     }
178 |     $rightfisher += $probabilities->{$key_n11};
179 |   }
180 | 
181 |   # The right tail was summed directly, so the total is the answer.
182 |   if( !$left_flag )
183 |   {
184 |     return $rightfisher;
185 |   }
186 | 
187 |   # Otherwise the total is the left tail and the right-sided value is its
188 |   # complement. Floating-point accumulation can push the total a hair
189 |   # above one; report 0 in that case instead of a negative probability.
190 |   if($rightfisher > 1)
191 |   {
192 |     return 0;
193 |   }
194 | 
195 |   return 1 - $rightfisher;
196 | }
197 | 
198 | 
199 | =item getStatisticName() - Returns the name of this statistic
200 | 
201 | INPUT PARAMS  : none
202 | 
203 | RETURN VALUES : $name      .. Name of the measure.
204 | 
205 | =cut
206 | 
207 | # Takes no arguments; yields the display name of this measure.
208 | sub getStatisticName {
209 |     return 'Right Fisher';
210 | }
211 | 
212 | 
213 | 
214 | 1;
215 | __END__
216 | 
217 | =back
218 | 
219 | =head1 AUTHOR
220 | 
221 | Ted Pedersen,                University of Minnesota Duluth
222 |                              E<lt>tpederse@d.umn.eduE<gt>
223 | 
224 | Satanjeev Banerjee,          Carnegie Mellon University
225 |                              E<lt>satanjeev@cmu.eduE<gt>
226 | 
227 | Amruta Purandare,            University of Pittsburgh
228 |                              E<lt>amruta@cs.pitt.eduE<gt>
229 | 
230 | Bridget Thomson-McInnes,     University of Minnesota Twin Cities
231 |                              E<lt>bthompson@d.umn.eduE<gt>
232 | 
233 | Saiyam Kohli,                University of Minnesota Duluth
234 |                              E<lt>kohli003@d.umn.eduE<gt>
235 | 
236 | =head1 HISTORY
237 | 
238 | Last updated: $Id: right.pm,v 1.10 2008/03/26 17:24:15 tpederse Exp $
239 | 
240 | =head1 BUGS
241 | 
242 | 
243 | =head1 SEE ALSO
244 | 
245 | L<http://groups.yahoo.com/group/ngram/>
246 | 
247 | L<http://www.d.umn.edu/~tpederse/nsp.html>
248 | 
249 | 
250 | =head1 COPYRIGHT
251 | 
252 | Copyright (C) 2000-2006, Ted Pedersen, Satanjeev Banerjee, Amruta
253 | Purandare, Bridget Thomson-McInnes and Saiyam Kohli
254 | 
255 | This program is free software; you can redistribute it and/or modify it
256 | under the terms of the GNU General Public License as published by the Free
257 | Software Foundation; either version 2 of the License, or (at your option)
258 | any later version.
259 | 
260 | This program is distributed in the hope that it will be useful, but
261 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
262 | or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
263 | for more details.
264 | 
265 | You should have received a copy of the GNU General Public License along
266 | with this program; if not, write to
267 | 
268 |     The Free Software Foundation, Inc.,
269 |     59 Temple Place - Suite 330,
270 |     Boston, MA  02111-1307, USA.
271 | 
272 | Note: a copy of the GNU General Public License is available on the web
273 | at L<http://www.gnu.org/licenses/gpl.txt> and is included in this
274 | distribution as GPL.txt.
275 | 
276 | =cut
277 | 


--------------------------------------------------------------------------------
/geneSCF-master-v1.0/class/lib/Text/NSP/Measures/4D/MI/ll.pm:
--------------------------------------------------------------------------------
  1 | =head1 NAME
  2 | 
  3 | Text::NSP::Measures::4D::MI::ll - Perl module that implements Loglikelihood
  4 |                                   measure of association for 4-grams.
  5 | 
  6 | =head1 SYNOPSIS
  7 | 
  8 | =head3 Basic Usage
  9 |  use Text::NSP::Measures::4D::MI::ll;
 10 | 
 11 |   $ll_value = calculateStatistic( 
 12 |                                   n1111=>8,
 13 |                                   n1ppp=>306,
 14 |                                   np1pp=>83,
 15 |                                   npp1p=>83,
 16 |                                   nppp1=>57,
 17 |                                   n11pp=>8,
 18 |                                   n1p1p=>8,
 19 |                                   n1pp1=>8,
 20 |                                   np11p=>83,
 21 |                                   np1p1=>56,
 22 |                                   npp11=>56,
 23 |                                   n111p=>8,
 24 |                                   n11p1=>8,
 25 |                                   n1p11=>8,
 26 |                                   np111=>56,
 27 |                                   npppp=>15180);
 28 | 
 29 |   if( ($errorCode = getErrorCode()))
 30 |   {
 31 |     print STDERR $errorCode." - ".getErrorMessage()."\n";
 32 |   }
 33 |   else
 34 |   {
 35 |     print getStatisticName()." value for 4-gram is ".$ll_value."\n";
 36 |   }
 37 | 
 38 | =head1 DESCRIPTION
 39 | 
 40 | The log-likelihood ratio measures the deviation between the observed data
 41 | and what would be expected if <word1>, <word2>, <word3> and <word4> were
 42 | independent. The higher the score, the less evidence there is in favor of
 43 | concluding that the words are independent.
 44 | 
 45 | The expected values for the internal cells are calculated by taking the
 46 | product of their associated marginals and dividing by the sample size,
 47 | for example:
 48 | 
 49 |             n1ppp * np1pp * npp1p * nppp1
 50 |    m1111 = -------------------------------
 51 |                        npppp ^ 3
 52 | 
 53 | Then the deviation between observed and expected values for each internal
 54 | cell is computed to arrive at the log-likelihood value.
 55 | 
 56 |   Log-Likelihood = 2 * [n1111 * log ( n1111 / m1111 ) + n1112 * log ( n1112 / m1112 ) + 
 57 |                        n1121 * log ( n1121 / m1121 ) + n1122 * log ( n1122 / m1122 ) + 
 58 |                        n1211 * log ( n1211 / m1211 ) + n1212 * log ( n1212 / m1212 ) + 
 59 |                        n1221 * log ( n1221 / m1221 ) + n1222 * log ( n1222 / m1222 ) + 
 60 |                        n2111 * log ( n2111 / m2111 ) + n2112 * log ( n2112 / m2112 ) + 
 61 |                        n2121 * log ( n2121 / m2121 ) + n2122 * log ( n2122 / m2122 ) + 
 62 |                        n2211 * log ( n2211 / m2211 ) + n2212 * log ( n2212 / m2212 ) + 
 63 |                        n2221 * log ( n2221 / m2221 ) + n2222 * log ( n2222 / m2222 )];
 64 |   
 65 | =head2 Methods
 66 | 
 67 | =over
 68 | 
 69 | =cut
 70 | 
 71 | 
 72 | package Text::NSP::Measures::4D::MI::ll;
 73 | 
 74 | 
 75 | use Text::NSP::Measures::4D::MI;
 76 | use strict;
 77 | use Carp;
 78 | use warnings;
 79 | no warnings 'redefine';
 80 | require Exporter;
 81 | 
 82 | our ($VERSION, @EXPORT, @ISA);
 83 | 
 84 | @ISA  = qw(Exporter);
 85 | 
 86 | @EXPORT = qw(initializeStatistic calculateStatistic
 87 |              getErrorCode getErrorMessage getStatisticName);
 88 | 
 89 | $VERSION = '0.97';
 90 | 
 91 | =item calculateStatistic($count_values) - This method calculates
 92 | the ll value
 93 | 
 94 | INPUT PARAMS  : %values             .. Hash of named count values
 95 |                                        (n1111, n1ppp, ..., npppp) computed
 96 |                                        by the count.pl program.
 97 | 
 98 | RETURN VALUES : $loglikelihood      .. Loglikelihood value for this 4-gram.
 99 | 
100 | =cut
101 | 
102 | sub calculateStatistic {
103 |   my %values = @_;
104 | 
105 |   # getValues() computes and sets the observed and expected cell values
106 |   # from the supplied frequency counts. A false result means an error or
107 |   # inconsistent counts, and then nothing is returned.
108 |   return
109 |     unless Text::NSP::Measures::4D::MI::getValues(\%values);
110 | 
111 |   # Observed/expected pairs for the sixteen internal cells, in the order
112 |   # in which their terms are accumulated.
113 |   my @cells = (
114 |     [ $n1111, $m1111 ], [ $n1112, $m1112 ],
115 |     [ $n1121, $m1121 ], [ $n1122, $m1122 ],
116 |     [ $n1211, $m1211 ], [ $n1212, $m1212 ],
117 |     [ $n1221, $m1221 ], [ $n1222, $m1222 ],
118 |     [ $n2111, $m2111 ], [ $n2112, $m2112 ],
119 |     [ $n2121, $m2121 ], [ $n2122, $m2122 ],
120 |     [ $n2211, $m2211 ], [ $n2212, $m2212 ],
121 |     [ $n2221, $m2221 ], [ $n2222, $m2222 ],
122 |   );
123 | 
124 |   # Every cell contributes observed * computePMI(observed, expected). The
125 |   # ratio is not checked here; presumably computePMI() guards it - confirm.
126 |   my $sum = 0;
127 |   foreach my $cell (@cells) {
128 |     my ($observed, $expected) = @{$cell};
129 |     $sum += $observed
130 |           * Text::NSP::Measures::4D::MI::computePMI($observed, $expected);
131 |   }
132 | 
133 |   # The statistic is twice the accumulated sum.
134 |   return ( 2 * $sum );
135 | }
136 | 
137 | 
138 | =item getStatisticName() - Returns the name of this statistic
139 | 
140 | INPUT PARAMS  : none
141 | 
142 | RETURN VALUES : $name      .. Name of the measure.
143 | 
144 | =cut
145 | 
146 | # Takes no arguments; yields the display name of this measure.
147 | sub getStatisticName {
148 |     return 'Loglikelihood';
149 | }
150 | 
151 | 
152 | 
153 | 1;
154 | __END__
155 | 
156 | 
157 | =back
158 | 
159 | =head1 AUTHOR
160 | 
161 | Ted Pedersen,                University of Minnesota Duluth
162 |                              E<lt>tpederse@d.umn.eduE<gt>
163 | 
164 | Satanjeev Banerjee,          Carnegie Mellon University
165 |                              E<lt>satanjeev@cmu.eduE<gt>
166 | 
167 | Amruta Purandare,            University of Pittsburgh
168 |                              E<lt>amruta@cs.pitt.eduE<gt>
169 | 
170 | Bridget Thomson-McInnes,     University of Minnesota Twin Cities
171 |                              E<lt>bthomson@cs.umn.eduE<gt>
172 | 
173 | Saiyam Kohli,                University of Minnesota Duluth
174 |                              E<lt>kohli003@d.umn.eduE<gt>
175 | 
176 | =head1 HISTORY
177 | 
178 | Last updated: $Id: ll.pm,v 1.1 2008/11/22 18:53:13 btmcinnes Exp $
179 | 
180 | =head1 BUGS
181 | 
182 | 
183 | =head1 SEE ALSO
184 | 
185 |   @article{Dunning93,
186 |             author = {Dunning, T.},
187 |             title = {Accurate Methods for the Statistics of
188 |           Surprise and Coincidence},
189 |             journal = {Computational Linguistics},
190 |             volume = {19},
191 |             number = {1},
192 |             year = {1993},
193 |             pages = {61-74}
194 |             url = L<http://www.comp.lancs.ac.uk/ucrel/papers/tedstats.pdf>}
195 | 
196 |   @inproceedings{moore:2004:EMNLP,
197 |                 author    = {Moore, Robert C.},
198 |                 title     = {On Log-Likelihood-Ratios and the Significance of Rare
199 |               Events },
200 |                 booktitle = {Proceedings of EMNLP 2004},
201 |                 editor = {Dekang Lin and Dekai Wu},
202 |                 year      = 2004,
203 |                 month     = {July},
204 |                 address   = {Barcelona, Spain},
205 |                 publisher = {Association for Computational Linguistics},
206 |                 pages     = {333--340}
207 |                 url = L<http://acl.ldc.upenn.edu/acl2004/emnlp/pdf/Moore.pdf>}
208 | 
209 | L<http://groups.yahoo.com/group/ngram/>
210 | 
211 | L<http://www.d.umn.edu/~tpederse/nsp.html>
212 | 
213 | 
214 | =head1 COPYRIGHT
215 | 
216 | Copyright (C) 2000-2006, Ted Pedersen, Satanjeev Banerjee, Amruta
217 | Purandare, Bridget Thomson-McInnes and Saiyam Kohli
218 | 
219 | This program is free software; you can redistribute it and/or modify it
220 | under the terms of the GNU General Public License as published by the Free
221 | Software Foundation; either version 2 of the License, or (at your option)
222 | any later version.
223 | 
224 | This program is distributed in the hope that it will be useful, but
225 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
226 | or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
227 | for more details.
228 | 
229 | You should have received a copy of the GNU General Public License along
230 | with this program; if not, write to
231 | 
232 |     The Free Software Foundation, Inc.,
233 |     59 Temple Place - Suite 330,
234 |     Boston, MA  02111-1307, USA.
235 | 
236 | Note: a copy of the GNU General Public License is available on the web
237 | at L<http://www.gnu.org/licenses/gpl.txt> and is included in this
238 | distribution as GPL.txt.
239 | 
240 | =cut
241 | 


--------------------------------------------------------------------------------