├── geneSCF-master-v1.0 ├── test │ ├── output │ │ └── .geneSCF │ ├── sample_gene_list_id │ └── sample_gene_list_sym ├── test_geneSCF ├── class │ ├── lib │ │ ├── Statistics │ │ │ └── Multtest.pm │ │ ├── List │ │ │ └── Vectorize │ │ │ │ └── lib │ │ │ │ ├── Statistic.pl │ │ │ │ ├── Datatype.pl │ │ │ │ ├── Set.pl │ │ │ │ ├── Apply.pl │ │ │ │ └── IO.pl │ │ ├── Tie-IxHash-1.23 │ │ │ ├── t │ │ │ │ ├── pod.t │ │ │ │ ├── each-delete.t │ │ │ │ └── ixhash.t │ │ │ ├── MANIFEST │ │ │ ├── Build.PL │ │ │ ├── META.yml │ │ │ ├── Changes │ │ │ ├── META.json │ │ │ ├── README │ │ │ └── Makefile.PL │ │ ├── Text │ │ │ ├── NSP.pm │ │ │ └── NSP │ │ │ │ └── Measures │ │ │ │ ├── 2D │ │ │ │ ├── Dice │ │ │ │ │ ├── jaccard.pm │ │ │ │ │ └── dice.pm │ │ │ │ ├── Dice.pm │ │ │ │ ├── CHI │ │ │ │ │ ├── tscore.pm │ │ │ │ │ ├── x2.pm │ │ │ │ │ └── phi.pm │ │ │ │ ├── odds.pm │ │ │ │ ├── MI │ │ │ │ │ ├── ps.pm │ │ │ │ │ ├── tmi.pm │ │ │ │ │ ├── pmi.pm │ │ │ │ │ └── ll.pm │ │ │ │ ├── Fisher │ │ │ │ │ ├── twotailed.pm │ │ │ │ │ ├── left.pm │ │ │ │ │ └── right.pm │ │ │ │ ├── Fisher2 │ │ │ │ │ ├── twotailed.pm │ │ │ │ │ ├── left.pm │ │ │ │ │ └── right.pm │ │ │ │ ├── CHI.pm │ │ │ │ └── MI.pm │ │ │ │ ├── 3D │ │ │ │ └── MI │ │ │ │ │ ├── pmi.pm │ │ │ │ │ ├── ps.pm │ │ │ │ │ ├── tmi.pm │ │ │ │ │ └── ll.pm │ │ │ │ └── 4D │ │ │ │ └── MI │ │ │ │ └── ll.pm │ │ └── Number │ │ │ └── FormatEng.pm │ └── functional_class.pl ├── README.txt └── geneSCF ├── geneSCF-master-v1.1 └── README.txt └── README.md /geneSCF-master-v1.0/test/output/.geneSCF: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /geneSCF-master-v1.0/test_geneSCF: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ./geneSCF -i=./test/sample_gene_list_sym -db=GO_BP -o=./test/output -t=sym 3 | -------------------------------------------------------------------------------- 
/geneSCF-master-v1.0/class/lib/Statistics/Multtest.pm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/decodebiology/geneSCF_inactive/HEAD/geneSCF-master-v1.0/class/lib/Statistics/Multtest.pm -------------------------------------------------------------------------------- /geneSCF-master-v1.0/class/lib/List/Vectorize/lib/Statistic.pl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/decodebiology/geneSCF_inactive/HEAD/geneSCF-master-v1.0/class/lib/List/Vectorize/lib/Statistic.pl -------------------------------------------------------------------------------- /geneSCF-master-v1.0/class/lib/Tie-IxHash-1.23/t/pod.t: -------------------------------------------------------------------------------- 1 | #!perl -T 2 | 3 | use Test::More; 4 | eval "use Test::Pod 1.14"; 5 | plan skip_all => "Test::Pod 1.14 required for testing POD" if $@; 6 | all_pod_files_ok(); 7 | -------------------------------------------------------------------------------- /geneSCF-master-v1.0/class/lib/Tie-IxHash-1.23/MANIFEST: -------------------------------------------------------------------------------- 1 | MANIFEST 2 | README 3 | Changes 4 | Makefile.PL 5 | Build.PL 6 | lib/Tie/IxHash.pm 7 | t/ixhash.t 8 | t/each-delete.t 9 | t/pod.t 10 | META.yml 11 | META.json 12 | -------------------------------------------------------------------------------- /geneSCF-master-v1.0/class/lib/Tie-IxHash-1.23/t/each-delete.t: -------------------------------------------------------------------------------- 1 | 2 | use strict; 3 | use Test::More tests=>2; 4 | use Tie::IxHash; 5 | 6 | my $o = tie my %h, 'Tie::IxHash'; 7 | 8 | $h{a} = 1; $h{b} = 2; $h{c} = 3; $h{d} = 4; $h{e} = 5; 9 | 10 | while (my ($k) = each %h) { 11 | if ($k =~ /b|d|e/) { delete $h{$k}; } 12 | } 13 | 14 | is(scalar(keys(%h)), 2) or diag explain(\%h); 15 | is(join(',',keys(%h)), 'a,c') or diag explain(\%h); 16 | 
-------------------------------------------------------------------------------- /geneSCF-master-v1.0/class/lib/Tie-IxHash-1.23/Build.PL: -------------------------------------------------------------------------------- 1 | use strict; 2 | use Module::Build; 3 | #created by eumm-migrate.pl 4 | 5 | my $build = Module::Build->new( 6 | 'auto_configure_requires' => 0, 7 | 'module_name' => 'Tie::IxHash', 8 | 'requires' => { 9 | 'perl' => '5.005', 10 | }, 11 | 'build_requires' => { 12 | 'Test::More' => 0, 13 | }, 14 | 'meta_merge' => { 15 | 'resources' => { 16 | 'repository' => 'git://github.com/chorny/Tie-IxHash.git' 17 | } 18 | }, 19 | 'license' => 'perl', 20 | 'dist_version_from' => 'lib/Tie/IxHash.pm', 21 | ); 22 | 23 | $build->create_build_script(); 24 | -------------------------------------------------------------------------------- /geneSCF-master-v1.0/class/lib/Tie-IxHash-1.23/META.yml: -------------------------------------------------------------------------------- 1 | --- 2 | abstract: ordered associative arrays for Perl 3 | author: 4 | - Gurusamy Sarathy gsar@umich.edu 5 | build_requires: 6 | Test::More: 0 7 | dynamic_config: 1 8 | generated_by: 'Module::Build version 0.4003, CPAN::Meta::Converter version 2.112621' 9 | license: perl 10 | meta-spec: 11 | url: http://module-build.sourceforge.net/META-spec-v1.4.html 12 | version: 1.4 13 | name: Tie-IxHash 14 | provides: 15 | Tie::IxHash: 16 | file: lib/Tie/IxHash.pm 17 | version: 1.23 18 | requires: 19 | perl: 5.005 20 | resources: 21 | license: http://dev.perl.org/licenses/ 22 | repository: git://github.com/chorny/Tie-IxHash.git 23 | version: 1.23 24 | -------------------------------------------------------------------------------- /geneSCF-master-v1.1/README.txt: -------------------------------------------------------------------------------- 1 | Gene Set Clustering based on Functional annotation v1.1 2 | ---------------------------------------------------------------------------- 3 | GeneSCF v1.1 supports all 
organisms from KEGG and also from Gene Ontology. This update will have a real-time functional enrichment feature. 4 | For new updates about GeneSCF, visit hosting page: https://github.com/genescf/GeneSCF 5 | 6 | 7 | 8 | -------------------------- 9 | Cite any version of GeneSCF using: 10 | 11 | Subhash S and Kanduri C. GeneSCF: a real-time based functional enrichment tool with support for multiple organisms. 12 | BMC Bioinformatics 2016, 17:365, http://www.biomedcentral.com/1471-2105/17/365 13 | 14 | -------------------------- 15 | Author: Santhilal Subhash 16 | santhilal.subhash@gu.se 17 | Last Updated: 2020/10/03 18 | https://github.com/genescf/GeneSCF 19 | -------------------------------------------------------------------------------- /geneSCF-master-v1.0/test/sample_gene_list_id: -------------------------------------------------------------------------------- 1 | 1 2 | 2 3 | 3 4 | 4 5 | 5 6 | 6 7 | 7 8 | 8 9 | 9 10 | 10 11 | 11 12 | 12 13 | 13 14 | 14 15 | 15 16 | 16 17 | 17 18 | 18 19 | 19 20 | 20 21 | 21 22 | 22 23 | 23 24 | 24 25 | 25 26 | 26 27 | 27 28 | 28 29 | 29 30 | 30 31 | 31 32 | 32 33 | 33 34 | 34 35 | 35 36 | 36 37 | 37 38 | 38 39 | 39 40 | 40 41 | 41 42 | 42 43 | 43 44 | 44 45 | 45 46 | 46 47 | 47 48 | 48 49 | 49 50 | 50 51 | 51 52 | 52 53 | 53 54 | 54 55 | 55 56 | 56 57 | 57 58 | 58 59 | 59 60 | 60 61 | 61 62 | 62 63 | 63 64 | 64 65 | 65 66 | 66 67 | 67 68 | 68 69 | 69 70 | 70 71 | 71 72 | 72 73 | 73 74 | 74 75 | 75 76 | 76 77 | 77 78 | 78 79 | 79 80 | 80 81 | 81 82 | 82 83 | 83 84 | 84 85 | 85 86 | 86 87 | 87 88 | 88 89 | 89 90 | 90 91 | 91 92 | 92 93 | 93 94 | 94 95 | 95 96 | 96 97 | 97 98 | 98 99 | 99 100 | 100 101 | 102 | -------------------------------------------------------------------------------- /geneSCF-master-v1.0/class/lib/Tie-IxHash-1.23/Changes: -------------------------------------------------------------------------------- 1 | =head1 NAME 2 | 3 | HISTORY - release history for Tie::IxHash 4 | 5 | =head1 DESCRIPTION 6 | 7 | =over 8 8 
| 9 | =item 1.23 (24 February 2013) 10 | 11 | New method Clear() 12 | 13 | Deleting current element when doing cycle using each will work (test by OLEG, RT#82248) 14 | 15 | =item 1.22 (27 February 2010) 16 | 17 | Build.PL added 18 | 19 | Better META.yml 20 | 21 | Distribution upgrade 22 | 23 | =item 1.21 (5 January 1998) 24 | 25 | Key()/Values()/Indices() now return a single value when called with single 26 | argument (makes them useful in scalar contexts) 27 | 28 | =item 1.2 (18 February 1997) 29 | 30 | Repackaged into a tarball. 31 | 32 | Added a testsuite. 33 | 34 | C suggested by Michael De La Rue . 35 | 36 | =item 1.1 37 | 38 | Initial release (ancient). 39 | 40 | =back 41 | 42 | =cut 43 | 44 | -------------------------------------------------------------------------------- /geneSCF-master-v1.0/class/lib/Tie-IxHash-1.23/META.json: -------------------------------------------------------------------------------- 1 | { 2 | "abstract" : "ordered associative arrays for Perl", 3 | "author" : [ 4 | "Gurusamy Sarathy gsar@umich.edu" 5 | ], 6 | "dynamic_config" : 1, 7 | "generated_by" : "Module::Build version 0.4003, CPAN::Meta::Converter version 2.112621", 8 | "license" : [ 9 | "perl_5" 10 | ], 11 | "meta-spec" : { 12 | "url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec", 13 | "version" : "2" 14 | }, 15 | "name" : "Tie-IxHash", 16 | "prereqs" : { 17 | "build" : { 18 | "requires" : { 19 | "Test::More" : 0 20 | } 21 | }, 22 | "runtime" : { 23 | "requires" : { 24 | "perl" : "5.005" 25 | } 26 | } 27 | }, 28 | "provides" : { 29 | "Tie::IxHash" : { 30 | "file" : "lib/Tie/IxHash.pm", 31 | "version" : "1.23" 32 | } 33 | }, 34 | "release_status" : "stable", 35 | "resources" : { 36 | "license" : [ 37 | "http://dev.perl.org/licenses/" 38 | ], 39 | "repository" : { 40 | "url" : "git://github.com/chorny/Tie-IxHash.git" 41 | } 42 | }, 43 | "version" : "1.23" 44 | } 45 | -------------------------------------------------------------------------------- 
/geneSCF-master-v1.0/test/sample_gene_list_sym: -------------------------------------------------------------------------------- 1 | SGIP1 2 | SLC45A1 3 | NECAP2 4 | CLIC4 5 | ADC 6 | AGBL4 7 | DAB1 8 | TGFBR3 9 | DBT 10 | PRUNE 11 | RP11-550P17.5 12 | RFWD2 13 | C1orf21 14 | RP1-272L16.1 15 | LIN9 16 | C1orf159 17 | PRKCZ 18 | PRDM16 19 | ICMT 20 | CAMTA1 21 | RP5-1056L3.1 22 | PINK1 23 | PINK1-AS 24 | USP48 25 | EPHB2 26 | STMN1 27 | NUDC 28 | EYA3 29 | EPB41 30 | PUM1 31 | KHDRBS1 32 | CSMD2 33 | SFPQ 34 | THRAP3 35 | RP5-1180C18.1 36 | MACF1 37 | CCDC30 38 | PTPRF 39 | RNF220 40 | GPBP1L1 41 | TRABD2B 42 | FAF1 43 | RAB3B 44 | SCP2 45 | TCEANC2 46 | USP24 47 | FGGY 48 | NFIA 49 | USP1 50 | ITGB3BP 51 | CACHD1 52 | LEPR 53 | LRRC7 54 | NEGR1 55 | ST6GALNAC3 56 | PIGK 57 | LPHN2 58 | DDAH1 59 | CLCA4 60 | RP11-76N22.2 61 | LRRC8C 62 | FAM69A 63 | GCLM 64 | RP4-639F20.1 65 | RP11-147C23.1 66 | RP11-202K23.1 67 | NTNG1 68 | GNAI3 69 | KCNA2 70 | RAP1A 71 | RHOC 72 | PPM1J 73 | PHTF1 74 | WDR3 75 | NOTCH2 76 | RP6-206I17.1 77 | PDZK1 78 | RP11-495P10.2 79 | OTUD7B 80 | SPRR2B 81 | TPM3 82 | GBAP1 83 | SMG5 84 | FCGR2A 85 | PBX1 86 | RP11-466F5.6 87 | MROH9 88 | DNM3 89 | RP1-15D23.2 90 | TNFSF18 91 | TNN 92 | FAM5B 93 | RASAL2 94 | SOAT1 95 | ACBD6 96 | CACNA1E 97 | RGSL1 98 | SMG7 99 | PTGS2 100 | RP11-541F9.2 101 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # GeneSCF 2 | 3 | 4 | ``Update``: 2020/10/03 5 | 6 | New GitHub page is up and running for GeneSCF, https://github.com/genescf 7 | 8 | ``Update``: 2020/09 9 | 10 | New GitHub page is releasing soon for GeneSCF, https://github.com/genescf 11 | 12 | 13 | # Gene Set Clustering based on Functional annotation 14 | ---------------------------------------------------------------------------- 15 | GeneSCF from v1.1 supports all organisms from KEGG and also from Gene Ontology. 
This update will have a real-time functional enrichment feature. 16 | For new updates about GeneSCF, visit hosting website: https://github.com/genescf/GeneSCF 17 | 18 | 19 | -------------------------- 20 | # Cite any version of GeneSCF using 21 | 22 | Subhash S and Kanduri C. GeneSCF: a real-time based functional enrichment tool with support for multiple organisms. 23 | BMC Bioinformatics 2016, 17:365, http://www.biomedcentral.com/1471-2105/17/365 24 | 25 | 26 | -------------------------- 27 | # Report issues 28 | 29 | BioStars: https://www.biostars.org/p/108669/ 30 | 31 | GitHub: https://github.com/genescf/GeneSCF/issues 32 | 33 | Email: santhilalsubhash@gmail.com 34 | 35 | 36 | 37 | -------------------------- 38 | Correspondence: Santhilal Subhash 39 | 40 | santhilalsubhash@gmail.com 41 | 42 | Last Updated: 2020/10/03 43 | 44 | https://github.com/genescf/GeneSCF 45 | -------------------------------------------------------------------------------- /geneSCF-master-v1.0/class/lib/Tie-IxHash-1.23/README: -------------------------------------------------------------------------------- 1 | This is the README file for Tie::IxHash, the Perl module that 2 | implements ordered in-memory associative arrays. 3 | 4 | It requires: 5 | Perl Version 5.005 or later. 6 | 7 | If you have been led to believe that associative arrays in perl 8 | don't preserve order, and if you have ever craved for that feature, 9 | this module is for you. Simply declare a "tie" for the hash variable 10 | that you want to be order-preserving, and forget that limitation 11 | ever existed. You can do other nifty things with the tied hash object 12 | that you may be used to doing with arrays, like Push(), Pop() and 13 | Splice(). 14 | 15 | If you don't know what "tie" means, you should look at the 16 | perltie(1) manpage in a recent perl distribution, or in the 17 | index of one of the numerous books on perl. 18 | 19 | If you don't know what "perl" is, you don't need this software. 
20 | 21 | See the embedded documentation in the module file for details. 22 | 23 | Don't forget to send your comments! 24 | 25 | - Sarathy. 26 | gsar@umich.edu 27 | 28 | ----------- 29 | 30 | Installation: 31 | 32 | perl Makefile.PL 33 | make install 34 | 35 | If you run into problems due to whatever reason in running the above, 36 | simply move the file IxHash.pm over into $PERL5LIB/Tie/IxHash.pm (where 37 | $PERL5LIB stands for the place where your standard perl library files 38 | are located) and you'll be okay. 39 | 40 | -------------------------------------------------------------------------------- /geneSCF-master-v1.0/class/lib/Tie-IxHash-1.23/Makefile.PL: -------------------------------------------------------------------------------- 1 | use ExtUtils::MakeMaker; 2 | WriteMakefile1( 3 | NAME => "Tie::IxHash", 4 | VERSION_FROM => 'lib/Tie/IxHash.pm', 5 | LICENSE => 'perl', 6 | MIN_PERL_VERSION => '5.005', 7 | META_MERGE => { 8 | resources => { 9 | repository => 'git://github.com/chorny/Tie-IxHash.git', 10 | }, 11 | }, 12 | PL_FILES => {}, 13 | #SKIP => [qw(static dynamic)], 14 | #'linkext' => {LINKTYPE => '' }, 15 | #'dist' => {COMPRESS=>'gzip -9f', SUFFIX => 'gz'}, 16 | ); 17 | 18 | sub WriteMakefile1 { #Written by Alexandr Ciornii, version 0.21. Added by eumm-upgrade. 
19 | my %params=@_; 20 | my $eumm_version=$ExtUtils::MakeMaker::VERSION; 21 | $eumm_version=eval $eumm_version; 22 | die "EXTRA_META is deprecated" if exists $params{EXTRA_META}; 23 | die "License not specified" if not exists $params{LICENSE}; 24 | if ($params{BUILD_REQUIRES} and $eumm_version < 6.5503) { 25 | #EUMM 6.5502 has problems with BUILD_REQUIRES 26 | $params{PREREQ_PM}={ %{$params{PREREQ_PM} || {}} , %{$params{BUILD_REQUIRES}} }; 27 | delete $params{BUILD_REQUIRES}; 28 | } 29 | delete $params{CONFIGURE_REQUIRES} if $eumm_version < 6.52; 30 | delete $params{MIN_PERL_VERSION} if $eumm_version < 6.48; 31 | delete $params{META_MERGE} if $eumm_version < 6.46; 32 | delete $params{META_ADD} if $eumm_version < 6.46; 33 | delete $params{LICENSE} if $eumm_version < 6.31; 34 | delete $params{AUTHOR} if $] < 5.005; 35 | delete $params{ABSTRACT_FROM} if $] < 5.005; 36 | delete $params{BINARY_LOCATION} if $] < 5.005; 37 | 38 | WriteMakefile(%params); 39 | } 40 | 41 | -------------------------------------------------------------------------------- /geneSCF-master-v1.0/class/lib/Tie-IxHash-1.23/t/ixhash.t: -------------------------------------------------------------------------------- 1 | #!../perl -w 2 | use Tie::IxHash; 3 | 4 | my $TNUM = 0; 5 | print "1..26\n"; 6 | 7 | sub T { print $_[0] ? 
"ok " : "not ok ", ++$TNUM, "\n" } 8 | my %bar; 9 | my $ixh = tie (%bar, 'Tie::IxHash', 'a' => 1, 'q' => 2, 'm' => 'X', 'n' => 'Y'); 10 | #$ixh = Tie::IxHash->new('a' => 1, 'q' => 2, 'm' => 'X', n => 'Y'); 11 | $ixh->Push(e => 5, f => 6); 12 | T 'a|1|q|2|m|X|n|Y|e|5|f|6' eq join('|', %bar); 13 | $ixh->Delete('e', 'a'); 14 | T 'q|2|m|X|n|Y|f|6' eq join '|', %bar; 15 | T 'q|m|n|f' eq join '|', $ixh->Keys; 16 | T '2|X|Y|6' eq join '|', $ixh->Values; 17 | T 'm|n|f' eq join '|', $ixh->Keys(1, 2, 3); 18 | T 'X|Y|6' eq join '|', $ixh->Values(1, 2, 3); 19 | $ixh->Replace(1, 9); 20 | T 'q|2|m|9|n|Y|f|6' eq join '|', %bar; 21 | $ixh->Replace(0, 8, 'f'); 22 | T 'f|8|m|9|n|Y' eq join '|', %bar; 23 | T '2|1' eq join '|', $ixh->Indices('n', 'm'); 24 | $ixh->Push(z => 1); 25 | $ixh->SortByValue; 26 | T 'z|f|m|n' eq join '|', $ixh->Keys; 27 | $ixh->SortByKey; 28 | T 'f|m|n|z' eq join '|', $ixh->Keys; 29 | 30 | T 'm' eq $ixh->Keys(1); 31 | T 'Y' eq $ixh->Values(2); 32 | T 3 == $ixh->Indices('z'); 33 | 34 | %bar = ('a' => 9, 'c' => 6, 'z' => 7, 'f' => 1); 35 | delete $bar{'z'}; 36 | $bar{'a'} = 10; 37 | T 'a|10|c|6|f|1' eq join '|', %bar; 38 | T 'a|c|f' eq join '|', keys %bar; 39 | T '10|6|1' eq join '|', values %bar; 40 | $ixh->Reorder(sort { $bar{$a} <=> $bar{$b} } keys %bar); 41 | T 'f|c|a' eq join '|', keys %bar; 42 | $ixh->Reorder('c', 'a', 'z'); 43 | T 'c|6|a|10' eq join '|', %bar; 44 | 45 | @tmp = $ixh->Splice(0, 3, 'z' => 7, 'm' => 4); 46 | T 'c|6|a|10' eq join '|', @tmp; 47 | T 'z|7|m|4' eq join '|', %bar; 48 | $ixh->Push('m' => 8); 49 | @tmp = $ixh->Pop; 50 | T 'm|8' eq join '|', @tmp; 51 | $ixh->Push('o' => 2, 'r' => 8); 52 | T 'z|7|o|2|r|8' eq join '|', %bar; 53 | $ixh->Pop; 54 | T 'z|7|o|2' eq join '|', %bar; 55 | $ixh->Splice($ixh->Length,0,$ixh->Pop); 56 | T 'z|7|o|2' eq join '|', %bar; 57 | 58 | $ixh->Clear; 59 | T $ixh->Length == 0; 60 | 61 | -------------------------------------------------------------------------------- /geneSCF-master-v1.0/README.txt: 
-------------------------------------------------------------------------------- 1 | Gene Set Clustering based on Functional annotation 2 | ---------------------------------------------------------------------------- 3 | 4 | INSTALL: 5 | 6 | No installation required 7 | 8 | TEST DATASETS: 9 | 10 | Run command 11 | 12 | ./test_geneSCF 13 | 14 | You will get output in './test/output/' directory. 15 | 16 | 17 | USAGE: 18 | 19 | geneSCF -i= -o= -db= 20 | 21 | ========== 22 | Options: 23 | ========== 24 | 25 | [-i= | --infile=] Input file contains list of Entrez GeneIDs or OFFICIAL GENE SYMBOLS. 26 | The genes must be newline-separated (One gene per line). 27 | 28 | [-t= | --gtype=] Type of input in the provided list either Entrez GeneIDs 'gid' 29 | or OFFICIAL GENE SYMBOLS 'sym' (Without quotes, default: sym). 30 | 31 | [-db= | --database=] Database you want to find gene enrichment which is either 32 | geneontology 'GO_all' or geneontology-biological_process 33 | 'GO_BP' or geneontology-molecular_function 'GO_MF' or 34 | geneontology-cellular_components 'GO_CC' or kegg 'KEGG' or 35 | reactome 'REACTOME' or Network of Cancer Genes 'NCG' (Without quotes). 36 | 37 | [-o= | --outpath=] Path to save output file. The output will be saved in the 38 | provided existing location as 39 | {INPUT_FILE_NAME}_{database}_functional_classification.tsv 40 | (tab-separated file). Note: This tool will not create the output directory; 41 | it only writes to an existing location. 42 | 43 | [-bg= | --background=] Total background genes to consider (default : 30000). 44 | 45 | [-h | --help] For displaying this help page. 46 | 47 | 48 | 49 | -------------------------- 50 | Cite using: 51 | 52 | Subhash S and Kanduri C. GeneSCF: a real-time based functional enrichment tool with support for multiple organisms. 
53 | BMC Bioinformatics 2016, 17:365, http://www.biomedcentral.com/1471-2105/17/365 54 | 55 | 56 | -------------------------- 57 | Author: Santhilal Subhash 58 | santhilal.subhash@gu.se 59 | Last Updated: 2015 June 05 60 | -------------------------------------------------------------------------------- /geneSCF-master-v1.0/class/lib/List/Vectorize/lib/Datatype.pl: -------------------------------------------------------------------------------- 1 | 2 | # description: is the scalar a number 3 | sub is_numberic { 4 | my $value = $_[0]; 5 | if($value =~/^-?\d+\.?\d*$/) { 6 | return 1; 7 | } else { 8 | return 0; 9 | } 10 | } 11 | 12 | # description: is the scalar a array reference 13 | sub is_array_ref { 14 | if($_[0] and ref($_[0]) 15 | and ref($_[0]) eq "ARRAY") { 16 | return 1; 17 | } 18 | else { 19 | return 0; 20 | } 21 | } 22 | 23 | # description: is the scalar a hash reference 24 | sub is_hash_ref { 25 | if($_[0] and ref($_[0]) 26 | and ref($_[0]) eq "HASH") { 27 | return 1; 28 | } 29 | else { 30 | return 0; 31 | } 32 | } 33 | 34 | # description: is the scalar a scalar reference 35 | sub is_scalar_ref { 36 | if($_[0] and ref($_[0]) 37 | and ref($_[0]) eq "SCALAR") { 38 | return 1; 39 | } 40 | else { 41 | return 0; 42 | } 43 | } 44 | 45 | # description: is the scalar a subroutiine reference 46 | sub is_code_ref { 47 | if($_[0] and ref($_[0]) 48 | and ref($_[0]) eq "CODE") { 49 | return 1; 50 | } 51 | else { 52 | return 0; 53 | } 54 | } 55 | 56 | # description: is the scalar a typeglob reference 57 | sub is_glob_ref { 58 | if($_[0] and ref($_[0]) 59 | and ref($_[0]) eq "GLOB") { 60 | return 1; 61 | } 62 | else { 63 | return 0; 64 | } 65 | } 66 | 67 | # description: is the scalar a reference reference 68 | sub is_ref_ref { 69 | if($_[0] and ref($_[0]) 70 | and ref($_[0]) eq "REF") { 71 | return 1; 72 | } 73 | else { 74 | return 0; 75 | } 76 | } 77 | 78 | # description: the type of a scalar 79 | sub type_of { 80 | 81 | if(ref($_[0])) { 82 | return ref($_[0])."_REF"; 
83 | } 84 | elsif(ref(\$_[0]) eq "GLOB") { 85 | return "GLOB"; 86 | } 87 | else { 88 | return "SCALAR"; 89 | } 90 | } 91 | 92 | 1; 93 | -------------------------------------------------------------------------------- /geneSCF-master-v1.0/geneSCF: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | for i in "$@" 3 | do 4 | case $i in 5 | -t=*|--gtype=*) 6 | GTYPE="${i#*=}" 7 | 8 | ;; 9 | -i=*|--infile=*) 10 | INFILE="${i#*=}" 11 | ;; 12 | -o=*|--outpath=*) 13 | OUTPATH="${i#*=}" 14 | ;; 15 | -db=*|--database=*) 16 | database="${i#*=}" 17 | ;; 18 | -bg=*|--background=*) 19 | background="${i#*=}" 20 | ;; 21 | -h*|--help) 22 | echo -e "\n\nUSAGE: \n\ngeneSCF -i= -o= -db=\n\n==========\nOptions:\n==========\n\n[-i= | --infile=]\tInput file contains list of Entrez GeneIDs or OFFICIAL GENE SYMBOLS.\n\t\t\tThe genes must be new lines seperated (One gene per line).\n\n[-t= | --gtype=]\tType of input in the provided list either Entrez GeneIDs 'gid'\n\t\t\tor OFFICIAL GENE SYMBOLS 'sym' (Without quotes, default: sym).\n\n[-db= | --database=]\tDatabase you want to find gene enrichment which is either \n\t\t\tgeneontology 'GO_all' or geneontology-biological_process \n\t\t\t'GO_BP' or geneontology-molecular_function 'GO_MF' or \n\t\t\tgeneontology-cellular_components 'GO_CC' or kegg 'KEGG' or \n\t\t\treactome 'REACTOME' or Network of Cancer Genes 'NCG' (Without quotes).\n\n[-o= | --outpath=]\tExisting directory to save output file. 
The output will be with saved in the \n\t\t\tprovided location as {INPUT_FILE_NAME}_{database}_functional_classification.tsv \n\t\t\t(tab-seperated file).\n\n[-bg= | --background=]\tTotal background genes to consider (default : 30000).\n\n[-h | --help]\t\tFor displaying this help page.\n"; 23 | exit 1; 24 | ;; 25 | *) 26 | # unknown option 27 | ;; 28 | esac 29 | done 30 | 31 | if [ $# -eq 0 ]; then 32 | echo -e "\n\nPlease use:\n\ngeneSCF -h \n\n(or)\n\ngeneSCF --help \n\n for help\n"; 33 | exit 1; 34 | fi 35 | if [ -z $database ]; then 36 | echo "Please specify one of these databses GO_all,GO_BP,GO_MF,GO_CC,KEGG,NCG,REACTOME"; 37 | exit 1; 38 | fi 39 | 40 | if [ -z $GTYPE ]; then 41 | 42 | GTYPE="sym"; 43 | 44 | fi 45 | 46 | if [ -z $background ]; then 47 | 48 | background=30000; 49 | 50 | fi 51 | 52 | if [ -z $INFILE ]; then 53 | 54 | echo "Input file missing"; 55 | 56 | exit 1; 57 | 58 | fi 59 | 60 | if [ -z $OUTPATH ]; then 61 | 62 | echo "Please specify out put path"; 63 | 64 | exit 1; 65 | 66 | fi 67 | 68 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 69 | DT=`/bin/date` 70 | echo "processing started....$DT" 71 | 72 | perl ${DIR}/class/functional_class.pl ${GTYPE} ${INFILE} ${OUTPATH} ${database} ${background} ${DIR} 73 | 74 | DT=`/bin/date` 75 | echo "$DT finished processing" 76 | -------------------------------------------------------------------------------- /geneSCF-master-v1.0/class/lib/List/Vectorize/lib/Set.pl: -------------------------------------------------------------------------------- 1 | 2 | # usage: intersect( [ARRAY REF], [ARRAY REF], ... 
) 3 | # return: ARRAY REF 4 | sub intersect { 5 | 6 | check_prototype(@_, '(\@)+'); 7 | 8 | if(scalar(@_) < 2) { 9 | return $_[0]; 10 | } 11 | 12 | my $set1 = shift; 13 | my $set2 = shift; 14 | my @remain_set = @_; 15 | 16 | # if set1 or set2 is empty 17 | if(is_empty($set1) or is_empty($set2)) { 18 | return []; 19 | } 20 | 21 | my $hash2; 22 | for (@$set2) { 23 | $hash2->{$_} = 1; 24 | } 25 | 26 | $set1 = unique($set1); 27 | 28 | my $intersect; 29 | for (@$set1) { 30 | push(@$intersect, $_) if($hash2->{$_}); 31 | } 32 | 33 | if($intersect) { 34 | $intersect = intersect($intersect, @remain_set); 35 | } 36 | else { 37 | return []; 38 | } 39 | 40 | return $intersect; 41 | } 42 | 43 | # usage: union( [ARRAY REF], [ARRAY REF] ) 44 | # return: ARRAY REF 45 | sub union { 46 | 47 | check_prototype(@_, '(\@)+'); 48 | 49 | if(scalar(@_) < 2) { 50 | return $_[0]; 51 | } 52 | 53 | my $set1 = shift; 54 | my $set2 = shift; 55 | my @remain_set = @_; 56 | 57 | $set1 = unique($set1); 58 | $set2 = unique($set2); 59 | 60 | my $hash1; 61 | my $union = $set1; 62 | for (@$set1) { 63 | $hash1->{$_} = 1; 64 | } 65 | 66 | for (@$set2) { 67 | push(@$union, $_) if(! 
$hash1->{$_}); 68 | } 69 | 70 | $union = union($union, @remain_set); 71 | 72 | return $union; 73 | } 74 | 75 | # usage: complement( [ARRAY REF], [ARRAY REF] ) 76 | # return: ARRAY REF 77 | # set1 - set2 78 | sub setdiff { 79 | 80 | check_prototype(@_, '\@\@'); 81 | 82 | my $set1 = shift; 83 | my $set2 = shift; 84 | 85 | my $hash2; 86 | foreach (@$set2) { 87 | $hash2->{$_} = 1; 88 | } 89 | 90 | my $diff; 91 | foreach (@$set1) { 92 | push(@$diff, $_) unless($hash2->{$_}); 93 | } 94 | return $diff; 95 | } 96 | 97 | # usage: setequal( [ARRAY REF], [ARRAY REF] ) 98 | # return: 1|0 99 | sub setequal { 100 | 101 | check_prototype(@_, '\@\@'); 102 | 103 | my $set1 = shift; 104 | my $set2 = shift; 105 | 106 | my $unique_set1 = unique($set1); 107 | my $unique_set2 = unique($set2); 108 | my $union = union($set1, $set2); 109 | 110 | if(len($unique_set1) == len($unique_set2) 111 | and len($unique_set1) == len($union)) { 112 | return 1; 113 | } 114 | else { 115 | return 0; 116 | } 117 | } 118 | 119 | # usage: is_element( [SCALAR], [ARRAY REF]) 120 | # return 0|1 121 | sub is_element { 122 | 123 | check_prototype(@_, '$\@'); 124 | 125 | my $item = shift; 126 | my $set = shift; 127 | 128 | for(my $i = 0; $i < len($set); $i ++) { 129 | if(is_numberic($set->[$i]) and is_numberic($item) 130 | and abs($set->[$i] - $item) < EPS) { 131 | return 1; 132 | } 133 | elsif($set->[$i] eq $item) { 134 | return 1; 135 | } 136 | } 137 | return 0; 138 | } 139 | 140 | 1; 141 | -------------------------------------------------------------------------------- /geneSCF-master-v1.0/class/lib/List/Vectorize/lib/Apply.pl: -------------------------------------------------------------------------------- 1 | 2 | 3 | sub sapply { 4 | 5 | check_prototype(@_, '\@\&'); 6 | 7 | my $array = shift; 8 | my $function = shift; 9 | 10 | my $sapply = []; 11 | @$sapply = map { my $scalar = $function->($_); 12 | $scalar; 13 | } @$array; 14 | 15 | return $sapply; 16 | } 17 | 18 | 19 | sub mapply { 20 | 21 | 
check_prototype(@_, '(\@|$)+\&'); 22 | 23 | my $function = pop; # the last argument 24 | my @array = @_; 25 | 26 | for (0..$#array) { 27 | if(! is_array_ref($array[$_])) { 28 | $array[$_] = [$array[$_]]; 29 | } 30 | } 31 | 32 | my $length = sapply(\@array, \&len); 33 | my $max_length = max($length); 34 | 35 | my $check_length = sapply($length, sub {$max_length % $_[0] != 0}); 36 | if(sum($check_length)) { 37 | croak "ERROR: Longer object length is not a multiple of shorter object length."; 38 | } 39 | 40 | @array = @{ sapply(\@array, sub{_cycle($_[0], $max_length)}) }; 41 | 42 | my $mapply = []; 43 | for my $i (0..($max_length-1)) { 44 | my $param = sapply(\@array, sub {$_[0]->[$i]}); 45 | $mapply->[$i] = do { my $scalar = $function->(@$param); 46 | $scalar; }; 47 | } 48 | 49 | return $mapply; 50 | } 51 | 52 |

# usage: _cycle( [ARRAY REF], [SCALAR] )
# description: resize an array to $size elements.  The array reference itself
#              is returned when the size already matches, a leading subset is
#              taken when $size is smaller, and the indices wrap around
#              (index % length) when $size is larger.
#              A missing (or zero) $size falls back to the current length.
sub _cycle {
    my $array = shift;
    my $size = shift || len($array);
    my $scalar = len($array);    # current number of elements

    if($size == $scalar) {
        return $array;
    }
    elsif($size < $scalar) {
        $size --;                # convert the size to the last index
        return subset($array, [0..$size]);
    }
    else {
        $size --;
        # NOTE(review): the callback reads $_ rather than $_[0]; this assumes
        # sapply() aliases $_ to the current element - confirm against sapply.
        my $index = sapply([0..$size], sub {$_ % $scalar});
        return subset($array, $index);
    }
}


# usage: happly( [HASH REF], [CODE REF] )
# description: call the function on every value of the hash and return a new
#              hash reference that maps the same keys to the results.
#              The function is evaluated in scalar context.
sub happly {

    check_prototype(@_, '\%\&');

    my $hash = shift;
    my $function = shift;

    my $happly = {};
    foreach (keys %$hash) {
        # the do-block forces scalar context on the callback
        $happly->{$_} = do { my $scalar = $function->($hash->{$_});
                             $scalar; };
    }
    return $happly;
}


# usage: tapply( [ARRAY REF], [ARRAY REF], ... , [CODE REF] )
# description: group the elements of the first array by the labels built from
#              the category arrays, call the function once per group (the
#              group's elements are passed as a list) and return a hash
#              reference that maps each label to the scalar result.
#              Croaks when the vector and the categories differ in length.
sub tapply {

    check_prototype(@_, '\@(\@)+\&');

    my $array = shift;
    my $function = pop;
    my @category = @_;

    my $length = sapply(\@category, \&len);
    push(@$length, len($array));
    if(max($length) != min($length)) {
        croak "ERROR: Length of the vector must be equal to the length of all categories.\n";
    }

    # presumably paste() joins the categories element-wise with "|" - confirm
    my $category = paste(@category, "|");

    my $label = unique($category);
    my $tapply = {};
    for (0..$#$label) {
        my $current_label = $label->[$_];
        my $index = test($category, sub {$_[0] eq $current_label});
        $index = which($index);
        my @data = @{subset($array, $index)};
        # the do-block forces scalar context on the callback
        $tapply->{$current_label} = do { my $scalar = $function->(@data);
                                         $scalar; };
    }
    return $tapply;
}

1;
-------------------------------------------------------------------------------- /geneSCF-master-v1.0/class/lib/List/Vectorize/lib/IO.pl: --------------------------------------------------------------------------------

# ============================= IO subroutine ==============================================
# usage: print_ref( [TYPEGLOB], [SCALAR] )
# description: print the data structure of a reference
#              An optional leading typeglob selects the output handle
#              (STDOUT by default).  The reference is returned unchanged.
sub print_ref {

    check_prototype(@_, '*?($|\$|\@|\%|\&)+');

    # lexical handle: the original "local $handle" clobbered a package global
    my $handle = *STDOUT;
    if(is_glob_ref(\$_[0])) {
        $handle = shift(@_);
    }
    my $ref = shift;

    if(is_array_ref($ref)) {
        print $handle "Reference of ARRAY.\n";
        for (0..$#$ref) {
            print $handle "[$_] $ref->[$_]\n";
        }
        print $handle "\n";
    } elsif(is_hash_ref($ref)) {
        print $handle "Reference of HASH.\n";
        foreach (keys %$ref) {
            print $handle "$_\t$ref->{$_}\n";
        }
        print $handle "\n";
    } elsif(is_scalar_ref($ref)) {
        print $handle "Reference of SCALAR.\n";
        print $handle $$ref;
        print $handle "\n";
    } elsif(is_ref_ref($ref)) {
        print $handle "Reference of REF.\n";
        print $handle $$ref;
        print $handle "\n";
    } elsif(is_code_ref($ref)) {
        print $handle "Reference of CODE.\n";
    } else {
        # NOTE(review): $ref has already been shifted off, so only the
        # remaining arguments are printed here - confirm this is intended.
        print $handle "@_\n";
    }
    return $ref;
}

# usage: print_matrix( [TYPEGLOB], [SCALAR] )
# description: print the matrix
#              The dimension header, the rows (tab separated) and the trailing
#              blank line all go to the optional handle (STDOUT by default).
sub print_matrix {

    check_prototype(@_, '*?\@');

    my $handle = *STDOUT;
    if(is_glob_ref(\$_[0])) {
        $handle = shift(@_);
    }
    my $mat = $_[0];
    my $sep = "\t";

    my ($nrow, $ncol) = dim($mat);
    # fixed: the header and the closing newline used to go to STDOUT even
    # when the caller supplied another handle
    print $handle "$nrow x $ncol matrix:\n\n";

    for(my $i = 0; $i < len($mat); $i ++) {
        print $handle join $sep, @{$mat->[$i]};
        print $handle "\n";
    }
    print $handle "\n";
}

# usage: read_table( [SCALAR], %setup )
# description: read a delimited text file into a matrix (array of row arrays).
#              %setup keys: "quote" (default ""), "sep" (default "\t"),
#              "row.names" and "col.names" (both default 0).
#              In list context returns ($data, $colnames, $rownames),
#              in scalar context only $data.  Croaks if the file cannot be
#              opened.  The file is opened read-only with three-arg open, so
#              the name is never interpreted as a mode or a command.
sub read_table {

    check_prototype(@_, '$($|\@){0,}');

    my $file = shift;

    my %setup = @_;
    my $quote = $setup{"quote"} || "";
    my $sep = $setup{"sep"} || "\t";
    my $whether_rownames = $setup{"row.names"} || 0;    # if set true, first item will be key
    my $whether_colnames = $setup{"col.names"} || 0;    # if set true, first item will be key

    open my $fh, '<', $file or croak "ERROR: cannot open $file.\n";
    my $data;
    my $rownames;
    my $colnames;
    my $i_array = 0;    # number of data rows stored so far
    my $flag = 0;       # set once the header line has been consumed
    while( my $line = <$fh> ) {

        # read the column names
        if($flag == 0 and $whether_colnames) {
            chomp $line;
            $line =~s/^$quote|$quote$//g;
            @$colnames = split "$quote$sep$quote", $line;
            if($whether_rownames) {
                # the first header cell sits above the row names
                shift(@$colnames);
            }
            $flag = 1;
            next;
        }

        $i_array ++;

        chomp $line;
        $line =~s/^$quote|$quote$//g;
        my @tmp = split "$quote$sep$quote", $line;

        # read rownames
        if($whether_rownames) {
            push(@$rownames, shift(@tmp));
        }

        push(@{$data->[$i_array - 1]}, @tmp);

    }
    close $fh;

    wantarray ? ($data, $colnames, $rownames) : $data;
}

# usage: write_table( [MATRIX], %setup )
# description: write a matrix to the file named by $setup{"file"}.
#              %setup keys: "quote" (default ""), "sep" (default "\t"),
#              "col.names" and "row.names" (optional array references),
#              "file" (output path).
#              Croaks when the name vectors do not match the matrix
#              dimensions, or when the file cannot be created or closed.
sub write_table {

    check_prototype(@_, '\@($|\@){2,}');

    my $matrix = shift;

    my %setup = @_;
    my $quote = $setup{"quote"} || "";
    my $sep = $setup{"sep"} || "\t";
    my $colnames = $setup{"col.names"};     # column names
    my $rownames = $setup{"row.names"};     # row names
    my $file = $setup{"file"};

    my ($nrow, $ncol) = dim($matrix);
    if($rownames and $nrow != len($rownames)) {
        croak "ERROR: Length of rownames should be equal to the length of rows in matrix\n";
    }
    if($colnames and $ncol != len($colnames)) {
        croak "ERROR: Length of colnames should be equal to the length of columns in matrix\n";
    }

    open my $out, '>', $file or croak "ERROR: Cannot create file:$file\n";
    if($rownames) {
        if($colnames) {
            # print colnames; the leading empty quoted cell sits above the row names
            print $out "$quote$quote$sep";
            print $out join $sep, @{sapply($colnames, sub{"$quote$_$quote"})};
            print $out "\n";
        }
        for(my $i = 0; $i < len($matrix); $i ++) {
            print $out "$quote$rownames->[$i]$quote$sep";
            print $out join $sep, @{sapply($matrix->[$i], sub{"$quote$_$quote"})};
            print $out "\n";
        }
    }
    else {
        if($colnames) {
            print $out join $sep, @{sapply($colnames, sub{"$quote$_$quote"})};
            print $out "\n";
        }
        for(my $i = 0; $i < len($matrix); $i ++) {
            print $out join $sep, @{sapply($matrix->[$i], sub{"$quote$_$quote"})};
            print $out "\n";
        }
    }
    # buffered write errors only surface at close, so check it
    close $out or croak "ERROR: Cannot close file:$file\n";
    return 1;
}



1;
-------------------------------------------------------------------------------- /geneSCF-master-v1.0/class/lib/Text/NSP.pm: -------------------------------------------------------------------------------- 1 | =head1 NAME 2 | 3 | Text::NSP - Extract collocations and Ngrams from text 4 | 5 | =head1 SYNOPSIS 6 | 7 | =head2 Basic Usage 8 | 9
| use Text::NSP::Measures::2D::MI::ll; 10 | 11 | my $npp = 60; my $n1p = 20; my $np1 = 20; my $n11 = 10; 12 | 13 | $ll_value = calculateStatistic( n11=>$n11, 14 | n1p=>$n1p, 15 | np1=>$np1, 16 | npp=>$npp); 17 | 18 | if( ($errorCode = getErrorCode())) 19 | { 20 | print STDERR $errorCode." - ".getErrorMessage()."\n"; 21 | } 22 | else 23 | { 24 | print getStatisticName()." value for bigram is ".$ll_value."\n"; 25 | } 26 | 27 | =head1 DESCRIPTION 28 | 29 | The Ngram Statistics Package (NSP) is a collection of perl modules 30 | that aid in analyzing Ngrams in text files. We define an Ngram as a 31 | sequence of 'n' tokens that occur within a window of at least 'n' 32 | tokens in the text; what constitutes a "token" can be defined by the 33 | user. 34 | 35 | NSP.pm is a stub that doesn't have any real functionality. It serves 36 | as a top level module in the hierarchy and allows us to group the 37 | Text::NSP::Count and Text::NSP::Measures modules. 38 | 39 | The modules under Text::NSP::Measures implement measures of 40 | association that are used to evaluate whether the co-occurrence of the 41 | words in a Ngram is purely by chance or statistically significant. 42 | These measures compute a numerical score for Ngrams. This score can be 43 | used to decide whether or not there is enough evidence to reject the 44 | null hypothesis (that the Ngram is not statistically significant) for 45 | that Ngram. 46 | 47 | To use one of the measures you can either use the program statistic.pl 48 | provided under the utils directory, or write your own driver program. 49 | Program statistic.pl takes as input a list of Ngrams with their 50 | frequencies (in the format output by count.pl) and runs a 51 | user-selected statistical measure of association to compute the score 52 | for each Ngram. The Ngrams, along with their scores, are output in 53 | descending order of this score. For help on using utils/statistic.pl 54 | please refer to its perldoc (perldoc utils/statistic.pl).
55 | 56 | If you are writing your own driver program, a basic usage example is 57 | provided above under SYNOPSIS. For further clarification please refer 58 | to the documentation of Text::NSP::Measures (perldoc 59 | Text::NSP::Measures). 60 | 61 | 62 | =head2 Error Codes 63 | 64 | The following table describes the error codes use in the 65 | implementation, 66 | 67 | Error codes common to all the association measures. 68 | 69 | 100 - Trying to create an object of a abstract class. 70 | 71 | 200 - one of the required values is missing. 72 | 73 | 201 - one of the observed frequency comes out to be -ve. 74 | 75 | 202 - one of the frequency values(n11) exceeds the total no of 76 | bigrams(npp) or a marginal total(n1p, np1). 77 | 78 | 203 - one of the marginal totals(n1p, np1) exceeds the total bigram 79 | count(npp). 80 | 81 | 204 - one of the marginal totals is -ve. 82 | 83 | Error Codes required by the mutual information measures 84 | 85 | 211 - one of the expected values is zero. 86 | 87 | 212 - one of the expected values is -ve. 88 | 89 | 90 | Error codes required by the CHI measures. 91 | 92 | 221 - one of the expected values is zero. 
93 | 94 | =head2 Methods 95 | 96 | =over 97 | 98 | =cut 99 | 100 | package Text::NSP; 101 | 102 | use strict; 103 | use Carp; 104 | use warnings; 105 | 106 | our ($VERSION, @ISA); 107 | 108 | @ISA = qw(Exporter); 109 | 110 | $VERSION = '1.25'; 111 | 112 | 1; 113 | 114 | __END__ 115 | 116 | 117 | =back 118 | 119 | =head1 AUTHORS 120 | 121 | Ted Pedersen, University of Minnesota Duluth 122 | Etpederse at d.umn.eduE 123 | 124 | Satanjeev Banerjee, Carnegie Mellon University 125 | 126 | Amruta Purandare, University of Pittsburgh 127 | 128 | Bridget Thomson-McInnes, University of Minnesota Twin Cities 129 | 130 | Saiyam Kohli, University of Minnesota Duluth 131 | 132 | =head1 HISTORY 133 | 134 | Last updated: $Id: NSP.pm,v 1.41 2012/01/15 17:14:55 tpederse Exp $ 135 | 136 | =head1 BUGS 137 | 138 | =head1 SEE ALSO 139 | 140 | L 141 | 142 | L 143 | 144 | =head1 COPYRIGHT 145 | 146 | Copyright (C) 2000-2008, Ted Pedersen, Satanjeev Banerjee, 147 | Amruta Purandare, Bridget Thomson-McInnes and Saiyam Kohli 148 | 149 | This program is free software; you can redistribute it and/or modify 150 | it under the terms of the GNU General Public License as published by 151 | the Free Software Foundation; either version 2 of the License, or (at 152 | your option) any later version. 153 | 154 | This program is distributed in the hope that it will be useful, but 155 | WITHOUT ANY WARRANTY; without even the implied warranty of 156 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 157 | General Public License for more details. 158 | 159 | You should have received a copy of the GNU General Public License 160 | along with this program; if not, write to 161 | 162 | The Free Software Foundation, Inc., 163 | 59 Temple Place - Suite 330, 164 | Boston, MA 02111-1307, USA. 165 | 166 | Note: a copy of the GNU General Public License is available on the web 167 | at L and is included in this 168 | distribution as GPL.txt. 
169 | 170 | =cut 171 | -------------------------------------------------------------------------------- /geneSCF-master-v1.0/class/lib/Text/NSP/Measures/2D/Dice/jaccard.pm: -------------------------------------------------------------------------------- 1 | =head1 NAME 2 | 3 | Text::NSP::Measures::2D::Dice::jaccard - Perl module that implements 4 | the jaccard coefficient. 5 | 6 | =head1 SYNOPSIS 7 | 8 | =head3 Basic Usage 9 | 10 | use Text::NSP::Measures::2D::Dice::jaccard; 11 | 12 | my $npp = 60; my $n1p = 20; my $np1 = 20; my $n11 = 10; 13 | 14 | $jaccard_value = calculateStatistic( n11=>$n11, 15 | n1p=>$n1p, 16 | np1=>$np1, 17 | npp=>$npp); 18 | 19 | if( ($errorCode = getErrorCode())) 20 | { 21 | print STDERR $errorCode." - ".getErrorMessage()."\n"; 22 | } 23 | else 24 | { 25 | print getStatisticName()." value for bigram is ".$jaccard_value."\n"; 26 | } 27 | 28 | 29 | =head1 DESCRIPTION 30 | 31 | Assume that the frequency count data associated with a bigram 32 | <word1><word2> is stored in a 2x2 contingency table: 33 | 34 | word2 ~word2 35 | word1 n11 n12 | n1p 36 | ~word1 n21 n22 | n2p 37 | -------------- 38 | np1 np2 npp 39 | 40 | where n11 is the number of times <word1><word2> occur together, and 41 | n12 is the number of times <word1> occurs with some word other than 42 | word2, and n1p is the number of times in total that word1 occurs as 43 | the first word in a bigram. 44 | 45 | The Jaccard Coefficient is the ratio of the number of times the words 46 | occur together to the number of times at least one of the words 47 | occurs. It is defined as: 48 | 49 | n11 50 | --------------- 51 | n11 + n12 + n21 52 | 53 | The Jaccard coefficient can also be computed by applying a 54 | transformation to the dice coefficient: 55 | 56 | $jaccard = $dice/(2-$dice) 57 | 58 | We use this computation of jaccard in our implementation.
=head2 Methods

=over

=cut


package Text::NSP::Measures::2D::Dice::jaccard;


use Text::NSP::Measures::2D::Dice;
use strict;
use Carp;
use warnings;
no warnings 'redefine';
require Exporter;

our ($VERSION, @EXPORT, @ISA);

@ISA  = qw(Exporter);

@EXPORT = qw(initializeStatistic calculateStatistic
             getErrorCode getErrorMessage getStatisticName);

$VERSION = '0.97';


=item calculateStatistic() - method to calculate the jaccard coefficient value

INPUT PARAMS  : %values             .. List of name => value pairs holding
                                       the count values computed by the
                                       count.pl program (n11, n1p, np1, npp).

RETURN VALUES : $jaccard            .. Jaccard Coefficient value for this bigram.
                                       Nothing (undef in scalar context) is
                                       returned when the Dice coefficient
                                       could not be computed.

=cut

sub calculateStatistic
{
  my %values = @_;

  # compute the dice coefficient; computeVal() is called in scalar context
  # here and yields undef when its consistency checks fail.
  my $dice = Text::NSP::Measures::2D::Dice::computeVal(\%values);

  # Test definedness, not truth: a Dice value of 0 (the words never occur
  # together) is a valid result and must not be reported as a failure.
  if( !defined $dice )
  {
    return;
  }

  # compute the jaccard coefficient from the dice coefficient
  my $jaccard = $dice/(2-$dice);

  return ($jaccard);
}



=item getStatisticName() - Returns the name of this statistic

INPUT PARAMS  : none

RETURN VALUES : $name      .. Name of the measure.
=cut

sub getStatisticName
{
  # Fixed, human-readable label of this measure.
  my $name = "Jaccard Coefficient";
  return $name;
}



1;
__END__


=back

=head1 AUTHOR

Ted Pedersen,                University of Minnesota Duluth
                             E<lt>tpederse@d.umn.eduE<gt>

Satanjeev Banerjee,          Carnegie Mellon University
                             E<lt>satanjeev@cmu.eduE<gt>

Amruta Purandare,            University of Pittsburgh
                             E<lt>amruta@cs.pitt.eduE<gt>

Bridget Thomson-McInnes,     University of Minnesota Twin Cities
                             E<lt>bthompson@d.umn.eduE<gt>

Saiyam Kohli,                University of Minnesota Duluth
                             E<lt>kohli003@d.umn.eduE<gt>

=head1 HISTORY

Last updated: $Id: jaccard.pm,v 1.8 2006/06/21 11:10:52 saiyam_kohli Exp $

=head1 BUGS


=head1 SEE ALSO

L

L


=head1 COPYRIGHT

Copyright (C) 2000-2006, Ted Pedersen, Satanjeev Banerjee, Amruta
Purandare, Bridget Thomson-McInnes and Saiyam Kohli

This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your option)
any later version.

This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License along
with this program; if not, write to

    The Free Software Foundation, Inc.,
    59 Temple Place - Suite 330,
    Boston, MA  02111-1307, USA.

Note: a copy of the GNU General Public License is available on the web
at L and is included in this
distribution as GPL.txt.
194 | 195 | =cut -------------------------------------------------------------------------------- /geneSCF-master-v1.0/class/lib/Text/NSP/Measures/2D/Dice/dice.pm: -------------------------------------------------------------------------------- 1 | =head1 NAME 2 | 3 | Text::NSP::Measures::2D::Dice::dice - Perl module to compute Dice coefficient 4 | for bigrams. 5 | 6 | =head1 SYNOPSIS 7 | 8 | =head3 Basic Usage 9 | 10 | use Text::NSP::Measures::2D::Dice::dice; 11 | 12 | my $npp = 60; my $n1p = 20; my $np1 = 20; my $n11 = 10; 13 | 14 | $dice_value = calculateStatistic( n11=>$n11, 15 | n1p=>$n1p, 16 | np1=>$np1, 17 | npp=>$npp); 18 | 19 | if( ($errorCode = getErrorCode())) 20 | { 21 | print STDERR $errorCode." - ".getErrorMessage()."\n"; 22 | } 23 | else 24 | { 25 | print getStatisticName()." value for bigram is ".$dice_value."\n"; 26 | } 27 | 28 | 29 | =head1 DESCRIPTION 30 | 31 | Assume that the frequency count data associated with a bigram 32 | <word1><word2> is stored in a 2x2 contingency table: 33 | 34 | word2 ~word2 35 | word1 n11 n12 | n1p 36 | ~word1 n21 n22 | n2p 37 | -------------- 38 | np1 np2 npp 39 | 40 | where n11 is the number of times <word1><word2> occur together, and 41 | n12 is the number of times <word1> occurs with some word other than 42 | word2, and n1p is the number of times in total that word1 occurs as 43 | the first word in a bigram.
The Dice Coefficient is defined as :

                2 * n11
               ---------
               np1 + n1p

The Jaccard coefficient can also be computed by applying a
transformation to the dice coefficient:

    $jaccard = $dice/(2-$dice)

=head2 Methods

=over

=cut


package Text::NSP::Measures::2D::Dice::dice;


use Text::NSP::Measures::2D::Dice;
use strict;
use Carp;
use warnings;
no warnings 'redefine';
require Exporter;

our ($VERSION, @EXPORT, @ISA);

@ISA  = qw(Exporter);

@EXPORT = qw(initializeStatistic calculateStatistic
             getErrorCode getErrorMessage getStatisticName);

$VERSION = '0.97';


=item calculateStatistic() - method to calculate the dice coefficient value

INPUT PARAMS  : %values             .. List of name => value pairs holding
                                       the count values computed by the
                                       count.pl program.

RETURN VALUES : $dice               .. Dice Coefficient value for this bigram.

=cut

sub calculateStatistic
{
  # Gather the name => value pairs into a hash; computeVal() takes them
  # by reference and its result is handed straight back to the caller.
  my %count_for = @_;
  my $counts_ref = \%count_for;

  return Text::NSP::Measures::2D::Dice::computeVal($counts_ref);
}


=item getStatisticName() - Returns the name of this statistic

INPUT PARAMS  : none

RETURN VALUES : $name      .. Name of the measure.
=cut

sub getStatisticName
{
  # Fixed, human-readable label of this measure; arguments are ignored.
  my $name = "Dice Coefficient";
  return $name;
}



1;
__END__


=back

=head1 AUTHOR

Ted Pedersen,                University of Minnesota Duluth
                             E<lt>tpederse@d.umn.eduE<gt>

Satanjeev Banerjee,          Carnegie Mellon University
                             E<lt>satanjeev@cmu.eduE<gt>

Amruta Purandare,            University of Pittsburgh
                             E<lt>amruta@cs.pitt.eduE<gt>

Bridget Thomson-McInnes,     University of Minnesota Twin Cities
                             E<lt>bthompson@d.umn.eduE<gt>

Saiyam Kohli,                University of Minnesota Duluth
                             E<lt>kohli003@d.umn.eduE<gt>

=head1 HISTORY

Last updated: $Id: dice.pm,v 1.6 2006/06/21 11:10:52 saiyam_kohli Exp $

=head1 BUGS


=head1 SEE ALSO

  @article{SmadjaMH96,
          author = {Smadja, F. and McKeown, K. and Hatzivassiloglou, V.},
          title = {Translating Collocations for Bilingual Lexicons: A
                   Statistical Approach},
          journal = {Computational Linguistics},
          volume = {22},
          number = {1},
          year = {1996},
          pages = {1-38}
          url = L}

L

L


=head1 COPYRIGHT

Copyright (C) 2000-2006, Ted Pedersen, Satanjeev Banerjee, Amruta
Purandare, Bridget Thomson-McInnes and Saiyam Kohli

This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your option)
any later version.

This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.
180 | 181 | You should have received a copy of the GNU General Public License along 182 | with this program; if not, write to 183 | 184 | The Free Software Foundation, Inc., 185 | 59 Temple Place - Suite 330, 186 | Boston, MA 02111-1307, USA. 187 | 188 | Note: a copy of the GNU General Public License is available on the web 189 | at L and is included in this 190 | distribution as GPL.txt. 191 | 192 | =cut -------------------------------------------------------------------------------- /geneSCF-master-v1.0/class/lib/Text/NSP/Measures/2D/Dice.pm: -------------------------------------------------------------------------------- 1 | =head1 NAME 2 | 3 | Text::NSP::Measures::2D::Dice - Perl module that provides the 4 | framework to implement the Dice and 5 | Jaccard coefficients. 6 | 7 | =head1 SYNOPSIS 8 | 9 | =head3 Basic Usage 10 | 11 | use Text::NSP::Measures::2D::Dice::dice; 12 | 13 | my $npp = 60; my $n1p = 20; my $np1 = 20; my $n11 = 10; 14 | 15 | $dice_value = calculateStatistic( n11=>$n11, 16 | n1p=>$n1p, 17 | np1=>$np1, 18 | npp=>$npp); 19 | 20 | if( ($errorCode = getErrorCode())) 21 | { 22 | print STDERR $errorCode." - ".getErrorMessage()."\n"; 23 | } 24 | else 25 | { 26 | print getStatisticName()." value for bigram is ".$dice_value."\n"; 27 | } 28 | 29 | 30 | =head1 DESCRIPTION 31 | 32 | Assume that the frequency count data associated with a bigram 33 | <word1><word2> is stored in a 2x2 contingency table: 34 | 35 | word2 ~word2 36 | word1 n11 n12 | n1p 37 | ~word1 n21 n22 | n2p 38 | -------------- 39 | np1 np2 npp 40 | 41 | where n11 is the number of times <word1><word2> occur together, and 42 | n12 is the number of times <word1> occurs with some word other than 43 | word2, and n1p is the number of times in total that word1 occurs as 44 | the first word in a bigram.
=over

=item The Dice Coefficient is defined as :

                2 * n11
               ---------
               np1 + n1p

=item The Jaccard coefficient is defined as:

                    n11
              ---------------
              n11 + n12 + n21

=back

=head2 Methods

=over

=cut


package Text::NSP::Measures::2D::Dice;


use Text::NSP::Measures::2D;
use strict;
use Carp;
use warnings;
# use subs(calculateStatistic);
require Exporter;

our ($VERSION, @EXPORT, @ISA);

@ISA  = qw(Exporter);

@EXPORT = qw(initializeStatistic calculateStatistic
             getErrorCode getErrorMessage getStatisticName);

$VERSION = '0.97';

=item computeVal() - method to calculate the dice coefficient value

INPUT PARAMS  : $count_values       .. Reference of an hash containing
                                       the count values computed by the
                                       count.pl program.

RETURN VALUES : $dice               .. Dice Coefficient value for this bigram.
                                       Nothing (undef in scalar context) is
                                       returned when the count values fail
                                       the consistency checks; 0 is returned
                                       when both marginal totals are zero.

=cut

sub computeVal
{
  my $values = shift;

  # computes and returns the marginal totals from the frequency
  # combination values. returns undef if there is an error in
  # the computation or the values are inconsistent.
  if(!(Text::NSP::Measures::2D::computeMarginalTotals($values)) ){
    return;
  }

  # computes and returns the observed from the frequency
  # combination values. returns undef if there is an error in
  # the computation or the values are inconsistent.
  if( !(Text::NSP::Measures::2D::computeObservedValues($values)) ) {
      return;
  }

  # $n11, $n1p and $np1 are not declared in this package; presumably they
  # are exported by Text::NSP::Measures::2D and filled in by the two
  # calls above - confirm.
  my $denominator = $n1p + $np1;

  # Neither word occurs at all: the ratio is 0/0.  Report "no association"
  # instead of dying with "Illegal division by zero".
  if( $denominator == 0 ) {
    return 0;
  }

  my $dice = 2 * $n11 / $denominator;

  return ($dice);
}




1;
__END__


=back

=head1 AUTHOR

Ted Pedersen,                University of Minnesota Duluth
                             E<lt>tpederse@d.umn.eduE<gt>

Satanjeev Banerjee,          Carnegie Mellon University
                             E<lt>satanjeev@cmu.eduE<gt>

Amruta Purandare,            University of Pittsburgh
                             E<lt>amruta@cs.pitt.eduE<gt>

Bridget Thomson-McInnes,     University of Minnesota Twin Cities
                             E<lt>bthompson@d.umn.eduE<gt>

Saiyam Kohli,                University of Minnesota Duluth
                             E<lt>kohli003@d.umn.eduE<gt>

=head1 HISTORY

Last updated: $Id: Dice.pm,v 1.6 2006/06/21 11:10:52 saiyam_kohli Exp $

=head1 BUGS


=head1 SEE ALSO

  @article{SmadjaMH96,
          author = {Smadja, F. and McKeown, K. and Hatzivassiloglou, V.},
          title = {Translating Collocations for Bilingual Lexicons: A
                   Statistical Approach},
          journal = {Computational Linguistics},
          volume = {22},
          number = {1},
          year = {1996},
          pages = {1-38}
          url = L}

L

L


=head1 COPYRIGHT

Copyright (C) 2000-2006, Ted Pedersen, Satanjeev Banerjee, Amruta
Purandare, Bridget Thomson-McInnes and Saiyam Kohli

This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your option)
any later version.

This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.
186 | 187 | You should have received a copy of the GNU General Public License along 188 | with this program; if not, write to 189 | 190 | The Free Software Foundation, Inc., 191 | 59 Temple Place - Suite 330, 192 | Boston, MA 02111-1307, USA. 193 | 194 | Note: a copy of the GNU General Public License is available on the web 195 | at L and is included in this 196 | distribution as GPL.txt. 197 | 198 | =cut -------------------------------------------------------------------------------- /geneSCF-master-v1.0/class/lib/Text/NSP/Measures/3D/MI/pmi.pm: -------------------------------------------------------------------------------- 1 | =head1 NAME 2 | 3 | Text::NSP::Measures::3D::MI::pmi - Perl module that implements Pointwise 4 | Mutual Information for trigrams. 5 | 6 | =head1 SYNOPSIS 7 | 8 | =head3 Basic Usage 9 | 10 | use Text::NSP::Measures::3D::MI::pmi; 11 | 12 | $pmi_value = calculateStatistic( n111=>10, 13 | n1pp=>40, 14 | np1p=>45, 15 | npp1=>42, 16 | n11p=>20, 17 | n1p1=>23, 18 | np11=>21, 19 | nppp=>100); 20 | 21 | if( ($errorCode = getErrorCode())) 22 | { 23 | print STDERR $errorCode." - ".getErrorMessage()."\n"; 24 | } 25 | else 26 | { 27 | print getStatisticName()." value for trigram is ".$pmi_value."\n"; 28 | } 29 | 30 | 31 | =head1 DESCRIPTION 32 | 33 | The expected values for the internal cells are calculated by taking the 34 | product of their associated marginals and dividing by the sample size, 35 | for example: 36 | 37 | n1pp * np1p * npp1 38 | m111= -------------------- 39 | nppp * nppp 40 | 41 | Pointwise Mutual Information (pmi) is defined as the log of the deviation 42 | between the observed frequency of a trigram (n111) and the probability of 43 | that trigram if it were independent (m111).
PMI =   log (n111/m111)

=head2 Methods

=over

=cut


package Text::NSP::Measures::3D::MI::pmi;


use Text::NSP::Measures::3D::MI;
use strict;
use Carp;
use warnings;
no warnings 'redefine';
require Exporter;

our ($VERSION, @EXPORT, @ISA, $exp);

$exp=1;

@ISA  = qw(Exporter);

@EXPORT = qw(initializeStatistic calculateStatistic
             getErrorCode getErrorMessage getStatisticName);

$VERSION = '0.97';


=item initializeStatistic() -Initialization of the pmi_exp parameter if required

INPUT PARAMS  : $pmi_exp   .. Optional exponent applied to n111 before the
                              PMI is computed. When it is omitted the
                              exponent is reset to its default of 1.

RETURN VALUES : the exponent now in effect

=cut

sub initializeStatistic
{
  my $pmi_exp = shift;

  # Calling this without an argument used to leave $exp undefined, which
  # turned $n111**$exp into 1; fall back to the default exponent instead.
  $exp = defined $pmi_exp ? $pmi_exp : 1;

  return $exp;
}



=item calculateStatistic() - This method calculates the pmi value

INPUT PARAMS  : %values             .. List of name => value pairs holding
                                       the count values computed by the
                                       count.pl program.

RETURN VALUES : $pmi                .. PMI value (log base 2) for this
                                       trigram, or 0 when the count values
                                       could not be processed.

=cut

sub calculateStatistic
{
  my %values = @_;

  # computes and sets the observed and expected values from
  # the frequency combination values. returns 0 if there is an
  # error in the computation or the values are inconsistent.
  if( !(Text::NSP::Measures::3D::MI::getValues(\%values)) ) {
    return(0);
  }

  #  Now the calculations!
  #  $n111 and $m111 are not declared in this package; presumably they are
  #  exported by Text::NSP::Measures::3D::MI and set by getValues() - confirm.
  my $pmi =  Text::NSP::Measures::3D::MI::computePMI($n111**$exp, $m111);

  # computePMI() presumably works with natural logarithms; dividing by
  # log(2) converts the result to base 2 - confirm.
  return($pmi/log(2));
}



=item getStatisticName() - Returns the name of this statistic

INPUT PARAMS  : none

RETURN VALUES : $name      .. Name of the measure.
=cut

sub getStatisticName
{
  # Fixed, human-readable label of this measure.
  my $name = "Pointwise Mutual Information";
  return $name;
}



1;
__END__


=back

=head1 AUTHOR

Ted Pedersen,                University of Minnesota Duluth
                             E<lt>tpederse@d.umn.eduE<gt>

Satanjeev Banerjee,          Carnegie Mellon University
                             E<lt>satanjeev@cmu.eduE<gt>

Amruta Purandare,            University of Pittsburgh
                             E<lt>amruta@cs.pitt.eduE<gt>

Bridget Thomson-McInnes,     University of Minnesota Twin Cities
                             E<lt>bthompson@d.umn.eduE<gt>

Saiyam Kohli,                University of Minnesota Duluth
                             E<lt>kohli003@d.umn.eduE<gt>

=head1 HISTORY

Last updated: $Id: pmi.pm,v 1.9 2009/11/03 14:53:55 tpederse Exp $

=head1 BUGS


=head1 SEE ALSO

  @inproceedings{ church89word,
      author = {Kenneth W. Church and Patrick Hanks},
      title = {Word association norms, mutual information, and Lexicography},
      booktitle = {Proceedings of the 27th. Annual Meeting of the Association for Computational Linguistics},
      publisher = {Association for Computational Linguistics},
      address = {Vancouver, B.C.},
      pages = {76--83},
      year = {1989},
      url = L }


L

L


=head1 COPYRIGHT

Copyright (C) 2000-2006, Ted Pedersen, Satanjeev Banerjee, Amruta
Purandare, Bridget Thomson-McInnes and Saiyam Kohli

This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your option)
any later version.

This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.
197 | 198 | You should have received a copy of the GNU General Public License along 199 | with this program; if not, write to 200 | 201 | The Free Software Foundation, Inc., 202 | 59 Temple Place - Suite 330, 203 | Boston, MA 02111-1307, USA. 204 | 205 | Note: a copy of the GNU General Public License is available on the web 206 | at L and is included in this 207 | distribution as GPL.txt. 208 | 209 | =cut 210 | -------------------------------------------------------------------------------- /geneSCF-master-v1.0/class/lib/Text/NSP/Measures/2D/CHI/tscore.pm: -------------------------------------------------------------------------------- 1 | =head1 NAME 2 | 3 | Text::NSP::Measures::2D::CHI::tscore - Perl module that implements T-score 4 | measure of association for bigrams. 5 | 6 | 7 | =head1 SYNOPSIS 8 | 9 | =head3 Basic Usage 10 | 11 | use Text::NSP::Measures::2D::CHI::tscore; 12 | 13 | my $npp = 60; my $n1p = 20; my $np1 = 20; my $n11 = 10; 14 | 15 | $tscore_value = calculateStatistic( n11=>$n11, 16 | n1p=>$n1p, 17 | np1=>$np1, 18 | npp=>$npp); 19 | 20 | if( ($errorCode = getErrorCode())) 21 | { 22 | print STDERR $errorCode." - ".getErrorMessage()."\n"; 23 | } 24 | else 25 | { 26 | print getStatisticName()." value for bigram is ".$tscore_value."\n"; 27 | } 28 | 29 | =head1 DESCRIPTION 30 | 31 | Assume that the frequency count data associated with a bigram 32 | <word1><word2> is stored in a 2x2 contingency table: 33 | 34 | word2 ~word2 35 | word1 n11 n12 | n1p 36 | ~word1 n21 n22 | n2p 37 | -------------- 38 | np1 np2 npp 39 | 40 | where n11 is the number of times <word1><word2> occur together, and 41 | n12 is the number of times <word1> occurs with some word other than 42 | word2, and n1p is the number of times in total that word1 occurs as 43 | the first word in a bigram. 44 | 45 | The T-score is defined as a ratio of difference between the observed 46 | and the expected mean to the variance of the sample.
Note that this
is a variant of the standard t-test that was proposed for use in the
identification of collocations in large samples of text.

Thus, the T-score is defined as follows:

    m11 = n1p * np1 / npp

    T-score = (n11 - m11)/sqrt(n11)

=over

=cut


package Text::NSP::Measures::2D::CHI::tscore;


use Text::NSP::Measures::2D::CHI;
use strict;
use Carp;
use warnings;
no warnings 'redefine';
require Exporter;

our ($VERSION, @EXPORT, @ISA);

@ISA  = qw(Exporter);

@EXPORT = qw(initializeStatistic calculateStatistic
             getErrorCode getErrorMessage getStatisticName);

$VERSION = '0.97';


=item calculateStatistic() - method to calculate the tscore Coefficient

INPUT PARAMS  : %values             .. List of name => value pairs holding
                                       the count values computed by the
                                       count.pl program.

RETURN VALUES : $tscore             .. tscore value for this bigram.
                                       Nothing (undef in scalar context) is
                                       returned when the count values could
                                       not be processed or when n11 is not
                                       positive, because the score is then
                                       undefined.

=cut

sub calculateStatistic
{
  my %values = @_;

  # computes and returns the observed and expected values from
  # the frequency combination values. returns 0 if there is an
  # error in the computation or the values are inconsistent.
  if( !(Text::NSP::Measures::2D::CHI::getValues(\%values)) ) {
    return;
  }

  # sqrt(n11) is the denominator, so a bigram that never occurs would make
  # the division die with "Illegal division by zero".  Report "no value"
  # the same way the failure path above does.
  # $n11 and $m11 are not declared in this package; presumably they are
  # exported by Text::NSP::Measures::2D::CHI and set by getValues() - confirm.
  if( !($n11 > 0) ) {
    return;
  }

  # Now calculate the tscore
  my $tscore = (($n11-$m11)/($n11**0.5));

  return ( $tscore );
}



=item getStatisticName() - Returns the name of this statistic

INPUT PARAMS  : none

RETURN VALUES : $name      .. Name of the measure.
115 | 116 | =cut 117 | 118 | sub getStatisticName 119 | { 120 | return "T-score"; 121 | } 122 | 123 | 124 | 125 | 1; 126 | __END__ 127 | 128 | 129 | =back 130 | 131 | =head1 AUTHOR 132 | 133 | Ted Pedersen, University of Minnesota Duluth 134 | Etpederse@d.umn.eduE 135 | 136 | Satanjeev Banerjee, Carnegie Mellon University 137 | Esatanjeev@cmu.eduE 138 | 139 | Amruta Purandare, University of Pittsburgh 140 | Eamruta@cs.pitt.eduE 141 | 142 | Bridget Thomson-McInnes, University of Minnesota Twin Cities 143 | Ebthompson@d.umn.eduE 144 | 145 | Saiyam Kohli, University of Minnesota Duluth 146 | Ekohli003@d.umn.eduE 147 | 148 | =head1 HISTORY 149 | 150 | Last updated: $Id: tscore.pm,v 1.11 2006/06/21 11:10:52 saiyam_kohli Exp $ 151 | 152 | =head1 BUGS 153 | 154 | 155 | =head1 SEE ALSO 156 | 157 | @incollection {ChurchGHH91, 158 | author={Church, K. and Gale, W. and Hanks, P. and Hindle, D. }, 159 | title={Using Statistics in Lexical Analysis}, 160 | booktitle={Lexical Acquisition: Exploiting On-Line Resources 161 | to Build a Lexicon}, 162 | editor={Zernik, U.}, 163 | year={1991}, 164 | address={Hillsdale, NJ}, 165 | publisher={Lawrence Erlbaum Associates} 166 | url = L} 167 | 168 | L 169 | 170 | L 171 | 172 | 173 | =head1 COPYRIGHT 174 | 175 | Copyright (C) 2000-2006, Ted Pedersen, Satanjeev Banerjee, Amruta 176 | Purandare, Bridget Thomson-McInnes and Saiyam Kohli 177 | 178 | This program is free software; you can redistribute it and/or modify it 179 | under the terms of the GNU General Public License as published by the Free 180 | Software Foundation; either version 2 of the License, or (at your option) 181 | any later version. 182 | 183 | This program is distributed in the hope that it will be useful, but 184 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 185 | or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 186 | for more details. 
187 | 188 | You should have received a copy of the GNU General Public License along 189 | with this program; if not, write to 190 | 191 | The Free Software Foundation, Inc., 192 | 59 Temple Place - Suite 330, 193 | Boston, MA 02111-1307, USA. 194 | 195 | Note: a copy of the GNU General Public License is available on the web 196 | at L<http://www.gnu.org/licenses/gpl.txt> and is included in this 197 | distribution as GPL.txt. 198 | 199 | =cut -------------------------------------------------------------------------------- /geneSCF-master-v1.0/class/lib/Text/NSP/Measures/3D/MI/ps.pm: -------------------------------------------------------------------------------- 1 | =head1 NAME 2 | 3 | Text::NSP::Measures::3D::MI::ps - Perl module that implements 4 | Poisson Stirling Measure for trigrams. 5 | 6 | =head1 SYNOPSIS 7 | 8 | =head3 Basic Usage 9 | 10 | use Text::NSP::Measures::3D::MI::ps; 11 | 12 | $ps_value = calculateStatistic( n111=>10, 13 | n1pp=>40, 14 | np1p=>45, 15 | npp1=>42, 16 | n11p=>20, 17 | n1p1=>23, 18 | np11=>21, 19 | nppp=>100); 20 | 21 | if( ($errorCode = getErrorCode())) 22 | { 23 | print STDERR $errorCode." - ".getErrorMessage()."\n"; 24 | } 25 | else 26 | { 27 | print getStatisticName." value for trigram is ".$ps_value."\n"; 28 | } 29 | 30 | 31 | =head1 DESCRIPTION 32 | 33 | The log-likelihood ratio measures the deviation between the observed data 34 | and what would be expected if <word1>, <word2> and <word3> were independent. 35 | The higher the score, the less evidence there is in favor of concluding that 36 | the words are independent. 37 | 38 | The expected values for the internal cells are calculated by taking the 39 | product of their associated marginals and dividing by the square of the sample size, 40 | for example: 41 | 42 | n1pp * np1p * npp1 43 | m111= -------------------- 44 | nppp^2 45 | 46 | The Poisson-Stirling measure is a negative logarithmic approximation 47 | of the Poisson-likelihood measure. It uses Stirling's formula to 48 | approximate the factorial in the Poisson-likelihood measure.
It is
computed as follows:

 Poisson-Stirling = n111 * ( log(n111) - log(m111) - 1)

=head2 Methods

=over

=cut

package Text::NSP::Measures::3D::MI::ps;


use strict;
use warnings;
no warnings 'redefine';
use Carp;
use Text::NSP::Measures::3D::MI;
require Exporter;

our @ISA     = qw(Exporter);
our @EXPORT  = qw(initializeStatistic calculateStatistic
                  getErrorCode getErrorMessage getStatisticName);
our $VERSION = '0.97';

=item calculateStatistic() - This method calculates the ps value

INPUT PARAMS  : %values          .. the frequency counts, passed as a
                                    flat key => value list (n111, n1pp,
                                    np1p, npp1, n11p, n1p1, np11, nppp).

RETURN VALUES : $poissonStirling .. Poisson-Stirling value for this
                                    trigram; nothing is returned when
                                    getValues() reports a failure.

=cut

sub calculateStatistic
{
  my %values = @_;

  # Give up early when the counts cannot be turned into observed and
  # expected values ($n111 and $m111 are set by that call).
  return unless Text::NSP::Measures::3D::MI::getValues(\%values);

  # computePMI() supplies the log(n111/m111) term of the formula; it is
  # also the place where a zero or negative ratio is dealt with.
  my $log_ratio = Text::NSP::Measures::3D::MI::computePMI($n111, $m111);

  return $n111 * ($log_ratio - 1);
}


=item getStatisticName() - Returns the name of this statistic

INPUT PARAMS  : none

RETURN VALUES : $name      .. Name of the measure.

=cut

sub getStatisticName
{
  my $name = "Poisson-Stirling Measure";
  return $name;
}



1;
__END__


=back

=head1 AUTHOR

Ted Pedersen,                University of Minnesota Duluth
                             E<lt>tpederse@d.umn.eduE<gt>

Satanjeev Banerjee,          Carnegie Mellon University
                             E<lt>satanjeev@cmu.eduE<gt>

Amruta Purandare,            University of Pittsburgh
                             E<lt>amruta@cs.pitt.eduE<gt>

Bridget Thomson-McInnes,     University of Minnesota Twin Cities
                             E<lt>bthompson@d.umn.eduE<gt>

Saiyam Kohli,                University of Minnesota Duluth
                             E<lt>kohli003@d.umn.eduE<gt>

=head1 HISTORY

Last updated: $Id: ps.pm,v 1.7 2006/06/21 11:10:53 saiyam_kohli Exp $

=head1 BUGS


=head1 SEE ALSO

  @inproceedings{ church89word,
      author = {Kenneth W. Church and Patrick Hanks},
      title = {Word association norms, mutual information, and Lexicography},
      booktitle = {Proceedings of the 27th. Annual Meeting of the Association for Computational Linguistics},
      publisher = {Association for Computational Linguistics},
      address = {Vancouver, B.C.},
      pages = {76--83},
      year = {1989},
      url = L }


L<http://groups.yahoo.com/group/ngram/>

L<http://www.d.umn.edu/~tpederse/nsp.html>


=head1 COPYRIGHT

Copyright (C) 2000-2006, Ted Pedersen, Satanjeev Banerjee, Amruta
Purandare, Bridget Thomson-McInnes and Saiyam Kohli

This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your option)
any later version.

This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.
186 | 187 | You should have received a copy of the GNU General Public License along 188 | with this program; if not, write to 189 | 190 | The Free Software Foundation, Inc., 191 | 59 Temple Place - Suite 330, 192 | Boston, MA 02111-1307, USA. 193 | 194 | Note: a copy of the GNU General Public License is available on the web 195 | at L<http://www.gnu.org/licenses/gpl.txt> and is included in this 196 | distribution as GPL.txt. 197 | 198 | =cut -------------------------------------------------------------------------------- /geneSCF-master-v1.0/class/lib/Text/NSP/Measures/2D/CHI/x2.pm: -------------------------------------------------------------------------------- 1 | =head1 NAME 2 | 3 | Text::NSP::Measures::2D::CHI::x2 - Perl module that implements Pearson's 4 | chi squared measure of association for 5 | bigrams. 6 | 7 | =head1 SYNOPSIS 8 | 9 | =head3 Basic Usage 10 | 11 | use Text::NSP::Measures::2D::CHI::x2; 12 | 13 | my $npp = 60; my $n1p = 20; my $np1 = 20; my $n11 = 10; 14 | 15 | $x2_value = calculateStatistic( n11=>$n11, 16 | n1p=>$n1p, 17 | np1=>$np1, 18 | npp=>$npp); 19 | 20 | if( ($errorCode = getErrorCode())) 21 | { 22 | print STDERR $errorCode." - ".getErrorMessage()."\n"; 23 | } 24 | else 25 | { 26 | print getStatisticName." value for bigram is ".$x2_value."\n"; 27 | } 28 | 29 | =head1 DESCRIPTION 30 | 31 | Pearson's Chi-squared test measures the deviation between the observed 32 | data and what would be expected if <word1> and <word2> were independent. 33 | The higher the score, the less evidence there is in favor of concluding 34 | that the words are independent.
Assume that the frequency count data associated with a bigram
<word1><word2> is stored in a 2x2 contingency table:

          word2   ~word2
  word1    n11      n12 | n1p
 ~word1    n21      n22 | n2p
           --------------
           np1      np2   npp

where n11 is the number of times <word1><word2> occur together, and
n12 is the number of times <word1> occurs with some word other than
word2, and n1p is the number of times in total that word1 occurs as
the first word in a bigram.

The expected values for the internal cells are calculated by taking the
product of their associated marginals and dividing by the sample size,
for example:

          np1 * n1p
   m11=   ---------
            npp

Then the deviation between observed and expected values for each internal
cell is computed to arrive at the Pearson's Chi-Squared test value:

 Pearson's Chi-Squared = (n11 - m11)^2/m11 + (n12 - m12)^2/m12 +
                         (n21 - m21)^2/m21 + (n22 - m22)^2/m22


=over

=cut


package Text::NSP::Measures::2D::CHI::x2;


use strict;
use warnings;
no warnings 'redefine';
use Carp;
use Text::NSP::Measures::2D::CHI;
require Exporter;

our @ISA     = qw(Exporter);
our @EXPORT  = qw(initializeStatistic calculateStatistic
                  getErrorCode getErrorMessage getStatisticName);
our $VERSION = '0.97';


=item calculateStatistic() - method to calculate the Chi-squared value.

INPUT PARAMS  : %values .. the frequency counts, passed as a flat
                           key => value list (n11, n1p, np1, npp).

RETURN VALUES : $x2     .. Chi-squared value for this bigram; nothing
                           is returned when getValues() reports a
                           failure.

=cut

sub calculateStatistic
{
  my %values = @_;

  # Without usable observed/expected cell values there is nothing to sum.
  return unless Text::NSP::Measures::2D::CHI::getValues(\%values);

  # Accumulate the per-cell contributions in the fixed order
  # (1,1), (1,2), (2,1), (2,2); computeVal() is handed the observed and
  # the expected count of one cell at a time.
  my $chi_squared = 0;
  for my $cell ( [$n11, $m11], [$n12, $m12], [$n21, $m21], [$n22, $m22] )
  {
    my ($observed, $expected) = @$cell;
    $chi_squared += Text::NSP::Measures::2D::CHI::computeVal($observed, $expected);
  }

  return $chi_squared;
}



=item getStatisticName() - Returns the name of this statistic

INPUT PARAMS  : none

RETURN VALUES : $name      .. Name of the measure.

=cut

sub getStatisticName
{
  my $name = "Chi-squared test";
  return $name;
}



1;
__END__


=back

=head1 AUTHOR

Ted Pedersen,                University of Minnesota Duluth
                             E<lt>tpederse@d.umn.eduE<gt>

Satanjeev Banerjee,          Carnegie Mellon University
                             E<lt>satanjeev@cmu.eduE<gt>

Amruta Purandare,            University of Pittsburgh
                             E<lt>amruta@cs.pitt.eduE<gt>

Bridget Thomson-McInnes,     University of Minnesota Twin Cities
                             E<lt>bthompson@d.umn.eduE<gt>

Saiyam Kohli,                University of Minnesota Duluth
                             E<lt>kohli003@d.umn.eduE<gt>

=head1 HISTORY

Last updated: $Id: x2.pm,v 1.10 2006/06/21 11:10:52 saiyam_kohli Exp $

=head1 BUGS


=head1 SEE ALSO

L<http://groups.yahoo.com/group/ngram/>

L<http://www.d.umn.edu/~tpederse/nsp.html>


=head1 COPYRIGHT

Copyright (C) 2000-2006, Ted Pedersen, Satanjeev Banerjee, Amruta
Purandare, Bridget Thomson-McInnes and Saiyam Kohli

This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your option)
any later
version. 185 | 186 | This program is distributed in the hope that it will be useful, but 187 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 188 | or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 189 | for more details. 190 | 191 | You should have received a copy of the GNU General Public License along 192 | with this program; if not, write to 193 | 194 | The Free Software Foundation, Inc., 195 | 59 Temple Place - Suite 330, 196 | Boston, MA 02111-1307, USA. 197 | 198 | Note: a copy of the GNU General Public License is available on the web 199 | at L<http://www.gnu.org/licenses/gpl.txt> and is included in this 200 | distribution as GPL.txt. 201 | 202 | =cut -------------------------------------------------------------------------------- /geneSCF-master-v1.0/class/lib/Text/NSP/Measures/2D/odds.pm: -------------------------------------------------------------------------------- 1 | =head1 NAME 2 | 3 | Text::NSP::Measures::2D::odds - Perl module to compute the Odds 4 | ratio for bigrams. 5 | 6 | =head1 SYNOPSIS 7 | 8 | =head3 Basic Usage 9 | 10 | use Text::NSP::Measures::2D::odds; 11 | 12 | my $npp = 60; my $n1p = 20; my $np1 = 20; my $n11 = 10; 13 | 14 | $odds_value = calculateStatistic( n11=>$n11, 15 | n1p=>$n1p, 16 | np1=>$np1, 17 | npp=>$npp); 18 | 19 | if( ($errorCode = getErrorCode())) 20 | { 21 | print STDERR $errorCode." - ".getErrorMessage()."\n"; 22 | } 23 | else 24 | { 25 | print getStatisticName." value for bigram is ".$odds_value."\n"; 26 | } 27 | 28 | 29 | 30 | =head1 DESCRIPTION 31 | 32 | Assume that the frequency count data associated with a bigram 33 | <word1><word2> is stored in a 2x2 contingency table: 34 | 35 | word2 ~word2 36 | word1 n11 n12 | n1p 37 | ~word1 n21 n22 | n2p 38 | -------------- 39 | np1 np2 npp 40 | 41 | where n11 is the number of times <word1><word2> occur together, and 42 | n12 is the number of times <word1> occurs with some word other than 43 | word2, and n1p is the number of times in total that word1 occurs as 44 | the first word in a bigram.
The odds ratio computes the ratio of the number of times that
the words in a bigram occur together (or not at all) to the
number of times the words occur individually. It is the cross
product of the diagonal and the off-diagonal.

Thus, ODDS RATIO = (n11*n22)/(n21*n12)

if n21 and/or n12 is 0, then each zero value is "smoothed" to one to
avoid a zero in the denominator.

=over

=cut


package Text::NSP::Measures::2D::odds;


use Text::NSP::Measures::2D;
use strict;
use Carp;
use warnings;
no warnings 'redefine';
require Exporter;

our ($VERSION, @EXPORT, @ISA);

@ISA  = qw(Exporter);

@EXPORT = qw(initializeStatistic calculateStatistic
             getErrorCode getErrorMessage getStatisticName);

$VERSION = '0.97';


=item calculateStatistic() - method to calculate the odds ratio value!

INPUT PARAMS  : %values .. the frequency counts, passed as a flat
                           key => value list (n11, n1p, np1, npp).

RETURN VALUES : $odds   .. Odds ratio for this bigram. Nothing (undef
                           in scalar context) is returned when either
                           the marginal totals or the observed values
                           cannot be computed, so that a failure can
                           never be mistaken for an odds ratio of 0.

=cut

sub calculateStatistic
{
  my %values = @_;

  # Marginal totals first; a false result signals missing or
  # inconsistent counts.
  if( !(Text::NSP::Measures::2D::computeMarginalTotals(\%values)) ) {
    return;
  }

  # Then the observed cell values ($n11, $n12, $n21, $n22). This path
  # used to return 0, which is also a legitimate odds ratio; it now
  # returns nothing, like the check above.
  if( !(Text::NSP::Measures::2D::computeObservedValues(\%values)) ) {
    return;
  }

  # Add-one smoothing to avoid a zero denominator. The smoothed values
  # are kept in lexicals so the shared observed counts are left as
  # computeObservedValues() set them.
  my $smoothed_n21 = ($n21 == 0) ? 1 : $n21;
  my $smoothed_n12 = ($n12 == 0) ? 1 : $n12;

  my $odds = (($n11*$n22) / ($smoothed_n12*$smoothed_n21));

  return ($odds);
}



=item getStatisticName() - Returns the name of this statistic

INPUT PARAMS  : none

RETURN VALUES : $name      .. Name of the measure.

=cut

sub getStatisticName
{
  return "Odds Ratio";
}



1;
__END__


=back

=head1 AUTHOR

Ted Pedersen,                University of Minnesota Duluth
                             E<lt>tpederse@d.umn.eduE<gt>

Satanjeev Banerjee,          Carnegie Mellon University
                             E<lt>satanjeev@cmu.eduE<gt>

Amruta Purandare,            University of Pittsburgh
                             E<lt>amruta@cs.pitt.eduE<gt>

Bridget Thomson-McInnes,     University of Minnesota Twin Cities
                             E<lt>bthompson@d.umn.eduE<gt>

Saiyam Kohli,                University of Minnesota Duluth
                             E<lt>kohli003@d.umn.eduE<gt>

=head1 HISTORY

Last updated: $Id: odds.pm,v 1.18 2006/06/21 11:10:52 saiyam_kohli Exp $

=head1 BUGS


=head1 SEE ALSO

  @inproceedings{ blaheta01unsupervised,
      author = {D. BLAHETA and M.
JOHNSON}, 176 | title = {Unsupervised learning of multi-word verbs}, 177 | booktitle = {}Proceedings of the 39th Annual Meeting of the ACL}, 178 | year = {2001}, 179 | pages = {54-60}, 180 | url = L } 181 | 182 | L 183 | 184 | L 185 | 186 | 187 | =head1 COPYRIGHT 188 | 189 | Copyright (C) 2000-2006, Ted Pedersen, Satanjeev Banerjee, Amruta 190 | Purandare, Bridget Thomson-McInnes and Saiyam Kohli 191 | 192 | This program is free software; you can redistribute it and/or modify it 193 | under the terms of the GNU General Public License as published by the Free 194 | Software Foundation; either version 2 of the License, or (at your option) 195 | any later version. 196 | 197 | This program is distributed in the hope that it will be useful, but 198 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 199 | or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 200 | for more details. 201 | 202 | You should have received a copy of the GNU General Public License along 203 | with this program; if not, write to 204 | 205 | The Free Software Foundation, Inc., 206 | 59 Temple Place - Suite 330, 207 | Boston, MA 02111-1307, USA. 208 | 209 | Note: a copy of the GNU General Public License is available on the web 210 | at L and is included in this 211 | distribution as GPL.txt. 212 | 213 | =cut -------------------------------------------------------------------------------- /geneSCF-master-v1.0/class/lib/Text/NSP/Measures/2D/CHI/phi.pm: -------------------------------------------------------------------------------- 1 | =head1 NAME 2 | 3 | Text::NSP::Measures::2D::CHI::phi - Perl module that implements Phi coefficient 4 | measure for bigrams. 
5 | 6 | =head1 SYNOPSIS 7 | 8 | =head3 Basic Usage 9 | 10 | use Text::NSP::Measures::2D::CHI::phi; 11 | 12 | my $npp = 60; my $n1p = 20; my $np1 = 20; my $n11 = 10; 13 | 14 | $phi_value = calculateStatistic( n11=>$n11, 15 | n1p=>$n1p, 16 | np1=>$np1, 17 | npp=>$npp); 18 | 19 | if( ($errorCode = getErrorCode())) 20 | { 21 | print STDERR $errorCode." - ".getErrorMessage()."\n"; 22 | } 23 | else 24 | { 25 | print getStatisticName." value for bigram is ".$phi_value."\n"; 26 | } 27 | 28 | =head1 DESCRIPTION 29 | 30 | This function computes the square of the traditional formulation of 31 | the Phi Coefficient. 32 | 33 | Assume that the frequency count data associated with a bigram 34 | <word1><word2> is stored in a 2x2 contingency table: 35 | 36 | word2 ~word2 37 | word1 n11 n12 | n1p 38 | ~word1 n21 n22 | n2p 39 | -------------- 40 | np1 np2 npp 41 | 42 | where n11 is the number of times <word1><word2> occur together, and 43 | n12 is the number of times <word1> occurs with some word other than 44 | word2, and n1p is the number of times in total that word1 occurs as 45 | the first word in a bigram. 46 | 47 | PHI^2 = ((n11 * n22) - (n12 * n21))^2/(n1p * np1 * np2 * n2p) 48 | 49 | Note that the value of PHI^2 is equivalent to 50 | Pearson's Chi-Squared test divided by the sample size, that is: 51 | 52 | Chi-Squared = npp * PHI^2 53 | 54 | We use PHI^2 rather than PHI since PHI^2 was employed for collocation 55 | identification in: 56 | 57 | Church, K. (1991) Concordances for Parallel Text, Seventh Annual 58 | Conference of the UW Centre for the New OED and Text Research, Oxford, 59 | England.
=over

=cut


package Text::NSP::Measures::2D::CHI::phi;


use strict;
use warnings;
no warnings 'redefine';
use Carp;
use Text::NSP::Measures::2D::CHI;
require Exporter;

our @ISA     = qw(Exporter);
our @EXPORT  = qw(initializeStatistic calculateStatistic
                  getErrorCode getErrorMessage getStatisticName);
our $VERSION = '0.97';


=item calculateStatistic() - method to calculate the Phi Coefficient

INPUT PARAMS  : %values .. the frequency counts, passed as a flat
                           key => value list (n11, n1p, np1, npp).

RETURN VALUES : $phi    .. phi value (PHI^2) for this bigram; nothing
                           is returned when getValues() reports a
                           failure.

=cut

sub calculateStatistic
{
  my %values = @_;

  # Stop here when the observed/expected cell values are unavailable.
  return unless Text::NSP::Measures::2D::CHI::getValues(\%values);

  # Sum the computeVal() contribution of every cell, in the order
  # (1,1), (1,2), (2,1), (2,2) ...
  my $cell_sum = 0;
  foreach my $pair ( [$n11, $m11], [$n12, $m12], [$n21, $m21], [$n22, $m22] )
  {
    $cell_sum += Text::NSP::Measures::2D::CHI::computeVal($pair->[0], $pair->[1]);
  }

  # ... and scale the total by the sample size the caller passed in.
  return $cell_sum/$values{npp};
}



=item getStatisticName() - Returns the name of this statistic

INPUT PARAMS  : none

RETURN VALUES : $name      .. Name of the measure.

=cut

sub getStatisticName
{
  my $name = "Phi Coefficient";
  return $name;
}



1;
__END__


=back

=head1 AUTHOR

Ted Pedersen,                University of Minnesota Duluth
                             E<lt>tpederse@d.umn.eduE<gt>

Satanjeev Banerjee,          Carnegie Mellon University
                             E<lt>satanjeev@cmu.eduE<gt>

Amruta Purandare,            University of Pittsburgh
                             E<lt>amruta@cs.pitt.eduE<gt>

Bridget Thomson-McInnes,     University of Minnesota Twin Cities
                             E<lt>bthompson@d.umn.eduE<gt>

Saiyam Kohli,                University of Minnesota Duluth
                             E<lt>kohli003@d.umn.eduE<gt>

=head1 HISTORY

Last updated: $Id: phi.pm,v 1.12 2006/06/21 11:10:52 saiyam_kohli Exp $

=head1 BUGS


=head1 SEE ALSO

  @inproceedings{GaleC91,
            author = {Gale, W. and Church, K.},
            title = {A Program for Aligning Sentences in Bilingual Corpora},
            booktitle = {Proceedings of the 29th Annual Meeting of the
                        Association for Computational Linguistics},
            address = {Berkeley, CA},
            year = {1991}
            url = L}


L<http://groups.yahoo.com/group/ngram/>

L<http://www.d.umn.edu/~tpederse/nsp.html>


=head1 COPYRIGHT

Copyright (C) 2000-2006, Ted Pedersen, Satanjeev Banerjee, Amruta
Purandare, Bridget Thomson-McInnes and Saiyam Kohli

This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your option)
any later version.

This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.
196 | 197 | You should have received a copy of the GNU General Public License along 198 | with this program; if not, write to 199 | 200 | The Free Software Foundation, Inc., 201 | 59 Temple Place - Suite 330, 202 | Boston, MA 02111-1307, USA. 203 | 204 | Note: a copy of the GNU General Public License is available on the web 205 | at L<http://www.gnu.org/licenses/gpl.txt> and is included in this 206 | distribution as GPL.txt. 207 | 208 | =cut -------------------------------------------------------------------------------- /geneSCF-master-v1.0/class/lib/Text/NSP/Measures/2D/MI/ps.pm: -------------------------------------------------------------------------------- 1 | =head1 NAME 2 | 3 | Text::NSP::Measures::2D::MI::ps - Perl module that implements Poisson-Stirling 4 | measure of association for bigrams. 5 | 6 | =head1 SYNOPSIS 7 | 8 | =head3 Basic Usage 9 | 10 | use Text::NSP::Measures::2D::MI::ps; 11 | 12 | my $npp = 60; my $n1p = 20; my $np1 = 20; my $n11 = 10; 13 | 14 | $ps_value = calculateStatistic( n11=>$n11, 15 | n1p=>$n1p, 16 | np1=>$np1, 17 | npp=>$npp); 18 | 19 | if( ($errorCode = getErrorCode())) 20 | { 21 | print STDERR $errorCode." - ".getErrorMessage()."\n"; 22 | } 23 | else 24 | { 25 | print getStatisticName." value for bigram is ".$ps_value."\n"; 26 | } 27 | 28 | =head1 DESCRIPTION 29 | 30 | The log-likelihood ratio measures the deviation between the observed data 31 | and what would be expected if <word1> and <word2> were independent. The 32 | higher the score, the less evidence there is in favor of concluding that 33 | the words are independent. 34 | 35 | Assume that the frequency count data associated with a bigram 36 | <word1><word2> is as shown by a 2x2 contingency table: 37 | 38 | word2 ~word2 39 | word1 n11 n12 | n1p 40 | ~word1 n21 n22 | n2p 41 | -------------- 42 | np1 np2 npp 43 | 44 | where n11 is the number of times <word1><word2> occur together, and 45 | n12 is the number of times <word1> occurs with some word other than 46 | word2, and n1p is the number of times in total that word1 occurs as 47 | the first word in a bigram.
The expected values for the internal cells are calculated by taking the
product of their associated marginals and dividing by the sample size,
for example:

          np1 * n1p
   m11=   ---------
            npp

The Poisson Stirling measure is a negative logarithmic approximation
of the Poisson-likelihood measure. It uses the Stirling's formula to
approximate the factorial in Poisson-likelihood measure.

 Poisson-Stirling = n11 * ( log(n11) - log(m11) - 1)

which is same as

 Poisson-Stirling = n11 * ( log(n11/m11) - 1)


=head2 Methods

=over

=cut


package Text::NSP::Measures::2D::MI::ps;


use strict;
use warnings;
no warnings 'redefine';
use Carp;
use Text::NSP::Measures::2D::MI;
require Exporter;

our @ISA     = qw(Exporter);
our @EXPORT  = qw(initializeStatistic calculateStatistic
                  getErrorCode getErrorMessage getStatisticName);
our $VERSION = '0.97';

=item calculateStatistic() - This method calculates the ps value

INPUT PARAMS  : %values          .. the frequency counts, passed as a
                                    flat key => value list (n11, n1p,
                                    np1, npp).

RETURN VALUES : $poissonStirling .. Poisson-Stirling value for this
                                    bigram; nothing is returned when
                                    getValues() reports a failure.

=cut

sub calculateStatistic
{
  my %values = @_;

  # Nothing can be scored unless the observed ($n11) and expected
  # ($m11) values could be derived from the counts.
  return unless Text::NSP::Measures::2D::MI::getValues(\%values);

  # computePMI() supplies the log(n11/m11) term of the formula; it is
  # also the place where a zero or negative ratio is dealt with.
  my $log_ratio = Text::NSP::Measures::2D::MI::computePMI($n11,$m11);

  return $n11 * ($log_ratio - 1);
}


=item getStatisticName() - Returns the name of this statistic

INPUT PARAMS  : none

RETURN VALUES : $name      .. Name of the measure.

=cut

sub getStatisticName
{
  my $name = "Poisson-Stirling Measure";
  return $name;
}



1;
__END__


=back

=head1 AUTHOR

Ted Pedersen,                University of Minnesota Duluth
                             E<lt>tpederse@d.umn.eduE<gt>

Satanjeev Banerjee,          Carnegie Mellon University
                             E<lt>satanjeev@cmu.eduE<gt>

Amruta Purandare,            University of Pittsburgh
                             E<lt>amruta@cs.pitt.eduE<gt>

Bridget Thomson-McInnes,     University of Minnesota Twin Cities
                             E<lt>bthompson@d.umn.eduE<gt>

Saiyam Kohli,                University of Minnesota Duluth
                             E<lt>kohli003@d.umn.eduE<gt>

=head1 HISTORY

Last updated: $Id: ps.pm,v 1.9 2008/03/26 17:20:28 tpederse Exp $

=head1 BUGS


=head1 SEE ALSO

L<http://groups.yahoo.com/group/ngram/>

L<http://www.d.umn.edu/~tpederse/nsp.html>

  @article{SmadjaMH96,
          author = {Quasthoff, Uwe and Wolff, Christian},
          title = {The Poisson collocation measure and its application},
          journal = {Workshop on Computational Approaches to Collocations},
          year = {2002},
          url = L}

=head1 COPYRIGHT

Copyright (C) 2000-2006, Ted Pedersen, Satanjeev Banerjee, Amruta
Purandare, Bridget Thomson-McInnes and Saiyam Kohli

This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your option)
any later version.
192 | 193 | This program is distributed in the hope that it will be useful, but 194 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 195 | or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 196 | for more details. 197 | 198 | You should have received a copy of the GNU General Public License along 199 | with this program; if not, write to 200 | 201 | The Free Software Foundation, Inc., 202 | 59 Temple Place - Suite 330, 203 | Boston, MA 02111-1307, USA. 204 | 205 | Note: a copy of the GNU General Public License is available on the web 206 | at L and is included in this 207 | distribution as GPL.txt. 208 | 209 | =cut -------------------------------------------------------------------------------- /geneSCF-master-v1.0/class/lib/Text/NSP/Measures/2D/Fisher/twotailed.pm: -------------------------------------------------------------------------------- 1 | =head1 NAME 2 | 3 | Text::NSP::Measures::2D::Fisher::twotailed - Perl module implementation of the two-sided 4 | Fisher's exact test. 5 | 6 | =head1 SYNOPSIS 7 | 8 | =head3 Basic Usage 9 | 10 | use Text::NSP::Measures::2D::Fisher::twotailed; 11 | 12 | my $npp = 60; my $n1p = 20; my $np1 = 20; my $n11 = 10; 13 | 14 | $twotailed_value = calculateStatistic( n11=>$n11, 15 | n1p=>$n1p, 16 | np1=>$np1, 17 | npp=>$npp); 18 | 19 | if( ($errorCode = getErrorCode())) 20 | { 21 | print STDERR $errorCode." 
- ".getErrorMessage(); 22 | } 23 | else 24 | { 25 | print getStatisticName."value for bigram is ".$twotailed_value; 26 | } 27 | 28 | =head1 DESCRIPTION 29 | 30 | Assume that the frequency count data associated with a bigram 31 | is stored in a 2x2 contingency table: 32 | 33 | word2 ~word2 34 | word1 n11 n12 | n1p 35 | ~word1 n21 n22 | n2p 36 | -------------- 37 | np1 np2 npp 38 | 39 | where n11 is the number of times <word1><word2> occur together, and 40 | n12 is the number of times <word1> occurs with some word other than 41 | word2, and n1p is the number of times in total that word1 occurs as 42 | the first word in a bigram. 43 | 44 | The fishers exact tests are calculated by fixing the marginal totals 45 | and computing the hypergeometric probabilities for all the possible 46 | contingency tables. 47 | 48 | A twotailed fishers test is calculated by adding the probabilities of 49 | all the contingency tables with probabilities less than or equal to the 50 | probability of the observed table. The twotailed fishers test tells us how 51 | likely it would be to observe a contingency table which is no more probable 52 | than the current table. 53 | 54 | =head2 Methods 55 | 56 | =over 57 | 58 | =cut 59 | 60 | package Text::NSP::Measures::2D::Fisher::twotailed; 61 | 62 | 63 | use Text::NSP::Measures::2D::Fisher; 64 | use strict; 65 | use Carp; 66 | use warnings; 67 | no warnings 'redefine'; 68 | require Exporter; 69 | 70 | our ($VERSION, @EXPORT, @ISA); 71 | 72 | @ISA = qw(Exporter); 73 | 74 | @EXPORT = qw(initializeStatistic calculateStatistic 75 | getErrorCode getErrorMessage getStatisticName); 76 | 77 | $VERSION = '0.97'; 78 | 79 | 80 | =item calculateStatistic() - This method calculates the twotailed 81 | Fisher value 82 | 83 | INPUT PARAMS : %values .. Hash of the count values (n11, n1p, 84 | np1, npp) computed by the 85 | count.pl program. 86 | 87 | RETURN VALUES : $twotailed .. Twotailed Fisher value. 
=cut

# calculateStatistic( n11 => ..., n1p => ..., np1 => ..., npp => ... )
#
# Two-tailed Fisher's exact test.  With the marginal totals held fixed, the
# probabilities of every possible table that is no more probable than the
# observed table are added up.
#
# Returns the summed probability, or nothing (undef in scalar context) when
# getValues() or computeDistribution() reports a failure.
sub calculateStatistic
{
  my %values = @_;

  # getValues() checks the supplied counts and returns false when they are
  # missing or inconsistent.  The $n11, $n1p, $np1 and $npp used below are
  # not declared in this sub; presumably they are package variables exported
  # by Text::NSP::Measures::2D::Fisher and filled in by this call.
  if( !(Text::NSP::Measures::2D::Fisher::getValues(\%values)) )
  {
    return;
  }

  # Feasible range of n11 for the given marginals:
  #   max(0, n1p + np1 - npp) .. min(n1p, np1)
  my $final_limit = ($n1p < $np1) ? $n1p : $np1;

  # Remember the observed n11 before computeDistribution() runs (the
  # original code took the same precaution).
  my $n11_org = $n11;

  my $n11_start = $n1p + $np1 - $npp;
  if($n11_start < 0)
  {
    $n11_start = 0;
  }

  my $probabilities =
    Text::NSP::Measures::2D::Fisher::computeDistribution($n11_start, $final_limit);
  if( !$probabilities )
  {
    return;
  }

  # The distribution values are exponentiated before being summed, i.e. they
  # are handled as log-probabilities.  Tables that are mathematically as
  # probable as the observed one (for instance its mirror image in the other
  # tail) can come out a few ulps larger because of floating-point rounding,
  # and a strict comparison would then drop them.  Allow the same relative
  # tolerance that R's fisher.test uses (1 + 1e-7); in log space that is an
  # additive log(1 + 1e-7).
  my $cutoff = $probabilities->{$n11_org} + log(1 + 1e-7);

  my $ttfisher = 0;

  # Ascending order adds the smallest terms first (better accuracy) and lets
  # the scan stop at the first value above the cutoff.
  foreach my $value (sort { $a <=> $b } values %$probabilities)
  {
    last if $value > $cutoff;
    $ttfisher += exp($value);
  }

  return $ttfisher;
}


=item getStatisticName() - Returns the name of this statistic

INPUT PARAMS  : none

RETURN VALUES : $name      .. Name of the measure.
141 | 142 | =cut 143 | 144 | sub getStatisticName 145 | { 146 | return "Two Tailed Fisher"; 147 | } 148 | 149 | 150 | 151 | 1; 152 | __END__ 153 | 154 | =back 155 | 156 | =head1 AUTHOR 157 | 158 | Ted Pedersen, University of Minnesota Duluth 159 | E<lt>tpederse@d.umn.eduE<gt> 160 | 161 | Satanjeev Banerjee, Carnegie Mellon University 162 | E<lt>satanjeev@cmu.eduE<gt> 163 | 164 | Amruta Purandare, University of Pittsburgh 165 | E<lt>amruta@cs.pitt.eduE<gt> 166 | 167 | Bridget Thomson-McInnes, University of Minnesota Twin Cities 168 | E<lt>bthompson@d.umn.eduE<gt> 169 | 170 | Saiyam Kohli, University of Minnesota Duluth 171 | E<lt>kohli003@d.umn.eduE<gt> 172 | 173 | =head1 HISTORY 174 | 175 | Last updated: $Id: twotailed.pm,v 1.13 2008/03/26 17:21:19 tpederse Exp $ 176 | 177 | =head1 BUGS 178 | 179 | 180 | =head1 SEE ALSO 181 | 182 | @inproceedings{Pedersen96, 183 | author = {Pedersen, T.}, 184 | title = {Fishing For Exactness}, 185 | booktitle = {Proceedings of the South Central SAS User's 186 | Group (SCSUG-96) Conference}, 187 | year = {1996}, 188 | pages = {188--200}, 189 | month ={October}, 190 | address = {Austin, TX}, 191 | url = L} 192 | 193 | L 194 | 195 | L 196 | 197 | 198 | =head1 COPYRIGHT 199 | 200 | Copyright (C) 2000-2006, Ted Pedersen, Satanjeev Banerjee, Amruta 201 | Purandare, Bridget Thomson-McInnes and Saiyam Kohli 202 | 203 | This program is free software; you can redistribute it and/or modify it 204 | under the terms of the GNU General Public License as published by the Free 205 | Software Foundation; either version 2 of the License, or (at your option) 206 | any later version. 207 | 208 | This program is distributed in the hope that it will be useful, but 209 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 210 | or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 211 | for more details. 
212 | 213 | You should have received a copy of the GNU General Public License along 214 | with this program; if not, write to 215 | 216 | The Free Software Foundation, Inc., 217 | 59 Temple Place - Suite 330, 218 | Boston, MA 02111-1307, USA. 219 | 220 | Note: a copy of the GNU General Public License is available on the web 221 | at L and is included in this 222 | distribution as GPL.txt. 223 | 224 | =cut -------------------------------------------------------------------------------- /geneSCF-master-v1.0/class/lib/Text/NSP/Measures/2D/MI/tmi.pm: -------------------------------------------------------------------------------- 1 | =head1 NAME 2 | 3 | Text::NSP::Measures::2D::MI::tmi - Perl module that implements True Mutual 4 | Information. 5 | 6 | =head1 SYNOPSIS 7 | 8 | =head3 Basic Usage 9 | 10 | use Text::NSP::Measures::2D::MI::tmi; 11 | 12 | my $npp = 60; my $n1p = 20; my $np1 = 20; my $n11 = 10; 13 | 14 | $tmi_value = calculateStatistic( n11=>$n11, 15 | n1p=>$n1p, 16 | np1=>$np1, 17 | npp=>$npp); 18 | 19 | if( ($errorCode = getErrorCode())) 20 | { 21 | print STDERR $errorCode." - ".getErrorMessage()."\n"; 22 | } 23 | else 24 | { 25 | print getStatisticName."value for bigram is ".$tmi_value."\n"; 26 | } 27 | 28 | =head1 DESCRIPTION 29 | 30 | Assume that the frequency count data associated with a bigram 31 | is stored in a 2x2 contingency table: 32 | 33 | word2 ~word2 34 | word1 n11 n12 | n1p 35 | ~word1 n21 n22 | n2p 36 | -------------- 37 | np1 np2 npp 38 | 39 | where n11 is the number of times <word1><word2> occur together, and 40 | n12 is the number of times <word1> occurs with some word other than 41 | word2, and n1p is the number of times in total that word1 occurs as 42 | the first word in a bigram. 
The expected values for the internal cells are calculated by taking the
product of their associated marginals and dividing by the sample size,
for example:

          np1 * n1p
   m11=   ---------
             npp

True Mutual Information (tmi) is defined as the weighted average of the
Pointwise mutual informations for all the observed and expected value pairs.

 tmi = [n11/npp * log(n11/m11) + n12/npp * log(n12/m12) +
        n21/npp * log(n21/m21) + n22/npp * log(n22/m22)]


 PMI =   log (n11/m11)

=head2 Methods

=over

=cut

package Text::NSP::Measures::2D::MI::tmi;


use Text::NSP::Measures::2D::MI;
use strict;
use Carp;
use warnings;
no warnings 'redefine';
require Exporter;

our ($VERSION, @EXPORT, @ISA);

@ISA  = qw(Exporter);

@EXPORT = qw(initializeStatistic calculateStatistic
             getErrorCode getErrorMessage getStatisticName);

$VERSION = '0.97';


=item calculateStatistic() - This method calculates the tmi value

INPUT PARAMS  : %values        .. Hash of the count values (n11, n1p,
                                  np1, npp) computed by the
                                  count.pl program.

RETURN VALUES : $tmi           .. TMI value for this bigram, or 0 when
                                  the count values are rejected.

=cut

sub calculateStatistic
{
  my %values = @_;

  # getValues() validates the counts and returns false on an error or on
  # inconsistent values; in that case the statistic is reported as 0.
  # The observed ($nXY), expected ($mXY) and total ($npp) values used below
  # are not declared here - presumably package variables exported by
  # Text::NSP::Measures::2D::MI and set by this call.
  return(0) unless Text::NSP::Measures::2D::MI::getValues(\%values);

  my $tmi = 0;

  # One term per cell of the 2x2 table, in the order n11, n12, n21, n22:
  # (observed / npp) * computePMI(observed, expected) / log 2.
  foreach my $cell ( [$n11, $m11], [$n12, $m12], [$n21, $m21], [$n22, $m22] )
  {
    my ($observed, $expected) = @$cell;
    $tmi += $observed/$npp * Text::NSP::Measures::2D::MI::computePMI( $observed, $expected )/ log 2;
  }

  return ($tmi);
}


=item getStatisticName() - Returns the name of this statistic

INPUT PARAMS  : none

RETURN VALUES : $name      .. Name of the measure.

=cut

sub getStatisticName
{
  return "True Mutual Information";
}



1;
__END__


=back

=head1 AUTHOR

Ted Pedersen,                University of Minnesota Duluth
                             E<lt>tpederse@d.umn.eduE<gt>

Satanjeev Banerjee,          Carnegie Mellon University
                             E<lt>satanjeev@cmu.eduE<gt>

Amruta Purandare,            University of Pittsburgh
                             E<lt>amruta@cs.pitt.eduE<gt>

Bridget Thomson-McInnes,     University of Minnesota Twin Cities
                             E<lt>bthompson@d.umn.eduE<gt>

Saiyam Kohli,                University of Minnesota Duluth
                             E<lt>kohli003@d.umn.eduE<gt>

=head1 HISTORY

Last updated: $Id: tmi.pm,v 1.23 2008/03/26 17:20:28 tpederse Exp $

=head1 BUGS


=head1 SEE ALSO

  @inproceedings{moore:2004:EMNLP,
                author = {Moore, Robert C.},
                title = {On Log-Likelihood-Ratios and the Significance of Rare
                         Events },
                booktitle = {Proceedings of EMNLP 2004},
                editor = {Dekang Lin and Dekai Wu},
                year = 2004,
                month = {July},
                address = {Barcelona, Spain},
                publisher = {Association for Computational Linguistics},
                pages = {333--340}
                url = L}

L

L


=head1 COPYRIGHT

Copyright (C) 2000-2006, Ted Pedersen, Satanjeev Banerjee, Amruta
Purandare, Bridget Thomson-McInnes and Saiyam Kohli

This program is free software; you can redistribute it and/or modify it 194 | under the terms of the GNU General Public License as published by the Free 195 | Software Foundation; either version 2 of the License, or (at your option) 196 | any later version. 197 | 198 | This program is distributed in the hope that it will be useful, but 199 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 200 | or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 201 | for more details. 202 | 203 | You should have received a copy of the GNU General Public License along 204 | with this program; if not, write to 205 | 206 | The Free Software Foundation, Inc., 207 | 59 Temple Place - Suite 330, 208 | Boston, MA 02111-1307, USA. 209 | 210 | Note: a copy of the GNU General Public License is available on the web 211 | at L and is included in this 212 | distribution as GPL.txt. 213 | 214 | =cut -------------------------------------------------------------------------------- /geneSCF-master-v1.0/class/lib/Text/NSP/Measures/2D/Fisher/left.pm: -------------------------------------------------------------------------------- 1 | =head1 NAME 2 | 3 | Text::NSP::Measures::2D::Fisher::left - Perl module implementation of the left sided 4 | Fisher's exact test. 5 | 6 | =head1 SYNOPSIS 7 | 8 | =head3 Basic Usage 9 | 10 | use Text::NSP::Measures::2D::Fisher::left; 11 | 12 | my $npp = 60; my $n1p = 20; my $np1 = 20; my $n11 = 10; 13 | 14 | $left_value = calculateStatistic( n11=>$n11, 15 | n1p=>$n1p, 16 | np1=>$np1, 17 | npp=>$npp); 18 | 19 | if( ($errorCode = getErrorCode())) 20 | { 21 | print STDERR $errorCode." 
- ".getErrorMessage(); 22 | } 23 | else 24 | { 25 | print getStatisticName."value for bigram is ".$left_value; 26 | } 27 | 28 | 29 | =head1 DESCRIPTION 30 | 31 | Assume that the frequency count data associated with a bigram 32 | is stored in a 2x2 contingency table: 33 | 34 | word2 ~word2 35 | word1 n11 n12 | n1p 36 | ~word1 n21 n22 | n2p 37 | -------------- 38 | np1 np2 npp 39 | 40 | where n11 is the number of times <word1><word2> occur together, and 41 | n12 is the number of times <word1> occurs with some word other than 42 | word2, and n1p is the number of times in total that word1 occurs as 43 | the first word in a bigram. 44 | 45 | The fishers exact tests are calculated by fixing the marginal totals 46 | and computing the hypergeometric probabilities for all the possible 47 | contingency tables. 48 | 49 | A left sided test is calculated by adding the probabilities of all 50 | the possible two by two contingency tables formed by fixing the 51 | marginal totals and changing the value of n11 to less than or equal to 52 | the given value. A left sided Fisher's Exact Test tells us how likely it is to 53 | randomly sample a table where n11 is no greater than observed. In other words, 54 | it tells us how likely it is to sample an observation where the two words 55 | are no more dependent than currently observed. 56 | 57 | =head2 Methods 58 | 59 | =over 60 | 61 | =cut 62 | 63 | 64 | package Text::NSP::Measures::2D::Fisher::left; 65 | 66 | 67 | use Text::NSP::Measures::2D::Fisher; 68 | use strict; 69 | use Carp; 70 | use warnings; 71 | no warnings 'redefine'; 72 | require Exporter; 73 | 74 | our ($VERSION, @EXPORT, @ISA); 75 | 76 | @ISA = qw(Exporter); 77 | 78 | @EXPORT = qw(initializeStatistic calculateStatistic 79 | getErrorCode getErrorMessage getStatisticName); 80 | 81 | $VERSION = '0.97'; 82 | 83 | 84 | =item calculateStatistic() - This method computes the left sided Fishers 85 | exact test. 86 | 87 | INPUT PARAMS : $count_values .. 
Hash of the named count values (n11, n1p,
np1, npp) computed by the
count.pl program.

RETURN VALUES : $left          .. Left Fisher value.

=cut

# Left-sided Fisher's exact test: the summed probability of every table
# (marginal totals fixed) whose n11 does not exceed the observed n11.
# Returns nothing (undef in scalar context) when getValues() or
# computeDistribution() reports a failure.
sub calculateStatistic
{
  my %values = @_;

  # False from getValues() means the counts were missing or inconsistent.
  # $n11, $n1p, $np1 and $npp are not declared here - presumably package
  # variables exported by Text::NSP::Measures::2D::Fisher.
  return
    unless Text::NSP::Measures::2D::Fisher::getValues(\%values);

  # Sum from the smallest feasible n11, max(0, n1p + np1 - npp), up to the
  # observed n11.
  my $final_limit = $n11;
  my $n11_start   = $n1p + $np1 - $npp;
  $n11_start = 0 if $n11_start < 0;

  my $probabilities =
    Text::NSP::Measures::2D::Fisher::computeDistribution($n11_start, $final_limit);
  return unless $probabilities;

  # Values are exponentiated before summing, i.e. treated as
  # log-probabilities.  Keys above the observed n11 are left out.
  my $leftfisher = 0;
  foreach my $key_n11 (sort { $a <=> $b }
                       grep { $_ <= $final_limit } keys %$probabilities)
  {
    $leftfisher += exp($probabilities->{$key_n11});
  }

  return $leftfisher;
}


=item getStatisticName()

Returns the name of this statistic

INPUT PARAMS  : none

RETURN VALUES : $name      .. Name of the measure.
145 | 146 | =cut 147 | 148 | sub getStatisticName 149 | { 150 | return "Left Fisher"; 151 | } 152 | 153 | 154 | 155 | 1; 156 | __END__ 157 | 158 | =back 159 | 160 | =head1 AUTHOR 161 | 162 | Ted Pedersen, University of Minnesota Duluth 163 | E<lt>tpederse@d.umn.eduE<gt> 164 | 165 | Satanjeev Banerjee, Carnegie Mellon University 166 | E<lt>satanjeev@cmu.eduE<gt> 167 | 168 | Amruta Purandare, University of Pittsburgh 169 | E<lt>amruta@cs.pitt.eduE<gt> 170 | 171 | Bridget Thomson-McInnes, University of Minnesota Twin Cities 172 | E<lt>bthompson@d.umn.eduE<gt> 173 | 174 | Saiyam Kohli, University of Minnesota Duluth 175 | E<lt>kohli003@d.umn.eduE<gt> 176 | 177 | =head1 HISTORY 178 | 179 | Last updated: $Id: left.pm,v 1.12 2006/06/21 11:10:52 saiyam_kohli Exp $ 180 | 181 | =head1 BUGS 182 | 183 | 184 | =head1 SEE ALSO 185 | 186 | @inproceedings{Pedersen96, 187 | author = {Pedersen, T.}, 188 | title = {Fishing For Exactness}, 189 | booktitle = {Proceedings of the South Central SAS User's 190 | Group (SCSUG-96) Conference}, 191 | year = {1996}, 192 | pages = {188--200}, 193 | month ={October}, 194 | address = {Austin, TX}, 195 | url = L} 196 | 197 | L 198 | 199 | L 200 | 201 | 202 | =head1 COPYRIGHT 203 | 204 | Copyright (C) 2000-2006, Ted Pedersen, Satanjeev Banerjee, Amruta 205 | Purandare, Bridget Thomson-McInnes and Saiyam Kohli 206 | 207 | This program is free software; you can redistribute it and/or modify it 208 | under the terms of the GNU General Public License as published by the Free 209 | Software Foundation; either version 2 of the License, or (at your option) 210 | any later version. 211 | 212 | This program is distributed in the hope that it will be useful, but 213 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 214 | or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 215 | for more details. 
216 | 217 | You should have received a copy of the GNU General Public License along 218 | with this program; if not, write to 219 | 220 | The Free Software Foundation, Inc., 221 | 59 Temple Place - Suite 330, 222 | Boston, MA 02111-1307, USA. 223 | 224 | Note: a copy of the GNU General Public License is available on the web 225 | at L and is included in this 226 | distribution as GPL.txt. 227 | 228 | =cut -------------------------------------------------------------------------------- /geneSCF-master-v1.0/class/lib/Text/NSP/Measures/2D/Fisher2/twotailed.pm: -------------------------------------------------------------------------------- 1 | =head1 NAME 2 | 3 | Text::NSP::Measures::2D::Fisher2::twotailed - Perl module implementation of the two-sided 4 | Fisher's exact test (Deprecated). 5 | 6 | =head1 SYNOPSIS 7 | 8 | =head3 Basic Usage 9 | 10 | use Text::NSP::Measures::2D::Fisher2::twotailed; 11 | 12 | my $npp = 60; my $n1p = 20; my $np1 = 20; my $n11 = 10; 13 | 14 | $twotailed_value = calculateStatistic( n11=>$n11, 15 | n1p=>$n1p, 16 | np1=>$np1, 17 | npp=>$npp); 18 | 19 | if( ($errorCode = getErrorCode())) 20 | { 21 | print STDERR $errorCode." - ".getErrorMessage(); 22 | } 23 | else 24 | { 25 | print getStatisticName."value for bigram is ".$twotailed_value; 26 | } 27 | 28 | 29 | =head1 DESCRIPTION 30 | 31 | This module provides a naive implementation of the fishers twotailed 32 | exact tests. That is the implementation does not have any 33 | optimizations for performance. This will compute the factorials and 34 | the hypergeometric measures using direct multiplications. 35 | 36 | This measure should be used if you need exact values without any 37 | rounding errors, and you are not worried about the performance of 38 | the measure, otherwise use the implementations under the 39 | Text::NSP::Measures::2D::Fisher module. To use this implementation, 40 | you will have to specify the entire module name. 
Usage: 41 | 42 | statistic.pl Text::NSP::Measures::2D::Fisher2::twotailed dest.txt source.cnt 43 | 44 | Assume that the frequency count data associated with a bigram 45 | is stored in a 2x2 contingency table: 46 | 47 | word2 ~word2 48 | word1 n11 n12 | n1p 49 | ~word1 n21 n22 | n2p 50 | -------------- 51 | np1 np2 npp 52 | 53 | where n11 is the number of times <word1><word2> occur together, and 54 | n12 is the number of times <word1> occurs with some word other than 55 | word2, and n1p is the number of times in total that word1 occurs as 56 | the first word in a bigram. 57 | 58 | The fishers exact tests are calculated by fixing the marginal totals 59 | and computing the hypergeometric probabilities for all the possible 60 | contingency tables. 61 | 62 | A twotailed fishers test is calculated by adding the probabilities of 63 | all the contingency tables with probabilities less than or equal to the 64 | probability of the observed table. The twotailed fishers test tells us how 65 | likely it would be to observe a contingency table which is no more probable 66 | than the current table. 67 | 68 | =head2 Methods 69 | 70 | =over 71 | 72 | =cut 73 | 74 | package Text::NSP::Measures::2D::Fisher2::twotailed; 75 | 76 | 77 | use Text::NSP::Measures::2D::Fisher2; 78 | use strict; 79 | use Carp; 80 | use warnings; 81 | no warnings 'redefine'; 82 | require Exporter; 83 | 84 | our ($VERSION, @EXPORT, @ISA); 85 | 86 | @ISA = qw(Exporter); 87 | 88 | @EXPORT = qw(initializeStatistic calculateStatistic 89 | getErrorCode getErrorMessage getStatisticName); 90 | 91 | $VERSION = '0.97'; 92 | 93 | 94 | =item calculateStatistic() - This method computes the two-tailed Fishers 95 | exact test. 96 | 97 | INPUT PARAMS : %values .. Hash of the count values (n11, n1p, 98 | np1, npp) computed by the 99 | count.pl program. 100 | 101 | RETURN VALUES : $twotailed .. Twotailed Fisher value. 
=cut

# calculateStatistic( n11 => ..., n1p => ..., np1 => ..., npp => ... )
#
# Two-tailed Fisher's exact test (naive Fisher2 implementation).  With the
# marginal totals held fixed, the probabilities of every possible table that
# is no more probable than the observed table are added up.
#
# Returns the summed probability, or nothing (undef in scalar context) when
# getValues() or computeDistribution() reports a failure.
sub calculateStatistic
{
  my %values = @_;

  # getValues() checks the supplied counts and returns false when they are
  # missing or inconsistent.  The $n11, $n1p, $np1 and $npp used below are
  # not declared in this sub; presumably they are package variables exported
  # by Text::NSP::Measures::2D::Fisher2 and filled in by this call.
  if( !(Text::NSP::Measures::2D::Fisher2::getValues(\%values)) )
  {
    return;
  }

  # Feasible range of n11 for the given marginals:
  #   max(0, n1p + np1 - npp) .. min(n1p, np1)
  my $final_limit = ($n1p < $np1) ? $n1p : $np1;

  # Remember the observed n11 before computeDistribution() runs (the
  # original code took the same precaution).
  my $n11_org = $n11;

  my $n11_start = $n1p + $np1 - $npp;
  if($n11_start < 0)
  {
    $n11_start = 0;
  }

  my $probabilities =
    Text::NSP::Measures::2D::Fisher2::computeDistribution($n11_start, $final_limit);
  if( !$probabilities )
  {
    return;
  }

  # The distribution values are summed as they are, i.e. handled as plain
  # probabilities.  Tables that are mathematically as probable as the
  # observed one (for instance its mirror image in the other tail) can come
  # out a few ulps larger because of floating-point rounding, and a strict
  # comparison would then drop them.  Allow the same relative tolerance that
  # R's fisher.test uses (1 + 1e-7).
  my $cutoff = $probabilities->{$n11_org} * (1 + 1e-7);

  my $ttfisher = 0;

  # Ascending order adds the smallest terms first (better accuracy) and lets
  # the scan stop at the first value above the cutoff.
  foreach my $value (sort { $a <=> $b } values %$probabilities)
  {
    last if $value > $cutoff;
    $ttfisher += $value;
  }

  return $ttfisher;
}


=item getStatisticName()

Returns the name of this statistic

INPUT PARAMS  : none

RETURN VALUES : $name      .. Name of the measure.
158 | 159 | =cut 160 | 161 | sub getStatisticName 162 | { 163 | return "Two Tailed Fisher"; 164 | } 165 | 166 | 167 | 168 | 1; 169 | __END__ 170 | 171 | =back 172 | 173 | =head1 AUTHOR 174 | 175 | Ted Pedersen, University of Minnesota Duluth 176 | E<lt>tpederse@d.umn.eduE<gt> 177 | 178 | Satanjeev Banerjee, Carnegie Mellon University 179 | E<lt>satanjeev@cmu.eduE<gt> 180 | 181 | Amruta Purandare, University of Pittsburgh 182 | E<lt>amruta@cs.pitt.eduE<gt> 183 | 184 | Bridget Thomson-McInnes, University of Minnesota Twin Cities 185 | E<lt>bthompson@d.umn.eduE<gt> 186 | 187 | Saiyam Kohli, University of Minnesota Duluth 188 | E<lt>kohli003@d.umn.eduE<gt> 189 | 190 | =head1 HISTORY 191 | 192 | Last updated: $Id: twotailed.pm,v 1.10 2008/03/26 17:24:15 tpederse Exp $ 193 | 194 | =head1 BUGS 195 | 196 | 197 | =head1 SEE ALSO 198 | 199 | L 200 | 201 | L 202 | 203 | 204 | =head1 COPYRIGHT 205 | 206 | Copyright (C) 2000-2006, Ted Pedersen, Satanjeev Banerjee, Amruta 207 | Purandare, Bridget Thomson-McInnes and Saiyam Kohli 208 | 209 | This program is free software; you can redistribute it and/or modify it 210 | under the terms of the GNU General Public License as published by the Free 211 | Software Foundation; either version 2 of the License, or (at your option) 212 | any later version. 213 | 214 | This program is distributed in the hope that it will be useful, but 215 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 216 | or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 217 | for more details. 218 | 219 | You should have received a copy of the GNU General Public License along 220 | with this program; if not, write to 221 | 222 | The Free Software Foundation, Inc., 223 | 59 Temple Place - Suite 330, 224 | Boston, MA 02111-1307, USA. 225 | 226 | Note: a copy of the GNU General Public License is available on the web 227 | at L and is included in this 228 | distribution as GPL.txt. 
229 | 230 | =cut 231 | -------------------------------------------------------------------------------- /geneSCF-master-v1.0/class/lib/Text/NSP/Measures/3D/MI/tmi.pm: -------------------------------------------------------------------------------- 1 | =head1 NAME 2 | 3 | Text::NSP::Measures::3D::MI::tmi - Perl implementation for True Mutual 4 | Information for trigrams. 5 | 6 | =head1 SYNOPSIS 7 | 8 | =head3 Basic Usage 9 | 10 | use Text::NSP::Measures::3D::MI::tmi; 11 | 12 | $tmi_value = calculateStatistic( n111=>10, 13 | n1pp=>40, 14 | np1p=>45, 15 | npp1=>42, 16 | n11p=>20, 17 | n1p1=>23, 18 | np11=>21, 19 | nppp=>100); 20 | 21 | if( ($errorCode = getErrorCode())) 22 | { 23 | print STDERR $errorCode." - ".getErrorMessage()."\n"; 24 | } 25 | else 26 | { 27 | print getStatisticName."value for trigram is ".$tmi_value."\n"; 28 | } 29 | 30 | =head1 DESCRIPTION 31 | 32 | True Mutual Information (tmi) is defined as the weighted average of the 33 | pointwise mutual informations for all the observed and expected value pairs. 34 | 35 | tmi = [n111/nppp * log(n111/m111) + n112/nppp * log(n112/m112) + 36 | n121/nppp * log(n121/m121) + n122/nppp * log(n122/m122) + 37 | n211/nppp * log(n211/m211) + n212/nppp * log(n212/m212) + 38 | n221/nppp * log(n221/m221) + n222/nppp * log(n222/m222)] 39 | 40 | PMI = log (n111/m111) 41 | 42 | Here n111 represents the observed value for the cell (1,1,1) and m111 43 | represents the expected value for that cell. 
The expected values for
the internal cells are calculated by taking the product of their
associated marginals and dividing by the sample size, for example:

            n1pp * np1p * npp1
   m111=   --------------------
                  nppp

=head2 Methods

=over

=cut


package Text::NSP::Measures::3D::MI::tmi;


use Text::NSP::Measures::3D::MI;
use strict;
use Carp;
use warnings;
no warnings 'redefine';
require Exporter;

our ($VERSION, @EXPORT, @ISA);

@ISA  = qw(Exporter);

@EXPORT = qw(initializeStatistic calculateStatistic
             getErrorCode getErrorMessage getStatisticName);

$VERSION = '0.97';


=item calculateStatistic() - This method calculates
the tmi value

INPUT PARAMS  : %values        .. Hash of the count values (n111, n1pp,
                                  np1p, npp1, n11p, n1p1, np11, nppp)
                                  computed by the count.pl program.

RETURN VALUES : $tmi           .. TMI value for this trigram, or 0 when
                                  the count values are rejected.

=cut

sub calculateStatistic
{
  my %values = @_;

  # getValues() validates the counts and returns false on an error or on
  # inconsistent values; in that case the statistic is reported as 0.
  # The observed ($nXYZ), expected ($mXYZ) and total ($nppp) values used
  # below are not declared here - presumably package variables exported by
  # Text::NSP::Measures::3D::MI and set by this call.
  return(0) unless Text::NSP::Measures::3D::MI::getValues(\%values);

  my $tmi = 0;

  # One term per cell of the 2x2x2 table, in the order
  # 111, 112, 121, 122, 211, 212, 221, 222:
  # (observed / nppp) * computePMI(observed, expected) / log 2.
  my @cells = ( [$n111, $m111], [$n112, $m112],
                [$n121, $m121], [$n122, $m122],
                [$n211, $m211], [$n212, $m212],
                [$n221, $m221], [$n222, $m222] );

  foreach my $cell (@cells)
  {
    my ($observed, $expected) = @$cell;
    $tmi += $observed/$nppp * Text::NSP::Measures::3D::MI::computePMI( $observed, $expected )/ log 2;
  }

  return ($tmi);
}


=item getStatisticName() - Returns the name of this statistic

INPUT PARAMS  : none

RETURN VALUES : $name      .. Name of the measure.

=cut

sub getStatisticName
{
  return "True Mutual Information";
}



1;
__END__

=back

=head1 AUTHOR

Ted Pedersen,                University of Minnesota Duluth
                             E<lt>tpederse@d.umn.eduE<gt>

Satanjeev Banerjee,          Carnegie Mellon University
                             E<lt>satanjeev@cmu.eduE<gt>

Amruta Purandare,            University of Pittsburgh
                             E<lt>amruta@cs.pitt.eduE<gt>

Bridget Thomson-McInnes,     University of Minnesota Twin Cities
                             E<lt>bthompson@d.umn.eduE<gt>

Saiyam Kohli,                University of Minnesota Duluth
                             E<lt>kohli003@d.umn.eduE<gt>

=head1 HISTORY

Last updated: $Id: tmi.pm,v 1.10 2006/06/21 11:10:53 saiyam_kohli Exp $

=head1 BUGS


=head1 SEE ALSO

  @inproceedings{moore:2004:EMNLP,
                author = {Moore, Robert C.},
                title = {On Log-Likelihood-Ratios and the Significance of Rare
                         Events },
                booktitle = {Proceedings of EMNLP 2004},
                editor = {Dekang Lin and Dekai Wu},
year = 2004, 172 | month = {July}, 173 | address = {Barcelona, Spain}, 174 | publisher = {Association for Computational Linguistics}, 175 | pages = {333--340} 176 | url = L} 177 | 178 | L 179 | 180 | L 181 | 182 | 183 | =head1 COPYRIGHT 184 | 185 | Copyright (C) 2000-2006, Ted Pedersen, Satanjeev Banerjee, Amruta 186 | Purandare, Bridget Thomson-McInnes and Saiyam Kohli 187 | 188 | This program is free software; you can redistribute it and/or modify it 189 | under the terms of the GNU General Public License as published by the Free 190 | Software Foundation; either version 2 of the License, or (at your option) 191 | any later version. 192 | 193 | This program is distributed in the hope that it will be useful, but 194 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 195 | or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 196 | for more details. 197 | 198 | You should have received a copy of the GNU General Public License along 199 | with this program; if not, write to 200 | 201 | The Free Software Foundation, Inc., 202 | 59 Temple Place - Suite 330, 203 | Boston, MA 02111-1307, USA. 204 | 205 | Note: a copy of the GNU General Public License is available on the web 206 | at L and is included in this 207 | distribution as GPL.txt. 208 | 209 | =cut -------------------------------------------------------------------------------- /geneSCF-master-v1.0/class/lib/Text/NSP/Measures/2D/Fisher2/left.pm: -------------------------------------------------------------------------------- 1 | =head1 NAME 2 | 3 | Text::NSP::Measures::2D::Fisher2::left - Perl module implementation of the left sided 4 | Fisher's exact test (Deprecated). 
5 | 6 | =head1 SYNOPSIS 7 | 8 | =head3 Basic Usage 9 | 10 | use Text::NSP::Measures::2D::Fisher2::left; 11 | 12 | 13 | 14 | my $npp = 60; my $n1p = 20; my $np1 = 20; my $n11 = 10; 15 | 16 | $leftFisher_value = calculateStatistic( n11=>$n11, 17 | n1p=>$n1p, 18 | np1=>$np1, 19 | npp=>$npp); 20 | 21 | if( ($errorCode = getErrorCode())) 22 | { 23 | print STDERR $errorCode." - ".getErrorMessage(); 24 | } 25 | else 26 | { 27 | print getStatisticName."value for bigram is ".$leftFisher_value; 28 | } 29 | 30 | 31 | =head1 DESCRIPTION 32 | 33 | This module provides a naive implementation of the fishers left 34 | sided exact tests. That is the implementation does not have any 35 | optimizations for performance. This will compute the factorials and 36 | the hypergeometric measures using direct multiplications. 37 | 38 | This measure should be used if you need exact values without any 39 | rounding errors, and you are not worried about the performance of 40 | the measure, otherwise use the implementations under the 41 | Text::NSP::Measures::2D::Fisher module. To use this implementation, 42 | you will have to specify the entire module name. Usage: 43 | 44 | statistic.pl Text::NSP::Measures::2D::Fisher2::left dest.txt source.cnt 45 | 46 | Assume that the frequency count data associated with a bigram 47 | is stored in a 2x2 contingency table: 48 | 49 | word2 ~word2 50 | word1 n11 n12 | n1p 51 | ~word1 n21 n22 | n2p 52 | -------------- 53 | np1 np2 npp 54 | 55 | where n11 is the number of times <word1><word2> occur together, and 56 | n12 is the number of times <word1> occurs with some word other than 57 | word2, and n1p is the number of times in total that word1 occurs as 58 | the first word in a bigram. 

The fishers exact tests are calculated by fixing the marginal totals
and computing the hypergeometric probabilities for all the possible
contingency tables,

A left sided test is calculated by adding the probabilities of all
the possible two by two contingency tables formed by fixing the
marginal totals and changing the value of n11 to less than the given
value. A left sided Fisher's Exact Test tells us how likely it is to
randomly sample a table where n11 is less than observed. In other words,
it tells us how likely it is to sample an observation where the two words
are less dependent than currently observed.

=head2 Methods

=over

=cut


package Text::NSP::Measures::2D::Fisher2::left;


use Text::NSP::Measures::2D::Fisher2;
use strict;
use Carp;
use warnings;
no warnings 'redefine';
require Exporter;

our ($VERSION, @EXPORT, @ISA);

@ISA = qw(Exporter);

@EXPORT = qw(initializeStatistic calculateStatistic
             getErrorCode getErrorMessage getStatisticName);

$VERSION = '0.97';


=item calculateStatistic() - Computes the left sided Fisher's exact test
for one bigram.

INPUT PARAMS  : %values       .. Key/value list of the count values
                                 (n11, n1p, np1, npp), passed as
                                 shown in the SYNOPSIS.

RETURN VALUES : $left         .. Left Fisher value. Nothing is
                                 returned when the counts cannot be
                                 processed.

=cut

sub calculateStatistic
{
  my %values = @_;

  # getValues() is expected to fill in the shared count variables
  # ($n11, $n1p, $np1, $npp); a false result means the counts were
  # unusable, so there is nothing to compute.
  return unless Text::NSP::Measures::2D::Fisher2::getValues(\%values);

  # The summation stops at the observed n11.
  my $final_limit = $n11;

  # Smallest n11 that is possible with these marginal totals.
  my $lowest_n11 = $n1p + $np1 - $npp;
  $lowest_n11 = 0 if $lowest_n11 < 0;

  my $probabilities =
    Text::NSP::Measures::2D::Fisher2::computeDistribution($lowest_n11, $final_limit);
  return unless $probabilities;

  # Add up the probability of every table whose n11 does not exceed the
  # observed value, walking the tables in ascending n11 order.
  my $leftfisher = 0;
  for my $table_n11 (sort { $a <=> $b } keys %$probabilities)
  {
    last if $table_n11 > $final_limit;
    $leftfisher += $probabilities->{$table_n11};
  }

  return $leftfisher;
}


=item getStatisticName() - Returns the display name of this statistic

INPUT PARAMS  : none

RETURN VALUES : $name         .. Name of the measure.

=cut

sub getStatisticName
{
  my $name = "Left Fisher";
  return $name;
}



1;
__END__

=back

=head1 AUTHOR

Ted Pedersen,                University of Minnesota Duluth
                             E<lt>tpederse@d.umn.eduE<gt>

Satanjeev Banerjee,          Carnegie Mellon University
                             E<lt>satanjeev@cmu.eduE<gt>

Amruta Purandare,            University of Pittsburgh
                             E<lt>amruta@cs.pitt.eduE<gt>

Bridget Thomson-McInnes,     University of Minnesota Twin Cities
                             E<lt>bthompson@d.umn.eduE<gt>

Saiyam Kohli,                University of Minnesota Duluth
                             E<lt>kohli003@d.umn.eduE<gt>

=head1 HISTORY

Last updated: $Id: left.pm,v 1.10 2008/03/26 17:24:15 tpederse Exp $

=head1 BUGS


=head1 SEE ALSO

L

L


=head1 COPYRIGHT

Copyright (C) 2000-2006, Ted Pedersen, Satanjeev Banerjee, Amruta
Purandare, Bridget Thomson-McInnes and Saiyam Kohli

This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the Free
Software Foundation;
either version 2 of the License, or (at your option) 213 | any later version. 214 | 215 | This program is distributed in the hope that it will be useful, but 216 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 217 | or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 218 | for more details. 219 | 220 | You should have received a copy of the GNU General Public License along 221 | with this program; if not, write to 222 | 223 | The Free Software Foundation, Inc., 224 | 59 Temple Place - Suite 330, 225 | Boston, MA 02111-1307, USA. 226 | 227 | Note: a copy of the GNU General Public License is available on the web 228 | at L and is included in this 229 | distribution as GPL.txt. 230 | 231 | =cut 232 | -------------------------------------------------------------------------------- /geneSCF-master-v1.0/class/lib/Text/NSP/Measures/2D/CHI.pm: -------------------------------------------------------------------------------- 1 | =head1 NAME 2 | 3 | Text::NSP::Measures::2D::CHI - Perl module that provides error checks 4 | for the Pearson's chi squared, phi coefficient 5 | and the Tscore measures. 6 | 7 | =head1 SYNOPSIS 8 | 9 | =head3 Basic Usage 10 | 11 | use Text::NSP::Measures::2D::CHI::x2; 12 | 13 | my $npp = 60; my $n1p = 20; my $np1 = 20; my $n11 = 10; 14 | 15 | $x2_value = calculateStatistic( n11=>$n11, 16 | n1p=>$n1p, 17 | np1=>$np1, 18 | npp=>$npp); 19 | 20 | if( ($errorCode = getErrorCode())) 21 | { 22 | print STDERR $errorCode." - ".getErrorMessage()."\n""; 23 | } 24 | else 25 | { 26 | print getStatisticName."value for bigram is ".$x2_value."\n""; 27 | } 28 | 29 | =head1 DESCRIPTION 30 | 31 | This module is the base class for the Chi-squared and Phi coefficient 32 | measures. This module provides error checks specific for these measures, 33 | it also implements the computations that are common to these measures. 
34 | 35 | =over 36 | 37 | =item Pearson's Chi-Squared 38 | 39 | x2 = (n11 - m11)^2/m11 + (n12 - m12)^2/m12 + 40 | (n21 - m21)^2/m21 + (n22 - m22)^2/m22 41 | 42 | =item Phi Coefficient 43 | 44 | PHI^2 = ((n11 * n22) - (n12 * n21))^2/(n1p * np1 * np2 * n2p) 45 | 46 | =item T-Score 47 | 48 | tscore = (n11 - m11)/sqrt(n11) 49 | 50 | =back 51 | 52 | Note that the value of PHI^2 is equivalent to 53 | Pearson's Chi-Squared test divided by the sample size, that is: 54 | 55 | Chi-Squared = npp * PHI^2 56 | 57 | Although T-score seems quite different from the other two measures we 58 | have put it in the CHI family because like the other two measures it 59 | uses the difference between the observed and expected values and is also 60 | quite similar in ranking the bigrams. 61 | 62 | =over 63 | 64 | =cut 65 | 66 | 67 | package Text::NSP::Measures::2D::CHI; 68 | 69 | 70 | use Text::NSP::Measures::2D; 71 | use strict; 72 | use Carp; 73 | use warnings; 74 | # use subs(calculateStatistic); 75 | require Exporter; 76 | 77 | our ($VERSION, @EXPORT, @ISA); 78 | 79 | @ISA = qw(Exporter); 80 | 81 | @EXPORT = qw(initializeStatistic calculateStatistic 82 | getErrorCode getErrorMessage getStatisticName 83 | $n11 $n12 $n21 $n22 $m11 $m12 $m21 $m22 84 | $npp $np1 $np2 $n2p $n1p $errorCodeNumber 85 | $errorMessage); 86 | 87 | $VERSION = '1.03'; 88 | 89 | =item getValues() - This method calls the computeMarginalTotals(), 90 | computeObservedValues() and the computeExpectedValues() methods to 91 | compute the observed and expected values. It checks these values for 92 | any errors that might cause the PHI and x2 measures to fail. 93 | 94 | INPUT PARAMS : $count_values .. Reference of a hash containing 95 | the count values computed by the 96 | count.pl program. 97 | 98 | RETURN VALUES : 1/undef ..returns '1' to indicate success 99 | and an undefined(NULL) value to indicate 100 | failure.

=cut

sub getValues
{
  my ($values) = @_;

  # Run the three shared computation steps in order and give up as soon
  # as one of them reports failure.
  Text::NSP::Measures::2D::computeMarginalTotals($values) or return;
  Text::NSP::Measures::2D::computeObservedValues($values) or return;
  Text::NSP::Measures::2D::computeExpectedValues($values) or return;

  # A cell with a non-zero observed count must not have an expected
  # value of zero. Cells are checked in the order (1,1), (1,2), (2,1),
  # (2,2) and the first offending cell is the one reported.
  my @cells = (
    [ $n11, $m11, '(1,1)' ],
    [ $n12, $m12, '(1,2)' ],
    [ $n21, $m21, '(2,1)' ],
    [ $n22, $m22, '(2,2)' ],
  );

  for my $cell (@cells)
  {
    my ($observed, $expected, $position) = @$cell;
    next unless $observed;

    if ($expected == 0)
    {
      $errorMessage = "Expected value in cell $position must not be zero";
      $errorCodeNumber = 221;
      return;
    }
  }

  # Everything looks good so we can return 1
  return 1;
}




=item computeVal() - Computes the deviation in observed value with respect
to the expected values

INPUT PARAMS  : $n             ..Observed value
                $m             ..Expected value

RETURN VALUES : (n-m)^2/m      ..the squared difference between the
                                 observed and expected values, divided
                                 by the expected value (0 when the
                                 expected value is zero).

=cut

sub computeVal
{
  my ($n, $m) = @_;

  # A zero (or otherwise false) expected value contributes nothing; this
  # also keeps the division below from dividing by zero.
  return 0 unless $m;

  my $difference = $n - $m;
  return ($difference ** 2) / $m;
}



1;
__END__


=back

=head1 AUTHOR

Ted Pedersen,                University of Minnesota Duluth
                             E<lt>tpederse@d.umn.eduE<gt>

Satanjeev Banerjee,          Carnegie Mellon University
                             E<lt>satanjeev@cmu.eduE<gt>

Amruta Purandare,            University of Pittsburgh
                             E<lt>amruta@cs.pitt.eduE<gt>

Bridget Thomson-McInnes,     University of Minnesota Twin Cities
                             E<lt>bthompson@d.umn.eduE<gt>

Saiyam Kohli,                University of Minnesota Duluth
                             E<lt>kohli003@d.umn.eduE<gt>

=head1 HISTORY

Last updated: $Id: CHI.pm,v 1.14 2008/03/26 17:18:26 tpederse Exp $

=head1 BUGS


=head1 SEE ALSO

L

L


=head1 COPYRIGHT

Copyright (C) 2000-2006, Ted Pedersen, Satanjeev Banerjee, Amruta
Purandare, Bridget Thomson-McInnes and Saiyam Kohli

This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your option)
any later version.

This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License along
with this program; if not, write to

    The Free Software Foundation, Inc.,
    59 Temple Place - Suite 330,
    Boston, MA 02111-1307, USA.
250 | 251 | Note: a copy of the GNU General Public License is available on the web 252 | at L and is included in this 253 | distribution as GPL.txt. 254 | 255 | =cut -------------------------------------------------------------------------------- /geneSCF-master-v1.0/class/lib/Text/NSP/Measures/2D/MI/pmi.pm: -------------------------------------------------------------------------------- 1 | =head1 NAME 2 | 3 | Text::NSP::Measures::2D::MI::pmi - Perl module that implements Pointwise 4 | Mutual Information. 5 | 6 | =head1 SYNOPSIS 7 | 8 | =head3 Basic Usage 9 | 10 | use Text::NSP::Measures::2D::MI::pmi; 11 | 12 | my $npp = 60; my $n1p = 20; my $np1 = 20; my $n11 = 10; 13 | 14 | $pmi_value = calculateStatistic( n11=>$n11, 15 | n1p=>$n1p, 16 | np1=>$np1, 17 | npp=>$npp); 18 | 19 | if( ($errorCode = getErrorCode())) 20 | { 21 | print STDERR $errorCode." - ".getErrorMessage()."\n""; 22 | } 23 | else 24 | { 25 | print getStatisticName."value for bigram is ".$pmi_value."\n""; 26 | } 27 | 28 | =head1 DESCRIPTION 29 | 30 | Assume that the frequency count data associated with a bigram 31 | is stored in a 2x2 contingency table: 32 | 33 | word2 ~word2 34 | word1 n11 n12 | n1p 35 | ~word1 n21 n22 | n2p 36 | -------------- 37 | np1 np2 npp 38 | 39 | where n11 is the number of times occur together, and 40 | n12 is the number of times occurs with some word other than 41 | word2, and n1p is the number of times in total that word1 occurs as 42 | the first word in a bigram. 43 | 44 | The expected values for the internal cells are calculated by taking the 45 | product of their associated marginals and dividing by the sample size, 46 | for example: 47 | 48 | np1 * n1p 49 | m11= --------- 50 | npp 51 | 52 | Pointwise Mutual Information (pmi) is defined as the log of the deviation 53 | between the observed frequency of a bigram (n11) and the probability of 54 | that bigram if it were independent (m11). 
55 | 56 | PMI = log (n11/m11) 57 | 58 | The Pointwise Mutual Information tends to overestimate bigrams with low 59 | observed frequency counts. To prevent this sometimes a variation of pmi 60 | is used which increases the influence of the observed frequency. 61 | 62 | PMI = log((n11^$exp)/m11) 63 | 64 | The $exp is 1 by default, so by default the measure will compute the 65 | Pointwise Mutual Information for the given bigram. To use a variation of 66 | the measure, users can pass the $exp parameter using the --pmi_exp 67 | command line option in statistic.pl or by passing the $exp to the 68 | initializeStatistic() method from their program. 69 | 70 | The usage for statistic.pl is 71 | 72 | statistic.pl pmi out_pmi.stt out.cnt - for Point Wise Mutual Information 73 | $exp is 1 in this case. 74 | 75 | statistic.pl --pmi_exp 2 pmi out_pmi2.stt out.cnt - for the variant with 76 | $exp set to 2. 77 | 78 | =head2 Methods 79 | 80 | =over 81 | 82 | =cut 83 | 84 | 85 | package Text::NSP::Measures::2D::MI::pmi; 86 | 87 | 88 | use Text::NSP::Measures::2D::MI; 89 | use strict; 90 | use Carp; 91 | use warnings; 92 | no warnings 'redefine'; 93 | require Exporter; 94 | 95 | our ($VERSION, @EXPORT, @ISA, $exp); 96 | 97 | $exp = 1; 98 | 99 | @ISA = qw(Exporter); 100 | 101 | @EXPORT = qw(initializeStatistic calculateStatistic 102 | getErrorCode getErrorMessage getStatisticName); 103 | 104 | $VERSION = '0.97'; 105 | 106 | 107 | =item initializeStatistic() -Initialization of the pmi_exp parameter if required 108 | 109 | INPUT PARAMS : none 110 | 111 | RETURN VALUES : none 112 | 113 | =cut 114 | 115 | sub initializeStatistic 116 | { 117 | $exp = shift; 118 | } 119 | 120 | 121 | 122 | =item calculateStatistic() - This method calculates the pmi value 123 | 124 | INPUT PARAMS : $count_values .. Reference of a hash containing 125 | the count values computed by the 126 | count.pl program. 127 | 128 | RETURN VALUES : $pmi .. PMI value for this bigram. 

=cut

sub calculateStatistic
{
  my %values = @_;

  # getValues() is expected to fill in the shared observed ($n11) and
  # expected ($m11) values; a false result means the counts were
  # unusable, so nothing is returned.
  return unless Text::NSP::Measures::2D::MI::getValues(\%values);

  # Raise the observed count to the configured exponent before it is
  # compared with the expected count.
  my $weighted_n11 = $n11 ** $exp;
  my $raw_pmi = Text::NSP::Measures::2D::MI::computePMI($weighted_n11, $m11);

  # NOTE(review): presumably computePMI() returns a natural-log value, in
  # which case this rescales it to log base 2 - confirm against
  # computePMI().
  return $raw_pmi / log(2);
}



=item getStatisticName() - Returns the display name of this statistic

INPUT PARAMS  : none

RETURN VALUES : $name      .. Name of the measure.

=cut

sub getStatisticName
{
  my $name = "Pointwise Mutual Information";
  return $name;
}



1;
__END__


=back

=head1 AUTHOR

Ted Pedersen,                University of Minnesota Duluth
                             E<lt>tpederse@d.umn.eduE<gt>

Satanjeev Banerjee,          Carnegie Mellon University
                             E<lt>satanjeev@cmu.eduE<gt>

Amruta Purandare,            University of Pittsburgh
                             E<lt>amruta@cs.pitt.eduE<gt>

Bridget Thomson-McInnes,     University of Minnesota Twin Cities
                             E<lt>bthompson@d.umn.eduE<gt>

Saiyam Kohli,                University of Minnesota Duluth
                             E<lt>kohli003@d.umn.eduE<gt>

=head1 HISTORY

Last updated: $Id: pmi.pm,v 1.24 2008/03/26 17:20:28 tpederse Exp $

=head1 BUGS


=head1 SEE ALSO

  @inproceedings{ church89word,
    author = {Kenneth W. Church and Patrick Hanks},
    title = {Word association norms, mutual information, and Lexicography},
    booktitle = {Proceedings of the 27th.
Annual Meeting of the Association for Computational Linguistics}, 202 | publisher = {Association for Computational Linguistics}, 203 | address = {Vancouver, B.C.}, 204 | pages = {76--83}, 205 | year = {1989}, 206 | url = L } 207 | 208 | 209 | L 210 | 211 | L 212 | 213 | 214 | =head1 COPYRIGHT 215 | 216 | Copyright (C) 2000-2006, Ted Pedersen, Satanjeev Banerjee, Amruta 217 | Purandare, Bridget Thomson-McInnes and Saiyam Kohli 218 | 219 | This program is free software; you can redistribute it and/or modify it 220 | under the terms of the GNU General Public License as published by the Free 221 | Software Foundation; either version 2 of the License, or (at your option) 222 | any later version. 223 | 224 | This program is distributed in the hope that it will be useful, but 225 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 226 | or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 227 | for more details. 228 | 229 | You should have received a copy of the GNU General Public License along 230 | with this program; if not, write to 231 | 232 | The Free Software Foundation, Inc., 233 | 59 Temple Place - Suite 330, 234 | Boston, MA 02111-1307, USA. 235 | 236 | Note: a copy of the GNU General Public License is available on the web 237 | at L and is included in this 238 | distribution as GPL.txt. 239 | 240 | =cut -------------------------------------------------------------------------------- /geneSCF-master-v1.0/class/lib/Text/NSP/Measures/2D/MI/ll.pm: -------------------------------------------------------------------------------- 1 | =head1 NAME 2 | 3 | Text::NSP::Measures::2D::MI::ll - Perl module that implements Loglikelihood 4 | measure of association for bigrams. 
5 | 6 | =head1 SYNOPSIS 7 | 8 | =head3 Basic Usage 9 | 10 | use Text::NSP::Measures::2D::MI::ll; 11 | 12 | my $npp = 60; my $n1p = 20; my $np1 = 20; my $n11 = 10; 13 | 14 | $ll_value = calculateStatistic( n11=>$n11, 15 | n1p=>$n1p, 16 | np1=>$np1, 17 | npp=>$npp); 18 | 19 | if( ($errorCode = getErrorCode())) 20 | { 21 | print STDERR $errorCode." - ".getErrorMessage(); 22 | } 23 | else 24 | { 25 | print getStatisticName."value for bigram is ".$ll_value; 26 | } 27 | 28 | =head1 DESCRIPTION 29 | 30 | The log-likelihood ratio measures the deviation between the observed data 31 | and what would be expected if and were independent. The 32 | higher the score, the less evidence there is in favor of concluding that 33 | the words are independent. 34 | 35 | Assume that the frequency count data associated with a bigram 36 | as shown by a 2x2 contingency table: 37 | 38 | word2 ~word2 39 | word1 n11 n12 | n1p 40 | ~word1 n21 n22 | n2p 41 | -------------- 42 | np1 np2 npp 43 | 44 | where n11 is the number of times occur together, and 45 | n12 is the number of times occurs with some word other than 46 | word2, and n1p is the number of times in total that word1 occurs as 47 | the first word in a bigram. 48 | 49 | The expected values for the internal cells are calculated by taking the 50 | product of their associated marginals and dividing by the sample size, 51 | for example: 52 | 53 | np1 * n1p 54 | m11= --------- 55 | npp 56 | 57 | Then the deviation between observed and expected values for each internal 58 | cell is computed to arrive at the log-likelihood value. 
59 | 60 | Log-Likelihood = 2 * [n11 * log(n11/m11) + n12 * log(n12/m12) + 61 | n21 * log(n21/m21) + n22 * log(n22/m22)] 62 | 63 | =head2 Methods 64 | 65 | =over 66 | 67 | =cut 68 | 69 | 70 | package Text::NSP::Measures::2D::MI::ll; 71 | 72 | 73 | use Text::NSP::Measures::2D::MI; 74 | use strict; 75 | use Carp; 76 | use warnings; 77 | no warnings 'redefine'; 78 | require Exporter; 79 | 80 | our ($VERSION, @EXPORT, @ISA); 81 | 82 | @ISA = qw(Exporter); 83 | 84 | @EXPORT = qw(initializeStatistic calculateStatistic 85 | getErrorCode getErrorMessage getStatisticName); 86 | 87 | $VERSION = '0.97'; 88 | 89 | =item calculateStatistic() - This method calculates the ll value 90 | 91 | INPUT PARAMS : $count_values .. Reference of an hash containing 92 | the count values computed by the 93 | count.pl program. 94 | 95 | RETURN VALUES : $loglikelihood .. Loglikelihood value for this bigram. 96 | 97 | =cut 98 | 99 | sub calculateStatistic 100 | { 101 | my %values = @_; 102 | 103 | # computes and sets the observed and expected values from 104 | # the frequency combination values. returns 0 if there is an 105 | # error in the computation or the values are inconsistent. 106 | if( !Text::NSP::Measures::2D::MI::getValues(\%values) ) 107 | { 108 | return; 109 | } 110 | 111 | # Now for the actual calculation of Loglikelihood! 112 | my $logLikelihood = 0; 113 | 114 | # dont want ($nxy / $mxy) to be 0 or less! flag error if so! 115 | $logLikelihood += $n11 * Text::NSP::Measures::2D::MI::computePMI( $n11, $m11 ); 116 | $logLikelihood += $n12 * Text::NSP::Measures::2D::MI::computePMI( $n12, $m12 ); 117 | $logLikelihood += $n21 * Text::NSP::Measures::2D::MI::computePMI( $n21, $m21 ); 118 | $logLikelihood += $n22 * Text::NSP::Measures::2D::MI::computePMI( $n22, $m22 ); 119 | 120 | return ( 2 * $logLikelihood ); 121 | } 122 | 123 | 124 | =item getStatisticName() - Returns the name of this statistic 125 | 126 | INPUT PARAMS : none 127 | 128 | RETURN VALUES : $name .. Name of the measure. 

=cut

sub getStatisticName
{
  # Display name reported for this measure.
  my $name = "Log-likelihood";
  return $name;
}



1;
__END__


=back

=head1 AUTHOR

Ted Pedersen,                University of Minnesota Duluth
                             E<lt>tpederse@d.umn.eduE<gt>

Satanjeev Banerjee,          Carnegie Mellon University
                             E<lt>satanjeev@cmu.eduE<gt>

Amruta Purandare,            University of Pittsburgh
                             E<lt>amruta@cs.pitt.eduE<gt>

Bridget Thomson-McInnes,     University of Minnesota Twin Cities
                             E<lt>bthompson@d.umn.eduE<gt>

Saiyam Kohli,                University of Minnesota Duluth
                             E<lt>kohli003@d.umn.eduE<gt>

=head1 HISTORY

Last updated: $Id: ll.pm,v 1.23 2008/03/26 17:20:27 tpederse Exp $

=head1 BUGS


=head1 SEE ALSO

  @article{Dunning93,
    author = {Dunning, T.},
    title = {Accurate Methods for the Statistics of
             Surprise and Coincidence},
    journal = {Computational Linguistics},
    volume = {19},
    number = {1},
    year = {1993},
    pages = {61-74}
    url = L}

  @inproceedings{moore:2004:EMNLP,
    author = {Moore, Robert C.},
    title = {On Log-Likelihood-Ratios and the Significance of Rare
             Events },
    booktitle = {Proceedings of EMNLP 2004},
    editor = {Dekang Lin and Dekai Wu},
    year = 2004,
    month = {July},
    address = {Barcelona, Spain},
    publisher = {Association for Computational Linguistics},
    pages = {333--340}
    url = L}

L

L


=head1 COPYRIGHT

Copyright (C) 2000-2006, Ted Pedersen, Satanjeev Banerjee, Amruta
Purandare, Bridget Thomson-McInnes and Saiyam Kohli

This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your option)
any later version.

This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License along
with this program; if not, write to

    The Free Software Foundation, Inc.,
    59 Temple Place - Suite 330,
    Boston, MA 02111-1307, USA.

Note: a copy of the GNU General Public License is available on the web
at L and is included in this
distribution as GPL.txt.

=cut
--------------------------------------------------------------------------------
/geneSCF-master-v1.0/class/functional_class.pl:
--------------------------------------------------------------------------------
#!/usr/bin/perl

# functional_class.pl - functional enrichment of a gene list against one
# annotation database.
#
# Positional arguments (all six are required):
#   $ARGV[0]  identifier type of the input genes: "sym" (gene symbols)
#             or "gid" (Entrez Gene IDs)
#   $ARGV[1]  input gene list; the first tab-separated column is used
#   $ARGV[2]  output directory
#   $ARGV[3]  database: GO_all, GO_BP, GO_MF, GO_CC, KEGG, REACTOME or NCG
#   $ARGV[4]  number of background genes
#   $ARGV[5]  geneSCF directory holding class/lib and annotation
#
# Writes <output dir>/<input file name>_<database>_functional_classification.tsv

use strict;
use warnings;
use Cwd;
use lib "$ARGV[5]/class/lib";
use lib "$ARGV[5]/class//lib/Tie-IxHash-1.23";
use Tie::IxHash;
use Statistics::Multtest qw(bonferroni holm hommel hochberg BH BY qvalue);
use Statistics::Multtest qw(:all);
use Text::NSP::Measures::2D::Fisher::right;
use Number::FormatEng qw(:all);
use Data::Dumper;

##if(! defined @ARGV[0] && ! defined @ARGV[1] && ! defined @ARGV[2] || $ARGV[0] eq "-h")
##{
#print"Gene Ontology/Functional Classification for set of genes\n--------------------------------------------------------\nUsage: OntologyClass [gene_identifier] [list]\n\nidentifier: Gene identifier for given input ['sym' - without quotes for gene symbols and 'gid' - without quotes for Gene ID ].\nlist: Gene list for analysis.\n\nAuthor: Santhilal Subhash\nsanthilal.subhash\@gu.se\nLast Updated: 2013 July 14\n"
##print"Gene Ontology/Functional Classification for set of genes\n--------------------------------------------------------\nUsage: OntologyClass [identifier] [list] [OUTPATH]\n\nidentifier: Gene identifier for given input ['sym' - without quotes for gene symbols and 'gid' - without quotes for Gene ID ].\nlist: Gene list for analysis.\nOUTPATH: The path where output file should be stored (This script generates output named YOUR_INPUT_FILE_functional_classification.tsv in the mentioned path).\n\nAuthor: Santhilal Subhash\nsanthilal.subhash\@gu.se\nLast Updated: 2014 February 6\n"
##}

die "Usage: $0 <sym|gid> <gene_list> <output_dir> <database> <background_genes> <geneSCF_dir>\n"
    unless @ARGV >= 6;

my ($id_type, $input_file, $out_dir, $database, $background, $install_dir) = @ARGV;

# Annotation file for every supported database / identifier type pair.
my %annotation_file_for = (
    #### GeneOntology DB ####
    GO_all => {
        sym => 'gene_association.grouped.annotated140122_new.txt',
        gid => 'gene_association.grouped.annotated_RplcdIDs140122_new.txt',
    },
    GO_BP => {
        sym => 'gene_association.grouped.annotated140122_new_bp.txt',
        gid => 'gene_association.grouped.annotated_RplcdIDs140122_new_bp.txt',
    },
    GO_MF => {
        sym => 'gene_association.grouped.annotated140122_new_mf.txt',
        gid => 'gene_association.grouped.annotated_RplcdIDs140122_new_mf.txt',
    },
    GO_CC => {
        sym => 'gene_association.grouped.annotated140122_new_cc.txt',
        gid => 'gene_association.grouped.annotated_RplcdIDs140122_new_cc.txt',
    },
    #### KEGG DB ####
    KEGG => {
        sym => 'KEGG_pathway_updated130711_geneSym.txt',
        gid => 'KEGG_pathway_updated130711_geneID.txt',
    },
    #### REACTOME DB ####
    REACTOME => {
        sym => 'ReactomePathways_updated150605_geneSym.txt',
        gid => 'ReactomePathways_updated150605_RplcdIDs.txt',
    },
    #### NCG DB ####
    NCG => {
        sym => 'NCG4.0_annotation_Updated150605_geneSym.txt',
        gid => 'NCG4.0_annotation_Updated150605_RplcdIDs.txt',
    },
);

# Label reported in the run summary. The NCG/gid case used to be reported
# as "Gene Symbol"; the label now depends on the identifier type only.
my %id_label_for = (
    sym => 'Gene Symbol',
    gid => 'Entrez GeneID',
);

die "Unsupported database '$database' (expected one of: "
    . join(', ', sort keys %annotation_file_for) . ")\n"
    unless exists $annotation_file_for{$database};
die "Unsupported identifier type '$id_type' (expected 'sym' or 'gid')\n"
    unless exists $id_label_for{$id_type};

my $mytype          = $id_label_for{$id_type};
my $annotation_path = "$install_dir/annotation/$annotation_file_for{$database}{$id_type}";

open(my $annotation_fh, '<', $annotation_path)
    or die "Error opening annotation file $annotation_path: $!\n";
open(my $input_fh, '<', $input_file)
    or die "\n***\nError opening input file: $input_file ($!)\n***\n\n";

# The result file is named after the last path component of the input file.
my $input_name  = (split m{/}, $input_file)[-1];
my $result_file = "$out_dir/${input_name}_${database}_functional_classification.tsv";
open(my $result_fh, '>', $result_file)
    or die "Error opening output file $result_file: $!\n";

# One header per output column. The tenth column holds the Holm-adjusted
# values and used to be mislabelled as a second Hommel column.
my @columns = (
    'Genes',
    'Process',
    'GO:Class',
    'num_of_Genes',
    'gene_group',
    'percentage%',
    'P-value',
    'EASE (http://david.abcc.ncifcrf.gov/content.jsp?file=functional_annotation.html#fisher) ',
    'Benjamini and Hochberg (FDR)',
    'Holm step-down process',
    'Bonferroni single-step process',
    'Hommel singlewise process',
    'Hochberg step-up process',
    'Benjamini and Yekutieli',
);
print {$result_fh} join("\t", @columns), "\n";

# Annotation table: process name => [ class, comma-separated gene list ],
# kept in file order by Tie::IxHash.
tie my %kegg, 'Tie::IxHash';
my $annotation_text = do { local $/; <$annotation_fh> };
close($annotation_fh);
for my $line (split /\n/, defined $annotation_text ? $annotation_text : '')
{
    my ($process, $class, $genes) = split /\t/, $line;
    next unless defined $process;
    $kegg{$process} = [ defined $class ? $class : '', defined $genes ? $genes : '' ];
}

# Input genes, keyed by the first column of every input line.
my $input_text = do { local $/; <$input_fh> };
close($input_fh);
my @input_lines = split /\n/, defined $input_text ? $input_text : '';
my %bp;
for my $line (@input_lines)
{
    my ($gene) = split /\t/, $line;
    next unless defined $gene && length $gene;
    $bp{$gene} = $gene;
}

# Size of the input list as used by the statistics: the number of input
# lines, exactly as before.
my $gene_list = @input_lines;

my @pvalues;    # raw Fisher p-values, one per reported process
my @rows;       # the first eight output columns of every reported process

for my $process (keys %kegg)
{
    my ($class, $genes) = @{ $kegg{$process} };

    my @members = split /,/, $genes;
    my @hits    = grep { exists $bp{$_} } @members;
    next unless @hits;

    my $x = @hits;          ## input genes found in this process
    my $n = $gene_list;
    my $M = @members;       ## total genes in process
    my $N = $background;

    my $fisher_value = calculateStatistic( n11=>$x,   n1p=>$n, np1=>$x+$M, npp=>$N+$n );
    my $ease_value   = calculateStatistic( n11=>$x-1, n1p=>$n, np1=>$x+$M, npp=>$N+$n );

    push @pvalues, $fisher_value;

    my $percent   = ($x / $M) * 100;
    my $hit_genes = join ' ', map { "$_;" } @hits;
    push @rows, "$hit_genes\t$process\t$class\t$x\t$M\t$percent\t$fisher_value\t$ease_value\t";
}

# Multiple-testing corrections are only computed when at least one
# process was hit; otherwise there is nothing to correct or print.
if (@pvalues)
{
    my $bhres    = BH(\@pvalues);
    my $holmres  = holm(\@pvalues);
    my $bfres    = bonferroni(\@pvalues);
    my $hommel   = hommel(\@pvalues);
    my $hochberg = hochberg(\@pvalues);
    my $byres    = BY(\@pvalues);

    for my $i (0 .. $#rows)
    {
        print {$result_fh} $rows[$i],
            join("\t", $bhres->[$i], $holmres->[$i], $bfres->[$i],
                       $hommel->[$i], $hochberg->[$i], $byres->[$i]),
            "\n";
    }
}

close($result_fh) or die "Error writing output file $result_file: $!\n";

print "=================\nRun successful. Check your output directory $out_dir \n=================\n\nParameters used:\n\nbackground genes:\t$background\nIdentitiy:\t\t$mytype\nDatabase used:\t\t$database\nOutput file:\t\t$result_file\n\t\tWARNING: Your output is not sorted with P-val/FDR.\n\n\n---------------------\n\nAuthor: Santhilal Subhash\nsanthilal.subhash\@gu.se\nLast Updated: 2015 June 05\n";
--------------------------------------------------------------------------------
/geneSCF-master-v1.0/class/lib/Text/NSP/Measures/3D/MI/ll.pm:
--------------------------------------------------------------------------------
=head1 NAME

Text::NSP::Measures::3D::MI::ll - Perl module that implements Loglikelihood
                                  measure of association for trigrams.

=head1 SYNOPSIS

=head3 Basic Usage

  use Text::NSP::Measures::3D::MI::ll;

  $ll_value = calculateStatistic( n111=>10,
                                  n1pp=>40,
                                  np1p=>45,
                                  npp1=>42,
                                  n11p=>20,
                                  n1p1=>23,
                                  np11=>21,
                                  nppp=>100);

  if( ($errorCode = getErrorCode()))
  {
    print STDERR $errorCode."
- ".getErrorMessage()."\n"; 24 | } 25 | else 26 | { 27 | print getStatisticName."value for trigram is ".$ll_value."\n"; 28 | } 29 | 30 | 31 | =head1 DESCRIPTION 32 | 33 | The log-likelihood ratio measures the deviation between the observed data 34 | and what would be expected if word1, word2 and word3 were independent. 35 | The higher the score, the less evidence there is in favor of concluding that 36 | the words are independent. 37 | 38 | The expected values for the internal cells are calculated by taking the 39 | product of their associated marginals and dividing by the sample size, 40 | for example: 41 | 42 | n1pp * np1p * npp1 43 | m111= -------------------- 44 | nppp 45 | 46 | Then the deviation between observed and expected values for each internal 47 | cell is computed to arrive at the log-likelihood value. 48 | 49 | Log-Likelihood = 2 * [n111 * log(n111/m111) + n112 * log(n112/m112) + 50 | n121 * log(n121/m121) + n122 * log(n122/m122) + 51 | n211 * log(n211/m211) + n212 * log(n212/m212) + 52 | n221 * log(n221/m221) + n222 * log(n222/m222)] 53 | 54 | =over 55 | 56 | =cut 57 | 58 | 59 | package Text::NSP::Measures::3D::MI::ll; 60 | 61 | 62 | use Text::NSP::Measures::3D::MI; 63 | use strict; 64 | use Carp; 65 | use warnings; 66 | no warnings 'redefine'; 67 | require Exporter; 68 | 69 | our ($VERSION, @EXPORT, @ISA); 70 | 71 | @ISA = qw(Exporter); 72 | 73 | @EXPORT = qw(initializeStatistic calculateStatistic 74 | getErrorCode getErrorMessage getStatisticName); 75 | 76 | $VERSION = '0.97'; 77 | 78 | =item calculateStatistic($count_values) - This method calculates 79 | the ll value 80 | 81 | INPUT PARAMS : $count_values .. Reference of an hash containing 82 | the count values computed by the 83 | count.pl program. 84 | 85 | RETURN VALUES : $loglikelihood .. Loglikelihood value for this trigram. 86 | 87 | =cut 88 | 89 | sub calculateStatistic 90 | { 91 | my %values = @_; 92 | 93 | # computes and sets the observed and expected values from 94 | # the frequency combination values.
returns 0 if there is an 95 | # error in the computation or the values are inconsistent. 96 | if( !(Text::NSP::Measures::3D::MI::getValues(\%values)) ) { 97 | return; 98 | } 99 | 100 | # Now for the actual calculation of Loglikelihood! 101 | my $logLikelihood = 0; 102 | 103 | # dont want ($nxy / $mxy) to be 0 or less! flag error if so! 104 | $logLikelihood += $n111 * Text::NSP::Measures::3D::MI::computePMI( $n111, $m111 ); 105 | $logLikelihood += $n112 * Text::NSP::Measures::3D::MI::computePMI( $n112, $m112 ); 106 | $logLikelihood += $n121 * Text::NSP::Measures::3D::MI::computePMI( $n121, $m121 ); 107 | $logLikelihood += $n122 * Text::NSP::Measures::3D::MI::computePMI( $n122, $m122 ); 108 | $logLikelihood += $n211 * Text::NSP::Measures::3D::MI::computePMI( $n211, $m211 ); 109 | $logLikelihood += $n212 * Text::NSP::Measures::3D::MI::computePMI( $n212, $m212 ); 110 | $logLikelihood += $n221 * Text::NSP::Measures::3D::MI::computePMI( $n221, $m221 ); 111 | $logLikelihood += $n222 * Text::NSP::Measures::3D::MI::computePMI( $n222, $m222 ); 112 | 113 | return ( 2 * $logLikelihood ); 114 | } 115 | 116 | 117 | =item getStatisticName() - Returns the name of this statistic 118 | 119 | INPUT PARAMS : none 120 | 121 | RETURN VALUES : $name .. Name of the measure. 
122 | 123 | =cut 124 | 125 | sub getStatisticName 126 | { 127 | return "Loglikelihood"; 128 | } 129 | 130 | 131 | 132 | 1; 133 | __END__ 134 | 135 | 136 | =back 137 | 138 | =head1 AUTHOR 139 | 140 | Ted Pedersen, University of Minnesota Duluth 141 | Etpederse@d.umn.eduE 142 | 143 | Satanjeev Banerjee, Carnegie Mellon University 144 | Esatanjeev@cmu.eduE 145 | 146 | Amruta Purandare, University of Pittsburgh 147 | Eamruta@cs.pitt.eduE 148 | 149 | Bridget Thomson-McInnes, University of Minnesota Twin Cities 150 | Ebthomson@d.umn.eduE 151 | 152 | Saiyam Kohli, University of Minnesota Duluth 153 | Ekohli003@d.umn.eduE 154 | 155 | =head1 HISTORY 156 | 157 | Last updated: $Id: ll.pm,v 1.10 2011/12/23 21:59:33 btmcinnes Exp $ 158 | 159 | =head1 BUGS 160 | 161 | 162 | =head1 SEE ALSO 163 | 164 | @article{Dunning93, 165 | author = {Dunning, T.}, 166 | title = {Accurate Methods for the Statistics of 167 | Surprise and Coincidence}, 168 | journal = {Computational Linguistics}, 169 | volume = {19}, 170 | number = {1}, 171 | year = {1993}, 172 | pages = {61-74} 173 | url = L} 174 | 175 | @inproceedings{moore:2004:EMNLP, 176 | author = {Moore, Robert C.}, 177 | title = {On Log-Likelihood-Ratios and the Significance of Rare 178 | Events }, 179 | booktitle = {Proceedings of EMNLP 2004}, 180 | editor = {Dekang Lin and Dekai Wu}, 181 | year = 2004, 182 | month = {July}, 183 | address = {Barcelona, Spain}, 184 | publisher = {Association for Computational Linguistics}, 185 | pages = {333--340} 186 | url = L} 187 | 188 | L 189 | 190 | L 191 | 192 | 193 | =head1 COPYRIGHT 194 | 195 | Copyright (C) 2000-2011, Ted Pedersen, Satanjeev Banerjee, Amruta 196 | Purandare, Bridget Thomson-McInnes and Saiyam Kohli 197 | 198 | This program is free software; you can redistribute it and/or modify it 199 | under the terms of the GNU General Public License as published by the Free 200 | Software Foundation; either version 2 of the License, or (at your option) 201 | any later version. 
202 | 203 | This program is distributed in the hope that it will be useful, but 204 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 205 | or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 206 | for more details. 207 | 208 | You should have received a copy of the GNU General Public License along 209 | with this program; if not, write to 210 | 211 | The Free Software Foundation, Inc., 212 | 59 Temple Place - Suite 330, 213 | Boston, MA 02111-1307, USA. 214 | 215 | Note: a copy of the GNU General Public License is available on the web 216 | at L and is included in this 217 | distribution as GPL.txt. 218 | 219 | =cut 220 | -------------------------------------------------------------------------------- /geneSCF-master-v1.0/class/lib/Number/FormatEng.pm: -------------------------------------------------------------------------------- 1 | package Number::FormatEng; 2 | 3 | use warnings; 4 | use strict; 5 | use Carp; 6 | use POSIX; 7 | use Scalar::Util qw(looks_like_number); 8 | 9 | require Exporter; 10 | our @ISA = qw(Exporter); 11 | our @EXPORT_OK = qw(format_eng format_pref unformat_pref use_e_zero no_e_zero); 12 | our %EXPORT_TAGS = (all => \@EXPORT_OK); 13 | 14 | our $VERSION = '0.01'; 15 | 16 | my %prefix = ( 17 | '-8' => 'y', '8' => 'Y', 18 | '-7' => 'z', '7' => 'Z', 19 | '-6' => 'a', '6' => 'E', 20 | '-5' => 'f', '5' => 'P', 21 | '-4' => 'p', '4' => 'T', 22 | '-3' => 'n', '3' => 'G', 23 | '-2' => 'u', '2' => 'M', 24 | '-1' => 'm', '1' => 'k', 25 | '0' => '' , 26 | ); 27 | my %exponent = reverse %prefix; 28 | 29 | my $no_e_zero = 1; 30 | 31 | sub use_e_zero { 32 | $no_e_zero = 0; 33 | } 34 | 35 | sub no_e_zero { 36 | $no_e_zero = 1; 37 | } 38 | 39 | sub format_pref { 40 | return format_num(1, @_); 41 | } 42 | 43 | sub format_eng { 44 | return format_num(0, @_); 45 | } 46 | 47 | sub format_num { 48 | my $prefix_mode = shift; 49 | my $num = shift; 50 | 51 | my $name = ($prefix_mode) ? 
'format_pref' : 'format_eng'; 52 | 53 | # Check validity of input 54 | unless (defined $num) { 55 | croak("Error: $name requires numeric input. ", 56 | 'It seems like no input was provided or input was undefined'); 57 | } 58 | unless (looks_like_number($num)) { 59 | croak("Error: $name requires numeric input. '$num' is not numeric"); 60 | } 61 | 62 | if ($num == 0) { 63 | if ($prefix_mode or $no_e_zero) { 64 | return '0'; 65 | } 66 | else { 67 | return '0e0'; 68 | } 69 | } 70 | 71 | my $sign = ($num < 0) ? '-' : ''; 72 | $num = abs $num; 73 | 74 | if ($prefix_mode) { 75 | if ( ($num >= 1e27) or ($num <= 1e-25) ) { 76 | # switch to number exponent mode 77 | $prefix_mode = 0; 78 | } 79 | } 80 | 81 | my $e = floor( log($num) / log(1000) ); 82 | my $mult = 1000**$e; 83 | $num = adjust($num / $mult); 84 | if ($prefix_mode) { 85 | return $sign . $num . $prefix{$e}; 86 | } 87 | else { 88 | if ($no_e_zero and ($e == 0)) { 89 | return $sign . $num; 90 | } 91 | else { 92 | return $sign . $num . 'e' . 3*$e; 93 | } 94 | } 95 | } 96 | 97 | sub adjust { 98 | my $num = shift; 99 | if ($num < 1) { 100 | return 1; 101 | } 102 | elsif (($num < 10) and ($num > 9.999_999_999)) { 103 | return 10; 104 | } 105 | elsif (($num < 100) and ($num > 99.999_999_99)) { 106 | return 100; 107 | } 108 | else { 109 | return $num; 110 | } 111 | } 112 | 113 | sub unformat_pref { 114 | my ($num) = @_; 115 | 116 | # Check validity of input 117 | unless (defined $num) { 118 | croak('Error: unformat_pref requires input. ', 119 | 'It seems like no input was provided or input was undefined'); 120 | } 121 | 122 | # Trim leading and trailing whitespace 123 | $num =~ s/^\s+//; 124 | $num =~ s/\s+$//; 125 | 126 | unless (length $num) { 127 | croak('Error: unformat_pref requires input. 
', 128 | 'It seems like no input was provided'); 129 | } 130 | 131 | my $prefix = substr $num, -1; 132 | if (exists $exponent{$prefix}) { 133 | chop $num; 134 | unless (looks_like_number($num)) { 135 | croak("Error: unformat_pref input '$num' is not numeric before prefix '$prefix'"); 136 | } 137 | $num = $num * (1000**$exponent{$prefix}); 138 | } 139 | else { 140 | unless (looks_like_number($num)) { 141 | croak("Error: unformat_pref input '$num' is not numeric"); 142 | } 143 | } 144 | 145 | return $num; 146 | } 147 | 148 | 149 | =head1 NAME 150 | 151 | Number::FormatEng - Format a number using engineering notation 152 | 153 | =head1 VERSION 154 | 155 | This document refers to Number::FormatEng version 0.01. 156 | 157 | =head1 SYNOPSIS 158 | 159 | use Number::FormatEng qw(:all); 160 | print format_eng(1234); # prints 1.234e3 161 | print format_pref(-0.035); # prints -35m 162 | unformat_pref('1.23T'); # returns 1.23e+12 163 | 164 | =head1 DESCRIPTION 165 | 166 | Format a number for printing using engineering notation. 167 | Engineering notation is similar to scientific notation except that 168 | the power of ten must be a multiple of three. 169 | Alternately, the number can be formatted using an International 170 | System of Units (SI) prefix representing a factor of a thousand. 171 | 172 | =head1 SUBROUTINES 173 | 174 | =over 4 175 | 176 | =item format_eng($number) 177 | 178 | Format a numeric value using engineering notation. This function 179 | returns a string whose exponent is a multiple of 3. Here are some examples: 180 | 181 | format_eng(1234); # returns 1.234e3 182 | format_eng(-0.03); # returns -30e-3 183 | format_eng(7.8e7); # returns 78e6 184 | 185 | In most cases, the precision is preserved. However, rounding will occur 186 | if the number of digits is too large (system-dependent). Keep this in 187 | mind if C<$number> is a numeric expression. 
For example, the following 188 | may return a different number of digits from system to system: 189 | 190 | format_eng(1/3); 191 | 192 | =item format_pref($number) 193 | 194 | Format a numeric value using engineering notation. This function 195 | returns a string using one of the following SI prefixes (representing a 196 | power of a thousand): 197 | 198 | m u n p f a z y 199 | k M G T P E Z Y 200 | 201 | Notice that lower-case C<u> is used instead of the Greek letter Mu. 202 | 203 | If the number is beyond the prefix ranges (y and Y), then C<format_pref> 204 | returns the same formatted string as C<format_eng>. In other words, it 205 | does not use an SI prefix. 206 | 207 | Here are some examples: 208 | 209 | format_pref(1234); # returns 1.234k 210 | format_pref(-0.0004); # returns -400u 211 | format_pref(1.27e13); # returns 12.7T 212 | format_pref(7.5e60); # returns 7.5e60 213 | 214 | =item unformat_pref($string) 215 | 216 | Convert a string formatted using C<format_pref> into a numeric value. 217 | Here are some examples: 218 | 219 | unformat_pref('1.23T'); # returns 1.23e+12 220 | unformat_pref('-400u'); # returns -4e-4 221 | unformat_pref(37.5); # returns 37.5 222 | 223 | =item use_e_zero() and no_e_zero() 224 | 225 | By default, if the exponent is zero, C<e0> is not displayed by 226 | C<format_eng>. To explicitly display C<e0>, use the C<use_e_zero> method. 227 | Use the C<no_e_zero> method to return to the default behavior. 228 | 229 | format_eng(55); # returns 55 230 | Number::FormatEng::use_e_zero(); 231 | format_eng(55); # now returns 55e0 232 | Number::FormatEng::no_e_zero(); 233 | format_eng(55); # back to 55 234 | 235 | =back 236 | 237 | =head1 EXPORT 238 | 239 | Nothing is exported by default. Functions may be exported individually, or 240 | all functions may be exported at once, using the special tag C<:all>. 241 | 242 | =head1 DIAGNOSTICS 243 | 244 | Error conditions cause the program to die using C<croak> from the 245 | C<Carp> Core module. 246 | 247 | =head1 BUGS AND LIMITATIONS 248 | 249 | There are no known bugs in this module.
250 | 251 | =head1 SEE ALSO 252 | 253 | Refer to the following website: 254 | 255 | L 256 | 257 | =head1 AUTHOR 258 | 259 | Gene Sullivan (gsullivan@cpan.org) 260 | 261 | =head1 ACKNOWLEDGEMENTS 262 | 263 | Influenced by the following PerlMonks: BrowserUk, GrandFather and repellent. 264 | 265 | =head1 COPYRIGHT AND LICENSE 266 | 267 | Copyright (c) 2009 Gene Sullivan. All rights reserved. 268 | 269 | This module is free software; you can redistribute it and/or modify 270 | it under the same terms as Perl itself. See L. 271 | 272 | =cut 273 | 274 | 1; 275 | 276 | -------------------------------------------------------------------------------- /geneSCF-master-v1.0/class/lib/Text/NSP/Measures/2D/MI.pm: -------------------------------------------------------------------------------- 1 | =head1 NAME 2 | 3 | Text::NSP::Measures::2D::MI - Perl module that provides error checks 4 | for Loglikelihood, Total Mutual 5 | Information, Pointwise Mutual Information 6 | and Poisson-Stirling Measure. 7 | 8 | =head1 SYNOPSIS 9 | 10 | =head3 Basic Usage 11 | 12 | use Text::NSP::Measures::2D::MI::ll; 13 | 14 | my $npp = 60; my $n1p = 20; my $np1 = 20; my $n11 = 10; 15 | 16 | $ll_value = calculateStatistic( n11=>$n11, 17 | n1p=>$n1p, 18 | np1=>$np1, 19 | npp=>$npp); 20 | 21 | if( ($errorCode = getErrorCode())) 22 | { 23 | print STDERR $errorCode." - ".getErrorMessage()."\n"; 24 | } 25 | else 26 | { 27 | print getStatisticName."value for bigram is ".$ll_value."\n"; 28 | } 29 | 30 | =head1 DESCRIPTION 31 | 32 | This module is the base class for the Loglikelihood, Total Mutual 33 | Information and the Pointwise Mutual Information measures. All these 34 | measures are similar. This module provides error checks specific for 35 | these measures; it also implements the computations that are common 36 | to these measures.
37 | 38 | =over 39 | 40 | =item Log-Likelihood measure is computed as 41 | 42 | Log-Likelihood = 2 * [n11 * log(n11/m11) + n12 * log(n12/m12) + 43 | n21 * log(n21/m21) + n22 * log(n22/m22)] 44 | 45 | =item Total Mutual Information 46 | 47 | TMI = (1/npp)*[n11 * log(n11/m11)/log 2 + n12 * log(n12/m12)/log 2 + 48 | n21 * log(n21/m21)/log 2 + n22 * log(n22/m22)/log 2] 49 | 50 | =item Pointwise Mutual Information 51 | 52 | PMI = log (n11/m11)/log 2 53 | 54 | =item Poisson Stirling Measures 55 | 56 | PS = n11*(log (n11/m11)-1) 57 | 58 | =back 59 | 60 | All these methods use the ratio of the observed values to expected values, 61 | for computations, and thus have common error checks, so they have been grouped 62 | together. 63 | 64 | =head2 Methods 65 | 66 | =over 67 | 68 | =cut 69 | 70 | 71 | package Text::NSP::Measures::2D::MI; 72 | 73 | 74 | use Text::NSP::Measures::2D; 75 | use strict; 76 | use Carp; 77 | use warnings; 78 | # use subs(calculateStatistic); 79 | require Exporter; 80 | 81 | our ($VERSION, @EXPORT, @ISA); 82 | 83 | @ISA = qw(Exporter); 84 | 85 | @EXPORT = qw(initializeStatistic calculateStatistic 86 | getErrorCode getErrorMessage getStatisticName 87 | $errorCodeNumber $errorMessage 88 | $n11 $n12 $n21 $n22 $m11 $m12 $m21 $m22 89 | $npp $np1 $np2 $n2p $n1p); 90 | 91 | $VERSION = '1.03'; 92 | 93 | 94 | =item getValues() - This method calls the computeMarginalTotals(), 95 | computeObservedValues() and the computeExpectedValues() methods to 96 | compute the observed and expected values. It checks these values for 97 | any errors that might cause the Loglikelihood, TMI & PMI measures to 98 | fail. 99 | 100 | 101 | INPUT PARAMS : $count_values .. Reference of an hash containing 102 | the count values computed by the 103 | count.pl program. 104 | 105 | 106 | RETURN VALUES : 1/undef ..returns '1' to indicate success 107 | and an undefined(NULL) value to indicate 108 | failure. 
109 | 110 | =cut 111 | 112 | sub getValues 113 | { 114 | my ($values)=@_; 115 | 116 | if(!(Text::NSP::Measures::2D::computeMarginalTotals($values)) ){ 117 | return; 118 | } 119 | 120 | if( !(Text::NSP::Measures::2D::computeObservedValues($values)) ) { 121 | return; 122 | } 123 | 124 | if( !(Text::NSP::Measures::2D::computeExpectedValues($values)) ) { 125 | return; 126 | } 127 | 128 | # dont want ($nxy / $mxy) to be 0 or less! flag error if so and return; 129 | if ( $n11 ) 130 | { 131 | if ($m11 == 0) 132 | { 133 | $errorMessage = "Expected value in cell (1,1) must not be zero"; 134 | $errorCodeNumber = 211; 135 | return; 136 | } 137 | } 138 | if ( $n12 ) 139 | { 140 | if ($m12 == 0) 141 | { 142 | $errorMessage = "Expected value in cell (1,2) must not be zero"; 143 | $errorCodeNumber = 211; 144 | return; 145 | } 146 | } 147 | if ( $n21 ) 148 | { 149 | if ($m21 == 0) 150 | { 151 | $errorMessage = "Expected value in cell (2,1) must not be zero"; 152 | $errorCodeNumber = 211; 153 | return; 154 | } 155 | } 156 | if ( $n22 ) 157 | { 158 | if ($m22 == 0) 159 | { 160 | $errorMessage = "Expected value in cell (2,2) must not be zero"; 161 | $errorCodeNumber = 211; 162 | return; 163 | } 164 | } 165 | if ($m11 < 0) 166 | { 167 | $errorMessage = "Expected value for cell (1,1) should not be negative"; 168 | $errorCodeNumber = 212; 169 | return; 170 | } 171 | if ($m12 < 0) 172 | { 173 | $errorMessage = "Expected value for cell (1,2) should not be negative"; 174 | $errorCodeNumber = 212; 175 | return; 176 | } 177 | if ($m21 < 0) 178 | { 179 | $errorMessage = "Expected value for cell (2,1) should not be negative"; 180 | $errorCodeNumber = 212; 181 | return; 182 | } 183 | if ($m22 < 0) 184 | { 185 | $errorMessage = "Expected value for cell (2,2) should not be negative"; 186 | $errorCodeNumber = 212; 187 | return; 188 | } 189 | 190 | # Everything looks good so we can return 1 191 | return 1; 192 | } 193 | 194 | 195 | 196 | 197 | =item computePMI() - Computes the pmi of a given 
observed and expected 198 | value pair. 199 | 200 | INPUT PARAMS : $n ..Observed value 201 | $m ..Expected value 202 | 203 | RETURN VALUES : log(n/m) ..the log of the ratio of 204 | observed value to expected 205 | value. 206 | 207 | =cut 208 | 209 | sub computePMI 210 | { 211 | my $n = shift; 212 | my $m = shift; 213 | if($n) 214 | { 215 | my $val = $n/$m; 216 | return log($val); 217 | } 218 | else 219 | { 220 | return 0; 221 | } 222 | } 223 | 224 | 225 | 226 | 1; 227 | __END__ 228 | 229 | 230 | =back 231 | 232 | =head1 AUTHOR 233 | 234 | Ted Pedersen, University of Minnesota Duluth 235 | Etpederse@d.umn.eduE 236 | 237 | Satanjeev Banerjee, Carnegie Mellon University 238 | Esatanjeev@cmu.eduE 239 | 240 | Amruta Purandare, University of Pittsburgh 241 | Eamruta@cs.pitt.eduE 242 | 243 | Bridget Thomson-McInnes, University of Minnesota Twin Cities 244 | Ebthompson@d.umn.eduE 245 | 246 | Saiyam Kohli, University of Minnesota Duluth 247 | Ekohli003@d.umn.eduE 248 | 249 | =head1 HISTORY 250 | 251 | Last updated: $Id: MI.pm,v 1.27 2008/03/26 17:18:26 tpederse Exp $ 252 | 253 | =head1 BUGS 254 | 255 | 256 | =head1 SEE ALSO 257 | 258 | L 259 | 260 | L 261 | 262 | 263 | =head1 COPYRIGHT 264 | 265 | Copyright (C) 2000-2006, Ted Pedersen, Satanjeev Banerjee, Amruta 266 | Purandare, Bridget Thomson-McInnes and Saiyam Kohli 267 | 268 | This program is free software; you can redistribute it and/or modify it 269 | under the terms of the GNU General Public License as published by the Free 270 | Software Foundation; either version 2 of the License, or (at your option) 271 | any later version. 272 | 273 | This program is distributed in the hope that it will be useful, but 274 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 275 | or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 276 | for more details. 
277 | 278 | You should have received a copy of the GNU General Public License along 279 | with this program; if not, write to 280 | 281 | The Free Software Foundation, Inc., 282 | 59 Temple Place - Suite 330, 283 | Boston, MA 02111-1307, USA. 284 | 285 | Note: a copy of the GNU General Public License is available on the web 286 | at L and is included in this 287 | distribution as GPL.txt. 288 | 289 | =cut -------------------------------------------------------------------------------- /geneSCF-master-v1.0/class/lib/Text/NSP/Measures/2D/Fisher/right.pm: -------------------------------------------------------------------------------- 1 | =head1 NAME 2 | 3 | Text::NSP::Measures::2D::Fisher::right - Perl module implementation of the right sided 4 | Fisher's exact test. 5 | 6 | =head1 SYNOPSIS 7 | 8 | =head3 Basic Usage 9 | 10 | use Text::NSP::Measures::2D::Fisher::right; 11 | 12 | my $npp = 60; my $n1p = 20; my $np1 = 20; my $n11 = 10; 13 | 14 | $right_value = calculateStatistic( n11=>$n11, 15 | n1p=>$n1p, 16 | np1=>$np1, 17 | npp=>$npp); 18 | 19 | if( ($errorCode = getErrorCode())) 20 | { 21 | print STDERR $errorCode." - ".getErrorMessage(); 22 | } 23 | else 24 | { 25 | print getStatisticName."value for bigram is ".$right_value; 26 | } 27 | 28 | 29 | =head1 DESCRIPTION 30 | 31 | Assume that the frequency count data associated with a bigram 32 | is stored in a 2x2 contingency table: 33 | 34 | word2 ~word2 35 | word1 n11 n12 | n1p 36 | ~word1 n21 n22 | n2p 37 | -------------- 38 | np1 np2 npp 39 | 40 | where n11 is the number of times word1 and word2 occur together, and 41 | n12 is the number of times word1 occurs with some word other than 42 | word2, and n1p is the number of times in total that word1 occurs as 43 | the first word in a bigram.
44 | 45 | The fishers exact tests are calculated by fixing the marginal totals 46 | and computing the hypergeometric probabilities for all the possible 47 | contingency tables, 48 | 49 | A right sided test is calculated by adding the probabilities of all 50 | the possible two by two contingency tables formed by fixing the 51 | marginal totals and changing the value of n11 to greater than or 52 | equal to the given value. A right sided Fisher's Exact Test tells us 53 | how likely it is to randomly sample a table where n11 is greater 54 | than observed. In other words, it tells us how likely it is to sample 55 | an observation where the two words are more dependent than currently 56 | observed. 57 | 58 | =head2 Methods 59 | 60 | =over 61 | 62 | =cut 63 | 64 | package Text::NSP::Measures::2D::Fisher::right; 65 | 66 | 67 | use Text::NSP::Measures::2D::Fisher; 68 | use strict; 69 | use Carp; 70 | use warnings; 71 | no warnings 'redefine'; 72 | require Exporter; 73 | 74 | our ($VERSION, @EXPORT, @ISA); 75 | 76 | @ISA = qw(Exporter); 77 | 78 | @EXPORT = qw(initializeStatistic calculateStatistic 79 | getErrorCode getErrorMessage getStatisticName); 80 | 81 | $VERSION = '0.97'; 82 | 83 | 84 | =item calculateStatistic() - This method calculates the right Fisher value 85 | 86 | INPUT PARAMS : $count_values .. Reference of an hash containing 87 | the count values computed by the 88 | count.pl program. 89 | 90 | RETURN VALUES : $right .. Right Fisher value. 91 | 92 | =cut 93 | 94 | sub calculateStatistic 95 | { 96 | my %values = @_; 97 | 98 | my $probabilities; 99 | my $left_flag = 0; 100 | 101 | # computes and returns the observed and marginal values from 102 | # the frequency combination values. returns 0 if there is an 103 | # error in the computation or the values are inconsistent. 104 | if( !(Text::NSP::Measures::2D::Fisher::getValues(\%values)) ) 105 | { 106 | return; 107 | } 108 | 109 | my $final_limit = ($n1p < $np1) ? 
$n1p : $np1; 110 | my $n11_org = $n11; 111 | 112 | my $n11_start = $n1p + $np1 - $npp; 113 | if($n11_start < $n11) 114 | { 115 | $n11_start = $n11; 116 | } 117 | 118 | 119 | # to make the computations faster, we check which would require less computations 120 | # computing the leftfisher value and subtracting it from 1 or directly computing 121 | # the right fisher value. We do this since, generally for bigrams n11 is quite small 122 | # so its much faster to compute the left Fisher value. 123 | my $left_final_limit = $n11-1; 124 | my $left_n11 = $n1p + $np1 - $npp; 125 | if($left_n11<0) 126 | { 127 | $left_n11 = 0; 128 | } 129 | 130 | # if computing the left fisher values first will take lesser amount of time them 131 | # we set a flag for later reference and then compute the leftfisher score for 132 | # n11-1 and then subtract the total score from one to get the right fisher value. 133 | if(($left_final_limit - $left_n11) < ($final_limit - $n11_start)) 134 | { 135 | $left_flag = 1; 136 | if( !($probabilities = Text::NSP::Measures::2D::Fisher::computeDistribution($left_n11, $left_final_limit))) 137 | { 138 | return; 139 | } 140 | } 141 | 142 | #else we compute the value normally and simply sum to get the rightfisher value. 143 | else 144 | { 145 | if( !($probabilities = Text::NSP::Measures::2D::Fisher::computeDistribution($n11_start, $final_limit))) 146 | { 147 | return; 148 | } 149 | } 150 | 151 | my $key_n11; 152 | 153 | my $rightfisher=0; 154 | 155 | foreach $key_n11 (sort { $b <=> $a } keys %$probabilities) 156 | { 157 | if($left_flag) 158 | { 159 | if($key_n11 >= $n11_org) 160 | { 161 | last; 162 | } 163 | } 164 | else 165 | { 166 | if($key_n11 < $n11_org) 167 | { 168 | last; 169 | } 170 | } 171 | $rightfisher += exp($probabilities->{$key_n11}); 172 | } 173 | 174 | # if we computed the leftfisher value to get the right fisher value, we subtract 175 | # the sum of the probabilities for the tables from one to get the right fisher score. 
176 | if($left_flag) 177 | { 178 | if ($rightfisher > 1) 179 | { 180 | $rightfisher = 0; 181 | } 182 | else 183 | { 184 | $rightfisher = 1 - $rightfisher; 185 | } 186 | } 187 | 188 | return $rightfisher; 189 | } 190 | 191 | 192 | =item getStatisticName() - Returns the name of this statistic 193 | 194 | INPUT PARAMS : none 195 | 196 | RETURN VALUES : $name .. Name of the measure. 197 | 198 | =cut 199 | 200 | sub getStatisticName 201 | { 202 | return "Right Fisher"; 203 | } 204 | 205 | 206 | 207 | 1; 208 | __END__ 209 | 210 | =back 211 | 212 | =head1 AUTHOR 213 | 214 | Ted Pedersen, University of Minnesota Duluth 215 | Etpederse@d.umn.eduE 216 | 217 | Satanjeev Banerjee, Carnegie Mellon University 218 | Esatanjeev@cmu.eduE 219 | 220 | Amruta Purandare, University of Pittsburgh 221 | Eamruta@cs.pitt.eduE 222 | 223 | Bridget Thomson-McInnes, University of Minnesota Twin Cities 224 | Ebthompson@d.umn.eduE 225 | 226 | Saiyam Kohli, University of Minnesota Duluth 227 | Ekohli003@d.umn.eduE 228 | 229 | =head1 HISTORY 230 | 231 | Last updated: $Id: right.pm,v 1.12 2006/06/21 11:10:52 saiyam_kohli Exp $ 232 | 233 | =head1 BUGS 234 | 235 | 236 | =head1 SEE ALSO 237 | 238 | @inproceedings{Pedersen96, 239 | author = {Pedersen, T.}, 240 | title = {Fishing For Exactness}, 241 | booktitle = {Proceedings of the South Central SAS User's 242 | Group (SCSUG-96) Conference}, 243 | year = {1996}, 244 | pages = {188--200}, 245 | month ={October}, 246 | address = {Austin, TX} 247 | url = L} 248 | 249 | L 250 | 251 | L 252 | 253 | 254 | =head1 COPYRIGHT 255 | 256 | Copyright (C) 2000-2006, Ted Pedersen, Satanjeev Banerjee, Amruta 257 | Purandare, Bridget Thomson-McInnes and Saiyam Kohli 258 | 259 | This program is free software; you can redistribute it and/or modify it 260 | under the terms of the GNU General Public License as published by the Free 261 | Software Foundation; either version 2 of the License, or (at your option) 262 | any later version. 
263 | 264 | This program is distributed in the hope that it will be useful, but 265 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 266 | or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 267 | for more details. 268 | 269 | You should have received a copy of the GNU General Public License along 270 | with this program; if not, write to 271 | 272 | The Free Software Foundation, Inc., 273 | 59 Temple Place - Suite 330, 274 | Boston, MA 02111-1307, USA. 275 | 276 | Note: a copy of the GNU General Public License is available on the web 277 | at L and is included in this 278 | distribution as GPL.txt. 279 | 280 | =cut -------------------------------------------------------------------------------- /geneSCF-master-v1.0/class/lib/Text/NSP/Measures/2D/Fisher2/right.pm: -------------------------------------------------------------------------------- 1 | =head1 NAME 2 | 3 | Text::NSP::Measures::2D::Fisher2::right - Perl module implementation of the right sided 4 | Fisher's exact test (Deprecated). 5 | 6 | =head1 SYNOPSIS 7 | 8 | =head3 Basic Usage 9 | 10 | use Text::NSP::Measures::2D::Fisher2::right; 11 | 12 | my $npp = 60; my $n1p = 20; my $np1 = 20; my $n11 = 10; 13 | 14 | $right_value = calculateStatistic( n11=>$n11, 15 | n1p=>$n1p, 16 | np1=>$np1, 17 | npp=>$npp); 18 | 19 | if( ($errorCode = getErrorCode())) 20 | { 21 | print STDERR $errorCode." - ".getErrorMessage(); 22 | } 23 | else 24 | { 25 | print getStatisticName."value for bigram is ".$right_value; 26 | } 27 | 28 | 29 | =head1 DESCRIPTION 30 | 31 | This module provides a naive implementation of the fishers right 32 | sided exact tests. That is the implementation does not have any 33 | optimizations for performance. This will compute the factorials and 34 | the hypergeometric measures using direct multiplications. 
35 | 36 | This measure should be used if you need exact values without any 37 | rounding errors, and you are not worried about the performance of 38 | the measure; otherwise use the implementations under the 39 | Text::NSP::Measures::2D::Fisher module. To use this implementation, 40 | you will have to specify the entire module name. Usage: 41 | 42 | statistic.pl Text::NSP::Measures::2D::Fisher2::right dest.txt source.cnt 43 | 44 | Assume that the frequency count data associated with a bigram 45 | is stored in a 2x2 contingency table: 46 | 47 | word2 ~word2 48 | word1 n11 n12 | n1p 49 | ~word1 n21 n22 | n2p 50 | -------------- 51 | np1 np2 npp 52 | 53 | where n11 is the number of times word1 and word2 occur together, and 54 | n12 is the number of times word1 occurs with some word other than 55 | word2, and n1p is the number of times in total that word1 occurs as 56 | the first word in a bigram. 57 | 58 | The fishers exact tests are calculated by fixing the marginal totals 59 | and computing the hypergeometric probabilities for all the possible 60 | contingency tables. 61 | 62 | A right sided test is calculated by adding the probabilities of all 63 | the possible two by two contingency tables formed by fixing the 64 | marginal totals and changing the value of n11 to greater than or 65 | equal to the given value. A right sided Fisher's Exact Test tells us 66 | how likely it is to randomly sample a table where n11 is greater 67 | than or equal to the observed value. In other words, it tells us how likely it is to sample 68 | an observation where the two words are at least as dependent as currently 69 | observed.

=head2 Methods

=over

=cut

package Text::NSP::Measures::2D::Fisher2::right;


use Text::NSP::Measures::2D::Fisher2;
use strict;
use Carp;
use warnings;
no warnings 'redefine';
require Exporter;

our ($VERSION, @EXPORT, @ISA);

@ISA  = qw(Exporter);

@EXPORT = qw(initializeStatistic calculateStatistic
             getErrorCode getErrorMessage getStatisticName);

$VERSION = '0.97';


=item calculateStatistic() - This method computes the right sided Fishers
exact test.

INPUT PARAMS  : %values             .. Count values passed as a flat
                                       key => value list. They are copied
                                       into a hash and handed by reference
                                       to Fisher2::getValues().

RETURN VALUES : $right              .. Right Fisher value. A bare return
                                       (undef / empty list) is issued when
                                       getValues() or computeDistribution()
                                       returns a false value.

=cut

sub calculateStatistic
{
  my %values = @_;

  # getValues() is expected to validate the counts and to populate the
  # observed and marginal totals ($n11, $n1p, $np1, $npp) read below.
  # A false result means the counts could not be used.
  return unless Text::NSP::Measures::2D::Fisher2::getValues(\%values);

  # Remember the observed count before any distribution is computed.
  my $observed = $n11;

  # n1p + np1 - npp is the lower end of the n11 range shared by both tails.
  my $lowest_n11 = $n1p + $np1 - $npp;

  # Right tail: starts at the observed n11 (or at the lower end of the
  # range when that is larger) and runs up to min(n1p, np1).
  my $right_from = ($lowest_n11 < $observed) ? $observed : $lowest_n11;
  my $right_to   = ($n1p < $np1) ? $n1p : $np1;

  # Left tail: starts at the lower end of the range (never below 0) and
  # stops just short of the observed n11.
  my $left_from = ($lowest_n11 < 0) ? 0 : $lowest_n11;
  my $left_to   = $observed - 1;

  # To save work, sum whichever tail spans fewer tables.  When the left
  # tail is the shorter one, the right sided value is obtained as
  # 1 - (left sided value for n11 - 1).
  my $via_left = (($left_to - $left_from) < ($right_to - $right_from)) ? 1 : 0;

  my $distribution = $via_left
    ? Text::NSP::Measures::2D::Fisher2::computeDistribution($left_from, $left_to)
    : Text::NSP::Measures::2D::Fisher2::computeDistribution($right_from, $right_to);
  return unless $distribution;

  # Walk the tables from the largest n11 downwards and stop as soon as a
  # table falls outside the tail that is being summed.
  my $tail = 0;
  foreach my $table_n11 (sort { $b <=> $a } keys %$distribution)
  {
    last if $via_left ? ($table_n11 >= $observed) : ($table_n11 < $observed);
    $tail += $distribution->{$table_n11};
  }

  return ($via_left ? 1 - $tail : $tail);
}


=item getStatisticName() - Returns the name of this statistic

INPUT PARAMS  : none

RETURN VALUES : $name      .. Name of the measure.

=cut

sub getStatisticName
{
  # Fixed, human-readable label for this measure.
  my $statistic_name = "Right Fisher";
  return $statistic_name;
}



1;
__END__

=back

=head1 AUTHOR

Ted Pedersen,                University of Minnesota Duluth
                             E<lt>tpederse@d.umn.eduE<gt>

Satanjeev Banerjee,          Carnegie Mellon University
                             E<lt>satanjeev@cmu.eduE<gt>

Amruta Purandare,            University of Pittsburgh
                             E<lt>amruta@cs.pitt.eduE<gt>

Bridget Thomson-McInnes,     University of Minnesota Twin Cities
                             E<lt>bthompson@d.umn.eduE<gt>

Saiyam Kohli,                University of Minnesota Duluth
                             E<lt>kohli003@d.umn.eduE<gt>

=head1 HISTORY

Last updated: $Id: right.pm,v 1.10 2008/03/26 17:24:15 tpederse Exp $

=head1 BUGS


=head1 SEE ALSO

L<http://groups.yahoo.com/group/ngram/>

L<http://www.d.umn.edu/~tpederse/nsp.html>


=head1 COPYRIGHT

Copyright (C) 2000-2006, Ted Pedersen, Satanjeev Banerjee, Amruta
Purandare, Bridget Thomson-McInnes and Saiyam Kohli

This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your option)
any later version.

This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License along
with this program; if not, write to

    The Free Software Foundation, Inc.,
    59 Temple Place - Suite 330,
    Boston, MA  02111-1307, USA.

Note: a copy of the GNU General Public License is available on the web
at L<http://www.gnu.org/licenses/gpl.txt> and is included in this
distribution as GPL.txt.
275 | 276 | =cut 277 | -------------------------------------------------------------------------------- /geneSCF-master-v1.0/class/lib/Text/NSP/Measures/4D/MI/ll.pm: -------------------------------------------------------------------------------- 1 | =head1 NAME 2 | 3 | Text::NSP::Measures::4D::MI::ll - Perl module that implements Loglikelihood 4 | measure of association for 4-grams. 5 | 6 | =head1 SYNOPSIS 7 | 8 | =head3 Basic Usage 9 | use Text::NSP::Measures::4D::MI::ll; 10 | 11 | $ll_value = calculateStatistic( 12 | n1111=>8, 13 | n1ppp=>306, 14 | np1pp=>83, 15 | npp1p=>83, 16 | nppp1=>57, 17 | n11pp=>8, 18 | n1p1p=>8, 19 | n1pp1=>8, 20 | np11p=>83, 21 | np1p1=>56, 22 | npp11=>56, 23 | n111p=>8, 24 | n11p1=>8, 25 | n1p11=>8, 26 | np111=>56, 27 | npppp=>15180); 28 | 29 | if( ($errorCode = getErrorCode())) 30 | { 31 | print STDERR $errorCode." - ".getErrorMessage()."\n"; 32 | } 33 | else 34 | { 35 | print getStatisticName()." value for 4-gram is ".$ll_value."\n"; 36 | } 37 | 38 | =head1 DESCRIPTION 39 | 40 | The log-likelihood ratio measures the deviation between the observed data 41 | and what would be expected if <word1>, <word2>, <word3> and <word4> were 42 | independent. The higher the score, the less evidence there is in favor of 43 | concluding that the words are independent. 44 | 45 | The expected values for the internal cells are calculated by taking the 46 | product of their associated marginals and dividing by the cube of the sample size, 47 | for example: 48 | 49 | n1ppp * np1pp * npp1p * nppp1 50 | m1111 = ------------------------------- 51 | npppp ^ 3 52 | 53 | Then the deviation between observed and expected values for each internal 54 | cell is computed to arrive at the log-likelihood value.
55 | 56 | Log-Likelihood = 2 * [n1111 * log ( n1111 / m1111 ) + n1112 * log ( n1112 / m1112 ) + 57 | n1121 * log ( n1121 / m1121 ) + n1122 * log ( n1122 / m1122 ) + 58 | n1211 * log ( n1211 / m1211 ) + n1212 * log ( n1212 / m1212 ) + 59 | n1221 * log ( n1221 / m1221 ) + n1222 * log ( n1222 / m1222 ) + 60 | n2111 * log ( n2111 / m2111 ) + n2112 * log ( n2112 / m2112 ) + 61 | n2121 * log ( n2121 / m2121 ) + n2122 * log ( n2122 / m2122 ) + 62 | n2211 * log ( n2211 / m2211 ) + n2212 * log ( n2212 / m2212 ) + 63 | n2221 * log ( n2221 / m2221 ) + n2222 * log ( n2222 / m2222 )]; 64 | 65 | =head2 Methods 66 | 67 | =over 68 | 69 | =cut 70 | 71 | 72 | package Text::NSP::Measures::4D::MI::ll; 73 | 74 | 75 | use Text::NSP::Measures::4D::MI; 76 | use strict; 77 | use Carp; 78 | use warnings; 79 | no warnings 'redefine'; 80 | require Exporter; 81 | 82 | our ($VERSION, @EXPORT, @ISA); 83 | 84 | @ISA = qw(Exporter); 85 | 86 | @EXPORT = qw(initializeStatistic calculateStatistic 87 | getErrorCode getErrorMessage getStatisticName); 88 | 89 | $VERSION = '0.97'; 90 | 91 | =item calculateStatistic($count_values) - This method calculates 92 | the ll value 93 | 94 | INPUT PARAMS : $count_values .. Reference of an hash containing 95 | the count values computed by the 96 | count.pl program. 97 | 98 | RETURN VALUES : $loglikelihood .. Loglikelihood value for this 4-gram. 99 | 100 | =cut 101 | 102 | sub calculateStatistic 103 | { 104 | my %values = @_; 105 | 106 | # computes and sets the observed and expected values from 107 | # the frequency combination values. returns 0 if there is an 108 | # error in the computation or the values are inconsistent. 109 | if( !(Text::NSP::Measures::4D::MI::getValues(\%values)) ) { 110 | return; 111 | } 112 | 113 | # Now for the actual calculation of Loglikelihood! 114 | my $logLikelihood = 0; 115 | 116 | 117 | # dont want ($nxy / $mxy) to be 0 or less! flag error if so! 
118 | $logLikelihood += $n1111 * Text::NSP::Measures::4D::MI::computePMI ( $n1111, $m1111 ); 119 | $logLikelihood += $n1112 * Text::NSP::Measures::4D::MI::computePMI ( $n1112, $m1112 ); 120 | $logLikelihood += $n1121 * Text::NSP::Measures::4D::MI::computePMI ( $n1121, $m1121 ); 121 | $logLikelihood += $n1122 * Text::NSP::Measures::4D::MI::computePMI ( $n1122, $m1122 ); 122 | $logLikelihood += $n1211 * Text::NSP::Measures::4D::MI::computePMI ( $n1211, $m1211 ); 123 | $logLikelihood += $n1212 * Text::NSP::Measures::4D::MI::computePMI ( $n1212, $m1212 ); 124 | $logLikelihood += $n1221 * Text::NSP::Measures::4D::MI::computePMI ( $n1221, $m1221 ); 125 | $logLikelihood += $n1222 * Text::NSP::Measures::4D::MI::computePMI ( $n1222, $m1222 ); 126 | $logLikelihood += $n2111 * Text::NSP::Measures::4D::MI::computePMI ( $n2111, $m2111 ); 127 | $logLikelihood += $n2112 * Text::NSP::Measures::4D::MI::computePMI ( $n2112, $m2112 ); 128 | $logLikelihood += $n2121 * Text::NSP::Measures::4D::MI::computePMI ( $n2121, $m2121 ); 129 | $logLikelihood += $n2122 * Text::NSP::Measures::4D::MI::computePMI ( $n2122, $m2122 ); 130 | $logLikelihood += $n2211 * Text::NSP::Measures::4D::MI::computePMI ( $n2211, $m2211 ); 131 | $logLikelihood += $n2212 * Text::NSP::Measures::4D::MI::computePMI ( $n2212, $m2212 ); 132 | $logLikelihood += $n2221 * Text::NSP::Measures::4D::MI::computePMI ( $n2221, $m2221 ); 133 | $logLikelihood += $n2222 * Text::NSP::Measures::4D::MI::computePMI ( $n2222, $m2222 ); 134 | return ( 2 * $logLikelihood ); 135 | } 136 | 137 | 138 | =item getStatisticName() - Returns the name of this statistic 139 | 140 | INPUT PARAMS : none 141 | 142 | RETURN VALUES : $name .. Name of the measure. 

=cut

sub getStatisticName
{
  # Fixed, human-readable label for this measure.
  my $statistic_name = "Loglikelihood";
  return $statistic_name;
}



1;
__END__


=back

=head1 AUTHOR

Ted Pedersen,                University of Minnesota Duluth
                             E<lt>tpederse@d.umn.eduE<gt>

Satanjeev Banerjee,          Carnegie Mellon University
                             E<lt>satanjeev@cmu.eduE<gt>

Amruta Purandare,            University of Pittsburgh
                             E<lt>amruta@cs.pitt.eduE<gt>

Bridget Thomson-McInnes,     University of Minnesota Twin Cities
                             E<lt>bthomson@cs.umn.eduE<gt>

Saiyam Kohli,                University of Minnesota Duluth
                             E<lt>kohli003@d.umn.eduE<gt>

=head1 HISTORY

Last updated: $Id: ll.pm,v 1.1 2008/11/22 18:53:13 btmcinnes Exp $

=head1 BUGS


=head1 SEE ALSO

  @article{Dunning93,
            author = {Dunning, T.},
            title = {Accurate Methods for the Statistics of
                     Surprise and Coincidence},
            journal = {Computational Linguistics},
            volume = {19},
            number = {1},
            year = {1993},
            pages = {61-74}
            url = L<http://www.comp.lancs.ac.uk/ucrel/papers/tedstats.pdf>}

  @inproceedings{moore:2004:EMNLP,
                author    = {Moore, Robert C.},
                title     = {On Log-Likelihood-Ratios and the Significance of Rare
                             Events },
                booktitle = {Proceedings of EMNLP 2004},
                editor = {Dekang Lin and Dekai Wu},
                year      = 2004,
                month     = {July},
                address   = {Barcelona, Spain},
                publisher = {Association for Computational Linguistics},
                pages     = {333--340}
                url = L<http://acl.ldc.upenn.edu/acl2004/emnlp/pdf/Moore.pdf>}

L<http://groups.yahoo.com/group/ngram/>

L<http://www.d.umn.edu/~tpederse/nsp.html>


=head1 COPYRIGHT

Copyright (C) 2000-2006, Ted Pedersen, Satanjeev Banerjee, Amruta
Purandare, Bridget Thomson-McInnes and Saiyam Kohli

This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your option)
any later version.
223 | 224 | This program is distributed in the hope that it will be useful, but 225 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 226 | or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 227 | for more details. 228 | 229 | You should have received a copy of the GNU General Public License along 230 | with this program; if not, write to 231 | 232 | The Free Software Foundation, Inc., 233 | 59 Temple Place - Suite 330, 234 | Boston, MA 02111-1307, USA. 235 | 236 | Note: a copy of the GNU General Public License is available on the web 237 | at L<http://www.gnu.org/licenses/gpl.txt> and is included in this 238 | distribution as GPL.txt. 239 | 240 | =cut 241 | --------------------------------------------------------------------------------