├── Abstract Acquisition Scripts ├── DBLP XML fetch abstracts .pl ├── ReadMe.txt ├── extractfromDBLP CAL.xq ├── extractfromDBLP EC-TEL.xq ├── extractfromDBLP ICALT.xq ├── extractfromDBLP ICHL.xq ├── extractfromDBLP ICWL.xq ├── wrap inproceedings CAL.sh ├── wrap inproceedings ICHL.sh └── wrap inproceedings.sh ├── Adjacent Words ├── AdjacentWords.R ├── TEL Blogs 20090101-20120912 Init.R ├── Union C 2005-2011 Init.R └── Union C 2005-2012 Init.R ├── Alerts ├── BlogScore Alert.R ├── BlogScore BrewChunk.html ├── BlogScore BrewFooter.html ├── BlogScore BrewHeader.html ├── RisingFallingTerms Alert.R └── RisingFallingTerms BrewTemplate.html ├── Common Functions ├── CustomStopwords.R └── sentimentFunctions.R ├── Compair ├── 2011 ICCE and ICALT │ └── Init_Compair.R ├── BrewTemplate.html ├── CETIS Conf 2012 │ └── Init_Compair.R ├── Compair.R └── ReadMe.txt ├── Frequent TermSet ├── Frequent TermSet.R └── ReadMe.txt ├── History Visualiser ├── CETIS Conf 2012 │ └── HV_Init.R ├── HV Brew Template.html ├── HistoryVis.R ├── ReadMe.txt ├── Union B │ └── HV_Init.R └── Union C │ └── HV_Init.R ├── PESTLE Scan ├── BrewChunk.html ├── BrewFooter.html ├── BrewHeader.html ├── InquirerPESTLE.csv └── PESTLE Scan.R ├── Pre-process.R ├── Proximate Word Scan └── ProximateWordScan.R ├── README.txt ├── Rising and Falling Terms ├── CB BrewTemplate.html ├── Conference BrewTemplate ODT content.xml ├── Conference BrewTemplate ODT files │ ├── Configurations2 │ │ └── accelerator │ │ │ └── current.xml │ ├── META-INF │ │ └── manifest.xml │ ├── layout-cache │ ├── manifest.rdf │ ├── meta.xml │ ├── mimetype │ ├── settings.xml │ └── styles.xml ├── RF_Brew_Core.R ├── RF_Terms.R ├── ReadMe.txt ├── TELBlogs │ ├── RF_CB_Brew.R │ └── RF_Init.R ├── Union A │ ├── Prepare ODT Report.sh │ ├── Prepare ODT Report.sh~ │ ├── RF_Conference_Brew.R │ └── RF_Init.R ├── Union B │ ├── RF_CB_Brew.R │ └── RF_Init.R ├── Union C │ ├── RF_CB_Brew.R │ └── RF_Init.R └── plotFunctions.R ├── SQLite Schema.sql ├── Stem Helper.R ├── WordookieDriver └── WordookieDriver.pde └── commonFunctions.R /Abstract Acquisition Scripts/DBLP XML fetch abstracts .pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | ## Use XML as extracted from DBLP XML dump as a source of the metadata and a URL 3 | ## from which to retrieve a HTML page containing (among other things) the abstract 4 | ## Extract the abstract and compose a CSV output along with the metadata from the XML 5 | ## ******** designed to work with IEEE ICALT, Elsevier/Sciencedirect CAL and Springer EC-TEL, ICWL, ICHL proceedings 6 | 7 | ##TOTAL FUCKERS - SPRINGER HAVE CHANGED IT SO THAT ABSTRACTS ARE CONCEALED, THIS NO LONGER WORKS 8 | 9 | ##NB: some older papers (especially) in DBLP do not have content 10 | 11 | use warnings; 12 | use XML::Simple; 13 | use LWP::Simple; 14 | use Data::Dumper; 15 | 16 | #preparatory stuff - the agent to fetch from the URL and prep an output file 17 | my $ua = LWP::UserAgent->new(); 18 | $ua->agent("Mozilla/4.0"); 19 | 20 | my $url_found =0; 21 | my $abs_got = 0; 22 | my $no_url = 0; 23 | my $booktitle=""; 24 | my $publisher=""; 25 | my $min_year = 2012; 26 | my $max_year = 2012; 27 | my $conf = "ICALT"; 28 | my $infile = "../Source Data/Raw and Part-Processed/".$conf."/".$conf." inproceedings ".$max_year.".xml"; 29 | my $outfile = "../Source Data/Abstracts/".$conf." 
Abstracts ".$min_year."-".$max_year.".csv"; 30 | print "Starting - acquire abstracts $min_year -> $max_year \n"; 31 | 32 | unlink $outfile; 33 | open(OUTFILE, ">".$outfile); 34 | #uncomment the following line UNLESS intending to append the outfile to an existing CSV 35 | print OUTFILE "year,pages,title,authors,abstract,keywords,url,dblp_url\n"; 36 | 37 | # Read in the XML and start looping over elements 38 | # forcearry ensures that "author" is always read in as an array even if only 1 author element 39 | # the "keyattr" voodoo prevents the @key attribute being interpreted as an indexing-in term 40 | my $xml = XMLin($infile, forcearray => [ 'author' ], keyattr => []); 41 | #print Dumper ($xml); #for inspecting/testing contents 42 | 43 | #this gets the first item only (test) @{$xml->{inproceedings}}[0] 44 | 45 | foreach my $item (@{$xml->{inproceedings}}){ 46 | # get the metadata 47 | $year = $item->{year}; 48 | # skip 2001, 2003, 2004 since experience shows there is either no "ee" URL or the resolved page does not contain an abstract 49 | if($year < $min_year || $year > $max_year){ 50 | print "skipping $year\n"; 51 | next; 52 | } 53 | if($item->{pages}){ 54 | $pages = $item->{pages}; 55 | }else{ 56 | $pages=""; 57 | } 58 | $title = $item->{title}; 59 | $authors = join(', ', @{$item->{author}}); 60 | $booktitle = $item->{booktitle}; 61 | for ($booktitle) { 62 | if (/ICALT/) {$publisher = "IEEE";} 63 | elsif (/EC-TEL/) {$publisher = "Springer"; } 64 | elsif (/ICWL/) {$publisher = "Springer";} 65 | elsif (/ICHL/) {$publisher = "Springer";} 66 | elsif (/Computers & Education/) {$publisher = "Sciencedirect";} 67 | else {$publisher = "unknown";} 68 | } 69 | 70 | #Does this script know how to scrape? (is the publisher known) 71 | if($publisher=~m/"unknown"/){ 72 | print "Cannot process; publisher not known for title \"$booktitle\"\n"; 73 | }else{ 74 | #print "Publisher: $publisher\n"; 75 | # the element contains a DOI URL we can use to get the abstract e.g. http://dx.doi.org/10.1109/ICALT.2010.13 76 | if($item->{ee}){ 77 | $url_found++; 78 | my $url = $item->{ee};#this is [usually] a DOI URL to publisher page giving abstract 79 | my $dblp_url = $item->{url};#this is to a page on DBLP (which is used in the AERCS database) 80 | print "Fetching $url\n"; 81 | #the DOI request is redirected, this will happen automatically... 82 | my $request = HTTP::Request->new(GET => $url); 83 | my $response = $ua->request($request); 84 | my $abstract = ""; 85 | my $donext=0; 86 | #guard against no response 87 | if ($response->is_success) { 88 | # A DIFFERENT SCRAPER FOR EACH PUBLISHER (which is indicated by the element 89 | for ($publisher) { 90 | if (/Springer/){ 91 | #Springer hides their abstract - only shown using ajax on a "show..." click. 92 | #However, we can frig the URL to fetch what ajax does 93 | my $newurl = $response->request()->url(); 94 | $newurl =~ s/\?MUD=MP/primary/; 95 | my $newrequest = HTTP::Request->new(GET => $newurl); 96 | my $response = $ua->request($newrequest); 97 | my @html = split /\n/, $response->content(); 98 | if ($response->is_success) { 99 | foreach my $line (@html){ #reads line by line from response content 100 | #print "L:". $line ."\n\n"; 101 | if($donext == 1 && $line =~ /<\/div>/){ 102 | $abs_got++; 103 | last; 104 | } 105 | if($line =~ /class=\"Abstract\".*>(.*)/){ 106 | $line=$1; 107 | $donext=1; 108 | } 109 | if($donext == 1){ 110 | $line =~ s/<.*?>//g;#remove html tags 111 | $line=~s/^\s+//; #remove leading spaces 112 | chomp($line); 113 | $abstract .= $line . 
" "; 114 | } 115 | } 116 | 117 | #print "ABS:". $abstract . "\n\n"; 118 | } 119 | } 120 | elsif (/Sciencedirect/){ 121 | #scrape the abstract directly from the HTTP response 122 | my @html = split /\n/, $response->content(); 123 | foreach my $line (@html){ #reads line by line from response content 124 | #WAS PREVIOUSLY if($line =~ /
Abstract<\/h3>(.*)<\/p><\/div>/){ 125 | if($line =~ /
Abstract<\/h2>(.*)<\/p><\/div>/){ 126 | $abstract = $1; 127 | $abstract =~ s/<.*?>//g;#remove html tags 128 | $abstract=~s/^\s+//; #remove leading spaces 129 | chomp($abstract); 130 | $abs_got++; 131 | last; 132 | } 133 | } 134 | } 135 | elsif (/IEEE/){ 136 | #scrape the abstract directly from the HTTP response 137 | #since 2012 everything is on one line. This makes sure divs start on a new line otherwise later select fails 138 | my $responseContent = $response->content(); 139 | $responseContent =~ s/>> $line \n"; 143 | $line =~ s/\r//; #get \r in 2008 and 2009! 144 | #This works with one style (2008) 145 | if($donext == 1){ 146 | $line =~ s/<.*?>//g;#remove html tags 147 | $line=~s/^\s+//; #remove leading spaces 148 | chomp($line); 149 | $abstract = $line; 150 | $abs_got++; 151 | last; 152 | } 153 | if($line =~ /
Abstract<\/h2>/){ 154 | $donext=1; 155 | } 156 | #This works with another style (the main one) 157 | if($line =~ /class=\"abs-articlesummary\"/){ 158 | $line =~ s/<.*?>//g;#remove html tags 159 | $line=~s/^\s+//; #remove leading spaces 160 | chomp($line); 161 | $abstract = $line; 162 | $abs_got++; 163 | last; 164 | } 165 | } 166 | } 167 | else{ 168 | print "**** error ****\n";#this should never happen 169 | } 170 | } 171 | 172 | 173 | $abstract=~s/^\s+//; #remove leading spaces 174 | $abstract=~s/\s+$//; #remove trailing spaces 175 | #deal with pesky symbols - left and right single quote marks, normal quotes etc 176 | #a general s/\&.*;// would be too risky 177 | $abstract =~ s/\"//g; 178 | $abstract =~ s/'//g; #occurs as students' and later gets converted to &psila; leading to studentspsila in the TM 179 | $abstract =~ s/\ / /g; 180 | $abstract =~ s/\&/and/g; 181 | $abstract =~ s/\–/-/g; 182 | $abstract =~ s/\&.5;//g; 183 | $abstract =~ s/\&.4;//g; 184 | $abstract =~ s/\&.3;//g; 185 | $abstract =~ s/\&.2;//g; 186 | $abstract =~ s/\&#.*;//g; 187 | 188 | $abstract =~ s/\‘//g; 189 | $abstract =~ s/\’//g; 190 | $abstract =~ s/\“//g; 191 | $abstract =~ s/\”//g; 192 | #$abstract =~ s/\&psila;//g; 193 | #$abstract =~ s/\™//g; 194 | #$abstract =~ s/\•//g; 195 | $title =~ s/\"//g; 196 | 197 | print "$year $title \n $abstract\n"; 198 | print OUTFILE "$year,$pages,\"$title\",\"$authors\",\"$abstract\",\"\",$url,\"$dblp_url\"\n" 199 | }else{ 200 | print "Fetch of DOI URL ". $url . " failed\n\n"; 201 | } 202 | }else{ 203 | print "No DOI URL found: $year $title\n\n"; 204 | $no_url++; 205 | } 206 | } 207 | } 208 | close OUTFILE; 209 | print "\nSUMMARY: $url_found URLs found, from which $abs_got abstracts were recovered. $no_url items had no URL to retrieve\n"; 210 | -------------------------------------------------------------------------------- /Abstract Acquisition Scripts/ReadMe.txt: -------------------------------------------------------------------------------- 1 | == Scripts etc to acquire abstracts for various conferences == 2 | Requires use of BaseX 3 | 4 | Notes on how to do it follow: 5 | 6 | Download the DBLP master XML file and unzip 7 | 8 | In BaseX, create a new DB, setting the option to use the internal parser and to parse entities (not doing this causes a barf due to too many entities) 9 | 10 | Use one of the saved XQueries (.xq file) to create an extract: save this 11 | 12 | Add an xml decl and root element to it (this can be done with the "wrap inproceedings.sh": 13 | 14 | 15 | ... 16 | 17 | 18 | NB: CAL requires some additional processing to fudge the XML element names to "inproceedings" style - see "wrap inproceedings CAL.sh" 19 | 20 | Run the PERL "DBLP XML fetch abstracts.pl", having set the file locations in it 21 | 22 | Import into LibreOffice as UTF-8 23 | NB sometimes it seems necessary to set the encoding to Western Europe (ISO-8859-1) since this is what comes through, originally from DBLP (this may have been caused by incorrect XML decl addition to inproceedings in the past) 24 | 25 | NB!!!!!!!!! 26 | Since CAL proceedings appear rather long after the conference, the data in DBLP is likely to be +1 year. CHeck and edit in the CSV!!!!!!! 
27 | (this may also apply to ICWL since it is late in the year) 28 | 29 | -------------------------------------------------------------------------------- /Abstract Acquisition Scripts/extractfromDBLP CAL.xq: -------------------------------------------------------------------------------- 1 | root()/*:dblp/*:article[journal[text() = "Computers & Education"]][volume[text()="54"]][number[text()="3"]]| 2 | root()/*:dblp/*:article[journal[text() = "Computers & Education"]][volume[text()="50"]][number[text()="2"]] 3 | 4 | #2011 CAL conference 5 | root()/*:dblp/*:article[journal[text() = "Computers & Education"]][volume[text()="59"]][number[text()="1"]] 6 | -------------------------------------------------------------------------------- /Abstract Acquisition Scripts/extractfromDBLP EC-TEL.xq: -------------------------------------------------------------------------------- 1 | root()/*:dblp/*:inproceedings[booktitle[text() = "EC-TEL"]] -------------------------------------------------------------------------------- /Abstract Acquisition Scripts/extractfromDBLP ICALT.xq: -------------------------------------------------------------------------------- 1 | root()/*:dblp/*:inproceedings[booktitle[text() = "ICALT"] and year[text() = "2011"]] 2 | -------------------------------------------------------------------------------- /Abstract Acquisition Scripts/extractfromDBLP ICHL.xq: -------------------------------------------------------------------------------- 1 | root()/*:dblp/*:inproceedings[booktitle[text() = "ICHL"]] 2 | # 3 | root()/*:dblp/*:inproceedings[booktitle[text() = "ICHL"] and year[text() = "2011"]] 4 | -------------------------------------------------------------------------------- /Abstract Acquisition Scripts/extractfromDBLP ICWL.xq: -------------------------------------------------------------------------------- 1 | root()/*:dblp/*:inproceedings[booktitle[text() = "ICWL"]] 2 | # 3 | root()/*:dblp/*:inproceedings[booktitle[text() = "ICWL"] and year[text() = "2011"]] 4 | -------------------------------------------------------------------------------- /Abstract Acquisition Scripts/wrap inproceedings CAL.sh: -------------------------------------------------------------------------------- 1 | #makes the inproceedings.xml file be "proper" 2 | #also changes the XML from
journal-article style element names to the inproceedings/booktitle form 3 | # Also removes unhelpful HTML tags as mixed content is unhelpful! 4 | echo "\n" > ./inproceedings.tmp 5 | cat "CAL inproceedings 2011.xml" | sed -e 's/
/booktitle>/g' -e 's// /' -e 's/<\/i>/ /' >> ./inproceedings.tmp 6 | echo "\n" >> ./inproceedings.tmp 7 | mv "CAL inproceedings 2011.xml" "CAL inproceedings 2011.old" 8 | mv inproceedings.tmp "CAL inproceedings 2011.xml" 9 | 10 | 11 | -------------------------------------------------------------------------------- /Abstract Acquisition Scripts/wrap inproceedings ICHL.sh: -------------------------------------------------------------------------------- 1 | #makes the inproceedings.xml file be "proper". Also removes unhelpful HTML tags as mixed content is unhelpful! 2 | echo "\n" > ./inproceedings.tmp 3 | cat "ICHL inproceedings.xml" | sed -e 's// /' -e 's/<\/i>/ /' >> ./inproceedings.tmp 4 | echo "\n" >> ./inproceedings.tmp 5 | mv "ICHL inproceedings.xml" "ICHL inproceedings.old" 6 | mv inproceedings.tmp "ICHL inproceedings.xml" 7 | -------------------------------------------------------------------------------- /Abstract Acquisition Scripts/wrap inproceedings.sh: -------------------------------------------------------------------------------- 1 | #makes the inproceedings.xml file be "proper". Also removes unhelpful HTML tags as mixed content is unhelpful! 2 | #echo "\n" > ./inproceedings.tmp 3 | #cat "ECTEL inproceedings 2012.xml" | sed -e 's// /' -e 's/<\/i>/ /' >> ./inproceedings.tmp 4 | #echo "\n" >> ./inproceedings.tmp 5 | #mv "ECTEL inproceedings 2012.xml" "ECTEL inproceedings 2012.old" 6 | #mv inproceedings.tmp "ECTEL inproceedings 2012.xml" 7 | 8 | echo "\n" > ./inproceedings.tmp 9 | cat "ICALT inproceedings 2012.xml" | sed -e 's// /' -e 's/<\/i>/ /' >> ./inproceedings.tmp 10 | echo "\n" >> ./inproceedings.tmp 11 | mv "ICALT inproceedings 2012.xml" "ICALT inproceedings 2012.old" 12 | mv inproceedings.tmp "ICALT inproceedings 2012.xml" 13 | 14 | echo "\n" > ./inproceedings.tmp 15 | cat "ICWL inproceedings 2012.xml" | sed -e 's// /' -e 's/<\/i>/ /' >> ./inproceedings.tmp 16 | echo "\n" >> ./inproceedings.tmp 17 | mv "ICWL inproceedings 2012.xml" "ICWL inproceedings 2012.old" 18 | mv inproceedings.tmp "ICWL inproceedings 2012.xml" 19 | 20 | echo "\n" > ./inproceedings.tmp 21 | cat "ICHL inproceedings 2012.xml" | sed -e 's// /' -e 's/<\/i>/ /' >> ./inproceedings.tmp 22 | echo "\n" >> ./inproceedings.tmp 23 | mv "ICHL inproceedings 2012.xml" "ICHL inproceedings 2012.old" 24 | mv inproceedings.tmp "ICHL inproceedings 2012.xml" 25 | -------------------------------------------------------------------------------- /Adjacent Words/AdjacentWords.R: -------------------------------------------------------------------------------- 1 | ## 2 | ## find which words preceed and follow a given word(words) and create a directed graph to visualise this pattern, showing frequency by node weight 3 | 4 | library("igraph") 5 | library("RSQLite") 6 | 7 | ## TO DO refactor to use database queries for year-splitting????? 
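## Illustrative sketch (not part of the original script): for a single focus word such as
## "cloud", the before/after patterns built below take the form
##    b.pattern = "(\\w+\\s+)((?=cloud\\s))"     # word immediately BEFORE the focus word
##    a.pattern = "((?<=\\scloud))(\\s+\\w+)"    # word immediately AFTER the focus word
## They are applied with gregexpr(..., perl=TRUE) and regmatches(), and the matched words
## are then tabulated to give the adjacency frequencies that feed the graph output.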
8 | 9 | if(profile){ 10 | Rprof() 11 | } 12 | 13 | ## USEFUL FUNCTIONS 14 | make.table<-function(x){ 15 | t<- table(as.factor(x)) 16 | return(t) 17 | } 18 | source("~/R Projects/Text Mining Weak Signals/Common Functions/CustomStopwords.R") 19 | 20 | 21 | # filesystem setup 22 | base.dir<-"/home/arc1/R Projects/Text Mining Weak Signals" 23 | source.dir<-paste(base.dir,"Source Data",sep="/") 24 | output.dir<-paste("/home/arc1/R Projects/Text Mining Weak Signals Output/Adjacent Words",set.name,sep="/") 25 | #brew.dir<-paste(base.dir,"AdjacentWords",sep="/") 26 | dir.create(output.dir, showWarnings=FALSE) 27 | setwd(output.dir) 28 | 29 | ## Preliminaries to get the run parameters ready 30 | if(source.type=="a"){ 31 | ## ADJUST THESE to get "nice" results (otherwise may be over-sparse, "no useful results" or over-dense) 32 | #the quantile to be applied to adjacent word frequency when visualising (only cases above the cut appear) 33 | cut.quantile<-0.85 34 | cut.quantile.split<-0.75 #used for a split-off year. should be smaller than cut.quantile otherwise they are sparse 35 | title<-"Conference Proceedings from ICALT, ECTEL, CAL, ICHL and ICWL" 36 | short.title<-"Conferences" 37 | filename.postfix<-"conf" 38 | }else if(source.type=="b"){ 39 | cut.quantile<-0.95 40 | cut.quantile.split<-0.85 41 | title<-"TEL Blogs Jan 2009-" 42 | short.title<-"TEL Blogs" 43 | filename.postfix<-"blog" 44 | }else{ 45 | stop(paste("Unknown Source Type:",source.type)) 46 | } 47 | if(is.na(focus.placeholder)){ 48 | focus.placeholder<-focus.words[1] 49 | } 50 | if(hide.stopwords){ 51 | stop.words<-CustomStopwords() 52 | } 53 | 54 | ## SET UP DATABASE and SQL according to source type 55 | sqlite.filename<-"TMWS Data A.sqlite" 56 | use.sqlite<-!is.na(sqlite.filename) 57 | if(use.sqlite){ 58 | # instantiate the SQLite driver in the R process 59 | sqlite<- dbDriver("SQLite") 60 | # open sqlite connection. db is a "connection" 61 | db<- dbConnect(sqlite, dbname=paste(source.dir,sqlite.filename,sep="/")) 62 | summary(db) 63 | } 64 | #must not have a "-" in query because it means Boolean NOT 65 | queryTerms<-unique(gsub("-","",focus.words)) 66 | if(source.type=="a"){ 67 | sql<-"SELECT year, title, abstract as content FROM abstract WHERE year > '2006'" 68 | sql<-paste(sql," AND id in(SELECT docid from abstract_fts4 where abstract match '", 69 | paste(queryTerms, collapse=" OR "),"')",sep="") 70 | }else{ 71 | sql<-"SELECT content,datestamp FROM blog_post WHERE datestamp > '2009-01-01'" 72 | sql<-paste(sql," AND id in(SELECT docid from blog_post_fts4 where content match '", 73 | paste(queryTerms, collapse=" OR "),"')",sep="") 74 | } 75 | 76 | 77 | ## 78 | ## Log run info and stats 79 | ## 80 | name.suffix <-gsub("/","-",focus.placeholder) 81 | log.file<-file(paste("AdjacentWords - ",name.suffix,".log", sep=""), open="wt") 82 | cat(title, file = log.file, sep = "\n", append = FALSE) 83 | cat(paste("Run for", focus.placeholder), file = log.file, sep = "\n") 84 | cat(paste("Central node is:", paste(focus.words,sep=",")), file = log.file, sep = "\n") 85 | cat(file = log.file, sep = "\n") 86 | cat(paste("Only showing words above the ", cut.quantile*100, "% quantile",sep=""),file = log.file, sep = "\n") 87 | if(split.years){ 88 | cat(paste("(or for single years, the ", cut.quantile.split*100, "% quantile)",sep=""),file = log.file, sep = "\n") 89 | } 90 | cat(file = log.file, sep = "\n") 91 | 92 | ## 93 | ## Read in the documents. 
from SQLite 94 | ## 95 | source.table<-dbGetQuery(db,sql)#query, fetch all records to dataframe and clear resultset in one go 96 | 97 | #some massaging depending on the source 98 | if (source.type=="b"){ 99 | if(split.years){ 100 | source.table<-cbind(source.table, data.frame(year=substr(source.table[,"datestamp"],1,4)))# for splitting 101 | } 102 | } 103 | 104 | #prepare to split by years 105 | #a bit of a hack - dummy year = "" is used for all-years 106 | if(split.years){ 107 | y<-as.factor(c(as.character(source.table[,"year"]),"")) 108 | }else{ 109 | y<-as.factor("") 110 | } 111 | 112 | # lloop over the years or a dummy loop over all-years as required 113 | for(y.filter in levels(y)){ 114 | if(y.filter==""){ 115 | cat("Processing all entries", file = log.file, sep = "\n") 116 | p<-source.table[,"content"] 117 | }else{ 118 | y.bool<-y==y.filter 119 | cat(paste("Loop for:",y.filter, " N(docs)=",sum(y.bool)), file = log.file, sep = "\n") 120 | p<-source.table[y.bool,"content"] 121 | } 122 | print(paste("**************", y.filter,"**************")) 123 | 124 | #change "." to something that matches as a "word"; this is used to mark end and beginning of sentence as a pseudo-word in the results 125 | sent.mark=" eos " 126 | p<- gsub(pattern="\\.", replacement=sent.mark, p) 127 | #make sure we start and end with markers 128 | p<-gsub(pattern="\\A|\\Z", replacement = sent.mark, p, perl=T) 129 | 130 | #kill any stray HTML entities that may have got through 131 | p<-gsub(pattern="&#[0-9]+;", replacement = "", p, perl=T) 132 | #kill remaining punctuation 133 | p<- gsub(pattern="[[:punct:]]", replacement="", p) 134 | #force lower case since we do not care about case differences 135 | p<-tolower(p) 136 | 137 | 138 | # find words before the given one. Convention is "b" means before, "a" means after 139 | # spaces are kept in the match because ?<= doesn't allow repetition (remove spaces later) 140 | # - patterns 141 | b.pattern<- paste("(\\w+\\s+)(",paste("(?=",focus.words, "\\s)", collapse="|",sep=""), ")", sep="") 142 | a.pattern<- paste("(",paste("(?<=\\s",focus.words,")",collapse="|", sep=""),")(\\s+\\w+)", sep="") 143 | # - matching 144 | mp.b<-gregexpr(b.pattern,p, perl=T, ignore.case=T) 145 | rm.b<-regmatches(p,mp.b) 146 | mp.a<-gregexpr(a.pattern,p, perl=T, ignore.case=T) 147 | rm.a<-regmatches(p,mp.a) 148 | # - normalise case and eliminate spaces 149 | words.b<-gsub(pattern=" ", replacement="", tolower(unlist(rm.b))) 150 | words.a<-gsub(pattern=" ", replacement="", tolower(unlist(rm.a))) 151 | # - tabulation of the word occurrences 152 | t.b<-make.table(words.b) 153 | t.a<-make.table(words.a) 154 | #this should equal the number of focus words. NB: this is before removing stopwords or selecting 155 | total.count<-sum(t.a) 156 | 157 | # ABORT if nothing found - i.e. if the focus words never appear in a matchable context 158 | if(y.filter=="" && (sum(t.a)==0 || sum(t.b)==0)){ 159 | stop("FOCUS WORDS NOT FOUND. STOPPING") 160 | } 161 | 162 | #which terms are above the Qth quantile? 
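## (Illustrative note, not in the original script: with e.g. cut.quantile = 0.85,
## floor(quantile(t.b, probs=0.85)) gives the adjacent-word count at the 85th percentile
## of the frequency table; only words at or above the cut are retained, together with
## the 1.5% absolute-frequency floor applied just below.)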
163 | if(y.filter==""){ 164 | q<-cut.quantile 165 | }else{ 166 | q<-cut.quantile.split 167 | } 168 | cut.b<-floor(quantile(t.b, probs=q)) 169 | cut.a<-floor(quantile(t.a, probs=q)) 170 | t.b.sel<- t.b[t.b>=cut.a] 171 | t.a.sel<- t.a[t.a>=cut.b] 172 | #additionally always suppress words with <1.5% freq 173 | f.cut<-total.count*0.015 174 | t.b.sel<- t.b.sel[t.b.sel>f.cut] 175 | t.a.sel<- t.a.sel[t.a.sel>f.cut] 176 | 177 | #remove stopwords if required, and also remove the "end of sentence" tag in this case 178 | if(hide.stopwords){ 179 | t.a.sel<-t.a.sel[!(names(t.a.sel)%in% stop.words)] 180 | t.b.sel<-t.b.sel[!(names(t.b.sel)%in% stop.words)] 181 | t.a.sel<-t.a.sel[(names(t.a.sel)!="eos")] 182 | t.b.sel<-t.b.sel[(names(t.b.sel)!="eos")] 183 | }else{ 184 | #revert the "eos" placeholder to a full stop for display 185 | names(t.a.sel)[names(t.a.sel)=="eos"]<-"." 186 | names(t.b.sel)[names(t.b.sel)=="eos"]<-"." 187 | } 188 | 189 | if(length(t.a.sel)==0 && length(t.b.sel)==0){ 190 | print(paste("No useful results for", y.filter," - skipping")) 191 | cat("No useful results", file=log.file, sep="\n") 192 | }else{ 193 | #print/log actual word counts as these are sometimes useful 194 | cat(paste("Word count equivalent to 100% is:",total.count) ,file = log.file, sep = "\n") 195 | print("Word frequencies (before):") 196 | print(t.b.sel) 197 | print("Word frequencies (after):") 198 | print(t.a.sel) 199 | 200 | #scale factor -> percentages as weights 201 | scale<-100.0 / total.count 202 | 203 | ## 204 | ## VISUALISATION 205 | ## 206 | ## in Gephi use graphviz layout engine. 207 | #node output for Gephi. NB: weight is sqrt((freq) 208 | focus.id<-"_focus_" 209 | df.nodes<-data.frame(Id=focus.id, Label=focus.placeholder, Weight=sqrt(sum(t.a))) 210 | df.edges<-data.frame() 211 | if(length(t.b.sel)>0){ 212 | before.ids<-paste(names(t.b.sel), "_", sep="") 213 | df.nodes<-rbind(df.nodes, data.frame(Id=before.ids, Label=names(t.b.sel), Weight=sqrt(as.numeric(t.b.sel)))) 214 | #edges. NB: weight = freq, scaled to % 215 | df.edges<-data.frame(Source=before.ids, Target=rep(focus.id,length(before.ids)), Type=rep("Directed", length(before.ids)), Weight=scale*as.numeric(t.b.sel)) 216 | } 217 | if(length(t.a.sel)>0){ 218 | after.ids<-paste("_", names(t.a.sel), sep="") 219 | df.nodes<-rbind(df.nodes, data.frame(Id=after.ids, Label=names(t.a.sel), Weight=sqrt(as.numeric(t.a.sel)))) 220 | df.edges<-rbind(df.edges,data.frame(Source=rep(focus.id,length(after.ids)), Target=after.ids, Type=rep("Directed", length(after.ids)), Weight=scale*as.numeric(t.a.sel))) 221 | } 222 | 223 | #scale size to area for node weight before writing out 224 | write.csv(df.nodes, file=paste(name.suffix, y.filter, filename.postfix, "AdjacentWord Nodes.csv",sep=" "), row.names=FALSE) 225 | write.csv(df.edges, file=paste(name.suffix, y.filter, filename.postfix, " AdjacentWord Edges.csv",sep=" "), row.names=FALSE) 226 | 227 | df.nodes 228 | df.edges 229 | 230 | # graph using igraph - intended for interactive use; gephi for "smart" presentation 231 | # rename the columns as required 232 | colnames(df.nodes)[2]<-"label" 233 | colnames(df.nodes)[3]<-"size" 234 | #edge label = weight (= linked word weighted freq) but neatened up! 235 | df.edges<-cbind(df.edges, data.frame(label = paste(round(df.edges[,"Weight"]),"%", sep=""))) 236 | df.nodes[,"size"]<-df.nodes[,"size"]*10*sqrt(scale) #make disaply easier to look at! 
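## (Note for clarity, not in the original: igraph's graph.data.frame() uses the first two
## columns of df.edges as the edge list and the first column of df.nodes as the vertex
## names; the remaining columns become edge/vertex attributes, which is why the "label"
## and "size" columns are prepared above before the graph is built.)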
237 | g <- graph.data.frame(df.edges, directed=TRUE, vertices=df.nodes) 238 | write.graph(g, file=paste(name.suffix," ", y.filter, " ", filename.postfix,".graphml", sep=""), format="graphml") 239 | subText<-paste(total.count,"words = 100%") 240 | plot.igraph(g, main=paste(short.title,y.filter), sub=subText, edge.arrow.size=2) 241 | png(file=paste(name.suffix," ", y.filter, " ", filename.postfix,".png", sep=""), width=1000, height=1000,pointsize=12, res=150) 242 | plot.igraph(g, main=paste(short.title,y.filter), xlab=subText, xlim=c(-0.8,0.8),ylim=c(-0.8,0.8), edge.arrow.size=2) 243 | ad<-dev.off() 244 | }# end if useful results 245 | 246 | }#end the y.filter loop 247 | 248 | #stop logging 249 | close(log.file) 250 | 251 | # properly terminate database use 252 | if(use.sqlite){ 253 | dbDisconnect(db) 254 | } 255 | 256 | if(profile){ 257 | Rprof(NULL) 258 | # summaryRprof("Rprof.out")$by.self 259 | } -------------------------------------------------------------------------------- /Adjacent Words/TEL Blogs 20090101-20120912 Init.R: -------------------------------------------------------------------------------- 1 | 2 | ## Run Properties - dependent on the source 3 | base.dir<-"/home/arc1/R Projects/Text Mining Weak Signals" 4 | source.dir<-paste(base.dir,"Source Data",sep="/") 5 | set.name<-"TEL Blogs 20090101-20120912" #<<<<<<<<<<<<< edit me - for the output dir 6 | output.dir<-paste("/home/arc1/R Projects/Text Mining Weak Signals Output/Adjacent Words",set.name,sep="/") 7 | #brew.dir<-paste(base.dir,"AdjacentWords",sep="/") 8 | 9 | dir.create(output.dir, showWarnings=TRUE) 10 | setwd(output.dir) 11 | 12 | ## SOURCE DATA SELECTION 13 | # Either a) [DEPRECATED] list csv files "with metrics" as produced by Pre-process.R These are combined into one corpus or 14 | # b) Locate a SQLite Database and define a query to extract 15 | set.csv<-NA 16 | sqlite.filename<-"TMWS Data A.sqlite" #set this to NA to use [deprecated] option a 17 | sql<-"SELECT content,datestamp FROM blog_post WHERE datestamp BETWEEN '2009-01-01' AND '2012-09-12'" 18 | use.sqlite<-!is.na(sqlite.filename) 19 | 20 | title<-"TEL Blogs Jan 2009 - Mid Sept 2012" 21 | 22 | data.type<-"b" #a = conference abstracts, b = blogs 23 | 24 | #whether to split the dataset by year and create separate plots for each 25 | split.years=TRUE 26 | 27 | # allow for multiple words to be the "word in focus", around which the previous/following words are counted. a compound word is OK (e.g. "learning object") 28 | focus.words=c("cloud") 29 | focus.placeholder<-"cloud" #used as the label in the visualistion to represent the focus word(s) 30 | 31 | #the quantile to be applied to adjacent word frequency when visualising (only cases above the cut appear) 32 | cut.quantile<-0.99 33 | cut.quantile.split<-0.98 #used for a split-off year. 
often smaller than cut.quantile otherwise they are sparse 34 | 35 | source(paste(base.dir,"Adjacent Words/AdjacentWords.R", sep="/")) -------------------------------------------------------------------------------- /Adjacent Words/Union C 2005-2011 Init.R: -------------------------------------------------------------------------------- 1 | 2 | ## Run Properties - dependent on the source 3 | base.dir<-"/home/arc1/R Projects/Text Mining Weak Signals" 4 | source.dir<-paste(base.dir,"Source Data",sep="/") 5 | set.name<-"Union C 2006 to 2011" 6 | output.dir<-paste("/home/arc1/R Projects/Text Mining Weak Signals Output/Adjacent Words",set.name,sep="/") 7 | #brew.dir<-paste(base.dir,"AdjacentWords",sep="/") 8 | 9 | dir.create(output.dir, showWarnings=TRUE) 10 | setwd(output.dir) 11 | 12 | ## SOURCE DATA SELECTION 13 | # Either a) [DEPRECATED] list csv files "with metrics" as produced by Pre-process.R These are combined into one corpus or 14 | # b) Locate a SQLite Database and define a query to extract 15 | #sets.csv <- c("ICALT Abstracts 2005-2011 with metrics.csv", 16 | #"ECTEL Abstracts 2006-2011 with metrics.csv", 17 | #"ICWL Abstracts 2005-2011 with metrics.csv", 18 | #"ICHL Abstracts 2008-2011 with metrics.csv", 19 | # "CAL Abstracts 2007-2011 with metrics.csv") 20 | set.csv<-NA 21 | sqlite.filename<-"TMWS Data A.sqlite" #set this to NA to use [deprecated] option a 22 | sql<-"SELECT year, title, abstract FROM abstract WHERE year BETWEEN '2005' AND '2011'" 23 | use.sqlite<-!is.na(sqlite.filename) 24 | 25 | title<-"Conference Proceedings from ICALT, ECTEL, CAL, ICHL and ICWL" 26 | 27 | data.type<-"a" #a = conference abstracts, b = blogs 28 | 29 | #whether to split the dataset by year and create separate plots for each 30 | split.years=TRUE 31 | 32 | # allow for multiple words to be the "word in focus", around which the previous/following words are counted. a compound word is OK (e.g. "learning object") 33 | focus.words=c("analytics") 34 | focus.placeholder<-"analytics" #used as the label in the visualistion to represent the focus word(s) 35 | 36 | ## ADJUST THESE to get "nice" results (otherwise may be over-sparse, "no useful results" or over-dense) 37 | #the quantile to be applied to adjacent word frequency when visualising (only cases above the cut appear) 38 | cut.quantile<-0.5#0.95 39 | cut.quantile.split<-0.1 #used for a split-off year. 
should be smaller than cut.quantile otherwise they are sparse 40 | 41 | source(paste(base.dir,"Adjacent Words/AdjacentWords.R", sep="/")) -------------------------------------------------------------------------------- /Adjacent Words/Union C 2005-2012 Init.R: -------------------------------------------------------------------------------- 1 | 2 | ## Run Properties - dependent on the source 3 | base.dir<-"/home/arc1/R Projects/Text Mining Weak Signals" 4 | source.dir<-paste(base.dir,"Source Data",sep="/") 5 | set.name<-"Union C 2006 to 2012" 6 | output.dir<-paste("/home/arc1/R Projects/Text Mining Weak Signals Output/Adjacent Words",set.name,sep="/") 7 | #brew.dir<-paste(base.dir,"AdjacentWords",sep="/") 8 | 9 | dir.create(output.dir, showWarnings=TRUE) 10 | setwd(output.dir) 11 | 12 | ## SOURCE DATA SELECTION 13 | # Either a) [DEPRECATED] list csv files "with metrics" as produced by Pre-process.R These are combined into one corpus or 14 | # b) Locate a SQLite Database and define a query to extract 15 | #sets.csv <- c("ICALT Abstracts 2005-2011 with metrics.csv", 16 | #"ECTEL Abstracts 2006-2011 with metrics.csv", 17 | #"ICWL Abstracts 2005-2011 with metrics.csv", 18 | #"ICHL Abstracts 2008-2011 with metrics.csv", 19 | # "CAL Abstracts 2007-2011 with metrics.csv") 20 | set.csv<-NA 21 | sqlite.filename<-"TMWS Data A.sqlite" #set this to NA to use [deprecated] option a 22 | sql<-"SELECT year, title, abstract FROM abstract WHERE year BETWEEN '2005' AND '2012'" 23 | use.sqlite<-!is.na(sqlite.filename) 24 | 25 | title<-"Conference Proceedings from ICALT, ECTEL, CAL, ICHL and ICWL" 26 | 27 | data.type<-"a" #a = conference abstracts, b = blogs 28 | 29 | #whether to split the dataset by year and create separate plots for each 30 | split.years=TRUE 31 | 32 | # allow for multiple words to be the "word in focus", around which the previous/following words are counted. a compound word is OK (e.g. "learning object") 33 | focus.words=c("gesture") 34 | focus.placeholder<-"gesture" #used as the label in the visualistion to represent the focus word(s) 35 | 36 | ## ADJUST THESE to get "nice" results (otherwise may be over-sparse, "no useful results" or over-dense) 37 | #the quantile to be applied to adjacent word frequency when visualising (only cases above the cut appear) 38 | cut.quantile<-0.5#0.95 39 | cut.quantile.split<-0.0 #used for a split-off year. should be smaller than cut.quantile otherwise they are sparse 40 | 41 | source(paste(base.dir,"Adjacent Words/AdjacentWords.R", sep="/")) -------------------------------------------------------------------------------- /Alerts/BlogScore Alert.R: -------------------------------------------------------------------------------- 1 | ## ***Made available using the The MIT License (MIT)*** 2 | # Copyright (c) 2012, Adam Cooper 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 5 | # 6 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
7 | # 8 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 9 | ## ************ end licence *************** 10 | 11 | 12 | ## 13 | ## A Simple alerter. Produce a concise report to highligh blog posts in a (short) timewindow with 14 | ## pos/neg/subj, econ/polit/legal/knowing/doing, novelty scores. 15 | ## Relies on Database for source data (no CSV) and assumes that pre-process.R has already been run 16 | ## 17 | library("RSQLite") 18 | library("tm") 19 | library("slam") 20 | library("brew") 21 | #library("RColorBrewer") 22 | 23 | home.dir<-"/home/arc1/R Projects/Text Mining Weak Signals" 24 | output.dir<-"/home/arc1/R Projects/Text Mining Weak Signals Output" 25 | db.dir<-paste(home.dir,"Source Data",sep="/") 26 | template.dir<-paste(home.dir,"Alerts",sep="/") 27 | 28 | ## 29 | ## RUN PARAMETERS - Often Changed 30 | title<-"TEL Blog Scan - Last 14 Days" 31 | # The date of the report. Posts up to and including this date are candidates for output 32 | report.date<-as.POSIXlt("2012-09-26") 33 | # how many days to include when seeking candidates 34 | report.days<-14 35 | 36 | ## 37 | ## RUN PARAMETERS - Not often changed 38 | # locate the database 39 | sqlite.filename <- "TMWS Data A.sqlite" 40 | # set output subfolder 41 | output.subdir <- "BlogScore Alert" 42 | # score columns. **** NB score.mins must follow this order ***** 43 | score.cols<-c("pos_score", "neg_score", "subj_score", "econ_score", "polit_score", "legal_score", 44 | "doing_score", "knowing_score") 45 | # Reader-fiendly captions to match. These determine what is reported (set an element to NA to omit) 46 | score.title<-c("Positive Sentiment", "Negative Sentiment",NA,"Economics Topic", "Political Topic", "Legal Topic", "Doing: Aims and Means", "Knowing and Problem-solving") 47 | # These are shown as explanatory text under the titles in the report. 48 | score.caption<-c("Posts containing a high proportion of words associated with positive sentiment", 49 | "Posts containing a high proportion of words associated with negative sentiment", 50 | "Posts with high subjective character (both positive and negative sentiment", 51 | "Posts containing typically economic keywords", 52 | "Posts containing typically political keywords", 53 | "Posts containing typically legal keywords", 54 | "Posts containing keywords associated with motivation, goals, the means of achieving goals and also the state of failure", 55 | "Posts containing keywords associated with knowledge, knowledge acquisition and problem solving") 56 | # These are used to work out a scale the scores to give a "rating" for approximate comparability. 57 | # i.e. to that readers are not distracted by inherent differences between the dictionary word occurrence in the corpus. Use something like 1.1*(corpus max score) 58 | rating.fullscale<-c(.325,.181,.382, 59 | .275,.240,.154, #econ, polit, legal 60 | .1925,.224) 61 | # How many months to use for novelty baseline. 62 | nov.baseline.months<-12 63 | # Thresholds for selection 64 | # only ever show the top few. 
This is a max; if the score thresholds are not met nothing shows 65 | top.n<-4 66 | # dictionary score minima by dictionary. Order must be same as score.cols. 67 | # NB: these are unweighted and are used to define the point at which rating=1 68 | score.mins<-c(0.15, #pos 69 | 0.05, #neg 70 | 0.15, #subj 71 | 0.12, #econ 72 | 0.12, #ploit 73 | 0.04, #legal 74 | 0.08,#doing 75 | 0.08)#knowing 76 | # novelty score min 77 | nov.min<-0.82 78 | # novelty rating zero-point. make this less than nov.min. This is the far-left position on the rating gauge 79 | nov.ref<-0.7 80 | 81 | ## 82 | ## PRELIMINARIES - some initial setup-specific working 83 | # database query 84 | qdate<-function(d){ 85 | return (paste("'",as.character(d),"'", sep="")) 86 | } 87 | # this query defines the posts to be considered for report. BETWEEN is inclusive 88 | report.start<-report.date 89 | report.start$mday<-report.start$mday - report.days 90 | sql<- paste("select content, title, authors, datestamp, url,",paste(score.cols, collapse=", "), 91 | "from blog_post where datestamp between",qdate(report.start),"and", qdate(report.date)) 92 | # this query fetches the baseline (large) for novelty calculation 93 | nov.end<-report.start 94 | nov.end$mday<-nov.end$mday-1 95 | nov.start<-nov.end 96 | nov.start$mon<-nov.start$mon-nov.baseline.months 97 | baseline.sql<- paste("select content, datestamp from blog_post where datestamp between",qdate(nov.start),"and", qdate(nov.end)) 98 | # initialise database access 99 | # instantiate the SQLite driver in the R process 100 | sqlite<- dbDriver("SQLite") 101 | # open sqlite connection. db is a "connection" 102 | db<- dbConnect(sqlite, dbname=paste(db.dir,sqlite.filename,sep="/")) 103 | summary(db) 104 | # preparation for output destination 105 | #setwd(paste(output.dir, output.subdir,sep="/")) 106 | reportFile<-paste(paste(output.dir, output.subdir,paste(report.date,".html",sep=""),sep="/")) 107 | 108 | map<-list(Content="content", DateTimeStamp="datestamp")# Heading="title", Author="authors", URL="url") 109 | 110 | ## 111 | ## MAIN 112 | # Write out the HTML Header 113 | #this palette is used in brew for color-coding scores 114 | score.pal<-c("#00FFFF","#0000FF","#800080","#FF0000","#FF8040") 115 | pal.len<-length(score.pal) 116 | brew.conn<-file(reportFile, open="wt") 117 | brew(file=paste(template.dir,"BlogScore BrewHeader.html",sep="/"), output=brew.conn) 118 | 119 | # Loop over the dictionaries, emitting a section of HTML if there are any posts matching the thresholds 120 | for(i in 1:length(score.caption)){ 121 | section<-score.title[i] 122 | caption<-score.caption[i] 123 | if(!is.na(section)){ 124 | sect.sql<-paste(sql,"and ", score.cols[i],">",score.mins[i], "order by", score.cols[i],"desc limit", as.character(top.n)) 125 | hits<-dbGetQuery(db,sect.sql)#query, fetch all records to dataframe and clear resultset in one go 126 | #only create output if there are some "hits" 127 | if(length(hits[,1])>0){ 128 | #extract and massage the scores for "friendly" display 129 | scores<-hits[,score.cols[i]] 130 | ratings<-round(100*(scores-score.mins[i])/(rating.fullscale[i]-score.mins[i])) 131 | ratings.capped<-pmin(100,ratings) 132 | #write out 133 | brew(file=paste(template.dir,"BlogScore BrewChunk.html",sep="/"), output=brew.conn) 134 | } 135 | } 136 | } 137 | # The novelty calculation requires some real work since we need to compare all posts in the reporting-period window against all those in the baseline period. 
138 | # This is not quite the same as in "Rising and Falling Terms" since the two sets are disjoint here 139 | #candidates are in the reporting time window 140 | candidates<-dbGetQuery(db,sql) 141 | candidates.corp<-Corpus(DataframeSource(candidates), readerControl=list(reader= readTabular(mapping=map))) 142 | candidates.corp<-tm_map(candidates.corp,removeNumbers) 143 | candidates.corp<-tm_map(candidates.corp,removePunctuation) 144 | candidates.dtm.bin<-DocumentTermMatrix(candidates.corp, control=list(stemming=TRUE, stopwords=TRUE, minWordLength=3, weighting=weightBin)) 145 | #eliminate very short docs as they give unreliable novelty calcs (too sparse). must have >15 different non-stopwords 146 | ok.posts<-row_sums(candidates.dtm.bin)>15 147 | candidates<-candidates[ok.posts,] 148 | candidates.dtm.bin<-candidates.dtm.bin[ok.posts,] 149 | # the baseline is what novelty is calculated with respect to 150 | baseline<-dbGetQuery(db,baseline.sql) 151 | baseline.corp<-Corpus(DataframeSource(baseline), readerControl=list(reader= readTabular(mapping=map))) 152 | baseline.corp<-tm_map(baseline.corp,removeNumbers) 153 | baseline.corp<-tm_map(baseline.corp,removePunctuation) 154 | baseline.dtm.bin<-DocumentTermMatrix(baseline.corp, control=list(stemming=TRUE, stopwords=TRUE, minWordLength=3, weighting=weightBin)) 155 | # Calculate distance using cosine measure, i.e. as a scalar product 156 | #first vector norms 157 | doc.norm.mat<-sqrt(tcrossprod(row_sums(candidates.dtm.bin),row_sums(baseline.dtm.bin))) 158 | #now eliminate terms that are not shared since their product will be zero anyway and we need to avoid attempts to allocate memory beyond what is permitted. Norms calc must be BEFORE this. 159 | shared.terms<-Terms(candidates.dtm.bin)[Terms(candidates.dtm.bin) %in% Terms(baseline.dtm.bin)] 160 | candidates.dtm.bin<-candidates.dtm.bin[,shared.terms] 161 | baseline.dtm.bin<-baseline.dtm.bin[,shared.terms] 162 | #the dot product 163 | difference.mat<-1.0-tcrossprod(as.matrix(candidates.dtm.bin), 164 | as.matrix(baseline.dtm.bin)) / doc.norm.mat 165 | #this should not be necessary since sets are disjoint but it is possible identical posts are present, one in both sets, so make sure that any such do not interfere 166 | difference.mat[difference.mat[,]==0]<-1.0 167 | #sometimes NaNs creep in (not sure why) 168 | difference.mat[is.nan(difference.mat[,])]<-1.0 169 | #novelty means there is no other close doc so find the smallest difference 170 | novelty<-apply(difference.mat,1,min) 171 | # "hits" are those candidates above the threshold and in the top.n 172 | top.n.bool<-order(novelty, decreasing=T)[1:min(top.n,length(novelty))] 173 | hits<-candidates[top.n.bool,] 174 | nov.top.n<-novelty[top.n.bool] 175 | hits<-hits[nov.top.n>=nov.min,] 176 | #only create output if there are some "hits" 177 | if(length(hits[,1])>0){ 178 | section<-"Novelty" 179 | caption<-paste("Posts with an unusual combination of words compared to posts from the previous",nov.baseline.months,"months") 180 | #extract and massage the scores for "friendly" display 181 | scores<-nov.top.n[nov.top.n>=nov.min] 182 | ratings<-round(100*(scores-nov.min)/(1-nov.ref)) 183 | ratings.capped<-pmin(100,ratings) 184 | #write out 185 | brew(file=paste(template.dir,"BlogScore BrewChunk.html",sep="/"), output=brew.conn) 186 | } 187 | 188 | #Write out the HTML Footer and close the report file 189 | brew(file=paste(template.dir,"BlogScore BrewFooter.html",sep="/"), output=brew.conn) 190 | close(brew.conn) 191 | 192 | 193 | dbDisconnect(db) 
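## ----------------------------------------------------------------------------
## Illustrative sketch (not part of the original script): the novelty score used
## above is one minus the cosine similarity between each candidate post and its
## nearest baseline post, computed on binary document-term matrices. A minimal
## standalone version of the same measure, assuming two plain 0/1 matrices whose
## columns are the same shared term vocabulary:
cosineNovelty <- function(candidates.bin, baseline.bin){
   # sqrt(|A_i| * |B_j|): product of the vector norms of the binary rows
   norms <- sqrt(tcrossprod(rowSums(candidates.bin), rowSums(baseline.bin)))
   # 1 - cosine similarity for every candidate/baseline pair
   diff <- 1 - tcrossprod(candidates.bin, baseline.bin) / norms
   # novelty = distance to the closest baseline document
   apply(diff, 1, min)
}
## e.g. cosineNovelty(matrix(c(1,1,0, 0,1,1), nrow=2, byrow=TRUE),
##                    matrix(c(1,1,0, 1,0,1), nrow=2, byrow=TRUE))
## ----------------------------------------------------------------------------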
-------------------------------------------------------------------------------- /Alerts/BlogScore BrewChunk.html: -------------------------------------------------------------------------------- 1 | 2 |

<%=section%>

3 | 4 | 5 | <%for(kk in 1:length(hits[,1])){%> 6 | 7 | 11 | 14 | 15 | 16 | 21 | <%}%> 22 |
8 | <%=hits[kk,"authors"]%>
9 | "><%=hits[kk,"title"]%>
10 | <%=strtrim(hits[kk,"content"], 500)%>...
12 |
13 | (raw score=<%=format(scores[kk],digits=3)%>)
17 | 18 | 19 | 20 |
23 | -------------------------------------------------------------------------------- /Alerts/BlogScore BrewFooter.html: -------------------------------------------------------------------------------- 1 | 2 |

Info

3 | 4 |

Documents are scored according to the occurrence of words in the following lexicons derived from the categories of the Harvard General Inquirer: Economic combines "Econ@" and "ECON", Legal uses "Legal", Political combines "Polit@" and "POLIT", Doing combines a group of dictionaries associated with motivation ("Need", "Goal", "Try", "Means", "Persist", "Complet" and "Fail"), and Knowing and Problem-solving combines "Know" and "Solve". Sometimes a section is omitted from the alert; this occurs when there are no scores above threshold and at most <%=top.n%> posts appear in any section. The raw scores given are the fraction of words in each document that are found in the relevant lexicon. The ratings are an estimate of the size of the score compared to

5 | 6 | 7 |

Copyright, Licence and Credits

8 |

This work was undertaken as part of the TEL-Map Project; TEL-Map is a support and coordination action within EC IST FP7 Technology Enhanced Learning.

9 | 10 | 11 | 12 | 13 | 14 | 16 |
Creative Commons LicenceThis work and original text are ©2012 Adam Cooper, Institute for Educational Cybernetics, University of Bolton, UK.
Adam Cooper has licenced it under a Creative Commons Attribution 3.0 Unported License
Referenced blogs and quoted sections are © the relevant authors. The lexicons used are from the Harvard Inquirer spreadsheet 15 |
17 | 18 | 19 | 20 | -------------------------------------------------------------------------------- /Alerts/BlogScore BrewHeader.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Blog Scan Alerts: <%=as.character(report.date)%> 5 | 6 | 7 | 18 | 19 | 51 | 52 | 53 | 59 | 60 | 61 | 62 |

Blog Scan Alerts: <%=as.character(report.date)%>

63 | 64 | 65 | 66 | -------------------------------------------------------------------------------- /Alerts/RisingFallingTerms Alert.R: -------------------------------------------------------------------------------- 1 | ## ***Made available using the The MIT License (MIT)*** 2 | # Copyright (c) 2012, Adam Cooper 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 5 | # 6 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 7 | # 8 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 9 | ## ************ end licence *************** 10 | 11 | 12 | ## 13 | ## A Simple alerter. Produce a concise report to highligh blog posts in a (short) timewindow 14 | ## containing statistically significant rising/falling/new terms 15 | ## Relies on Database for source data (no CSV) and assumes that pre-process.R has already been run 16 | ## 17 | library("RSQLite") 18 | library("tm") 19 | library("brew") 20 | 21 | source("/home/arc1/R Projects/Text Mining Weak Signals/commonFunctions.R") 22 | 23 | home.dir<-"/home/arc1/R Projects/Text Mining Weak Signals" 24 | output.dir<-"/home/arc1/R Projects/Text Mining Weak Signals Output" 25 | db.dir<-paste(home.dir,"Source Data",sep="/") 26 | template.dir<-paste(home.dir,"Alerts",sep="/") 27 | 28 | ## 29 | ## RUN PARAMETERS - Often Changed 30 | # The date of the report. Posts up to and including this date are candidates for output 31 | report.date<-as.POSIXlt("2012-09-26") 32 | # how many days to include when seeking candidates 33 | report.days<-7 34 | # How many months to use for baseline. 35 | baseline.months<-12 36 | 37 | title<-paste("TEL Blog Scan - Last",report.days,"Days") 38 | 39 | ## 40 | ## RUN PARAMETERS - Not often changed 41 | # locate the database 42 | sqlite.filename <- "TMWS Data A.sqlite" 43 | # set output subfolder 44 | output.subdir <- "RFTerms Alert" 45 | # Thresholds for selection 46 | # only ever show the top few. This is a max; if the score thresholds are not met nothing shows 47 | top.n<-6 48 | # how many documents must the term appear in to be listed. This is in addition to the frequency thresholds. A value of 2 is expected, i.e. ignore terms that appear in only one doc 49 | # values higher than 2 may be needed for blogs where a given blog may have an oddity that appears in several posts. 50 | doc_count.thresh <- 4 51 | # p-value to accept the "alternative hypothesis" that there is something interesting 52 | thresh.pval<-0.0001 #i.e. accept a .01% chance that null hypothesis falsely rejected 53 | thresh.pval.falling<-0.005 #sometimes need to use a more lenient threshold for falling terms 54 | #max frequency of term in the past set for eligibility as a weak signal. 
55 | #Above this, sigifnicant risers are "established terms" 56 | max.past.freq<-0.0002 57 | # PREVENT any of these origins from appearing in the Zeitgeist list. They are usually just link lists! 58 | zg.blacklist<-c("http://www.lucygray.org/weblog") 59 | 60 | ## 61 | ## PRELIMINARIES - some initial setup-specific working 62 | # database query 63 | qdate<-function(d){ 64 | return (paste("'",as.character(d),"'", sep="")) 65 | } 66 | # this query defines the posts to be considered for report. BETWEEN is inclusive 67 | report.start<-report.date 68 | report.start$mday<-report.start$mday - report.days 69 | sql.recentIds<-paste("select id from blog_post where datestamp between",qdate(report.start),"and", qdate(report.date)) 70 | # this query defines the baseline (large), aka the "past" set of documents to compare against 71 | baseline.end<-report.start 72 | baseline.end$mday<-baseline.end$mday-1 73 | baseline.start<-baseline.end 74 | baseline.start$mon<-baseline.start$mon-baseline.months 75 | sql.pastIds<- paste("select id from blog_post where datestamp between",qdate(baseline.start),"and", qdate(baseline.end)) 76 | #this query fetches the text content and metadata for both recent and past sets 77 | sql<- paste("select id, content, title, authors, datestamp, url, origin from blog_post where datestamp between",qdate(baseline.start),"and", qdate(report.date)) 78 | 79 | # initialise database access 80 | # instantiate the SQLite driver in the R process 81 | sqlite<- dbDriver("SQLite") 82 | # open sqlite connection. db is a "connection" 83 | db<- dbConnect(sqlite, dbname=paste(db.dir,sqlite.filename,sep="/")) 84 | summary(db) 85 | # preparation for output destination 86 | #setwd(paste(output.dir, output.subdir,sep="/")) 87 | reportFile<-paste(paste(output.dir, output.subdir,paste(report.date,".html",sep=""),sep="/")) 88 | 89 | #ID="id", 90 | map<-list(id="id", Content="content", DateTimeStamp="datestamp")# Heading="title", Author="authors", URL="url") 91 | 92 | ## 93 | ## MAIN 94 | ## 95 | # get the ID lists for previous and current sets and the corpus of all documents in either set 96 | recentIds<-as.numeric(dbGetQuery(db,sql.recentIds)[,1]) 97 | pastIds<-as.numeric(dbGetQuery(db,sql.pastIds)[,1]) 98 | corpus.table<-dbGetQuery(db,sql) 99 | corpus<-Corpus(DataframeSource(corpus.table), readerControl=list(reader= readTabular(mapping=map))) 100 | # REALLY we would use ID="id" in the mapping and then the vectors recentIds and pastIds would simply be passed to PearsonChanges. 101 | # BUT there seems to be a bug in "tm" hence a mapping of id="id" is used and the next 4 lines map from "id" values as obtained from the database to ID metadata in the corpus 102 | db.ids<-unlist(meta(corpus,"id",type="local")) 103 | corpus.ids<-unlist(meta(corpus, "ID", type="local")) 104 | recentIds.c<-corpus.ids[db.ids %in% recentIds] 105 | pastIds.c<-corpus.ids[db.ids %in% pastIds] 106 | # compute the significant rising and falling terms 107 | swords<-CustomStopwords() 108 | rfTerms<-PearsonChanges.Corpus(corpus, pastIds.c, recentIds.c, 109 | doc_count.thresh = doc_count.thresh, 110 | thresh.pval = thresh.pval, 111 | thresh.pval.falling = thresh.pval.falling, 112 | max.past.freq = max.past.freq, 113 | stem = TRUE, stop.words = swords) 114 | 115 | ## 116 | ## Additional work to prepare for writing the report 117 | ## 118 | # lookup best guesses for unstemmed forms for the significant lists. 
119 | # New is not done since it is not used in the report 120 | # replace NAs by the stemmed term (which is often the answer, dunno why stemCompletion doesn't do this) 121 | stemmed<-names(rfTerms$Rising$P) 122 | rising.words<-stemCompletion(stemmed,corpus,type="shortest") 123 | rising.words[is.na(rising.words)]<-stemmed[is.na(rising.words)] 124 | stemmed<-names(rfTerms$Established$P) 125 | established.words<-stemCompletion(stemmed,corpus,type="shortest") 126 | established.words[is.na(established.words)]<-stemmed[is.na(established.words)] 127 | stemmed<-names(rfTerms$Falling$P) 128 | falling.words<-stemCompletion(stemmed,corpus,type="shortest") 129 | falling.words[is.na(falling.words)]<-stemmed[is.na(falling.words)] 130 | # find the "top n" blog posts as being those with the most rising/established terms in 131 | # 1st recover the DTM for all docs and filter down to the rising/est doc set 132 | dtm.re.bin<-weightBin(rfTerms$DTM.tf[union(Docs(rfTerms$Rising$DTM),Docs(rfTerms$Established$DTM)),]) 133 | # limit to the rising/est term set 134 | dtm.re.bin<-dtm.re.bin[,c(names(rfTerms$Rising$P), names(rfTerms$Established$P))] 135 | #eliminate any posts from blacklisted blogs 136 | filter<-!(corpus.table[Docs(dtm.re.bin),"origin"]%in%zg.blacklist) 137 | dtm.re.bin<-dtm.re.bin[filter,] 138 | ## find the "top n" 139 | term.cnt<-row_sums(dtm.re.bin) 140 | top.n.selector<-order(term.cnt, decreasing = TRUE)[1:top.n] 141 | ratings<-round(100*term.cnt[top.n.selector]/length(Terms(dtm.re.bin))) 142 | top.doc.ids<-names(term.cnt[top.n.selector]) 143 | #get the terms appearing in each top doc#and map these to the unstemmed words 144 | re.words<-c(rising.words, established.words) 145 | top.doc.words<-lapply(top.doc.ids,function(x){re.words[Terms(dtm.re.bin[x,col_sums(dtm.re.bin[x,])>0])]}) 146 | # get the results to show, eliminating any black-listed blogs 147 | hits<-corpus.table[top.doc.ids,] 148 | 149 | ## 150 | ## Produce a HTML Report 151 | ## 152 | #this palette is used in brew for color-coding scores 153 | score.pal<-c("#00FFFF","#0000FF","#800080","#FF0000","#FF8040") 154 | pal.len<-length(score.pal) 155 | # Write out the HTML Header 156 | brew.conn<-file(reportFile, open="wt") 157 | brew(file=paste(template.dir,"RisingFallingTerms BrewTemplate.html",sep="/"), output=brew.conn) 158 | close(brew.conn) 159 | 160 | dbDisconnect(db) -------------------------------------------------------------------------------- /Common Functions/CustomStopwords.R: -------------------------------------------------------------------------------- 1 | ## ***Made available using the The MIT License (MIT)*** 2 | # Copyright (c) 2012, Adam Cooper 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 5 | # 6 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 7 | # 8 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 9 | ## ************ end licence *************** 10 | 11 | require("tm", quietly=TRUE) 12 | CustomStopwords<-function(){ 13 | #+ "paper" (which is common in journal/proceedings abstracts!) 14 | SW<-c(stopwords(kind = "en"),"paper","studentspsila","conference", 15 | "january","february","march","april","may","june", 16 | "july","august","september","october","november","december", 17 | "jan","feb","mar","apr","jun","jul","aug","sept","oct","nov","dec", 18 | "pixelmaid","onomatopoeia","pizzaro","kali","ignatius","grizzla", "iggi") 19 | #- some terms (and various expansions) that are relevant to the education domain 20 | SW<-SW[-grep("group", SW)] 21 | SW<-SW[-grep("problem", SW)] 22 | SW<-SW[-grep("present", SW)] 23 | SW<-SW[-grep("work", SW)] 24 | return(SW) 25 | } -------------------------------------------------------------------------------- /Common Functions/sentimentFunctions.R: -------------------------------------------------------------------------------- 1 | ## 2 | ## Prepare lexicons from CSV dictionary lists. Assumes Harvard General Inquirer. 3 | ## @param csvFile is where to find a CSV file comprising columns from the "Basic" spreadsheet 4 | ## @param targets is a list, where each named item becomes a dictionary of the same name 5 | ## and the list elements are vectors of column names to combine 6 | ## Returns a list of dictionaries 7 | ## 8 | #e.g. targets<-list(Economic=c("Econ.","ECON"), Legal="Legal", Political=c("Polit.", "POLIT"), Social =c("Role", "SocRel")) 9 | 10 | prepareLexicons<-function(csvFile, targets){ 11 | inquirer.table<-read.csv(csvFile, header=TRUE, sep=",", quote="\"", stringsAsFactors=FALSE) 12 | sentiment.dics<-list() 13 | # for each sentiment, find out which words are relevant and for cases where there is more 14 | # than one usage (denoted #1 in the word), select only the first one as this is the most frequent in general 15 | for(i in 1:length(targets)){ 16 | dic<-inquirer.table[,"Entry"] 17 | target.cols<-targets[[i]] 18 | if(length(target.cols)>1){ 19 | merged.filter<-row_sums(inquirer.table[,target.cols]!="")>0 20 | }else{ 21 | merged.filter<-inquirer.table[,target.cols]!="" 22 | } 23 | dic<-dic[merged.filter]#limit to words for sentiment 24 | dic<-sub("#1","",dic)#remove '#1' from any words containing it 25 | dic<-dic[-grep("#",dic)]#remove all words still containing # 26 | #manually remove some terms that cause bias 27 | ii<-which(dic=="PROJECT") 28 | if(length(ii)>0){ 29 | dic<-dic[-ii] 30 | } 31 | #store the dictionary 32 | sentiment.dics[[i]]<-tolower(dic) 33 | } 34 | names(sentiment.dics)<-names(targets) 35 | return (sentiment.dics) 36 | } -------------------------------------------------------------------------------- /Compair/2011 ICCE and ICALT/Init_Compair.R: -------------------------------------------------------------------------------- 1 | ## ***Made available using the The MIT License (MIT)*** 2 | # Copyright (c) 2012, Adam Cooper 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit 
persons to whom the Software is furnished to do so, subject to the following conditions: 5 | # 6 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 7 | # 8 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 9 | ## ************ end licence *************** 10 | ## 11 | ## This is an initialiser for Compair.R; it contains run-specific setup 12 | ## it should be run before ../Compair.R 13 | ## 14 | # Irritating absolute path specification 15 | base.dir<-"/home/arc1/R Projects/Text Mining Weak Signals" 16 | my.dir<-paste(base.dir,"Compair", sep="/") 17 | source.dir<-paste(base.dir,"Source Data",sep="/") 18 | 19 | ## Run Properties - dependent on the source 20 | #the output directory. NB convention to include the year 21 | output.dir<-"/home/arc1/R Projects/Text Mining Weak Signals Output/Compair/_2011 ICALT and ICCE" 22 | dir.create(output.dir, showWarnings=FALSE) 23 | setwd(output.dir) 24 | 25 | #informative labels etc for each set 26 | title<-"Comparison: 2011 Conference Proceedings from ICALT and ICCE" 27 | name.A <- "ICALT" 28 | name.B<- "ICCE" 29 | title.A <- "IEEE International Conference on Advanced Learning Technologies" 30 | title.B<-"ICCE" 31 | url.A <- "http://www.ask4research.info/icalt/2010/" #only used if brew.type indicates conference 32 | url.B<-"http://www.nectec.or.th/icce2011/" #only used if brew.type indicates conference 33 | 34 | #specification of the source for each set 35 | #may be either a directory containing N PDFs or a CSV 36 | source.type<-"PDF"#"CSV" 37 | #subdirectories of source.dir for source.type="PDF" 38 | dir.A <- "ICALT Full 2011" 39 | dir.B<-"ICCE Full 2011" 40 | #file names in source.dir for source.type="CSV" 41 | #file.A<-"" 42 | #file.B<-"" 43 | # the intro text for the Brew output will mostly be the same but needs adaption 44 | brew.type<-"c2" 45 | 46 | # additional stopwords, e.g. to remove words appearing in boilerplate of one set and not the other 47 | extra.stopwords<-c("IEEE","International","Conference","Advanced","Learning", 48 | "Technologies","Proceedings","Computers","Education","Asia-Pacific", 49 | "Society") 50 | 51 | 52 | 53 | ## 54 | ## Run properties - statistical 55 | ## 56 | # minimum term frequency for inclusion (aggregate over both sets) 57 | # NB as a fraction of terms 58 | min.term.freq<-0.002 #0.2% 59 | #max term freq is meant to be used to eliminate terms that mess up the plots 60 | max.term.freq<-0.02 61 | #min number of docs in each set separately. 
Set=1 if sets small and use min.docs to control 62 | # set >1 for large sets to increase speed (reduces compute for term-set merging) 63 | min.docs.single<-1 64 | #min number of docs term must appear in (agg over both sets) 65 | min.docs<-4 66 | # statistical significance threshold 67 | p.max<-0.001 68 | ## 69 | ## Output control 70 | ## 71 | # edges (for gephi) will only be exported if the weight is >= the following quantile of calculated edge weights 72 | edge.quantile <- 0.9 #suggest default of 0.75 73 | 74 | -------------------------------------------------------------------------------- /Compair/BrewTemplate.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | <%=title%> 5 | 6 | 7 | 18 | 19 | 51 | 52 | 53 | 59 | 60 | 61 | 62 |

<%=title%>

63 | 64 |

This is an un-interpreted and automatically-generated report to show the variation in terms used in 65 | <%if(brew.type=="c2"){%> 66 | the full text of papers from two conferences: <%=name.A%> and <%=name.B%>. 67 | A separate subsection shows those terms that appear more frequently in papers from each conference in turn. 68 | <%}else if(brew.type=="b2"){%> 69 | two sets of blogs: <%=title.A%> and <%=title.B%> (data obtained from the TELMap Mediabase created by RWTH Aachen University). 70 | A separate subsection shows those terms that appear more frequently in blog posts from each set in turn. 71 | <%}%> 72 | Selection criteria based on statistical significance are used to determine which terms are selected; the probability that the difference in frequency might be due to pure chance must be less than <%=p.max*100%>% in addition to other criteria to select dominant terms (see "technicalities"). 73 | 74 |
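As a rough illustration of the selection criterion just described (the actual implementation is in Compair.R and is not reproduced here), a per-term Pearson chi-squared test of the kind the Compair ReadMe mentions might look like this in R; all counts below are invented:
# Illustrative only - invented counts, not the Compair.R implementation
p.max <- 0.001
term.count <- c(A = 120, B = 40)          # occurrences of one term in each document set
total.count <- c(A = 250000, B = 180000)  # total term occurrences in each set
contingency <- rbind(term.count, total.count - term.count)
p <- chisq.test(contingency)$p.value
term.selected <- p < p.max                # term passes the significance criterion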

All plots will open in a new window/tab as 1000x1000 pixel images if clicked on. The "Wordle" is 1024x768.

75 | 76 |

Overview of Selected Terms

77 |

Only middle-frequency words are considered; the comparison is between terms that are neither very common nor very rare in the aggregate of all <%=doc.type%>s being analysed.
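A minimal sketch of that middle-frequency filter, assuming a tm DocumentTermMatrix called dtm and the thresholds defined in an Init_Compair.R file (min.term.freq, max.term.freq, min.docs); this is not the Compair.R code itself:
library(tm)
library(slam)
min.term.freq <- 0.002   # terms below this fraction of all term occurrences are dropped
max.term.freq <- 0.02    # terms above this fraction are dropped
min.docs <- 4            # terms must appear in at least this many documents
term.frac <- col_sums(dtm) / sum(col_sums(dtm))
doc.count <- col_sums(weightBin(dtm))
dtm.mid <- dtm[, term.frac >= min.term.freq & term.frac <= max.term.freq & doc.count >= min.docs]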

78 |
Word cloud of compared terms
79 | Word cloud for both sets combined, before considering differences in occurrence between the two sets of <%=doc.type%>s: word sizes indicate frequency.&#13;
80 | 81 |

 

82 | 83 |

<%=title.A%>

84 |

Frequency Plot

85 |

This plot shows those terms with a statistically-significant higher frequency in <%=title.A%> <%=doc.type%>s.

86 |
<%=name.A%> Frequencies and Significance
87 | Frequency = the fraction of terms
88 | Significance = -log10 of the probability that the difference in frequency between the conferences is "pure chance" (i.e. 3 is 1 in 1,000, 4 is 1 in 10,000 etc)
89 | Docs/1000 = the number of documents in the set that contain the term per thousand (colour code and area of square)
90 | Also available as hi-res pdf.
91 | 92 |
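In other words, the significance axis is just a log-transformed p-value from the per-term test, e.g.:
# Significance as plotted above: -log10 of the p-value
-log10(c(0.001, 0.0001))   # gives 3 and 4, i.e. 1 in 1,000 and 1 in 10,000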

 

93 | 94 |

Term Co-occurrence Graph

95 |

This plot shows the extent to which pairs of the higher frequency terms occur together in the same <%=doc.type%>.

96 |
<%=name.A%> Term Co-occurrence
97 | Node size = relative significance
98 | Node colour is according to grouping&#13;
99 | Edge (connector) size = number of documents containing both connected terms.
100 | NB: only edges in the top <%=100*(1.0-edge.quantile)%>% are shown 101 |
102 | 103 | 104 |
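A sketch of how the edge weights and the quantile cut-off described above can be derived, assuming a binary DocumentTermMatrix dtm.bin already restricted to the selected terms (edge.quantile is set in Init_Compair.R); illustrative rather than the Compair.R export code:
library(tm)
edge.quantile <- 0.9
m <- as.matrix(dtm.bin)                   # documents x selected terms, 0/1
cooc <- crossprod(m)                      # term x term: number of documents containing both terms
diag(cooc) <- 0                           # ignore self co-occurrence
w <- cooc[upper.tri(cooc)]                # candidate edge weights
keep <- w >= quantile(w, edge.quantile)   # only the top (1 - edge.quantile) fraction of edges is exported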

 

105 | 106 |

<%=title.B%>

107 | 108 |

Frequency Plot

109 |

This plot shows those terms with a statistically-significant higher frequency in <%=title.B%> <%=doc.type%>s.

110 |
<%=name.B%> Frequencies and Significance
111 | Frequency = the fraction of terms
112 | Significance = -log10 of the probability that the difference in frequency between the conferences is "pure chance" (i.e. 3 is 1 in 1,000, 4 is 1 in 10,000 etc)
113 | Docs/1000 = the number of documents in the set that contain the term per thousand (colour code and area of square)
114 | Also available as hi-res pdf.
115 | 116 | 117 |

 

118 | 119 |

Term Co-occurrence Graph

120 |

This plot shows the extent to which pairs of the higher frequency terms occur together in the same <%=doc.type%>.

121 |
<%=name.B%> Term Co-occurrence
122 | Node size = relative significance
123 | Node colour is according to grouping&#13;
124 | Edge (connector) size = number of documents containing both connected terms.
125 | NB: only edges in the top <%=100*(1.0-edge.quantile)%>% are shown 126 |
127 | 128 | 129 |

 

130 | 131 | 132 |

Information

133 |

Source Code, Data and Technicalities

134 |

Source code for processing and formatting is available on GitHub.

135 |

Raw results are available in pairs, one of each kind being the data behind the two sections above. Gephi files are available separately for <%=name.A%> and <%=name.B%>. All are under the same licence terms as this report.

136 |

137 | The log file contains run parameters. 138 |

139 | 140 |

The technicalities of the method and explanatory notes on the content of the above downloads may be found on the GitHub wiki. These notes explain the term-selection criteria.

141 | 142 |

Copyright, Licence and Credits

143 |

This work was undertaken as part of the TEL-Map Project; TEL-Map is a support and coordination action within EC IST FP7 Technology Enhanced Learning.

144 | 145 | 146 | 147 | 148 | 149 |
Creative Commons LicenceThis work, its images and original text are ©2012 Adam Cooper, Institute for Educational Cybernetics, University of Bolton, UK.
Adam Cooper has licensed it under a Creative Commons Attribution 3.0 Unported License&#13;
150 | 151 | 152 | 153 | -------------------------------------------------------------------------------- /Compair/CETIS Conf 2012/Init_Compair.R: -------------------------------------------------------------------------------- 1 | ## ***Made available using the The MIT License (MIT)*** 2 | # Copyright (c) 2012, Adam Cooper 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 5 | # 6 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 7 | # 8 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 9 | ## ************ end licence *************** 10 | ## 11 | ## This is an initialiser for Compair.R; it contains run-specific setup 12 | ## it should be run before ../Compair.R 13 | ## 14 | # Irritating absolute path specification 15 | base.dir<-"/home/arc1/R Projects/Text Mining Weak Signals" 16 | my.dir<-paste(base.dir,"Compair", sep="/") 17 | source.dir<-paste(base.dir,"Source Data",sep="/") 18 | 19 | ## Run Properties - dependent on the source 20 | #the output directory. NB convention to include the year 21 | output.dir<-"/home/arc1/R Projects/Text Mining Weak Signals Output/Compair/CETIS Conf 2012" 22 | dir.create(output.dir, showWarnings=FALSE) 23 | setwd(output.dir) 24 | 25 | #informative labels etc for each set 26 | title<-"Comparison: CETIS Blogging vs EdTech Bloggers Generally (Jan 2011-Feb 2012)" 27 | name.A <- "CETIS" 28 | name.B<- "non-CETIS" 29 | title.A <- "CETIS Blogs" 30 | title.B<-"EdTech Blogs in TELMap Mediabase" 31 | url.A <- "" #only used if brew.type indicates conference 32 | url.B<-"" #only used if brew.type indicates conference 33 | 34 | #specification of the source for each set 35 | #may be either a directory containing N PDFs or a CSV 36 | source.type<-"CSV"#"PDF" 37 | #subdirectories of source.dir for source.type="PDF" 38 | #dir.A <- "ICALT Full 2011" 39 | #dir.B<-"ICCE Full 2011" 40 | #file names in source.dir for source.type="CSV" 41 | file.A<-"CETIS Blogs 20110101-20120301.csv" 42 | file.B<-"NonCETIS Blogs 20110101-20120301.csv" 43 | # the intro text for the Brew output will mostly be the same but needs adaption 44 | brew.type<-"b2" 45 | 46 | # additional stopwords, e.g. 
to remove words appearing in boilerplate of one set and not the other or otherwise distracting 47 | extra.stopwords<-c("post", "posted", "posting", "CETIS", "project", "use", "ive", "com", "this", "does") 48 | 49 | ## 50 | ## Run properties - statistical 51 | ## 52 | # minimum term frequency for inclusion (aggregate over both sets) 53 | # NB as a fraction of terms 54 | min.term.freq<-0.001 #0.1% 55 | #max term freq is meant to be used to eliminate terms that mess up the plots 56 | max.term.freq<-0.01 57 | #min number of doc1 in each set separately. Set=1 if sets small and use min.docs to control 58 | # set >1 for large sets to increase speed (reduces compute for term-set merging) 59 | min.docs.single<-3 60 | #min number of docs term must appear in (agg over both sets) 61 | min.docs<-10 62 | # statistical significance threshold 63 | p.max<-0.001 64 | ## 65 | ## Output control 66 | ## 67 | # edges (for gephi) will only be exported if the weight is >= the following quantile of calculated edge weights 68 | edge.quantile <- 0.9 #suggest default of 0.75 69 | 70 | source("/home/arc1/R Projects/Text Mining Weak Signals/Compair/Compair.R") -------------------------------------------------------------------------------- /Compair/ReadMe.txt: -------------------------------------------------------------------------------- 1 | Compare pairs of conferences for differences. 2 | 3 | Uses full text of paper and compares one set (a conference) against another. 4 | 5 | Pearson Chi^2, considering each conference to be a sample from a larger population 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /Frequent TermSet/Frequent TermSet.R: -------------------------------------------------------------------------------- 1 | ## ***Made available using the The MIT License (MIT)*** 2 | # Copyright (c) 2011, Adam Cooper 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 5 | # 6 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 7 | # 8 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 9 | ## ************ end licence *************** 10 | ## 11 | ## Perform sentence-level term extraction and association rule mining. 12 | ## 13 | 14 | library("tm") 15 | library("slam") 16 | library("arules") 17 | library("arulesViz") 18 | library("Snowball") 19 | library("openNLP") 20 | 21 | ## Run Properties - dependent on the source 22 | base.dir<-"/home/arc1/R Projects/Text Mining Weak Signals" 23 | source.dir<-paste(base.dir,"Source Data",sep="/") 24 | #the output directory. 
NB convention to include the year 25 | output.dir<-"/home/arc1/R Projects/Text Mining Weak Signals Output/Term Cluster/Union B/testing" 26 | dir.create(output.dir, showWarnings=FALSE) 27 | setwd(output.dir) 28 | title<-"Term Clusters - Conference Proceedings from ICALT, CAL, ECTEL and ICWL" 29 | abstracts.csv <- c("ICALT Abstracts 2005-2011 with metrics.csv", 30 | "ECTEL Abstracts 2006-2011 with metrics.csv", 31 | "ICWL Abstracts 2005-2011 with metrics.csv", 32 | "CAL Abstracts 2007-2009 with metrics.csv") 33 | conference.name <- c("ICALT", 34 | "ECTEL", 35 | "ICWL", 36 | "CAL") 37 | conference.title <- c("IEEE International Conference on Advanced Learning Technologies", 38 | "European Conference on Technology Enhanced Learning", 39 | "International Conference on Web-based Learning", 40 | "Computer Assisted Learning Conference") 41 | last.conference.url <- c("http://www.ask4research.info/icalt/2010/", 42 | "http://www.ectel2010.org/", 43 | "http://www.hkws.org/conference/icwl2010/", 44 | "") 45 | publisher.series <- c("IEEE", 46 | "Springer Lecture Notes in Computer Science (LNCS)", 47 | "Springer Lecture Notes in Computer Science (LNCS)", 48 | "Springer Lecture Notes in Computer Science (LNCS)", 49 | "Elsevier Computers and Education Journal") 50 | publisher.url <- c("http://ieeexplore.ieee.org/xpl/mostRecentIssue.jsp?punumber=5570018", 51 | "http://www.springerlink.com/content/978-3-642-16019-6/contents/", 52 | "http://www.springer.com/computer/general+issues/book/978-3-642-17406-3", 53 | "http://www.journals.elsevier.com/computers-and-education/") 54 | 55 | ## 56 | ## Run properties - normally the same between different sources of the same kind for comparability 57 | ## 58 | #time frame 59 | selected.year<-2010 60 | #remove very frequent and infrequent terms to be more efficient 61 | min.doc.freq<-0.001 #terms must appear in >= this frac of docs 62 | max.doc.freq<-0.8 #terms appearing in > this fraction of docs are removed 63 | #a "transaction" is defined as a sentence and items to be the non-stopword terms 64 | #Which association rules should be extracted? The LHS and RHS filters are applied separately 65 | lhs.select<-c("serious","mobile")#the LHS of the association rule must contain one of these words 66 | rhs.select<-c("game") 67 | 68 | ##future extension 69 | #this option allows for metadata to also be treated as an item. These are column names in the CSV. 70 | #the metdata is converted to pseudo-terms of the form "year=...." where "...." is a year from the CSV 71 | #add.meta<-c("origin","year") 72 | 73 | ## 74 | ## Read in the data, aggregating the sources 75 | ## 76 | table<-NULL 77 | for (src in 1:length(abstracts.csv)){ 78 | # read in CSV with format year,pages,title,authors,abstract,keywords. There is a header row. 
79 | tmp_table<-read.csv(paste(source.dir,abstracts.csv[[src]],sep="/"),header=TRUE,sep=",",quote="\"") 80 | #accumulate the table 81 | table<-rbind(table,tmp_table) 82 | tmp_table<-NULL 83 | } 84 | 85 | # bring ONLY records with the required year out before stuffing into a corpus since it is more efficient to do so 86 | table<-table[table[,"year"]==as.character(selected.year),] 87 | 88 | #Take the text content and: split into sentences, remove punctuation 89 | sentences <- unlist(sapply(table[,"abstract"],sentDetect, language="en")) 90 | 91 | # now read in the sentences to a corpus 92 | corp<-Corpus(VectorSource(sentences)) 93 | 94 | ## 95 | ## Compute 96 | ## 97 | dtm <- DocumentTermMatrix(corp, control = list(weighting = weightBin, stemming=FALSE, stopwords=TRUE, minWordLength=3, removeNumbers=TRUE, removePunctuation=TRUE)) 98 | #pruning 99 | min.doc.count<-length(Docs(dtm))*min.doc.freq 100 | dtm<-dtm[,col_sums(dtm)>=min.doc.count] 101 | max.doc.count<-length(Docs(dtm))*max.doc.freq 102 | dtm<-dtm[,col_sums(dtm)<=max.doc.count] 103 | dtm<-dtm[row_sums(dtm)>0,]#clean out useless docs 104 | print(dtm) 105 | t<-as(as.matrix(dtm),"transactions") 106 | #support is fraction of transactions containing the LHS set in the rule 107 | #confidence is the fraction of transactions containing the LHS where the RHS is true 108 | ap<-apriori(t, parameter=list(support=0.005, confidence=0.6)) 109 | print(summary(ap)) 110 | print(inspect(ap)) 111 | print("Sub-set with lift>4") 112 | ap.highlift<-subset(ap,subset=lift>2) #subset can also select rules with terms 113 | inspect(ap.highlift) 114 | 115 | # try using arulesviz 116 | plot(ap) 117 | #the closes we can get to controling colour is 118 | plot(ap, control=list(gray_range=c(0.2,1))) 119 | #sel<-plot(ap, interactive=T) #interactive mode 120 | #antecedent-consequent matrix plot (interactive) 121 | plot(ap, method="matrix", interactive=T) 122 | plot(ap, method="matrix", interactive=T, measure=c("lift","confidence")) 123 | #this is a nicer viz 124 | plot(ap, method="grouped")#can also be interactive but doesnt add muxch 125 | #graph representation only readable for small rule-sets so only do subset 126 | plot(ap.highlift, method="graph") 127 | #which can be saved using 128 | saveAsGraph(ap, file="rules.graphml") 129 | #a more easily-read version (although more complex on 1st sight), but no graphml save 130 | plot(ap.highlift, method="graph", control=list(type="items")) 131 | -------------------------------------------------------------------------------- /Frequent TermSet/ReadMe.txt: -------------------------------------------------------------------------------- 1 | Use arules and arulesViz R packages to find frequent item-set rules for terms in sentences. 
2 | 1 sentence = 1 transaction 3 | -------------------------------------------------------------------------------- /History Visualiser/CETIS Conf 2012/HV_Init.R: -------------------------------------------------------------------------------- 1 | ## ***Made available using the The MIT License (MIT)*** 2 | # Copyright (c) 2011, Adam Cooper 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 5 | # 6 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 7 | # 8 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 9 | ## ************ end licence *************** 10 | ## 11 | ## This contains the parameters for running a specific dataset against the HistoryVis.R Method 12 | ## It should be executed first 13 | ## 14 | 15 | ## Run Properties - dependent on the source 16 | base.dir<-"/home/arc1/R Projects/Text Mining Weak Signals" 17 | source.dir<-paste(base.dir,"Source Data",sep="/") 18 | set.name<-"CETIS Conf 2012 - CETIS Blogs" 19 | output.dir<-paste("/home/arc1/R Projects/Text Mining Weak Signals Output/History Visualiser",set.name,sep="/") 20 | brew.dir<-paste(base.dir,"History Visualiser",sep="/") 21 | web.page.base<-paste("http://arc12.github.com/Text-Mining-Weak-Signals-Output/History Visualiser",set.name, sep="/") 22 | brew.type<-"b"# c=conference abstracts, b=blog posts 23 | 24 | dir.create(output.dir, showWarnings=TRUE) 25 | setwd(output.dir) 26 | #these are combined into one corpus 27 | sets.csv <- "CETIS Blogs 20090101-20120301 with metrics.csv" 28 | #c("CETIS Blogs 20110101-20120301 with metrics.csv","NonCETIS Blogs 20110101-20120301 with metrics.csv") 29 | 30 | ## 31 | ## Run properties - normally the same between different sources of the same kind for comparability 32 | ## 33 | today<-as.POSIXlt(Sys.Date(), tz = "GMT") 34 | start.year<-2009 35 | start.month<-1#generally set to 1 36 | end.year<-2012 37 | end.month<-1#generally set to 1 unless total period<2 years or so NB this determines the start of the last slice 38 | # data interval control. 39 | slice.size<-4 #how many months in a time slice used in the analysis. the last slice is is from start month 1 in the end.year to the end of month "slice.size" in end.year 40 | interpolate.size<-1 #number of months between interpolated points in the output; 1 "row" is created for each interval. No interpolation if slice.size = interpolate.size 41 | 42 | 43 | ## 44 | ## Which Terms to run for 45 | ## 46 | title.common<-"CETIS Blogs - Jan 2009-Feb 2012" 47 | #the following titles should match each of the entries in the following list 48 | titles<-c("select dominant terms (c.f. 
EdTech generally)", 49 | "select rising terms (last 6 months)", 50 | "select established rising (last 6 months)", 51 | "select falling terms (last 6 months)") 52 | 53 | # NB!!!!!!!!! these are the STEMMED terms 54 | # NB2!!! if you get the stemmed term wrong then a "subscript out of bounds" error occurs later on 55 | term.lists<-list(Familiar=c("data","level","design","innov","inform", "manag","institut"), 56 | Rising=c("eassess","badg","literaci","registri","mobil","skill"), 57 | Established=c("assess","feedback","compet","staff"), 58 | Falling=c("cloud","oer","rdfs","semant","wave","repositori")) 59 | # .... and these are the pretty versions for display 60 | word.lists<-list(Familiar=c("data","level","design","innovation","information", "management","institutions"), 61 | Rising=c("e-assessment","badge","literacies","registry","mobile","skill"), 62 | Established=c("assessment","feedback","competence","staff"), 63 | Falling=c("cloud","oer","RDFS","semantic","wave","repositories")) 64 | 65 | ## 66 | ## End setup 67 | ## 68 | 69 | 70 | 71 | # in interactive execution it may be best to skip this command and to manually switch to it 72 | #source("../RF_Terms.R") 73 | -------------------------------------------------------------------------------- /History Visualiser/HV Brew Template.html: -------------------------------------------------------------------------------- 1 | 2 | <%if(isGadget){-%> 3 | 4 | 5 | 6 | 7 | 8 | 10 | 11 | 12 | <%}-%> 13 | 14 | History Visualiser: <%=title.common%> - <%=run.title%> 15 | 16 | 17 | 18 | 19 | 20 | 27 | 28 | 29 | 66 | 67 | 68 | 69 |
70 | <%if(!isGadget){-%> 71 |

<%=title.common%> - <%=run.title%>

72 | <%if(source.type=="a"){%>

(ICALT, ECTEL, CAL, ICHL and ICWL)

<%}%> 73 | <%}-%> 74 | 75 |
76 | <%if(isGadget){%> 77 |
78 | <%if(isGroup){%> 79 | Each plotted value is a group of terms. 80 | <%}%> 81 |
82 | <%}else{%> 83 |
84 | <%if(isGroup){%> 85 |

Group details:

86 | <%for(i in 1:length(word.lists)){ 87 | word.list<-word.lists[[i]]%> 88 |

<%=titles[i]%>
89 | (<%=paste(unlist(word.list),collapse=", ")%>)

90 | <%}%> 91 | <%}else{ 92 | if(do.groups){%> 93 |

View terms as a group.

94 | <%}%> 95 |

Other sets:

96 | <%for (other.set in 1:length(file.names)){ 97 | if(other.set != i.run){%> 98 |

<%=titles[other.set]%>

99 | <%} 100 | } 101 | }%> 102 |
103 | <%}%> 104 | 105 | 106 |
107 | <%if(isGadget){-%> 108 |

Larger version with more options and more info...

109 | <%}else{-%> 110 | Add to Google 111 |

112 | This shows the history of the way certain terms have been used (related words are counted also, e.g. &quot;gaming&quot; would be counted as &quot;game&quot;). The chart loads up showing the trend in word-frequency for a term (or group of terms) with the line coloured to show the average &quot;positive sentiment&quot; of <%if(source.type=="a"){%>abstracts<%}else{%>blog posts<%}%> containing the term. &quot;Positive sentiment&quot; is calculated as the fraction of words that generally have a positive connotation according to the Harvard General Inquirer dictionary. For both sentiment and frequency calculations, common words such as &quot;and&quot; are ignored. The attribute &quot;Documents&quot; is an estimate of the number of <%if(source.type=="a"){%>abstracts per year<%}else{%>blog posts per month<%}%> that contain the specified term(s). Calculations were conducted in <%=slice.size-%> month slices and the number of <%if(source.type=="a"){%>abstracts<%}else{%>blog posts<%}%> per slice is (in date order): <%=paste(slice.docs.cnt, collapse=", ")%>. True calculated values are plotted for the mid-point of each time slice to show the aggregate value over that <%=slice.size-%> month period and spline interpolation is used to give the impression of a smooth change between these points. In total, <%=docs.used-%> <%if(source.type=="a"){%>abstracts<%}else{%>blog posts<%}%> were used. 113 |&#13;
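A minimal sketch of the slice-and-interpolate step described above, using invented slice frequencies and the slice.size/interpolate.size settings from an HV_Init.R file; the full logic is in HistoryVis.R:
slice.size <- 12        # months per analysis slice
interpolate.size <- 3   # months between interpolated output points
slice.mid <- seq(slice.size / 2, by = slice.size, length.out = 5)  # slice mid-points, in months
slice.freq <- c(0.0012, 0.0015, 0.0011, 0.0018, 0.0022)            # invented term frequencies per slice
out.months <- seq(min(slice.mid), max(slice.mid), by = interpolate.size)
smoothed <- spline(slice.mid, slice.freq, xout = out.months)       # values plotted between the true points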

114 | 115 |

NB: be aware that insignificant fluctuations in the data may appear magnified since the plots are shown &quot;zoomed in&quot; (especially true for sentiment scores, or &quot;subjectivity&quot;, which is the sum of positive and negative sentiment). Also note that sentiment values are essentially meaningless when the frequency is close to the baseline.&#13;
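For clarity, the "subjectivity" referred to here is simply the two sentiment fractions added together (invented numbers):
pos_score <- 0.06                    # fraction of words with a positive connotation (invented)
neg_score <- 0.02                    # fraction of words with a negative connotation (invented)
subj_score <- pos_score + neg_score  # subjectivity = 0.08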

116 | <%}-%> 117 |
118 | 119 | <%if(isGadget){-%> 120 | ]]> 121 | 122 | 123 | <%}else{-%> 124 | 125 | <%}%> 126 | -------------------------------------------------------------------------------- /History Visualiser/ReadMe.txt: -------------------------------------------------------------------------------- 1 | Creates HTMLand JS to visualise various aspects of term occurrence and metrics using the Google motionchart. See the comments in the R files 2 | -------------------------------------------------------------------------------- /History Visualiser/Union B/HV_Init.R: -------------------------------------------------------------------------------- 1 | ## ***Made available using the The MIT License (MIT)*** 2 | # Copyright (c) 2011, Adam Cooper 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 5 | # 6 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 7 | # 8 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 9 | ## ************ end licence *************** 10 | ## 11 | ## This contains the parameters for running a specific dataset against the HistoryVis.R Method 12 | ## It should be executed first 13 | ## 14 | 15 | ## Run Properties - dependent on the source 16 | base.dir<-"/home/arc1/R Projects/Text Mining Weak Signals" 17 | source.dir<-paste(base.dir,"Source Data",sep="/") 18 | set.name<-"Union B 2005 to 2010_" 19 | output.dir<-paste("/home/arc1/R Projects/Text Mining Weak Signals Output/History Visualiser",set.name,sep="/") 20 | brew.dir<-paste(base.dir,"History Visualiser",sep="/") 21 | web.page.base<-paste("http://arc12.github.com/Text-Mining-Weak-Signals-Output/History Visualiser",set.name, sep="/") 22 | brew.type<-"c"# c=conference abstracts, b=blog posts 23 | 24 | dir.create(output.dir, showWarnings=TRUE) 25 | setwd(output.dir) 26 | #these are combined into one corpus 27 | sets.csv <- c("ICALT Abstracts 2005-2011 with metrics.csv", 28 | "ECTEL Abstracts 2006-2011 with metrics.csv", 29 | "ICWL Abstracts 2005-2011 with metrics.csv", 30 | "CAL Abstracts 2007-2009 with metrics.csv") 31 | 32 | ## 33 | ## Run properties - normally the same between different sources of the same kind for comparability 34 | ## 35 | today<-as.POSIXlt(Sys.Date(), tz = "GMT") 36 | start.year<-2005 37 | start.month<-1 #default = 1 38 | end.year<-2010 39 | end.month<-1 #default = 1. NB this determines the start of the last slice 40 | # data interval control. 41 | slice.size<-12 #how many months in a time slice used in the analysis. the last slice is is from start month 1 in the end.year to the end of month "slice.size" in end.year 42 | interpolate.size<-3 #number of months between interpolated points in the output; 1 "row" is created for each interval. 
No interpolation if slice.size = interpolate.size 43 | 44 | ## 45 | ## Which Terms to run for 46 | ## 47 | title.common<-"Conference Proceedings from ICALT, ECTEL and ICWL" #CAL 48 | titles<-c("terms that dipped in 2010 compared to the previous 4 years", 49 | "terms that rose in 2010 and where established in the previous 4 years", 50 | "terms that rose in 2010 from a low level in the previous 4 years") 51 | #"Run the Second")#should match each of the entries in the following list 52 | # NB!!!!!!!!! these are the STEMMED terms 53 | term.lists<-list(Falling=c("blog","content","databas","ontolog","project","technolog"), Established=c("activ","condit","conduct","differ","emot","game","gamebas","motiv","path","profil","strategi","tutor","video"), 54 | Rising=c("besid","competit","eassess","figur","gameplay","gender","hybrid","negat","oral","ples","probabilist","public","qti","risk","selfreflect","serious","statement","tablet","tangibl","uptak","wil")) 55 | # .... and these are the pretty versions for display 56 | word.lists<-list(Falling=c("blog","content","database","ontology","project","technology"), Established=c("active","condition","conduct","different","emotion","game","game-based","motivator","path","profile","strategies","tutor","video"), 57 | Rising=c("besides","competitive","e-assessment","figure","game-play","gender","hybrid","negative","oral","PLEs","probabilistic","public","qti","risk","self-reflect","serious","statements","tablet","tangible","uptake","will")) 58 | 59 | ## 60 | ## End setup 61 | ## 62 | 63 | 64 | 65 | # in interactive execution it may be best to skip this command and to manually switch to it 66 | #source("../RF_Terms.R") 67 | -------------------------------------------------------------------------------- /History Visualiser/Union C/HV_Init.R: -------------------------------------------------------------------------------- 1 | ## ***Made available using the The MIT License (MIT)*** 2 | # Copyright (c) 2012, Adam Cooper 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 5 | # 6 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 7 | # 8 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
9 | ## ************ end licence *************** 10 | ## 11 | ## This contains the parameters for running a specific dataset against the HistoryVis.R Method 12 | ## It should be executed first 13 | ## 14 | 15 | ## Run Properties - dependent on the source 16 | base.dir<-"/home/arc1/R Projects/Text Mining Weak Signals" 17 | source.dir<-paste(base.dir,"Source Data",sep="/") 18 | set.name<-"Union C 2006 to 2012" 19 | output.dir<-paste("/home/arc1/R Projects/Text Mining Weak Signals Output/History Visualiser",set.name,sep="/") 20 | brew.dir<-paste(base.dir,"History Visualiser",sep="/") 21 | web.page.base<-paste("http://arc12.github.com/Text-Mining-Weak-Signals-Output/History Visualiser",set.name, sep="/") 22 | brew.type<-"c"# c=conference abstracts, b=blog posts 23 | 24 | dir.create(output.dir, showWarnings=TRUE) 25 | setwd(output.dir) 26 | 27 | ## SOURCE DATA SELECTION 28 | # Either a) [DEPRECATED] list csv files "with metrics" as produced by Pre-process.R These are combined into one corpus or 29 | # b) Locate a SQLite Database and define a query to extract 30 | #sets.csv <- c("ICALT Abstracts 2005-2011 with metrics.csv", 31 | #"ECTEL Abstracts 2006-2011 with metrics.csv", 32 | #"ICWL Abstracts 2005-2011 with metrics.csv", 33 | #"ICHL Abstracts 2008-2011 with metrics.csv", 34 | # "CAL Abstracts 2007-2011 with metrics.csv") 35 | set.csv<-NA 36 | sqlite.filename<-"TMWS Data A.sqlite" #set this to NA to use [deprecated] option a 37 | sql<-"SELECT origin, year, pages, title, authors, abstract, keywords, url, dblp_url, pos_score, neg_score, subj_score FROM abstract WHERE year >= '2006'"#BETWEEN '2006' AND '2011'" 38 | use.sqlite<-!is.na(sqlite.filename) 39 | 40 | ## 41 | ## Run properties - normally the same between different sources of the same kind for comparability 42 | ## 43 | today<-as.POSIXlt(Sys.Date(), tz = "GMT") 44 | start.year<-2006 45 | start.month<-1 #default = 1 46 | end.year<-2012 47 | end.month<-1 #default = 1. NB this determines the start of the last slice 48 | # data interval control. 49 | slice.size<-12 #how many months in a time slice used in the analysis. 50 | interpolate.size<-3 #number of months between interpolated points in the output; 1 "row" is created for each interval. No interpolation if slice.size = interpolate.size 51 | 52 | ## 53 | ## Which Terms to run for 54 | ## 55 | title.common<-"Conference Proceedings from ICALT, ECTEL, CAL, ICHL and ICWL" 56 | # NB!!!!!!!!! these are the STEMMED terms 57 | term.lists<-list(Cloud=c('cloud','virtualis','virtual','saa','paa'), 58 | eBooks=c('ebook', 'etextbook'), 59 | Analytics=c('analyt','data'), 60 | Gesture=c('gesturebas','gestur'), 61 | Context.Sensitive.Services=c('context','contextsensit','contextawar','contextenrich','locat','locationbas','locationawar','geospati'), 62 | Games=c('game', "gamif","gamebas", "gameplay"), 63 | Mobile=c('tablet','smartphon','mobil','ubiquit','pervas'), 64 | Learning.Platforms=c('lms','vle','lcms','eportfolio')) 65 | # .... 
and these are the pretty versions for display 66 | word.lists<-list(Cloud=c('Cloud','Virtualisation','Virtual','SaaS','PaaS'), 67 | eBooks=c('eBook', 'eTextbook'), 68 | Analytics=c('Analytics','Data'), 69 | Gesture=c('Gesture-based','Gesture'), 70 | Context.Sensitive.Services=c('Context','Context-sensitive','Context-aware','Context-enriched','Location','Location-based','Location-aware','Geospatial'), 71 | Games=c('Game', "Gamification","Game-based", "Game-play"), 72 | Mobile=c('Tablet','Smartphone','Mobile','Ubiquitous','Pervasive'), 73 | Learning.Platforms=c('LMS','VLE','LCMS','E-Portfolio','Platform')) 74 | 75 | titles<-gsub("\\."," ",names(term.lists))#lazy way to titles is to replace "." in the list element names - override if necessary 76 | 77 | # The folloowing indicates which of the lists above is treated as a group. 78 | # NB: the strings in go.groups must exactly match the list item names in term.lists (and "." characters are replaced with " " in output) 79 | do.groups=c("Cloud","eBooks","Analytics","Gesture","Context.Sensitive.Services","Games","Mobile","Learning.Platforms") 80 | 81 | # in interactive execution it may be best to skip this command and to manually switch to it 82 | #source(paste(base.dir,"History Visualiser/HistoryVis.R", sep="/")) 83 | -------------------------------------------------------------------------------- /PESTLE Scan/BrewChunk.html: -------------------------------------------------------------------------------- 1 | 2 |

<%=names(sentiment.dics)[[lex]]%>

3 |

Summary statistics of scores after eliminating zero-scoring documents: 4 |

    5 |
  • Mean = <%=format(summary(lex.score.nz)["Mean"],digits=3)%>
  • 6 |
  • Median = <%=format(summary(lex.score.nz)["Median"],digits=3)%>
  • 7 |
  • Third Quartile = <%=format(summary(lex.score.nz)["3rd Qu."],digits=3)%>
  • 8 |
  • Maximum = <%=format(summary(lex.score.nz)["Max."],digits=3)%>
  • 9 |
10 | For comparison between different lexicons, rescale according to the lexicon size. For <%=names(sentiment.dics)[[lex]]%>, there are <%=length(sentiment.dics[[lex]])%> words. 11 |
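A small worked example of that rescaling (the same scaling appears in PESTLE Scan.R); the scores and lexicon size here are invented:
lex.score.nz <- c(0.031, 0.017, 0.054)          # raw scores: fraction of document words found in the lexicon
lexicon.length <- 598                           # number of words in this lexicon (invented)
scaled <- lex.score.nz * 1000 / lexicon.length  # comparable across lexicons of different sizes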

12 | 13 | 14 | <%for(kk in topDocs){%> 15 | 16 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 31 | <%}%> 32 |
17 | <%=table[kk,"authors"]%>
18 | "><%=table[kk,"title"]%> 19 |
score
<%=format(lex.score[kk,1],digits=3)%>
<%=strtrim(table[kk,"content"], 500)%>...
27 | 28 | Contains terms: <%=paste(Terms(dtm.tf.unstemmed.lex[kk,col_sums(dtm.tf.unstemmed.lex[kk,])>0]), collapse=", ")%> 29 | 30 |
33 | -------------------------------------------------------------------------------- /PESTLE Scan/BrewFooter.html: -------------------------------------------------------------------------------- 1 |

Copyright, Licence and Credits

2 |

This work was undertaken as part of the TEL-Map Project; TEL-Map is a support and coordination action within EC IST FP7 Technology Enhanced Learning.

3 | 4 | 5 | 6 | 7 | 8 | 10 |
Creative Commons LicenceThis work, its images and original text are ©2012 Adam Cooper, Institute for Educational Cybernetics, University of Bolton, UK.
Adam Cooper has licensed it under a Creative Commons Attribution 3.0 Unported License&#13;
Referenced blogs and quoted sections are © the relevant authors. The lexicons used are from the Harvard Inquirer spreadsheet 9 |
11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /PESTLE Scan/BrewHeader.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | PESTLE Analysis: <%=inFileStem%> 5 | 6 | 7 | 18 | 19 | 51 | 52 | 53 | 59 | 60 | 61 | 62 |

PESTLE Analysis: <%=inFileStem%>

63 | 64 |

<%=n.docs%> documents were read in.

65 | 66 | <%if(prefilter.thresh.1>0){%> 67 |

Filtering to only use documents containing at least <%=prefilter.thresh.1%> of the following words:
68 | <%=paste(prefilter.words.1, collapse=", ")%> 69 | <%if(prefilter.thresh.2>0){%> 70 | as well as at least <%=prefilter.thresh.2%> of these words:&#13;
71 | <%=paste(prefilter.words.2,collapse=", ")%> 72 | <%}%> 73 |

74 | 75 |

<%=n.docs.f%> documents remain after filtering&#13;

76 | <%}%> 77 | 78 |

Documents are scored according to the occurrence of words in the following lexicons derived from the categories of the Harvard Inquirer: Economic combines "Econ@" and "ECON", Legal uses "Legal", Political combines "Polit@" and "POLIT" and Social combines "Role" and "SocRel". The scores given are the fraction of words in each document that are found in the relevant lexicon.
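A condensed sketch of that scoring, assuming a tm Corpus corp and a character vector lexicon of lexicon words; the full version (with pre-filtering and the handling of very short documents) is in PESTLE Scan.R:
library(tm)
library(slam)
dtm.all <- DocumentTermMatrix(corp, control = list(stopwords = TRUE, removeNumbers = TRUE))
dtm.lex <- DocumentTermMatrix(corp, control = list(stopwords = TRUE, removeNumbers = TRUE,
                                                   dictionary = tolower(lexicon)))
lex.score <- row_sums(dtm.lex) / row_sums(dtm.all)   # fraction of each document's words found in the lexicon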

79 | 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /PESTLE Scan/PESTLE Scan.R: -------------------------------------------------------------------------------- 1 | ## ***Made available using the The MIT License (MIT)*** 2 | # Copyright (c) 2012, Adam Cooper 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 5 | # 6 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 7 | # 8 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 9 | ## ************ end licence *************** 10 | 11 | ## 12 | ## Based on pre-process.R - a quick and dirty scan for blogs/abstracts with a strong 13 | ## "sense" of PESTLE (actually PES.L.). according to scoring using the Harvard General Inquirer 14 | ## 15 | ## NB there is some munging of the Inquirer basic list to merge categories 16 | ## 17 | ## This handles conference abstracts and blog posts in slightly different ways 18 | ## abstracts: each CSV is assumed to be a set of abstracts from a 19 | ## single conference series (aka "origin"). The origin is 20 | ## added as part of the pre=processing 21 | ## blogs: each CSV is assumed to contain posts from several blogs 22 | ## and the "origin" is in the CSV as the blog "home" URL 23 | ## 24 | ## (the column headings also differ, sometimes only for "historical reasons") 25 | ## 26 | library("tm") 27 | library("slam") 28 | library("brew") 29 | library("RColorBrewer") 30 | 31 | # load some functions, effectively "macros". 
Some day make a proper package for the whole show 32 | source("/home/arc1/R Projects/Text Mining Weak Signals/commonFunctions.R") 33 | source("/home/arc1/R Projects/Text Mining Weak Signals/sentimentFunctions.R") 34 | 35 | home.dir<-"/home/arc1/R Projects/Text Mining Weak Signals/PESTLE Scan" 36 | data.dir<-"/home/arc1/R Projects/Text Mining Weak Signals/Source Data" 37 | output.dir<-"/home/arc1/R Projects/Text Mining Weak Signals Output/PESTLE Scan" 38 | 39 | 40 | #each one of these will be looped over NB the origin.tag must be in the same order as set.csv 41 | # set.csv <- c("ICALT Abstracts 2005-2011.csv", 42 | # "CAL Abstracts 2007-2009.csv", 43 | # "ECTEL Abstracts 2006-2011.csv", 44 | # "ICWL Abstracts 2005-2011.csv") 45 | # origin.tag <- c("ICALT", 46 | # "CAL", 47 | # "ECTEL", 48 | # "ICWL")#only used for abstracts 49 | #set.csv <- c("CETIS Blogs 20110101-20120301.csv","CETIS Blogs 20090101-20120301.csv","NonCETIS Blogs 20110101-20120301.csv") 50 | set.csv <- c("MB/MB Blogs 20100101-20120630.csv") 51 | # this determines the source type: conference abstracts or blog content 52 | source.type="b"#a is for abstracts, b is for blogs 53 | 54 | ## 55 | ## Sometimes it is useful to pre-filter to use only documents containing set words 56 | ## 57 | prefilter.name<-"-"#used to name the output file. 58 | #at least prefilter.thresh.1 of the the prefilter.words.1 must appear for inclusion. Use 0 to skip filtering 59 | #in addition, if prefilter.thresh.2>0 members of prefilter.words.2 MUST ALSO be present 60 | prefilter.thresh.1<-0 61 | prefilter.words.1<-c("open", "educational", "resource", "resources", "oer", "content") 62 | #prefilter.words.1<-c("school","schools", "schooling","pupil","pupils","highschool","high-school","class","classroom","teacher","child","children","parent","parents","child's","parent's","teachers","teacher's","junior","infant","nursery") 63 | prefilter.thresh.2<-0 64 | #prefilter.words.2<-c("Europe","Ruropean","UK","Britain","Germany","France","British","German","French","Spain","Spanish","Italy","Italian","Norway","Norwegian","Swedish","Sweden","Finland","Finnish","Baltic") 65 | prefilter.words.2<-c("assessment","assess","test","assessing","testing") 66 | if(prefilter.thresh.1<1){ 67 | prefilter.name<-"" 68 | prefilter.thresh.1<-0 69 | prefilter.thresh.2<-0 70 | } 71 | #force lower case to match doc-term matrix 72 | prefilter.words.1<-tolower(prefilter.words.1) 73 | prefilter.words.2<-tolower(prefilter.words.2) 74 | 75 | ## 76 | ## Prepare lexicon 77 | ## 78 | # Read in the sentiment word lists (cols extracted from the Harvard Inquirer spreadsheet http://www.wjh.harvard.edu/~inquirer/) 79 | #The column headings MUST be unchanged (but NB data.frame colnames do not allow "@": 80 | # Entry Econ@ ECON Legal Polit@ POLIT Role SocRel 81 | targets<-list(Economic=c("Econ.","ECON"), Legal="Legal", Political=c("Polit.", "POLIT"), Doing=c("Need","Goal","Try","Means","Persist","Complet","Fail"), Knowing=c("Know","Solve")) 82 | sentiment.dics<-prepareLexicons(paste(home.dir,"InquirerPESTLE2.csv",sep="/"), targets) 83 | 84 | ## 85 | ## MAIN LOOP over the sets: Read-in, add columns of metrics and write-out 86 | ## 87 | for (src in 1:length(set.csv)){ 88 | inFile<-paste(data.dir,set.csv[src],sep="/") 89 | print(paste("******************** Processing:",set.csv[src],"********************")) 90 | inFileStem<-strtrim(set.csv[src],nchar(set.csv[src])-4) 91 | outFile<-paste(output.dir,paste(inFileStem, prefilter.name,"PESTLE.csv",sep=" "),sep="/") 92 | 
brewFile<-paste(output.dir,paste(inFileStem, prefilter.name,"PESTLE.html",sep=" "),sep="/") 93 | # read in CSV with format year,pages,title,authors,abstract,keywords,url,dblp_url. 94 | #There is a header row. DBLP_URL is the vital key into the author centrality data 95 | table<-read.csv(inFile,header=TRUE,sep=",",quote="\"",stringsAsFactors=FALSE) 96 | # choose an appropriate mapping and other source-specific preliminaries 97 | #"Keywords" and after are user-defined "localmetadata" properties while the rest are standard tm package document metadata fields 98 | if(source.type == "a"){ 99 | #insert the "origin" as a new column 100 | origin<-rep(origin.tag[src], length(table[,1])) 101 | table<-cbind(origin,table) 102 | map<-list(Content="abstract", Heading="title", Author="authors", DateTimeStamp="year", Origin="origin", Keywords="keywords", URL="url", DBLP_URL="dblp_url") 103 | }else if(source.type == "b"){ 104 | map<-list(Content="content", Heading="title", Author="authors", DateTimeStamp="datestamp", Origin="origin",URL="url") 105 | }else{ 106 | stop("Unknown source type:",source.type) 107 | } 108 | n.docs<-length(table[,1]) 109 | print(paste(n.docs,"documents read in")) 110 | # create a corpus, handling the metadata via mapping from datatable column names to PlainTextDocument attribute names 111 | corp<-Corpus(DataframeSource(table), readerControl=list(reader= readTabular(mapping=map))) 112 | ## 113 | ## Lexical Analysis at a document level. 114 | ## 115 | # Use the Harvard Inquirer word lists to score sets of responses against several sentiments. 116 | # NB this is an UNSTEMMED treatment 117 | # the baseline is all terms used 118 | stop.words<-c(CustomStopwords(),"social") 119 | dtm.tf.unstemmed.all<-DocumentTermMatrix(corp, 120 | control=list(stemming=FALSE, stopwords=stop.words, minWordLength=3, removeNumbers=TRUE, removePunctuation=FALSE)) 121 | #do pre-filtering if required 122 | if(prefilter.thresh.1>0){ 123 | print(paste("Filtering to only use documents containing at least",prefilter.thresh.1, 124 | " of the following words:", paste(prefilter.words.1, collapse=", "))) 125 | #ensure the list only contains words that are present (otherwise get an "out of bounds" error later) 126 | prefilter.words.1<-prefilter.words.1[prefilter.words.1 %in% Terms(dtm.tf.unstemmed.all)] 127 | dtm.bin.pfw1<-weightBin(dtm.tf.unstemmed.all[,prefilter.words.1]) 128 | if(prefilter.thresh.2>0){ 129 | print(paste("as well as at least",prefilter.thresh.2,"of these words:", 130 | paste(prefilter.words.2,collapse=", "))) 131 | prefilter.words.2<-prefilter.words.2[prefilter.words.2 %in% Terms(dtm.tf.unstemmed.all)] 132 | dtm.bin.pfw2<-weightBin(dtm.tf.unstemmed.all[,prefilter.words.2]) 133 | filter.docs<-(row_sums(dtm.bin.pfw1)>=prefilter.thresh.1) & 134 | (row_sums(dtm.bin.pfw2)>=prefilter.thresh.2) 135 | }else{ 136 | filter.docs<-row_sums(dtm.bin.pfw1)>=prefilter.thresh.1 137 | } 138 | dtm.tf.unstemmed.all<-dtm.tf.unstemmed.all[filter.docs,] 139 | table<-table[filter.docs,] #required for output csv 140 | n.docs.f<-sum(filter.docs) 141 | print(paste(n.docs.f,"documents fulfil criteria")) 142 | } 143 | doc.term.sums<-row_sums(dtm.tf.unstemmed.all) 144 | 145 | #Create a separate report for each input data file. 
146 | #This is done in 3 parts: header, a repeated central chunk per lexicon, a footer 147 | brew.conn<-file(brewFile, open="wt") 148 | brew(file=paste(home.dir,"BrewHeader.html",sep="/"), output=brew.conn) 149 | #this palette is used in brew for color-coding scores 150 | score.pal<-brewer.pal(11,"RdBu")[11:1]#reverse it to make red highest scoring 151 | 152 | 153 | 154 | #Loop over the "sentiment dics", each column is a potentially-merged Gen Inquirer category 155 | for(lex in 1:length(sentiment.dics)){ 156 | dtm.tf.unstemmed.lex<-DocumentTermMatrix(corp, 157 | control=list(stemming=FALSE, stopwords=stop.words, minWordLength=3, removeNumbers=TRUE, removePunctuation=FALSE,dictionary=tolower(sentiment.dics[[lex]]))) 158 | #do pre-filtering if required 159 | if(prefilter.thresh.1>0){ 160 | dtm.tf.unstemmed.lex<-dtm.tf.unstemmed.lex[filter.docs,] 161 | } 162 | lex.score<-row_sums(dtm.tf.unstemmed.lex)/doc.term.sums 163 | #force any v. short docs to have scores = 0.0 164 | lex.score[doc.term.sums<40]<-0.0 165 | lex.score<-as.data.frame(lex.score) 166 | colnames(lex.score)<-names(sentiment.dics)[[lex]] 167 | # add to the data.table 168 | table<-cbind(table,lex.score) 169 | print(paste("====","Processed lexicon for", names(sentiment.dics)[[lex]],"====")) 170 | print("Summary stats and histograms AFTER removing score=0 documents") 171 | lex.score.nz<-lex.score[lex.score[,1]>0.0,1] 172 | summary.nz<- summary(lex.score.nz) 173 | print(summary.nz) 174 | print(paste("Summary stats after scaling by 1000/lexicon length (length=",length(sentiment.dics[[lex]]),")")) 175 | print(summary(lex.score.nz*1000/length(sentiment.dics[[lex]]))) 176 | hist(lex.score.nz, main=names(sentiment.dics)[[lex]], breaks=20) 177 | print("Report Top 20 using Brew") 178 | topDocs<-order(lex.score, decreasing=T)[1:20] 179 | #write out the highest scoring documents to the formatted report 180 | brew(file=paste(home.dir,"BrewChunk.html",sep="/"), output=brew.conn) 181 | } 182 | # tidy up 183 | # rm(dtm.tf.unstemmed.all) 184 | # rm(dtm.tf.unstemmed.lex) 185 | # rm(lex.score) 186 | #write out the full PESTLE-scored data and complete the formatted report 187 | write.csv(table, outFile, quote=TRUE, row.names=FALSE) 188 | brew(file=paste(home.dir,"BrewFooter.html",sep="/"), output=brew.conn) 189 | close(brew.conn) 190 | 191 | } 192 | -------------------------------------------------------------------------------- /Proximate Word Scan/ProximateWordScan.R: -------------------------------------------------------------------------------- 1 | ## 2 | ## Score documents according to the occurrence of N words with 3 | ## variable penalty for gaps, and options for permuation and synonum use 4 | ## (also with variable penalty) 5 | ## 6 | 7 | library("stringkernels")#also loads kernlab and openNLP 8 | #NB: this needs e.g. 
openNLPmodels.en package installed too 9 | library("combinat") 10 | library("wordnet") 11 | library("tm") 12 | #needed to allow for OpenNLP to have a large heap space 13 | options(java.parameters="-Xmx2g") 14 | 15 | ##====================== 16 | ## LOAD IN THE DOCUMENTS 17 | ##====================== 18 | #Source type and corpus control 19 | source.type<-"corpus.RData" 20 | RData.source<-"/home/arc1/R Projects/Text Mining Weak Signals Output/Rising and Falling Terms/Union B/2010/Corpus.RData" 21 | #only use documents with a DateStamp in the following range 22 | start.date=as.POSIXlt("2010-01-01") 23 | last.date=as.POSIXlt("2010-12-31") 24 | # using a tm package corpus is not strictly necessary but it allows cross-over to other text mining code, allows for metadata etc... 25 | # for now, a dummy corpus.... 26 | if(source.type=="corpus.RData"){ 27 | load(RData.source) 28 | dts<-as.POSIXlt(unlist(meta(corp,"DateTimeStamp",type="local")), origin="1970-01-01") 29 | filter.bool<- (dts>start.date) & (dts<=last.date) 30 | corp<-corp[filter.bool] 31 | print(paste("Filtering in date range",start.date,"to",last.date, "=>")) 32 | print(corp) 33 | }else{ 34 | #a dummy tesr case is fall-back 35 | corp<-Corpus(VectorSource(c("The Cat sat on the mat","The Cat sat on the mat and figged about with a mouse","The cat sat on the carpet","The cat sat. On the mat was a dog", "On the mat there was a cat","On the mat a cat was sat","On the mat was a picture of a cat","A matte cat sat on the grass"))) 36 | print("Using test case") 37 | } 38 | 39 | ##================ 40 | ## RUN PARAMETERS 41 | ##================ 42 | # The query is considered ordered by default. 43 | query<-"learning design" 44 | #this is a normal calculation, normally TRUE but maybe useful to set to F when using do.pairs and a longer query to investigate proximity of a cluster of terms. 45 | #NB: all words in the query must appear in a document for a score of >0 to be calculated 46 | do.fullQuery<-T 47 | 48 | # option to select pairs of distinct words from the query 49 | # and re-run each is if it had been the query, so building up a proximity matrix 50 | do.pairs<-F 51 | 52 | #gap penalty parameter = lambda for "gapweightkernel" in package stringkernels 53 | #a match is scored as lambda^number_of_gaps, hence a value of 1 does NOT penalise gaps 54 | gap.lambda<-0.75 55 | 56 | #when using synonyms or permutations, documents are selected based on the highest score of any variant 57 | 58 | #option to permute the words in the query. Take care if the query is >3 words; the number of perms explodes! 59 | #NB: when do.pairs=T and the pair proximity is calculated, an equal-weighted perm is automatically done 60 | #(i.e. the pair is considered un-ordered). This does not alter the way the normal calc is performed. 61 | do.permute<-F 62 | #factor to apply to gapweightkernel scores when the permutation is not the original query 63 | #1 gives all permutations equal weight. 0 only counts the original query 64 | perm.factor<-0.75 65 | 66 | #option to use synonyms. 67 | # NB1: only 1 word is synonymised at a time, 68 | # NB2: synonyms applied in addition to permutation if do.permute==T 69 | do.synonyms<-F 70 | #factor to apply to gapweightkernel scores for each synonym substitution 71 | #1 applies no penalty 72 | synonym.factor<-0.8 73 | # if do-synonyms==T then the following should match the length (in words) of "query" and the word order in it. 74 | #NA or any control word not specified below means no synonyms for the corresponding word will be used. 
(no warning given) 75 | #the permitted values are 76 | # "MANUAL" - means the synonyms are manually specified in the named list, "synonym.manual" 77 | # "NOUN","VERB","ADJECTIVE","ADVERB" - WordNet synonyms are found of the specified part of speech 78 | # "" - means no synonyms are to be used for the corresponding word 79 | # !! use WordNet with care. preferably check what it will give you before using. also NB: "sat" is not considered a VERB 80 | synonym.control<-c(NA,"","MANUAL") 81 | synonym.manual<-list(recommender=c("recommend")) 82 | 83 | #show documents with scores >= this quantile. NB only docs with non-zero score are used to determine the quantile. 84 | score.quantile<-0.8 85 | 86 | #set to true for verbose information 87 | debug=FALSE 88 | 89 | ##===================== 90 | ##PREP SYNONYMS 91 | ##===================== 92 | syn.list<-list() 93 | if(do.synonyms){ 94 | gotWordNet <- initDict() 95 | #loop over all synonym control, building up a list where each named entry 96 | # is a vector of syn-terms of length >=0 97 | for(i in 1:length(synonym.control)){ 98 | sc1<-synonym.control[i] 99 | if(is.na(sc1)){ 100 | #skip 101 | }else if(sc1=="MANUAL"){ 102 | syn.list=c(syn.list,synonym.manual) 103 | }else if(sc1=="NOUN" || sc1=="VERB" || sc1=="ADVERB" || sc1=="ADJECTIVE"){ 104 | if(!gotWordNet) stop(paste("Attempt to use synonym control",sc1,"but cannot find WordNet")) 105 | word<-tolower(q.split[i]) 106 | word.syns<-list(synonyms(word,sc1)) 107 | names(word.syns)<-word 108 | syn.list=c(syn.list,word.syns) 109 | } 110 | } 111 | print("Enumerations of Synonyms for Each Query Term") 112 | if(length(syn.list)>0){ 113 | print(syn.list) 114 | }else{ 115 | print("No synonyms specified or found in spite of do.synonyms==TRUE") 116 | } 117 | } 118 | 119 | 120 | 121 | 122 | 123 | ##================== 124 | ## WORKER FUNCTION 125 | ##================== 126 | proximateWorker<-function(work.query, print.results=FALSE){ 127 | #allowing for work.query to be a subquery in future code 128 | #separate the words - used for length and possibly permutations too 129 | q.split<-strsplit(work.query," ")[[1]] 130 | 131 | # expand the query according to permutations and synonyms, calculating weightings for these 132 | if(do.permute){ 133 | #creates all word permutations in the query 134 | q.permutes <- sapply(permn(q.split),paste,collapse=" ") 135 | expanded.query<-data.frame(q=q.permutes, w=rep(perm.factor,length(q.permutes))) 136 | #sets the original query weight to 1.0 137 | expanded.query[q.permutes==work.query,"w"]<-1.0 138 | }else{ 139 | #use only the original query 140 | expanded.query<-data.frame(q=work.query, w=1.0)#q=query, w= weighting 141 | } 142 | if(length(syn.list)>0){ 143 | #for each of the existing entries in the expanded query, add new expanded queries with appropriate 144 | #weighting for synonymisation of one word at a time. 
skip if the synonym == the original word 145 | df<-expanded.query 146 | #ripple through the full synonym list 147 | root.words<-names(syn.list) 148 | for(j in 1:length(root.words)){#the word to be replaced 149 | #sub("cat","mog",expanded.query[,"q"],ignore.case=TRUE) 150 | root.word<-root.words[j] 151 | syn.words<-unlist(syn.list[root.word]) 152 | if(length(syn.words)>0){ 153 | for(k in 1:length(syn.words)){#loop over the synonym to replace the j'th word 154 | rep.word<-tolower(syn.words[k]) 155 | if(rep.word!=root.word){ 156 | new.queries<-sub(root.word,rep.word,df[,"q"],ignore.case=TRUE) 157 | new.weights<-synonym.factor*df[,"w"] 158 | expanded.query<-rbind(expanded.query,data.frame(q=new.queries,w=new.weights)) 159 | } 160 | } 161 | } 162 | } 163 | } 164 | 165 | #calculate un-normalised gap weighted scores for the match between the (expanded) query and all documents in the corpus. Using un-normalised avoids penalising an exact match for the query inside a longer document. 166 | gwk = gapweightkernel(length=length(q.split),lambda=gap.lambda,normalized=F) 167 | scores.mat <- kernelMatrix(gwk, as.list(expanded.query[,"q"]), as.list(tolower(corp))) 168 | rownames(scores.mat)<-expanded.query[,"q"] 169 | 170 | # apply the down-weighting for permutations or synonyms 171 | scores.weighted<-scores.mat * expanded.query[,"w"] 172 | 173 | if(debug){ 174 | print("Expanded Queries and Weights") 175 | print(expanded.query) 176 | 177 | print("Expanded Query-Document Un-Weighted Score Matrix") 178 | print(scores.mat) 179 | 180 | print("Expanded Query-Document Weighted Score Matrix") 181 | print(scores.weighted) 182 | } 183 | 184 | # use the maximum kernel function value from the expanded queries to get the best score for each document 185 | scores.max <-apply(scores.weighted,MARGIN=2,max) 186 | 187 | #it is convenient to deal with only docs with non-zero score 188 | nz.scores.max <- scores.max[scores.max>0] 189 | if(length(nz.scores.max)>0){ 190 | # cutoff score for showing documents 191 | score.thresh<-quantile(nz.scores.max,score.quantile) 192 | 193 | #select and order the above-threshold documents 194 | docs.sel.bool<-scores.max>=score.thresh 195 | scores.sel<-scores.max[docs.sel.bool] 196 | corp.sel<-corp[docs.sel.bool] 197 | corp.sel<-corp.sel[order(scores.sel,decreasing=TRUE)] 198 | scores.sel<-scores.sel[order(scores.sel,decreasing=TRUE)] 199 | 200 | if(print.results){ 201 | print("Best Weighted Score Per Document") 202 | print(scores.max) 203 | 204 | summary(scores.max) 205 | hist(scores.max, breaks=50) 206 | 207 | print("Following statistics are *after* suppressing zero-scoring documents") 208 | summary(nz.scores.max) 209 | hist(nz.scores.max, breaks=50) 210 | 211 | print(paste("Documents with score >= the",score.quantile*100,"% quantile")) 212 | for(i in 1:length(scores.sel)){ 213 | print(paste("score=",scores.sel[i])) 214 | print(as.character(corp.sel[i])) 215 | } 216 | } 217 | }else{ 218 | print("All documents scored 0") 219 | } 220 | return(scores.max) 221 | } 222 | ## -------------- end function def 223 | 224 | ##================== 225 | ## CALCULATION 226 | ##================== 227 | ## >>>>>>>>> if necessary, do pairwise word combinations 228 | if(do.pairs){ 229 | #temp set permutation on and without penalty 230 | do.permute.save<-do.permute 231 | perm.factor.save<-perm.factor 232 | do.permute<-T 233 | perm.factor<-1.0 234 | #prepare the pairs 235 | split.query<-strsplit(query," ")[[1]]; 236 | q.pairs<-combn2(split.query) 237 | #prepare objects to receive results 238 | 
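## ---------------------------------------------------------------------------
## Illustrative note (not part of the original code): for the default query
## "learning design" there is only one pair, but for a 3-word query such as
## "open learning design" the scan works through the three un-ordered word
## pairs (here enumerated with base R's combn, one pair per row):
# t(combn(c("open","learning","design"), 2))
#      [,1]       [,2]
# [1,] "open"     "learning"
# [2,] "open"     "design"
# [3,] "learning" "design"
## Each pair is scored as if it were the query (permutation switched on with
## perm.factor=1, i.e. word order ignored) and the mean document score is
## written symmetrically into pair.mat, which feeds the heatmap further down.
## ---------------------------------------------------------------------------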
df.pairs<-data.frame()#catches the vectors of scores from proximateWorker 239 | pair.mat<-matrix(nrow = length(split.query), ncol = length(split.query))#for the mean of the proximateWorker score of each pair 240 | rownames(pair.mat)<-split.query 241 | colnames(pair.mat)<-split.query 242 | #diag(pair.mat)<-NA 243 | for(p in 1:length(q.pairs[,1])){#loop over rows is a loop over word pairs 244 | pair<-paste(q.pairs[p,], collapse=" ") 245 | pair.scores.max<-proximateWorker(pair) 246 | pair.summary<-summary(pair.scores.max) 247 | df.pairs<-rbind(df.pairs,data.frame(row.names=pair,as.list(pair.summary))) 248 | pair.mat[q.pairs[p,1],q.pairs[p,2]]<-pair.summary["Mean"] 249 | pair.mat[q.pairs[p,2],q.pairs[p,1]]<-pair.summary["Mean"] 250 | print(paste("For pair [", pair,"] summary document score ="),sep="") 251 | print(pair.summary) 252 | } 253 | 254 | barplot(as.matrix(df.pairs)[,3], las=2,main="Median of Proximity Score") 255 | barplot(as.matrix(df.pairs)[,4], las=2,main="Mean of Proximity Score") 256 | barplot(as.matrix(df.pairs)[,5], las=2,main="3rd Quartile of Proximity Score") 257 | barplot(as.matrix(df.pairs)[,6], las=2,main="Max of Proximity Score") 258 | 259 | heatmap(pair.mat, main="Un-ordered Word Proximity Map") 260 | 261 | #restore the params for a full/normal run 262 | do.permute<-do.permute.save 263 | perm.factor<-perm.factor.save 264 | } 265 | 266 | ## >>>>>>>>> this is the full query run 267 | if(do.fullQuery){ 268 | scores.max<-proximateWorker(query, print.results=TRUE) 269 | } 270 | 271 | -------------------------------------------------------------------------------- /README.txt: -------------------------------------------------------------------------------- 1 | This github project is for text mining for weak signals, initially in the scope of the TELMap project. 2 | 3 | The initial "naive" approach is the "Rising and Falling Terms" method. The first versions of this simply looked for high percentage rise or fall in term frequency (or new terms) and did not attempt any formal statistical hypothesis testing. This version is hived off to the no-binomial branch, which is effectively dead. 4 | 5 | The wiki will be used to link out to background material, stuff about TELMap etc. 6 | 7 | Everything you find here is provided under an MIT Licence: 8 | ## **************************************** 9 | # Copyright (c) Adam Cooper 10 | # 11 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
16 | ## ************ end licence *************** 17 | -------------------------------------------------------------------------------- /Rising and Falling Terms/Conference BrewTemplate ODT files/Configurations2/accelerator/current.xml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arc12/Text-Mining-Weak-Signals/f54f198f59b9ae239645713140024b46dffa93e0/Rising and Falling Terms/Conference BrewTemplate ODT files/Configurations2/accelerator/current.xml -------------------------------------------------------------------------------- /Rising and Falling Terms/Conference BrewTemplate ODT files/META-INF/manifest.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /Rising and Falling Terms/Conference BrewTemplate ODT files/layout-cache: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arc12/Text-Mining-Weak-Signals/f54f198f59b9ae239645713140024b46dffa93e0/Rising and Falling Terms/Conference BrewTemplate ODT files/layout-cache -------------------------------------------------------------------------------- /Rising and Falling Terms/Conference BrewTemplate ODT files/manifest.rdf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /Rising and Falling Terms/Conference BrewTemplate ODT files/meta.xml: -------------------------------------------------------------------------------- 1 | 2 | user2011-06-06T13:14:00Adam Cooper2011-07-29T17:43:102011-01-11T12:42:009P25DT31M35SLibreOffice/3.3$Linux LibreOffice_project/330m19$Build-202 -------------------------------------------------------------------------------- /Rising and Falling Terms/Conference BrewTemplate ODT files/mimetype: -------------------------------------------------------------------------------- 1 | application/vnd.oasis.opendocument.text -------------------------------------------------------------------------------- /Rising and Falling Terms/Conference BrewTemplate ODT files/settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 1065203235521955truefalseview2767613672010652323533260501false101falsetruefalsefalsefalse1falsefalsetruefalsetruetruetruetruefalsetruetruefalsetruetruefalsetruefalsefalsetrue0truefalsetruefalsefalsetruetrue0falsefalsetruefalsefalsetrue0truefalsefalsejaJP!%),.:;?]}¢°’”‰′″℃、。々〉》」』】〕゛゜ゝゞ・ヽヾ!%),.:;?]}。」、・゙゚¢$([\{£¥‘“〈《「『【〔$([{「£¥truefalsefalsefalsetruefalsefalsefalsetruetruehigh-resolutionfalsefalsetrue -------------------------------------------------------------------------------- /Rising and Falling Terms/RF_Brew_Core.R: -------------------------------------------------------------------------------- 1 | ## 2 | ## Take a previous run of RF_Terms.R and create a HTML report 3 | ## MUST be preceeded by a load the RF_Terms workspace 4 | ## SHOULD be preceeded by setting of working dir to appropriate report destination 5 | ## THIS VERSION refers to the combined blogs and conference abstracts template 6 | ## 7 | library("brew") 8 | 9 | date.format = "%b %d %Y" 10 | 11 | #compute some tet tragments that depend on whether this is a conference or blog case 12 | 
if(source.type=="b"){ 13 | #blog 14 | h1<-paste(title, format(last.date, date.format)) 15 | doc.pseudonym<-"blog post" 16 | doc.Pseudonym<-"Blog post" 17 | docs.pseudonym.long<-"text content of blog posts" 18 | }else{ 19 | #conference abstracts 20 | h1<-paste(title, conf.year.recent) 21 | doc.pseudonym<-"abstract" 22 | doc.Pseudonym<-"Abstract" 23 | docs.pseudonym.long<-"abstracts of the conference proceedings" 24 | } 25 | 26 | # shortens a string at the closest space character to the limit "len" and appends ellipsis if necessary 27 | # used from within Brew template 28 | Head.Text<-function(string2shorten, len=800){ 29 | if(nchar(string2shorten)<=len){ 30 | return (string2shorten) 31 | } 32 | res<-substring(string2shorten,1,len) 33 | if((substr(string2shorten,len,len)!=" ") & (substr(string2shorten,len,len)!=" ")){ 34 | #adjust to word boundary 35 | res<-substr(res,1,regexpr("\\s+\\S*$", res)) 36 | } 37 | return (paste(res,"...") ) 38 | } 39 | 40 | brew(file="/home/arc1/R Projects/Text Mining Weak Signals/Rising and Falling Terms/CB BrewTemplate.html",output="Report.html",run=TRUE) 41 | #rew(file="/home/arc1/R Projects/Text Mining Weak Signals/Rising and Falling Terms/Conference BrewTemplate ODT content.xml",output="ODT content.xml",run=TRUE) 42 | # add one for ODP (als need to fix the template for ODT; it comes out rather mashed) 43 | -------------------------------------------------------------------------------- /Rising and Falling Terms/ReadMe.txt: -------------------------------------------------------------------------------- 1 | ReadMe file for "Rising and Falling Terms" directory. The contents of this are all about using a simple (Naive) text mining approach to look for "weak signals". The current version considers conference abstracts but the code should be easily adapted to work with blog sources also. 2 | 3 | The files in this directory are the worker files for use with R [R Development Core Team (2010). R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. ISBN 3-900051-07-0, URL http://www.R-project.org/.]. Sub-directories MOSTLY contain either: 4 | a) initialiser R code for a single conference 5 | b) initialiser R code to merge abstract texts from several conferences (aka "union" run) 6 | 7 | 8 | The initialisers are: 9 | * RF_Init.R - this is the for a conference or "union" and sets up variables necessary for the RF_Terms.R worker code, which does the analysis and writes out an .RData file and PNG images of the plots. 10 | * RF_Conference_Brew.R - takes the .RData and uses Brew with the template file "Conference BrewTemplate.html" to create an HTML report. 11 | 12 | "Conference BrewTemplate ODT content.xml" is a rather poor experimental attempt at using Brew to generate an ODT report. It relies on files in "Conference BrewTemplate ODT files" and a small script "Prepare ODT Report.sh" in the conference/union subdirectory. 13 | 14 | ---------- 15 | ## **** notes on importing co-occurrence graph into Gephi (v0.8 alpha used) 16 | # import nodes then edges from CSV files. Make Node Weight be a float [it is ESSENTIAL not to leave it as a String] 17 | # show node labels, use "statistics" to calculate modularity 18 | # Use "ranking" to set node size = imported weight 19 | # edge size = imported weight (=number of co-occurrences). by default 20 | # Use "partition" to set node colour = modularity class 21 | # NB the actual scale may need a multiplier/factor to be applied. 
22 | # Use a circular auto-layout with nodes ordered by modularity class then use Frucherman Reingold 23 | # - may need to do a label adjust too. 24 | # ** for the "preview" 25 | # - set edge thickness to 5 and label font to 36pt. curved edges work OK 26 | # - uncheck "proportional size" on node and edge 27 | # - set the opacity to somewhere between 60 and 80% so that labels show up better 28 | # - when exporting to PNG, set a 25% margin otherwise it gets cropped! 29 | -------------------------------------------------------------------------------- /Rising and Falling Terms/TELBlogs/RF_CB_Brew.R: -------------------------------------------------------------------------------- 1 | ## 2 | ## Take a previous run of RF_Terms.R and create a HTML report 3 | ## 4 | library("tm") 5 | 6 | # Load the RF_Terms workspace 7 | setwd("/home/arc1/R Projects/Text Mining Weak Signals Output/Rising and Falling Terms/TELBlogs/CETIS 2012-02") 8 | load("RF_Terms.RData") 9 | 10 | ## 11 | ## Run parameter 12 | ## 13 | show.abstracts<-TRUE 14 | 15 | source("/home/arc1/R Projects/Text Mining Weak Signals/Rising and Falling Terms/RF_Brew_Core.R") 16 | 17 | 18 | -------------------------------------------------------------------------------- /Rising and Falling Terms/TELBlogs/RF_Init.R: -------------------------------------------------------------------------------- 1 | ## ***Made available using the The MIT License (MIT)*** 2 | # Copyright (c) 2012, Adam Cooper 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 5 | # 6 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 7 | # 8 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 9 | ## ************ end licence *************** 10 | ## 11 | ## This contains the parameters for running a specific dataset against the RF_Terms.R Method 12 | ## It should be executed each time before that script (but after a one-off run of Pre-process.R) 13 | ## 14 | ## 15 | ## This handles conference abstracts and blog posts in slightly different ways 16 | ## abstracts: each CSV is assumed to be a set of abstracts from a 17 | ## single conference series (aka "origin"). The origin is 18 | ## added as part of the pre=processing 19 | ## blogs: each CSV is assumed to contain posts from several blogs 20 | ## and the "origin" is in the CSV as the blog "home" URL 21 | ## 22 | ## (the column headings also differ, sometimes only for "historical reasons") 23 | ## 24 | 25 | ## Run Properties - dependent on the source 26 | base.dir<-"/home/arc1/R Projects/Text Mining Weak Signals" 27 | source.dir<-paste(base.dir,"Source Data",sep="/") 28 | #the output directory. 
NB convention to include the year 29 | output.dir<-"/home/arc1/R Projects/Text Mining Weak Signals Output/Rising and Falling Terms/TELBlogs/CETIS 2012-02" 30 | dir.create(output.dir, showWarnings=FALSE) 31 | setwd(output.dir) 32 | title<-"Rising and Falling Terms - CETIS Blogs" 33 | 34 | # this determines the source type: conference abstracts or blog content 35 | source.type="b"#c is for conference abstracts, b is for blogs 36 | 37 | # these three (set.xxxxx) apply whichever source type is used 38 | sets.csv <- c("CETIS Blogs 20090101-20120301 with metrics.csv") 39 | set.name <- c("CETIS") 40 | set.title <- c("CETIS Blogs") 41 | 42 | 43 | recent.themes.txt <- NA # file containing invited conference themes. Use NA if analysing blogs. 44 | 45 | #these apply only for conference abstracts (and are for use in auto-report-generation using Brew) 46 | # last.conference.url <- c("http://www.ask4research.info/icalt/2010/", 47 | # "http://www.ectel2010.org/", 48 | # "http://www.hkws.org/conference/icwl2010/", 49 | # "") 50 | # publisher.series <- c("IEEE", 51 | # "Springer Lecture Notes in Computer Science (LNCS)", 52 | # "Springer Lecture Notes in Computer Science (LNCS)", 53 | # "Springer Lecture Notes in Computer Science (LNCS)", 54 | # "Elsevier Computers and Education Journal") 55 | # publisher.url <- c("http://ieeexplore.ieee.org/xpl/mostRecentIssue.jsp?punumber=5570018", 56 | # "http://www.springerlink.com/content/978-3-642-16019-6/contents/", 57 | # "http://www.springer.com/computer/general+issues/book/978-3-642-17406-3", 58 | # "http://www.journals.elsevier.com/computers-and-education/") 59 | 60 | ## ensure subdirectories exist 61 | dir.create("Gephi", showWarnings=FALSE) 62 | dir.create("Images", showWarnings=FALSE) 63 | dir.create("Wordle", showWarnings=FALSE) 64 | 65 | ## 66 | ## Run properties - date ranges 67 | ## 68 | # key date is the one after which documents are considered to be in the "recent set" 69 | if(source.type=="b"){ 70 | # for blogs key date is an N month period before the start of the current month 71 | recent.months<-6#set to 3 to look at the last quarter 72 | prior.months<-30#use the previous 30 months to compare against (if in dataset) 73 | key.date<-as.POSIXlt(Sys.Date(), tz = "GMT")#today 74 | key.date$mday<-1 75 | last.date<-key.date 76 | last.date$mday<-last.date$mday-1 #to match inequalities in corpus filtering 77 | key.date$mon<-key.date$mon-recent.months 78 | start.date<-key.date 79 | start.date$mon<-start.date$mon - prior.months 80 | start.date$mday<-start.date$mday-1 #to match inequalities in corpus filtering 81 | #the proportion of past:recent 82 | past.recent.ratio<-prior.months/recent.months 83 | }else if(source.type=="c"){ 84 | # for conference abstracts, key date is just the most-recent conference 85 | conf.year.recent<-2010 #conference abs version => 2010 confs are "recent" 86 | conf.years.in_past<-4 # abstracts from the previous 4 years are counted as "past" 87 | key.date<-as.POSIXlt(paste(conf.year.recent-1,"12","31",sep="-"), tz = "GMT") 88 | start.date<-key.date 89 | start.date$year<-start.date$year-conf.years.in_past 90 | last.date<-key.date 91 | last.date$year<-last.date$year+1 92 | past.recent.ratio<-conf.years.in_past 93 | }else{ 94 | stop("Unknown source type",source.type) 95 | } 96 | 97 | ## 98 | ## Run properties - centrality data 99 | ## 100 | #use NA if not available! 
101 | papers.table<-NA 102 | authors.table<-NA 103 | # # This is a list of papers (id, dblp url, author-id-list) 104 | # # the row names are made to be the DBLP URLs 105 | # papers.table<-read.csv(paste(source.dir,"Union B Author Ids 2010.csv",sep="/"), 106 | # header=TRUE, sep=",", quote="\"", row.names=2, 107 | # stringsAsFactors=FALSE) 108 | # # this is a list of author centrality measures (id, centrality) NB author IDs must match previous list of papers 109 | # authors.table<-read.csv(paste(source.dir,"Author Betweenness D4_3.csv",sep="/"), 110 | # header=TRUE, sep=",", quote="\"", row.names=1, 111 | # stringsAsFactors=FALSE) 112 | 113 | ## 114 | ## Run properties - thresholds - 115 | ## normally the same between different sources of the same kind for comparability 116 | ## 117 | # how many documents must the term appear in to be listed. This is in addition to the frequency thresholds. A value of 2 is expected, i.e. ignore terms that appear in only one doc 118 | doc_count.thresh <- 3 119 | # p-value to accept the "alternative hypothesis" that there is something interesting 120 | thresh.pval<-0.0005 #i.e. accept a .05% chance that null hypothesis falsely rejected 121 | thresh.pval.falling<-0.001 #use a more lenient threshold for falling terms 122 | #max frequency of term in the past set for eligibility as a weak signal. 123 | #Above this, sigifnicant risers are "established terms" 124 | max.past.freq<-0.0005 #i.e. 0.05% 125 | # *for plotting* cap rising % at this level. If value greater then plot is effectively cut off 126 | rising.plot.max<-800 127 | 128 | #novelty calc 129 | term.doc_occurrence.max<-0.5#remove terms appearing in more than 50% of documents 130 | std.novelty.min<-0.25 #a min value of the "standard novelty" 131 | 132 | ## 133 | ## End setup 134 | ## 135 | 136 | # in interactive execution it may be best to skip this command and to manually switch to it 137 | #source("../RF_Terms.R") 138 | -------------------------------------------------------------------------------- /Rising and Falling Terms/Union A/Prepare ODT Report.sh: -------------------------------------------------------------------------------- 1 | outbasedir="/home/arc1/R Projects/Text Mining Weak Signals Output/Rising and Falling Terms/Union A/2010" 2 | templatedir="/home/arc1/R Projects/Text Mining Weak Signals/Rising and Falling Terms/Conference BrewTemplate ODT files" 3 | cd "$outbasedir" 4 | if [ -f "ODT content.xml" ] 5 | then 6 | rm -f "Union A Report 2010.odt" 7 | rm -fr ODT-Temp 8 | mkdir ODT-Temp 9 | cp -r "$templatedir"/* ODT-Temp 10 | cp Images/* ODT-Temp/Pictures 11 | mv "ODT content.xml" ODT-Temp/content.xml 12 | cd "ODT-Temp" 13 | zip -r ../"Union A Report 2010.odt" * 14 | cd .. 15 | rm -fr "ODT-Temp" 16 | else 17 | echo 'File ODT content.xml not found. Run Brew.' 
18 | fi 19 | 20 | 21 | -------------------------------------------------------------------------------- /Rising and Falling Terms/Union A/Prepare ODT Report.sh~: -------------------------------------------------------------------------------- 1 | outbasedir="/home/arc1/R Projects/Text Mining Weak Signals Output/Rising and Falling Terms/Union A/2010" 2 | templatedir="/home/arc1/R Projects/Text Mining Weak Signals/Rising and Falling Terms/Conference BrewTemplate ODT files" 3 | cd "$outbasedir" 4 | if [ -f "ODT content.xml" ] 5 | then 6 | rm -f "Union A Report 2010.odt" 7 | rm -fr "Union A Report ODT" 8 | mkdir "Union A Report ODT" 9 | cp -r "$templatedir"/* "Union Report ODT" 10 | cp Images/* "Union A Report ODT/Pictures" 11 | mv "ODT content.xml" "Union A Report ODT"/content.xml 12 | cd "Union A Report ODT" 13 | zip -r ../"Union A Report 2010.odt" * 14 | cd .. 15 | rm -fr "Union Report ODT" 16 | else 17 | echo 'File ODT content.xml not found. Run Brew.' 18 | fi 19 | 20 | 21 | -------------------------------------------------------------------------------- /Rising and Falling Terms/Union A/RF_Conference_Brew.R: -------------------------------------------------------------------------------- 1 | ## 2 | ## Take a previous run of RF_Terms.R and create a HTML report 3 | ## THIS VERSION refers to the conference abstracts template 4 | ## 5 | library("tm") 6 | #library("Snowball") 7 | library("brew") 8 | 9 | # Load the RF_Terms workspace 10 | setwd("/home/arc1/R Projects/Text Mining Weak Signals Output/Rising and Falling Terms/Union A/testing") 11 | load("RF_Terms.RData") 12 | 13 | # the template is assumed to be in the parent directory since this script is assumed to be in a conference-specific directory 14 | show.abstracts<-FALSE 15 | brew(file="/home/arc1/R Projects/Text Mining Weak Signals/Rising and Falling Terms/Conference BrewTemplate.html",output="Report.html",run=TRUE) 16 | #brew(file="/home/arc1/R Projects/Text Mining Weak Signals/Rising and Falling Terms/Conference BrewTemplate ODT content.xml",output="ODT content.xml",run=TRUE) 17 | # add one for ODP (als need to fix the template for ODT; it comes out rather mashed) -------------------------------------------------------------------------------- /Rising and Falling Terms/Union A/RF_Init.R: -------------------------------------------------------------------------------- 1 | ## ***Made available using the The MIT License (MIT)*** 2 | # Copyright (c) 2011, Adam Cooper 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 5 | # 6 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 7 | # 8 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
9 | ## ************ end licence *************** 10 | ## 11 | ## This contains the parameters for running a specific dataset against the RF_Terms.R Method 12 | ## It should be executed first 13 | ## 14 | 15 | ## Run Properties - dependent on the source 16 | #the output directory. NB convention to include the year 17 | output.dir<-"/home/arc1/R Projects/Text Mining Weak Signals Output/Rising and Falling Terms/Union A/testing" 18 | dir.create(output.dir, showWarnings=FALSE) 19 | setwd(output.dir) 20 | title<-"Rising and Falling Terms - Conference Proceedings from ICALT, ECTEL and ICWL" 21 | abstracts.csv <- c("/home/arc1/R Projects/Source Data/ICALT Abstracts 2005-2010.csv", 22 | "/home/arc1/R Projects/Source Data/ECTEL Abstracts 2006-2010.csv", 23 | "/home/arc1/R Projects/Source Data/ICWL Abstracts 2003-2010.csv") 24 | recent.themes.txt <- NA # file containing invited conference themes. Use NA if analysing blogs. 25 | conference.name <- c("ICALT", 26 | "ECTEL", 27 | "ICWL") 28 | last.conference.url <- c("http://www.ask4research.info/icalt/2010/", 29 | "http://www.ectel2010.org/", 30 | "http://www.hkws.org/conference/icwl2010/") 31 | conference.title <- c("IEEE International Conference on Advanced Learning Technologies", 32 | "European Conference on Technology Enhanced Learning", 33 | "International Conference on Web-based Learning") 34 | publisher.series <- c("IEEE", 35 | "Springer Lecture Notes in Computer Science (LNCS)", 36 | "Springer Lecture Notes in Computer Science (LNCS)") 37 | publisher.url <- c("http://ieeexplore.ieee.org/xpl/mostRecentIssue.jsp?punumber=5570018", 38 | "http://www.springerlink.com/content/978-3-642-16019-6/contents/", 39 | "http://www.springer.com/computer/general+issues/book/978-3-642-17406-3") 40 | 41 | ## ensure subdirectories exist 42 | dir.create("Gephi", showWarnings=FALSE) 43 | dir.create("Images", showWarnings=FALSE) 44 | dir.create("Wordle", showWarnings=FALSE) 45 | 46 | ## 47 | ## Run properties - normally the same between different sources of the same kind for comparability 48 | ## 49 | # key date is the one after which documents are considered to be recent 50 | # for conference abstracts, this is just the most-recent conference 51 | # for blogs is is a 6 month period before now 52 | today<-as.POSIXlt(Sys.Date(), tz = "GMT") 53 | #key.date<-today;key.date$mon<-key.date$mon-6 #blog version, 6 mon window 54 | conf.year.recent<-2010 #conference abs version => 2010 confs are "recent" 55 | conf.years.in_past<-4 # abstracts from the previous 4 years are counted as "past" 56 | key.date<-as.POSIXlt(paste(conf.year.recent-1,"12","31",sep="-"), tz = "GMT") 57 | start.date<-key.date; start.date$year<-start.date$year-conf.years.in_past 58 | last.date<-key.date;last.date$year<-last.date$year+1 59 | 60 | # thresholds 61 | # how many documents must the term appear in to be listed. This is in addition to the frequency thresholds. A value of 2 is expected, i.e. ignore terms that appear in only one doc 62 | doc_count.thresh <- 2 63 | # p-value to accept the "alternative hypothesis" that there is something interesting 64 | thresh.pval<-0.01 #i.e. accept a 1% chance that null hypothesis falsely rejected 65 | #max frequency of term in the past set for eligibility as a weak signal. 66 | #Above this, sigifnicant risers are "established terms" 67 | max.past.freq<-0.0002 #i.e. 0.02% 68 | # *for plotting* cap rising % at this level. 
If value greater then plot is effectively cut off 69 | rising.plot.max<-800 70 | 71 | #novelty calc 72 | term.doc_occurrence.max<-0.1#remove terms appearing in more than 10% of documents 73 | std.novelty.min<-0.33 #a min value of the "standard novelty" 74 | 75 | ## 76 | ## End setup 77 | ## 78 | 79 | # in interactive execution it may be best to skip this command and to manually switch to it 80 | #source("../RF_Terms.R") 81 | -------------------------------------------------------------------------------- /Rising and Falling Terms/Union B/RF_CB_Brew.R: -------------------------------------------------------------------------------- 1 | ## 2 | ## Take a previous run of RF_Terms.R and create a HTML report 3 | ## 4 | library("tm") 5 | 6 | # Load the RF_Terms workspace 7 | setwd("/home/arc1/R Projects/Text Mining Weak Signals Output/Rising and Falling Terms/Union B/2010") 8 | 9 | #temporary - to allow for use of older .RData 10 | source.type="c" 11 | set.title<-"Union B" 12 | set.name<-set.title 13 | 14 | load("RF_Terms.RData") 15 | 16 | source("/home/arc1/R Projects/Text Mining Weak Signals/Rising and Falling Terms/RF_Brew_Core.R") 17 | 18 | 19 | -------------------------------------------------------------------------------- /Rising and Falling Terms/Union B/RF_Init.R: -------------------------------------------------------------------------------- 1 | ## ***Made available using the The MIT License (MIT)*** 2 | # Copyright (c) 2011, Adam Cooper 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 5 | # 6 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 7 | # 8 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 9 | ## ************ end licence *************** 10 | ## 11 | ## This contains the parameters for running a specific dataset against the RF_Terms.R Method 12 | ## It should be executed first 13 | ## 14 | 15 | ## Run Properties - dependent on the source 16 | base.dir<-"/home/arc1/R Projects/Text Mining Weak Signals" 17 | source.dir<-paste(base.dir,"Source Data",sep="/") 18 | #the output directory. 
NB convention to include the year 19 | output.dir<-"/home/arc1/R Projects/Text Mining Weak Signals Output/Rising and Falling Terms/Union B/2010" 20 | dir.create(output.dir, showWarnings=FALSE) 21 | setwd(output.dir) 22 | title<-"Rising and Falling Terms - Conference Proceedings from ICALT, CAL, ECTEL and ICWL" 23 | 24 | # this determines the source type: conference abstracts or blog content 25 | source.type="c"#c is for conference abstracts, b is for blogs 26 | 27 | # these three (set.xxxxx) apply whichever source type is used 28 | sets.csv <- c("ICALT Abstracts 2005-2011 with metrics.csv", 29 | "ECTEL Abstracts 2006-2011 with metrics.csv", 30 | "ICWL Abstracts 2005-2011 with metrics.csv", 31 | "CAL Abstracts 2007-2009 with metrics.csv") 32 | set.name <- c("ICALT", 33 | "ECTEL", 34 | "ICWL", 35 | "CAL") 36 | set.title <- c("IEEE International Conference on Advanced Learning Technologies", 37 | "European Conference on Technology Enhanced Learning", 38 | "International Conference on Web-based Learning", 39 | "Computer Assisted Learning Conference") 40 | 41 | recent.themes.txt <- NA # file containing invited conference themes. Use NA if analysing blogs. 42 | 43 | #these apply only for conference abstracts (and are for use in auto-report-generation using Brew) 44 | last.conference.url <- c("http://www.ask4research.info/icalt/2010/", 45 | "http://www.ectel2010.org/", 46 | "http://www.hkws.org/conference/icwl2010/", 47 | "") 48 | publisher.series <- c("IEEE", 49 | "Springer Lecture Notes in Computer Science (LNCS)", 50 | "Springer Lecture Notes in Computer Science (LNCS)", 51 | "Springer Lecture Notes in Computer Science (LNCS)", 52 | "Elsevier Computers and Education Journal") 53 | publisher.url <- c("http://ieeexplore.ieee.org/xpl/mostRecentIssue.jsp?punumber=5570018", 54 | "http://www.springerlink.com/content/978-3-642-16019-6/contents/", 55 | "http://www.springer.com/computer/general+issues/book/978-3-642-17406-3", 56 | "http://www.journals.elsevier.com/computers-and-education/") 57 | 58 | ## ensure subdirectories exist 59 | dir.create("Gephi", showWarnings=FALSE) 60 | dir.create("Images", showWarnings=FALSE) 61 | dir.create("Wordle", showWarnings=FALSE) 62 | 63 | ## 64 | ## Run properties - date ranges 65 | ## 66 | # key date is the one after which documents are considered to be in the "recent set" 67 | if(source.type=="b"){ 68 | # for blogs key date is an N month period before the start of the current month 69 | recent.months<-3#set to 3 to look at the last quarter 70 | prior.months<-12#use the previous 12 months to compare against (if in dataset) 71 | key.date<-as.POSIXlt(Sys.Date(), tz = "GMT")#today 72 | key.date$mday<-1 73 | last.date<-key.date 74 | last.date$mday<-last.date$mday-1 #to match inequalities in corpus filtering 75 | key.date$mon<-key.date$mon-recent.months 76 | start.date<-key.date 77 | start.date$mon<-start.date$mon - prior.months 78 | start.date$mday<-start.date$mday-1 #to match inequalities in corpus filtering 79 | }else if(source.type=="c"){ 80 | # for conference abstracts, key date is just the most-recent conference 81 | conf.year.recent<-2010 #conference abs version => 2010 confs are "recent" 82 | conf.years.in_past<-4 # abstracts from the previous 4 years are counted as "past" 83 | key.date<-as.POSIXlt(paste(conf.year.recent-1,"12","31",sep="-"), tz = "GMT") 84 | start.date<-key.date 85 | start.date$year<-start.date$year-conf.years.in_past 86 | last.date<-key.date 87 | last.date$year<-last.date$year+1 88 | }else{ 89 | stop("Unknown source type",source.type) 90 | } 
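## ---------------------------------------------------------------------------
## Illustrative note (not part of the original script): the blog branch above
## relies on POSIXlt fields being settable out of range and normalised when
## the value is converted or compared. A worked example, assuming a
## hypothetical run date of 2012-03-15 and recent.months=3:
# key<-as.POSIXlt("2012-03-15", tz="GMT")
# key$mday<-1               # start of the current month: 2012-03-01
# last<-key
# last$mday<-last$mday-1    # mday becomes 0, i.e. the last day of the previous month
# key$mon<-key$mon-3        # three months back; the negative month rolls into 2011
# as.POSIXct(last)          # 2012-02-29 (normalised on conversion)
# as.POSIXct(key)           # 2011-12-01, the start of the "recent" window
## start.date is then pushed back a further prior.months months to delimit the
## comparison ("past") set.
## ---------------------------------------------------------------------------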
91 | 92 | ## 93 | ## Run properties - centrality data 94 | ## 95 | #use NA if not available! 96 | # This is a list of papers (id, dblp url, author-id-list) 97 | # the row names are made to be the DBLP URLs 98 | papers.table<-read.csv(paste(source.dir,"Union B Author Ids 2010.csv",sep="/"), 99 | header=TRUE, sep=",", quote="\"", row.names=2, 100 | stringsAsFactors=FALSE) 101 | # this is a list of author centrality measures (id, centrality) NB author IDs must match previous list of papers 102 | authors.table<-read.csv(paste(source.dir,"Author Betweenness D4_3.csv",sep="/"), 103 | header=TRUE, sep=",", quote="\"", row.names=1, 104 | stringsAsFactors=FALSE) 105 | 106 | ## 107 | ## Run properties - thresholds - 108 | ## normally the same between different sources of the same kind for comparability 109 | ## 110 | # how many documents must the term appear in to be listed. This is in addition to the frequency thresholds. A value of 2 is expected, i.e. ignore terms that appear in only one doc 111 | doc_count.thresh <- 2 112 | # p-value to accept the "alternative hypothesis" that there is something interesting 113 | thresh.pval<-0.005 #i.e. accept a .5% chance that null hypothesis falsely rejected 114 | thresh.pval.falling<-0.01 #use a more lenient threshold for falling terms 115 | #max frequency of term in the past set for eligibility as a weak signal. 116 | #Above this, sigifnicant risers are "established terms" 117 | max.past.freq<-0.0002 #i.e. 0.02% 118 | # *for plotting* cap rising % at this level. If value greater then plot is effectively cut off 119 | rising.plot.max<-800 120 | 121 | #novelty calc 122 | term.doc_occurrence.max<-0.5#remove terms appearing in more than 50% of documents 123 | std.novelty.min<-0.25 #a min value of the "standard novelty" 124 | 125 | ## 126 | ## End setup 127 | ## 128 | 129 | # in interactive execution it may be best to skip this command and to manually switch to it 130 | #source("../RF_Terms.R") 131 | -------------------------------------------------------------------------------- /Rising and Falling Terms/Union C/RF_CB_Brew.R: -------------------------------------------------------------------------------- 1 | ## 2 | ## Take a previous run of RF_Terms.R and create a HTML report 3 | ## 4 | library("tm") 5 | 6 | # Load the RF_Terms workspace 7 | setwd("/home/arc1/R Projects/Text Mining Weak Signals Output/Rising and Falling Terms/Union C/2011") 8 | 9 | show.abstracts<-TRUE 10 | show.nov.detail<-TRUE #novelty plots 11 | show.sub.detail<-show.nov.detail #subjectivity plots 12 | 13 | #temporary - to allow for use of older .RData 14 | # source.type="c" 15 | # set.title<-"Union B" 16 | # set.name<-set.title 17 | 18 | load("RF_Terms.RData") 19 | 20 | source("/home/arc1/R Projects/Text Mining Weak Signals/Rising and Falling Terms/RF_Brew_Core.R") 21 | 22 | 23 | -------------------------------------------------------------------------------- /Rising and Falling Terms/Union C/RF_Init.R: -------------------------------------------------------------------------------- 1 | ## ***Made available using the The MIT License (MIT)*** 2 | # Copyright (c) 2012, Adam Cooper 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to 
the following conditions: 5 | # 6 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 7 | # 8 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 9 | ## ************ end licence *************** 10 | ## 11 | ## This contains the parameters for running a specific dataset against the RF_Terms.R Method 12 | ## It should be executed first 13 | ## 14 | 15 | ## Run Properties - dependent on the source 16 | base.dir<-"/home/arc1/R Projects/Text Mining Weak Signals" 17 | source.dir<-paste(base.dir,"Source Data",sep="/") 18 | #the output directory. NB convention to include the year 19 | output.dir<-"/home/arc1/R Projects/Text Mining Weak Signals Output/Rising and Falling Terms/Union C/2011" 20 | dir.create(output.dir, showWarnings=FALSE) 21 | setwd(output.dir) 22 | title<-"Rising and Falling Terms - Conference Proceedings from ICALT, CAL, ECTEL, ICHL and ICWL" 23 | 24 | # this determines the source type: conference abstracts or blog content 25 | source.type="c"#c is for conference abstracts, b is for blogs 26 | 27 | # these three (set.xxxxx) apply whichever source type is used 28 | sets.csv <- c("ICALT Abstracts 2005-2011 with metrics.csv", 29 | "ECTEL Abstracts 2006-2011 with metrics.csv", 30 | "ICWL Abstracts 2005-2011 with metrics.csv", 31 | "ICHL Abstracts 2008-2011 with metrics.csv", 32 | "CAL Abstracts 2007-2011 with metrics.csv") 33 | set.name <- c("ICALT", 34 | "ECTEL", 35 | "ICWL", 36 | "ICHL", 37 | "CAL") 38 | set.title <- c("IEEE International Conference on Advanced Learning Technologies", 39 | "European Conference on Technology Enhanced Learning", 40 | "International Conference on Web-based Learning", 41 | "International Conference on Hybrid Learning", 42 | "Computer Assisted Learning Conference") 43 | 44 | recent.themes.txt <- NA # file containing invited conference themes. Use NA if analysing blogs. 
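## ---------------------------------------------------------------------------
## Illustrative sketch (not part of the original configuration): sets.csv,
## set.name and set.title above are parallel vectors, one entry per conference
## series. Code that consumes this configuration reads each CSV from
## source.dir and tags its rows with the short set.name as the document
## "origin", roughly along these lines (all.docs is an invented name and the
## CSVs are assumed to share the same column headings):
# all.docs<-NULL
# for(src in 1:length(sets.csv)){
#    tbl<-read.csv(file.path(source.dir, sets.csv[src]),
#                  header=TRUE, sep=",", quote="\"", stringsAsFactors=FALSE)
#    tbl$origin<-set.name[src]   # tag every row with its source conference
#    all.docs<-rbind(all.docs, tbl)
# }
## ---------------------------------------------------------------------------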
45 | 46 | #these apply only for conference abstracts (and are for use in auto-report-generation using Brew) 47 | last.conference.url <- c("http://www.ask4research.info/icalt/2011/", 48 | "http://www.gast.it.uc3m.es/ectel2011/", 49 | "http://www.hkws.org/conference/icwl2011/", 50 | "http://www.hkuspace.hku.hk/ichl2011/", 51 | "http://www.cal-conference.elsevier.com/") 52 | publisher.series <- c("IEEE", 53 | "Springer Lecture Notes in Computer Science (LNCS)", 54 | "Springer Lecture Notes in Computer Science (LNCS)", 55 | "Springer Lecture Notes in Computer Science (LNCS)", 56 | "Elsevier Computers and Education Journal") 57 | publisher.url <- c("http://ieeexplore.ieee.org/xpl/mostRecentIssue.jsp?punumber=5991609", 58 | "http://www.springerlink.com/content/978-3-642-23984-7/", 59 | "http://www.springerlink.com/content/978-3-642-25812-1/", 60 | "http://www.springerlink.com/content/978-3-642-22762-2/", 61 | "http://www.journals.elsevier.com/computers-and-education/") 62 | 63 | ## ensure subdirectories exist 64 | dir.create("Gephi", showWarnings=FALSE) 65 | dir.create("Images", showWarnings=FALSE) 66 | dir.create("Wordle", showWarnings=FALSE) 67 | 68 | ## 69 | ## Run properties - date ranges 70 | ## 71 | # key date is the one after which documents are considered to be in the "recent set" 72 | if(source.type=="b"){ 73 | # for blogs key date is an N month period before the start of the current month 74 | recent.months<-3#set to 3 to look at the last quarter 75 | prior.months<-12#use the previous 12 months to compare against (if in dataset) 76 | key.date<-as.POSIXlt(Sys.Date(), tz = "GMT")#today 77 | key.date$mday<-1 78 | last.date<-key.date 79 | last.date$mday<-last.date$mday-1 #to match inequalities in corpus filtering 80 | key.date$mon<-key.date$mon-recent.months 81 | start.date<-key.date 82 | start.date$mon<-start.date$mon - prior.months 83 | start.date$mday<-start.date$mday-1 #to match inequalities in corpus filtering 84 | display.dateLength<-10 85 | }else if(source.type=="c"){ 86 | # for conference abstracts, key date is just the most-recent conference 87 | conf.year.recent<-2011 #conference abs version => 2011 confs are "recent" 88 | conf.years.in_past<-4 # abstracts from the previous 4 years are counted as "past" 89 | key.date<-as.POSIXlt(paste(conf.year.recent-1,"12","31",sep="-"), tz = "GMT") 90 | start.date<-key.date 91 | start.date$year<-start.date$year-conf.years.in_past 92 | last.date<-key.date 93 | last.date$year<-last.date$year+1 94 | display.dateLength<-4 95 | }else{ 96 | stop("Unknown source type",source.type) 97 | } 98 | 99 | ## 100 | ## Run properties - centrality data 101 | ## 102 | #use NA if not available! 103 | # This is a list of papers (id, dblp url, author-id-list) 104 | # the row names are made to be the DBLP URLs 105 | papers.table<-NA # read.csv(paste(source.dir,"Union B Author Ids 2010.csv",sep="/"), header=TRUE, sep=",", quote="\"", row.names=2, stringsAsFactors=FALSE) 106 | # this is a list of author centrality measures (id, centrality) NB author IDs must match previous list of papers 107 | authors.table<-NA #read.csv(paste(source.dir,"Author Betweenness D4_3.csv",sep="/"), header=TRUE, sep=",", quote="\"", row.names=1, stringsAsFactors=FALSE) 108 | 109 | ## 110 | ## Run properties - thresholds - 111 | ## normally the same between different sources of the same kind for comparability 112 | ## 113 | # how many documents must the term appear in to be listed. This is in addition to the frequency thresholds. A value of 2 is expected, i.e. 
ignore terms that appear in only one doc 114 | doc_count.thresh <- 2 115 | # p-value to accept the "alternative hypothesis" that there is something interesting 116 | thresh.pval<-0.005 #i.e. accept a .5% chance that null hypothesis falsely rejected 117 | thresh.pval.falling<-0.01 #use a more lenient threshold for falling terms 118 | #max frequency of term in the past set for eligibility as a weak signal. 119 | #Above this, sigifnicant risers are "established terms" 120 | max.past.freq<-0.0002 #i.e. 0.02% 121 | # *for plotting* cap rising % at this level. If value greater then plot is effectively cut off 122 | rising.plot.max<-800 123 | 124 | #novelty calc 125 | term.doc_occurrence.max<-0.5#remove terms appearing in more than 50% of documents 126 | std.novelty.min<-0.25 #a min value of the "standard novelty" 127 | 128 | ## 129 | ## End setup 130 | ## 131 | 132 | # in interactive execution it may be best to skip this command and to manually switch to it 133 | #source("../RF_Terms.R") 134 | -------------------------------------------------------------------------------- /Rising and Falling Terms/plotFunctions.R: -------------------------------------------------------------------------------- 1 | ## ***Made available using the The MIT License (MIT)*** 2 | # Copyright (c) 2011, Adam Cooper 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 5 | # 6 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 7 | # 8 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
9 | ## ************ end licence *************** 10 | 11 | ## 12 | ## Utility plot functions, plotting to standard output and to a file, which is generally 13 | ## assumed to be in a working directory defined in the calling code.# 14 | ## 15 | 16 | #var to use to make some more space at the bottom of plots for long label (use par(mar=mar.bigmar) ) 17 | mar.default<-par("mar") 18 | mar.bigmar<-c(6,4,4,2) 19 | 20 | #boxplot with added mean values as circules 21 | # X is a list of lists 22 | basic.boxplot<-function(X, Main="", Ylab="", Names=names(X), OutputFile=NA, AddMean=TRUE){ 23 | if(length(X)==0){ 24 | empty.plot(Main,OutputFile) 25 | return() 26 | } 27 | par(mar=mar.bigmar) 28 | boxplot(X, main=Main, ylab=Ylab, names=Names, las=2, cex.axis=0.7) 29 | if(AddMean){ 30 | points(as.numeric(lapply(X,FUN=mean)), pch=4) 31 | } 32 | if(!is.na(OutputFile)){ 33 | # Repeat the previous to create a png 34 | png(OutputFile, width=1000, height=1000,pointsize=12, res=150) 35 | boxplot(X, main=Main, ylab=Ylab, names=Names, las=2, cex.axis=0.7) 36 | if(AddMean){ 37 | points(as.numeric(lapply(X,FUN=mean)), pch=4) 38 | } 39 | ad<-dev.off() 40 | } 41 | } 42 | 43 | # basic barplot with labels horizontal and under x axis 44 | basic.barplot<-function(X,Main,Xlab,Ylab,Names,OutputFile=NA, ForceIntYAxis=FALSE){ 45 | if(length(X)==0){ 46 | empty.plot(Main,OutputFile) 47 | return() 48 | } 49 | par(mar=mar.bigmar) 50 | barplot(X,main=Main, xlab=Xlab, ylab=Ylab, names.arg=Names, axes=!ForceIntYAxis) 51 | if(ForceIntYAxis){ 52 | axis(2, at=0:max(X))# to get integers only 53 | } 54 | if(!is.na(OutputFile)){ 55 | # Repeat the previous to create a png 56 | png(OutputFile, width=1000, height=1000,pointsize=12, res=150) 57 | barplot(X,main=Main, xlab=Xlab, ylab=Ylab, names.arg=Names, axes=!ForceIntYAxis) 58 | if(ForceIntYAxis){ 59 | axis(2, at=0:max(X))# to get integers only 60 | } 61 | ad<-dev.off() 62 | } 63 | } 64 | 65 | # no x axis title or labels; labels appear inside bars 66 | insideLabel.barplot<-function(X,Main,Ylab,Names,OutputFile=NA, ForceIntYAxis=FALSE){ 67 | if(length(X)==0){ 68 | empty.plot(Main,OutputFile) 69 | return() 70 | } 71 | par(mar=mar.default) 72 | x.pos<-barplot(X,main=Main, ylab=Ylab, names.arg="", axes=!ForceIntYAxis) 73 | if(ForceIntYAxis){ 74 | axis(2, at=0:max(X))# to get integers only 75 | } 76 | par(srt=90) 77 | text(x=x.pos, y=0.1, Names, adj=c(0,0.5))# puts labels inside bars 78 | par(srt=0) 79 | 80 | if(!is.na(OutputFile)){ 81 | # Repeat the previous plot to create png 82 | png(OutputFile, width=1000, height=1000, pointsize=12, res=150) 83 | x.pos<-barplot(X,main=Main, ylab=Ylab, names.arg="", axes=!ForceIntYAxis) 84 | if(ForceIntYAxis){ 85 | axis(2, at=0:max(X))# to get integers only 86 | } 87 | par(srt=90) 88 | text(x=x.pos, y=0.1, Names, adj=c(0,0.5))# puts labels inside bars 89 | par(srt=0) 90 | ad<-dev.off() 91 | } 92 | } 93 | 94 | basic.hist<-function(X,Main,Xlab,OutputFile=NA, Breaks=10){ 95 | if(length(X)==0){ 96 | empty.plot(Main,OutputFile) 97 | return() 98 | } 99 | par(mar=mar.bigmar) 100 | #if(min(X)>=0){ 101 | # Xlim<-c(0, ceiling(10*max(X))/10) 102 | #}else{ 103 | # Xlim=c(-ceiling(-10*min(X))/10, 0) 104 | #} 105 | Xlim<-c(min(c(0,-ceiling(-10*min(X))/10)), max(c(0,ceiling(10*max(X))/10))) 106 | 107 | hist(X, breaks=Breaks, main=Main, xlab=Xlab, xlim=Xlim) 108 | 109 | if(!is.na(OutputFile)){ 110 | png(OutputFile, width=1000, height=1000,pointsize=12, res=150) 111 | hist(X, breaks=Breaks, main=Main, xlab=Xlab, xlim=Xlim) 112 | ad<-dev.off() 113 | } 114 | } 115 | 116 | 117 | 
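## ---------------------------------------------------------------------------
## Illustrative sketch (not part of the original file): each helper in this
## file draws its plot twice, once on the active device and once into a PNG,
## so the two outputs are guaranteed to match. The duplication could be
## factored out by passing the drawing code in as a function; render.both is
## an invented name, shown only as a sketch.
# render.both<-function(drawFun, OutputFile=NA){
#    drawFun()                            #current device (screen)
#    if(!is.na(OutputFile)){
#       png(OutputFile, width=1000, height=1000, pointsize=12, res=150)
#       drawFun()                         #identical plot rendered into the PNG
#       ad<-dev.off()
#    }
# }
# #example: render.both(function() hist(rnorm(100), main="Example"), "Images/example.png")
## ---------------------------------------------------------------------------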
117 | colorized.barplot<-function(X, Main, Ylab, Names, Colours, OutputFile=NA){
118 | if(length(X)==0){
119 | empty.plot(Main,OutputFile)
120 | return()
121 | }
122 | par(mar=mar.bigmar)
123 | barplot(X, las=2, cex.names=0.7, col=Colours, main=Main, ylab=Ylab, names.arg=Names)
124 |
125 | if(!is.na(OutputFile)){
126 | png(OutputFile, width=1000, height=1000,pointsize=12, res=150)
127 | barplot(X, las=2, cex.names=0.7, col=Colours, main=Main, ylab=Ylab, names.arg=Names)
128 | ad<-dev.off()
129 | }
130 | }
131 |
132 | #for showing comparison between previous and current sets as stacked or side-by-side bars
133 | pair.barplot<-function(X.past, X.target, Main, Ylab, Names, OutputFile=NA, Beside=FALSE){
134 | if(length(X.past)==0 || length(X.target)==0){
135 | empty.plot(Main,OutputFile)
136 | return()
137 | }
138 | barplot(rbind(X.past, X.target),las=2, cex.names=0.7, names.arg=Names, main=Main, legend.text=c("Previous","Target"), ylab=Ylab, beside=Beside)
139 |
140 | if(!is.na(OutputFile)){
141 | png(OutputFile, width=1000, height=1000,pointsize=12, res=150)
142 | barplot(rbind(X.past, X.target),las=2, cex.names=0.7, names.arg=Names, main=Main, legend.text=c("Previous","Target"), ylab=Ylab, beside=Beside)
143 | ad<-dev.off()
144 | }
145 | }
146 |
147 | #for showing comparison between pos/neg sentiments as stacked or side-by-side bars
148 | sentiment.barplot<-function(Neg, Pos, Main, Ylab, OutputFile=NA, Beside=FALSE){
149 | barplot(rbind(Neg, Pos),las=2, cex.names=0.7, main=Main, xlab="Document ID",
150 | legend.text=c("Negative","Positive"), ylab=Ylab, beside=Beside,
151 | col=c("lightblue","pink"))
152 |
153 | if(!is.na(OutputFile)){
154 | png(OutputFile, width=1000, height=1000,pointsize=12, res=150)
155 | barplot(rbind(Neg, Pos),las=2, cex.names=0.7, main=Main, xlab="Document ID",
156 | legend.text=c("Negative","Positive"), ylab=Ylab, beside=Beside,
157 | col=c("lightblue","pink"))
158 | ad<-dev.off()
159 | }
160 | }
161 |
162 |
163 | #simplest heat map
164 | basic.heatmap<-function(M, Main, ColumnLabels, OutputFile=NA){
165 | if(length(M[1,])<2 || length(M[,1])<2){
166 | empty.plot(Main,OutputFile)
167 | return()
168 | }
169 | par(mar=mar.bigmar)
170 | heatmap(M, main=Main, labCol=ColumnLabels, margins=c(10,5))
171 |
172 | if(!is.na(OutputFile)){
173 | png(OutputFile, width=1000, height=1000,pointsize=12, res=150)
174 | heatmap(M, main=Main, labCol=ColumnLabels, margins=c(10,5))
175 | ad<-dev.off()
176 | }
177 | }
178 |
179 | #basic scatter plot with log y axis
180 | log.scatter<-function(X, Y, Main, Xlab, Ylab, OutputFile=NA){
181 | if(length(X)==0){
182 | empty.plot(Main,OutputFile)
183 | return()
184 | }
185 | plot(X, Y, main=Main, xlab=Xlab, ylab=Ylab, log="y")
186 |
187 | if(!is.na(OutputFile)){
188 | png(OutputFile, width=1000, height=1000, pointsize=12, res=150)
189 | plot(X, Y, main=Main, xlab=Xlab, ylab=Ylab, log="y")
190 | ad<-dev.off()
191 | }
192 | }
193 |
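# Illustrative usage (a sketch only; the frequencies and matrix values are invented):
# pair.barplot(X.past=c(0.002,0.004), X.target=c(0.005,0.003),
#              Main="Term frequency: previous vs target", Ylab="relative frequency",
#              Names=c("analytics","podcast"), Beside=TRUE)
# basic.heatmap(matrix(runif(12), nrow=3), Main="Term x document weights",
#               ColumnLabels=paste("D",1:4,sep=""))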
194 | #to be used when there is no data. Useful for Brew/HTML output
195 | empty.plot<-function(Main, OutputFile=NA){
196 | plot.new()
197 | box()
198 | title(Main)
199 | text(x=0.5, y=0.5, "No Terms Found", pos=1)
200 | if(!is.na(OutputFile)){
201 | png(OutputFile, width=1000, height=1000,pointsize=12, res=150)
202 | plot.new()
203 | box()
204 | title(Main)
205 | text(x=0.5, y=0.5, "No Terms Found", pos=1)
206 | ad<-dev.off()
207 | }
208 | }
209 |
--------------------------------------------------------------------------------
/SQLite Schema.sql:
--------------------------------------------------------------------------------
1 | CREATE TABLE abstract (
2 | "id" INTEGER PRIMARY KEY AUTOINCREMENT,
3 | "origin" TEXT NOT NULL,
4 | "year" TEXT NOT NULL,
5 | "pages" TEXT,
6 | "title" TEXT NOT NULL,
7 | "authors" TEXT NOT NULL,
8 | "abstract" TEXT NOT NULL,
9 | "keywords" TEXT,
10 | "url" TEXT NOT NULL,
11 | "dblp_url" TEXT,
12 | "pos_score" REAL,
13 | "neg_score" REAL,
14 | "subj_score" REAL,
15 | "econ_score" REAL,
16 | "polit_score" REAL,
17 | "legal_score" REAL,
18 | "doing_score" REAL,
19 | "knowing_score" REAL,
20 | "treated" TEXT,
21 | "non_stopwords" INTEGER,
22 | "treated_words" INTEGER
23 | );
24 | CREATE UNIQUE INDEX "ABSTRACT_URL_UNQ" on "abstract" (url ASC);
25 | CREATE INDEX "ABSTRACT_YEAR" on abstract (year ASC);
26 | CREATE TABLE blog_post (
27 | "id" INTEGER PRIMARY KEY AUTOINCREMENT,
28 | "content" TEXT NOT NULL,
29 | "title" TEXT NOT NULL,
30 | "authors" TEXT NOT NULL,
31 | "datestamp" TEXT NOT NULL,
32 | "origin" TEXT NOT NULL,
33 | "url" TEXT NOT NULL,
34 | "pos_score" REAL,
35 | "neg_score" REAL,
36 | "subj_score" REAL,
37 | "econ_score" REAL,
38 | "polit_score" REAL,
39 | "legal_score" REAL,
40 | "doing_score" REAL,
41 | "knowing_score" REAL,
42 | "treated" TEXT,
43 | "non_stopwords" INTEGER,
44 | "treated_words" INTEGER
45 | );
46 | CREATE UNIQUE INDEX "BLOG_POST_URL_UNQ" on blog_post (url ASC);
47 | CREATE INDEX "BLOG_POST_DATESTAMP" on blog_post (datestamp ASC);
48 |
49 | CREATE VIRTUAL TABLE abstract_fts4 USING fts4(content="abstract", abstract, treated);
50 | CREATE VIRTUAL TABLE blog_post_fts4 USING fts4(content="blog_post", content, treated);
51 |
52 |
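The two fts4 declarations above use SQLite's external-content mode (content="..."), so the full-text index is not maintained automatically. A minimal sketch of how they might be queried from R follows; the database file name "tmws.sqlite" and the use of the RSQLite package are illustrative assumptions, not something fixed by the schema.

library(RSQLite)
db <- dbConnect(SQLite(), dbname="tmws.sqlite")   # illustrative file name
# content= (external content) FTS4 tables are not populated automatically;
# the special 'rebuild' command re-indexes from the base table
dbExecute(db, "INSERT INTO abstract_fts4(abstract_fts4) VALUES('rebuild')")
# full-text query against the index, joining back to the base table via docid/rowid
hits <- dbGetQuery(db,
  "SELECT year, title FROM abstract
   WHERE id IN (SELECT docid FROM abstract_fts4 WHERE abstract_fts4 MATCH 'analytics')")
dbDisconnect(db)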
--------------------------------------------------------------------------------
/Stem Helper.R:
--------------------------------------------------------------------------------
1 | ##
2 | ## simple helper to find the stemmed forms of candidate words and also to determine the shortest form in a corpus
3 | ##
4 |
5 | library("tm")
6 |
7 | #where to find a corpus
8 | load("/home/arc1/R Projects/Text Mining Weak Signals Output/Rising and Falling Terms/Union C/2011/Corpus.RData")
9 |
10 | #space-separated list of words to stem
11 | #look.for<-"LMS VLE LCMS E-Portfolio Games Gesture Metadata Adaptive Open Social Ubiquitous Semantic Agents Cloud Broadband Video"
12 | look.for<-"cloud virtualisation virtualization saas paas"
13 | look.for<-"ebook etextbooks"
14 | look.for<-"analytics analytical analysis data"
15 | look.for<-"gesturebased gestural"
16 | look.for<-"context contextsensitive contextaware contextenriched location locationbased locationaware geospatial"
17 | look.for<-"Game gaming gamification game-based game-play"
18 | look.for<-"Immersive Standards OSS Blog Twitter Wiki Tablet Smartphone Mobile Streaming"
19 |
20 | #look.for<-tolower(removePunctuation(look.for))
21 | look.for.vec<- unlist(strsplit(look.for," "))
22 | look.for.stems<-stemDocument(tolower(removePunctuation(look.for.vec)))
23 |
24 | #now lookup shortest and prevalent forms leading to stem within the corpus
25 | shortest.words<-stemCompletion(look.for.stems,corp,type="shortest")
26 | completion.fails<-is.na(shortest.words) | shortest.words==""
27 | shortest.words[completion.fails]<-names(shortest.words[completion.fails])
28 | prevalent.words<-stemCompletion(look.for.stems,corp,type="prevalent")
29 | completion.fails<-is.na(prevalent.words) | prevalent.words==""
30 | prevalent.words[completion.fails]<-names(prevalent.words[completion.fails])
31 |
32 | #some output for easy cut/paste
33 | print("Looked-for words:")
34 | print(paste("c('",paste(look.for.vec,collapse="','"),"')", sep=""))
35 |
36 | print("Stemmed words:")
37 | print(paste("c('",paste(look.for.stems,collapse="','"),"')", sep=""))
38 |
39 | print("Shortest words in the given corpus to match these stems:")
40 | print(paste("c('",paste(shortest.words,collapse="','"),"')", sep=""))
41 | print("Prevalent words in the given corpus to match these stems:")
42 | print(paste("c('",paste(prevalent.words,collapse="','"),"')", sep=""))
43 |
--------------------------------------------------------------------------------
/WordookieDriver/WordookieDriver.pde:
--------------------------------------------------------------------------------
1 | /*
2 | Modified by Adam Cooper 2010 from an original:
3 | Copyright 2009 Michael Ogawa
4 |
5 | This file is part of Wordookie.
6 |
7 | Wordookie is free software: you can redistribute it and/or modify
8 | it under the terms of the GNU General Public License as published by
9 | the Free Software Foundation, either version 3 of the License, or
10 | (at your option) any later version.
11 |
12 | Wordookie is distributed in the hope that it will be useful,
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | GNU General Public License for more details.
16 |
17 | You should have received a copy of the GNU General Public License
18 | along with Wordookie. If not, see <http://www.gnu.org/licenses/>.
19 | */
20 |
21 | import wordookie.*;
22 | import wordookie.util.*;
23 | import wordookie.parsers.*;
24 | import java.io.*;
25 |
26 | WordWeightParser parser;
27 | Layout layout;
28 | java.util.List words;
29 | Iterator itr;
30 | PFont font;
31 | int startTime;
32 | int[] colourScheme;
33 | int scaleMin;
34 | int scaleMax;
35 |
36 | final String FILENAME = "AB Pre-sig Wordle subQ3";//append .txt to get input name, .jpg for stored image
37 |
38 | void setup()
39 | {
40 | size( 1024, 768 );
41 | smooth();
42 | // ======= Style Settings =======
43 | /*
44 | Layout options:
45 | ANYWAY - Words are angled "any which way."
46 | HALF_AND_HALF - Half the words are angled horizontally and the other half vertically.
47 | HORIZONTAL - All words are angled horizontally.
48 | MOSTLY_HORIZONTAL - Most words are angled horizontally; the rest are vertical.
49 | MOSTLY_VERTICAL - Most of the words are angled vertically; the rest are horizontal.
50 | VERTICAL - All words are angled vertically.
51 | Colour scheme options:
52 | Autumn - Reds and oranges.
53 | BlueIce - A white-to-blue theme.
54 | EasterEgg - Pastel colors.
55 | */
56 | font = createFont( "SansSerif", 32 );//the size is effectively irrelevant - see scale values
57 | scaleMin=16;//default 16
58 | scaleMax=100;//default 100. Reduce if the weight distribution is quite uniform or top-heavy (layout cannot fit words in!)
59 | int bgColour = color(0);
60 | background(bgColour);
61 | layout = new Layout( this, bgColour );
62 | layout.setAngleType( layout.HALF_AND_HALF );
63 | colourScheme = ColorStuff.Autumn;
64 | // == end Style Settings ==
65 |
66 | parser = new WordWeightParser();
67 |
68 | InputStream in = createInput( FILENAME+".txt" );
69 | try
70 | {
71 | parser.load( in );
72 | }
73 | catch( Exception ex )
74 | {
75 | ex.printStackTrace();
76 | }
77 |
78 | words = parser.getWords();
79 | Collections.sort( words );
80 | itr = words.iterator();
81 |
82 | startTime = millis();
83 | }
84 |
85 | void draw()
86 | {
87 | if( itr.hasNext() )
88 | {
89 | Word word = (Word)itr.next();
90 | println( word.toString() );
91 | int fontSize = (int)map( word.weight, parser.getMinWeight(), parser.getMaxWeight(), scaleMin, scaleMax );//map word weight onto the scaleMin..scaleMax range set above
92 | word.font = font;
93 | word.fontSize = fontSize;
94 | layout.doLayout( word );
95 | fill(colourScheme[ (int)random(colourScheme.length) ] );
96 | layout.paintWord( word );
97 | }
98 | else
99 | {
100 | save(FILENAME+".jpg");
101 | int endTime = millis();
102 | println( "Done: " + (endTime - startTime) + " msec" );
103 | noLoop();
104 | }
105 | }
106 |
107 |
108 |
--------------------------------------------------------------------------------
/commonFunctions.R:
--------------------------------------------------------------------------------
1 | ## ***Made available using The MIT License (MIT)***
2 | # Copyright (c) 2012, Adam Cooper
3 | #
4 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
5 | #
6 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
7 | #
8 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
9 | ## ************ end licence ***************
10 |
11 | source("~/R Projects/Text Mining Weak Signals/Common Functions/CustomStopwords.R")
12 |
13 | ##
14 | ## common functions for RF_Terms. No attempt is made to make these generic utility functions (a job for a rainy day or another life)
15 | ##
16 |
17 | # Given a corpus, extra metadata and a list of document IDs, extract document metadata and content into a dataframe
18 | # and optionally print out the extracted info
19 | #ExtraMeta is a data frame with Doc IDs as row names and columns "MaxBetweenness" and "StdNovelty"
20 | ExtractDocs<-function(Corp, ExtraMeta, DocIds, DateLength=4, Print=TRUE){
21 | empty.field.c<-rep(NA,length(DocIds))
22 | df<-data.frame(origin=empty.field.c, date=empty.field.c,
23 | heading=empty.field.c, authors=empty.field.c, id=empty.field.c,
24 | url=empty.field.c, dblp_url=empty.field.c,
25 | abstract=empty.field.c, std.novelty=empty.field.c, positive=empty.field.c,
26 | negative=empty.field.c, subjectivity=empty.field.c, max.betweenness=empty.field.c,
27 | stringsAsFactors=FALSE)
28 | jj<-1
29 | for (j in DocIds){
30 | #BEWARE order is critical!
31 | df[jj,]<-c(as.character(meta(Corp[[j]], tag="Origin")),
32 | substr(as.character(meta(Corp[[j]], tag="DateTimeStamp")),1,DateLength),
33 | as.character(meta(Corp[[j]], tag="Heading")),
34 | as.character(meta(Corp[[j]], tag="Author")), as.character(meta(Corp[[j]], tag="ID")),
35 | as.character(meta(Corp[[j]], tag="URL")), as.character(meta(Corp[[j]], tag="DBLP_URL")),
36 | as.character(Corp[[j]]),
37 | as.numeric(ExtraMeta[j,"StdNovelty"]),
38 | as.numeric(meta(Corp[[j]], tag="Positive")),
39 | as.numeric(meta(Corp[[j]], tag="Negative")),
40 | as.numeric(meta(Corp[[j]], tag="Subjectivity")),
41 | as.numeric(ExtraMeta[j,"MaxBetweenness"]))
42 | if(Print){
43 | print("")
44 | print(df[jj,"heading"])
45 | print(paste("Metrics: std. novelty=", df[jj,"std.novelty"],
46 | " + sentiment=", df[jj,"positive"],
47 | " - sentiment=", df[jj,"negative"],
48 | " subjectivity=", df[jj,"subjectivity"],
49 | " max betweenness=",df[jj,"max.betweenness"],sep=""))
50 | print(paste(df[jj,"origin"],df[jj,"date"],", ",df[jj,"authors"],", ", "ID=", df[jj,"id"], sep=""))
51 | print(df[jj,"abstract"])
52 | }
53 | jj<-jj+1
54 | }
55 |
56 | return(df)
57 | }
58 |
59 | LogTerms<-function(fileName, terms, words=NULL){
60 | #first clean up old sinks
61 | while(sink.number()>0)
62 | {sink()}
63 | sink(file=fileName, append=FALSE, type="output", split=TRUE)
64 | cat(paste("c(\"",paste(terms,collapse="\",\""),"\")",sep=""))
65 | cat("\n")
66 | if(!is.null(words)){
67 | cat(paste("c(\"",paste(words,collapse="\",\""),"\")",sep=""))
68 | }
69 |
70 | while(sink.number()>0)
71 | {sink()}
72 | }
73 |
74 |
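# Illustrative calls (a sketch only; `corp` and `extra.meta` are assumed to exist, with
# `extra.meta` having the corpus document IDs as row names and the columns "StdNovelty"
# and "MaxBetweenness" described above):
# picked.ids<-c("12","31")
# docs.df<-ExtractDocs(corp, extra.meta, picked.ids, DateLength=4, Print=FALSE)
# LogTerms("RisingTermsLog.txt", terms=c("analyt","gamif"), words=c("analytics","gamification"))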
75 |
76 | ##
77 | ## Apply Pearson Chi^2 test to term distribution in the documents of 2 corpora to identify
78 | ## statistically-significant rising, falling, new terms in a "recent" set vs a "past" set
79 | ## [derived from RF_Terms.R]
80 | ## A list is returned for New/Rising/Established/Falling.
81 | ## NB the DTM in $Falling is of the docs in the pastIds set whereas the other DTMs are for the recentIds
82 | ##
83 |
84 | # how many documents must the term appear in to be listed. This is in addition to the frequency thresholds. A value of 2 is expected, i.e. ignore terms that appear in only one doc
85 | #doc_count.thresh <- 2
86 | # p-value to accept the "alternative hypothesis" that there is something interesting
87 | #thresh.pval<-0.005 #i.e. accept a .5% chance that null hypothesis falsely rejected
88 | #thresh.pval.falling<-0.01 #use a more lenient threshold for falling terms
89 | #max frequency of term in the past set for eligibility as a weak signal.
90 | #Above this, significant risers are "established terms"
91 | #max.past.freq<-0.0002
92 |
93 |
94 | PearsonChanges.Corpus<-function(corpus,
95 | pastIds, recentIds,
96 | doc_count.thresh = 2,
97 | thresh.pval = 0.005,
98 | thresh.pval.falling = 0.01,
99 | max.past.freq = 0.0002,
100 | stem = TRUE, stop.words = TRUE){
101 | #sanity checks
102 | if(length(intersect(pastIds,recentIds))>0){
103 | stop("Error: pastIds contains at least one Id also in recentIds.")
104 | }
105 |
106 | #libraries
107 | require("tm")
108 | require("Snowball")
109 | require("slam")
110 | require("corpora")
111 |
112 | ##
113 | ## process the corpus to create a doc-term matrix using default or passed parameters
114 | ##
115 | corpus<-tm_map(corpus,removeNumbers)
116 | corpus<-tm_map(corpus,removePunctuation)
117 | dtm.tf<-DocumentTermMatrix(corpus, control=list(stemming=stem, stopwords=stop.words, minWordLength=3))
118 | #finally trim the DTM so that it only contains docs that we need and only terms in the needed docs
119 | dtm.tf<-dtm.tf[c(pastIds, recentIds),]
120 | dtm.tf<-dtm.tf[,col_sums(dtm.tf)>0]
121 |
122 | ##
123 | ## segment the DTM according to the two sets of document ids passed in.
124 | ##
125 | # it is helpful to retain the same Terms in each DTM for when comparisons are made
126 | dtm.tf.past<-dtm.tf[pastIds,]
127 | dtm.tf.recent<-dtm.tf[recentIds,]
128 |
129 | ##
130 | ## Make sure that there are at least doc_count.thresh docs containing any given term
131 | ##
132 | dtm.bin<-weightBin(dtm.tf)
133 | dtm.tf.past<-dtm.tf.past[,col_sums(dtm.bin)>=doc_count.thresh]
134 | dtm.tf.recent<-dtm.tf.recent[,col_sums(dtm.bin)>=doc_count.thresh]
135 |
136 | ##
137 | ## aggregate statistics
138 | ##
139 | term.sums.past<-col_sums(dtm.tf.past)
140 | tsp.all<-sum(term.sums.past)
141 | term.sums.recent<-col_sums(dtm.tf.recent)
142 | tsr.all<-sum(term.sums.recent)
143 |
144 | ##
145 | ## GET some boolean filters for three groups, Rising, Falling and New. No decision on significance yet!
146 | ##
147 | #term sums of 0 in the past must be new terms, since we know the corpus sum>0
148 | new.term_ids.bool <- (term.sums.past==0)
149 | #which terms should be considered in rising/falling?
150 | rise.term_ids.bool <- (term.sums.recent/tsr.all>term.sums.past/tsp.all) & (term.sums.past>0)
151 | fall.term_ids.bool <- (term.sums.recent/tsr.allmax.past.freq
189 | p.established.rising <- p.rising[established.term_ids.bool]
190 | p.rising <- p.rising[!established.term_ids.bool]
191 | established.rise.ratio<-rise.ratio[established.term_ids.bool]
192 | rise.ratio<-rise.ratio[!established.term_ids.bool]
193 |
194 |
195 | ##
196 | ## Which documents in the corpus contain terms in the relevant new/rising/falling/established sets
197 | ## Lists are convenient structures for returning values
198 | ##
199 |
200 | if(is.na(p.new[1])){
201 | l.new<-NULL
202 | }else{
203 | dtm.tf.new<-dtm.tf.recent[,names(p.new)]
204 | dtm.tf.new<-dtm.tf.new[row_sums(dtm.tf.new)>0]
205 | newIds<-Docs(dtm.tf.new)
206 | l.new<-list(DTM = dtm.tf.new,
207 | Frequency = term.sums.new,
208 | P = p.new)
209 | }
210 | if(is.na(p.rising[1])){
211 | rising<-NULL
212 | }else{
213 | dtm.tf.rising<-dtm.tf.recent[,names(p.rising)]
214 | dtm.tf.rising<-dtm.tf.rising[row_sums(dtm.tf.rising)>0]
215 | risingIds<-Docs(dtm.tf.rising)
216 | rising<-list(DTM = dtm.tf.rising,
217 | Frequency = col_sums(dtm.tf.rising)/tsr.all,
218 | BaselineFrequency = col_sums(dtm.tf.past[,names(p.rising)])/tsp.all,
219 | Change = rise.ratio,
220 | P = p.rising)
221 | }
222 | if(is.na(p.established.rising[1])){
223 | established<-NULL
224 | }else{
225 | dtm.tf.established<-dtm.tf.recent[,names(p.established.rising)]
226 | dtm.tf.established<-dtm.tf.established[row_sums(dtm.tf.established)>0]
227 | establishedIds<-Docs(dtm.tf.established)
228 | established<-list(DTM = dtm.tf.established,
229 | Frequency = col_sums(dtm.tf.established)/tsr.all,
230 | BaselineFrequency = col_sums(dtm.tf.past[,names(p.established.rising)])/tsp.all,
231 | Change = established.rise.ratio,
232 | P = p.established.rising)
233 | }
234 | if(is.na(p.falling[1])){
235 | falling<-NULL
236 | }else{
237 | dtm.tf.falling<-dtm.tf.past[,names(p.falling)]
238 | dtm.tf.falling<-dtm.tf.falling[row_sums(dtm.tf.falling)>0]
239 | fallingIds<-Docs(dtm.tf.falling)
240 | falling<-list(DTM = dtm.tf.falling,
241 | Frequency = col_sums(dtm.tf.recent[,names(p.falling)])/tsr.all,
242 | BaselineFrequency = col_sums(dtm.tf.falling)/tsp.all,
243 | Change = fall.ratio,
244 | P = p.falling)
245 | }
246 |
247 | #combine into a single list for the return "class"
248 | pearsonChanges<-list(DTM.tf=dtm.tf, New=l.new, Established=established, Rising=rising, Falling=falling)
249 | class(pearsonChanges)<-"PearsonChanges"
250 |
251 | pearsonChanges #return value
252 | }
--------------------------------------------------------------------------------
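A usage sketch for PearsonChanges.Corpus() (illustrative only; `corp` stands for a tm corpus such as the one loaded in Stem Helper.R, and `past.ids`/`recent.ids` are assumed to be non-overlapping document ID vectors prepared by the calling code, e.g. split by year):

library("tm")
# corp: a pre-built tm Corpus; past.ids / recent.ids: non-overlapping document ID vectors
changes <- PearsonChanges.Corpus(corp, pastIds=past.ids, recentIds=recent.ids,
                                 doc_count.thresh=2, thresh.pval=0.005,
                                 thresh.pval.falling=0.01, max.past.freq=0.0002)
# weak-signal candidates: change ratios and p-values of the rising terms
changes$Rising$Change
changes$Rising$P
# documents in the recent set containing those terms (cf. ExtractDocs above)
Docs(changes$Rising$DTM)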