├── .gitignore ├── Classifiers ├── MDW_by_subject.csv ├── Naive-Bayes-Classifier.py └── PracticePDFExtractor.py ├── LICENSE ├── README.md ├── UI └── SimpleQuerier.html ├── cc-by-nc-sa-3.0.md ├── extractor_research ├── extractors │ ├── __init__.py │ ├── miner.py │ ├── pdf2.py │ ├── pdfbox-app-1.8.5.jar │ ├── pdfbox.py │ ├── textstream.py │ ├── textstream │ │ ├── LICENSE │ │ ├── PDFTextStream.jar │ │ ├── TextStream.class │ │ └── TextStream.java │ └── xpdf.py ├── input │ ├── American_Opera_Rev_Syllabus.pdf │ ├── Leonard_Intro_Musicology_syllabus(2).pdf │ ├── Leonard_Intro_Musicology_syllabus.pdf │ ├── Leonard_Victorian_Music_syllabus_docx.pdf │ ├── Leonard_Women_Music_syllabus.pdf │ ├── Music_since_1900_syllabus.pdf │ └── pride_and_prej │ │ ├── 1.pdf │ │ ├── 2.pdf │ │ ├── 3.pdf │ │ └── plain.txt ├── main.py ├── output │ ├── American_Opera_Rev_Syllabus.html │ ├── American_Opera_Rev_Syllabus.xml │ ├── Leonard_Intro_Musicology_syllabus(2).html │ ├── Leonard_Intro_Musicology_syllabus(2).xml │ ├── Leonard_Intro_Musicology_syllabus.html │ ├── Leonard_Intro_Musicology_syllabus.xml │ ├── Leonard_Victorian_Music_syllabus_docx.html │ ├── Leonard_Victorian_Music_syllabus_docx.xml │ ├── Leonard_Women_Music_syllabus.html │ ├── Leonard_Women_Music_syllabus.xml │ ├── Music_since_1900_syllabus.html │ ├── Music_since_1900_syllabus.xml │ └── pride_and_prej │ │ ├── miner_with_layout │ │ └── 1.txt │ │ ├── miner_without_layout │ │ └── 1.txt │ │ ├── pdf2_default │ │ └── 1.txt │ │ ├── pdfbox_default │ │ └── 1.txt │ │ ├── textstream_default │ │ └── 1.txt │ │ ├── xpdf_with_layout │ │ └── 1.txt │ │ └── xpdf_without_layout │ │ └── 1.txt ├── stats │ └── pride_and_prej │ │ └── 1_speed_log.txt └── visualize │ ├── __init__.py │ └── html_parser.py ├── gpl-3.0.md ├── opensyllabus ├── __init__.py ├── config.py ├── core │ ├── __init__.py │ ├── extractor.py │ ├── ingestion.py │ ├── mongo.py │ ├── ocr.py │ └── utils.py ├── run_getemptydocs.py ├── run_getstats.py └── run_ingestion.py ├── sanitize.py └── twitter ├── .gitignore └── twitter.py /.gitignore: -------------------------------------------------------------------------------- 1 | .project 2 | .pydevproject 3 | *.pyc 4 | scrapes/mobileread/data/* 5 | -------------------------------------------------------------------------------- /Classifiers/MDW_by_subject.csv: -------------------------------------------------------------------------------- 1 | SUBJECT,anthropology,art,biology,chemistry,economics,english,french,geology,german,history,italian,latin,math,music,philosophy,physical-education,physics,political-science,psychology,sociology,spanish,theatre 2 | 0,christmas,sculpture,genetic,chem,econ,paraphrase,vous,tectonics,frei,hist,katerinov,bayerle,tutoring,archetto,meditations,vigorous,erasing,ucsd,psychodynamic,soc,curso,rob 3 | 1,neandertals,stokstad,genetics,aqueous,maddox,informing,fren,sedimentary,ferien,aftermath,clotilde,hbayerl,exponential,concert,hume,initiate,frosso,gospel,psypos,mcquaide,repaso,mowen 4 | 2,archaeology,marilyn,cells,bonding,monopoly,aivey,devoir,rocks,und,criticizing,boriosi,fianl,logarithmic,marchetto,hackett,gym,reproducible,enemy,improperly,findings,tulo,onstage 5 | 3,kalahari,studio,mutation,guilty,microeconomics,ivey,sur,geologic,schoene,dleinwe,katerin,mowethfr,trigonometric,cough,nicomachean,fitness,planner,lenin,conviction,stratification,para,accommodated 6 | 4,kinship,diverging,transport,mole,aplia,adriane,cours,metamorphic,deutschkurs,youngblood,vacanze,mowefr,thoughts,melody,phil_ox,gymnasium,reproducibility,mussolini,patti,deviance,composici,husband 7 | 5,anthro,interpretative,outcrops,reactions,unemployment,persuasively,mercredi,minerals,lehrbuch,sashmor,istruzione,whiteley,predicate,governs,enquiry,worn,seitaridou,donald,infancy,ehrenreich,hispanic,instill 8 | 6,abu,sketchbook,outcrop,stoichiometry,athletes,disruption,pour,weathering,wie,strives,italiano,gould,trigonometry,choral,unjust,jewelry,momentum,democracies,psychologist,medicalization,por,script 9 | 7,biocultural,patron,respiration,chemist,oligopoly,liveliness,jours,sanctioned,arbeitsbuch,ashmore,introduzione,conspicuous,theorem,poise,phil,fax,serway,election,freud,dimed,enero,imaginatively 10 | 8,anthropologist,cubism,membrane,expects,monopolistic,pertinence,expos,mineral,wiederholung,hst,albergo,ovid,binomial,piano,eewilso,adhering,eseitar,unfamiliar,mindedness,notions,introducci,talented 11 | 9,symbols,camille,chromosome,thermochemistry,scholastic,drama,lundi,igneous,ueber,constituting,letture,punctual,emergencies,skirt,mistaken,wellness,electricity,jeffrey,unguarded,interactionism,abril,clog 12 | 10,anthropology,cottrell,inheritance,configuration,macroeconomics,adherence,semaine,glaciation,abschlusspruefungen,impermissible,esame,linguist,unaided,accompanist,kant,sweat,optics,hardship,adolescence,mcknight,febrero,proximity 13 | 11,goldschmidt,receptivity,knisely,carbon,mankiw,midsummer,tre,volcanism,kapitel,scrupulous,citta,metamorphoses,optimization,receipt,phaedo,disclose,magnetism,electoral,proclaims,homicide,presentaci,finalize 14 | 12,aping,egg,mitosis,programmable,converted,tangential,avril,henderson,ich,archival,varie,informs,antiderivatives,eraser,realizing,offensive,vuille,cutting,psychologists,orgins,leer,experiencing 15 | 13,caveman,dialogic,meiosis,chromatography,preclude,typo,bureau,deformation,zum,instant,viaggio,rapid,polynomial,ticket,decipher,pant,torque,bargaining,testable,looks,puntos,clock 16 | 14,genital,fauvism,biodiversity,hybridization,ninkovic,refusing,detectable,crustal,dien,mississippi,testi,translating,wherever,repertoire,kress,probation,rocket,partisan,deception,cockerham,trabajo,entertain 17 | 15,schick,hardback,photosynthesis,spectroscopy,taxation,persuasiveness,mardi,hydrocarbon,stadt,lowing,capitolo,save,separable,chorale,influencing,taker,emailing,madison,humanistic,lindsey,todo,unapproved 18 | 16,hijras,transportable,nitya,organic,jninkov,neglect,jsvient,earthquakes,diens,footnoted,merc,latin,nit,chamber,unfortunately,inappropriately,weighting,ssb,abnormal,mckinlay,lengua,experiential 19 | 17,hodder,globe,vascular,solvent,curved,haphazard,printemps,busch,fruehlingssemester,awake,corso,consequently,inverse,intonation,philosophers,reoccurring,farhan,pols,prenatal,outlaw,otro,mysteries 20 | 18,knauft,criticality,njacob,gases,breech,harassment,svienty,volcanoes,sommerferien,curse,eserciziario,kept,poisson,blouse,groundwork,endurance,llewellyn,bshapir,differentiating,gabe,sobre,warmups 21 | 19,mayr,tempera,transmission,processed,jasminka,proceeding,vrier,deserve,den,sponsorship,vacanza,henry,sigma,ankle,dictated,assess,segre,votes,therapies,catches,las,suzan 22 | -------------------------------------------------------------------------------- /Classifiers/Naive-Bayes-Classifier.py: -------------------------------------------------------------------------------- 1 | import textblob 2 | import numpy 3 | 4 | class Document(object): 5 | 6 | STOPWORDS = "are you my I a and these to it with me your not but him do so" 7 | 8 | @classmethod 9 | def make_stop_words(cls, stopwords): 10 | return stopwords.lower().split() 11 | 12 | def __init__(self, text, label=None): 13 | self.text = text 14 | self.label = label 15 | self.stopwords = Document.make_stop_words(Document.STOPWORDS) 16 | self.wordVector = None 17 | 18 | def get_label(self): 19 | return self.label 20 | 21 | def split_and_remove_stop_words(self): 22 | ## split and make all the words lower case 23 | splitText = self.text.lower().split() 24 | scrubbedText = [] 25 | for word in splitText: 26 | if word not in self.stopwords: 27 | scrubbedText.append(word) 28 | self.wordVector = scrubbedText 29 | 30 | def count_tokens(self): 31 | return len(self.wordVector) 32 | 33 | def get_word_frequencies(self): 34 | wordFreq = {} 35 | for word in self.wordVector: 36 | if word not in wordFreq: 37 | wordFreq[word] = 1 38 | else: 39 | wordFreq[word] += 1 40 | return wordFreq 41 | 42 | def get_vocabulary(self): 43 | wordFreq = self.get_word_frequencies() 44 | return wordFreq.keys() 45 | 46 | class DocDatabase(object): 47 | 48 | def __init__(self, documents): 49 | self.documents = documents 50 | self.classes = self.get_classes() 51 | self.vocabulary = self.construct_complete_vocabulary() 52 | self.priorProbs = self.calc_prior_probs() 53 | self.conditionalProbs = self.calc_conditional_prob_per_word() 54 | 55 | def get_classes(self): 56 | classes = [] 57 | for d in self.documents: 58 | label = d.get_label() 59 | if label not in classes: 60 | classes.append(label) 61 | return classes 62 | 63 | def count_docs_per_class(self): 64 | """ Determine the number of documents per class """ 65 | classCounts = { c:0 for c in self.classes } 66 | for d in self.documents: 67 | label = d.get_label() 68 | classCounts[label] += 1 69 | return classCounts 70 | 71 | def calc_prior_probs(self): 72 | """ Determine the probabilty of each class. This is also known as the 73 | prior probability. """ 74 | classCounts = self.count_docs_per_class() 75 | totalNumTexts = sum(classCounts.values()) 76 | classProbs = { c:( classCounts[c] / float(totalNumTexts) ) for c in classCounts.keys() } 77 | return classProbs 78 | 79 | def construct_complete_vocabulary(self): 80 | """ Generate a complete list of vocabulary words across all documents """ 81 | vocab = set([]) 82 | for d in self.documents: 83 | vocab = vocab.union(set(d.get_vocabulary())) 84 | return vocab 85 | 86 | def calc_word_freq_per_class(self): 87 | """ Determine the word frequencies for each class """ 88 | classVocab = {} 89 | for c in self.classes: 90 | ## initialize the word frequencies to 0 91 | classVocab[c] = { word:0 for word in self.vocabulary } 92 | for d in self.documents: 93 | myClass = classVocab[d.get_label()] 94 | myFrequencies = d.get_word_frequencies() 95 | for word in myFrequencies.keys(): 96 | myClass[word] += myFrequencies[word] 97 | return classVocab 98 | 99 | def count_tokens_per_class(self): 100 | countTokens = { c:0 for c in self.classes } 101 | for d in self.documents: 102 | countTokens[d.get_label()] += d.count_tokens() 103 | return countTokens 104 | 105 | def calc_conditional_prob_per_word(self): 106 | """ We will use LAPLACE ADD-1 SMOOTHING: 107 | p(word | class ) = [ # of tokens of word in class ) + 1 ] / [ ( total number of tokens in class ) + VOCAB_SIZE] """ 108 | conditionalProbs = self.calc_word_freq_per_class() 109 | countTokens = self.count_tokens_per_class() 110 | for c in conditionalProbs.keys(): 111 | for w in conditionalProbs[c].keys(): 112 | conditionalProbs[c][w] = float( conditionalProbs[c][w] + 1) / float( countTokens[c] + len(self.vocabulary)) 113 | return conditionalProbs 114 | 115 | def prior_prob(self, givenClass): 116 | return self.priorProbs[givenClass] 117 | 118 | def conditional_prob(self, givenClass, word): 119 | ## if the word is actually contained in the known vocabulary for the class, 120 | ## return the conditional probability 121 | if word in self.conditionalProbs[givenClass].keys(): 122 | return self.conditionalProbs[givenClass][word] 123 | ## if the word is unknown, then use the following smoothing approximation 124 | ## Pr(word) = 1 / ( VOCAB-SIZE + 1 ) 125 | else: 126 | return 1 / float(len(self.vocabulary) + 1) 127 | 128 | 129 | def classify(self, testDoc): 130 | """ Given a test document, determine the most probable classification """ 131 | ## Get the word frequencies for the document 132 | doc = Document(testDoc) 133 | doc.split_and_remove_stop_words() 134 | docWordFreqs = doc.get_word_frequencies() 135 | docWords = docWordFreqs.keys() 136 | ## P(c|w) = [ P(w|c) ^ (count_w) ] * P(c) 137 | results = {} 138 | for c in self.classes: 139 | productOfConditionals = numpy.prod(map(lambda x: self.conditional_prob(c,x) ** docWordFreqs[x], docWords)) 140 | probOfClass = productOfConditionals * self.prior_prob(c) 141 | results[c] = probOfClass 142 | bestLabel = max( results.items(), key=lambda x: x[1]) 143 | return bestLabel[0] 144 | 145 | def classify_test_set(self, testSet): 146 | return map(lambda x: self.classify(x), testSet) 147 | 148 | 149 | def test_doc(): 150 | class1 = [ "How are you my friends I brought you a sandwich", 151 | "I found a sandwich and these beers and I wanted to know you wanted to share it with me", 152 | "Listen my friend I going to get a beer tonight you want to join me" ] 153 | class2 = [ "Friends Romans countryman lend me your ears", 154 | "I come not to praise caesar but to bury him gentle romans", 155 | "mighty caesar do you lie so low" ] 156 | testSet = [ "Beers sandwich tonight", "caesar romans beers", "bury bury friends sandwiches share" ] 157 | documents = [] 158 | for doc in class1: 159 | docObject = Document(doc, 'class1') 160 | docObject.split_and_remove_stop_words() 161 | documents.append(docObject) 162 | for doc in class2: 163 | docObject = Document(doc, 'class2') 164 | docObject.split_and_remove_stop_words() 165 | documents.append(docObject) 166 | 167 | myDD = DocDatabase(documents) 168 | myDD.construct_complete_vocabulary() 169 | print myDD.classify_test_set(testSet) 170 | #print myDD.documents 171 | #print myDD.get_classes() 172 | #print myDD.vocabulary 173 | #print myDD.calc_word_freq_per_class() 174 | #print myDD.count_tokens_per_class() 175 | #print myDD.calc_conditional_prob_per_word() 176 | 177 | """ 178 | doc1 = Document(class1[0], 'class1') 179 | doc1.split_and_remove_stop_words() 180 | print doc1.wordVector 181 | print doc1.count_tokens() 182 | print doc1.get_word_frequencies() 183 | print doc1.get_vocabulary() 184 | """ 185 | 186 | test_doc() 187 | -------------------------------------------------------------------------------- /Classifiers/PracticePDFExtractor.py: -------------------------------------------------------------------------------- 1 | import PyPDF2 2 | import csv 3 | import pdfminer 4 | import os 5 | 6 | 7 | def extract_text_using_pypdf(fileName): 8 | pdf = PyPDF2.PdfFileReader(open(fileName, "rb")) 9 | allText = [] 10 | for page in pdf.pages: 11 | allText.append(page.extractText()) 12 | return allText 13 | 14 | def extract_text_using_pdf_miner(inputFile, outputFile): 15 | """ This method calls a command line argument from the pdfminer library 16 | Indicate txt or html by the file name output.txt or output.html 17 | For more commands, see http://www.unixuser.org/~euske/python/pdfminer/ 18 | """ 19 | commandString = "pdf2txt.py -o " + outputFile + " " + inputFile 20 | os.system(commandString) 21 | 22 | def export_to_csv(inputFileName, csvFileName): 23 | f = open(inputFileName, 'r') 24 | with open(csvFileName, 'wb') as csvfile: 25 | myWriter = csv.writer(csvfile, delimiter='\t') 26 | myWriter.writerow(f.readlines()) 27 | 28 | def test(): 29 | #textVector = extract_text_using_pypdf("Lunch-Money.pdf") 30 | #export_to_csv(textVector[2:], "Lunch-Money.csv") 31 | extract_text_using_pdf_miner("E3562014236085.pdf", "E3562014236085.html") 32 | export_to_csv("E3562014236085.html", "E3562014236085.csv") 33 | 34 | 35 | if __name__ == '__main__': 36 | test() 37 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Code License 2 | 3 | This code is released under the [GPL], version 2 or later: 4 | 5 | This program is free software; you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation; either version 2 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program; if not, write to the Free Software 17 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 18 | 19 | The GNU General Public License is available in the file COPYING in 20 | the source distribution. On Debian systems, the complete text of the 21 | GPL can be found in `/usr/share/common-licenses/GPL`. 22 | 23 | [GPL]: http://www.gnu.org/copyleft/gpl.html 24 | 25 | Documentation License 26 | 27 | All documentation contained in this repository is licensed under a 28 | Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License. 29 | For details, please see the license file located in this directory (./cc-by-nc-sa-3.0.md) or 30 | visit [the link](http://creativecommons.org/licenses/by-nc-sa/3.0/deed.en_US). 31 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | opensyllabus 2 | ============ 3 | 4 | *The Open Syllabus Project* seeks to promote institutional cooperation in the task of gathering and analyzing a significant corpus of syllabi. Our principles are as follows: 5 | 6 | *We believe in openness and transparency while respecting the intellectual property rights of all constituents.* We create and make publicly available a rich dataset of metadata, while protecting the original documents in a secure “research sandbox” environment. 7 | 8 | *We believe in data-driven innovation.* A critical mass of documents can foster new tools, drive policy change, enable best-practices, provide metrics, and aid in search, discovery, and the creation of new course materials. 9 | 10 | *We invite participating scholars and institutions to collaborate and benefit* from the project’s research, platform- and tool-development experiments. Our team includes the nation’s leading librarians and legal scholars who are committed to an ongoing dialog about knowledge sharing, preservation, and accessibility. 11 | 12 | Email: 13 | share [at] opensyllabusproject [dot] org 14 | 15 | Twitter: 16 | @opensyllabus 17 | -------------------------------------------------------------------------------- /UI/SimpleQuerier.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 12 | 13 | Simple Page for Querying Open Syllabus Project REST API 14 | 23 | 24 | 71 | 72 | 73 | 74 |

Simple Page for querying Open Syllabus Project REST API

75 |

76 | Instructions: (1) specify parameter values, (2) click the Query button, (3) scroll down to review the results. 77 |

78 |

79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 132 | 133 |
Parameter NameParameter ValueParameter Information
Base URLThe URL for querying the Open Syllabus Project MongoDb. (Get this from the OSP.)
UsernameYour username. (Get this from the OSP.)
PasswordYour password. (Get this from the OSP.)
criteriaSearch criteria, formatted as a JSON object.
fieldsNames of fields, formatted into a JSON object, that you want in the results.
sortNames of fields, formatted into a JSON object, by which you want the results to be sorted.
skipnumber
limitnumber
explain
batch_sizenumber to return
Constructed URL for queryYou do not need to edit this because it is dynamically constructed from the preceding 129 | parameter values you provide. However, if desired, you can skip providing parameter values 130 | other than base URL, username, and password and provide your own constructed URL directly. 131 |
134 |

135 |

136 | 137 |

138 |

139 | 140 | 141 | 142 | 143 |
Query Results
144 |

145 |

146 |

147 |

148 | 149 | 150 | -------------------------------------------------------------------------------- /cc-by-nc-sa-3.0.md: -------------------------------------------------------------------------------- 1 | # Creative Commons 2 | 3 | 4 | ## Creative Commons Legal Code 5 | 6 | ### Attribution-NonCommercial-ShareAlike 3.0 Unported 7 | 8 | 9 | CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE LEGAL SERVICES. DISTRIBUTION OF THIS LICENSE DOES NOT CREATE AN ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES REGARDING THE INFORMATION PROVIDED, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM ITS USE. 10 | 11 | *License* 12 | 13 | THE WORK (AS DEFINED BELOW) IS PROVIDED UNDER THE TERMS OF THIS CREATIVE COMMONS PUBLIC LICENSE ("CCPL" OR "LICENSE"). THE WORK IS PROTECTED BY COPYRIGHT AND/OR OTHER APPLICABLE LAW. ANY USE OF THE WORK OTHER THAN AS AUTHORIZED UNDER THIS LICENSE OR COPYRIGHT LAW IS PROHIBITED. 14 | 15 | BY EXERCISING ANY RIGHTS TO THE WORK PROVIDED HERE, YOU ACCEPT AND AGREE TO BE BOUND BY THE TERMS OF THIS LICENSE. TO THE EXTENT THIS LICENSE MAY BE CONSIDERED TO BE A CONTRACT, THE LICENSOR GRANTS YOU THE RIGHTS CONTAINED HERE IN CONSIDERATION OF YOUR ACCEPTANCE OF SUCH TERMS AND CONDITIONS. 16 | 17 | **1. Definitions** 18 | 19 | "Adaptation" means a work based upon the Work, or upon the Work and other pre-existing works, such as a translation, adaptation, derivative work, arrangement of music or other alterations of a literary or artistic work, or phonogram or performance and includes cinematographic adaptations or any other form in which the Work may be recast, transformed, or adapted including in any form recognizably derived from the original, except that a work that constitutes a Collection will not be considered an Adaptation for the purpose of this License. For the avoidance of doubt, where the Work is a musical work, performance or phonogram, the synchronization of the Work in timed-relation with a moving image ("synching") will be considered an Adaptation for the purpose of this License. 20 | "Collection" means a collection of literary or artistic works, such as encyclopedias and anthologies, or performances, phonograms or broadcasts, or other works or subject matter other than works listed in Section 1(g) below, which, by reason of the selection and arrangement of their contents, constitute intellectual creations, in which the Work is included in its entirety in unmodified form along with one or more other contributions, each constituting separate and independent works in themselves, which together are assembled into a collective whole. A work that constitutes a Collection will not be considered an Adaptation (as defined above) for the purposes of this License. 21 | "Distribute" means to make available to the public the original and copies of the Work or Adaptation, as appropriate, through sale or other transfer of ownership. 22 | "License Elements" means the following high-level license attributes as selected by Licensor and indicated in the title of this License: Attribution, Noncommercial, ShareAlike. 23 | "Licensor" means the individual, individuals, entity or entities that offer(s) the Work under the terms of this License. 24 | "Original Author" means, in the case of a literary or artistic work, the individual, individuals, entity or entities who created the Work or if no individual or entity can be identified, the publisher; and in addition (i) in the case of a performance the actors, singers, musicians, dancers, and other persons who act, sing, deliver, declaim, play in, interpret or otherwise perform literary or artistic works or expressions of folklore; (ii) in the case of a phonogram the producer being the person or legal entity who first fixes the sounds of a performance or other sounds; and, (iii) in the case of broadcasts, the organization that transmits the broadcast. 25 | "Work" means the literary and/or artistic work offered under the terms of this License including without limitation any production in the literary, scientific and artistic domain, whatever may be the mode or form of its expression including digital form, such as a book, pamphlet and other writing; a lecture, address, sermon or other work of the same nature; a dramatic or dramatico-musical work; a choreographic work or entertainment in dumb show; a musical composition with or without words; a cinematographic work to which are assimilated works expressed by a process analogous to cinematography; a work of drawing, painting, architecture, sculpture, engraving or lithography; a photographic work to which are assimilated works expressed by a process analogous to photography; a work of applied art; an illustration, map, plan, sketch or three-dimensional work relative to geography, topography, architecture or science; a performance; a broadcast; a phonogram; a compilation of data to the extent it is protected as a copyrightable work; or a work performed by a variety or circus performer to the extent it is not otherwise considered a literary or artistic work. 26 | "You" means an individual or entity exercising rights under this License who has not previously violated the terms of this License with respect to the Work, or who has received express permission from the Licensor to exercise rights under this License despite a previous violation. 27 | "Publicly Perform" means to perform public recitations of the Work and to communicate to the public those public recitations, by any means or process, including by wire or wireless means or public digital performances; to make available to the public Works in such a way that members of the public may access these Works from a place and at a place individually chosen by them; to perform the Work to the public by any means or process and the communication to the public of the performances of the Work, including by public digital performance; to broadcast and rebroadcast the Work by any means including signs, sounds or images. 28 | "Reproduce" means to make copies of the Work by any means including without limitation by sound or visual recordings and the right of fixation and reproducing fixations of the Work, including storage of a protected performance or phonogram in digital form or other electronic medium. 29 | 30 | **2. Fair Dealing Rights.** Nothing in this License is intended to reduce, limit, or restrict any uses free from copyright or rights arising from limitations or exceptions that are provided for in connection with the copyright protection under copyright law or other applicable laws. 31 | 32 | **3. License Grant.** Subject to the terms and conditions of this License, Licensor hereby grants You a worldwide, royalty-free, non-exclusive, perpetual (for the duration of the applicable copyright) license to exercise the rights in the Work as stated below: 33 | 34 | to Reproduce the Work, to incorporate the Work into one or more Collections, and to Reproduce the Work as incorporated in the Collections; 35 | to create and Reproduce Adaptations provided that any such Adaptation, including any translation in any medium, takes reasonable steps to clearly label, demarcate or otherwise identify that changes were made to the original Work. For example, a translation could be marked "The original work was translated from English to Spanish," or a modification could indicate "The original work has been modified."; 36 | to Distribute and Publicly Perform the Work including as incorporated in Collections; and, 37 | to Distribute and Publicly Perform Adaptations. 38 | The above rights may be exercised in all media and formats whether now known or hereafter devised. The above rights include the right to make such modifications as are technically necessary to exercise the rights in other media and formats. Subject to Section 8(f), all rights not expressly granted by Licensor are hereby reserved, including but not limited to the rights described in Section 4(e). 39 | 40 | **4. Restrictions.** The license granted in Section 3 above is expressly made subject to and limited by the following restrictions: 41 | 42 | You may Distribute or Publicly Perform the Work only under the terms of this License. You must include a copy of, or the Uniform Resource Identifier (URI) for, this License with every copy of the Work You Distribute or Publicly Perform. You may not offer or impose any terms on the Work that restrict the terms of this License or the ability of the recipient of the Work to exercise the rights granted to that recipient under the terms of the License. You may not sublicense the Work. You must keep intact all notices that refer to this License and to the disclaimer of warranties with every copy of the Work You Distribute or Publicly Perform. When You Distribute or Publicly Perform the Work, You may not impose any effective technological measures on the Work that restrict the ability of a recipient of the Work from You to exercise the rights granted to that recipient under the terms of the License. This Section 4(a) applies to the Work as incorporated in a Collection, but this does not require the Collection apart from the Work itself to be made subject to the terms of this License. If You create a Collection, upon notice from any Licensor You must, to the extent practicable, remove from the Collection any credit as required by Section 4(d), as requested. If You create an Adaptation, upon notice from any Licensor You must, to the extent practicable, remove from the Adaptation any credit as required by Section 4(d), as requested. 43 | You may Distribute or Publicly Perform an Adaptation only under: (i) the terms of this License; (ii) a later version of this License with the same License Elements as this License; (iii) a Creative Commons jurisdiction license (either this or a later license version) that contains the same License Elements as this License (e.g., Attribution-NonCommercial-ShareAlike 3.0 US) ("Applicable License"). You must include a copy of, or the URI, for Applicable License with every copy of each Adaptation You Distribute or Publicly Perform. You may not offer or impose any terms on the Adaptation that restrict the terms of the Applicable License or the ability of the recipient of the Adaptation to exercise the rights granted to that recipient under the terms of the Applicable License. You must keep intact all notices that refer to the Applicable License and to the disclaimer of warranties with every copy of the Work as included in the Adaptation You Distribute or Publicly Perform. When You Distribute or Publicly Perform the Adaptation, You may not impose any effective technological measures on the Adaptation that restrict the ability of a recipient of the Adaptation from You to exercise the rights granted to that recipient under the terms of the Applicable License. This Section 4(b) applies to the Adaptation as incorporated in a Collection, but this does not require the Collection apart from the Adaptation itself to be made subject to the terms of the Applicable License. 44 | You may not exercise any of the rights granted to You in Section 3 above in any manner that is primarily intended for or directed toward commercial advantage or private monetary compensation. The exchange of the Work for other copyrighted works by means of digital file-sharing or otherwise shall not be considered to be intended for or directed toward commercial advantage or private monetary compensation, provided there is no payment of any monetary compensation in con-nection with the exchange of copyrighted works. 45 | If You Distribute, or Publicly Perform the Work or any Adaptations or Collections, You must, unless a request has been made pursuant to Section 4(a), keep intact all copyright notices for the Work and provide, reasonable to the medium or means You are utilizing: (i) the name of the Original Author (or pseudonym, if applicable) if supplied, and/or if the Original Author and/or Licensor designate another party or parties (e.g., a sponsor institute, publishing entity, journal) for attribution ("Attribution Parties") in Licensor's copyright notice, terms of service or by other reasonable means, the name of such party or parties; (ii) the title of the Work if supplied; (iii) to the extent reasonably practicable, the URI, if any, that Licensor specifies to be associated with the Work, unless such URI does not refer to the copyright notice or licensing information for the Work; and, (iv) consistent with Section 3(b), in the case of an Adaptation, a credit identifying the use of the Work in the Adaptation (e.g., "French translation of the Work by Original Author," or "Screenplay based on original Work by Original Author"). The credit required by this Section 4(d) may be implemented in any reasonable manner; provided, however, that in the case of a Adaptation or Collection, at a minimum such credit will appear, if a credit for all contributing authors of the Adaptation or Collection appears, then as part of these credits and in a manner at least as prominent as the credits for the other contributing authors. For the avoidance of doubt, You may only use the credit required by this Section for the purpose of attribution in the manner set out above and, by exercising Your rights under this License, You may not implicitly or explicitly assert or imply any connection with, sponsorship or endorsement by the Original Author, Licensor and/or Attribution Parties, as appropriate, of You or Your use of the Work, without the separate, express prior written permission of the Original Author, Licensor and/or Attribution Parties. 46 | For the avoidance of doubt: 47 | 48 | Non-waivable Compulsory License Schemes. In those jurisdictions in which the right to collect royalties through any statutory or compulsory licensing scheme cannot be waived, the Licensor reserves the exclusive right to collect such royalties for any exercise by You of the rights granted under this License; 49 | Waivable Compulsory License Schemes. In those jurisdictions in which the right to collect royalties through any statutory or compulsory licensing scheme can be waived, the Licensor reserves the exclusive right to collect such royalties for any exercise by You of the rights granted under this License if Your exercise of such rights is for a purpose or use which is otherwise than noncommercial as permitted under Section 4(c) and otherwise waives the right to collect royalties through any statutory or compulsory licensing scheme; and, 50 | Voluntary License Schemes. The Licensor reserves the right to collect royalties, whether individually or, in the event that the Licensor is a member of a collecting society that administers voluntary licensing schemes, via that society, from any exercise by You of the rights granted under this License that is for a purpose or use which is otherwise than noncommercial as permitted under Section 4(c). 51 | Except as otherwise agreed in writing by the Licensor or as may be otherwise permitted by applicable law, if You Reproduce, Distribute or Publicly Perform the Work either by itself or as part of any Adaptations or Collections, You must not distort, mutilate, modify or take other derogatory action in relation to the Work which would be prejudicial to the Original Author's honor or reputation. Licensor agrees that in those jurisdictions (e.g. Japan), in which any exercise of the right granted in Section 3(b) of this License (the right to make Adaptations) would be deemed to be a distortion, mutilation, modification or other derogatory action prejudicial to the Original Author's honor and reputation, the Licensor will waive or not assert, as appropriate, this Section, to the fullest extent permitted by the applicable national law, to enable You to reasonably exercise Your right under Section 3(b) of this License (right to make Adaptations) but not otherwise. 52 | 53 | **5. Representations, Warranties and Disclaimer** 54 | 55 | UNLESS OTHERWISE MUTUALLY AGREED TO BY THE PARTIES IN WRITING AND TO THE FULLEST EXTENT PERMITTED BY APPLICABLE LAW, LICENSOR OFFERS THE WORK AS-IS AND MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE WORK, EXPRESS, IMPLIED, STATUTORY OR OTHERWISE, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, ACCURACY, OR THE PRESENCE OF ABSENCE OF ERRORS, WHETHER OR NOT DISCOVERABLE. SOME JURISDICTIONS DO NOT ALLOW THE EXCLUSION OF IMPLIED WARRANTIES, SO THIS EXCLUSION MAY NOT APPLY TO YOU. 56 | 57 | **6. Limitation on Liability.** EXCEPT TO THE EXTENT REQUIRED BY APPLICABLE LAW, IN NO EVENT WILL LICENSOR BE LIABLE TO YOU ON ANY LEGAL THEORY FOR ANY SPECIAL, INCIDENTAL, CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES ARISING OUT OF THIS LICENSE OR THE USE OF THE WORK, EVEN IF LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 58 | 59 | **7. Termination** 60 | 61 | This License and the rights granted hereunder will terminate automatically upon any breach by You of the terms of this License. Individuals or entities who have received Adaptations or Collections from You under this License, however, will not have their licenses terminated provided such individuals or entities remain in full compliance with those licenses. Sections 1, 2, 5, 6, 7, and 8 will survive any termination of this License. 62 | Subject to the above terms and conditions, the license granted here is perpetual (for the duration of the applicable copyright in the Work). Notwithstanding the above, Licensor reserves the right to release the Work under different license terms or to stop distributing the Work at any time; provided, however that any such election will not serve to withdraw this License (or any other license that has been, or is required to be, granted under the terms of this License), and this License will continue in full force and effect unless terminated as stated above. 63 | 64 | **8. Miscellaneous** 65 | 66 | Each time You Distribute or Publicly Perform the Work or a Collection, the Licensor offers to the recipient a license to the Work on the same terms and conditions as the license granted to You under this License. 67 | Each time You Distribute or Publicly Perform an Adaptation, Licensor offers to the recipient a license to the original Work on the same terms and conditions as the license granted to You under this License. 68 | If any provision of this License is invalid or unenforceable under applicable law, it shall not affect the validity or enforceability of the remainder of the terms of this License, and without further action by the parties to this agreement, such provision shall be reformed to the minimum extent necessary to make such provision valid and enforceable. 69 | No term or provision of this License shall be deemed waived and no breach consented to unless such waiver or consent shall be in writing and signed by the party to be charged with such waiver or consent. 70 | This License constitutes the entire agreement between the parties with respect to the Work licensed here. There are no understandings, agreements or representations with respect to the Work not specified here. Licensor shall not be bound by any additional provisions that may appear in any communication from You. This License may not be modified without the mutual written agreement of the Licensor and You. 71 | The rights granted under, and the subject matter referenced, in this License were drafted utilizing the terminology of the Berne Convention for the Protection of Literary and Artistic Works (as amended on September 28, 1979), the Rome Convention of 1961, the WIPO Copyright Treaty of 1996, the WIPO Performances and Phonograms Treaty of 1996 and the Universal Copyright Convention (as revised on July 24, 1971). These rights and subject matter take effect in the relevant jurisdiction in which the License terms are sought to be enforced according to the corresponding provisions of the implementation of those treaty provisions in the applicable national law. If the standard suite of rights granted under applicable copyright law includes additional rights not granted under this License, such additional rights are deemed to be included in the License; this License is not intended to restrict the license of any rights under applicable law. 72 | Creative Commons Notice 73 | 74 | Creative Commons is not a party to this License, and makes no warranty whatsoever in connection with the Work. Creative Commons will not be liable to You or any party on any legal theory for any damages whatsoever, including without limitation any general, special, incidental or consequential damages arising in connection to this license. Notwithstanding the foregoing two (2) sentences, if Creative Commons has expressly identified itself as the Licensor hereunder, it shall have all rights and obligations of Licensor. 75 | 76 | Except for the limited purpose of indicating to the public that the Work is licensed under the CCPL, Creative Commons does not authorize the use by either party of the trademark "Creative Commons" or any related trademark or logo of Creative Commons without the prior written consent of Creative Commons. Any permitted use will be in compliance with Creative Commons' then-current trademark usage guidelines, as may be published on its website or otherwise made available upon request from time to time. For the avoidance of doubt, this trademark restriction does not form part of this License. 77 | 78 | Creative Commons may be contacted at http://creativecommons.org/. 79 | -------------------------------------------------------------------------------- /extractor_research/extractors/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xpmethod/opensyllabus/693e1304e2293515ff2817a86778ea6dde165515/extractor_research/extractors/__init__.py -------------------------------------------------------------------------------- /extractor_research/extractors/miner.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | from pdfminer.layout import LAParams 3 | from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter 4 | from pdfminer.converter import TextConverter, HTMLConverter, XMLConverter 5 | from pdfminer.pdfpage import PDFPage 6 | 7 | class Miner: 8 | def __init__(self, pdf_file, txt_file, file_format='txt', layout_analysis=True): 9 | self.pdf_file = file(pdf_file, 'rb') 10 | self.outfp = file(txt_file, 'w') 11 | 12 | if layout_analysis: 13 | laparams = LAParams() 14 | else: 15 | laparams = None 16 | 17 | self.rsrcmgr = PDFResourceManager(caching=True) 18 | 19 | if file_format == 'txt': 20 | self.device = TextConverter(self.rsrcmgr, self.outfp, codec='utf-8', 21 | laparams=laparams, imagewriter=None) 22 | elif file_format == 'html': 23 | self.device = HTMLConverter(self.rsrcmgr, self.outfp, codec='utf-8', 24 | laparams=laparams, imagewriter=None) 25 | elif file_format == 'xml': 26 | self.device = XMLConverter(self.rsrcmgr, self.outfp, codec='utf-8', 27 | laparams=laparams, imagewriter=None) 28 | 29 | def extract(self): 30 | interpreter = PDFPageInterpreter(self.rsrcmgr, self.device) 31 | pagenos = set() 32 | for page in PDFPage.get_pages(self.pdf_file, pagenos, maxpages=0, 33 | password=None, caching=True, check_extractable=True): 34 | interpreter.process_page(page) 35 | self.pdf_file.close() 36 | self.device.close() 37 | self.outfp.close() 38 | 39 | if __name__ == '__main__': 40 | import os 41 | import re 42 | 43 | #converts pdfs in the input directory into html format 44 | pdfList = [('../input/%s' % f) for f in os.listdir('../input/') if '.pdf' in f] 45 | htmlList = [re.sub(r'.pdf', r'.xml', f) for f in pdfList] 46 | htmlList = [re.sub(r'input', r'output', f) for f in htmlList] 47 | 48 | for i in range(len(pdfList)): 49 | print 'converting: %s to %s' % (pdfList[i], htmlList[i]) 50 | miner = Miner(pdfList[i], htmlList[i], file_format='xml') 51 | miner.extract() -------------------------------------------------------------------------------- /extractor_research/extractors/pdf2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | from PyPDF2 import PdfFileReader 3 | 4 | class PDF2: 5 | def __init__(self, pdf_file, txt_file): 6 | self.doc = PdfFileReader(open(pdf_file, 'rb')) 7 | self.output = open(txt_file, 'w') 8 | 9 | def extract(self): 10 | for page in self.doc.pages: 11 | self.output.write(page.extractText()) 12 | self.output.close() 13 | 14 | if __name__ == '__main__': 15 | pdf = PDF2('../input/pride_and_prej/1.pdf', '../output/pride_and_prej/pdf2/1.txt') 16 | pdf.extract() -------------------------------------------------------------------------------- /extractor_research/extractors/pdfbox-app-1.8.5.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xpmethod/opensyllabus/693e1304e2293515ff2817a86778ea6dde165515/extractor_research/extractors/pdfbox-app-1.8.5.jar -------------------------------------------------------------------------------- /extractor_research/extractors/pdfbox.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import os 3 | 4 | class PDFBox: 5 | def __init__(self, pdf_file, txt_file): 6 | self.pdf_file = pdf_file 7 | self.txt_file = txt_file 8 | 9 | def extract(self): 10 | # need to hardcode path because of imports 11 | command = 'java -jar ~/workspace/OSP/opensyllabus/extractor_research/extractors/pdfbox-app-1.8.5.jar ExtractText ' + self.pdf_file + ' ' + self.txt_file 12 | os.system(command) 13 | 14 | if __name__ == '__main__': 15 | pdf = PDFBox('../input/pride_and_prej/1.pdf', '../output/pride_and_prej/pdfbox/1.txt') 16 | pdf.extract() -------------------------------------------------------------------------------- /extractor_research/extractors/textstream.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import os 3 | 4 | class TextStream: 5 | def __init__(self, pdf_file, txt_file): 6 | self.pdf_file = pdf_file 7 | self.txt_file = txt_file 8 | 9 | def extract(self): 10 | # need to hardcode path because of imports 11 | command = 'java -cp "./extractors/textstream:./extractors/textstream/*" TextStream ' + self.pdf_file + ' ' + self.txt_file 12 | os.system(command) 13 | 14 | if __name__ == '__main__': 15 | pdf = TextStream('../input/pride_and_prej/1.pdf', '../output/pride_and_prej/textstream/1.txt') 16 | pdf.extract() 17 | -------------------------------------------------------------------------------- /extractor_research/extractors/textstream/LICENSE: -------------------------------------------------------------------------------- 1 | A copy of this license is included with your distribution of PDFTextStream™ Software. 2 | 3 | SNOWTIDE™ INFORMATICS SYSTEMS, INC. 4 | PDFTEXTSTREAM™ SOFTWARE END USER LICENSE AGREEMENT ("EULA") 5 | 6 | IMPORTANT: 7 | 8 | • NEVER USE THE SOFTWARE TO VIOLATE ANYONE’S INTELLECTUAL PROPERTY OR OTHER RIGHTS. 9 | 10 | • THIS EULA IS A CONTRACT BETWEEN YOU AND SNOWTIDE. READ IT CAREFULLY BEFORE COMPLETING THE INSTALLATION PROCESS AND USING THE SOFTWARE. IT PROVIDES A LICENSE TO USE THE SOFTWARE AND CONTAINS WARRANTY INFORMATION AND LIABILITY DISCLAIMERS. BY INSTALLING AND USING THE SOFTWARE, YOU CONFIRM YOUR ACCEPTANCE OF THE SOFTWARE AND AGREE TO BE BOUND BY THIS EULA. IF YOU DO NOT AGREE TO BE BOUND, DO NOT INSTALL OR USE THE SOFTWARE. 11 | 12 | 1. Definitions 13 | (a) "Snowtide" means Snowtide Informatics Systems, Inc. and its suppliers and licensors, if any. 14 | (b) "Trial Version" means a version of the Software to be used only to review 15 | and evaluate the Software. The Trial Version may have limited functionality and 16 | may alter ts output or behaviour. 17 | (c) "Software" means the PDFTextStream™ software program and third party software program(s) supplied by Snowtide with it, which may also include associated media, printed materials, and electronic documentation. The Trial Version is also “Software” under this EULA. 18 | (d) "Production Environment" means a single computer system used to provide capabilities and features to the public or to the end users of your products or services. 19 | 20 | 2. License 21 | This EULA allows you to: 22 | (a) Absent a purchased license key, either: 23 | (i) evaluate the Software for potential future inclusion in your products or services, where such evaluation may not include using the Software in any Production Environment 24 | (ii) Install and use the Software in Production Environments in 25 | "single-threaded applications", as defined and described in the technical 26 | documentation of the Software. 27 | (b) Install and use the Software on a single Production Environment OR store the Software on a storage device, like a network server, used only to install the Software on other Production Environments over an internal network, provided you have a license key for each Production Environment on which the Software is installed and run. A license key for the Software may not be shared or used concurrently on different computers. 28 | (c) Make one copy of the Software in machine-readable form only for backup purposes. You must reproduce all copyright notices and other proprietary legends on the original copy of the Software on each copy. 29 | 30 | 3. License Restrictions 31 | (a) Except as allowed by Section 2, you may not make or distribute copies of the Software or electronically transfer the Software from one computer to another or over a network. 32 | (b) You may not decompile, reverse engineer, disassemble, or otherwise reduce the Software to a human-perceivable form. 33 | (c) You may not rent, lease, or sublicense the Software. 34 | (d) You may not redistribute the Software as part of another application. 35 | (e) You may permanently transfer all of your rights under this EULA provided you keep no copies, you transfer all of the Software (including all component parts, the media and printed materials, any upgrades, this EULA, and the serial numbers or license files), and the recipient agrees to this EULA. If the Software is an upgrade, any transfer must include all prior versions of the Software. You may not sell or transfer any Software purchased under a volume discount. 36 | (f) You may not modify the Software or create derivative works based upon the Software. 37 | (g) You may not export the Software into any country prohibited by the United States Export Administration Act and the regulations thereunder. 38 | (h) It is possible to use the Software (and many other programs) to violate the intellectual property and other rights of others. No permission for any such use is given by this EULA. You agree to hold Snowtide harmless and pay all costs including attorney fees because of your use of the Software. 39 | (i) If you fail to comply with this EULA, Snowtide may terminate the license and you must destroy all copies of the Software. 40 | 41 | 4. Upgrades 42 | If this copy of the Software is an upgrade from an earlier version of the Software, it is provided to you on a license exchange basis. Your installation and use of this copy of the Software means you have voluntarily terminated your earlier EULA and that you will not continue to use the earlier version of the Software or transfer it unless the transfer complies with Section 3. 43 | 44 | 5. Ownership 45 | This EULA gives you a limited license to use the Software. Snowtide and its suppliers retain all right, title and interest, including all intellectual property rights, in and to the Software and all copies . All rights not specifically granted in this EULA, including U.S. and International Copyrights, are reserved by Snowtide and its suppliers. 46 | 47 | 6. LIMITED WARRANTY AND DISCLAIMER 48 | (a) LIMITED WARRANTY. Snowtide warrants that for ninety (90) days from the date of delivery (as evidenced by a copy of your purchase receipt): (i) when used with a recommended hardware and software configuration, the Software will perform substantially according to the documentation supplied with the Software; and (ii) any physical media on which the Software is furnished is free from defects in materials and workmanship under normal use. 49 | (b) NO OTHER WARRANTY. EXCEPT AS SET FORTH ABOVE, SNOWTIDE DISCLAIMS ALL OTHER WARRANTIES, WHETHER EXPRESS, IMPLIED, OR OTHERWISE, INCLUDING WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THERE IS NO WARRANTY OF NONINFRINGEMENT, TITLE OR QUIET ENJOYMENT. IF APPLICABLE LAW REQUIRES ANY WARRANTIES OTHER THAN WHAT IS GRANTED HERE, ALL SUCH WARRANTIES ARE LIMITED TO NINETY (90) DAYS FROM THE DATE OF DELIVERY. NO ORAL OR WRITTEN INFORMATION OR ADVICE GIVEN BY SNOWTIDE, ITS DEALERS, DISTRIBUTORS, AGENTS OR EMPLOYEES CREATES A WARRANTY OR INCREASES THE SCOPE OF THIS WARRANTY. 50 | (c) (USA ONLY) SOME STATES DO NOT ALLOW THE EXCLUSION OF IMPLIED WARRANTIES, SO THE ABOVE EXCLUSION MAY NOT APPLY TO YOU. THIS WARRANTY GIVES YOU SPECIFIC LEGAL RIGHTS. YOU MAY ALSO HAVE OTHER LEGAL RIGHTS THAT VARY FROM STATE TO STATE. 51 | 52 | 7. Exclusive Remedy 53 | Your exclusive remedy under Section 6 is to return the Software to the place you acquired it, with a copy of your receipt and a description of the problem. Snowtide will use reasonable commercial efforts to supply you with a replacement copy of the Software that substantially conforms to the documentation, provide a replacement for defective media, or refund your purchase price for the Software, at its option. Snowtide will not be liable under this provision if the Software has been altered in any way, if the media has been damaged by accident, abuse or misapplication, or if the failure arises out of using the Software with other than a recommended hardware and software configuration. 54 | 55 | 8. LIMITATION OF LIABILITY. 56 | (a) SNOWTIDE SHALL NOT BE LIABLE TO ANYONE FOR ANY INDIRECT, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES (INCLUDING DAMAGES FOR LOSS OF BUSINESS, LOSS OF PROFITS, BUSINESS INTERRUPTION OR THE LIKE) ARISING FROM USING OR NOT BEING ABLE TO USE, THE SOFTWARE AND BASED ON ANY THEORY OF LIABILITY INCLUDING BREACH OF CONTRACT, BREACH OF WARRANTY, TORT (INCLUDING NEGLIGENCE), PRODUCT LIABILITY OR OTHERWISE, EVEN IF SNOWTIDE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES AND EVEN IF THE REMEDY ALLOWED IN THIS EULA FAILED OF ITS ESSENTIAL PURPOSE. 57 | (b) SNOWTIDE'S TOTAL LIABILITY FOR ACTUAL DAMAGES FOR ANY REASON WILL BE LIMITED TO THE GREATER OF $500 US DOLLARS OR THE AMOUNT YOU PAID FOR THE SOFTWARE THAT CAUSED THE DAMAGE. 58 | (c) (USA only) SOME STATES DO NOT ALLOW THE LIMITATION OR EXCLUSION OF LIABILITY FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES, SO THIS LIMITATION OR EXCLUSION MAY NOT APPLY TO YOU. YOU MAY ALSO HAVE OTHER LEGAL RIGHTS THAT VARY FROM STATE TO STATE. 59 | 60 | 9. Basis of Bargain 61 | The Limited Warranty, Exclusive Remedies and Limited Liability set forth above are fundamental elements of this EULA. Snowtide would not be able to provide the Software on an economical basis without such limitations. 62 | 63 | 10. U.S. GOVERNMENT RESTRICTED RIGHTS LEGEND 64 | This Software and the documentation are provided with RESTRICTED RIGHTS. Use, duplication, or disclosure by the U.S. Government is subject to restrictions as set forth in this EULA and as provided in DFARS 227.7202-1(a) and 227.7202-3(a) (1995),DFARS 252.227-7013 (c)(1)(ii)(OCT 1988), FAR 12.212(a)(1995), FAR 52.227-19, or FAR 52.227-14, as applicable. Manufacturer: Snowtide Informatics Systems, Inc, 243 King Street, Suite 248, Northampton, MA 01060. 65 | 66 | 11. (Outside of the USA) Consumer End Users Only 67 | The limitations or exclusions of warranties and liability contained in this EULA do not affect or prejudice the statutory rights of a consumer, i.e., a person acquiring goods otherwise than in the course of a business, subject to Section 12. 68 | 69 | 12. General Provisions 70 | This EULA is governed by the laws of the State of Massachusetts, without giving effect to principles of conflict of laws. If the Software is delivered outside the USA, the UN Convention for the International Sale of Goods does not apply to this EULA. This EULA is the full agreement between the parties and supersedes all other agreements or understandings, whether oral or written. All questions about this EULA must be directed to: Snowtide Informatics Systems, Inc, 243 King Street, Suite 248, Northampton, MA 01060, Attention: General Counsel. 71 | 72 | 13. Third Party Software 73 | Certain third party software is incorporated into the Software, as enumerated in Appendix A of this EULA. 74 | 75 | Snowtide and PDFTextStream are trademarks or registered trademarks of Snowtide Informatics Systems, Inc. in the United States and/or other countries. Third party trademarks, trade names, product names and logos may be the trademarks or registered trademarks of their respective owners. 76 | 77 | 14. Delivery 78 | The Software has been delivered to you by internet transmission at the address you registered with Snowtide. If the law of the jurisdiction where the Software was delivered to requires the payment of any sales, use, VAT or other tax on the purchase, use or ownership of the Software, you are responsible for reporting the purchase and paying any such taxes and you agree to indemnify Snowtide from any liability or expense in connection with your failure to do so. 79 | 80 | 15. ARBITRATION 81 | Any action arising out of this EULA, its formation, validity, breach or relating to the subject of this EULA must be filed within one year after the cause of action accrues through the American Arbitration Association under its Commercial Arbitration Rules and the optional rules for emergency measures of protection before one arbitrator who shall be an attorney experienced in trade secret law and matters relating to computer software. All hearings shall be in Northampton, Massachusetts or by telephone or videoconference at the order of the arbitrator. The arbitrator is authorized to issue equitable and legal remedies including preliminary and permanent injunctions. If an action is brought to compel arbitration or enforce the terms of any interim or final order of the arbitrator it may be brought in any court with jurisdiction over the person or property of either party and each party hereby irrevocably agrees to be subject to the jurisdiction of any such court. Any action to compel arbitration or enforce an arbitration order or award shall be governed by the Federal Arbitration Act, this EULA being in interstate commerce. The arbitrator is not empowered to grant damages in any form or amount in excess of the payments required under this EULA. This arbitration provision shall be a complete defense to any suit, action or proceeding. Nothing in this arbitration provision shall give the arbitrator any authority to alter, change, amend, modify, add to or subtract from any provision of this EULA. ALL PARTIES WAIVE ANY RIGHT TO A TRIAL BY JURY. 82 | 83 | *Appendix A - Third Party Software* 84 | 85 | The following is a list of certain third party software incorporated in the Software; any additional terms and conditions solely associated with such third party software are further indicated: 86 | 87 | (a) TIFFFaxDecompressor, Copyright (c) 2005 Sun Microsystems, Inc. All Rights Reserved. 88 | 89 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 90 | 91 | - Redistribution of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 92 | 93 | - Redistribution in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 94 | 95 | Neither the name of Sun Microsystems, Inc. or the names of contributors may be used to endorse or promote products derived from this software without specific prior written permission. 96 | 97 | This software is provided "AS IS," without a warranty of any kind. ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT, ARE HEREBY EXCLUDED. SUN MIDROSYSTEMS, INC. ("SUN") AND ITS LICENSORS SHALL NOT BE LIABLE FOR ANY DAMAGES SUFFERED BY LICENSEE AS A RESULT OF USING, MODIFYING OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES. IN NO EVENT WILL SUN OR ITS LICENSORS BE LIABLE FOR ANY LOST REVENUE, PROFIT OR DATA, OR FOR DIRECT, INDIRECT, SPECIAL, CONSEQUENTIAL, INCIDENTAL OR PUNITIVE DAMAGES, HOWEVER CAUSED AND REGARDLESS OF THE THEORY OF LIABILITY, ARISING OUT OF THE USE OF OR INABILITY TO USE THIS SOFTWARE, EVEN IF SUN HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 98 | 99 | You acknowledge that this software is not designed or intended for use in the design, construction, operation or maintenance of any nuclear facility. 100 | 101 | (b) Apache Commons-Logging, Licensed under the Apache License, Version 2.0, available at http://commons.apache.org/license.html 102 | -------------------------------------------------------------------------------- /extractor_research/extractors/textstream/PDFTextStream.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xpmethod/opensyllabus/693e1304e2293515ff2817a86778ea6dde165515/extractor_research/extractors/textstream/PDFTextStream.jar -------------------------------------------------------------------------------- /extractor_research/extractors/textstream/TextStream.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xpmethod/opensyllabus/693e1304e2293515ff2817a86778ea6dde165515/extractor_research/extractors/textstream/TextStream.class -------------------------------------------------------------------------------- /extractor_research/extractors/textstream/TextStream.java: -------------------------------------------------------------------------------- 1 | import java.io.*; 2 | 3 | import com.snowtide.pdf.PDFTextStream; 4 | import com.snowtide.pdf.OutputTarget; 5 | 6 | public class TextStream { 7 | 8 | public static void main (String[] args) throws IOException { 9 | File pdfFile = new File(args[0]); 10 | File textFile = new File(args[1]); 11 | 12 | PDFTextStream stream = new PDFTextStream(pdfFile); 13 | BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(textFile))); 14 | OutputTarget tgt = new OutputTarget(writer); 15 | stream.pipe(tgt); 16 | 17 | writer.flush(); 18 | writer.close(); 19 | stream.close(); 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /extractor_research/extractors/xpdf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import os 3 | 4 | class XPDF: 5 | def __init__(self, pdf_file, txt_file, layout=True): 6 | self.pdf_file = pdf_file 7 | self.txt_file = txt_file 8 | # -layout : maintain original physical layout 9 | self.layout = layout 10 | 11 | def extract(self): 12 | if self.layout: 13 | command = 'pdftotext -layout ' + self.pdf_file + ' ' + self.txt_file 14 | else: 15 | command = 'pdftotext ' + self.pdf_file + ' ' + self.txt_file 16 | os.system(command) 17 | 18 | if __name__ == '__main__': 19 | pdf = XPDF('../input/pride_and_prej/1.pdf', '../output/pride_and_prej/xpdf/1.txt') 20 | pdf.extract() -------------------------------------------------------------------------------- /extractor_research/input/American_Opera_Rev_Syllabus.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xpmethod/opensyllabus/693e1304e2293515ff2817a86778ea6dde165515/extractor_research/input/American_Opera_Rev_Syllabus.pdf -------------------------------------------------------------------------------- /extractor_research/input/Leonard_Intro_Musicology_syllabus(2).pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xpmethod/opensyllabus/693e1304e2293515ff2817a86778ea6dde165515/extractor_research/input/Leonard_Intro_Musicology_syllabus(2).pdf -------------------------------------------------------------------------------- /extractor_research/input/Leonard_Intro_Musicology_syllabus.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xpmethod/opensyllabus/693e1304e2293515ff2817a86778ea6dde165515/extractor_research/input/Leonard_Intro_Musicology_syllabus.pdf -------------------------------------------------------------------------------- /extractor_research/input/Leonard_Victorian_Music_syllabus_docx.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xpmethod/opensyllabus/693e1304e2293515ff2817a86778ea6dde165515/extractor_research/input/Leonard_Victorian_Music_syllabus_docx.pdf -------------------------------------------------------------------------------- /extractor_research/input/Leonard_Women_Music_syllabus.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xpmethod/opensyllabus/693e1304e2293515ff2817a86778ea6dde165515/extractor_research/input/Leonard_Women_Music_syllabus.pdf -------------------------------------------------------------------------------- /extractor_research/input/Music_since_1900_syllabus.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xpmethod/opensyllabus/693e1304e2293515ff2817a86778ea6dde165515/extractor_research/input/Music_since_1900_syllabus.pdf -------------------------------------------------------------------------------- /extractor_research/input/pride_and_prej/1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xpmethod/opensyllabus/693e1304e2293515ff2817a86778ea6dde165515/extractor_research/input/pride_and_prej/1.pdf -------------------------------------------------------------------------------- /extractor_research/input/pride_and_prej/2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xpmethod/opensyllabus/693e1304e2293515ff2817a86778ea6dde165515/extractor_research/input/pride_and_prej/2.pdf -------------------------------------------------------------------------------- /extractor_research/input/pride_and_prej/3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xpmethod/opensyllabus/693e1304e2293515ff2817a86778ea6dde165515/extractor_research/input/pride_and_prej/3.pdf -------------------------------------------------------------------------------- /extractor_research/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | ''' 4 | https://stackoverflow.com/questions/582336/how-can-you-profile-a-python-script 5 | (use with a second file option) 6 | http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/ 7 | normal sort + token sort (seems less accurate) 8 | ''' 9 | 10 | from extractors import miner, pdf2, pdfbox, textstream, xpdf 11 | import os 12 | import cProfile 13 | import pstats 14 | import StringIO 15 | 16 | 17 | def miner_with_layout(pdf_file, txt_file): 18 | pdf = miner.Miner(pdf_file, txt_file) 19 | pdf.extract() 20 | 21 | def miner_without_layout(pdf_file, txt_file): 22 | pdf = miner.Miner(pdf_file, txt_file, layout_analysis=False) 23 | pdf.extract() 24 | 25 | def xpdf_with_layout(pdf_file, txt_file): 26 | pdf = xpdf.XPDF(pdf_file, txt_file) 27 | pdf.extract() 28 | 29 | def xpdf_without_layout(pdf_file, txt_file): 30 | pdf = xpdf.XPDF(pdf_file, txt_file, layout=False) 31 | pdf.extract() 32 | 33 | def textstream_default(pdf_file, txt_file): 34 | pdf = textstream.TextStream(pdf_file, txt_file) 35 | pdf.extract() 36 | 37 | def pdf2_default(pdf_file, txt_file): 38 | pdf = pdf2.PDF2(pdf_file, txt_file) 39 | pdf.extract() 40 | 41 | def pdfbox_default(pdf_file, txt_file): 42 | pdf = pdfbox.PDFBox(pdf_file, txt_file) 43 | pdf.extract() 44 | 45 | def run_all(pdf_file, txt_file): 46 | miner_with_layout(pdf_file, txt_file) 47 | miner_without_layout(pdf_file, txt_file) 48 | xpdf_with_layout(pdf_file, txt_file) 49 | xpdf_without_layout(pdf_file, txt_file) 50 | textstream_default(pdf_file, txt_file) 51 | pdf2_default(pdf_file, txt_file) 52 | pdfbox_default(pdf_file, txt_file) 53 | 54 | def time_all(pdf_file): 55 | methods = ['miner_with_layout', 'miner_without_layout', 'xpdf_with_layout', 56 | 'xpdf_without_layout', 'textstream_default', 'pdf2_default', 'pdfbox_default'] 57 | 58 | base_name = os.path.basename(pdf_file) 59 | directory_name = os.path.dirname(pdf_file) 60 | 61 | # i.e. 'pride_and_prej' from './input/pride_and_prej/1.pdf' 62 | shorter_directory_name = os.path.basename(directory_name) 63 | # i.e. '1' from './input/pride_and_prej/1.pdf' 64 | file_base_name = os.path.splitext(base_name)[0] 65 | 66 | output = '' 67 | 68 | for method in methods: 69 | # build file path based on source text, input PDF, and method employed 70 | txt_file = './output/' + shorter_directory_name + '/' + method + '/' + file_base_name + '.txt' 71 | 72 | command = method + '(\'%s\', \'%s\')' % (pdf_file, txt_file) 73 | temp = 'statsfile' 74 | cProfile.run(command, temp) 75 | 76 | stream = StringIO.StringIO() 77 | stats = pstats.Stats(temp, stream=stream) 78 | stats.print_stats() 79 | stats.sort_stats('time') 80 | output = output + method + '\n-------------------------------------\n' + stream.getvalue() 81 | 82 | # clean up intermediary file 83 | os.remove('statsfile') 84 | 85 | # write results to log file 86 | with open('./stats/' + shorter_directory_name + '/' + file_base_name + '_speed_log.txt', "w") as log_file: 87 | log_file.write(output) 88 | 89 | if __name__ == '__main__': 90 | pdf_file = './input/pride_and_prej/1.pdf' 91 | time_all(pdf_file) -------------------------------------------------------------------------------- /extractor_research/output/American_Opera_Rev_Syllabus.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |
Page 1
6 |
American Opera since 1950 7 |
Revised Syllabus 8 |
9 |
Week 7: March 9, 11, 12 10 |
John Adams: The Death of Klinghoffer (1996) 11 |
Reading: Kraft,
The Death of Klinghoffer,” Perspectives of New Music, Vol. 30, No. 1 (Winter, 1992), 12 |
300-302 13 |
Fink, “Klinghoffer in Brooklyn Heights,”
Cambridge Opera Journal, 2005, 17:2, 173-213. 14 |
Listening/Viewing: DVD 299 15 |
16 |
Week 8: March 16, 18, 19 17 |
SPRING BREAK—NO CLASS 18 |
19 |
Week 9: March 23, 25, 26 20 |
March 23 & 25: William Bolcom: A View from the Bridge (2001) 21 |
Reading: Herwitz, “Notes from the Stage: William Bolcom,”
Opera Quarterly, Vol. 22, No. 3–4, 521– 22 |
533. 23 |
Listening/Viewing: CD 3255 24 |
March 26: In-class conversation with Kiya Heartwood, composer of the opera
Lying to the Sea Gypsy. 25 |
Listening: “Safe Harbor,” http://www.lyingtotheseagypsy.com/ 26 |
27 |
Week 10: March 30, April 1, 2 28 |
NO CLASS 29 |
30 |
Week 11: April 6, 8, 9 31 |
Green Day: American Idiot (2007/2009) 32 |
Reading: http://www.nytimes.com/2009/09/18/theater/18greenday.html?_r=1&hpw 33 |
Listening/Viewing: “
American Idiot: The Musical Trailer”: 34 |
http://www.youtube.com/watch?v=egGARtwaFEo 35 |
“Green Day's
American Idiot Musical: Michael Mayer's Introduction”: 36 |
http://www.youtube.com/watch?v=EwhvAPSOrH0 37 |
“‘Whatshername’ from American Idiot The Musical”: 38 |
http://www.youtube.com/watch?v=IC44dUEwdiw&feature=related 39 |
40 |
Week 12: April 13, 15, 16 41 |
Final written paper due 4/13/10 42 |
April 13: Garfein: Rosencrantz and Guildenstern are Dead (2009) 43 |
Reading: Excerpts from Stoppard,
Rosencrantz and Guildenstern are Dead (New York: Grove Press, 44 |
1967), 41-44. 45 |
Listening/Viewing: R&G stage play, Questions: http://www.youtube.com/watch?v=y-Sx4W2cKlU 46 |
R&G opera, “Questions”: 47 |
http://www.youtube.com/watch?v=PxEzopBC2oA&feature=player_embedded# 48 |
April 15 & 16: Operas on the Edge: 49 |
Wrath of Khan-the Opera: http://trekmovie.com/2009/01/25/watch-wrath-of-khan-the-opera-via-robot- 50 |
chicken/ 51 |
Repo-the Genetic Opera: trailer: http://www.youtube.com/watch?v=MzgpU25C6fg 52 |
“Zydrate Anatomy”: http://www.youtube.com/watch?v=tevg_jT5Sco&feature=fvw 53 |
“Infected”: http://www.youtube.com/watch?v=Ik9JjRqXc-8&feature=channel 54 |
“Chromaggia”: http://www.youtube.com/watch?v=dxhPX0q0kpg&feature=related 55 |
56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 |
Page 2
69 |
“Night Surgeon”: http://www.youtube.com/watch?v=61r9_f5g_zA&feature=related 70 |
Got Milk? Advertising campaign:
Battle for Milkquarious: http://www.milkquarious.com/#/winner 71 |
72 |
Week 13: April 20, 22, 23 73 |
Presentations: 74 |
Sara Noble: Tuesday, April 20, 2:10 p.m. 75 |
Jen Leigh & Erin Winkler: Tuesday, April 20, 2:25 p.m.
76 |
Claire Binek: Tuesday, April 20, 2:40 p.m. 77 |
Marian Sunnergren: Tuesday, April 20, 2:55 p.m. 78 |
Barbara Paterson: Thursday, April 22, 2:10 p.m. 79 |
Lexi Batsios: Thursday, April 22, 2:25 p.m. 80 |
Alyssa Callaghan: Thursday, April 22, 2:40 p.m. 81 |
Liz Johnston: Thursday, April 22, 2:55 p.m. 82 |
Shelley Roberts: Friday, April 23, 2:10 p.m. 83 |
Toni Dodich: Friday, April 23, 2:25 p.m. 84 |
85 |
Week 14: April 27, 29, 30 86 |
Presentations: 87 |
Rachel Barker: Tuesday, April 27, 2:10 p.m. 88 |
Jennifer Lawler: Tuesday, April 27, 2:25 p.m. 89 |
Crystal Roskelley: Tuesday, April 27, 2:40 p.m. 90 |
Shane Magargal: Tuesday, April 27, 2:55 p.m. 91 |
Matt Hill & Aaron Carlyle: Thursday, April 29, 2:10 p.m. 92 |
Olivia Savage: Thursday, April 29, 2:25 p.m. 93 |
Cristina Gonzalez: Thursday, April 29, 2:25 p.m. 94 |
Danielle Wright: Thursday, April 29, 2:40 p.m. 95 |
96 |
Friday, April 30, 2:10 p.m.: Viewing: “Once More With Feeling” (
Buffy the Vampire Slayer)—optional. 97 |
98 | 99 |
Page: 1, 2
100 | 101 | -------------------------------------------------------------------------------- /extractor_research/output/Leonard_Victorian_Music_syllabus_docx.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |
Page 1
6 |
WESTMINSTER CHOIR COLLEGE OF 7 |
RIDER UNIVERSITY 8 |
Victorian Music and Culture/MH733, Spring 2011 9 |
Syllabus 10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
Office hours: Tuesday, 9:30 – 10:30 a.m. in 24 |
the library; and by appointment 25 |
513. 238. 8031 (C) 26 |
27 |
28 |
Professor Kendra Leonard 29 |
kleonard@rider.edu 30 |
31 |
Class meeting: TTHF 8:00-9:30 a.m. 32 |
33 |
Course Objectives 34 |
1. To provide students with an understanding of music and culture during the Victorian period in Great 35 |
Britain. 36 |
2. To introduce students to the major composers, lyricists, and performers of the Victorian period. 37 |
3. To explore musical and other artistic developments taking place during the Victorian period. 38 |
4. To encourage students to think critically about reading the musical works, criticism, and practices of 39 |
the past as a way of better understanding those works. 40 |
41 |
Email 42 |
43 |
Your Rider email account is your email address for all official email communications from the 44 |
University. You are expected to check your Rider email account on a frequent and consistent basis in 45 |
order to stay current with University-related communications. Any email from me about this course 46 |
will only be sent to your official Rider email address. Any communication from you to me must come 47 |
from your Rider email address, must contain the course name in the “Subject” line, and must use 48 |
proper spelling and capitalization. I try to respond to all emails within 24 hours. 49 |
50 |
Academic Code of Conduct 51 |
52 |
submission of academic work. In all written work, whether in class or out of class, the student’s name 53 |
on the work is considered to be a statement that the work is his or hers alone, except as otherwise 54 |
indicated. Students are expected to provide proper citations for the statements and ideas of others 55 |
whether submitted word for word or paraphrased. Failure to provide proper citations will be considered 56 |
plagiarism and offenders will be subject to the charge of plagiarism specified in the statement of 57 |
regulations. 58 |
Academic dishonesty includes any unauthorized collaboration or misrepresentation in the 59 |
Similarly, students are expected to adhere to all regulations pertaining to examination conduct. 60 |
These regulations are designed to insure that the work submitted by the student on examinations is an 61 |
honest representation of that student’s effort and that it does not involve unauthorized collaboration, 62 |
unauthorized use of notes during the exam, or unauthorized access to prior information about the 63 |
examination. 64 |
In this course, the first instance of plagiarism will result in a grade of 0 for the assignment, and 65 |
a report will be sent to the dean; a second instance of plagiarism will result in an F for the course, and 66 |
charges of academic dishonesty will be brought to the Academic Integrity Committee. See
The Source, 67 |
pages 10-16, for full information on the academic code of conduct. 68 |
69 |
Required Text/Materials 70 |
Solie, Ruth A. Music in Other Words: Victorian Conversations (Berkeley: University of California 71 |
Press, 2004). Available used on amazon.com and abebooks.com. 72 |
Vaughan Williams, Ralph.
National Music and Other Essays (Oxford: Clarendon Press, 1996). 73 |
Available used on amazon.com and abebooks.com. 74 |
Other readings and links to listening are posted on Blackboard and are titled by author name. We will 75 |
76 |
Page 2
77 |
watch some films in class. 78 |
79 |
Style Manual 80 |
Chicago Manual of Style, 15th ed. Chicago: University of Chicago Press, 2003. 81 |
82 |
Recommended Web Sites 83 |
Music, Theater, and Popular Entertainment in Victorian Britain: 84 |
http://www.victorianweb.org/mt/index.html 85 |
Punch archives: 86 |
http://onlinebooks.library.upenn.edu/webbin/serial?id=punch 87 |
Gilbert & Sullivan Archive: http://math.boisestate.edu/GaS/ 88 |
Victorian Resources Online: http://www2.iath.virginia.edu/bpn2f/victorian/bibliog.html 89 |
The 1900 House site: http://www.pbs.org/wnet/1900house/index.html 90 |
91 |
Technology Requirements 92 |
This course is on-line on Blackboard. The web address is Blackboard.rider.edu. You will need 93 |
to have regular access to the internet and a word-processing program to complete many elements of this 94 |
course. Students who do not have this access at home will need to scheduled time to do so either at the 95 |
library or at other campus computing locations. Always back up your work, whether on a flash drive, 96 |
via email, or through an online back-up service such as Mozy or Google Documents. Assignments will 97 |
not be accepted late because of computer or printer problems. 98 |
99 |
In this course, we will be building a digital exhibit of Victorian music and culture at the website 100 |
101 |
http://victorianmusic.omeka.net/. All students will have access to the site via a login and password I 102 |
will provide on the first day of class. If you have trouble accessing the site, logging in, or other 103 |
problems, it is your responsibility to contact me right away so we can get it fixed. 104 |
105 |
Attendance is expected. Attending class will help you learn the material and be better prepared 106 |
We will also be blogging throughout this course. You can sign up for a free blog at Blogger or 107 |
Wordpress. You must send me the url of your blog no later than 5 p.m. January 27. The blog must be 108 |
open to all other students, although you are free to use a pseudonym. 109 |
110 |
Attendance 111 |
112 |
for assignments. If you miss class, it is your responsibility to get notes from a classmate and be 113 |
prepared for the next class. Unexcused and undocumented absences will affect your participation grade. 114 |
If you miss class because of an illness, I will need a doctor’s note. 115 |
116 |
Students with Disabilities 117 |
118 |
learning disability, please provide me with your university documentation during the first week of class 119 |
or as soon as you are documented. If you think you might have such needs, but have no documentation, 120 |
please contact the E.O.P. office in the basement of Taylor. 121 |
122 |
Assignment Policies 123 |
124 |
turned in after 8:10 a.m. will not be accepted. 125 |
126 |
Assignments and Evaluation 127 |
Participation: 20% 128 |
If you have special needs that will affect performance in this class, such as a documented 129 |
Assignments are due by the beginning of class (8:00 a.m.) on the day specified. Assignments 130 |
131 | 132 | 133 | 134 | 135 | 136 | 137 |
Page 3
138 |
Participation is expected in every class. You are expected to have completed the assigned 139 |
140 |
reading and to be able to discuss it in class. I will keep track of your participation. Missing class will 141 |
affect your participation grade. 142 |
Blogging: 15% 143 |
You will be keeping a blog of your thoughts and comments, such as reactions to the readings, 144 |
Each student will post two visual items (such as photos or other visual artwork) worth 5% each; 145 |
drafts and ideas for paper topics, and other relevant thoughts and links, over the course of the semester. 146 |
To get full credit, you need to post at least two separate posts of 350 words each per week. The last day 147 |
to post blog posts is April 7. 148 |
Omeka exhibit items: 25% 149 |
150 |
two audio/video items worth 5% each; and two written items, such as short essays, a review of a 151 |
recording, film, book, or similar item of no fewer than 1000 words, worth 5% each to the exhibit over 152 |
the course of the term. The last day to post items to the site is April 7. 153 |
154 |
Final Project: 40% 155 |
156 |
of Victorian music or musical culture you find particularly interesting. 157 |
158 |
Your final project will be a 15-minute in-class presentation and 10-12 page paper on any aspect 159 |
Elements of the final project 160 |
A proposal for your paper is due on March 1. The proposal should be a 1-page statement and 161 |
description of what you want to research for your paper and presentation. You will need to explain what 162 |
about the topic is appealing to you, and provide a general outline of the paper. The proposal is worth 163 |
5% of your final grade. 164 |
An annotated bibliography of no fewer than ten scholarly sources is due March 22. The 165 |
annotated bibliography should be in Chicago Manual of Style format. Each bibliographical entry 166 |
should include a description of the source, including its intended audience and why you think it will be 167 |
helpful for your paper. The annotated bibliography is worth 5% of your final grade. 168 |
The final paper is due April 19. Your paper will be 10-12 pages long, not including the 169 |
bibliography. Please format it according to CMS guidelines. The final paper is worth 20% of your final 170 |
project grade. 171 |
Presentations will take place in class during the last week of class. You should prepare a 15- 172 |
minute presentation of your research, using audio/visual materials as appropriate. In-class 173 |
performances are encouraged, as are creative approaches. The presentation is worth 10% of your final 174 |
grade. 175 |
176 |
Your work reflects directly on you: strive for a professional appearance and clear, well-written 177 |
prose in your assignments. Spelling and grammar errors will count against you; always spell-check and 178 |
proofread your work prior to posting it or turning it in. All assignments must be submitted via email as 179 |
.doc or .docx attachments and should be double-spaced and single-sided. Please use Times New Roman 180 |
font in 12 point type in black ink for all assignments. Margins should not be more than 1 inch. Include 181 |
your full name and the course name and number in the upper left hand corner of each assignment. 182 |
183 |
Classroom Etiquette 184 |
185 |
asked to do so in class). Please do not eat or drink anything really odiferous (pickled herring, kimchee, 186 |
rotten bananas, etc.) in class. 187 |
188 |
Please do not use cell phones during class for calls, texting, or accessing the internet (unless 189 |
(cid:1)(cid:1)(cid:1) 190 |
191 |
Course Schedule 192 |
193 |
Page 4
194 |
January 25: Introductions 195 |
January 27: Sweet,
Inventing the Victorians and Wilson, Victorians 196 |
197 |
Feb 1:
Victorian Visitors, “Introduction” and “Wagner” 198 |
Feb 3: Solie, “Music in a Victorian Mirror:
MacMillan’s Magazine in the Grove Years” 199 |
200 |
Feb 8: Solie, “‘Girling’ at the Parlor Piano” 201 |
Feb 10: Solie, “‘Tadpole Pleasures”:
Daniel Deronda as Music Historiography” 202 |
203 |
Feb 15: Solie, “Fictions of the Opera Box” 204 |
Feb 17: Temperley, “The Lost Chord,” Victorian Studies, Vol. 30, No. 1, Music in Victorian Society 205 |
and Culture (Autumn, 1986), pp. 7-23; Bashford, “Historiography and Invisible Musics: Domestic 206 |
Chamber Music in Nineteenth-Century Britain,” Journal of the American Musicological Society, Vol. 207 |
63, No. 2 (Summer 2010), pp. 291- 208 |
360. 209 |
210 |
Feb 22: Elgar: Adams, “Of Worcester and London: An Introduction;” Botstein, “transcending the 211 |
Enigmas of Biography: The Cultural Context of Sir Edward Elgar’s Career” in Edward Elgar and His 212 |
World
; and Vaughan Williams, “What Have We Learnt From Elgar?” 213 |
Feb 24: Elgar: Thomson, “Elgar’s Critical Critics,” and Fuller, “Elgar and the Salons: The Significance 214 |
of a Private Musical World” in
Edward Elgar and His World. 215 |
216 |
Mar 1: Stanford, “Some Thoughts concerning Folk-Song and Nationality,” The Musical Quarterly, Vol. 217 |
1, No. 2 (Apr., 1915), pp. 232-245; Vaughan Williams,
National Music chapters 1-3 and 8, and The 218 |
Making of Music
chapter 6 219 |
Mar 3: Saylor, “Dramatic Applications of Folksong in Vaughan Williams’s Operas
Hugh the Drover 220 |
and
Sir John in Love,” Journal of the Royal Musical Association, Vol. 134, no. 1, 37-83. 221 |
222 |
Mar 8: Vaughan Williams, “Gustav Holst: An Essay and a Note” in
National Music and Other Essays 223 |
Mar 10: No class—instructor at SAM meeting 224 |
225 |
Mar 15: Spring Break 226 |
Mar 17: Spring Break 227 |
228 |
Mar 22: Kift,
The Victorian Music Hall, Chapters 1-3 (annotated bibliographies due) 229 |
Mar 24: Faulk,
Music Hall and Modernity, Introduction and Chapter 1 230 |
231 |
Mar 29: Faulk, Chapters 4 and 5 232 |
Mar 31: in-class viewing:
Topsy-Turvey 233 |
234 |
Apr 5: Fischler, “Dialectics of Social Class in the Gilbert and Sullivan Collaboration,” SEL Studies in 235 |
English Literature 1500-1900, Volume 48, Number 4, Autumn 2008, pp. 829-837; and Fischler, 236 |
“Princess Ida” (review), The Opera Quarterly, Volume 19, Number 4, Autumn 2003, pp. 817-821. 237 |
Apr 7: in-class viewing:
Princess Ida (partial) 238 |
239 |
Apr 12: Princess Ida discussion and analysis 240 |
Apr 14:
Princess Ida discussion and analysis 241 |
242 |
Apr 19: Victorian music and culture rebooted: an introduction to steampunk (final papers due) 243 |
244 |
Page 5
245 |
Apr 21: Victorian music rebooted, con’t and final thoughts 246 |
247 |
Presentations 248 |
Apr 26: in-class presentations 249 |
Apr 28: in-class presentations 250 |
251 |
252 |
Page: 1, 2, 3, 4, 5
253 | 254 | -------------------------------------------------------------------------------- /extractor_research/output/Leonard_Women_Music_syllabus.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |
Page 1
6 |
WESTMINSTER CHOIR COLLEGE OF 7 |
RIDER UNIVERSITY 8 |
Women in Music/MH433, Spring 2011 9 |
Syllabus 10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
Office hours: Tuesday, 9:30 – 10:30 a.m. in 24 |
the library; and by appointment 25 |
513. 238. 8031 (C) 26 |
27 |
28 |
Professor Kendra Leonard 29 |
kleonard@rider.edu 30 |
31 |
Class meeting: TTHF 4:30- 6:00 p.m. 32 |
33 |
Course Objectives 34 |
1. To provide students with an understanding of the role of women as composers, performers, and 35 |
patrons of music. 36 |
2. To introduce students to major female composers. 37 |
3. To explore works and other contributions of female composers and performers. 38 |
4. To encourage students to think critically about reading works, criticism, and practices of women in 39 |
music as a way of better understanding those works. 40 |
41 |
Email 42 |
43 |
Your Rider email account is your email address for all official email communications from the 44 |
University. You are expected to check your Rider email account on a frequent and consistent basis in 45 |
order to stay current with University-related communications. Any email from me about this course 46 |
will only be sent to your official Rider email address. Any communication from you to me must come 47 |
from your Rider email address, must contain the course name in the “Subject” line, and must use 48 |
proper spelling and capitalization. I try to respond to all emails within 24 hours. 49 |
50 |
Academic Code of Conduct 51 |
52 |
submission of academic work. In all written work, whether in class or out of class, the student’s name 53 |
on the work is considered to be a statement that the work is his or hers alone, except as otherwise 54 |
indicated. Students are expected to provide proper citations for the statements and ideas of others 55 |
whether submitted word for word or paraphrased. Failure to provide proper citations will be considered 56 |
plagiarism and offenders will be subject to the charge of plagiarism specified in the statement of 57 |
regulations. 58 |
Academic dishonesty includes any unauthorized collaboration or misrepresentation in the 59 |
Similarly, students are expected to adhere to all regulations pertaining to examination conduct. 60 |
These regulations are designed to insure that the work submitted by the student on examinations is an 61 |
honest representation of that student’s effort and that it does not involve unauthorized collaboration, 62 |
unauthorized use of notes during the exam, or unauthorized access to prior information about the 63 |
examination. 64 |
In this course, the first instance of plagiarism will result in a grade of 0 for the assignment, and 65 |
a report will be sent to the dean; a second instance of plagiarism will result in an F for the course, and 66 |
charges of academic dishonesty will be brought to the Academic Integrity Committee. See
The Source, 67 |
pages 10-16, for full information on the academic code of conduct. 68 |
69 |
Required Text/Materials 70 |
Pendle, Karin. Women & Music. Bloomington: Indiana University Press, 2001. 71 |
Additional readings listed are posted on Blackboard as PDFs. 72 |
We will also watch some films and clips in class.
73 |
74 |
Style Manual 75 |
76 |
Page 2
77 |
Chicago Manual of Style, 15th ed. Chicago: University of Chicago Press, 2003. 78 |
79 |
Technology Requirements 80 |
This course is on-line on Blackboard. The web address is Blackboard.rider.edu. You will need 81 |
to have regular access to the internet and a word-processing program to complete many elements of this 82 |
course. Students who do not have this access at home will need to scheduled time to do so either at the 83 |
library or at other campus computing locations. Always back up your work, whether on a flash drive, 84 |
via email, or through an online back-up service such as Mozy or Google Documents. Assignments will 85 |
not be accepted late because of computer or printer problems. 86 |
In this course, we will be building a digital exhibit of women in music at the website 87 |
http://wccwomeninmusic.omeka.net/. All students will have access to the site via a login and password 88 |
I will provide on the first day of class. If you have trouble accessing the site, logging in, or other 89 |
problems, it is your responsibility to contact me right away so we can get it fixed. 90 |
We will be blogging throughout this course. You can sign up for a free blog at Blogger or 91 |
If you have special needs that will affect performance in this class, such as a documented 92 |
Attendance is expected. Attending class will help you learn the material and be better prepared 93 |
Wordpress. You must send me the url of your blog no later than 5 p.m. January 27. The blog must be 94 |
open to all other students, although you are free to use a pseudonym. The last day to post blog posts is 95 |
April 7. 96 |
97 |
Attendance 98 |
99 |
for assignments. If you miss class, it is your responsibility to get notes from a classmate and be 100 |
prepared for the next class. Unexcused and undocumented absences will affect your participation grade. 101 |
If you miss class because of an illness, I will need a doctor’s note. 102 |
103 |
Students with Disabilities 104 |
105 |
learning disability, please provide me with your university documentation during the first week of class 106 |
or as soon as you are documented. If you think you might have such needs, but have no documentation, 107 |
please contact the Academic Student Services office in the basement of Taylor. 108 |
109 |
Assignment Policies 110 |
111 |
Assignments turned in after 4:40 p.m. will not be accepted. 112 |
113 |
Assignments and Evaluation 114 |
Participation: 20% 115 |
116 |
reading and to be able to discuss it in class. I will keep track of your participation. Missing class will 117 |
affect your participation grade. 118 |
Blogging: 15% 119 |
Assignments are due before or at the beginning of class (4:30 p.m.) on the day specified. 120 |
Participation is expected in every class. You are expected to have completed the assigned 121 |
You will be keeping a blog of your thoughts and comments, such as reactions to the readings, 122 |
drafts and ideas for paper topics, and other relevant thoughts and links, over the course of the semester. 123 |
To get full credit, you need to post at least one post of at least 250 words per week. The last day to post 124 |
blog posts for credit is April 7. 125 |
Omeka exhibit items: 25% 126 |
127 |
two audio/video items worth 5% each; and two written items, such as short essays, a review of a 128 |
recording, film, book, or similar item of no fewer than 1000 words, worth 5% each to the exhibit over 129 |
the course of the term. The last day to post items to the site is April 7. 130 |
Each student will post two visual items (such as photos or other visual artwork) worth 5% each; 131 |
132 |
Page 3
133 |
134 |
Final Project: 40% 135 |
136 |
of women in music you find particularly interesting. 137 |
138 |
Your final project will be a 15-minute in-class presentation and 8-10 page paper on any aspect 139 |
Elements of the final project 140 |
A proposal for your paper is due on March 1. The proposal should be a 1-page statement and 141 |
description of what you want to research for your paper and presentation. You will need to explain what 142 |
about the topic is appealing to you, and provide a general outline of the paper. The proposal is worth 143 |
5% of your final grade. 144 |
An annotated bibliography of no fewer than eight scholarly sources is due March 22. The 145 |
annotated bibliography should be in Chicago Manual of Style format. Each bibliographical entry 146 |
should include a description of the source, including its intended audience and why you think it will be 147 |
helpful for your paper. The annotated bibliography is worth 5% of your final grade. 148 |
The final paper is due April 19. Your paper will be 8-10 pages long, not including the 149 |
bibliography. Please format it according to CMS guidelines. The final paper is worth 20% of your final 150 |
grade. 151 |
Presentations will take place in class during the last week of class. You should prepare a 15- 152 |
minute presentation of your research, using audio/visual materials as appropriate. In-class 153 |
performances are encouraged, as are creative approaches. The presentation is worth 10% of your final 154 |
grade. 155 |
156 |
Your work reflects directly on you: strive for a professional appearance and clear, well-written 157 |
prose in your assignments. Spelling and grammar errors will count against you; always spell-check and 158 |
proofread your work prior to posting it or turning it in. All assignments must be submitted via email 159 |
(
not Blackboard dropbox or in hard copy) as .doc or .docx attachments and should be double-spaced 160 |
and single-sided. Please use Times New Roman font in 12 point type in black ink for all assignments. 161 |
Margins should not be more than 1 inch. Include your full name and the course name and number in the 162 |
upper left hand corner of each assignment. 163 |
164 |
Classroom Etiquette 165 |
166 |
asked to do so in class). Please do not eat or drink anything really odiferous (pickled herring, kimchee, 167 |
rotten bananas, etc.) in class. 168 |
169 |
Please do not use cell phones during class for calls, texting, or accessing the internet (unless 170 |
(cid:1)(cid:1)(cid:1) 171 |
172 |
Course Schedule 173 |
All chapters refer to Pendle, Women in Music. Other readings are listed on Blackboard by author name. 174 |
175 |
January 25: Introductions 176 |
January 27: Preface and Chapter 1: Feminist Aesthetics 177 |
178 |
Feb 1: Chapter 2: Women and Music in Ancient Greece and Rome 179 |
Feb 3: Chapter 3: Women in Music to ca. 1450;
Vision (Hildegard of Bingen film, partial, if available) 180 |
181 |
Feb 8: “Ful weel she soong the service dyvyne’: The Cloistered Musician in the Middle Ages” in 182 |
Women Making Music (Yardley) and “Jougleresses and Trobairitz: Secular Musicians in Medieval 183 |
France” (Coldwell) 184 |
Feb 10: Chapter 4: Musical Women in Early Modern Europe 185 |
186 |
Page 4
187 |
188 |
Feb 15: Chapter 5: Musical Women of the 17
th and 18th Centuries 189 |
Feb 17: “Courtesans, Muses, or Musicians? Professional Women Musicians in Sixteenth-Century 190 |
Italy,” in
Women Making Music (Newcomb) 191 |
192 |
Feb 22: Chapter 6: European Composers and Musicians, ca. 1800-1890 193 |
Feb 24: Chapter 7: European Composers and Musicians, 1880-1918 194 |
195 |
Mar 1: Chapter 8: Women in American Music, 1800-1918 (paper proposals due) 196 |
Mar 3: Chapter 11: North America Since 1920 197 |
198 |
Mar 8 Kiya Heartwood visit (and catch-up day in case of snow cancellations) 199 |
Mar 10 No class—instructor at SAM meeting 200 |
201 |
Mar 15 Spring Break 202 |
Mar 17 Spring Break 203 |
204 |
Mar 22: Chapter 9: Contemporary British Composers and “‘Shout, Shout, Up with Your Song!’ Dame 205 |
Ethel Smyth and the Changing Role of the British Composer” (Bernstein) (annotated bibliographies 206 |
due) 207 |
Mar 24: Chapter 10: Composers of Modern Europe, Israel, Australia, and New Zealand 208 |
209 |
Mar 29: Chapter 12: American Popular Music 210 |
Mar 31: in-class viewing: Lilith Fair movie 211 |
212 |
Apr 5: Intro to Women in the World of Music and Chapter 13: Women and Music around the 213 |
Mediterranean 214 |
Apr 7: Video TBD (
Slingshot Hip Hop, if available, or Jericho's Echo: Punk Rock in the Holy Land) 215 |
216 |
Apr 12: Chapter 14: Women in the World of Music: Latin America, Native America, and the African 217 |
Diaspora 218 |
Apr 14: Chapter 15: American Women in Blues and Jazz; in-class viewing:
Lady Sings the Blues 219 |
(partial) 220 |
221 |
Apr 19: in-class viewing:
Girls Rock (final papers due) 222 |
Apr 21: Chapter 16: Women’s Support and Encouragement of Women and Musicians 223 |
224 |
Apr 26: in-class presentations 225 |
Apr 28: in-class presentations 226 |
227 |
228 |
Page: 1, 2, 3, 4
229 | 230 | -------------------------------------------------------------------------------- /extractor_research/visualize/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xpmethod/opensyllabus/693e1304e2293515ff2817a86778ea6dde165515/extractor_research/visualize/__init__.py -------------------------------------------------------------------------------- /extractor_research/visualize/html_parser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | from bs4 import BeautifulSoup 3 | import re 4 | 5 | def soupify(html_file): 6 | return BeautifulSoup(html_file) 7 | 8 | def pretty_print(soup): 9 | print(soup.prettify()) 10 | 11 | def find_attribute(soup, attr, search_str): 12 | return soup.find_all(attrs = {attr : re.compile(search_str)}) 13 | 14 | if __name__ == '__main__': 15 | with open('../output/Leonard_Women_Music_syllabus.html') as file: 16 | soup = soupify(file) 17 | pretty_print(soup) 18 | elements = find_attribute(soup, 'style', r'Bold') 19 | for element in elements: 20 | print element -------------------------------------------------------------------------------- /gpl-3.0.md: -------------------------------------------------------------------------------- 1 | # GNU GENERAL PUBLIC LICENSE 2 | 3 | Version 3, 29 June 2007 4 | 5 | Copyright © 2007 Free Software Foundation, Inc. 6 | 7 | Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. 8 | 9 | ## Preamble 10 | 11 | The GNU General Public License is a free, copyleft license for software and other kinds of works. 12 | 13 | The licenses for most software and other practical works are designed to take away your freedom to share and change the works. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change all versions of a program--to make sure it remains free software for all its users. We, the Free Software Foundation, use the GNU General Public License for most of our software; it applies also to any other work released this way by its authors. You can apply it to your programs, too. 14 | 15 | When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for them if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs, and that you know you can do these things. 16 | 17 | To protect your rights, we need to prevent others from denying you these rights or asking you to surrender the rights. Therefore, you have certain responsibilities if you distribute copies of the software, or if you modify it: responsibilities to respect the freedom of others. 18 | 19 | For example, if you distribute copies of such a program, whether gratis or for a fee, you must pass on to the recipients the same freedoms that you received. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. 20 | 21 | Developers that use the GNU GPL protect your rights with two steps: (1) assert copyright on the software, and (2) offer you this License giving you legal permission to copy, distribute and/or modify it. 22 | 23 | For the developers' and authors' protection, the GPL clearly explains that there is no warranty for this free software. For both users' and authors' sake, the GPL requires that modified versions be marked as changed, so that their problems will not be attributed erroneously to authors of previous versions. 24 | 25 | Some devices are designed to deny users access to install or run modified versions of the software inside them, although the manufacturer can do so. This is fundamentally incompatible with the aim of protecting users' freedom to change the software. The systematic pattern of such abuse occurs in the area of products for individuals to use, which is precisely where it is most unacceptable. Therefore, we have designed this version of the GPL to prohibit the practice for those products. If such problems arise substantially in other domains, we stand ready to extend this provision to those domains in future versions of the GPL, as needed to protect the freedom of users. 26 | 27 | Finally, every program is threatened constantly by software patents. States should not allow patents to restrict development and use of software on general-purpose computers, but in those that do, we wish to avoid the special danger that patents applied to a free program could make it effectively proprietary. To prevent this, the GPL assures that patents cannot be used to render the program non-free. 28 | 29 | The precise terms and conditions for copying, distribution and modification follow. 30 | 31 | TERMS AND CONDITIONS 32 | 33 | ###0. Definitions. 34 | “This License” refers to version 3 of the GNU General Public License. 35 | 36 | “Copyright” also means copyright-like laws that apply to other kinds of works, such as semiconductor masks. 37 | 38 | “The Program” refers to any copyrightable work licensed under this License. Each licensee is addressed as “you”. “Licensees” and “recipients” may be individuals or organizations. 39 | 40 | To “modify” a work means to copy from or adapt all or part of the work in a fashion requiring copyright permission, other than the making of an exact copy. The resulting work is called a “modified version” of the earlier work or a work “based on” the earlier work. 41 | 42 | A “covered work” means either the unmodified Program or a work based on the Program. 43 | 44 | To “propagate” a work means to do anything with it that, without permission, would make you directly or secondarily liable for infringement under applicable copyright law, except executing it on a computer or modifying a private copy. Propagation includes copying, distribution (with or without modification), making available to the public, and in some countries other activities as well. 45 | 46 | To “convey” a work means any kind of propagation that enables other parties to make or receive copies. Mere interaction with a user through a computer network, with no transfer of a copy, is not conveying. 47 | 48 | An interactive user interface displays “Appropriate Legal Notices” to the extent that it includes a convenient and prominently visible feature that (1) displays an appropriate copyright notice, and (2) tells the user that there is no warranty for the work (except to the extent that warranties are provided), that licensees may convey the work under this License, and how to view a copy of this License. If the interface presents a list of user commands or options, such as a menu, a prominent item in the list meets this criterion. 49 | 50 | ###1. Source Code. 51 | The “source code” for a work means the preferred form of the work for making modifications to it. “Object code” means any non-source form of a work. 52 | 53 | A “Standard Interface” means an interface that either is an official standard defined by a recognized standards body, or, in the case of interfaces specified for a particular programming language, one that is widely used among developers working in that language. 54 | 55 | The “System Libraries” of an executable work include anything, other than the work as a whole, that (a) is included in the normal form of packaging a Major Component, but which is not part of that Major Component, and (b) serves only to enable use of the work with that Major Component, or to implement a Standard Interface for which an implementation is available to the public in source code form. A “Major Component”, in this context, means a major essential component (kernel, window system, and so on) of the specific operating system (if any) on which the executable work runs, or a compiler used to produce the work, or an object code interpreter used to run it. 56 | 57 | The “Corresponding Source” for a work in object code form means all the source code needed to generate, install, and (for an executable work) run the object code and to modify the work, including scripts to control those activities. However, it does not include the work's System Libraries, or general-purpose tools or generally available free programs which are used unmodified in performing those activities but which are not part of the work. For example, Corresponding Source includes interface definition files associated with source files for the work, and the source code for shared libraries and dynamically linked subprograms that the work is specifically designed to require, such as by intimate data communication or control flow between those subprograms and other parts of the work. 58 | 59 | The Corresponding Source need not include anything that users can regenerate automatically from other parts of the Corresponding Source. 60 | 61 | The Corresponding Source for a work in source code form is that same work. 62 | 63 | ###2. Basic Permissions. 64 | All rights granted under this License are granted for the term of copyright on the Program, and are irrevocable provided the stated conditions are met. This License explicitly affirms your unlimited permission to run the unmodified Program. The output from running a covered work is covered by this License only if the output, given its content, constitutes a covered work. This License acknowledges your rights of fair use or other equivalent, as provided by copyright law. 65 | 66 | You may make, run and propagate covered works that you do not convey, without conditions so long as your license otherwise remains in force. You may convey covered works to others for the sole purpose of having them make modifications exclusively for you, or provide you with facilities for running those works, provided that you comply with the terms of this License in conveying all material for which you do not control copyright. Those thus making or running the covered works for you must do so exclusively on your behalf, under your direction and control, on terms that prohibit them from making any copies of your copyrighted material outside their relationship with you. 67 | 68 | Conveying under any other circumstances is permitted solely under the conditions stated below. Sublicensing is not allowed; section 10 makes it unnecessary. 69 | 70 | ###3. Protecting Users' Legal Rights From Anti-Circumvention Law. 71 | No covered work shall be deemed part of an effective technological measure under any applicable law fulfilling obligations under article 11 of the WIPO copyright treaty adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention of such measures. 72 | 73 | When you convey a covered work, you waive any legal power to forbid circumvention of technological measures to the extent such circumvention is effected by exercising rights under this License with respect to the covered work, and you disclaim any intention to limit operation or modification of the work as a means of enforcing, against the work's users, your or third parties' legal rights to forbid circumvention of technological measures. 74 | 75 | ###4. Conveying Verbatim Copies. 76 | You may convey verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice; keep intact all notices stating that this License and any non-permissive terms added in accord with section 7 apply to the code; keep intact all notices of the absence of any warranty; and give all recipients a copy of this License along with the Program. 77 | 78 | You may charge any price or no price for each copy that you convey, and you may offer support or warranty protection for a fee. 79 | 80 | ###5. Conveying Modified Source Versions. 81 | You may convey a work based on the Program, or the modifications to produce it from the Program, in the form of source code under the terms of section 4, provided that you also meet all of these conditions: 82 | 83 | a) The work must carry prominent notices stating that you modified it, and giving a relevant date. 84 | b) The work must carry prominent notices stating that it is released under this License and any conditions added under section 7. This requirement modifies the requirement in section 4 to “keep intact all notices”. 85 | c) You must license the entire work, as a whole, under this License to anyone who comes into possession of a copy. This License will therefore apply, along with any applicable section 7 additional terms, to the whole of the work, and all its parts, regardless of how they are packaged. This License gives no permission to license the work in any other way, but it does not invalidate such permission if you have separately received it. 86 | d) If the work has interactive user interfaces, each must display Appropriate Legal Notices; however, if the Program has interactive interfaces that do not display Appropriate Legal Notices, your work need not make them do so. 87 | A compilation of a covered work with other separate and independent works, which are not by their nature extensions of the covered work, and which are not combined with it such as to form a larger program, in or on a volume of a storage or distribution medium, is called an “aggregate” if the compilation and its resulting copyright are not used to limit the access or legal rights of the compilation's users beyond what the individual works permit. Inclusion of a covered work in an aggregate does not cause this License to apply to the other parts of the aggregate. 88 | 89 | ###6. Conveying Non-Source Forms. 90 | You may convey a covered work in object code form under the terms of sections 4 and 5, provided that you also convey the machine-readable Corresponding Source under the terms of this License, in one of these ways: 91 | 92 | a) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by the Corresponding Source fixed on a durable physical medium customarily used for software interchange. 93 | b) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by a written offer, valid for at least three years and valid for as long as you offer spare parts or customer support for that product model, to give anyone who possesses the object code either (1) a copy of the Corresponding Source for all the software in the product that is covered by this License, on a durable physical medium customarily used for software interchange, for a price no more than your reasonable cost of physically performing this conveying of source, or (2) access to copy the Corresponding Source from a network server at no charge. 94 | c) Convey individual copies of the object code with a copy of the written offer to provide the Corresponding Source. This alternative is allowed only occasionally and noncommercially, and only if you received the object code with such an offer, in accord with subsection 6b. 95 | d) Convey the object code by offering access from a designated place (gratis or for a charge), and offer equivalent access to the Corresponding Source in the same way through the same place at no further charge. You need not require recipients to copy the Corresponding Source along with the object code. If the place to copy the object code is a network server, the Corresponding Source may be on a different server (operated by you or a third party) that supports equivalent copying facilities, provided you maintain clear directions next to the object code saying where to find the Corresponding Source. Regardless of what server hosts the Corresponding Source, you remain obligated to ensure that it is available for as long as needed to satisfy these requirements. 96 | e) Convey the object code using peer-to-peer transmission, provided you inform other peers where the object code and Corresponding Source of the work are being offered to the general public at no charge under subsection 6d. 97 | A separable portion of the object code, whose source code is excluded from the Corresponding Source as a System Library, need not be included in conveying the object code work. 98 | 99 | A “User Product” is either (1) a “consumer product”, which means any tangible personal property which is normally used for personal, family, or household purposes, or (2) anything designed or sold for incorporation into a dwelling. In determining whether a product is a consumer product, doubtful cases shall be resolved in favor of coverage. For a particular product received by a particular user, “normally used” refers to a typical or common use of that class of product, regardless of the status of the particular user or of the way in which the particular user actually uses, or expects or is expected to use, the product. A product is a consumer product regardless of whether the product has substantial commercial, industrial or non-consumer uses, unless such uses represent the only significant mode of use of the product. 100 | 101 | “Installation Information” for a User Product means any methods, procedures, authorization keys, or other information required to install and execute modified versions of a covered work in that User Product from a modified version of its Corresponding Source. The information must suffice to ensure that the continued functioning of the modified object code is in no case prevented or interfered with solely because modification has been made. 102 | 103 | If you convey an object code work under this section in, or with, or specifically for use in, a User Product, and the conveying occurs as part of a transaction in which the right of possession and use of the User Product is transferred to the recipient in perpetuity or for a fixed term (regardless of how the transaction is characterized), the Corresponding Source conveyed under this section must be accompanied by the Installation Information. But this requirement does not apply if neither you nor any third party retains the ability to install modified object code on the User Product (for example, the work has been installed in ROM). 104 | 105 | The requirement to provide Installation Information does not include a requirement to continue to provide support service, warranty, or updates for a work that has been modified or installed by the recipient, or for the User Product in which it has been modified or installed. Access to a network may be denied when the modification itself materially and adversely affects the operation of the network or violates the rules and protocols for communication across the network. 106 | 107 | Corresponding Source conveyed, and Installation Information provided, in accord with this section must be in a format that is publicly documented (and with an implementation available to the public in source code form), and must require no special password or key for unpacking, reading or copying. 108 | 109 | ###7. Additional Terms. 110 | “Additional permissions” are terms that supplement the terms of this License by making exceptions from one or more of its conditions. Additional permissions that are applicable to the entire Program shall be treated as though they were included in this License, to the extent that they are valid under applicable law. If additional permissions apply only to part of the Program, that part may be used separately under those permissions, but the entire Program remains governed by this License without regard to the additional permissions. 111 | 112 | When you convey a copy of a covered work, you may at your option remove any additional permissions from that copy, or from any part of it. (Additional permissions may be written to require their own removal in certain cases when you modify the work.) You may place additional permissions on material, added by you to a covered work, for which you have or can give appropriate copyright permission. 113 | 114 | Notwithstanding any other provision of this License, for material you add to a covered work, you may (if authorized by the copyright holders of that material) supplement the terms of this License with terms: 115 | 116 | a) Disclaiming warranty or limiting liability differently from the terms of sections 15 and 16 of this License; or 117 | b) Requiring preservation of specified reasonable legal notices or author attributions in that material or in the Appropriate Legal Notices displayed by works containing it; or 118 | c) Prohibiting misrepresentation of the origin of that material, or requiring that modified versions of such material be marked in reasonable ways as different from the original version; or 119 | d) Limiting the use for publicity purposes of names of licensors or authors of the material; or 120 | e) Declining to grant rights under trademark law for use of some trade names, trademarks, or service marks; or 121 | f) Requiring indemnification of licensors and authors of that material by anyone who conveys the material (or modified versions of it) with contractual assumptions of liability to the recipient, for any liability that these contractual assumptions directly impose on those licensors and authors. 122 | All other non-permissive additional terms are considered “further restrictions” within the meaning of section 10. If the Program as you received it, or any part of it, contains a notice stating that it is governed by this License along with a term that is a further restriction, you may remove that term. If a license document contains a further restriction but permits relicensing or conveying under this License, you may add to a covered work material governed by the terms of that license document, provided that the further restriction does not survive such relicensing or conveying. 123 | 124 | If you add terms to a covered work in accord with this section, you must place, in the relevant source files, a statement of the additional terms that apply to those files, or a notice indicating where to find the applicable terms. 125 | 126 | Additional terms, permissive or non-permissive, may be stated in the form of a separately written license, or stated as exceptions; the above requirements apply either way. 127 | 128 | ###8. Termination. 129 | You may not propagate or modify a covered work except as expressly provided under this License. Any attempt otherwise to propagate or modify it is void, and will automatically terminate your rights under this License (including any patent licenses granted under the third paragraph of section 11). 130 | 131 | However, if you cease all violation of this License, then your license from a particular copyright holder is reinstated (a) provisionally, unless and until the copyright holder explicitly and finally terminates your license, and (b) permanently, if the copyright holder fails to notify you of the violation by some reasonable means prior to 60 days after the cessation. 132 | 133 | Moreover, your license from a particular copyright holder is reinstated permanently if the copyright holder notifies you of the violation by some reasonable means, this is the first time you have received notice of violation of this License (for any work) from that copyright holder, and you cure the violation prior to 30 days after your receipt of the notice. 134 | 135 | Termination of your rights under this section does not terminate the licenses of parties who have received copies or rights from you under this License. If your rights have been terminated and not permanently reinstated, you do not qualify to receive new licenses for the same material under section 10. 136 | 137 | ###9. Acceptance Not Required for Having Copies. 138 | You are not required to accept this License in order to receive or run a copy of the Program. Ancillary propagation of a covered work occurring solely as a consequence of using peer-to-peer transmission to receive a copy likewise does not require acceptance. However, nothing other than this License grants you permission to propagate or modify any covered work. These actions infringe copyright if you do not accept this License. Therefore, by modifying or propagating a covered work, you indicate your acceptance of this License to do so. 139 | 140 | ###10. Automatic Licensing of Downstream Recipients. 141 | Each time you convey a covered work, the recipient automatically receives a license from the original licensors, to run, modify and propagate that work, subject to this License. You are not responsible for enforcing compliance by third parties with this License. 142 | 143 | An “entity transaction” is a transaction transferring control of an organization, or substantially all assets of one, or subdividing an organization, or merging organizations. If propagation of a covered work results from an entity transaction, each party to that transaction who receives a copy of the work also receives whatever licenses to the work the party's predecessor in interest had or could give under the previous paragraph, plus a right to possession of the Corresponding Source of the work from the predecessor in interest, if the predecessor has it or can get it with reasonable efforts. 144 | 145 | You may not impose any further restrictions on the exercise of the rights granted or affirmed under this License. For example, you may not impose a license fee, royalty, or other charge for exercise of rights granted under this License, and you may not initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging that any patent claim is infringed by making, using, selling, offering for sale, or importing the Program or any portion of it. 146 | 147 | ###11. Patents. 148 | A “contributor” is a copyright holder who authorizes use under this License of the Program or a work on which the Program is based. The work thus licensed is called the contributor's “contributor version”. 149 | 150 | A contributor's “essential patent claims” are all patent claims owned or controlled by the contributor, whether already acquired or hereafter acquired, that would be infringed by some manner, permitted by this License, of making, using, or selling its contributor version, but do not include claims that would be infringed only as a consequence of further modification of the contributor version. For purposes of this definition, “control” includes the right to grant patent sublicenses in a manner consistent with the requirements of this License. 151 | 152 | Each contributor grants you a non-exclusive, worldwide, royalty-free patent license under the contributor's essential patent claims, to make, use, sell, offer for sale, import and otherwise run, modify and propagate the contents of its contributor version. 153 | 154 | In the following three paragraphs, a “patent license” is any express agreement or commitment, however denominated, not to enforce a patent (such as an express permission to practice a patent or covenant not to sue for patent infringement). To “grant” such a patent license to a party means to make such an agreement or commitment not to enforce a patent against the party. 155 | 156 | If you convey a covered work, knowingly relying on a patent license, and the Corresponding Source of the work is not available for anyone to copy, free of charge and under the terms of this License, through a publicly available network server or other readily accessible means, then you must either (1) cause the Corresponding Source to be so available, or (2) arrange to deprive yourself of the benefit of the patent license for this particular work, or (3) arrange, in a manner consistent with the requirements of this License, to extend the patent license to downstream recipients. “Knowingly relying” means you have actual knowledge that, but for the patent license, your conveying the covered work in a country, or your recipient's use of the covered work in a country, would infringe one or more identifiable patents in that country that you have reason to believe are valid. 157 | 158 | If, pursuant to or in connection with a single transaction or arrangement, you convey, or propagate by procuring conveyance of, a covered work, and grant a patent license to some of the parties receiving the covered work authorizing them to use, propagate, modify or convey a specific copy of the covered work, then the patent license you grant is automatically extended to all recipients of the covered work and works based on it. 159 | 160 | A patent license is “discriminatory” if it does not include within the scope of its coverage, prohibits the exercise of, or is conditioned on the non-exercise of one or more of the rights that are specifically granted under this License. You may not convey a covered work if you are a party to an arrangement with a third party that is in the business of distributing software, under which you make payment to the third party based on the extent of your activity of conveying the work, and under which the third party grants, to any of the parties who would receive the covered work from you, a discriminatory patent license (a) in connection with copies of the covered work conveyed by you (or copies made from those copies), or (b) primarily for and in connection with specific products or compilations that contain the covered work, unless you entered into that arrangement, or that patent license was granted, prior to 28 March 2007. 161 | 162 | Nothing in this License shall be construed as excluding or limiting any implied license or other defenses to infringement that may otherwise be available to you under applicable patent law. 163 | 164 | ###12. No Surrender of Others' Freedom. 165 | If conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot convey a covered work so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not convey it at all. For example, if you agree to terms that obligate you to collect a royalty for further conveying from those to whom you convey the Program, the only way you could satisfy both those terms and this License would be to refrain entirely from conveying the Program. 166 | 167 | ###13. Use with the GNU Affero General Public License. 168 | Notwithstanding any other provision of this License, you have permission to link or combine any covered work with a work licensed under version 3 of the GNU Affero General Public License into a single combined work, and to convey the resulting work. The terms of this License will continue to apply to the part which is the covered work, but the special requirements of the GNU Affero General Public License, section 13, concerning interaction through a network will apply to the combination as such. 169 | 170 | ###14. Revised Versions of this License. 171 | The Free Software Foundation may publish revised and/or new versions of the GNU General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. 172 | 173 | Each version is given a distinguishing version number. If the Program specifies that a certain numbered version of the GNU General Public License “or any later version” applies to it, you have the option of following the terms and conditions either of that numbered version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of the GNU General Public License, you may choose any version ever published by the Free Software Foundation. 174 | 175 | If the Program specifies that a proxy can decide which future versions of the GNU General Public License can be used, that proxy's public statement of acceptance of a version permanently authorizes you to choose that version for the Program. 176 | 177 | Later license versions may give you additional or different permissions. However, no additional obligations are imposed on any author or copyright holder as a result of your choosing to follow a later version. 178 | 179 | ###15. Disclaimer of Warranty. 180 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM “AS IS” WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 181 | 182 | ###16. Limitation of Liability. 183 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 184 | 185 | ###17. Interpretation of Sections 15 and 16. 186 | If the disclaimer of warranty and limitation of liability provided above cannot be given local legal effect according to their terms, reviewing courts shall apply local law that most closely approximates an absolute waiver of all civil liability in connection with the Program, unless a warranty or assumption of liability accompanies a copy of the Program in return for a fee. 187 | 188 | END OF TERMS AND CONDITIONS 189 | 190 | How to Apply These Terms to Your New Programs 191 | 192 | If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. 193 | 194 | To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively state the exclusion of warranty; and each file should have at least the “copyright” line and a pointer to where the full notice is found. 195 | 196 | 197 | Copyright (C) 198 | 199 | This program is free software: you can redistribute it and/or modify 200 | it under the terms of the GNU General Public License as published by 201 | the Free Software Foundation, either version 3 of the License, or 202 | (at your option) any later version. 203 | 204 | This program is distributed in the hope that it will be useful, 205 | but WITHOUT ANY WARRANTY; without even the implied warranty of 206 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 207 | GNU General Public License for more details. 208 | 209 | You should have received a copy of the GNU General Public License 210 | along with this program. If not, see . 211 | Also add information on how to contact you by electronic and paper mail. 212 | 213 | If the program does terminal interaction, make it output a short notice like this when it starts in an interactive mode: 214 | 215 | Copyright (C) 216 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 217 | This is free software, and you are welcome to redistribute it 218 | under certain conditions; type `show c' for details. 219 | The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, your program's commands might be different; for a GUI interface, you would use an “about box”. 220 | 221 | You should also get your employer (if you work as a programmer) or school, if any, to sign a “copyright disclaimer” for the program, if necessary. For more information on this, and how to apply and follow the GNU GPL, see . 222 | 223 | The GNU General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License. But first, please read . 224 | -------------------------------------------------------------------------------- /opensyllabus/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xpmethod/opensyllabus/693e1304e2293515ff2817a86778ea6dde165515/opensyllabus/__init__.py -------------------------------------------------------------------------------- /opensyllabus/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf8 -*- 2 | """ 3 | Author: Maxim Kosinov 4 | Specialization: Python, HighLoad Crawlers, Data Mining, Scraping 5 | E-Mail: astrey.labs@gmail.com 6 | Skype: geminiozz 7 | O-Desk: Astrey 8 | """ 9 | 10 | 11 | DATA_DIR = '/mnt/osp-archive-mount/document-dump' 12 | TMP_DIR = '/mnt/osp-archive-mount/document-dump/code/opensyllabus/_tmp' 13 | 14 | # log config 15 | LOG_TO_FILE = True 16 | FILE_LOG_VERBOSITY = 'debug' 17 | CONSOLE_LOG_VERBOSITY = 'debug' 18 | INGESTION_LOG_FILE = '/mnt/osp-archive-mount/document-dump/code/opensyllabus/_logs/ingestion.log' 19 | GETEMPTY_LOG_FILE = '/mnt/osp-archive-mount/document-dump/code/opensyllabus/_logs/get_empty.log' 20 | 21 | # MongoDB configurations 22 | MONGODB_HOST = 'localhost' 23 | MONGODB_PORT = 27017 24 | MONGODB_USE_AUTH = True 25 | MONGODB_USER = 'script' 26 | MONGODB_PASSWORD = 'c*;,(yHfmz4J&Ap' 27 | 28 | PROCESS_REPORT_COUNT = 500 29 | THREADS_COUNT = 10 30 | 31 | try: 32 | from opensyllabus.local_config import * 33 | except ImportError: 34 | pass -------------------------------------------------------------------------------- /opensyllabus/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xpmethod/opensyllabus/693e1304e2293515ff2817a86778ea6dde165515/opensyllabus/core/__init__.py -------------------------------------------------------------------------------- /opensyllabus/core/extractor.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf8 -*- 2 | """ 3 | Author: Maxim Kosinov 4 | Specialization: Python, HighLoad Crawlers, Data Mining, Scraping 5 | E-Mail: astrey.labs@gmail.com 6 | Skype: geminiozz 7 | O-Desk: Astrey 8 | """ 9 | 10 | import os 11 | import re 12 | import subprocess 13 | 14 | from lxml import html 15 | from lxml.html.clean import Cleaner 16 | from pyPdf import PdfFileReader 17 | from docx import opendocx, getdocumenttext 18 | 19 | from opensyllabus.config import DATA_DIR 20 | from opensyllabus.core.utils import clean_list, get_data_files, get_file_ext 21 | from opensyllabus.core.ocr import OpenSyllabusOCR 22 | 23 | 24 | class TextExtractor(object): 25 | """ 26 | Class for extracting text data from pdf, doc/docx, html and other 27 | """ 28 | 29 | def __init__(self, log): 30 | self.log = log 31 | self.ocr = OpenSyllabusOCR(log) 32 | 33 | 34 | def pdf(self, path): 35 | """ 36 | Method for extracting text data from pdf files 37 | Input: full path to pdf file 38 | Output: extracted text 39 | """ 40 | try: 41 | pdf = PdfFileReader(file(os.path.join(DATA_DIR, path), 'rb')) 42 | text = '\n'.join([page.extractText() for page in pdf.pages]) 43 | except Exception as e: 44 | self.log.exception(e) 45 | else: 46 | if not re.sub('[\n]+', '', text): 47 | return self.ocr.extract(os.path.join(DATA_DIR, path)) 48 | return text 49 | 50 | 51 | def doc(self, path): 52 | """ 53 | Method for extracting text data from doc files 54 | Input: full path to doc file 55 | Output: extracted text 56 | """ 57 | try: 58 | p = subprocess.Popen(['antiword', os.path.join(DATA_DIR, path)], stdout=subprocess.PIPE) 59 | text = p.communicate()[0] 60 | except Exception as e: 61 | self.log.exception(e) 62 | else: 63 | return text 64 | 65 | 66 | def docx(self, path): 67 | """ 68 | Method for extracting text data from docx files 69 | Input: full path to docx file 70 | Output: extracted text 71 | """ 72 | try: 73 | docx = opendocx(os.path.join(DATA_DIR, path)) 74 | text = '\n'.join([page for page in getdocumenttext(docx)]) 75 | except Exception as e: 76 | self.log.exception(e) 77 | else: 78 | return text 79 | 80 | 81 | def htm(self, path): 82 | """ 83 | Method for extracting text data from htm files 84 | Input: full path to htm file 85 | Output: extracted text 86 | """ 87 | return self.html(path) 88 | 89 | 90 | def html(self, path): 91 | """ 92 | Method for extracting text data from html files 93 | Input: full path to html file 94 | Output: extracted text 95 | """ 96 | try: 97 | fh = open(os.path.join(DATA_DIR, path), 'r') 98 | etree = html.fromstring(Cleaner(style=True).clean_html(fh.read())) 99 | text = ' '.join(clean_list(etree.xpath('//text()'))) 100 | except Exception as e: 101 | self.log.exception(e) 102 | else: 103 | return text 104 | 105 | -------------------------------------------------------------------------------- /opensyllabus/core/ingestion.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf8 -*- 2 | """ 3 | Author: Maxim Kosinov 4 | Specialization: Python, HighLoad Crawlers, Data Mining, Scraping 5 | E-Mail: astrey.labs@gmail.com 6 | Skype: geminiozz 7 | O-Desk: Astrey 8 | """ 9 | 10 | import os 11 | import sys 12 | import threading 13 | from time import time 14 | 15 | # add project dir to PYTHONPATH 16 | #sys.path.insert(0, os.path.join(os.path.split(sys.path[0])[0])) 17 | 18 | # so now we can import opensyllabus package 19 | from opensyllabus.core.mongo import OpenSyllabusDb 20 | from opensyllabus.core.extractor import TextExtractor 21 | from opensyllabus.config import PROCESS_REPORT_COUNT 22 | from opensyllabus.core.utils import get_data_files, get_file_ext, get_file_type 23 | 24 | 25 | class StatCounter(object): 26 | 27 | def __init__(self): 28 | self.processed = 0 29 | self.ingested = 0 30 | self.unsupported = 0 31 | self.wrong = 0 32 | self.ignored = 0 33 | #-- 34 | self.proc_lock = threading.Lock() 35 | self.ing_lock = threading.Lock() 36 | self.unsupp_lock = threading.Lock() 37 | self.wrong_lock = threading.Lock() 38 | self.ignore_lock = threading.Lock() 39 | #-- 40 | self.report_lock = threading.Lock() 41 | #-- 42 | self.time = time() 43 | 44 | def inc_proc(self): 45 | self.proc_lock.acquire() 46 | try: 47 | self.processed += 1 48 | if self.processed % PROCESS_REPORT_COUNT == 0: 49 | self.show_process_report() 50 | finally: 51 | self.proc_lock.release() 52 | 53 | def inc_ing(self): 54 | self.ing_lock.acquire() 55 | try: 56 | self.ingested += 1 57 | finally: 58 | self.ing_lock.release() 59 | 60 | 61 | def inc_unsupp(self): 62 | self.unsupp_lock.acquire() 63 | try: 64 | self.unsupported += 1 65 | finally: 66 | self.unsupp_lock.release() 67 | 68 | def inc_wrong(self): 69 | self.wrong_lock.acquire() 70 | try: 71 | self.wrong += 1 72 | finally: 73 | self.wrong_lock.release() 74 | 75 | def inc_ignore(self): 76 | self.ignore_lock.acquire() 77 | try: 78 | self.ignored += 1 79 | finally: 80 | self.ignore_lock.release() 81 | 82 | def show_report(self): 83 | """ 84 | Show summary report 85 | Input: None 86 | Output: None 87 | """ 88 | report = '\nIngestion Stats:\n' 89 | report += 'processed: %s\n' % self.processed 90 | report += 'ingested: %s\n' % self.ingested 91 | report += 'unsupported ext: %s\n' % self.unsupported 92 | report += 'wrong ext: %s\n' % self.wrong 93 | report += 'ignored(old): %s\n' % self.ignored 94 | report += 'elapsed time: (%s)\n' % (time() - self.time) 95 | 96 | print '='*80 + report + '='*80 97 | 98 | 99 | def show_process_report(self): 100 | """ 101 | Show process report 102 | """ 103 | report = 'Processed: (%s), ' % self.processed 104 | report += 'ingested: (%s), ' % self.ingested 105 | report += 'unsupported: (%s), ' % self.unsupported 106 | report += 'wrong: (%s), ' % self.wrong 107 | report += 'ignored: (%s)' % self.ignored 108 | 109 | print report 110 | 111 | class Ingester(threading.Thread): 112 | 113 | def __init__(self, queue, log, counter): 114 | self.log = log 115 | self.queue = queue 116 | #-- 117 | self.counter = counter 118 | #-- 119 | self.db = OpenSyllabusDb(log) 120 | self.extractor = TextExtractor(log) 121 | #-- 122 | threading.Thread.__init__(self) 123 | 124 | 125 | def run(self): 126 | while True: 127 | data_file = self.queue.get() 128 | ext = get_file_ext(data_file) 129 | #-- 130 | if ext and (ext in self.extractor.__class__.__dict__) and self.db.is_new(data_file): 131 | self.counter.inc_ing() 132 | file_type = get_file_type(data_file) 133 | data = getattr(self.extractor, file_type or ext)(data_file) 134 | self.db.insert_data(data_file, os.path.split(data_file)[1], data) 135 | else: 136 | if not ext: 137 | self.counter.inc_wrong() 138 | elif ext not in self.extractor.__class__.__dict__: 139 | self.counter.inc_unsupp() 140 | else: 141 | self.counter.inc_ignore() 142 | #-- 143 | self.counter.inc_proc() 144 | self.queue.task_done() 145 | 146 | -------------------------------------------------------------------------------- /opensyllabus/core/mongo.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf8 -*- 2 | """ 3 | Author: Maxim Kosinov 4 | Specialization: Python, HighLoad Crawlers, Data Mining, Scraping 5 | E-Mail: astrey.labs@gmail.com 6 | Skype: geminiozz 7 | O-Desk: Astrey 8 | """ 9 | 10 | import os 11 | from pymongo import MongoClient, ASCENDING, DESCENDING 12 | 13 | from opensyllabus.config import MONGODB_HOST, MONGODB_PORT, MONGODB_USE_AUTH, \ 14 | MONGODB_USER, MONGODB_PORT 15 | from config import MONGODB_PASSWORD 16 | 17 | 18 | class OpenSyllabusDb(object): 19 | 20 | def __init__(self, log): 21 | self.log = log 22 | self._init_connection() 23 | 24 | def _init_connection(self): 25 | """ 26 | Initilize connection to MongoDB 27 | Input: None 28 | Output: None 29 | """ 30 | try: 31 | # connect to mongodb server 32 | self.client = MongoClient(MONGODB_HOST, MONGODB_PORT) 33 | # select mongodb database with name denten_crawler 34 | self.db = self.client['opensyllabus'] 35 | if MONGODB_USE_AUTH: 36 | # authenticate script user 37 | self.db.authenticate(MONGODB_USER, MONGODB_PASSWORD) 38 | # set collection name 39 | self.collection = self.db['opensyllabus'] 40 | except Exception as e: 41 | self.log.exception(e) 42 | 43 | 44 | def insert_data(self, path, filename, text):#, mime_type): 45 | """ 46 | Insert extracted text to db 47 | Input: path - full path to file 48 | filename - data file name 49 | text - extracted text 50 | Output: None 51 | """ 52 | mongo_item = { 53 | 'path': path, 54 | 'filename': filename, 55 | # 'mime_type': mime_type, 56 | 'text': text, 57 | } 58 | try: 59 | # insert data to collection 60 | self.collection.insert(mongo_item) 61 | except Exception as e: 62 | self.log.exception(e) 63 | 64 | 65 | def is_new(self, filepath): 66 | """ 67 | Check file for exists in db 68 | Input: full path to data file 69 | Output: return True if file doesn't exist in db, otherwise return False 70 | """ 71 | if not self.collection.find({'path': filepath}).count(): 72 | return True 73 | return False 74 | 75 | 76 | def get_empty_docs(self, val=''): 77 | return self.collection.find({'text': val}) 78 | -------------------------------------------------------------------------------- /opensyllabus/core/ocr.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf8 -*- 2 | """ 3 | Author: Maxim Kosinov 4 | Specialization: Python, HighLoad Crawlers, Data Mining, Scraping 5 | E-Mail: astrey.labs@gmail.com 6 | Skype: geminiozz 7 | O-Desk: Astrey 8 | """ 9 | 10 | import os 11 | import sys 12 | import glob 13 | import logging 14 | import subprocess 15 | 16 | # add project dir to PYTHONPATH 17 | sys.path.insert(0, os.path.split(os.path.split(sys.path[0])[0])[0]) 18 | 19 | from opensyllabus.config import TMP_DIR 20 | 21 | 22 | class OpenSyllabusOCR(object): 23 | 24 | def __init__(self, log): 25 | self.log = log 26 | self.ex_txt = '' 27 | 28 | 29 | def extract(self, input_pdf): 30 | """ 31 | Extract images from pdf file and then extract text from them 32 | Input: full path to pdf file 33 | Output: return extracted text, otherwise return False 34 | """ 35 | glob_img_filename = self._extract_images(input_pdf) 36 | if glob_img_filename: 37 | return self._extract_text(glob_img_filename) 38 | return None 39 | 40 | 41 | def _delete_tmp_files(self, img_file, txt_file): 42 | """ 43 | Delete temporary files 44 | Input: path to image file and path to txt file 45 | Output: None 46 | """ 47 | for tmp_file in (img_file, txt_file): 48 | try: 49 | os.remove(tmp_file) 50 | except IOError as e: 51 | self.log.exception(e) 52 | 53 | 54 | def _extract_images(self, input_pdf): 55 | """ 56 | Extract images from pdf file and save them to hdd 57 | Input: full path to pdf file 58 | Output: pathname pattern for extracted images 59 | """ 60 | pdf_dir, pdf_filename = os.path.split(input_pdf) 61 | pdf_name, pdf_ext = os.path.splitext(pdf_filename) 62 | output_filename = os.path.join(TMP_DIR, pdf_name) 63 | # compose cmd string 64 | cmd = 'gs -q -dNOPAUSE -sDEVICE=pngmono -r300 -sOutputFile="%s_%%d.png" "%s" -c quit' % (output_filename, input_pdf) 65 | 66 | try: 67 | subprocess.check_output(cmd, shell=True) 68 | except subprocess.CalledProcessError as e: 69 | self.log.exception(e) 70 | else: 71 | return '%s_*.png' % output_filename 72 | 73 | 74 | def _extract_text(self, glob_img_filename): 75 | """ 76 | Extract text from images and save it to files 77 | Input: pathname pattern for extracted images 78 | Output: return extracted text 79 | """ 80 | for i, img_file in enumerate(glob.glob(glob_img_filename)[::-1], 1): 81 | img_dir, img_filename = os.path.split(img_file) 82 | img_name, img_ext = os.path.splitext(img_filename) 83 | output_filename = os.path.join(TMP_DIR, img_name) 84 | # compose cmd string 85 | cmd = 'tesseract "%s" "%s"' % (img_file, output_filename) 86 | 87 | try: 88 | subprocess.check_output(cmd, shell=True) 89 | except subprocess.CalledProcessError as e: 90 | self.log.exception(e) 91 | else: 92 | # add extension to output filename 93 | txt_file = '%s.txt' % output_filename 94 | 95 | # read text from file to buffer 96 | try: 97 | fh = open(txt_file, 'r') 98 | except Exception as e: 99 | self.log.exception(e) 100 | else: 101 | self.ex_txt += ' %s' % fh.read() 102 | 103 | # delete tmp files 104 | self._delete_tmp_files(img_file, txt_file) 105 | 106 | return self.ex_txt 107 | 108 | -------------------------------------------------------------------------------- /opensyllabus/core/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf8 -*- 2 | """ 3 | Author: Maxim Kosinov 4 | Specialization: Python, HighLoad Crawlers, Data Mining, Scraping 5 | E-Mail: astrey.labs@gmail.com 6 | Skype: geminiozz 7 | O-Desk: Astrey 8 | """ 9 | 10 | import os 11 | import re 12 | import logging 13 | import zipfile 14 | import subprocess 15 | from magic import Magic 16 | from logging.handlers import RotatingFileHandler 17 | from logging import StreamHandler, FileHandler, Formatter 18 | 19 | from opensyllabus.config import DATA_DIR 20 | 21 | 22 | log_levels = { 23 | 'debug': logging.DEBUG, 24 | 'info': logging.INFO, 25 | 'warning': logging.WARNING, 26 | 'error': logging.ERROR 27 | } 28 | 29 | file_types = { 30 | 'application/pdf': 'pdf', 31 | 'application/msword': 'word', 32 | 'text/html': 'html', 33 | } 34 | 35 | def configure_loggers(log, verbosity, log_file, log_verbosity): 36 | LOGFMT_CONSOLE = ('[%(asctime)s] %(name)-10s %(levelname)-7s in %(module)s.%(funcName)s(),' 37 | ' line %(lineno)d\n\t%(message)s') 38 | 39 | LOGFMT_FILE = ('[%(asctime)s] [%(process)d]%(name)-10s %(levelname)-7s in %(module)s.%(funcName)s(),' 40 | ' line %(lineno)d\n\t%(message)s') 41 | 42 | # Configure root logger to log to stdout 43 | logging.basicConfig(level=verbosity, datefmt='%H:%M:%S', format=LOGFMT_CONSOLE) 44 | 45 | # Configure main logger to rotate log files 46 | rh = RotatingFileHandler(log_file, maxBytes=100000, backupCount=25) 47 | log.addHandler(rh) 48 | 49 | # Configure main logger to log to a file 50 | if log_file: 51 | fh = FileHandler(log_file, 'w') 52 | fh.setFormatter(Formatter(LOGFMT_FILE, '%Y-%m-%d %H:%M:%S')) 53 | fh.setLevel(log_verbosity) 54 | log.addHandler(fh) 55 | 56 | return log 57 | 58 | 59 | def clean_list(in_list): 60 | """ 61 | Remove special symbols on each elemnt in the list 62 | Input: list 63 | Output: list with cleaned elements 64 | """ 65 | return [re.sub('[\r\n\t ]+', ' ', el.strip()) for el in in_list if re.sub('[\r\n\t ]+', '', el)] 66 | 67 | 68 | def get_data_files_2(): 69 | """ 70 | Walking over data directories and return data files 71 | Input: None 72 | Output: iterator with pathes to data files 73 | """ 74 | for i in range(1): 75 | for top_dir in os.listdir(DATA_DIR): 76 | for r, dirs, files in os.walk(os.path.join(DATA_DIR, top_dir)): 77 | for data_file in files: 78 | yield os.path.join(r, data_file) 79 | 80 | def get_data_files(): 81 | """ 82 | Walking over data directories and return data files 83 | Input: None 84 | Output: iterator with pathes to data files 85 | """ 86 | for top_dir in os.listdir(DATA_DIR): 87 | if not ('cohen-archive' in top_dir): 88 | for r, dirs, files in os.walk(os.path.join(DATA_DIR, top_dir)): 89 | for data_file in files: 90 | yield os.path.join(r, data_file) 91 | else: 92 | # trick for walk over big directories 93 | archive_path = os.path.join(DATA_DIR, top_dir, 'web.archive.org', 'web') 94 | p = subprocess.Popen(['ls', '-f', archive_path], stdout=subprocess.PIPE) 95 | for dir in p.communicate()[0].split('\n'): 96 | if dir not in ['.', '..']: 97 | for r, dirs, files in os.walk(os.path.join(archive_path, dir)): 98 | for data_file in files: 99 | yield os.path.join(r, data_file) 100 | 101 | 102 | def get_file_dir(filepath): 103 | """ 104 | Return root directory for the file 105 | Input: full path to file 106 | Output: root directory 107 | """ 108 | data_dir = os.path.split(DATA_DIR)[1] 109 | dirs = filepath.split('/') 110 | return dirs[dirs.index(data_dir) + 1] 111 | 112 | 113 | def get_file_ext(filename): 114 | """ 115 | Return file extension 116 | Input: filename 117 | Output: file extension 118 | """ 119 | ext = os.path.splitext(filename)[1].lower() 120 | if 7 > len(ext) > 1: 121 | return ext[1:] 122 | 123 | return None 124 | 125 | 126 | def get_file_type(filename): 127 | """ 128 | Return file mime type 129 | Input: filename 130 | Output: file mime type 131 | """ 132 | try: 133 | mime_type = Magic(mime=True).from_file(filename) 134 | except: 135 | pass 136 | else: 137 | file_type = file_types.get(mime_type, None) 138 | 139 | if file_type == 'word': 140 | if zipfile.is_zipfile(filename): 141 | return 'docx' 142 | else: 143 | return 'doc' 144 | 145 | return file_type 146 | -------------------------------------------------------------------------------- /opensyllabus/run_getemptydocs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Author: Maxim Kosinov 4 | Specialization: Python, HighLoad Crawlers, Data Mining, Scraping 5 | E-Mail: astrey.labs@gmail.com 6 | Skype: geminiozz 7 | O-Desk: Astrey 8 | """ 9 | 10 | import sys 11 | import logging 12 | from optparse import OptionParser, OptionGroup 13 | from os.path import split as p_split, join, splitext 14 | 15 | # add project dir to PYTHONPATH 16 | sys.path.insert(0, join(p_split(sys.path[0])[0])) 17 | 18 | # so now we can import opensyllabus package 19 | from opensyllabus.core.mongo import OpenSyllabusDb 20 | from opensyllabus.core.utils import configure_loggers, log_levels 21 | from opensyllabus.config import GETEMPTY_LOG_FILE, FILE_LOG_VERBOSITY, CONSOLE_LOG_VERBOSITY, LOG_TO_FILE 22 | 23 | log = logging.getLogger(__name__) 24 | 25 | 26 | class BrokenDocsStats(object): 27 | 28 | def __init__(self, log): 29 | self.broken_ext_count = {} 30 | #-- 31 | self.db = OpenSyllabusDb(log) 32 | 33 | 34 | def _get_broken_docs_1(self): 35 | """ 36 | Get documents with empty 'text' field 37 | """ 38 | for doc in self.db.get_empty_docs(''): 39 | ext = splitext(doc['filename'])[-1] 40 | self.broken_ext_count[ext] = self.broken_ext_count.setdefault(ext, 0) + 1 41 | print doc['path'] 42 | 43 | 44 | def _get_broken_docs_2(self): 45 | """ 46 | Get documents with null 'text' field 47 | """ 48 | for doc in self.db.get_empty_docs(None): 49 | ext = splitext(doc['filename'])[-1] 50 | self.broken_ext_count[ext] = self.broken_ext_count.setdefault(ext, 0) + 1 51 | print doc['path'] 52 | 53 | def show_result(self): 54 | """ 55 | Show calculated statistics 56 | """ 57 | report = '\nBroken Docs Stats:\n' 58 | for ext, count in self.broken_ext_count.iteritems(): 59 | report += '%s: %s\n' % (ext, count) 60 | print '=' * 80 + report + '=' * 80 61 | 62 | 63 | def get_broken_doc(self, doc_type): 64 | if doc_type == 'empty': 65 | self._get_broken_docs_1() 66 | else: 67 | self._get_broken_docs_2() 68 | #-- 69 | self.show_result() 70 | 71 | 72 | if __name__ == '__main__': 73 | parser = OptionParser(usage='Usage: %prog [options]') 74 | parser.add_option( 75 | '-v', '--verbosity', 76 | dest='verbosity', 77 | type='choice', 78 | choices=log_levels.keys(), 79 | default=CONSOLE_LOG_VERBOSITY, 80 | help='setup console log verbosity' 81 | ) 82 | 83 | parser.add_option( 84 | '-f', '--log-verbosity', 85 | dest='log_verbosity', 86 | type='choice', 87 | choices=log_levels.keys(), 88 | default=FILE_LOG_VERBOSITY, 89 | help='setup file log verbosity' 90 | ) 91 | parser.add_option( 92 | '-t', '--type', 93 | dest='type', 94 | type='choice', 95 | choices=('empty', 'null'), 96 | help='setup broken doc type' 97 | ) 98 | parser.add_option( 99 | '-l', '--log', 100 | dest='log_file', 101 | help='setup log file' 102 | ) 103 | 104 | 105 | options, args = parser.parse_args() 106 | 107 | if LOG_TO_FILE and not options.log_file: 108 | options.log_file = GETEMPTY_LOG_FILE 109 | 110 | log = configure_loggers(log, 111 | log_levels[options.verbosity], 112 | options.log_file, 113 | log_levels[options.log_verbosity]) 114 | 115 | if not options.type: 116 | parser.error('-t option is mandatory') 117 | else: 118 | BrokenDocsStats(log).get_broken_doc(options.type) 119 | 120 | -------------------------------------------------------------------------------- /opensyllabus/run_getstats.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Author: Maxim Kosinov 4 | Specialization: Python, HighLoad Crawlers, Data Mining, Scraping 5 | E-Mail: astrey.labs@gmail.com 6 | Skype: geminiozz 7 | O-Desk: Astrey 8 | """ 9 | 10 | import sys 11 | from os.path import split as p_split, join 12 | 13 | # add project dir to PYTHONPATH 14 | sys.path.insert(0, join(p_split(sys.path[0])[0])) 15 | 16 | # so now we can import opensyllabus package 17 | from opensyllabus.config import DATA_DIR, PROCESS_REPORT_COUNT 18 | from opensyllabus.core.exceptions import WrongFileExtension 19 | from opensyllabus.core.utils import get_data_files, get_file_ext, get_file_dir 20 | 21 | 22 | class ExtStats(object): 23 | 24 | def __init__(self): 25 | self.counter = 0 26 | self.ext_stats = {} 27 | self.valid_count = 0 28 | self.errors_count = 0 29 | 30 | def _show_stats(self): 31 | """ 32 | Show calculated statistics 33 | """ 34 | report = '\nExtensions Stats:\n' 35 | for ext, count in self.ext_stats.iteritems(): 36 | report += '%s: %s\n' % (ext, count) 37 | 38 | report += 'processed: %s\nvalid: %s\nerrors: %s' % (self.counter, self.valid_count, self.errors_count) 39 | print '=' * 80 + report + '=' * 80 40 | 41 | def _show_process_report(self, filepath): 42 | """ 43 | Show statistics reports during file processing 44 | """ 45 | self.counter += 1 46 | if self.counter % PROCESS_REPORT_COUNT == 0: 47 | print 'Processed (%s): %s files, errors: %s, stats: %s' % (get_file_dir(filepath), self.counter, self.errors_count, self.ext_stats) 48 | 49 | def _calc_stats(self): 50 | """ 51 | Calculating statistics for file extensions 52 | """ 53 | for file in get_data_files(): 54 | try: 55 | ext = get_file_ext(file) 56 | except WrongFileExtension: 57 | self.errors_count += 1 58 | else: 59 | self.valid_count += 1 60 | self.ext_stats[ext] = self.ext_stats.setdefault(ext, 0) + 1 61 | 62 | self._show_process_report(file) 63 | 64 | 65 | def run(self): 66 | """ 67 | Main method for starting calculating 68 | """ 69 | self._calc_stats() 70 | self._show_stats() 71 | 72 | 73 | if __name__ == '__main__': 74 | ExtStats().run() 75 | -------------------------------------------------------------------------------- /opensyllabus/run_ingestion.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Author: Maxim Kosinov 4 | Specialization: Python, HighLoad Crawlers, Data Mining, Scraping 5 | E-Mail: astrey.labs@gmail.com 6 | Skype: geminiozz 7 | O-Desk: Astrey 8 | """ 9 | 10 | import os 11 | import re 12 | import sys 13 | import time 14 | import Queue 15 | import logging 16 | from pymongo.errors import OperationFailure, AutoReconnect, DuplicateKeyError 17 | from optparse import OptionParser, OptionGroup 18 | 19 | # add project dir to PYTHONPATH 20 | sys.path.insert(0, os.path.join(os.path.split(sys.path[0])[0])) 21 | 22 | # so now we can import opensyllabus package 23 | # from opensyllabus.core.extractor import DataExtractor 24 | # from opensyllabus.core.mongo import OpenSyllabusDb 25 | from opensyllabus.config import DATA_DIR, INGESTION_LOG_FILE, FILE_LOG_VERBOSITY, \ 26 | CONSOLE_LOG_VERBOSITY, THREADS_COUNT, LOG_TO_FILE 27 | 28 | from opensyllabus.core.ingestion import Ingester, StatCounter 29 | from opensyllabus.core.utils import get_data_files, get_file_ext, configure_loggers, log_levels 30 | 31 | 32 | log = logging.getLogger(__name__) 33 | 34 | 35 | if __name__ == '__main__': 36 | 37 | parser = OptionParser(usage='Usage: %prog [options] dir') 38 | parser.add_option( 39 | '-v', '--verbosity', 40 | dest='verbosity', 41 | type='choice', 42 | choices=log_levels.keys(), 43 | default=CONSOLE_LOG_VERBOSITY, 44 | help='setup console log verbosity' 45 | ) 46 | 47 | parser.add_option( 48 | '-f', '--log-verbosity', 49 | dest='log_verbosity', 50 | type='choice', 51 | choices=log_levels.keys(), 52 | default=FILE_LOG_VERBOSITY, 53 | help='setup file log verbosity' 54 | ) 55 | 56 | parser.add_option( 57 | '-t', '--threads_count', 58 | dest='threads_count', 59 | default=THREADS_COUNT, 60 | help='setup threads count' 61 | ) 62 | 63 | parser.add_option( 64 | '-l', '--log', 65 | dest='log_file', 66 | help='setup log file' 67 | ) 68 | 69 | options, args = parser.parse_args() 70 | 71 | if LOG_TO_FILE and not options.log_file: 72 | options.log_file = INGESTION_LOG_FILE 73 | 74 | if len(args) < 1: 75 | data_dir = DATA_DIR 76 | else: 77 | data_dir = args[1] 78 | 79 | log = configure_loggers(log, 80 | log_levels[options.verbosity], 81 | options.log_file, 82 | log_levels[options.log_verbosity]) 83 | 84 | log.info('Ingestion started: %s' % data_dir) 85 | 86 | queue = Queue.Queue() 87 | counter = StatCounter() 88 | 89 | for i in range(options.threads_count): 90 | ingester = Ingester(queue, log, counter) 91 | ingester.setDaemon(True) 92 | ingester.start() 93 | 94 | for data_file in get_data_files(): 95 | queue.put(data_file) 96 | 97 | queue.join() 98 | 99 | counter.show_report() 100 | 101 | 102 | 103 | 104 | 105 | -------------------------------------------------------------------------------- /sanitize.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | class Sanitize(object): 4 | 5 | """ 6 | Methods for removing personal information from a syllabus. 7 | """ 8 | 9 | @classmethod 10 | def get_professor_names(cls, text): 11 | ''' Find phrases that begin with Dr., Prof., Professor, or Instructor 12 | and then 1-3 words in title case ''' 13 | matches = re.findall(r"(.)*((Prof\.)|(Dr\.)|(Professor)|(Instructor(:)*))(((\ ){0,2}([A-Z][a-z]{2,})*){1,3})", text) 14 | #print matches 15 | fullPhrases = [] 16 | for m in matches: 17 | full = m[7].strip() 18 | if full and ( full not in fullPhrases ) : 19 | fullPhrases.append(full) 20 | return fullPhrases 21 | 22 | @classmethod 23 | def get_email_addresses(cls, text): 24 | ''' Find phrases that contain @ symbol ''' 25 | matches = re.findall(r"(.)*(\b(\w|\.)+@(\w|\.)+\b)(.)*", text) 26 | fullPhrases = [] 27 | for m in matches: 28 | full = m[1] 29 | if full and ( full not in fullPhrases ) : 30 | fullPhrases.append(full) 31 | return fullPhrases 32 | 33 | @classmethod 34 | def get_phone_numbers(cls, text): 35 | ''' Find phrases in the form 555-555-5555 or (555)555-5555 ''' 36 | matches = re.findall(r"(.)*(([0-9]{3}[\-]|\([0-9]{3}\))[0-9]{3}[\-][0-9]{4})(.)*", text) 37 | fullPhrases = [] 38 | for m in matches: 39 | full = m[1] 40 | if full and ( full not in fullPhrases ) : 41 | fullPhrases.append(full) 42 | #print fullPhrases 43 | return fullPhrases 44 | 45 | @classmethod 46 | def remove_names_and_emails_and_phone(cls, text): 47 | """ Deletes instances of professor names, emails, and phone numbers from the text """ 48 | ## Our approach is to make a regular expression that will replace any 49 | ## character name that is in either 'FULL CAPS' or in 'Title Case' 50 | ## "(JOHN)|(John)|(ALICE)|(Alice)", etc. 51 | emails = Sanitize.get_email_addresses(text) 52 | names = Sanitize.get_professor_names(text) 53 | phone = Sanitize.get_phone_numbers(text) 54 | allToRemove = emails + names + phone 55 | withParens = map(lambda x: "(" + x + ")", allToRemove) 56 | for regex in withParens: 57 | text = re.sub(regex, "", text) 58 | return text 59 | #print text 60 | 61 | def sanitize_test(): 62 | PRACTICE_TEXT = " HIST101 \n Dr. Eddard Stark \n Winterfell College \n Phone: 212-555-5555 \n Email: ned@kingslanding.edu \n Hand of the King.\n\n\n" 63 | print PRACTICE_TEXT 64 | print Sanitize.remove_names_and_emails_and_phone(PRACTICE_TEXT) 65 | 66 | if __name__ == '__main__': 67 | sanitize_test() 68 | 69 | 70 | 71 | ###################################################################################################### 72 | ## 73 | ## This code was written by: 74 | ## 75 | ## Graham Sack 76 | ## Columbia University 77 | ## http://www.columbia.edu/~gas2117/grahamsack.html 78 | ## 79 | ##################################################################################################### 80 | 81 | -------------------------------------------------------------------------------- /twitter/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | twitter-log.csv 3 | -------------------------------------------------------------------------------- /twitter/twitter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # 0 * * * * /mnt/osp-archive-mount/code/twitter/twitter.py 4 | 5 | import csv 6 | from datetime import datetime 7 | import os 8 | import sys 9 | import tweepy 10 | import urllib2 11 | import httplib 12 | import urlparse 13 | 14 | #log contains Twitter login creds and timestamp for most recent Tweet collected 15 | logPath = '/mnt/osp-archive-mount/code/twitter/' 16 | dumpPath = '/mnt/osp-archive-mount/document-dump/twitter/' 17 | 18 | def login(): 19 | try: 20 | with open(logPath+'twitter-log.csv', 'r') as f: 21 | log = csv.DictReader(f, delimiter=',') 22 | row = log.next() 23 | keys = [row['cons_key'], row['cons_secret'], row['a_key'], row['a_secret']] 24 | #'last' is the timestampt of the most recent Tweet pulled from server 25 | last = row['last'] 26 | except Exception: 27 | print "Log file not accessible or corrupted" 28 | sys.exit(0) 29 | auth = tweepy.auth.OAuthHandler(keys[0], keys[1]) 30 | auth.set_access_token(keys[2], keys[3]) 31 | api = tweepy.API(auth) 32 | return api, last 33 | 34 | def save(results): 35 | #save the ID of the most recent Tweet in 'results' 36 | try: 37 | thisTime = results[0].id 38 | except Exception: 39 | print "Sorry, no new Tweets." 40 | sys.exit(0) 41 | with open(logPath+'twitter-log.csv', 'rb') as r: 42 | with open(logPath+'newlog.csv', 'wb') as w: 43 | read = csv.reader(r, delimiter=',') 44 | write = csv.writer(w,delimiter=',') 45 | row = read.next() 46 | write.writerow(row) 47 | row = read.next() 48 | row[4] = str(thisTime) 49 | write.writerow(row) 50 | r.close() 51 | w.close() 52 | os.rename(logPath+'newlog.csv', logPath+'twitter-log.csv') 53 | 54 | def search(): 55 | api, last = login() 56 | results = api.search(q='#ospsubmit', since_id=last) 57 | save(results) 58 | finalurls = [] 59 | users = [] 60 | ids = [] 61 | for result in results: 62 | for entry in result.entities.get('urls'): 63 | fullurl=entry.get('expanded_url') 64 | finalurls.append(fullurl) 65 | users.append(result.user.screen_name) 66 | ids.append(result.id) 67 | 68 | return finalurls, users, api, ids 69 | 70 | def download(finalurls): #finalurls is a list of URLs to download 71 | for index in range(len(finalurls[0])): 72 | url = finalurls[0][index] 73 | user = finalurls[1][index] 74 | api = finalurls[2] 75 | tweetid = finalurls[3][index] 76 | 77 | #construct string of date to use as folder for these downloads 78 | date = (str(datetime.now().year) + '-' + str(datetime.now().month) + 79 | '-' + str(datetime.now().day)) 80 | if not os.path.exists(dumpPath+date): 81 | os.mkdir(dumpPath+date) 82 | 83 | #chop off the tracking junk from URL string 84 | if '?' in url: 85 | url=url.split('?')[0] 86 | 87 | #take last string of URL and make it into filename 88 | filename = url.split('/')[len(url.split('/'))-1] 89 | if filename=='': 90 | filename = url.split('/')[len(url.split('/'))-2] 91 | 92 | write = 0 #innocent until proven guilty 93 | 94 | #check filetype 95 | if len(filename.split('.')) == 2: 96 | extension = filename.split('.')[1].lower() 97 | if extension not in ["doc", "docx", "pdf", "txt", "rtf", "md", "html"]: 98 | write = 1 99 | else: 100 | write = 1 101 | 102 | #check size 103 | try: 104 | if int(getsize(url)) > int(5242880): 105 | write = 2 106 | except Exception: 107 | print "Unable to get file size." 108 | 109 | #Avoid overwriting files named same thing. 110 | counter = 1 111 | rawfilename = filename 112 | while os.path.exists(dumpPath+date+'/'+filename): 113 | filename = str(counter) + '-' + rawfilename 114 | counter += 1 115 | 116 | #download or not, depending on status 117 | if write == 0: 118 | try: 119 | site = urllib2.urlopen(url) 120 | with open(dumpPath+date+'/'+filename, 'w') as file: 121 | file.write(site.read()) 122 | except Exception: 123 | print "A file failed to download." 124 | write = 3 125 | tweet(write, user, filename, tweetid, api) 126 | 127 | def getsize(site): 128 | scheme, host, path, params, query, fragment = urlparse.urlparse(site) 129 | if scheme != "http": 130 | raise ValueError("only supports HTTP requests") 131 | if not path: 132 | path = "/" 133 | if params: 134 | path = path + ";" + params 135 | if query: 136 | path = path + "?" + query 137 | # make a http HEAD request 138 | h = httplib.HTTP(host) 139 | h.putrequest("HEAD", path) 140 | h.putheader("Host", host) 141 | h.endheaders() 142 | status, reason, headers = h.getreply() 143 | h.close() 144 | size = headers.get("content-length") 145 | #if server doesn't send header, resort to downloading + checking (possible security issue) 146 | if size is None: 147 | page = urllib.urlopen(site) 148 | return len(page.read()) 149 | return size 150 | 151 | def tweet(status, user, filename, tweetid, api): 152 | if len(filename) > 40: 153 | filename = "..." + filename[-30:] 154 | if status == 0: 155 | message = "@" + user + " Thanks for submitting to the Open Syllabus Project. \'" + filename + "\' is now in the database!" 156 | if status == 1: 157 | message = "@" + user + " Sorry, \'" + filename + "\' is not in a supported format (doc/docx, pdf, htm/html, txt, rtf, or md)!" 158 | if status == 2: 159 | message = "@" + user + " Sorry, \'" + filename + "\' is too large. Please tweet a smaller copy!" 160 | if status == 3: 161 | message = "@" + user + " Woops! \'" + filename + "\' was not accessible. Please check the link and tweet again." 162 | print message 163 | try: 164 | api.update_status(message, tweetid) 165 | except Exception: 166 | print "Tweet failed." 167 | 168 | download(search()) 169 | --------------------------------------------------------------------------------