├── TopicModelling ├── TopicSimilarity │ └── readme.txt ├── arguing_lexicon │ ├── lexicon │ │ ├── en │ │ │ ├── patterns │ │ │ │ ├── authority.tff │ │ │ │ ├── wants.tff │ │ │ │ ├── causation.tff │ │ │ │ ├── inyourshoes.tff │ │ │ │ ├── structure.tff │ │ │ │ ├── rhetoricalquestion.tff │ │ │ │ ├── generalization.tff │ │ │ │ ├── possibility.tff │ │ │ │ ├── doubt.tff │ │ │ │ ├── priority.tff │ │ │ │ ├── contrast.tff │ │ │ │ ├── necessity.tff │ │ │ │ ├── inconsistency.tff │ │ │ │ ├── difficulty.tff │ │ │ │ ├── conditionals.tff │ │ │ │ ├── assessments.tff │ │ │ │ └── emphasis.tff │ │ │ ├── macros │ │ │ │ ├── spoken.tff │ │ │ │ ├── wordclasses.tff │ │ │ │ ├── pronoun.tff │ │ │ │ ├── intensifiers.tff │ │ │ │ └── modals.tff │ │ │ └── patterntest.txt │ │ ├── nl │ │ │ ├── macros │ │ │ │ ├── spoken.tff │ │ │ │ ├── wordclasses.tff │ │ │ │ ├── pronoun.tff │ │ │ │ ├── intensifiers.tff │ │ │ │ └── modals.tff │ │ │ └── patterns │ │ │ │ ├── authority.tff │ │ │ │ ├── wants.tff │ │ │ │ ├── structure.tff │ │ │ │ ├── inyourshoes.tff │ │ │ │ ├── causation.tff │ │ │ │ ├── rhetoricalquestion.tff │ │ │ │ ├── generalization.tff │ │ │ │ ├── contrast.tff │ │ │ │ ├── possibility.tff │ │ │ │ ├── necessity.tff │ │ │ │ ├── priority.tff │ │ │ │ ├── difficulty.tff │ │ │ │ ├── doubt.tff │ │ │ │ ├── inconsistency.tff │ │ │ │ ├── conditionals.tff │ │ │ │ ├── emphasis.tff │ │ │ │ └── assessments.tff │ │ └── README for Arguing Lexicon.pdf │ ├── environment.yml │ ├── README.md │ ├── lda.py │ ├── arguing_lexicon.py │ ├── arguing-lexicon-lda.ipynb │ └── arguing-lexicon-filter.ipynb ├── Captions preprocessing │ ├── README.md │ └── Preprocess leftwing.ipynb ├── TopicModelWrapper │ ├── stopwords │ │ ├── dutch │ │ └── english │ ├── visualisations │ │ └── pyLDAvisualisation.py │ ├── StreamingCorpus.py │ ├── StreamingPreprocesser.py │ ├── StreamingParser.py │ └── main.py ├── Top TfIdf │ ├── README.md │ └── Right - tfidf top words.ipynb ├── filterTranscripts.py ├── getWord2VecModel.py ├── TextLemma │ ├── filterTranscripts.py │ └── getTokens.py ├── getTokens.py └── language_detection │ └── spacy-language-detection.ipynb ├── DataCollection ├── tests │ └── youtubecollector │ │ ├── test_utils.py │ │ ├── resources │ │ ├── api_test.conf │ │ ├── comment_minimal.json │ │ ├── comment_full.json │ │ ├── video.json │ │ ├── recommendation.json │ │ ├── video_metadata.json │ │ ├── comment_with_reply.json │ │ ├── nullable_fields_channel_response.json │ │ └── full_channel_response.json │ │ ├── utils_for_test.py │ │ ├── test_recommendations.py │ │ ├── test_videos.py │ │ ├── test_channels.py │ │ └── test_comments.py ├── src │ └── youtubecollector │ │ ├── printer.py │ │ ├── __init__.py │ │ ├── util.py │ │ ├── youtube_client.py │ │ ├── transcripts.py │ │ ├── recommendations.py │ │ ├── channels.py │ │ ├── video.py │ │ └── comments.py ├── api.conf ├── requirements.txt ├── setup.cfg ├── setup.py ├── CONTRIBUTE.md ├── Makefile └── README.md ├── .gitignore ├── RabbitHole ├── config.py └── youtube-onderzoek-jan.py ├── README.md └── Notebooks ├── getting_started.ipynb └── scenariofunctions.py /TopicModelling/TopicSimilarity/readme.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /DataCollection/tests/youtubecollector/test_utils.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: 
-------------------------------------------------------------------------------- 1 | .idea 2 | *.egg-info/ 3 | __pycache__/ 4 | venv/ 5 | .coverage 6 | htmlcov/ -------------------------------------------------------------------------------- /DataCollection/src/youtubecollector/printer.py: -------------------------------------------------------------------------------- 1 | def print_text(): 2 | print("hello Worlds") 3 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/en/patterns/authority.tff: -------------------------------------------------------------------------------- 1 | #class="authority" 2 | according to 3 | -------------------------------------------------------------------------------- /DataCollection/api.conf: -------------------------------------------------------------------------------- 1 | youtube_api_service_name="youtube" 2 | youtube_api_version="v3" 3 | developer_key=None -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/en/macros/spoken.tff: -------------------------------------------------------------------------------- 1 | #class="spoken" 2 | @DYS={uh,um, mm-hmm, uh-huh, huh} -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/nl/macros/spoken.tff: -------------------------------------------------------------------------------- 1 | #class="spoken" 2 | @UH={uh,um, mm-hmm, uh-huh, huh, ehm, euhm, eh} 3 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/nl/patterns/authority.tff: -------------------------------------------------------------------------------- 1 | #class="authority" 2 | volgens 3 | naargelang 4 | overeenkomstig 5 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/nl/patterns/wants.tff: -------------------------------------------------------------------------------- 1 | #class="wants" 2 | (jij|wij|ik) (wil|willen) (niet|misschien|misschien niet) 3 | -------------------------------------------------------------------------------- /DataCollection/tests/youtubecollector/resources/api_test.conf: -------------------------------------------------------------------------------- 1 | youtube_api_service_name="youtube" 2 | youtube_api_version="v3" 3 | developer_key="123" -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/en/patterns/wants.tff: -------------------------------------------------------------------------------- 1 | #class="wants" 2 | (you|we|i) (don\'t )?(want|wanna) 3 | (you|we|i) might (not )?(want|wanna) 4 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/nl/patterns/structure.tff: -------------------------------------------------------------------------------- 1 | #class="structure" 2 | (ten )?eerste? 
3 | ten tweede 4 | verder 5 | (in de )?eerste plaats 6 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/en/patterns/causation.tff: -------------------------------------------------------------------------------- 1 | #class="causation" 2 | so 3 | therefore 4 | because 5 | hence 6 | as a result 7 | consequently -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/en/patterns/inyourshoes.tff: -------------------------------------------------------------------------------- 1 | #class="inyourshoes" 2 | what i would do 3 | if i were you 4 | i would not 5 | i wouldn\'t 6 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/en/patterns/structure.tff: -------------------------------------------------------------------------------- 1 | #class="structure" 2 | first 3 | secondly 4 | first place 5 | in the first place 6 | first of all 7 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/en/patterntest.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/decorrespondent/youtube_extremism/HEAD/TopicModelling/arguing_lexicon/lexicon/en/patterntest.txt -------------------------------------------------------------------------------- /RabbitHole/config.py: -------------------------------------------------------------------------------- 1 | PATH_RESULTS = '' 2 | PATH_DEMOGRAPHICS = '~' 3 | PATH_OUTPUT = '' 4 | DEVELOPER_KEY = '' 5 | YOUTUBE_API_SERVICE_NAME = "youtube" 6 | YOUTUBE_API_VERSION = "v3" 7 | -------------------------------------------------------------------------------- /DataCollection/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | numpy 3 | youtube_dl 4 | google-api-python-client 5 | webvtt-py 6 | jupyter 7 | requests 8 | matplotlib 9 | networkx 10 | seaborn 11 | tqdm 12 | docutils -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/README for Arguing Lexicon.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/decorrespondent/youtube_extremism/HEAD/TopicModelling/arguing_lexicon/lexicon/README for Arguing Lexicon.pdf -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/nl/patterns/inyourshoes.tff: -------------------------------------------------------------------------------- 1 | #class="inyourshoes" 2 | wat ik zou doen 3 | als ik jou was 4 | als ik in jouw schoenen stond 5 | ik zou in jouw (plaats|positie) 6 | ik zou niet 7 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/nl/patterns/causation.tff: -------------------------------------------------------------------------------- 1 | #class="causation" 2 | dus 3 | daarom 4 | omdat 5 | derhalve 6 | resultaat 7 | voortkomend 8 | voortvloeiend 9 | volgend 10 | bijgevolg 11 | zodoende 12 | -------------------------------------------------------------------------------- /DataCollection/src/youtubecollector/__init__.py: -------------------------------------------------------------------------------- 1 | from . import channels 2 | from . import comments 3 | from . 
import recommendations 4 | from . import transcripts 5 | from . import video 6 | from . import youtube_client 7 | from . import printer 8 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/en/patterns/rhetoricalquestion.tff: -------------------------------------------------------------------------------- 1 | #class="rhetoricalquestion" 2 | do (we|you) (actually|really|still) (need|want) 3 | why not 4 | why don\'t (we|you) 5 | what if 6 | (and )?who (wouldn\'t|doesn\'t) (@EMO1V) 7 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/en/patterns/generalization.tff: -------------------------------------------------------------------------------- 1 | #class="generalization" 2 | (everybody|everything|anybody|anything|nobody|nothing) (else|at all) 3 | in the (world|universe) 4 | of all times 5 | in recent memory 6 | in living history 7 | -------------------------------------------------------------------------------- /DataCollection/setup.cfg: -------------------------------------------------------------------------------- 1 | [aliases] 2 | test=pytest 3 | 4 | [tool:pytest] 5 | addopts = --verbose 6 | python_files = tests/*.py 7 | 8 | [coverage:run] 9 | branch=True 10 | source = . 11 | 12 | [coverage:report] 13 | omit = 14 | tests/* 15 | include = 16 | src/youtubecollector/* -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/en/patterns/possibility.tff: -------------------------------------------------------------------------------- 1 | #class="possibility" 2 | you can 3 | we can 4 | you can\'t 5 | you cannot 6 | we can\'t 7 | we cannot 8 | you could 9 | we could 10 | (@BE) able to 11 | there\'s no way (that|for|of|to)? 12 | any way (that|for|of|to)? 13 | no way 14 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/nl/patterns/rhetoricalquestion.tff: -------------------------------------------------------------------------------- 1 | #class="rhetoricalquestion" 2 | (wil|willen) (jij|je|we) (eigenlijk|echt|nog steeds) 3 | (heb|hebben) (jij|je|we) (eigenlijk|echt|nog steeds) nodig 4 | waarom niet 5 | waarom (doe|doen) (jij|je|jullie) niet 6 | wat als 7 | wie (@EMO1V) (nu )? niet 8 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/en/patterns/doubt.tff: -------------------------------------------------------------------------------- 1 | #class="doubt" 2 | (i am|i\'m) not (sure|convinced) 3 | i (don\'t|can\'t|do not|cannot) see how 4 | it (is not|isn\'t) (clear|evident|obvious) (that)? 5 | it\'s not (clear|evident|obvious) (that)? 6 | (we|i) doubt (that)? 
7 | (we|i) (am|are) doubtful 8 | (we\'re|i\'m) doubtful -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/en/macros/wordclasses.tff: -------------------------------------------------------------------------------- 1 | #class="wordclasses" 2 | #emo1=stative, relational, positive 3 | #emo2=stative, relational, negative 4 | @EMO1V={like, adore, want, prefer, love, enjoy} 5 | @EMO1N={like, adoration, want, preference, love, enjoyment} 6 | @EMO2V={hate, dislike, disprefer} 7 | @EMO2N={hate, dislike, dispreference} 8 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/en/patterns/priority.tff: -------------------------------------------------------------------------------- 1 | #class="priority" 2 | important 3 | crucial 4 | key 5 | essential 6 | critical 7 | fundamental 8 | key 9 | major 10 | vital 11 | first and foremost 12 | (now )?remember (that)? 13 | keep in mind (that)? 14 | don\'t forget (that)? 15 | let\'s not forget 16 | let\'s keep in mind 17 | let\'s remember 18 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/nl/patterns/generalization.tff: -------------------------------------------------------------------------------- 1 | #class="generalization" 2 | (iedereen|alles|iemand|iets|niemand|niets) anders 3 | helemaal (niemand|niets) 4 | (in|op) de wereld 5 | in het universum 6 | op aarde 7 | altijd 8 | te allen tijde 9 | van alle tijden 10 | in de recente geschiedenis 11 | recentelijk 12 | in de jongste geschiedenis 13 | sinds mensenheugenis 14 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/en/macros/pronoun.tff: -------------------------------------------------------------------------------- 1 | #class="pronoun" 2 | @I={i, i\'m, i\'ve, i\'ll, i\'d,me} 3 | @YOU={you, you\'re, you\'ve, you\'ll, you\'d} 4 | @HE={he, he\'s, he\'ll, he\'d,him} 5 | @SHE={she, she\'s, she\'ll, she\'d,her} 6 | @WE={we, we\'re, we\'ll, we\'d, we\'ve,us} 7 | @THEY={they, they\'re, they\'ll, they\'d, they\'ve,them} 8 | @PRONSUBJ={i, you, he, she, it, we, they} -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/nl/macros/wordclasses.tff: -------------------------------------------------------------------------------- 1 | #class="wordclasses" 2 | #emo1=subjectief,relationeel, positief 3 | #emo2=subjectief,relationeel, negatief 4 | @EMO1V={adoreert, wilt, prefereert, houdt, geniet} 5 | @EMO1N={leuk, adoreren, willen, voorkeuren, houden van, genieten} 6 | @EMO2V={haten, niet leuk, niet de voorkeur, afkeuren} 7 | @EMO2N={haten, niet leuk, niet de voorkeur, afkeuren} 8 | -------------------------------------------------------------------------------- /TopicModelling/Captions preprocessing/README.md: -------------------------------------------------------------------------------- 1 | # Preprocessing of captions 2 | 3 | The attached ipython notebook cleans the csv files that were created straight from the .vtt files. They contain duplicates, sometimes combined in one line. Furthermore, since .csv does not support the python lists, these are saved as strings. 4 | 5 | **TODO:** Put the cleaning script straight in the script that reads the .vtt files. 
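For reference, reading such a stringified list column back into real Python lists can be done with `ast.literal_eval`, which is the same approach used elsewhere in this repo (e.g. `TopicModelling/filterTranscripts.py`). A minimal sketch; the file name and column name below are placeholders and may differ from the actual captions csv:

```python
import ast

import pandas as pd

# hypothetical file/column names; adjust to the actual captions csv
df = pd.read_csv("captions.csv", encoding="utf-8")

# each cell holds a list that was written to csv as its string representation,
# e.g. "['first caption line', 'second caption line']"
df["transcript"] = df["transcript"].apply(ast.literal_eval)
```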
-------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/en/patterns/contrast.tff: -------------------------------------------------------------------------------- 1 | #class="contrast" 2 | really 3 | actually 4 | as opposed to 5 | instead of 6 | rather than 7 | there (are|is) ([\w]+[ \,]*){1,4} and (then )?there (are|is) 8 | (is|that\'s|it\'s) a whole nother issue 9 | (is|are|that\'s|it\'s) (very|quite|completely|totally )?different 10 | whole new ballgame 11 | (is|that\'s|it\'s) a (separate|different) (issue|question) 12 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/nl/patterns/contrast.tff: -------------------------------------------------------------------------------- 1 | #class="contrast" 2 | er (is|zijn) ([\w]+[ \,]*){1,4} en er (is|zijn|dan is er) 3 | (dat|het) is een (heel)? andere? (probleem|zaak|vraag|issue) 4 | (dat |het )?is (heel |best wel |compleet |totaal |helemaal )?anders 5 | zijn (heel |best wel |compleet |totaal |helemaal )?anders 6 | (compleet|helemaal|heel) anders 7 | is een (aparte|andere kwestie|situatie) 8 | (dat |het )?is een (aparte|andere) (kwestie|situatie) 9 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/en/patterns/necessity.tff: -------------------------------------------------------------------------------- 1 | #class="necessity" 2 | a must 3 | must 4 | essential 5 | indispensable 6 | necessary 7 | (@BE) a necessity 8 | needed 9 | required 10 | requirement 11 | can\'t do without 12 | got to 13 | gotta 14 | had better 15 | hafta 16 | have to 17 | has to 18 | need to 19 | needs to 20 | ought to 21 | oughta 22 | should 23 | (@PRONSUBJ) better 24 | (necesssitates|necessitated|necessitating|necessitate) -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/environment.yml: -------------------------------------------------------------------------------- 1 | name: ml 2 | channels: 3 | - defaults 4 | dependencies: 5 | - ipython 6 | - jupyter_client 7 | - jupyter_core 8 | - nb_conda 9 | - nb_conda_kernels 10 | - notebook 11 | - python=3.6.6 12 | - pip: 13 | - autopep8==1.4 14 | - ipdb==0.10.3 15 | - jupyter==1.0.0 16 | - jupyter-console==5.1.0 17 | - numpy==1.14.2 18 | - pandas==0.19.2 19 | - spacy==2.0.11 20 | - spacy-arguing-lexicon==0.0.2 21 | 22 | -------------------------------------------------------------------------------- /DataCollection/src/youtubecollector/util.py: -------------------------------------------------------------------------------- 1 | import os as _os 2 | 3 | 4 | def is_empty_file(filename: str): 5 | if not isinstance(filename, str): 6 | raise Exception(f"filename should be a string of an existing file") 7 | 8 | if not _os.path.exists(filename): 9 | raise Exception(f"{filename} doesn't exists") 10 | 11 | return _os.stat(filename).st_size == 0 12 | 13 | 14 | def convert_to_dictionary(obj): 15 | return {field: getattr(obj, field) for field in obj._fields} -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/nl/patterns/possibility.tff: -------------------------------------------------------------------------------- 1 | #class="possibility" 2 | (je|jij) (kan|kunt)( niet)? 3 | (we|wij) kunnen( niet)? 
4 | (je|jij) zou kunnen 5 | (we|wij) zouden kunnen 6 | (@ZIJN) in staat tot 7 | in staat zijn tot 8 | (kunnen|kun|kan) 9 | er is geen mogelijkheid (dat|voor|om|tot) 10 | er is geen wijze (om|voor) 11 | er geen manier (dat|voor|om) 12 | een mogelijkheid (dat|voor|om|tot) 13 | een manier (dat|voor|om) 14 | een wijze (om|voor) 15 | geen (manier|mogelijkheid|wijze) 16 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/en/patterns/inconsistency.tff: -------------------------------------------------------------------------------- 1 | #class="inconsistency" 2 | except that 3 | except for 4 | with the exception of 5 | however 6 | nevertheless 7 | that said 8 | that having been said 9 | that being said 10 | despite 11 | in spite of 12 | even so 13 | at the same time 14 | still 15 | wait a minute 16 | hold on a second 17 | hold on a sec 18 | it\'s just that 19 | all well and good 20 | as far as it goes 21 | you might think (that)? 22 | you may think (that)? 23 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/nl/patterns/necessity.tff: -------------------------------------------------------------------------------- 1 | #class="necessity" 2 | een (must|voorwaarde|vereiste) 3 | essentieel 4 | (onvervangbaar|onontbeerlijk) 5 | noodzakelijk 6 | (@ZIJN) (een )?(noodzaak|vereiste|een vereiste|noodzakelijk) 7 | nodig 8 | verplicht 9 | (eis|vereiste|voorwaarde) 10 | kan niet (zijn )?zonder 11 | (moet|moeten|eis) 12 | (hadden|had|konden|kon) beter 13 | nodig hebben 14 | (behoort te|zou moeten) 15 | (zou|zouden) 16 | (@OND) beter 17 | (noodzaken|noodzakelijk|dwingen|noodzakelijk maken) 18 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/nl/macros/pronoun.tff: -------------------------------------------------------------------------------- 1 | #class="pronoun" 2 | @IK={ik, ik ben, ik heb, ik zal, mijn, me, mij} 3 | @JIJ={jij, je, jij bent, je bent, jij hebt, je hebt, jij zal, je zal, jij zult, je zult, jouw} 4 | @HIJ={hij, hij is, hij heeft, hij zal, hem} 5 | @ZE={zij, ze, zij is, ze is, zij heeft, ze heeft, zij zal, ze zal, haar} 6 | @WE={we, wij, we zijn, wij zijn, we hebben, wij hebben, we zullen, wij zullen, ons} 7 | @ZIJ={zij, zij zijn, zij hebben, zij zullen, hun} 8 | @OND={ik, jij, je, hij, zij, het, we, wij, hen, hun} 9 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/nl/patterns/priority.tff: -------------------------------------------------------------------------------- 1 | #class="priority" 2 | (belangrijk|van belang|gewichtig) 3 | cruciaal 4 | sleutelrol 5 | essentieel 6 | kritiek 7 | fundamenteel 8 | (enorm|groot|zeer groot|belangrijk|belangrijkste|voornaamste) 9 | (vitaal|wezenlijk|doorslaggevend) 10 | (in de eerste plaats|allereerst) 11 | onthoud( dat)? 12 | houd in gedachte( dat)? 13 | vergeet (dat )?niet( dat)? 
14 | laten we niet vergeten 15 | laten we in gedachte houden 16 | laten we onthouden 17 | laten we herinneren 18 | herinner je 19 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/en/macros/intensifiers.tff: -------------------------------------------------------------------------------- 1 | #class="intensifier_adv" 2 | @INTENSADV1={absolutely, absurdly, resoundingly, amazingly, awfully, extremely, completely, highly, incredibly, perfectly, quite, really, strikingly, surprisingly, terribly, totally, unbelievably, hugely, unnaturally, unusually, utterly, very, tremendously, spectacularly} 3 | @INTENSADJ1={absolute, extreme, incredible, perfect, phenomenal, spectacular, huge, major, tremendous, complete, considerable, real, terrible, total, unbelievable, utter, great, resounding} 4 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/nl/patterns/difficulty.tff: -------------------------------------------------------------------------------- 1 | #class="difficulty" 2 | (@ZIJN) (@INTENSBIJW)? ?makkelijk 3 | (@ZIJN) (@INTENSBIJW)? ?gemakkelijk 4 | in een zucht 5 | (@ZIJN) (een )?appeltje eitje 6 | (@ZIJN) snel gedaan 7 | (@ZIJN) (@INTENSBIJW)? ?kinderspel 8 | (@ZIJN) (@INTENSBIJW)? ?(lastig|verradelijk|moeilijk|vervelend) 9 | (@ZIJN) niet gemakkelijk 10 | (@ZIJN) moeilijk 11 | (@ZIJN) (@INTENSBIJW)? zwaar 12 | (@ZIJN) een (@INTENSBIJV)? ?uitdaging 13 | (@ZIJN) (@INTENSBIJW)? ?uitdagend 14 | (@HEB) een (@INTENSBIJW)? ?(moeilijke|zware) tijd 15 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/nl/patterns/doubt.tff: -------------------------------------------------------------------------------- 1 | #class="doubt" 2 | ik ben niet overtuigd 3 | ik ben er niet van overtuigd 4 | ik ben niet zeker 5 | ik ben er niet zeker van 6 | ik (weet|zie) niet hoe 7 | ik (kan|zou) niet (zien|weten) hoe 8 | het is niet (helder|duidelijk|overduidelijk|evident)( dat)? 9 | is het niet (helder|duidelijk|overduidelijk|evident)( dat)? 10 | ik betwijfel (dat|ten zeerste|ten zeerste dat)? 11 | (we|wij) betwijfelen (dat|ten zeerste|ten zeerste dat)? 12 | ik (ben in )?twijfel 13 | we twijfelen 14 | ik ben twijfelachtig 15 | (we|wij) zijn twijfelachtig 16 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/nl/patterns/inconsistency.tff: -------------------------------------------------------------------------------- 1 | #class="inconsistency" 2 | behalve (dan )?dat 3 | behalve 4 | met (uitsluiting|uitzondering) van 5 | echter|maar 6 | (niettemin|desondanks|noch|niettegenstaande) 7 | dat gezegd (hebbende|zijnde) 8 | dat zeggende 9 | nu dat gezegd is 10 | ondanks 11 | ongeacht 12 | zelfs dan 13 | tegelijkertijd 14 | nog steeds 15 | wacht (eens )?even 16 | even wachten 17 | het is alleen( dat)? 18 | (allemaal|alles) goed en wel 19 | (zo ver als het gaat|voor zo ver het gaat) 20 | je denkt (dat )?misschien 21 | misschien denk je( dat)? 22 | zou kunnen denken( dat)? 
23 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/en/patterns/difficulty.tff: -------------------------------------------------------------------------------- 1 | #class="difficulty" 2 | (@BE) (@INTENSADV1)?easy 3 | (@BE) a (@INTENSADJ1)?breeze 4 | (@BE) a (@INTENSADJ1)?walk in the park 5 | (@BE) a (@INTENSADJ1)?piece of cake 6 | (@BE) a (@INTENSADJ1)?snap 7 | (@BE) a (@INTENSADJ1)?cinch 8 | (@BE) (@INTENSADJ1)?child's play 9 | (@BE) (@INTENSADV1)?difficult 10 | (@BE) a (@INTENSADJ1)?pain 11 | (@BE) a (@INTENSADJ1)?pain in the (butt|neck|ass) 12 | (@BE) a (@INTENSADJ1)?(bitch|bastard) to 13 | (@BE) no picnic 14 | (@BE) (@INTENSADV1)?tricky 15 | (@BE) (@INTENSADV1)?arduous 16 | (@BE) a (@INTENSADJ1)?challenge 17 | (@BE) (@INTENSADV1)?challenging 18 | (@HAVE) a (@INTENSADV1)?(hard|difficult) time 19 | -------------------------------------------------------------------------------- /DataCollection/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | with open("requirements.txt") as handle: 4 | project_requirements = [line.strip() for line in handle.readlines()] 5 | 6 | test_requirements = ["pytest-runner", "pytest", "coverage"] 7 | 8 | setup(name="youtubecollector", 9 | version="0.1.0", 10 | description="Module for getting data from youtube", 11 | url="https://github.com/CorrespondentData/YouTubeExtremism", 12 | author="De Correspondent", 13 | packages=['youtubecollector'], 14 | package_dir={'': 'src'}, 15 | install_requires=project_requirements, 16 | tests_require=test_requirements, 17 | extras_require={ 18 | "dev": test_requirements 19 | }, 20 | test_suite="tests", 21 | python_requires='>=3' 22 | ) 23 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/en/patterns/conditionals.tff: -------------------------------------------------------------------------------- 1 | #class="conditionals" 2 | if (we|you) want to ([\w]+[ \,]+){1,7}(we|you) (need to|must|have to) 3 | (we|you) ([\w ,]+) (must|have to|need to) ([\w]+[ \,]+){1,7}if (you|we) want to 4 | it would be ([\w]+[ \,]+){0,2}nice if 5 | wouldn\'t it be ([\w]+[ \,]+){0,2}nice if 6 | if ([\w]+[ \,]+){3,8} that would be ([\w]+[ \,]+){0,2}nice 7 | (cannot|will not|won\'t|can\'t) ([\w]+[ \,]+){1,7}(if|unless) 8 | (if|unless) ([\w]+[ \,]+){3,10}(cannot|will not|won\'t|can\'t) 9 | (need|needs|must|has to|have to) ([\w]+[ \,]+){3,10}(in order )to 10 | (in order )?to ([\w]+[ \,]+){3,10}(need|needs|must|has to|have to) 11 | as long as (we|you) ([\w]+[ \,]+){3,10}(will|can|able|should|[a-zA-Z]+\'ll) 12 | ([a-zA-Z]\'ll|will|can|able|should) ([\w]+[ \,]+){3,10}as long as (we|you) 13 | (you|he|we) better ([\w]+[ \,]+){3,10}or 14 | otherwise -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/nl/macros/intensifiers.tff: -------------------------------------------------------------------------------- 1 | #class="intensifier_adv" 2 | @INTENSBIJW={absoluut, zeker weten, extreem, absurd, laiwaaiig, luidruchtig, fenomenaal, spectaculair, enorm, groot, overweldigend, compleet, aanzienlijk, echt, verbazingwekkend, verschrikkelijk, vreselijk, totaal, ongelooflijk, volledig, groots, daverend, vervelend, compleet, hoog, ongelooflijk, perfect, treffend, sprekend, erg, ongelooflijk, onnatuurlijk, ongewoon, finaal, gigantisch, geweldig, gek genoeg, een beetje, enigszins, 
helemaal} 3 | @INTENSBIJV={absolute, zekere, extreme, absurde, laiwaaiige, luidruchtige, fenomenale, spectaculaire, enorme, grote, overweldigende, complete, aanzienlijke, echte, verbazingwekkende, verschrikkelijke, vreselijke, totale, ongelooflijke, volledige, grootse, daverende, vervelende, complete, hoge, ongelooflijke, perfecte, treffende, sprekende, erge, ongelooflijke, onnatuurlijke, finale, gigantische, geweldige} 4 | 5 | 6 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/README.md: -------------------------------------------------------------------------------- 1 | Arguing Lexicon Filter 2 | ====================== 3 | 4 | Filters transcripts to only include text that contains argument lexicon. 5 | For details about this lexicon please [read here](https://github.com/fako/spacy_arguing_lexicon#how-it-works). 6 | 7 | 8 | Prerequisites 9 | ------------- 10 | 11 | * Conda 12 | 13 | 14 | Installation 15 | ------------ 16 | 17 | Make sure you are with a terminal inside ```arguing_lexicon```. Then setup your environment with: 18 | 19 | ```conda env create -f environment.yml``` 20 | 21 | Original data was the [captions_metadata.csv](https://drive.google.com/drive/folders/13f2fYPIsiednDBTMhd7rvCyikD_R6405) from the right_wing folder. 22 | Place the data you want to work with inside the ```data``` folder and make sure it uses the same columns as the original data. 23 | Note that it simply copies most columns and only really needs the ```content``` column. 24 | 25 | After that you should be able to start a Jupyter session with: 26 | 27 | ```jupyter notebook``` 28 | 29 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/nl/patterns/conditionals.tff: -------------------------------------------------------------------------------- 1 | #class="conditionals" 2 | als (we|jij|je) (willen|wil) ([\w]+[ \,]+){1,7} (moeten|moet|zullen|zal) (we|jij|je) (moeten)? 3 | (we|jij|je)([\w ,]+) (moeten|moet|zullen|zal) ([\w]+[ \,]+){1,7} als (we|jij|je) (willen|wil|wilt) 4 | het zou ([\w]+[ \,]+){0,2} (fijn|prettig) zijn 5 | zou het niet ([\w]+[ \,]+){0,2} (fijn|prettig) zijn 6 | (als|wanneer) ([\w]+[ \,]+){2,8} dat zou ([\w]+[ \,]+){0,2} (fijn|prettig) zijn 7 | (kan niet|zal niet)([\w]+[ \,]+){1,7} (als|tenzij|mits|behalve als) 8 | (als|tenzij|mits|behalve)([\w]+[ \,]+){2,10} (kan|zal) (het|dat|dit) niet 9 | (moet|moeten|zal moeten|moet hebben|zal moeten hebben) ([\w]+[ \,]+){3,10} om (te)? 10 | om te ([\w]+[ \,]+){3,10} (moet|moeten|zal moeten) 11 | zo lang als (we|jij|je) ([\w]+[ \,]+){3,10} (zullen|zal|kunnen|kan|in staat zijn tot|zouden|zou) 12 | (zal|zullen|zou|zouden|kan|kunnen|zou|zouden) in staat zijn tot ([\w]+[ \,]+){3,10} zo lang als (we|jij) 13 | (jij|jij|hij|we) (zou|zouden) beter ([\w]+[ \,]+){3,10}of 14 | anders dan 15 | -------------------------------------------------------------------------------- /DataCollection/CONTRIBUTE.md: -------------------------------------------------------------------------------- 1 | ## Coding style 2 | 1. try to stick to **PEP-8** code guidelines 3 | 1. 
To keep the api surface as small as possible make functions and import private (using `_` prefix) 4 | 5 | ## Todo 6 | 7 | [ ] After an error halfway through a given list, the code should be able to 8 | restart after the last successful 9 | [ ] When an api key hits it limits, the next possible key should be used 10 | [ ] Add tests to the package 11 | [ ] Add google api config to readme 12 | [ ] Add documentation on domain and getting started using documentation generation 13 | * **Implementation note** we could use [sphinx](http://www.sphinx-doc.org/en/stable/) 14 | 15 | [ ] Provide a better user experience with progress bar 16 | * **Implementation note**: we could use the [tqdm package](https://github.com/tqdm/tqdm) 17 | 18 | [ ] Be able to rerun the scraping and only add the new finds 19 | * **Implementation note** requires a way to perform a delta, possible filter solution space using a last run date 20 | 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /TopicModelling/TopicModelWrapper/stopwords/dutch: -------------------------------------------------------------------------------- 1 | de 2 | en 3 | van 4 | ik 5 | te 6 | dat 7 | die 8 | in 9 | een 10 | hij 11 | het 12 | niet 13 | zijn 14 | is 15 | was 16 | op 17 | aan 18 | met 19 | als 20 | voor 21 | had 22 | er 23 | maar 24 | om 25 | hem 26 | dan 27 | zou 28 | of 29 | wat 30 | mijn 31 | men 32 | dit 33 | zo 34 | door 35 | over 36 | ze 37 | zich 38 | bij 39 | ook 40 | tot 41 | je 42 | mij 43 | uit 44 | der 45 | daar 46 | haar 47 | naar 48 | heb 49 | hoe 50 | heeft 51 | hebben 52 | deze 53 | u 54 | want 55 | nog 56 | zal 57 | me 58 | zij 59 | nu 60 | ge 61 | geen 62 | omdat 63 | iets 64 | worden 65 | toch 66 | al 67 | waren 68 | veel 69 | meer 70 | doen 71 | toen 72 | moet 73 | ben 74 | zonder 75 | kan 76 | hun 77 | dus 78 | alles 79 | onder 80 | ja 81 | eens 82 | hier 83 | wie 84 | werd 85 | altijd 86 | doch 87 | wordt 88 | wezen 89 | kunnen 90 | ons 91 | zelf 92 | tegen 93 | na 94 | reeds 95 | wil 96 | kon 97 | niets 98 | uw 99 | iemand 100 | geweest 101 | andere 102 | -------------------------------------------------------------------------------- /DataCollection/tests/youtubecollector/utils_for_test.py: -------------------------------------------------------------------------------- 1 | import json 2 | from os.path import join, split 3 | 4 | from googleapiclient.discovery import build 5 | from googleapiclient.http import HttpMockSequence 6 | 7 | 8 | def get_file_from_test_resource(filename): 9 | return join(split(__file__)[0], "resources", filename) 10 | 11 | 12 | def get_content_from_file(filename): 13 | with open(get_file_from_test_resource(filename)) as handle: 14 | return handle.read() 15 | 16 | 17 | def read_json_from_file(filename): 18 | with open(get_file_from_test_resource(filename)) as json_file: 19 | return json.loads(json_file.read()) 20 | 21 | 22 | def create_test_client_with_response(response_json_file, status_code): 23 | test_response = get_content_from_file(response_json_file) 24 | service_json = get_content_from_file("youtube_service.json") 25 | url = HttpMockSequence([ 26 | ({'status': '200'}, service_json), 27 | ({'status': str(status_code)}, test_response) 28 | ]) 29 | 30 | return build("youtube", "v3", http=url, developerKey="key") 31 | -------------------------------------------------------------------------------- /DataCollection/src/youtubecollector/youtube_client.py: -------------------------------------------------------------------------------- 1 | import 
getpass as _getpass 2 | import json as _json 3 | 4 | from googleapiclient.discovery import build as _build 5 | from googleapiclient.errors import HttpError 6 | 7 | 8 | def create_youtube_client(api_config_filename): 9 | youtube_api_service_name, youtube_api_version,developer_key = _get_api_config(api_config_filename) 10 | if developer_key is None: 11 | developer_key = _getpass.getpass("Google Developer Api key: ") 12 | try: 13 | return _build(youtube_api_service_name, youtube_api_version, developerKey=developer_key) 14 | except HttpError as e: 15 | print(f"Failed to connect due to {_json.loads(e.content)['error']['errors'][0]['reason']}") 16 | 17 | 18 | def _get_api_config(api_config_filename): 19 | with open(api_config_filename) as handle: 20 | config_local_vars = {} 21 | exec(handle.read(), {},config_local_vars) 22 | return (config_local_vars['youtube_api_service_name'], 23 | config_local_vars['youtube_api_version'], 24 | config_local_vars['developer_key']) 25 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/en/patterns/assessments.tff: -------------------------------------------------------------------------------- 1 | #class="assessments" 2 | (our|my) (opinion|understanding) (is|was) that 3 | it (is|was) (our|my) (opinion|understanding) (that)? 4 | in (our|my) opinion 5 | (our|my) take on 6 | it (seems|seemed) to (us|me) (that)? 7 | it (seems|seemed) (that)? 8 | it would seem to (us|me)? 9 | it would appear to (us|me)? 10 | it appears to (us|me)? 11 | (the|my|our) ([\w]+[ ])?point is (that)? 12 | (the|my|our) ([\w]+[ \,]*){1,2} point is (that)? 13 | it (looks|looked) to (us|me) (as if|like) 14 | it (looks|looked) (as if|like|that way) 15 | (we|i) (have|get|got) the impression (that)? 16 | (our|my) impression (was|is) (that)? 17 | in (our|my) book 18 | to (our|my) mind 19 | to (our|my) way of thinking 20 | as far as (I am|I was|we are|we were) concerned 21 | if you ask (me|us) 22 | (our|my) feeling (is|was|would be) 23 | from where (I\'m|I am) (standing|sitting) 24 | (we|I) (don\'t)? think (that)? 25 | all (we\'re|I\'m) saying is 26 | what (I\'m|we\'re) saying is 27 | (we\'re|I\'m) (not)? saying that 28 | what (we\'re|i\'m) trying to say is 29 | what (we|i) mean is (that)? -------------------------------------------------------------------------------- /DataCollection/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: help guard-virtual-env basedependencies env clean 2 | 3 | guard-virtual-env: ## Exits if python virtual environment is not set 4 | @ if [ "${VIRTUAL_ENV}" = "" ]; then \ 5 | echo "Virtual environment is not active. Activate with 'source venv/bin/activate'"; \ 6 | exit 1; \ 7 | fi 8 | 9 | clean: ## Removes all build artifacts 10 | for build_dir_glob in \ 11 | build \ 12 | dist \ 13 | .eggs \ 14 | *.egg-info \ 15 | *.pyc \ 16 | *.pyo \ 17 | *~ \ 18 | __pycache__ \ 19 | .pytest_cache \ 20 | .coverage \ 21 | htmlcov; \ 22 | do find . 
-name "$$build_dir_glob" -exec rm -fr {} +; done 23 | 24 | basedependencies: guard-virtual-env clean 25 | pip3 install --upgrade pip setuptools wheel 26 | 27 | env: 28 | pip3 install --upgrade virtualenv 29 | virtualenv venv 30 | 31 | dependencies: basedependencies 32 | pip3 install -r requirements.txt 33 | 34 | development: dependencies 35 | pip3 install -e .[dev] 36 | 37 | test: clean development 38 | python3 setup.py test 39 | 40 | coverage: clean development 41 | coverage run -m pytest 42 | coverage report 43 | coverage html 44 | open htmlcov/index.html 45 | 46 | install: dependencies 47 | python3 setup.py install 48 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/nl/patterns/emphasis.tff: -------------------------------------------------------------------------------- 1 | #class="emphasis" 2 | (duidelijk|helder) 3 | (klaarblijkelijk|overduidelijk|vanzelfsprekend|kennelijk|onomstreden) 4 | blijkbaar 5 | als je er (echt )?over nadenkt 6 | het is ((echt|aardig|best wel) )?(duidelijk|evident|helder|vanzelfsprekend)( dat)? 7 | (definitief|absoluut|beslist) 8 | ik moet (zeggen|bekennen) 9 | ik zou moeten (zeggen|bekennen) 10 | (zeker|stellig) 11 | (zeker weten|ongetwijfeld|stellig) 12 | (@ZIJN) (zeker|stellig overtuigd|zelfverzekerd)( dat)? 13 | natuurlijk 14 | geen twijfel mogelijk 15 | zonder twijfel over 16 | ongetwijfeld 17 | zonder (enige )?twijfel 18 | ik weet zeker( dat)? 19 | ik (twijfel|betwijfel) niet( dat)? 20 | wedden( dat)? 21 | ik wed( dat)? 22 | (het|de) enige (ding|probleem|punt|vraag) (@MISS)? ?(@ZIJN)( dat)? 23 | mijn (gevoel|intuïtie|intuitie) zegt( dat)? 24 | daarom 25 | dat is waarom 26 | het idee (hier )?is( dat)? 27 | (mijn|het) (hele punt|vraag) is 28 | wat je moet doen is 29 | de reden( hiervoor)? Is( dat)? 30 | dit is wat 31 | hier is wat 32 | exact 33 | precies 34 | (@GA) 35 | (@GANEG) 36 | (@GANEGPER) 37 | (@GAPER) 38 | wat (er )?gaat gebeuren is 39 | wat gebeurt is 40 | wat zal gaan gebeuren is 41 | ik wil (highlighten|benadrukken|onderstrepen) 42 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/en/patterns/emphasis.tff: -------------------------------------------------------------------------------- 1 | #class="emphasis" 2 | clearly 3 | obviously 4 | patently 5 | when you (really )?think about it 6 | (it is|it\'s) ((really|pretty) )?(obvious|evident|clear) (that)? 7 | definitely 8 | i have to say 9 | i\'ve got to say 10 | i\'ve gotta say 11 | i should say 12 | surely 13 | for sure 14 | (@BE) ((sure)|(certain)|(confident)) (that)? 15 | of course 16 | no doubt about it 17 | doubtless 18 | without a doubt 19 | I have no doubt (that)? 20 | I bet (that)? 21 | (@BE) bound to 22 | no two ways about it 23 | there ((is)|(are)) no two ways about it 24 | there\'s no two ways about it 25 | ((the)|(one)) ((thing)|(issue)|(question)|(problem)) (@MODAL )?(@BE) (that)? 26 | my feeling is (that)? 27 | that\'s why 28 | that is why 29 | the idea (here )?is (that)? 30 | ((my)|(the)) whole ((point)|(question)) is 31 | what you have to do is 32 | the reason is (that)? 
33 | here\'s what 34 | here is what 35 | exactly 36 | precisely 37 | (@GONNA) 38 | (@GONNANEG) 39 | (@GONNANEGCL) 40 | (@GONNACL) 41 | what will happen is 42 | what\'ll happen is 43 | what\'s ((gonna)|(going to)) happen is 44 | what is ((gonna)|(going to)) happen is 45 | i want to (highlight|emphasize|underscore) 46 | 47 | -------------------------------------------------------------------------------- /TopicModelling/Top TfIdf/README.md: -------------------------------------------------------------------------------- 1 | # Top TfIdf terms per channel/per year 2 | 3 | > Inspiration/example: https://pudding.cool/2017/09/hip-hop-words/ 4 | 5 | IPython notebook in this folder takes a cleaned csv of transcripts and merges the texts per channel per year into one document. Then it simply takes the top TfIdf words for each new document, so the channels can be compared over time. 6 | 7 | ## TfIdf options 8 | 9 | - cutoff point was chosen on occurrence in one in 50, because channels span a lot of topics. 10 | - Instead of linear term frequency (10 occurrences -> tf = 10), I followed the pudding in using sublinear term frequency (10 occurrences -> tf = 1 + log(9)). The basic idea is that a linear increase in use of a term does not linearly increase their importance. In terms of results, the linear term frequency yields a list of stop words per document. The sublinear tf returns a much more meaningful list. 11 | - **TODO:** Lemmatize the words. I didn't have it available at the time, but knew someone else was working on it. As a result, I waited for the lemmatized set. 12 | 13 | ## Notebook contents 14 | 15 | 1. Top 10 per document 16 | 2. Top 100 in order to build networks of similarity. The networks this yields, however, are rather heavy, so that still needs finetuning. -------------------------------------------------------------------------------- /DataCollection/tests/youtubecollector/test_recommendations.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from utils_for_test import read_json_from_file 4 | from youtubecollector.recommendations import convert_to_recommendations, recommendation 5 | 6 | 7 | class RecommendationsTest(TestCase): 8 | 9 | def test_get_full_recommendation(self): 10 | response = read_json_from_file("recommendation.json") 11 | actual = convert_to_recommendations(response,"id_of_video") 12 | 13 | expected = [ 14 | recommendation(video_id="id_of_video", 15 | target_video_id="id of first target video", 16 | published_at='2018-01-01T01:01:01.000Z', 17 | channel_id='channel Id of first video', 18 | video_title='title of first video', 19 | video_description='Description of first recommendation'), 20 | recommendation(video_id='id_of_video', 21 | target_video_id='id of second target video', 22 | published_at='2018-10-10T10:10:10.000Z', 23 | channel_id='channel id of second video', 24 | video_title='title of second video', 25 | video_description='description of second video') 26 | ] 27 | self.assertEqual(actual, expected) -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/nl/patterns/assessments.tff: -------------------------------------------------------------------------------- 1 | #class="assessments" 2 | (onze|ons|mijn) (mening|idee|inzicht) (is|was) dat 3 | het (is|was) (mijn|onze|ons) (mening|idee|inzicht) (dat)? 4 | naar (onze|mijn|ons) (mening|idee) 5 | (vanuit)? (ons|mijn) standpunt 6 | het (lijkt|leek) (ons|mij) (dat)? 
7 | het (lijkt|leek) (dat)? 8 | het oogt alsof 9 | het komt op (ons|mij) over 10 | het zou (ons|mij) lijken 11 | het lijkt (ons|mij) 12 | het lijkt (erop)? 13 | (het|mijn|ons)([\w]+[ ])?punt is (dat)? 14 | (het|mijn|ons)([\w]+[ \,]*){1,2} punt is (dat)? 15 | het (lijkt|leek) (mij|ons) (dat|alsof) 16 | het (lijkt|leek) (alsof|dat|net alsof) 17 | (we|wij|ik)(hebben|heb|krijgen|krijg|hadden|had|kregen|kreeg) (het idee|het gevoel|de indruk|de impressie) (dat)? 18 | (onze|mijn|ons) (indruk|idee|impressie|gevoel) (was|is) (dat)? 19 | volgens (onze|mijn) normen 20 | in (onze|mijn) ogen 21 | mijns? inziens 22 | naar (ons|onze|mijn) (idee|mening) 23 | voor zo ver (het)? (mij|ons) (aangaat|betreft) 24 | wat (mij|ons) betreft 25 | als je het (mij|ons) vraagt 26 | (wij|we|ik) (hebben|heb) het gevoel (dat)? 27 | vanuit (mijn|ons|onze) (standpunt|oogpunt|visie) 28 | (we|wij|ik) (denken|denk) niet (dat)? 29 | het enige (wat|dat) (we|wij|ik) (zeggen|zeg|bedoelen|bedoel) is 30 | wat (we|wij|ik) (zeggen|zeg|bedoelen|bedoel) is 31 | (we|wij|ik) (zeggen|zeg) dat 32 | (we|wij|ik) (zeggen|zeg) niet dat 33 | wat (we|wij|ik) (proberen|probeer) te zeggen is 34 | wat (we|wij|ik) (bedoelen|bedoel) is (dat)? 35 | -------------------------------------------------------------------------------- /DataCollection/tests/youtubecollector/resources/comment_minimal.json: -------------------------------------------------------------------------------- 1 | { 2 | "kind": "youtube#commentThreadListResponse", 3 | "etag": "\"XpPGQXPnxQJhLgs6enD_n8JR4Qk/M13kILBSDXHZmf82KpKIUu78oro\"", 4 | "pageInfo": { 5 | "totalResults": 5, 6 | "resultsPerPage": 100 7 | }, 8 | "items": [ 9 | { 10 | "kind": "youtube#commentThread", 11 | "etag": "\"XpPGQXPnxQJhLgs6enD_n8JR4Qk/6WkG1Db9T-En4DWmNTTBdki5WTk\"", 12 | "id": "The comment id that is used", 13 | "snippet": { 14 | "videoId": "the video id", 15 | "topLevelComment": { 16 | "kind": "youtube#comment", 17 | "etag": "\"XpPGQXPnxQJhLgs6enD_n8JR4Qk/RUV3jLsQcA6PsDjkr3q5GkpAOmI\"", 18 | "id": "Also the comment id but not used", 19 | "snippet": { 20 | "authorDisplayName": "Author name", 21 | "authorProfileImageUrl": "example.com/photo.jpg", 22 | "authorChannelUrl": "http://www.youtube.com/channel/someone", 23 | "videoId": "a video id", 24 | "textDisplay": "The text that is displayed", 25 | "textOriginal": "text that is not shown,", 26 | "canRate": true, 27 | "viewerRating": "none", 28 | "likeCount": 4, 29 | "publishedAt": "2017-11-02T19:25:12.000Z", 30 | "updatedAt": "2017-11-02T19:25:12.000Z" 31 | } 32 | }, 33 | "canReply": true, 34 | "totalReplyCount": 0, 35 | "isPublic": true 36 | } 37 | } 38 | ] 39 | } -------------------------------------------------------------------------------- /TopicModelling/TopicModelWrapper/visualisations/pyLDAvisualisation.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | # Gensim 5 | from gensim import corpora, models, similarities 6 | from gensim.corpora import Dictionary 7 | 8 | # Plotting tools 9 | import pyLDAvis 10 | import pyLDAvis.gensim 11 | 12 | # Enable logging for gensim - optional 13 | import logging 14 | import warnings 15 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR) 16 | warnings.filterwarnings("ignore", category=DeprecationWarning) 17 | 18 | 19 | def main(): 20 | root = os.getcwd() 21 | model_name = 'captions_right' 22 | topic_num = 50 23 | 24 | model_path = '{}/models/{}_{}'.format(root, model_name, topic_num) 25 | 26 | corpus = 
corpora.MmCorpus('{}/{}.mm'.format(model_path, model_name)) 27 | lda = models.LdaMulticore.load('{}/{}.lda'.format(model_path, model_name)) 28 | dictionary = Dictionary.load('{}/{}.dict'.format(model_path, model_name)) 29 | 30 | t1 = time.time() 31 | print('Starting preparation of LDAvis visualisation') 32 | 33 | # # Load gensim data to prepare for visualization 34 | prepared_data = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False) 35 | 36 | # Save visualisation to HTML file 37 | pyLDAvis.save_html(prepared_data, os.path.join(model_path, '{}_LDAvis.html'.format(model_name))) 38 | 39 | t2 = time.time() 40 | print('LDAvis visualisation successful! Time elapsed: {}\n'.format(t2 - t1)) 41 | 42 | 43 | if __name__ == '__main__': 44 | main() 45 | -------------------------------------------------------------------------------- /DataCollection/tests/youtubecollector/resources/comment_full.json: -------------------------------------------------------------------------------- 1 | { 2 | "kind": "youtube#commentThreadListResponse", 3 | "etag": "\"XpPGQXPnxQJhLgs6enD_n8JR4Qk/M13kILBSDXHZmf82KpKIUu78oro\"", 4 | "pageInfo": { 5 | "totalResults": 5, 6 | "resultsPerPage": 100 7 | }, 8 | "items": [ 9 | { 10 | "kind": "youtube#commentThread", 11 | "etag": "\"XpPGQXPnxQJhLgs6enD_n8JR4Qk/6WkG1Db9T-En4DWmNTTBdki5WTk\"", 12 | "id": "The comment id that is used", 13 | "snippet": { 14 | "videoId": "the video id", 15 | "topLevelComment": { 16 | "kind": "youtube#comment", 17 | "etag": "\"XpPGQXPnxQJhLgs6enD_n8JR4Qk/RUV3jLsQcA6PsDjkr3q5GkpAOmI\"", 18 | "id": "comment id that is not used", 19 | "snippet": { 20 | "authorDisplayName": "Author name", 21 | "authorProfileImageUrl": "example.com/photo.jpg", 22 | "authorChannelUrl": "http://www.youtube.com/channel/someone", 23 | "authorChannelId": { 24 | "value": "someone" 25 | }, 26 | "videoId": "some video id", 27 | "textDisplay": "The text that is displayed", 28 | "textOriginal": "text that is not shown", 29 | "canRate": true, 30 | "viewerRating": "none", 31 | "likeCount": 4, 32 | "disLikeCount": 2, 33 | "publishedAt": "2017-11-02T19:25:12.000Z", 34 | "updatedAt": "2017-11-02T19:25:12.000Z" 35 | } 36 | }, 37 | "canReply": true, 38 | "totalReplyCount": 0, 39 | "isPublic": true 40 | } 41 | } 42 | ] 43 | } -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lda.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append("../") 3 | 4 | import pickle 5 | 6 | import pandas as pd 7 | from sklearn.feature_extraction.text import CountVectorizer 8 | from sklearn.decomposition import LatentDirichletAllocation 9 | 10 | # Lazy data reader into DataFrame 11 | def read_argument_captions(): 12 | transcripts_reader = pd.read_csv("data/captions_arguments.csv", chunksize=10) 13 | for batch in transcripts_reader: 14 | for ix, caption in batch.iterrows(): 15 | text = "" 16 | for fragment, argument_label in zip(str(caption["content"]).split("\n"), str(caption["argument_labels"]).split("\n")): 17 | if argument_label: 18 | text += fragment + " " 19 | yield text 20 | 21 | 22 | with open("models/vectorizer.pkl", "rb") as count_file: 23 | vectorizer = pickle.load(count_file) 24 | with open("models/vectorizer_matrix.pkl", "rb") as matrix_file: 25 | matrix = pickle.load(matrix_file) 26 | 27 | lda_model = LatentDirichletAllocation(n_topics=50, max_iter=500, verbose=3, n_jobs=-1, learning_method="online") 28 | lda_model.fit(matrix) 29 | 30 | # Saving progress 31 | 
with open("models/lda.50.pkl", "wb") as lda_file: 32 | pickle.dump(lda_model, lda_file) 33 | 34 | def print_top_words(model, feature_names, n_top_words): 35 | for topic_idx, topic in enumerate(model.components_): 36 | print("Topic #%d:" % topic_idx) 37 | print(" | ".join([feature_names[i] 38 | for i in topic.argsort()[:-n_top_words - 1:-1]])) 39 | print() 40 | print() 41 | print() 42 | 43 | print_top_words(lda_model, feature_names, 50) 44 | -------------------------------------------------------------------------------- /DataCollection/tests/youtubecollector/test_videos.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from utils_for_test import read_json_from_file, create_test_client_with_response 4 | from youtubecollector.video import convert_to_videos, video 5 | 6 | 7 | class VideoTest(TestCase): 8 | 9 | def test_get_video(self): 10 | client = create_test_client_with_response("video_metadata.json", "200") 11 | 12 | response = read_json_from_file("video.json") 13 | actual = convert_to_videos(response, client) 14 | 15 | expected = [ 16 | video(video_id='The id of the video', 17 | video_published='2015-01-01T01:01:01.000Z', 18 | channel_id='Id of the channel', 19 | video_title='The title of the video', 20 | video_description='Description of the video', 21 | video_channel_title='Title of the channel', 22 | video_tags=['tag one', 'tag two'], 23 | video_category_id='1', 24 | video_default_language='not set', 25 | video_duration='PT3M3S', 26 | video_view_count='120', 27 | video_comment_count='564', 28 | video_likes_count='231', 29 | video_dislikes_count='342', 30 | video_topic_ids=[ 31 | 'Relevant topics ids 1', 32 | 'Relevant topics ids 2' 33 | ], 34 | video_topic_categories=['https://en.wikipedia.org/wiki/Television_program', 35 | 'https://en.wikipedia.org/wiki/Society']) 36 | ] 37 | 38 | self.assertEqual(actual, expected) 39 | -------------------------------------------------------------------------------- /TopicModelling/filterTranscripts.py: -------------------------------------------------------------------------------- 1 | import getTokens 2 | import pandas as pd 3 | import ast 4 | import re 5 | import getWord2VecModel 6 | import numpy as np 7 | from gensim.models import Word2Vec 8 | 9 | def trainW2vTranscripts(): 10 | """ train a column of strings to a word2vec model""" 11 | df = pd.read_csv('data/captions-filtered.csv', encoding='utf-8') 12 | model = getWord2VecModel.getWord2Vec(train=df['transcript_clean']) 13 | model.most_similar(positive=['muslim']) 14 | 15 | def cleanTranscripts(): 16 | """ filter the transcripts by removing stopwords and stemming """ 17 | df = pd.read_csv('data/captions-clean.csv', encoding='utf-8') 18 | df['transcript_clean'] = np.nan 19 | datalength = len(df) 20 | print(df.head()) 21 | li_transcripts = ['n'] * len(df) 22 | for index, transcript in enumerate(df['transcript']): 23 | transcript_clean = ast.literal_eval(transcript) 24 | transcript_clean = getTokens.getTokens(li_strings=(ast.literal_eval(transcript)), lemmatizing=True) 25 | li_transcripts[index] = transcript_clean 26 | if index % 200 == 0: 27 | df['transcript_clean'] = li_transcripts 28 | df.to_csv('data/captions-filtered.csv', encoding='utf-8') 29 | print('Completed video ' + str(index) + '/' + str(datalength)) 30 | 31 | def removeDuplicateEntries(): 32 | """ remove the duplicate transcript entries """ 33 | df = pd.read_csv('data/captions.csv', encoding='utf-8') 34 | df.columns = ['id', 'transcript'] 35 | 36 | 
li_transcripts = [] 37 | for transcript in df['transcript']: 38 | li_transcript = ast.literal_eval(transcript) 39 | li_transcript = li_transcript[0::3] 40 | li_transcripts.append(li_transcript) 41 | df['transcript'] = li_transcripts 42 | 43 | df.to_csv('data/captions-clean.csv', encoding='utf-8') -------------------------------------------------------------------------------- /DataCollection/tests/youtubecollector/resources/video.json: -------------------------------------------------------------------------------- 1 | { 2 | "kind": "youtube#playlistItemListResponse", 3 | "etag": "etag", 4 | "prevPageToken": "token", 5 | "pageInfo": { 6 | "totalResults": 258, 7 | "resultsPerPage": 50 8 | }, 9 | "items": [ 10 | { 11 | "kind": "youtube#playlistItem", 12 | "etag": "Etag", 13 | "id": "An id that isn't used", 14 | "snippet": { 15 | "publishedAt": "2015-01-01T01:01:01.000Z", 16 | "channelId": "Id of the channel", 17 | "title": "The title of the video", 18 | "description": "Description of the video", 19 | "thumbnails": { 20 | "default": { 21 | "url": "https://image.jpg", 22 | "width": 120, 23 | "height": 90 24 | }, 25 | "medium": { 26 | "url": "https:image.jpg", 27 | "width": 320, 28 | "height": 180 29 | }, 30 | "high": { 31 | "url": "https://image.jpg", 32 | "width": 480, 33 | "height": 360 34 | }, 35 | "standard": { 36 | "url": "https://image.jpg", 37 | "width": 640, 38 | "height": 480 39 | }, 40 | "maxres": { 41 | "url": "https://image.jpg", 42 | "width": 1280, 43 | "height": 720 44 | } 45 | }, 46 | "channelTitle": "Title of the channel", 47 | "playlistId": "id of playlist", 48 | "position": 250, 49 | "resourceId": { 50 | "kind": "youtube#video", 51 | "videoId": "video Id that isn't used" 52 | } 53 | }, 54 | "contentDetails": { 55 | "videoId": "The id of the video", 56 | "videoPublishedAt": "2015-01-01T10:01:10.000Z" 57 | } 58 | } 59 | ] 60 | } -------------------------------------------------------------------------------- /TopicModelling/TopicModelWrapper/StreamingCorpus.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import tarfile 4 | from gensim import corpora 5 | from gensim.corpora import TextCorpus 6 | from StreamingPreprocesser import StreamingPreprocesser 7 | 8 | DOCUMENT_MIN_TOKENS = 5 9 | TOKEN_MIN_LEN = 2 # less than; not inclusive 10 | TOKEN_MAX_LEN = 15 # equal to or larger tan 11 | 12 | 13 | class StreamingCorpus(TextCorpus): 14 | """ 15 | TextCorpus class 16 | """ 17 | def __init__(self, path, parse_strategy=None, clean_strategy=None, dictionary=None, metadata=False): 18 | self.path = path # path to index file or main folder of docs 19 | self.metadata = metadata 20 | 21 | self.streaming_parser = parse_strategy if parse_strategy is not None else StreamingParser(self.path, 1, metadata=True) 22 | self.streaming_cleaner = clean_strategy if clean_strategy is not None else StreamingPreprocesser() 23 | 24 | self.dictionary = dictionary or corpora.Dictionary() 25 | 26 | def get_dictionary(self): 27 | return self.dictionary 28 | 29 | def get_texts(self): 30 | 31 | for tokens, metadata in self.process_entries(): 32 | if self.metadata: 33 | yield tokens, metadata 34 | else: 35 | yield tokens 36 | 37 | def process_entries(self): 38 | 39 | for sources_texts, metadata in self.streaming_parser: 40 | 41 | # Clean the texts from all sources 42 | cleaned_text = [] 43 | for token in self.streaming_cleaner.process(sources_texts): # includes tokenizer 44 | cleaned_text.append(token) 45 | 46 | if len(cleaned_text) > 1: 47 | 
self.dictionary.add_documents([cleaned_text]) 48 | yield cleaned_text, metadata 49 | else: 50 | continue 51 | -------------------------------------------------------------------------------- /TopicModelling/TopicModelWrapper/stopwords/english: -------------------------------------------------------------------------------- 1 | i 2 | me 3 | my 4 | myself 5 | we 6 | our 7 | ours 8 | ourselves 9 | you 10 | youre 11 | youve 12 | youll 13 | youd 14 | your 15 | yours 16 | yourself 17 | yourselves 18 | he 19 | him 20 | his 21 | himself 22 | she 23 | shes 24 | her 25 | hers 26 | herself 27 | it 28 | its 29 | its 30 | itself 31 | they 32 | them 33 | their 34 | theirs 35 | themselves 36 | what 37 | which 38 | who 39 | whom 40 | this 41 | that 42 | thatll 43 | these 44 | those 45 | am 46 | is 47 | are 48 | was 49 | were 50 | be 51 | been 52 | being 53 | have 54 | has 55 | had 56 | having 57 | do 58 | does 59 | did 60 | doing 61 | a 62 | an 63 | the 64 | and 65 | but 66 | if 67 | or 68 | because 69 | as 70 | until 71 | while 72 | of 73 | at 74 | by 75 | for 76 | with 77 | about 78 | against 79 | between 80 | into 81 | through 82 | during 83 | before 84 | after 85 | above 86 | below 87 | to 88 | from 89 | up 90 | down 91 | in 92 | out 93 | on 94 | off 95 | over 96 | under 97 | again 98 | further 99 | then 100 | once 101 | here 102 | there 103 | when 104 | where 105 | why 106 | how 107 | all 108 | any 109 | both 110 | each 111 | few 112 | more 113 | most 114 | other 115 | some 116 | such 117 | no 118 | nor 119 | not 120 | only 121 | own 122 | same 123 | so 124 | than 125 | too 126 | very 127 | s 128 | t 129 | can 130 | will 131 | just 132 | don 133 | dont 134 | should 135 | shouldve 136 | now 137 | d 138 | ll 139 | m 140 | o 141 | re 142 | ve 143 | y 144 | ain 145 | aren 146 | arent 147 | couldn 148 | couldnt 149 | didn 150 | didnt 151 | doesn 152 | doesnt 153 | hadn 154 | hadnt 155 | hasn 156 | hasnt 157 | haven 158 | havent 159 | isn 160 | isnt 161 | ma 162 | mightn 163 | mightnt 164 | mustn 165 | mustnt 166 | needn 167 | neednt 168 | shan 169 | shant 170 | shouldn 171 | shouldnt 172 | wasn 173 | wasnt 174 | weren 175 | werent 176 | won 177 | wont 178 | wouldn 179 | wouldnt 180 | -------------------------------------------------------------------------------- /TopicModelling/getWord2VecModel.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import ast 3 | import pickle as p 4 | from gensim.models import Word2Vec 5 | 6 | def getW2vModel(train='', load='', modelname='', min_word=200): 7 | """ 8 | Trains or loads a word2vec model. Input must be a list of strings. 9 | Keyword arguments: 10 | train -- when provided, trains, saved (in binary) and returns a model 11 | load -- when provided, loads and returns a model (usually stored in .model.bin) 12 | modelname -- name of the saved model 13 | min_word -- the minimum amount of occurances of words to be included in the model. Useful for filtering out bloat. 14 | """ 15 | 16 | if train != '': 17 | print('Training ' + modelname) 18 | # train model 19 | # neighbourhood? 
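# gensim's Word2Vec expects `train` to be an iterable of tokenised sentences (lists of tokens);
# a plain string is iterated character by character, so make sure the input holds token lists.
# Only min_count is set in the call below, so vector size, window and training algorithm
# fall back to gensim's defaults.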
20 | model = Word2Vec(train, min_count=min_word) 21 | # pickle the entire model to disk, so we can load & resume training later 22 | model.save(modelname + '.model') 23 | # store the learned weights, in a format the original C tool understands 24 | model.wv.save_word2vec_format(modelname + '.model.bin', binary=True) 25 | return model 26 | elif load != '': 27 | model = Word2Vec.load(load) 28 | return model 29 | 30 | def getStrings(): 31 | df = pd.read_csv('data/captions-filtered-final.csv', encoding='utf-8') 32 | li_transcripts = df['transcript_clean'] 33 | li_str_transcripts = [] 34 | 35 | for full_transcript in li_transcripts: 36 | li_full_transcript = ast.literal_eval(full_transcript) 37 | for sent_transcript in li_full_transcript: 38 | #li_transcript = ast.literal_eval(sent_transcript) 39 | str_transcript = ' '.join(sent_transcript) 40 | li_str_transcripts.append(str_transcript) 41 | print(li_str_transcripts[:10]) 42 | p.dump(li_str_transcripts, open('li_str_transcripts.p', 'wb')) 43 | return li_str_transcripts 44 | 45 | li_str_transcripts = getStrings() 46 | model = getW2vModel(train=li_str_transcripts, modelname='youtube-transcripts') 47 | print(model.most_similar) -------------------------------------------------------------------------------- /DataCollection/README.md: -------------------------------------------------------------------------------- 1 | ## Module for collecting data from YouTube 2 | 3 | ### Setup: 4 | 5 | #### Step 1: Set up your Python environment 6 | It is recommended to set up a virtual environment and activate it. 7 | This keeps the module limited to this project instead of becoming part of your global installation. 8 | 9 | ```commandline 10 | $ pip3 install virtualenv 11 | $ virtualenv venv 12 | $ source venv/bin/activate 13 | ``` 14 | 15 | You should now have a command line that starts like this: 16 | 17 | ```commandline 18 | (venv) $ 19 | ``` 20 | 21 | #### Step 2: Install the youtubecollector package 22 | 23 | Now you can install the package: 24 | ```commandline 25 | (venv) $ make install 26 | ``` 27 | 28 | If you want to use this in a Jupyter notebook (like notebooks/getting_started.ipynb), 29 | you have to start the Jupyter server from within the virtual env: 30 | ```bash 31 | (venv) $ jupyter notebook 32 | ``` 33 | 34 | You can now import the module like any other package: 35 | ```python 36 | import youtubecollector 37 | ``` 38 | 39 | If this fails, check that you are using the right Python kernel via 40 | ```python 41 | import sys 42 | sys.executable 43 | ``` 44 | 45 | This should result in a path that ends in `venv/bin/python3.6`. 46 | 47 | #### Step 3: Get a developer key for the API 48 | 49 | You will need a Google account. 50 | The next steps are described here: [google api setup documentation](https://support.google.com/googleapi/answer/6158862) 51 | 52 | #### Getting started 53 | To see an example of the complete pipeline, check `getting_started.ipynb`. 54 | This notebook makes use of `tqdm`, which generates nice progress bars so you can track the progress.
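The rough shape of that pipeline is sketched below. This is a minimal sketch, not the notebook itself: the seed channel id and output file names are placeholders, the API client is built directly with `google-api-python-client` (the package's `youtube_client` module presumably wraps something similar), and paging through additional result pages (`get_more_videos`, `get_more_comments`) is left out.

```python
import pandas as pd
from googleapiclient.discovery import build

from youtubecollector import channels, video, comments, recommendations

# Build a YouTube Data API client with the developer key from step 3
youtube = build("youtube", "v3", developerKey="YOUR_DEVELOPER_KEY")

# Seed channels: a dataframe with a channel_id column
seeds = pd.DataFrame([{"channel_id": "UC_some_channel_id"}])

# 1. Channels
found_channels = channels.get_channels(seeds, youtube)
channels.write_channels(found_channels, "channels.csv")

# 2. Videos: every channel record carries the id of its uploads playlist
all_videos = []
for chan in found_channels:
    response = video.get_videos(chan.channel_uploads, youtube)
    all_videos.extend(video.convert_to_videos(response, youtube))
video.write_videos(all_videos, "videos.csv")

# 3. Comments and recommendations for every collected video
for vid in all_videos:
    comment_response = comments.get_comments(vid.video_id, youtube)
    comments.write_comments("comments.csv", comments.convert_to_comments(comment_response))

    rec_response = recommendations.get_recommendations(vid.video_id, youtube)
    rec_rows = recommendations.convert_to_recommendations(rec_response, vid.video_id)
    recommendations.write_recommendations("recommendations.csv", rec_rows)
```

In the notebook, loops like these are wrapped in `tqdm` so you get the progress bars mentioned above.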
55 | 56 | To enable these visualisations run: 57 | ```commandline 58 | (venv) $ jupyter nbextension enable --py widgetsnbextension 59 | ``` 60 | 61 | #### Development note 62 | If you wish to work on the package install the package with 63 | ```bash 64 | (venv) $ make development 65 | ``` 66 | In combination with the `autoreload` extension you can quickly test changes to the package in a notebook 67 | ```ipnbpython 68 | %load_ext autoreload 69 | %autoreload 2 70 | ``` 71 | 72 | -------------------------------------------------------------------------------- /DataCollection/src/youtubecollector/transcripts.py: -------------------------------------------------------------------------------- 1 | import os as _os 2 | import glob as _glob 3 | import csv as _csv 4 | import webvtt as _webvtt 5 | from .util import is_empty_file as _is_empty_file 6 | import youtube_dl as _youtube_dl 7 | 8 | 9 | def _get_captions_header(): 10 | return 'videoId', 'transcript' 11 | 12 | 13 | # TODO(OMeuwese) suppress messages and choose output dir 14 | def get_captions(videos): 15 | ydl_opts = { 16 | 'writeautomaticsub': True, 17 | 'skip_download': True, 18 | 'nocheckcertificate': True, 19 | 'verbose': False # doesn't seem to work 20 | 21 | } 22 | with _youtube_dl.YoutubeDL(ydl_opts) as ydl: 23 | for video in videos: 24 | try: 25 | video_url = 'https://www.youtube.com/watch?v={}'.format(video.video_id) 26 | ydl.download([video_url]) 27 | except: 28 | continue 29 | 30 | 31 | # TODO(OMeuwese) provide folder as argument and extract all vtt_files from given folder 32 | def extract_transcripts(vtt_folder): 33 | """:param vtt_folder should be location string ending in *.vtt to get all .vtt files like "files/output/*.vtt" """ 34 | 35 | video_ids = [] 36 | transcripts = [] 37 | 38 | for filename in _glob.glob(vtt_folder): 39 | ids = _get_ids_from_filename(filename) 40 | video_ids.append(ids) 41 | 42 | try: 43 | words = [] 44 | for caption in _webvtt.read(filename): 45 | words.append(caption.text) 46 | transcripts.append(words) 47 | except: 48 | pass 49 | return list(zip(video_ids, transcripts)) 50 | 51 | 52 | def write_transcripts(captions_filename, video_id_transcript_list): 53 | with open(captions_filename, 'a') as csv_file: 54 | writer = _csv.writer(csv_file, delimiter=',') 55 | 56 | if _is_empty_file(captions_filename): 57 | writer.writerow(_get_captions_header()) 58 | 59 | writer.writerows(video_id_transcript_list) 60 | 61 | 62 | def _get_ids_from_filename(filename): 63 | ids = _os.path.basename(filename) 64 | ids = ids[-18:-7] 65 | return ids 66 | -------------------------------------------------------------------------------- /DataCollection/src/youtubecollector/recommendations.py: -------------------------------------------------------------------------------- 1 | import csv as _csv 2 | from collections import namedtuple as _namedtuple 3 | 4 | from .util import is_empty_file as _is_empty_file 5 | from .util import convert_to_dictionary as _convert_to_dictionary 6 | 7 | recommendation = _namedtuple("recommendation", ('video_id', 8 | 'target_video_id', 9 | 'published_at', 10 | 'channel_id', 11 | 'video_title', 12 | 'video_description')) 13 | 14 | 15 | def _get_recommendations_header(): 16 | return recommendation._fields 17 | 18 | 19 | def get_recommendations(video_id, youtube_client, max_results=50): 20 | return youtube_client.search().list( 21 | part='snippet', 22 | type='video', 23 | relatedToVideoId=video_id, 24 | maxResults=max_results 25 | ).execute() 26 | 27 | 28 | def convert_to_recommendations(response, 
video_id): 29 | recommendations = list() 30 | for data in response['items']: 31 | next_recommendation = recommendation(video_id=video_id, 32 | target_video_id=data['id']['videoId'], 33 | published_at=data['snippet']['publishedAt'], 34 | channel_id=data['snippet']['channelId'], 35 | video_title=data['snippet']['title'], 36 | video_description=data['snippet']['description']) 37 | 38 | recommendations.append(next_recommendation) 39 | 40 | return recommendations 41 | 42 | 43 | def write_recommendations(recommendations_file, recommendations): 44 | header = _get_recommendations_header() 45 | 46 | with open(recommendations_file, 'a') as csv_file: 47 | writer = _csv.DictWriter(csv_file, fieldnames=header) 48 | 49 | if _is_empty_file(recommendations_file): 50 | writer.writeheader() 51 | 52 | for recommendation_row in recommendations: 53 | writer.writerow(_convert_to_dictionary(recommendation_row)) 54 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # youtube_extremism 2 | 3 | This is a repository for research into radical and extremist infospheres on YouTube. We have used this code for a series of stories at de Volkskrant ([link to stories](https://volkskrant.nl/youtube)) and de Correspondent ([link to stories](https://decorrespondent.nl/collectie/extreme-politieke-bewegingen)). 4 | 5 | The code consists of several modules, packages and collections of scripts. 6 | 7 | ## DataCollection 8 | 9 | DataCollection contains a library for, well, large-scale data collection. The code takes a list of channels and collects, through the YouTube API, the following data types: 10 | 1. Channel information (basic statistics, relevant playlist ids and more) 11 | 2. Videos (statistics and descriptions) 12 | 3. Comments (all comments of the videos) 13 | 4. Recommendations (all recommendations for the gathered videos) 14 | 5. Transcripts (English transcripts of the videos, if available, gathered with the [youtube-dl library](https://rg3.github.io/youtube-dl/)) 15 | 16 | You'll find additional documentation in the [DataCollection folder](https://github.com/dtokmetzis/youtube_extremism/tree/master/DataCollection). 17 | 18 | ## RabbitHole 19 | 20 | Contains scripts and notebooks to gather and analyse data we used for an experiment on the recommendation system of YouTube. This code still needs a lot of work. 21 | 22 | ## Notebooks 23 | 24 | Contains some notebooks used for the analysis of the data on right- and left-wing 'infospheres.' They just scratch the surface of possible analyses, but they can help you along. 25 | 26 | ## TopicModelling 27 | 28 | Contains a lot of scripts, data and ideas for natural language processing. The transcripts are a real treasure. During two hackathons we've written code to get a grip on this data. There is still a lot that needs to be done, so please consider these scripts as suggestions. 29 | 30 | ## Finally 31 | 32 | If you are interested in the data (we have gathered around 100GB, or 500,000 videos of far-right and far-left content), please drop me a line. We won't share our comment data without a clear agreement on how to process it safely, because it is really sensitive. 33 | 34 | All code is written in Python 3. 35 | 36 | Please let me know what we can do better. And please share your findings with us.
37 | -------------------------------------------------------------------------------- /DataCollection/tests/youtubecollector/test_channels.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | import pandas as pd 4 | 5 | from utils_for_test import create_test_client_with_response 6 | from youtubecollector.channels import get_channels, channel 7 | 8 | 9 | class ChannelTest(TestCase): 10 | 11 | def test_get_full_channel(self): 12 | expected = [ 13 | channel(channel_id='Some_ID', channel_title='The test channel', 14 | channel_description='The Official YouTube Channel for testing', 15 | channel_default_language='en', channel_country='US', 16 | channel_uploads='UU_8WUrPbi8clO6sWt_FDvuA', channel_viewcount='2640735', 17 | channel_commentcount='0', channel_subscribercount='9779', channel_videocount='258', 18 | channel_topic_ids=['topic1', 'topic2', 'topic3'], 19 | channel_topic_categories=['https://en.wikipedia.org/wiki/Society', 20 | 'https://en.wikipedia.org/wiki/Politics'], 21 | channel_branding_keywords='"Testing is fun", "More Testing"') 22 | ] 23 | channel_seed = pd.DataFrame([{"channel_id": "Some_ID"}]) 24 | 25 | client = create_test_client_with_response("full_channel_response.json", "200") 26 | actual = get_channels(channel_seed, client) 27 | 28 | self.assertEqual(expected, actual) 29 | 30 | def test_get_minimal_channel(self): 31 | expected = [ 32 | channel(channel_id='Some_ID', channel_title='The test channel', 33 | channel_description='The Official YouTube Channel for testing', 34 | channel_default_language='not set', channel_country='not set', 35 | channel_uploads='', channel_viewcount='2640735', 36 | channel_commentcount='0', channel_subscribercount='9779', channel_videocount='258', 37 | channel_topic_ids="not set", 38 | channel_topic_categories="not set", 39 | channel_branding_keywords="not set") 40 | ] 41 | 42 | channel_seed = pd.DataFrame([{"channel_id": "Some_ID"}]) 43 | 44 | client = create_test_client_with_response("nullable_fields_channel_response.json", "200") 45 | actual = get_channels(channel_seed, client) 46 | 47 | self.assertEqual(expected, actual) 48 | -------------------------------------------------------------------------------- /DataCollection/tests/youtubecollector/resources/recommendation.json: -------------------------------------------------------------------------------- 1 | { 2 | "kind": "youtube#searchListResponse", 3 | "etag": "some identifier", 4 | "nextPageToken": "nextPageToken", 5 | "regionCode": "NL", 6 | "pageInfo": { 7 | "totalResults": 330, 8 | "resultsPerPage": 50 9 | }, 10 | "items": [ 11 | { 12 | "kind": "youtube#searchResult", 13 | "etag": "\"XpPGQXPnxQJhLgs6enD_n8JR4Qk/XMTNvc3BBTTBo2SWSuj2nscMDmw\"", 14 | "id": { 15 | "kind": "youtube#video", 16 | "videoId": "id of first target video" 17 | }, 18 | "snippet": { 19 | "publishedAt": "2018-01-01T01:01:01.000Z", 20 | "channelId": "channel Id of first video", 21 | "title": "title of first video", 22 | "description": "Description of first recommendation", 23 | "thumbnails": { 24 | "default": { 25 | "url": "https://someimage.jpg", 26 | "width": 120, 27 | "height": 90 28 | }, 29 | "medium": { 30 | "url": "https://someimage.jpg", 31 | "width": 320, 32 | "height": 180 33 | }, 34 | "high": { 35 | "url": "https://someimage.jpg", 36 | "width": 480, 37 | "height": 360 38 | } 39 | }, 40 | "channelTitle": "Title of channel", 41 | "liveBroadcastContent": "none" 42 | } 43 | }, 44 | { 45 | "kind": "youtube#searchResult", 46 | "etag": "some 
identifier", 47 | "id": { 48 | "kind": "youtube#video", 49 | "videoId": "id of second target video" 50 | }, 51 | "snippet": { 52 | "publishedAt": "2018-10-10T10:10:10.000Z", 53 | "channelId": "channel id of second video", 54 | "title": "title of second video", 55 | "description": "description of second video", 56 | "thumbnails": { 57 | "default": { 58 | "url": "https://image.jpg", 59 | "width": 120, 60 | "height": 90 61 | }, 62 | "medium": { 63 | "url": "https://image.jpg", 64 | "width": 320, 65 | "height": 180 66 | }, 67 | "high": { 68 | "url": "https://image.jpg", 69 | "width": 480, 70 | "height": 360 71 | } 72 | }, 73 | "channelTitle": "Title of second channel", 74 | "liveBroadcastContent": "none" 75 | } 76 | } 77 | ] 78 | } -------------------------------------------------------------------------------- /DataCollection/tests/youtubecollector/resources/video_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "kind": "youtube#videoListResponse", 3 | "etag": "\"XpPGQXPnxQJhLgs6enD_n8JR4Qk/b7mZ8rGMG1BFDpd9jh6TQbAlDik\"", 4 | "pageInfo": { 5 | "totalResults": 1, 6 | "resultsPerPage": 1 7 | }, 8 | "items": [ 9 | { 10 | "kind": "youtube#video", 11 | "etag": "etage", 12 | "id": "HIjfgBATDfs", 13 | "snippet": { 14 | "publishedAt": "2008-10-18T18:42:29.000Z", 15 | "channelId": "UC_0dwPeY0vQSJGVfRpFvGUg", 16 | "title": "title in metadata", 17 | "description": "Description in metadata", 18 | "thumbnails": { 19 | "default": { 20 | "url": "https://image.jpg", 21 | "width": 120, 22 | "height": 90 23 | }, 24 | "medium": { 25 | "url": "https://image.jpg", 26 | "width": 320, 27 | "height": 180 28 | }, 29 | "high": { 30 | "url": "https://image.jpg", 31 | "width": 480, 32 | "height": 360 33 | } 34 | }, 35 | "channelTitle": "Channel title of metadata", 36 | "tags": [ 37 | "tag one", 38 | "tag two" 39 | ], 40 | "categoryId": "1", 41 | "liveBroadcastContent": "none", 42 | "localized": { 43 | "title": "Localized title of video in metadata", 44 | "description": "localized Description in metadata" 45 | } 46 | }, 47 | "contentDetails": { 48 | "duration": "PT3M3S", 49 | "dimension": "2d", 50 | "definition": "sd", 51 | "caption": "false", 52 | "licensedContent": false, 53 | "regionRestriction": { 54 | "blocked": [ 55 | "GP", 56 | "DE", 57 | "GB" 58 | ] 59 | }, 60 | "projection": "rectangular" 61 | }, 62 | "statistics": { 63 | "viewCount": "120", 64 | "likeCount": "231", 65 | "dislikeCount": "342", 66 | "favoriteCount": "453", 67 | "commentCount": "564" 68 | }, 69 | "topicDetails": { 70 | "topicIds": [ 71 | "topic Id" 72 | ], 73 | "relevantTopicIds": [ 74 | "Relevant topics ids 1", 75 | "Relevant topics ids 2" 76 | ], 77 | "topicCategories": [ 78 | "https://en.wikipedia.org/wiki/Television_program", 79 | "https://en.wikipedia.org/wiki/Society" 80 | ] 81 | } 82 | } 83 | ] 84 | } -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/nl/macros/modals.tff: -------------------------------------------------------------------------------- 1 | #class="modals" 2 | @ZIJN={zijn, is, ben, waren, was} 3 | @ZIJNPER={hij is, ik ben, het is, zij is, ze is, zij zijn, we zijn, wij zijn, jij bent, je bent, dat is} 4 | @ZIJNNEG={zijn niet, is niet, was niet, waren niet, ben niet, is niet} 5 | @MISS={misschien, mogelijk, zou kunnen, kan, zouden, zou, zal, zullen} 6 | @MOG={mogelijk, misschien, mogelijkheid tot} 7 | @MOGNEG={mogelijk niet, misschien niet, geen mogelijkheid tot} 8 | @KAN={kan, kunnen, kon, konden, zou 
kunnen, zouden kunnen} 9 | @KANNEG={kan niet, kunnen niet, kon niet, konden niet} 10 | @KANHEB={kan hebben, kunnen hebben, kon hebben, konden hebben, zou kunnen hebben, zouden kunnen hebben} 11 | @KANHEBNEG={kan niet hebben, kunnen niet hebben, zal, niet hebben, heeft misschien niet, hebben misschien niet, zal misschien niet} 12 | 13 | @INSTAAT={in staat tot, niet in staat tot} 14 | @ISNIET={is niet} 15 | @ZOUPER={jij zou, hij zou, ik zou, het zou, zij zou, ze zou, zij zouden, wij zouden, we zouden, dat zou, dit zou} 16 | @HEBPER={ik heb, jij hebt, je hebt, zij heeft, hij heeft, we hebben, wij hebben, zij hebben, het heeft, dat heeft, dit heeft} 17 | @ZAL={zal, zouden} 18 | @ZALNEG={zal niet, zouden niet} 19 | @ZALPER={ik zal, jij zal, zij zal, ze zal, hij zal, dat zal, dit zal} 20 | @DURF={durven, durf} 21 | @DURFNEG={durven niet, durf niet} 22 | @HEB={hebben, heb, hebt, heeft, had} 23 | @HEBNEG={heeft niet, had niet, heb niet, hebt niet} 24 | @MOET={moet, moeten, zal moeten, zullen moeten} 25 | @HEEFTNEG={heeft geen, heeft niet, hebben geen, hebben niet, hoeft niet, hoeven niet} 26 | @MOETPER={het moet, het zal moeten, hij moet, hij zal moeten, zij moet, zij zal moeten, ik moet, ik zal moeten, we moeten, we zullen moeten, zij moeten, zij zullen moeten, jij moet, je moet, jij zal moeten, je zal moeten, dat moet, dat zal moeten, dit moet, dit zal moeten} 27 | 28 | @MOETNEG={moet niet, moeten niet} 29 | @ZALMOET={zal moeten} 30 | @MOETNEGHEB={moet niet hebben, moeten niet hebben, zal niet moeten hebben} 31 | @WAS={was, vroeger, in het verleden} 32 | 33 | @MOETHEB={moet hebben, moeten hebben} 34 | 35 | @ZOUMOET={zou moeten} 36 | @ZOUNEGMOET={zou niet moeten} 37 | @ZOUMOETHEB={zou moeten hebben} 38 | @DOE={doe, doen, doet} 39 | @DOENEG={doet niet, doen niet, doet het niet, doen het niet} 40 | @GA={ga, gaat, gaan, zal gaan, zullen gaan} 41 | @GANEG={ga niet, gaat niet, gaan niet, zal niet gaan, zullen niet gaan} 42 | @GAPER={ik ga, zij gaan, zij gaat, het gaat, we gaan, jij|je gaat, het gaat, dat gaat} 43 | @GANEGPER={ik ga niet, zij gaan niet, zij gaat niet, het gaat niet, we gaan niet, jij|je gaat niet, het gaat niet, dat gaat niet} 44 | -------------------------------------------------------------------------------- /DataCollection/tests/youtubecollector/resources/comment_with_reply.json: -------------------------------------------------------------------------------- 1 | { 2 | "kind": "youtube#commentThreadListResponse", 3 | "etag": "\"XpPGQXPnxQJhLgs6enD_n8JR4Qk/M13kILBSDXHZmf82KpKIUu78oro\"", 4 | "pageInfo": { 5 | "totalResults": 5, 6 | "resultsPerPage": 100 7 | }, 8 | "items": [ 9 | { 10 | "kind": "youtube#commentThread", 11 | "etag": "\"XpPGQXPnxQJhLgs6enD_n8JR4Qk/6WkG1Db9T-En4DWmNTTBdki5WTk\"", 12 | "id": "The comment id that is used", 13 | "snippet": { 14 | "videoId": "the video id", 15 | "topLevelComment": { 16 | "kind": "youtube#comment", 17 | "etag": "\"XpPGQXPnxQJhLgs6enD_n8JR4Qk/RUV3jLsQcA6PsDjkr3q5GkpAOmI\"", 18 | "id": "this comment id is not used", 19 | "snippet": { 20 | "authorDisplayName": "Author name", 21 | "authorProfileImageUrl": "somewhere.com/photo.jpg", 22 | "authorChannelUrl": "http://www.youtube.com/channel/someone", 23 | "authorChannelId": { 24 | "value": "someone" 25 | }, 26 | "videoId": "a video id", 27 | "textDisplay": "The text that is displayed", 28 | "textOriginal": "Would be amazing to see an Austin Petersen endorsement as well. 
He would stand right up there with Massie, Amash, and the rest of the liberty coalition in congress.", 29 | "canRate": true, 30 | "viewerRating": "none", 31 | "likeCount": 4, 32 | "disLikeCount": 2, 33 | "publishedAt": "2017-11-02T19:25:12.000Z", 34 | "updatedAt": "2017-11-02T19:25:12.000Z" 35 | } 36 | }, 37 | "canReply": true, 38 | "totalReplyCount": 0, 39 | "isPublic": true 40 | }, 41 | "replies": { 42 | "comments": [ 43 | { 44 | "kind": "youtube#comment", 45 | "etag": "\"XpPGQXPnxQJhLgs6enD_n8JR4Qk/TAYF2haOdXkzVaoBDUpJkoLXQSQ\"", 46 | "id": "The parent id.the reply id", 47 | "snippet": { 48 | "authorDisplayName": "Responder", 49 | "authorProfileImageUrl": "example2.com/photo.jpg", 50 | "authorChannelUrl": "http://www.youtube.com/channel/responder", 51 | "authorChannelId": { 52 | "value": "responder channel id" 53 | }, 54 | "videoId": "some video id", 55 | "textDisplay": "The response text", 56 | "textOriginal": "Different if something was changed", 57 | "parentId": "the parent id", 58 | "canRate": true, 59 | "viewerRating": "none", 60 | "likeCount": 1, 61 | "publishedAt": "2017-11-02T19:55:27.000Z", 62 | "updatedAt": "2017-11-02T19:55:27.000Z" 63 | } 64 | } 65 | ] 66 | } 67 | } 68 | ] 69 | } -------------------------------------------------------------------------------- /TopicModelling/Captions preprocessing/Preprocess leftwing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import ast\n", 11 | "\n", 12 | "raw_captions = 'C:/hackathon/captions.csv'\n", 13 | "raw_videolist = 'C:/hackathon/videolists_lefty.csv'\n", 14 | "preprocessed_captions = 'C:/hackathon/left_captions.csv'\n", 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "### Text cleaning\n", 22 | "\n", 23 | "#### Input format:\n", 24 | "List to csv -> string representation of list saved and then read. Contains duplicates, sometimes within the same list. Then separated by newline (\\n)" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 4, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "def clean_caption(caption_string):\n", 34 | " caption = ast.literal_eval(caption_string) #list representation to actual list\n", 35 | " caption = [newline for oldline in caption for newline in oldline.split('\\n') if newline != ' '] #split lines by newline,\n", 36 | " #unnest the result and keep if line is not empty (in this case just a space)\n", 37 | " result = [] #Initialise empty list to store non-duplicates in. 
Does not use set because lines can be identical,\n", 38 | " #so the criterion is to only drop duplicates if they follow eachother\n", 39 | " prevline = ''\n", 40 | " for line in caption:\n", 41 | " if line == prevline:\n", 42 | " continue\n", 43 | " result.append(line)\n", 44 | " prevline = line\n", 45 | " return '\\n'.join(result)" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 6, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "df = pd.read_csv(raw_captions, names = ['yt_id','captions'])\n", 55 | "df['captions'] = df['captions'].apply(clean_caption)\n", 56 | "\n", 57 | "df_videolist = pd.read_csv(raw_videolist, names = ['yt_id','timestamp','title','channel_id','channel_name'])\n", 58 | "df = df.merge(df_videolist, how = 'left', on = 'yt_id')\n", 59 | "\n", 60 | "df.to_csv(preprocessed_captions,index=False)" 61 | ] 62 | } 63 | ], 64 | "metadata": { 65 | "kernelspec": { 66 | "display_name": "Python 3", 67 | "language": "python", 68 | "name": "python3" 69 | }, 70 | "language_info": { 71 | "codemirror_mode": { 72 | "name": "ipython", 73 | "version": 3 74 | }, 75 | "file_extension": ".py", 76 | "mimetype": "text/x-python", 77 | "name": "python", 78 | "nbconvert_exporter": "python", 79 | "pygments_lexer": "ipython3", 80 | "version": "3.6.3" 81 | } 82 | }, 83 | "nbformat": 4, 84 | "nbformat_minor": 2 85 | } 86 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/en/macros/modals.tff: -------------------------------------------------------------------------------- 1 | #class="modals" 2 | @BE={be,is,am,are,were,was,been,being} 3 | @BECL={he\'s,I\'m,it\'s,she\'s,they\'re,we\'re,you\'re, that\'s} 4 | @BENEG={aren\'t,isn\'t,wasn\'t,weren\'t, am not, is not, are not, was not, were not} 5 | @MODAL={might,may,could,can,should,shall,will,would} 6 | @MAY={may,might} 7 | @MAYNEG={may not,might not,mightn\'t, mayn\'t} 8 | @MAYHAVE={may have, might have, might\'ve, may\'ve}\ 9 | @MAYNEGHAVE={may not have, might not have, mightn\'t have, mayn\'t have} 10 | @CAN={can,could} 11 | @CANNEG={cannot,can\'t,couldn\'t, could not} 12 | @CANHAVE={can have, could have, could\'ve} 13 | @SHALL={shall,should, shalt} 14 | @SHALLHAVE={shall have, should have} 15 | @SHALLNEG={shan't, shouldn\'t, shall not, should not} 16 | @SHALLNEGHAVE={shall not have, should not have, shouldn\'t have} 17 | @ABLE={able,unable} 18 | @AINT={ain\'t} 19 | @WOULDCL={you\'d,he\'d,I\'d,it\'d,she\'d,they\'d,we\'d, that\'d, this\'d} 20 | @HAVECL={i\'ve, you\'ve, she\'s, he\'s, we\'ve, they\'ve, it\'s, that\'s, this\'s} 21 | @WILL={will, would} 22 | @WILLNEG={wouldn\'t,won\'t, will not, would not} 23 | @WILLCL={i\'ll, you\'ll, she\'ll, he\'ll,that\'ll, this\'ll} 24 | @DARE={dare} 25 | @DARENEG={daren\'t, dare not} 26 | @HAVE={have,has,had,having} 27 | @HAVENEG={haven\'t,hasn\'t,hadn\'t, had not, has not, have not} 28 | @HAVEGOTTA={has got to, has gotta, have got to, have gotta, gotta} 29 | @HAVEGOTTANEG={hasn\'t got to, hasn\'t gotta, haven\'t got to, haven\'t gotta, ain\'t gotta, ain\'t got to} 30 | @HAVEGOTTACL={it\'s gotta, it\'s got to, he\'s got to, he\'s gotta, she\'s got to, she\'s gotta, i\'ve gotta, i\'ve got to, we\'ve gotta, we\'ve got to, they've got to, they've gotta, you\'ve got to, you\'ve gotta, that\'s got to, that\'s gotta, this\'s gotta, this\'s got to} 31 | @MUST={must} 32 | @MUSTNEG={mustn\'t} 33 | @MUSTHAVE={must have, must\'ve} 34 | @MUSTNEGHAVE={must not have, mustn\'t have, mustn\'t \'ve} 35 | @USEDTO={used to} 36 | 
@NEED={need} 37 | @NEEDNEG={needn\'t} 38 | @NEEDHAVE={need\'ve, need have} 39 | @NEEDHAVENEG={needn\'t have, needn\'t\'ve, need not have} 40 | @OUGHT={ought to, oughta} 41 | @OUGHTNEG={oughtn\'t to} 42 | @OUGHTHAVE={ought to have, oughta have} 43 | @DO={do, does} 44 | @DONEG={doesn\'t,don\'t} 45 | @GONNA={am going to, are going to, is going to, am gonna, are gonna, is gonna} 46 | @GONNANEG={am not going to, are not going to, is not going to, am not gonna, are not gonna, is not gonna, ain\'t gonna, isn\'t gonna, aren\'t gonna} 47 | @GONNACL={i\'m going to, they\'re going to, she\'s going to, it\'s going to, we\'re going to, i\'m gonna, you\'re gonna, i\'m gonna, you\'re gonna, he\'s gonna, she\'s gonna, it\'s gonna, we\'re gonna, they\'re gonna, that\'s gonna} 48 | @GONNANEGCL={i\'m not going to, they\'re not going to, she\'s not going to, it\'s not going to, we\'re not going to, i\'m not gonna, you\'re not gonna, he\'s not gonna, she\'s not gonna, it\'s not gonna, we\'re not gonna, they\'re not gonna, that\'s not gonna} 49 | -------------------------------------------------------------------------------- /DataCollection/tests/youtubecollector/test_comments.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from utils_for_test import read_json_from_file 4 | from youtubecollector.comments import convert_to_comments, comment 5 | 6 | 7 | class CommentTest(TestCase): 8 | 9 | def test_get_full_comment(self): 10 | response = read_json_from_file("comment_full.json") 11 | actual = convert_to_comments(response) 12 | 13 | expected = [ 14 | comment(video_id='the video id', comment_id='The comment id that is used', 15 | author_display_name='Author name', 16 | author_channel_url='http://www.youtube.com/channel/someone', 17 | author_channel_id='someone', 18 | comment_text='The text that is displayed', 19 | comment_like_count=4, 20 | comment_dislike_count=2, 21 | comment_time='2017-11-02T19:25:12.000Z', 22 | reply_count=0) 23 | ] 24 | 25 | self.assertEqual(actual, expected) 26 | 27 | def test_get_minimal_comment(self): 28 | response = read_json_from_file("comment_minimal.json") 29 | actual = convert_to_comments(response) 30 | 31 | expected = [ 32 | comment(video_id='the video id', comment_id='The comment id that is used', 33 | author_display_name='Author name', 34 | author_channel_url='http://www.youtube.com/channel/someone', 35 | author_channel_id='not set', 36 | comment_text='The text that is displayed', 37 | comment_like_count=4, 38 | comment_dislike_count=0, 39 | comment_time='2017-11-02T19:25:12.000Z', 40 | reply_count=0) 41 | ] 42 | 43 | self.assertEqual(actual, expected) 44 | 45 | def test_get_comments_with_replies(self): 46 | response = read_json_from_file("comment_with_reply.json") 47 | actual = convert_to_comments(response) 48 | 49 | expected = [ 50 | comment(video_id='the video id', comment_id='The comment id that is used', 51 | author_display_name='Author name', 52 | author_channel_url='http://www.youtube.com/channel/someone', 53 | author_channel_id='someone', 54 | comment_text='The text that is displayed', 55 | comment_like_count=4, 56 | comment_dislike_count=2, 57 | comment_time='2017-11-02T19:25:12.000Z', 58 | reply_count=0), 59 | comment(video_id='some video id', comment_id='The parent id.the reply id', 60 | author_display_name='Responder', 61 | author_channel_url='http://www.youtube.com/channel/responder', 62 | author_channel_id='responder channel id', 63 | comment_text='The response text', 64 | 
comment_like_count=1, 65 | comment_dislike_count='', 66 | comment_time='2017-11-02T19:55:27.000Z', 67 | reply_count='') 68 | ] 69 | 70 | self.assertEqual(actual, expected) 71 | -------------------------------------------------------------------------------- /TopicModelling/TopicModelWrapper/StreamingPreprocesser.py: -------------------------------------------------------------------------------- 1 | import string 2 | 3 | 4 | class StreamingPreprocesser(object): 5 | 6 | def __init__(self, stopwords=None, processes=None): 7 | self.source_generator = None 8 | 9 | self.stopwords = stopwords if stopwords is not None else open('stopwords-nl.txt', 'r').read().split('\n') 10 | 11 | punctuation = string.punctuation 12 | punctuation += '`’‘”“' 13 | self.punctuation_table = str.maketrans('', '', punctuation) 14 | 15 | spacers = '\n\r\t' 16 | self.spacers_table = str.maketrans(spacers, ' ' * len(spacers)) 17 | 18 | self.processes = processes 19 | if self.processes is None: 20 | self.processes = [ 21 | # self.process_string, 22 | self.encode_doc_to_ascii, 23 | self.remove_punctuation, 24 | self.remove_spacers, 25 | self.to_lower_case, 26 | self.tokenise, 27 | self.remove_stopwords, 28 | self.remove_digit_terms, 29 | self.remove_min_len 30 | ] 31 | 32 | def __iter__(self): 33 | for tokens in self.process(self.source_generator): 34 | yield tokens 35 | 36 | def add_processor(self, process): 37 | if callable(process): 38 | self.processes.append(process) 39 | 40 | def process(self, text): 41 | pipeline = text 42 | pipeline = [pipeline] if type(pipeline) == str else pipeline 43 | for processor in self.processes: 44 | pipeline = processor(pipeline) 45 | return pipeline 46 | 47 | def encode_doc_to_ascii(self, texts): 48 | for text in texts: 49 | yield text.encode('ascii', errors='ignore').decode('utf8') 50 | # List all non ascii characters and translate/filter them out 51 | 52 | def remove_punctuation(self, texts): 53 | for text in texts: 54 | yield text.translate(self.punctuation_table) 55 | 56 | def remove_spacers(self, texts): 57 | for text in texts: 58 | yield text.translate(self.spacers_table) 59 | 60 | def to_lower_case(self, texts): 61 | for text in texts: 62 | yield text.lower() 63 | 64 | def tokenise(self, texts): 65 | for text in texts: 66 | for token in text.split(): 67 | yield token 68 | 69 | def remove_stopwords(self, tokens): 70 | for token in tokens: 71 | if token not in self.stopwords: 72 | yield token 73 | else: 74 | continue 75 | 76 | def remove_digit_terms(self, tokens): 77 | for token in tokens: 78 | if not token.isdigit(): 79 | yield token 80 | else: 81 | continue 82 | 83 | def remove_min_len(self, tokens): 84 | for token in tokens: 85 | if len(token) > 2: # self.token_min 86 | yield token 87 | else: 88 | continue 89 | -------------------------------------------------------------------------------- /TopicModelling/TextLemma/filterTranscripts.py: -------------------------------------------------------------------------------- 1 | import getTokens 2 | import pandas as pd 3 | import ast 4 | import re 5 | import numpy as np 6 | from gensim.models import Word2Vec 7 | from joblib import Parallel, delayed 8 | import multiprocessing 9 | from math import sqrt 10 | from collections import defaultdict 11 | 12 | 13 | class CallBack(object): 14 | completed = defaultdict(int) 15 | 16 | def __init__(self, index, parallel): 17 | self.index = index 18 | self.parallel = parallel 19 | 20 | def __call__(self, index): 21 | CallBack.completed[self.parallel] += 1 22 | print("done with 
{}".format(CallBack.completed[self.parallel])) 23 | if self.parallel._original_iterable: 24 | self.parallel.dispatch_next() 25 | 26 | import joblib.parallel 27 | joblib.parallel.CallBack = CallBack 28 | 29 | def trainW2vTranscripts(): 30 | """ train a column of strings to a word2vec model""" 31 | df = pd.read_csv('data/captions-filtered.csv', encoding='utf-8') 32 | model = getWord2VecModel.getWord2Vec(train=df['transcript_clean']) 33 | model.most_similar(positive=['muslim']) 34 | 35 | def callable(df): 36 | df['transcript_clean'] = np.nan 37 | datalength = len(df) 38 | print(df.head()) 39 | li_transcripts = ['n'] * len(df) 40 | for index, transcript in enumerate(df['transcript']): 41 | transcript_clean = ast.literal_eval(transcript) 42 | transcript_clean = getTokens.getTokens(li_strings=(ast.literal_eval(transcript)), lemmatizing=True) 43 | li_transcripts[index] = transcript_clean 44 | df['transcript_clean'] = li_transcripts 45 | return df 46 | 47 | def cleanTranscripts(): 48 | """ filter the transcripts by removing stopwords and stemming """ 49 | dfs = pd.read_csv('data/captions-clean.csv', encoding='utf-8', chunksize=500, nrows=1, skiprows=range(1,40000)) 50 | parallel = Parallel(n_jobs=multiprocessing.cpu_count()) 51 | retlist = parallel(delayed(callable)(i) for i in dfs) 52 | df = pd.concat(retlist) 53 | # df['transcript_clean'] = np.nan 54 | # datalength = len(df) 55 | # print(df.head()) 56 | # li_transcripts = ['n'] * len(df) 57 | # for index, transcript in enumerate(df['transcript']): 58 | # transcript_clean = ast.literal_eval(transcript) 59 | # transcript_clean = getTokens.getTokens(li_strings=(ast.literal_eval(transcript)), lemmatizing=True) 60 | # li_transcripts[index] = transcript_clean 61 | # if index % 200 == 0: 62 | # df['transcript_clean'] = li_transcripts 63 | # df.to_csv('data/captions-filtered.csv', encoding='utf-8') 64 | # print('Completed video ' + str(index) + '/' + str(datalength)) 65 | # df['transcript_clean'] = li_transcripts 66 | df.to_csv('data/captions-filtered.csv', encoding='utf-8') 67 | # print('Completed video ' + str(index) + '/' + str(datalength)) 68 | 69 | def removeDuplicateEntries(): 70 | """ remove the duplicate transcript entries """ 71 | df = pd.read_csv('data/captions.csv', encoding='utf-8') 72 | df.columns = ['id', 'transcript'] 73 | 74 | li_transcripts = [] 75 | for transcript in df['transcript']: 76 | li_transcript = ast.literal_eval(transcript) 77 | li_transcript = li_transcript[0::3] 78 | li_transcripts.append(li_transcript) 79 | df['transcript'] = li_transcripts 80 | 81 | df.to_csv('data/captions-clean.csv', encoding='utf-8') 82 | 83 | cleanTranscripts() -------------------------------------------------------------------------------- /DataCollection/src/youtubecollector/channels.py: -------------------------------------------------------------------------------- 1 | import csv as _csv 2 | from collections import namedtuple as _namedtuple 3 | from .util import is_empty_file as _is_empty_file 4 | from .util import convert_to_dictionary as _convert_to_dictionary 5 | 6 | channel = _namedtuple("channel", ('channel_id', 7 | 'channel_title', 8 | 'channel_description', 9 | 'channel_default_language', 10 | 'channel_country', 11 | 'channel_uploads', 12 | 'channel_viewcount', 13 | 'channel_commentcount', 14 | 'channel_subscribercount', 15 | 'channel_videocount', 16 | 'channel_topic_ids', 17 | 'channel_topic_categories', 18 | 'channel_branding_keywords')) 19 | 20 | 21 | def _get_channel_header(): 22 | return channel._fields 23 | 24 | 25 | def 
_get_channel(channel_id, youtube_client): 26 | """Queries the youtube API and gets a json in return""" 27 | 28 | return youtube_client.channels().list( 29 | part='snippet,contentDetails,topicDetails,statistics,brandingSettings', 30 | id=channel_id 31 | ).execute() 32 | 33 | 34 | def _convert_to_channel(response) -> channel: 35 | """Extracts the needed variables from the returned json""" 36 | response_channel = response['items'][0] 37 | return channel(channel_id=response_channel['id'], 38 | channel_title=response_channel['snippet']['title'], 39 | channel_description=response_channel['snippet']['description'], 40 | channel_default_language=response_channel['snippet'].get('defaultLanguage', 'not set'), 41 | channel_country=response_channel['snippet'].get('country', 'not set'), 42 | channel_uploads=response_channel['contentDetails']['relatedPlaylists'].get('uploads', ''), 43 | channel_viewcount=response_channel['statistics']['viewCount'], 44 | channel_commentcount=response_channel['statistics']['commentCount'], 45 | channel_subscribercount=response_channel['statistics']['subscriberCount'], 46 | channel_videocount=response_channel['statistics']['videoCount'], 47 | channel_topic_ids=response_channel['topicDetails'].get('topicIds', 'not set'), 48 | channel_topic_categories=response_channel['topicDetails'].get('topicCategories', 'not set'), 49 | channel_branding_keywords=response_channel['brandingSettings']['channel'].get('keywords', 'not set') 50 | ) 51 | 52 | 53 | def _is_empty(response): 54 | return len(response['items']) == 0 55 | 56 | 57 | def get_channels(channel_seeds, youtube_client): 58 | channels = list() 59 | for channel_id in channel_seeds['channel_id']: 60 | response = _get_channel(channel_id, youtube_client) 61 | if _is_empty(response): 62 | print(f"Channel with channel_id {channel_id} returns empty") 63 | continue 64 | else: 65 | next_channel = _convert_to_channel(response) 66 | channels.append(next_channel) 67 | print(channel_id) 68 | 69 | return channels 70 | 71 | 72 | def write_channels(channels, channel_filename): 73 | with open(channel_filename, "a") as csv_file: 74 | writer = _csv.DictWriter(csv_file, fieldnames=_get_channel_header()) 75 | if _is_empty_file(channel_filename): 76 | writer.writeheader() 77 | 78 | for channel_row in channels: 79 | writer.writerow(_convert_to_dictionary(channel_row)) 80 | -------------------------------------------------------------------------------- /TopicModelling/TextLemma/getTokens.py: -------------------------------------------------------------------------------- 1 | import re 2 | import pandas as pd 3 | import numpy as np 4 | import re 5 | import pickle 6 | import operator 7 | #import glove_python 8 | from matplotlib.font_manager import FontProperties 9 | from nltk.stem.snowball import SnowballStemmer 10 | from nltk.stem.wordnet import WordNetLemmatizer 11 | from nltk.corpus import stopwords 12 | 13 | def getTokens(li_strings='', stemming=False, lemmatizing=False): 14 | if stemming: 15 | global di_stems 16 | di_stems = pickle.load(open('di_stems.p', 'rb')) 17 | 18 | # print('imported') 19 | #do some cleanup: only alphabetic characters, no stopwords 20 | # create separate stemmed tokens, to which the full strings will be compared to: 21 | li_comments_stemmed = [] 22 | len_comments = len(li_strings) 23 | # print(len(li_strings)) 24 | # print('Creating list of tokens per monthly document') 25 | for index, comment in enumerate(li_strings): 26 | #create list of list for comments and tokens 27 | if isinstance(comment, str): 28 | 
li_comment_stemmed = [] 29 | li_comment_stemmed = getFilteredText(comment, stemming=stemming, lemmatizing=lemmatizing) 30 | li_comments_stemmed.append(li_comment_stemmed) 31 | #if index % 1000 == 0: 32 | #print('Stemming/tokenising finished for string ' + str(index) + '/' + str(len_comments)) 33 | # print(len(li_comments_stemmed)) 34 | 35 | if stemming: 36 | pickle.dump(di_stems, open('di_stems.p', 'wb')) 37 | df_stems = pd.DataFrame.from_dict(di_stems, orient='index') 38 | df_stems.to_csv('di_stems_dataframe.csv', encoding='utf-8') 39 | 40 | return li_comments_stemmed 41 | 42 | def getFilteredText(string, stemming=False, lemmatizing=False): 43 | #first, remove urls 44 | if 'http' in string: 45 | string = re.sub(r'https?:\/\/.*[\r\n]*', ' ', string) 46 | if 'www.' in string: 47 | string = re.sub(r'www.*[\r\n]*', ' ', string) 48 | 49 | #use nltk's tokeniser to get a list of words 50 | # from nltk.tokeimport TreebankWordTokenizer 51 | # tokenizer = TreebankWordTokenizer() 52 | # tokenizer.PARENS_BRACKETS = [] 53 | # tokens = [word.lower() for sent in nltk.sent_tokenize(string) for word in tokenizer.tokenize(sent)] 54 | tokens = re.findall("[a-zA-Z\-\)\(]{3,50}", string) 55 | stemmer = SnowballStemmer("english") 56 | #list with tokens further processed 57 | li_filtered_tokens = [] 58 | # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation) 59 | for token in tokens: 60 | token = token.lower() 61 | #print(len(tokens)) 62 | #only alphabetic characters, keep '(' and ')' symbols for echo brackets, only tokens with three or more characters 63 | #if re.search('[a-zA-Z\-\)\(]{3,50}', token): 64 | if re.match('[a-zA-Z\-\)\(]{3,50}', token) is not None: 65 | #no stopwords 66 | if token not in stopwords.words('english'): 67 | #token = token.lower() 68 | #shorten word if it's longer than 20 characters (e.g. 
'reeeeeeeeeeeeeeeeeeeeeeeee') 69 | if len(token) >= 20: 70 | token = token[:20] 71 | #stem if indicated it should be stemmed 72 | if stemming: 73 | token_stemmed = stemmer.stem(token) 74 | li_filtered_tokens.append(token_stemmed) 75 | 76 | #update lookup dict with token and stemmed token 77 | #lookup dict is dict of stemmed words as keys and lists as full tokens 78 | if token_stemmed in di_stems: 79 | if token not in di_stems[token_stemmed]: 80 | di_stems[token_stemmed].append(token) 81 | else: 82 | di_stems[token_stemmed] = [] 83 | di_stems[token_stemmed].append(token) 84 | #if lemmatizing is used instead 85 | elif lemmatizing: 86 | lemmatizer = WordNetLemmatizer() 87 | token = lemmatizer.lemmatize(token) 88 | li_filtered_tokens.append(token) 89 | else: 90 | li_filtered_tokens.append(token) 91 | return li_filtered_tokens -------------------------------------------------------------------------------- /DataCollection/src/youtubecollector/video.py: -------------------------------------------------------------------------------- 1 | import csv as _csv 2 | from collections import namedtuple as _namedtuple 3 | 4 | from .util import is_empty_file as _is_empty_file 5 | from .util import convert_to_dictionary as _convert_to_dictionary 6 | 7 | video = _namedtuple('video', ('video_id', 8 | 'video_published', 9 | 'channel_id', 10 | 'video_title', 11 | 'video_description', 12 | 'video_channel_title', 13 | 'video_tags', 14 | 'video_category_id', 15 | 'video_default_language', 16 | 'video_duration', 17 | 'video_view_count', 18 | 'video_comment_count', 19 | 'video_likes_count', 20 | 'video_dislikes_count', 21 | 'video_topic_ids', 22 | 'video_topic_categories' 23 | )) 24 | 25 | 26 | def _get_video_header(): 27 | return video._fields 28 | 29 | 30 | def get_more_videos(channel_uploads, youtube_client, next_page_token, max_results=None): 31 | """takes the id of the uploads_playlist 32 | in channel data""" 33 | 34 | return youtube_client.playlistItems().list( 35 | part='snippet,contentDetails', 36 | playlistId=channel_uploads, 37 | maxResults=50, 38 | pageToken=next_page_token 39 | ).execute() 40 | 41 | 42 | def get_videos(channel_uploads, youtube_client, max_results=None): 43 | return youtube_client.playlistItems().list( 44 | part='snippet,contentDetails', 45 | playlistId=channel_uploads, 46 | maxResults=50 47 | ).execute() 48 | 49 | 50 | def _get_video_metadata(video_id, youtube_client): 51 | return youtube_client.videos().list( 52 | part='snippet,contentDetails,statistics,topicDetails', 53 | id=video_id 54 | ).execute() 55 | 56 | 57 | def _get_topic_ids(metadata): 58 | if "topicDetails" in metadata: 59 | return metadata['topicDetails'].get('relevantTopicIds', "not set") 60 | else: 61 | return "not set" 62 | 63 | 64 | def _get_topic_categories(metadata): 65 | if "topicDetails" in metadata: 66 | return metadata['topicDetails'].get('topicCategories', "not set") 67 | else: 68 | return "not set" 69 | 70 | 71 | def convert_to_videos(response, youtube_client): 72 | videos = list() 73 | for data in response['items']: 74 | video_id = data['contentDetails']['videoId'] 75 | video_metadata = _get_video_metadata(video_id, youtube_client) 76 | metadata = video_metadata['items'][0] 77 | 78 | next_video = video(video_id=video_id, 79 | video_published=data['snippet']['publishedAt'], 80 | channel_id=data['snippet']['channelId'], 81 | video_title=data['snippet']['title'], 82 | video_description=data['snippet'].get('description', 'not set'), 83 | video_channel_title=data['snippet']['channelTitle'], 84 | 
video_tags=metadata['snippet'].get('tags', 'not set'), 85 | video_category_id=metadata['snippet'].get('categoryId', 'not set'), 86 | video_default_language=metadata['snippet'].get('defaultLanguage', 'not set'), 87 | video_duration=metadata['contentDetails']['duration'], 88 | video_view_count=metadata['statistics'].get('viewCount', 0), 89 | video_comment_count=metadata['statistics'].get('commentCount', 0), 90 | video_likes_count=metadata['statistics'].get('likeCount', 0), 91 | video_dislikes_count=metadata['statistics'].get('dislikeCount', 0), 92 | video_topic_ids=_get_topic_ids(metadata), 93 | video_topic_categories=_get_topic_categories(metadata) 94 | ) 95 | videos.append(next_video) 96 | 97 | return videos 98 | 99 | 100 | def write_videos(videos, video_file): 101 | with open(video_file, "a") as csv_file: 102 | writer = _csv.DictWriter(csv_file, fieldnames=_get_video_header()) 103 | if _is_empty_file(video_file): 104 | writer.writeheader() 105 | 106 | for video_row in videos: 107 | writer.writerow(_convert_to_dictionary(video_row)) 108 | -------------------------------------------------------------------------------- /TopicModelling/getTokens.py: -------------------------------------------------------------------------------- 1 | import re 2 | import sqlite3 3 | import pandas as pd 4 | import numpy as np 5 | import matplotlib.pyplot as plt, mpld3 6 | import time 7 | import re 8 | import os 9 | import nltk 10 | import pickle 11 | import operator 12 | #import glove_python 13 | from matplotlib.font_manager import FontProperties 14 | from nltk.stem.snowball import SnowballStemmer 15 | from nltk.stem.wordnet import WordNetLemmatizer 16 | from nltk.corpus import stopwords 17 | from scipy.interpolate import spline 18 | from datetime import datetime, timedelta 19 | from collections import OrderedDict 20 | from sklearn.feature_extraction.text import TfidfVectorizer 21 | from sklearn.metrics.pairwise import cosine_similarity 22 | from sklearn.cluster import KMeans 23 | from sklearn.externals import joblib 24 | from sklearn.manifold import MDS 25 | from sklearn.manifold import TSNE 26 | from sklearn.decomposition import PCA 27 | from gensim.models import Word2Vec 28 | from gensim.models import fasttext 29 | from gensim.scripts.word2vec2tensor import word2vec2tensor 30 | from matplotlib import pyplot 31 | from adjustText import adjust_text 32 | 33 | def getTokens(li_strings='', stemming=False, lemmatizing=False): 34 | if stemming: 35 | global di_stems 36 | di_stems = pickle.load(open('di_stems.p', 'rb')) 37 | 38 | # print('imported') 39 | #do some cleanup: only alphabetic characters, no stopwords 40 | # create separate stemmed tokens, to which the full strings will be compared to: 41 | li_comments_stemmed = [] 42 | len_comments = len(li_strings) 43 | # print(len(li_strings)) 44 | # print('Creating list of tokens per monthly document') 45 | for index, comment in enumerate(li_strings): 46 | #create list of list for comments and tokens 47 | if isinstance(comment, str): 48 | li_comment_stemmed = [] 49 | li_comment_stemmed = getFilteredText(comment, stemming=stemming, lemmatizing=lemmatizing) 50 | li_comments_stemmed.append(li_comment_stemmed) 51 | #if index % 1000 == 0: 52 | #print('Stemming/tokenising finished for string ' + str(index) + '/' + str(len_comments)) 53 | # print(len(li_comments_stemmed)) 54 | 55 | if stemming: 56 | pickle.dump(di_stems, open('di_stems.p', 'wb')) 57 | df_stems = pd.DataFrame.from_dict(di_stems, orient='index') 58 | df_stems.to_csv('di_stems_dataframe.csv', encoding='utf-8') 
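# di_stems maps every stemmed token to the list of raw tokens it was derived from;
# persisting it as a pickle and a CSV makes it possible to translate stems back
# into readable words in later analysis steps.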
59 | 60 | return li_comments_stemmed 61 | 62 | def getFilteredText(string, stemming=False, lemmatizing=False): 63 | #first, remove urls 64 | if 'http' in string: 65 | string = re.sub(r'https?:\/\/.*[\r\n]*', ' ', string) 66 | if 'www.' in string: 67 | string = re.sub(r'www.*[\r\n]*', ' ', string) 68 | 69 | #use nltk's tokeniser to get a list of words 70 | # from nltk.tokeimport TreebankWordTokenizer 71 | # tokenizer = TreebankWordTokenizer() 72 | # tokenizer.PARENS_BRACKETS = [] 73 | # tokens = [word.lower() for sent in nltk.sent_tokenize(string) for word in tokenizer.tokenize(sent)] 74 | tokens = re.findall("[a-zA-Z\-\)\(]{3,50}", string) 75 | stemmer = SnowballStemmer("english") 76 | #list with tokens further processed 77 | li_filtered_tokens = [] 78 | # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation) 79 | for token in tokens: 80 | token = token.lower() 81 | #print(len(tokens)) 82 | #only alphabetic characters, keep '(' and ')' symbols for echo brackets, only tokens with three or more characters 83 | #if re.search('[a-zA-Z\-\)\(]{3,50}', token): 84 | if re.match('[a-zA-Z\-\)\(]{3,50}', token) is not None: 85 | #no stopwords 86 | if token not in stopwords.words('english'): 87 | #token = token.lower() 88 | #shorten word if it's longer than 20 characters (e.g. 'reeeeeeeeeeeeeeeeeeeeeeeee') 89 | if len(token) >= 20: 90 | token = token[:20] 91 | #stem if indicated it should be stemmed 92 | if stemming: 93 | token_stemmed = stemmer.stem(token) 94 | li_filtered_tokens.append(token_stemmed) 95 | 96 | #update lookup dict with token and stemmed token 97 | #lookup dict is dict of stemmed words as keys and lists as full tokens 98 | if token_stemmed in di_stems: 99 | if token not in di_stems[token_stemmed]: 100 | di_stems[token_stemmed].append(token) 101 | else: 102 | di_stems[token_stemmed] = [] 103 | di_stems[token_stemmed].append(token) 104 | #if lemmatizing is used instead 105 | elif lemmatizing: 106 | lemmatizer = WordNetLemmatizer() 107 | token = lemmatizer.lemmatize(token) 108 | li_filtered_tokens.append(token) 109 | else: 110 | li_filtered_tokens.append(token) 111 | return li_filtered_tokens -------------------------------------------------------------------------------- /DataCollection/src/youtubecollector/comments.py: -------------------------------------------------------------------------------- 1 | import csv as _csv 2 | from collections import namedtuple as _namedtuple 3 | 4 | from googleapiclient.errors import HttpError 5 | 6 | from .util import is_empty_file as _is_empty_file 7 | from .util import convert_to_dictionary as _convert_to_dictionary 8 | 9 | comment = _namedtuple("comment", ('video_id', 10 | 'comment_id', 11 | 'author_display_name', 12 | 'author_channel_url', 13 | 'author_channel_id', 14 | 'comment_text', 15 | 'comment_like_count', 16 | 'comment_dislike_count', 17 | 'comment_time', 18 | 'reply_count')) 19 | 20 | 21 | def _get_comment_header(): 22 | return comment._fields 23 | 24 | 25 | def get_comments(video_id, youtube_client): 26 | try: 27 | return youtube_client.commentThreads().list( 28 | videoId=video_id, 29 | part='snippet,replies', 30 | maxResults=100 31 | ).execute() 32 | except HttpError: 33 | return 34 | 35 | 36 | def get_more_comments(video_id, youtube_client, next_page_token): 37 | try: 38 | return youtube_client.commentThreads().list( 39 | videoId=video_id, 40 | part='snippet,replies', 41 | pageToken=next_page_token, 42 | maxResults=100 43 | ).execute() 44 | except HttpError: 45 | return 46 | 47 | 48 | def 
_get_author_channel_id(data): 49 | if "authorChannelId" in data['snippet']['topLevelComment']['snippet']: 50 | return data['snippet']['topLevelComment']['snippet']['authorChannelId'].get("value", 'not set') 51 | else: 52 | return "not set" 53 | 54 | 55 | def convert_to_comments(response): 56 | if response is None: 57 | return list() 58 | 59 | comments = list() 60 | for data in response['items']: 61 | comments.append(comment(comment_id=data['id'], 62 | video_id=data['snippet']['videoId'], 63 | author_display_name=data['snippet']['topLevelComment']['snippet']['authorDisplayName'], 64 | author_channel_url=data['snippet']['topLevelComment']['snippet']['authorChannelUrl'], 65 | author_channel_id=_get_author_channel_id(data), 66 | comment_text=data['snippet']['topLevelComment']['snippet']['textDisplay'], 67 | comment_like_count=data['snippet']['topLevelComment']['snippet']['likeCount'], 68 | comment_dislike_count=data['snippet']['topLevelComment']['snippet'].get('disLikeCount', 0), 69 | comment_time=data['snippet']['topLevelComment']['snippet']['publishedAt'], 70 | reply_count=data['snippet']['totalReplyCount']) 71 | ) 72 | if 'replies' in data: 73 | for reply in data['replies']['comments']: 74 | # Replies can be recognized by the format of their id: 75 | # The id is made out of two elements: {parent_comment_id}.{reply_id} 76 | # TODO[Olaf]: Do we want to add a boolean field if something is an reply 77 | 78 | comments.append(comment(comment_id=reply['id'], 79 | video_id=reply['snippet']['videoId'], 80 | author_display_name=reply['snippet']['authorDisplayName'], 81 | author_channel_url=reply['snippet']['authorChannelUrl'], 82 | author_channel_id=reply['snippet']['authorChannelId']['value'], 83 | comment_text=reply['snippet']['textDisplay'], 84 | comment_like_count=reply['snippet']['likeCount'], 85 | comment_dislike_count='', 86 | comment_time=reply['snippet']['publishedAt'], 87 | reply_count='')) 88 | 89 | return comments 90 | 91 | 92 | def write_comments(comments_file, comments): 93 | with open(comments_file, 'a') as csv_file: 94 | writer = _csv.DictWriter(csv_file, fieldnames=_get_comment_header()) 95 | if _is_empty_file(comments_file): 96 | writer.writeheader() 97 | 98 | for comment_row in comments: 99 | writer.writerow(_convert_to_dictionary(comment_row)) 100 | -------------------------------------------------------------------------------- /TopicModelling/language_detection/spacy-language-detection.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pickle\n", 10 | "from collections import namedtuple\n", 11 | "from tqdm import tqdm_notebook as tqdm\n", 12 | "\n", 13 | "import pandas as pd\n", 14 | "import spacy\n", 15 | "from spacy_cld import LanguageDetector\n" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "nlp = spacy.load('en_core_web_sm')\n", 32 | "language_detector = LanguageDetector()\n", 33 | "nlp.add_pipe(language_detector)" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "# Util function to write lazy chunks back to disk\n", 43 | "output_columns = [\"id\", \"hash1\", \"hash2\", \"user\", 
\"user_pic\", \"channel_url\", \"channel_id\", \"comment\", \n", 44 | " \"depth\", \"timestamp\", \"language\"]\n", 45 | "comments_path = \"data/comments_language.csv\"\n", 46 | "def write_to_disk(chunk):\n", 47 | " with open(comments_path, \"a\", encoding=\"utf-8\") as file:\n", 48 | " chunk.to_csv(file, index=False, header=False)\n", 49 | "\n", 50 | "# Lazy data reader into DataFrame\n", 51 | "transcripts_reader = pd.read_csv(\"data/comments.csv\", chunksize=500, names=output_columns[:-1])\n", 52 | "# Reader to pick up where we ended\n", 53 | "completed_reader = pd.read_csv(\"data/comments_language.csv\", chunksize=500, names=output_columns)" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 9, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "# Run this to move the transcripts_reader iterator to where we stopped last time\n", 63 | "for _ in completed_reader:\n", 64 | " next(transcripts_reader)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "for ix, transcripts in enumerate(tqdm(transcripts_reader)):\n", 74 | " languages = []\n", 75 | " for ix, transcript in transcripts.iterrows():\n", 76 | " content = str(transcript[7])\n", 77 | " doc = nlp(content)\n", 78 | " try:\n", 79 | " language = doc._.languages[0]\n", 80 | " except IndexError:\n", 81 | " language = \"?\"\n", 82 | " languages.append(language)\n", 83 | " transcripts[\"language\"] = languages\n", 84 | " write_to_disk(transcripts)" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 10, 90 | "metadata": {}, 91 | "outputs": [ 92 | { 93 | "data": { 94 | "application/vnd.jupyter.widget-view+json": { 95 | "model_id": "66fbb3d0704343cb8d9eb501f7dab277", 96 | "version_major": 2, 97 | "version_minor": 0 98 | }, 99 | "text/plain": [ 100 | "HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))" 101 | ] 102 | }, 103 | "metadata": {}, 104 | "output_type": "display_data" 105 | } 106 | ], 107 | "source": [ 108 | "for ix, transcripts in enumerate(tqdm(transcripts_reader)):\n", 109 | " pass" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [] 118 | } 119 | ], 120 | "metadata": { 121 | "kernelspec": { 122 | "display_name": "Python 3", 123 | "language": "python", 124 | "name": "python3" 125 | }, 126 | "language_info": { 127 | "codemirror_mode": { 128 | "name": "ipython", 129 | "version": 3 130 | }, 131 | "file_extension": ".py", 132 | "mimetype": "text/x-python", 133 | "name": "python", 134 | "nbconvert_exporter": "python", 135 | "pygments_lexer": "ipython3", 136 | "version": "3.6.7" 137 | }, 138 | "varInspector": { 139 | "cols": { 140 | "lenName": 16, 141 | "lenType": 16, 142 | "lenVar": 40 143 | }, 144 | "kernels_config": { 145 | "python": { 146 | "delete_cmd_postfix": "", 147 | "delete_cmd_prefix": "del ", 148 | "library": "var_list.py", 149 | "varRefreshCmd": "print(var_dic_list())" 150 | }, 151 | "r": { 152 | "delete_cmd_postfix": ") ", 153 | "delete_cmd_prefix": "rm(", 154 | "library": "var_list.r", 155 | "varRefreshCmd": "cat(var_dic_list()) " 156 | } 157 | }, 158 | "types_to_exclude": [ 159 | "module", 160 | "function", 161 | "builtin_function_or_method", 162 | "instance", 163 | "_Feature" 164 | ], 165 | "window_display": false 166 | } 167 | }, 168 | "nbformat": 4, 169 | "nbformat_minor": 1 170 | } 171 | 
-------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/arguing_lexicon.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | 4 | from spacy.tokens import Doc 5 | from spacy_arguing_lexicon.arguments import ArgumentTexts 6 | from spacy_arguing_lexicon.exceptions import LexiconMissingError 7 | 8 | 9 | class ArguingLexiconParser(object): 10 | 11 | MACROS_PATH = os.path.join(os.path.dirname(__file__), "lexicon", "{}", "macros") 12 | PATTERNS_PATH = os.path.join(os.path.dirname(__file__), "lexicon", "{}", "patterns") 13 | 14 | MACRO_PATTERN = re.compile("(@[A-Z0-9]+)") 15 | 16 | MACROS = {} 17 | PATTERNS = {} 18 | 19 | def package_check(self, lang): 20 | if not os.path.exists(self.MACROS_PATH.format(lang)): 21 | raise LexiconMissingError( 22 | "Trying to load Arguing Lexicon without macros file for language {}".format(lang) 23 | ) 24 | if not os.path.exists(self.PATTERNS_PATH.format(lang)): 25 | raise LexiconMissingError( 26 | "Trying to load Arguing Lexicon without patterns file for language {}".format(lang) 27 | ) 28 | 29 | def load_macros(self, lang): 30 | for entry in os.listdir(self.MACROS_PATH.format(lang)): 31 | if not entry.endswith(".tff"): 32 | continue 33 | with open(os.path.join(self.MACROS_PATH.format(lang), entry)) as macro_file: 34 | for macro_line in macro_file.readlines(): 35 | # Skip empty lines, class definitions and comments 36 | if not macro_line.strip(): 37 | continue 38 | if macro_line.startswith("#"): 39 | continue 40 | # Add macros 41 | macro_label, macro_definition = self.preprocess_pattern(macro_line).split("=") 42 | macro = [mcr.strip() for mcr in macro_definition.strip().strip("{}").split(",")] 43 | self.MACROS[macro_label] = macro 44 | 45 | def preprocess_pattern(self, pattern): 46 | stripped_pattern = pattern.replace("\\'", "'").strip() 47 | return "{}\\b".format(stripped_pattern) # the \b makes sure that a match ends with a non-word token 48 | 49 | def compile_pattern(self, pattern): 50 | macro_match = self.MACRO_PATTERN.search(pattern) 51 | if macro_match is None: 52 | yield re.compile(self.preprocess_pattern(pattern), flags=re.IGNORECASE) 53 | else: 54 | macro = macro_match.group(0) 55 | macro_replacement = "|".join(self.MACROS[macro]) 56 | replaced_pattern = pattern.replace(macro, macro_replacement) 57 | for preprocessed_pattern in self.compile_pattern(replaced_pattern): 58 | yield preprocessed_pattern 59 | 60 | def load_patterns(self, lang): 61 | for entry in os.listdir(self.PATTERNS_PATH.format(lang)): 62 | if not entry.endswith(".tff"): 63 | continue 64 | with open(os.path.join(self.PATTERNS_PATH.format(lang), entry)) as patterns_file: 65 | pattern_class = None 66 | for pattern_line in patterns_file.readlines(): 67 | # Skip empty lines and comments 68 | if not pattern_line.strip(): 69 | continue 70 | if pattern_line.startswith("#") and pattern_class: 71 | continue 72 | # Read pattern class 73 | elif pattern_line.startswith("#"): 74 | trash, pattern_class = pattern_line.replace('"', "").split("=") 75 | pattern_class = pattern_class.strip() 76 | self.PATTERNS[pattern_class] = [] 77 | continue 78 | # Add patterns 79 | for preprocessed_patterns in self.compile_pattern(pattern_line): 80 | self.PATTERNS[pattern_class].append(preprocessed_patterns) 81 | 82 | def get_arguing_matches(self, doc): 83 | for arguing_label, arguing_patterns in self.PATTERNS.items(): 84 | for arguing_pattern in arguing_patterns: 85 | match = 
arguing_pattern.search(doc.text) 86 | if match is not None: 87 | yield arguing_label, match 88 | 89 | def get_lexicon_vocabulary(self): 90 | vocabulary = set() 91 | for label, patterns in self.PATTERNS.items(): 92 | for compiled in patterns: 93 | words = "".join([char if char.isalnum() or char == "'" else " " for char in compiled.pattern]) 94 | for word in words.split(" "): 95 | if len(word) <= 1 and not word == "I": 96 | continue 97 | vocabulary.add(word) 98 | return vocabulary 99 | 100 | def __init__(self, lang="en"): 101 | super().__init__() 102 | self.package_check(lang) 103 | self.load_macros(lang) 104 | self.load_patterns(lang) 105 | if not Doc.has_extension('arguments'): 106 | Doc.set_extension('arguments', getter=ArgumentTexts(self), force=True) 107 | else: 108 | default, method, getter, setter = Doc.get_extension('arguments') 109 | assert isinstance(getter, ArgumentTexts), \ 110 | "Expected 'arguments' extension to be of type ArgumentTexts " \ 111 | "but found {}. Namespace clash?".format(type(Doc.get_extension('arguments'))) 112 | 113 | def __call__(self, doc): 114 | # All parsing is lazy 115 | return doc 116 | -------------------------------------------------------------------------------- /DataCollection/tests/youtubecollector/resources/nullable_fields_channel_response.json: -------------------------------------------------------------------------------- 1 | { 2 | "kind": "youtube#channelListResponse", 3 | "etag": "\"XpPGQXPnxQJhLgs6enD_n8JR4Qk/OeMJN9Cnx6jGn9D5tetafViyG0U\"", 4 | "pageInfo": { 5 | "totalResults": 1, 6 | "resultsPerPage": 1 7 | }, 8 | "items": [ 9 | { 10 | "kind": "youtube#channel", 11 | "etag": "\"XpPGQXPnxQJhLgs6enD_n8JR4Qk/3t1aM54tFd_H-BQTFGM9M4ZNq1g\"", 12 | "id": "Some_ID", 13 | "snippet": { 14 | "title": "The test channel", 15 | "description": "The Official YouTube Channel for testing", 16 | "publishedAt": "2015-04-03T17:02:57.000Z", 17 | "thumbnails": { 18 | "default": { 19 | "url": "https://yt3.ggpht.com/a-/AAuE7mCD6Gw54_JFbHqq-6fIZynVB3B7grol7PQtYA=s88-mo-c-c0xffffffff-rj-k-no", 20 | "width": 88, 21 | "height": 88 22 | }, 23 | "medium": { 24 | "url": "https://yt3.ggpht.com/a-/AAuE7mCD6Gw54_JFbHqq-6fIZynVB3B7grol7PQtYA=s240-mo-c-c0xffffffff-rj-k-no", 25 | "width": 240, 26 | "height": 240 27 | }, 28 | "high": { 29 | "url": "https://yt3.ggpht.com/a-/AAuE7mCD6Gw54_JFbHqq-6fIZynVB3B7grol7PQtYA=s800-mo-c-c0xffffffff-rj-k-no", 30 | "width": 800, 31 | "height": 800 32 | } 33 | }, 34 | "localized": { 35 | "title": "Rand Paul", 36 | "description": "The Official YouTube Channel of Rand Paul" 37 | } 38 | }, 39 | "contentDetails": { 40 | "relatedPlaylists": { 41 | "watchHistory": "HL", 42 | "watchLater": "WL" 43 | } 44 | }, 45 | "statistics": { 46 | "viewCount": "2640735", 47 | "commentCount": "0", 48 | "subscriberCount": "9779", 49 | "hiddenSubscriberCount": false, 50 | "videoCount": "258" 51 | }, 52 | "topicDetails": { 53 | }, 54 | "brandingSettings": { 55 | "channel": { 56 | "title": "Rand Paul", 57 | "description": "The Official YouTube Channel of Rand Paul", 58 | "defaultTab": "Featured", 59 | "trackingAnalyticsAccountId": "UA-57201184-2", 60 | "showRelatedChannels": true, 61 | "showBrowseView": true, 62 | "unsubscribedTrailer": "KJnn2oi8e7A", 63 | "profileColor": "#000000", 64 | "country": "US" 65 | }, 66 | "image": { 67 | "bannerImageUrl": "https://yt3.ggpht.com/hywTtjeycRX2taWlatnyi1KqC-sVVmNS18aKETNxZQij9aIGeA05djEeihEH_J3hia65H7Vo=w1060-fcrop64=1,00005a57ffffa5a8-nd-c0xffffffff-rj-k-no", 68 | "bannerMobileImageUrl": 
"https://yt3.ggpht.com/hywTtjeycRX2taWlatnyi1KqC-sVVmNS18aKETNxZQij9aIGeA05djEeihEH_J3hia65H7Vo=w640-fcrop64=1,32b75a57cd48a5a8-nd-c0xffffffff-rj-k-no", 69 | "bannerTabletLowImageUrl": "https://yt3.ggpht.com/hywTtjeycRX2taWlatnyi1KqC-sVVmNS18aKETNxZQij9aIGeA05djEeihEH_J3hia65H7Vo=w1138-fcrop64=1,00005a57ffffa5a8-nd-c0xffffffff-rj-k-no", 70 | "bannerTabletImageUrl": "https://yt3.ggpht.com/hywTtjeycRX2taWlatnyi1KqC-sVVmNS18aKETNxZQij9aIGeA05djEeihEH_J3hia65H7Vo=w1707-fcrop64=1,00005a57ffffa5a8-nd-c0xffffffff-rj-k-no", 71 | "bannerTabletHdImageUrl": "https://yt3.ggpht.com/hywTtjeycRX2taWlatnyi1KqC-sVVmNS18aKETNxZQij9aIGeA05djEeihEH_J3hia65H7Vo=w2276-fcrop64=1,00005a57ffffa5a8-nd-c0xffffffff-rj-k-no", 72 | "bannerTabletExtraHdImageUrl": "https://yt3.ggpht.com/hywTtjeycRX2taWlatnyi1KqC-sVVmNS18aKETNxZQij9aIGeA05djEeihEH_J3hia65H7Vo=w2560-fcrop64=1,00005a57ffffa5a8-nd-c0xffffffff-rj-k-no", 73 | "bannerMobileLowImageUrl": "https://yt3.ggpht.com/hywTtjeycRX2taWlatnyi1KqC-sVVmNS18aKETNxZQij9aIGeA05djEeihEH_J3hia65H7Vo=w320-fcrop64=1,32b75a57cd48a5a8-nd-c0xffffffff-rj-k-no", 74 | "bannerMobileMediumHdImageUrl": "https://yt3.ggpht.com/hywTtjeycRX2taWlatnyi1KqC-sVVmNS18aKETNxZQij9aIGeA05djEeihEH_J3hia65H7Vo=w960-fcrop64=1,32b75a57cd48a5a8-nd-c0xffffffff-rj-k-no", 75 | "bannerMobileHdImageUrl": "https://yt3.ggpht.com/hywTtjeycRX2taWlatnyi1KqC-sVVmNS18aKETNxZQij9aIGeA05djEeihEH_J3hia65H7Vo=w1280-fcrop64=1,32b75a57cd48a5a8-nd-c0xffffffff-rj-k-no", 76 | "bannerMobileExtraHdImageUrl": "https://yt3.ggpht.com/hywTtjeycRX2taWlatnyi1KqC-sVVmNS18aKETNxZQij9aIGeA05djEeihEH_J3hia65H7Vo=w1440-fcrop64=1,32b75a57cd48a5a8-nd-c0xffffffff-rj-k-no", 77 | "bannerTvImageUrl": "https://yt3.ggpht.com/hywTtjeycRX2taWlatnyi1KqC-sVVmNS18aKETNxZQij9aIGeA05djEeihEH_J3hia65H7Vo=w2120-fcrop64=1,00000000ffffffff-nd-c0xffffffff-rj-k-no", 78 | "bannerTvLowImageUrl": "https://yt3.ggpht.com/hywTtjeycRX2taWlatnyi1KqC-sVVmNS18aKETNxZQij9aIGeA05djEeihEH_J3hia65H7Vo=w854-fcrop64=1,00000000ffffffff-nd-c0xffffffff-rj-k-no", 79 | "bannerTvMediumImageUrl": "https://yt3.ggpht.com/hywTtjeycRX2taWlatnyi1KqC-sVVmNS18aKETNxZQij9aIGeA05djEeihEH_J3hia65H7Vo=w1280-fcrop64=1,00000000ffffffff-nd-c0xffffffff-rj-k-no", 80 | "bannerTvHighImageUrl": "https://yt3.ggpht.com/hywTtjeycRX2taWlatnyi1KqC-sVVmNS18aKETNxZQij9aIGeA05djEeihEH_J3hia65H7Vo=w1920-fcrop64=1,00000000ffffffff-nd-c0xffffffff-rj-k-no" 81 | }, 82 | "hints": [ 83 | { 84 | "property": "channel.banner.mobile.medium.image.url", 85 | "value": "https://yt3.ggpht.com/hywTtjeycRX2taWlatnyi1KqC-sVVmNS18aKETNxZQij9aIGeA05djEeihEH_J3hia65H7Vo=w640-fcrop64=1,32b75a57cd48a5a8-nd-c0xffffffff-rj-k-no" 86 | }, 87 | { 88 | "property": "channel.featured_tab.template.string", 89 | "value": "Everything" 90 | }, 91 | { 92 | "property": "channel.modules.show_comments.bool", 93 | "value": "True" 94 | } 95 | ] 96 | } 97 | } 98 | ] 99 | } -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/arguing-lexicon-lda.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%load_ext autoreload\n", 10 | "%autoreload 2\n", 11 | "\n", 12 | "import sys\n", 13 | "sys.path.append(\"../\")\n" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "import pickle\n", 23 | "\n", 24 | "import pandas as pd\n", 25 | "from 
sklearn.feature_extraction.text import CountVectorizer\n", 26 | "from sklearn.decomposition import LatentDirichletAllocation" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "# Lazy data reader into DataFrame\n", 36 | "def read_argument_captions():\n", 37 | " transcripts_reader = pd.read_csv(\"data/captions_arguments.csv\", chunksize=10)\n", 38 | " for batch in transcripts_reader:\n", 39 | " for ix, caption in batch.iterrows():\n", 40 | " text = \"\"\n", 41 | " for fragment, argument_label in zip(str(caption[\"content\"]).split(\"\\n\"), str(caption[\"argument_labels\"]).split(\"\\n\")):\n", 42 | " if argument_label:\n", 43 | " text += fragment + \" \"\n", 44 | " yield text\n" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "# Training a tfidf vectorizer\n", 54 | "vectorizer = CountVectorizer(stop_words=\"english\")\n", 55 | "matrix = vectorizer.fit_transform(read_argument_captions())\n", 56 | "feature_names = vectorizer.get_feature_names()" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "with open(\"models/vectorizer.pkl\", \"rb\") as count_file:\n", 66 | " vectorizer = pickle.load(count_file)\n", 67 | "with open(\"models/vectorizer_matrix.pkl\", \"rb\") as matrix_file:\n", 68 | " matrix = pickle.load(matrix_file)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "# Saving progress\n", 78 | "with open(\"models/vectorizer.pkl\", \"wb\") as count_file:\n", 79 | " pickle.dump(vectorizer, count_file)\n", 80 | "with open(\"models/vectorizer_matrix.pkl\", \"wb\") as matrix_file:\n", 81 | " pickle.dump(matrix, matrix_file)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "# Training the LDA model\n", 91 | "lda_model = LatentDirichletAllocation(n_topics=50, max_iter=500, verbose=3, n_jobs=-1, learning_method=\"online\")\n", 92 | "lda_model.fit(matrix)" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "# Saving progress\n", 102 | "with open(\"models/lda.50.pkl\", \"wb\") as lda_file:\n", 103 | " pickle.dump(lda_model, lda_file)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "def print_top_words(model, feature_names, n_top_words):\n", 113 | " for topic_idx, topic in enumerate(model.components_):\n", 114 | " print(\"Topic #%d:\" % topic_idx)\n", 115 | " print(\" | \".join([feature_names[i]\n", 116 | " for i in topic.argsort()[:-n_top_words - 1:-1]]))\n", 117 | " print()\n", 118 | " print()\n", 119 | " print()" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "print_top_words(lda_model, feature_names, 50)" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [] 137 | } 138 | ], 139 | "metadata": { 140 | "kernelspec": { 141 | "display_name": "Python [conda env:ml]", 142 | "language": "python", 143 | "name": "conda-env-ml-py" 144 | }, 145 | "language_info": { 146 | 
"codemirror_mode": { 147 | "name": "ipython", 148 | "version": 3 149 | }, 150 | "file_extension": ".py", 151 | "mimetype": "text/x-python", 152 | "name": "python", 153 | "nbconvert_exporter": "python", 154 | "pygments_lexer": "ipython3", 155 | "version": "3.6.6" 156 | }, 157 | "varInspector": { 158 | "cols": { 159 | "lenName": 16, 160 | "lenType": 16, 161 | "lenVar": 40 162 | }, 163 | "kernels_config": { 164 | "python": { 165 | "delete_cmd_postfix": "", 166 | "delete_cmd_prefix": "del ", 167 | "library": "var_list.py", 168 | "varRefreshCmd": "print(var_dic_list())" 169 | }, 170 | "r": { 171 | "delete_cmd_postfix": ") ", 172 | "delete_cmd_prefix": "rm(", 173 | "library": "var_list.r", 174 | "varRefreshCmd": "cat(var_dic_list()) " 175 | } 176 | }, 177 | "types_to_exclude": [ 178 | "module", 179 | "function", 180 | "builtin_function_or_method", 181 | "instance", 182 | "_Feature" 183 | ], 184 | "window_display": false 185 | } 186 | }, 187 | "nbformat": 4, 188 | "nbformat_minor": 2 189 | } 190 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/arguing-lexicon-filter.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pickle\n", 10 | "from collections import namedtuple\n", 11 | "\n", 12 | "import pandas as pd\n", 13 | "from arguing_lexicon import ArguingLexiconParser\n", 14 | "from tqdm import tqdm_notebook as tqdm\n", 15 | "\n", 16 | "input_data_csv = \"data/captions_metadata.csv\"" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 2, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "# Lazy data reader into DataFrame\n", 26 | "transcripts_reader = pd.read_csv(\"data/captions_metadata.csv\", chunksize=10)\n", 27 | "\n", 28 | "# Dummy class for convenience and speed\n", 29 | "Doc = namedtuple(\"Doc\", [\"text\"])\n", 30 | "\n", 31 | "# Util function to write lazy chunks back to disk\n", 32 | "output_columns = [\"id\", \"content\", \"date\", \"title\", \"unknown\", \"channel\", \"fragments\", \"argument_fragments\",\n", 33 | " \"argument_labels\", \"argument_content\"]\n", 34 | "arguments_path = \"data/captions_arguments.csv\"\n", 35 | "def write_to_disk(chunk):\n", 36 | " with open(arguments_path, \"a\", encoding=\"utf-8\") as arguments_file:\n", 37 | " chunk.to_csv(arguments_file, index=False)\n", 38 | "\n", 39 | "# Arguing lexixon parser\n", 40 | "parser = ArguingLexiconParser()\n" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [ 48 | { 49 | "data": { 50 | "application/vnd.jupyter.widget-view+json": { 51 | "model_id": "bbd877813eba4fe1b5db68f79ebdb966", 52 | "version_major": 2, 53 | "version_minor": 0 54 | }, 55 | "text/plain": [ 56 | "HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))" 57 | ] 58 | }, 59 | "metadata": {}, 60 | "output_type": "display_data" 61 | }, 62 | { 63 | "name": "stdout", 64 | "output_type": "stream", 65 | "text": [ 66 | "\n" 67 | ] 68 | } 69 | ], 70 | "source": [ 71 | "for ix, transcripts in enumerate(tqdm(transcripts_reader)):\n", 72 | " if ix <= 1183:\n", 73 | " continue\n", 74 | " arguments_frame = pd.DataFrame(columns=output_columns)\n", 75 | " for ix, transcript in transcripts.iterrows():\n", 76 | " content = str(transcript[\"content\"]).split(\"\\n\")\n", 77 | " labels = []\n", 78 | " arguments = 
[]\n", 79 | " argument_fragments = 0\n", 80 | " for con in content:\n", 81 | " doc = Doc(con)\n", 82 | " matches = list(parser.get_arguing_matches(doc))\n", 83 | " if len(matches):\n", 84 | " argument_fragments += 1\n", 85 | " lbls = []\n", 86 | " args = []\n", 87 | " for label, match in matches:\n", 88 | " lbls.append(label)\n", 89 | " args.append(match.group(0))\n", 90 | " labels.append(\"\\t\".join(lbls))\n", 91 | " arguments.append(\"\\t\".join(args))\n", 92 | " argument_serie = pd.Series(data={\n", 93 | " \"id\": transcript[\"id\"],\n", 94 | " \"content\": transcript[\"content\"],\n", 95 | " \"date\": transcript[\"date\"],\n", 96 | " \"title\": transcript[\"title\"],\n", 97 | " \"unknown\": transcript[\"unknown\"],\n", 98 | " \"channel\": transcript[\"channel\"],\n", 99 | " \"fragments\": len(content),\n", 100 | " \"argument_fragments\": argument_fragments,\n", 101 | " \"argument_labels\": \"\\n\".join(labels),\n", 102 | " \"argument_content\": \"\\n\".join(arguments)\n", 103 | " })\n", 104 | "\n", 105 | " arguments_frame = arguments_frame.append(argument_serie, ignore_index=True)\n", 106 | " write_to_disk(arguments_frame)" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [] 115 | } 116 | ], 117 | "metadata": { 118 | "kernelspec": { 119 | "display_name": "Python [conda env:ml]", 120 | "language": "python", 121 | "name": "conda-env-ml-py" 122 | }, 123 | "language_info": { 124 | "codemirror_mode": { 125 | "name": "ipython", 126 | "version": 3 127 | }, 128 | "file_extension": ".py", 129 | "mimetype": "text/x-python", 130 | "name": "python", 131 | "nbconvert_exporter": "python", 132 | "pygments_lexer": "ipython3", 133 | "version": "3.6.6" 134 | }, 135 | "varInspector": { 136 | "cols": { 137 | "lenName": 16, 138 | "lenType": 16, 139 | "lenVar": 40 140 | }, 141 | "kernels_config": { 142 | "python": { 143 | "delete_cmd_postfix": "", 144 | "delete_cmd_prefix": "del ", 145 | "library": "var_list.py", 146 | "varRefreshCmd": "print(var_dic_list())" 147 | }, 148 | "r": { 149 | "delete_cmd_postfix": ") ", 150 | "delete_cmd_prefix": "rm(", 151 | "library": "var_list.r", 152 | "varRefreshCmd": "cat(var_dic_list()) " 153 | } 154 | }, 155 | "types_to_exclude": [ 156 | "module", 157 | "function", 158 | "builtin_function_or_method", 159 | "instance", 160 | "_Feature" 161 | ], 162 | "window_display": false 163 | } 164 | }, 165 | "nbformat": 4, 166 | "nbformat_minor": 1 167 | } 168 | -------------------------------------------------------------------------------- /DataCollection/tests/youtubecollector/resources/full_channel_response.json: -------------------------------------------------------------------------------- 1 | { 2 | "kind": "youtube#channelListResponse", 3 | "etag": "\"XpPGQXPnxQJhLgs6enD_n8JR4Qk/OeMJN9Cnx6jGn9D5tetafViyG0U\"", 4 | "pageInfo": { 5 | "totalResults": 1, 6 | "resultsPerPage": 1 7 | }, 8 | "items": [ 9 | { 10 | "kind": "youtube#channel", 11 | "etag": "\"XpPGQXPnxQJhLgs6enD_n8JR4Qk/3t1aM54tFd_H-BQTFGM9M4ZNq1g\"", 12 | "id": "Some_ID", 13 | "snippet": { 14 | "title": "The test channel", 15 | "description": "The Official YouTube Channel for testing", 16 | "publishedAt": "2015-04-03T17:02:57.000Z", 17 | "defaultLanguage": "en", 18 | "thumbnails": { 19 | "default": { 20 | "url": "https://yt3.ggpht.com/a-/AAuE7mCD6Gw54_JFbHqq-6fIZynVB3B7grol7PQtYA=s88-mo-c-c0xffffffff-rj-k-no", 21 | "width": 88, 22 | "height": 88 23 | }, 24 | "medium": { 25 | "url": 
"https://yt3.ggpht.com/a-/AAuE7mCD6Gw54_JFbHqq-6fIZynVB3B7grol7PQtYA=s240-mo-c-c0xffffffff-rj-k-no", 26 | "width": 240, 27 | "height": 240 28 | }, 29 | "high": { 30 | "url": "https://yt3.ggpht.com/a-/AAuE7mCD6Gw54_JFbHqq-6fIZynVB3B7grol7PQtYA=s800-mo-c-c0xffffffff-rj-k-no", 31 | "width": 800, 32 | "height": 800 33 | } 34 | }, 35 | "localized": { 36 | "title": "Rand Paul", 37 | "description": "The Official YouTube Channel of Rand Paul" 38 | }, 39 | "country": "US" 40 | }, 41 | "contentDetails": { 42 | "relatedPlaylists": { 43 | "uploads": "UU_8WUrPbi8clO6sWt_FDvuA", 44 | "watchHistory": "HL", 45 | "watchLater": "WL" 46 | } 47 | }, 48 | "statistics": { 49 | "viewCount": "2640735", 50 | "commentCount": "0", 51 | "subscriberCount": "9779", 52 | "hiddenSubscriberCount": false, 53 | "videoCount": "258" 54 | }, 55 | "topicDetails": { 56 | "topicIds": [ 57 | "topic1", 58 | "topic2", 59 | "topic3" 60 | ], 61 | "topicCategories": [ 62 | "https://en.wikipedia.org/wiki/Society", 63 | "https://en.wikipedia.org/wiki/Politics" 64 | ] 65 | }, 66 | "brandingSettings": { 67 | "channel": { 68 | "title": "Rand Paul", 69 | "description": "The Official YouTube Channel of Rand Paul", 70 | "keywords": "\"Testing is fun\", \"More Testing\"", 71 | "defaultTab": "Featured", 72 | "trackingAnalyticsAccountId": "UA-57201184-2", 73 | "showRelatedChannels": true, 74 | "showBrowseView": true, 75 | "unsubscribedTrailer": "KJnn2oi8e7A", 76 | "profileColor": "#000000", 77 | "country": "US" 78 | }, 79 | "image": { 80 | "bannerImageUrl": "https://yt3.ggpht.com/hywTtjeycRX2taWlatnyi1KqC-sVVmNS18aKETNxZQij9aIGeA05djEeihEH_J3hia65H7Vo=w1060-fcrop64=1,00005a57ffffa5a8-nd-c0xffffffff-rj-k-no", 81 | "bannerMobileImageUrl": "https://yt3.ggpht.com/hywTtjeycRX2taWlatnyi1KqC-sVVmNS18aKETNxZQij9aIGeA05djEeihEH_J3hia65H7Vo=w640-fcrop64=1,32b75a57cd48a5a8-nd-c0xffffffff-rj-k-no", 82 | "bannerTabletLowImageUrl": "https://yt3.ggpht.com/hywTtjeycRX2taWlatnyi1KqC-sVVmNS18aKETNxZQij9aIGeA05djEeihEH_J3hia65H7Vo=w1138-fcrop64=1,00005a57ffffa5a8-nd-c0xffffffff-rj-k-no", 83 | "bannerTabletImageUrl": "https://yt3.ggpht.com/hywTtjeycRX2taWlatnyi1KqC-sVVmNS18aKETNxZQij9aIGeA05djEeihEH_J3hia65H7Vo=w1707-fcrop64=1,00005a57ffffa5a8-nd-c0xffffffff-rj-k-no", 84 | "bannerTabletHdImageUrl": "https://yt3.ggpht.com/hywTtjeycRX2taWlatnyi1KqC-sVVmNS18aKETNxZQij9aIGeA05djEeihEH_J3hia65H7Vo=w2276-fcrop64=1,00005a57ffffa5a8-nd-c0xffffffff-rj-k-no", 85 | "bannerTabletExtraHdImageUrl": "https://yt3.ggpht.com/hywTtjeycRX2taWlatnyi1KqC-sVVmNS18aKETNxZQij9aIGeA05djEeihEH_J3hia65H7Vo=w2560-fcrop64=1,00005a57ffffa5a8-nd-c0xffffffff-rj-k-no", 86 | "bannerMobileLowImageUrl": "https://yt3.ggpht.com/hywTtjeycRX2taWlatnyi1KqC-sVVmNS18aKETNxZQij9aIGeA05djEeihEH_J3hia65H7Vo=w320-fcrop64=1,32b75a57cd48a5a8-nd-c0xffffffff-rj-k-no", 87 | "bannerMobileMediumHdImageUrl": "https://yt3.ggpht.com/hywTtjeycRX2taWlatnyi1KqC-sVVmNS18aKETNxZQij9aIGeA05djEeihEH_J3hia65H7Vo=w960-fcrop64=1,32b75a57cd48a5a8-nd-c0xffffffff-rj-k-no", 88 | "bannerMobileHdImageUrl": "https://yt3.ggpht.com/hywTtjeycRX2taWlatnyi1KqC-sVVmNS18aKETNxZQij9aIGeA05djEeihEH_J3hia65H7Vo=w1280-fcrop64=1,32b75a57cd48a5a8-nd-c0xffffffff-rj-k-no", 89 | "bannerMobileExtraHdImageUrl": "https://yt3.ggpht.com/hywTtjeycRX2taWlatnyi1KqC-sVVmNS18aKETNxZQij9aIGeA05djEeihEH_J3hia65H7Vo=w1440-fcrop64=1,32b75a57cd48a5a8-nd-c0xffffffff-rj-k-no", 90 | "bannerTvImageUrl": "https://yt3.ggpht.com/hywTtjeycRX2taWlatnyi1KqC-sVVmNS18aKETNxZQij9aIGeA05djEeihEH_J3hia65H7Vo=w2120-fcrop64=1,00000000ffffffff-nd-c0xffffffff-rj-k-no", 91 | 
"bannerTvLowImageUrl": "https://yt3.ggpht.com/hywTtjeycRX2taWlatnyi1KqC-sVVmNS18aKETNxZQij9aIGeA05djEeihEH_J3hia65H7Vo=w854-fcrop64=1,00000000ffffffff-nd-c0xffffffff-rj-k-no", 92 | "bannerTvMediumImageUrl": "https://yt3.ggpht.com/hywTtjeycRX2taWlatnyi1KqC-sVVmNS18aKETNxZQij9aIGeA05djEeihEH_J3hia65H7Vo=w1280-fcrop64=1,00000000ffffffff-nd-c0xffffffff-rj-k-no", 93 | "bannerTvHighImageUrl": "https://yt3.ggpht.com/hywTtjeycRX2taWlatnyi1KqC-sVVmNS18aKETNxZQij9aIGeA05djEeihEH_J3hia65H7Vo=w1920-fcrop64=1,00000000ffffffff-nd-c0xffffffff-rj-k-no" 94 | }, 95 | "hints": [ 96 | { 97 | "property": "channel.banner.mobile.medium.image.url", 98 | "value": "https://yt3.ggpht.com/hywTtjeycRX2taWlatnyi1KqC-sVVmNS18aKETNxZQij9aIGeA05djEeihEH_J3hia65H7Vo=w640-fcrop64=1,32b75a57cd48a5a8-nd-c0xffffffff-rj-k-no" 99 | }, 100 | { 101 | "property": "channel.featured_tab.template.string", 102 | "value": "Everything" 103 | }, 104 | { 105 | "property": "channel.modules.show_comments.bool", 106 | "value": "True" 107 | } 108 | ] 109 | } 110 | } 111 | ] 112 | } -------------------------------------------------------------------------------- /TopicModelling/TopicModelWrapper/StreamingParser.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import csv 3 | import json 4 | import sys 5 | 6 | 7 | class StreamingParser(object): 8 | """ 9 | Wrapper class for different approaches to loading texts. 10 | 11 | Included approaches: 12 | - Directory iteration 13 | - JSON iteration (evaluate per line, search for certain keys) 14 | """ 15 | def __init__(self, file_path, iter_methods_index, metadata=False): 16 | self.path = file_path 17 | iter_methods = [self.directory_iterator, self.json_iterator, self.frog_iterator, self.csv_iterator] 18 | self.iter_method = iter_methods[iter_methods_index] 19 | self.metadata = metadata 20 | self.empty_counter = 0 21 | 22 | def __iter__(self): 23 | if self.metadata: 24 | for text, metadata in self.iter_method(): 25 | yield text, metadata 26 | else: 27 | for text in self.iter_method(): 28 | yield text 29 | 30 | def directory_iterator(self): 31 | """ 32 | Iterable object (generator) for aggregating plain text files in a given directory. 33 | """ 34 | for filename in os.listdir(self.path): 35 | with open(os.path.join(self.path, filename), 'r') as file: 36 | # date = file.readline() 37 | yield file.read() 38 | 39 | def frog_iterator(self): 40 | """ 41 | Parser method for parsing frog tar.gz archives. 42 | """ 43 | print("Loading input from Frog file") 44 | 45 | with tarfile.open(self.path, 'r:gz') as tf: 46 | for i, entry in enumerate(tf): 47 | print(i) 48 | if not entry.isdir(): 49 | _id = os.path.basename(entry.name) 50 | 51 | file_path = '{}{}{}'.format(self.path, '/extracted_data/docs/', _id) 52 | with open(file_path, 'r') as f: 53 | _id = f.readline() 54 | _name = f.readline() 55 | _collection = f.readline() 56 | _type = f.readline() 57 | _classification = f.readline() 58 | _date = f.readline() 59 | 60 | entry_string = [] 61 | for line in tf.extractfile(entry): 62 | line = line.decode('utf-8').split('\t') 63 | if line[0] is not '\n': 64 | if line[4][0] == 'N': 65 | entry_string.append(line[2]) 66 | yield ' '.join(entry_string), (_id, _name, _collection, _type, _classification, _date) 67 | 68 | def json_iterator(self): 69 | """ 70 | Iterable object (generator) for aggregations of ORI (Elasticsearch) data. 71 | The aggregations are in JSON format, with each line containing one entry. 
72 | The StreamingJSON object iterates over all lines contained in the file that was 73 | passed as a parameter. 74 | Iter yields only the raw text from the object, in this case 75 | the description field per source. If more than one source is found, 76 | Iter concatenates the results to one string. This string is then returned, 77 | and control is yielded to the caller. If no description is found, the 78 | KeyError exception is caught and a message is printed to the console. 79 | """ 80 | print("Loading input as JSON formatted file") 81 | 82 | with open(self.path) as json_file: 83 | for index, line in enumerate(json_file): 84 | print("extracting line {}".format(index)) 85 | json_data = json.loads(line) 86 | 87 | # Extract all descriptions of the sources and append them to the main data list 88 | doc_data = '' 89 | 90 | try: 91 | _id = json_data['_id'] 92 | _name = json_data["_source"].get('name', "").replace('\n', '').replace('\r', '').replace(',', '') 93 | _collection = json_data["_source"].get('meta', {}).get('collection', "No Collection available") 94 | _type = json_data['_type'] 95 | _classification = json_data["_source"].get('classification', "No classification in data") 96 | _date = json_data["_source"].get('end_date', "No end_date in data") 97 | 98 | for source in json_data['_source']['sources']: 99 | # Add description of data as input 100 | doc_data = ' '.join([doc_data, source['description']]) 101 | 102 | if self.metadata: 103 | yield doc_data, (_id, _name, _collection, _type, _classification, _date) 104 | else: 105 | yield doc_data 106 | except KeyError: 107 | print("No sources key detected!") 108 | self.empty_counter += 1 109 | 110 | def csv_iterator(self): 111 | """ 112 | Parser of CorrespondentEx cleaned csv files. 113 | """ 114 | print("Loading input as JSON formatted file") 115 | with open(self.path) as csv_file: 116 | csv_data = csv.reader(csv_file) 117 | 118 | # Skip the column names 119 | next(csv_data) 120 | # Increase the csv max field size 121 | csv.field_size_limit(sys.maxsize) 122 | 123 | for index, row in enumerate(csv_data): 124 | # if not index % 1000: 125 | print("extracting line {}".format(index)) 126 | 127 | # For some reason the index got duplicated, hence counting from 1 (blasphemy!) 128 | _id = row[1] 129 | _text = ast.literal_eval(row[2]) 130 | 131 | terms = [] 132 | for caption in _text: 133 | for term in caption.split(): 134 | terms.append(term) 135 | 136 | if self.metadata: 137 | yield terms, (_id,) 138 | else: 139 | yield ''.join(terms) 140 | -------------------------------------------------------------------------------- /TopicModelling/Top TfIdf/Right - tfidf top words.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Top TfIdf words for channels\n", 8 | "\n", 9 | "Methodology: similar to https://pudding.cool/2017/09/hip-hop-words/\n", 10 | "\n", 11 | "Merge all videos for each channel for every year and see what makes that channel distinctive and if it changes over time.\n", 12 | "\n", 13 | "Method:\n", 14 | "1. Import cleaned captions\n", 15 | "2. 
Group them by channel and year" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 1, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import pandas as pd\n", 25 | "import datetime as dt\n", 26 | "from spacy.lang.en import English\n", 27 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 28 | "import numpy as np\n", 29 | "import networkx as nx" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "\n", 39 | "\n", 40 | "captions = '/home/dim/Downloads/captions_right.csv'\n", 41 | "videos = '/home/dim/Documents/projecten/extremisme/youtube/data/temp/bubble/right/videos_right.csv'\n", 42 | "\n", 43 | "columns = ['video_id', 'text']\n", 44 | "\n", 45 | "df1 = pd.read_csv(captions, names=columns, low_memory=False)\n", 46 | "df2 = pd.read_csv(videos, low_memory=False)\n", 47 | "df = pd.merge(df1, df2, on='video_id', how='left')\n", 48 | "\n", 49 | "del df1, df2" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 3, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "\n", 59 | "df['video_published'] = pd.to_datetime(df['video_published'])\n", 60 | "df['year'] = df['video_published'].dt.year\n", 61 | "\n", 62 | "df = df.groupby(['video_channel_title', 'year'])['text'].apply(lambda x: x.sum())" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 9, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "df = df.reset_index()" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "###Optional: Lemmatize\n", 81 | "\n", 82 | "tokenizer = English().Defaults.create_tokenizer()\n", 83 | "\n", 84 | "df.text = df.text.apply(lambda x: ' '.join([tok.lemma_ for tok in tokenizer(x)]))" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "## Tfidf values\n", 92 | "\n", 93 | "### Parameter choices\n", 94 | "Followed the pudding hiphop blog. Terms have to appear in at least one in 50 channels (lower than with the pudding, who use one in 10, because we have a very diverse and large set of channels with topics probably changing a lot over time). Used sublinear term frequency (not 10, but 1 + log(9)), because otherwise stop words appear." 
95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "vec = TfidfVectorizer(min_df=.02,sublinear_tf = True)\n", 104 | "res = vec.fit_transform(merged_df.text)\n", 105 | "vocab = {value:key for key,value in vec.vocabulary_.items()}" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": { 112 | "scrolled": true 113 | }, 114 | "outputs": [], 115 | "source": [ 116 | "results = []\n", 117 | "for index in merged_df.index:\n", 118 | " top10words = [vocab[j] for i,j in sorted(zip(res[index].data,res[index].indices),reverse=True)[:10]]\n", 119 | " if len(top10words) < 10:\n", 120 | " continue\n", 121 | " meta = {'year':merged_df.year[index],'channel':merged_df.channel[index],'channel_id':merged_df.channel_id[index]}\n", 122 | " words = ({'word{no}'.format(no=i+1):top10words[i] for i in range(10)})\n", 123 | " results.append({k: v for d in [meta, words] for k, v in d.items()})\n", 124 | "top10words_df = pd.DataFrame(results)\n", 125 | "top10words_df = top10words_df[['year','channel','channel_id']+['word'+str(no) for no in range(1,11)]]\n", 126 | "top10words_df.to_csv('C:/hackathon/top10tfidf_per_channel.csv',index=False)" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": {}, 132 | "source": [ 133 | "## Tfidf top 100 words (for similarity)\n", 134 | "\n", 135 | "Parameter choices same as above, but with json output to preserve list structure" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "results = []\n", 145 | "for index in merged_df.index:\n", 146 | " top100words = [vocab[j] for i,j in sorted(zip(res[index].data,res[index].indices),reverse=True)[:100]]\n", 147 | " if len(top100words) < 100:\n", 148 | " continue\n", 149 | " results.append({'year':merged_df.year[index],\n", 150 | " 'channel':merged_df.channel[index],\n", 151 | " 'channel_id':merged_df.channel_id[index], \n", 152 | " 'words':top100words})\n", 153 | "top100words_df = pd.DataFrame(results)\n", 154 | "top100words_df = top100words_df[['year','channel','channel_id','words']]\n", 155 | "top100words_df.to_json('C:/hackathon/top100tfidf.json')" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": {}, 161 | "source": [ 162 | "## 'Overlap' matrix tfidf" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "\n", 172 | "channel_id = {i:{'year':top100words_df.year[i],\n", 173 | " 'channel':top100words_df.channel[i],\n", 174 | " 'channel_id':top100words_df.channel_id[i]} for i in top100words_df.index}\n", 175 | "top100words_df.words = top100words_df.words.apply(set)\n", 176 | "distance_matrix = np.ones((len(channel_id),len(channel_id)))\n", 177 | "\n", 178 | "for i in range(len(channel_id)):\n", 179 | " for j in range(len(channel_id)):\n", 180 | " if i == j:\n", 181 | " continue\n", 182 | " elif i > j:\n", 183 | " distance_matrix[i,j] = distance_matrix[j,i]\n", 184 | " else:\n", 185 | " distance_matrix[i,j] = len(top100words_df.words[i] & top100words_df.words[j])/100\n", 186 | "\n", 187 | "distance_matrix[distance_matrix < .05] = 0" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [ 196 | "\n", 197 | "G = nx.from_numpy_matrix(distance_matrix)\n", 198 | "\n", 199 | "for i in 
range(len(channel_id)):\n", 200 | " G.node[i].update(channel_id[i])\n", 201 | "#nx.write_gexf(G,'C:/hackathon/tfidf_graph.gexf')" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": null, 207 | "metadata": {}, 208 | "outputs": [], 209 | "source": [ 210 | "nx.write_gexf(G,'C:/hackathon/tfidf_graph.gexf')" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": {}, 217 | "outputs": [], 218 | "source": [ 219 | "merged_df.to_csv('C:/hackathon/merged_right.csv',index = False)" 220 | ] 221 | } 222 | ], 223 | "metadata": { 224 | "kernelspec": { 225 | "display_name": "Python 3", 226 | "language": "python", 227 | "name": "python3" 228 | }, 229 | "language_info": { 230 | "codemirror_mode": { 231 | "name": "ipython", 232 | "version": 3 233 | }, 234 | "file_extension": ".py", 235 | "mimetype": "text/x-python", 236 | "name": "python", 237 | "nbconvert_exporter": "python", 238 | "pygments_lexer": "ipython3", 239 | "version": "3.6.6" 240 | } 241 | }, 242 | "nbformat": 4, 243 | "nbformat_minor": 2 244 | } 245 | -------------------------------------------------------------------------------- /Notebooks/getting_started.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%load_ext autoreload\n", 10 | "%autoreload 2" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "import youtubecollector as ytc\n", 20 | "import pandas as pd\n", 21 | "from tqdm import tqdm_notebook as tqdm" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "## Youtube client setup" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "youtube_client = ytc.youtube_client.create_youtube_client(\"./api.conf\")" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "## Channel Seed\n", 45 | "The pipeline starts with a list of channels for which all videos are checked, for which all comments, recommendations and captations are collected" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "channel_seed_filename = \"input/seeds_right.csv\"\n", 55 | "channel_outputfile = \"output/channels_right.csv\"" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "channel_seed_df = pd.read_csv(channel_seed_filename)\n", 65 | "\n", 66 | "channels = ytc.channels.get_channels(channel_seed_df.loc[0:], youtube_client)\n", 67 | "\n", 68 | "ytc.channels.write_channels(channels, channel_outputfile)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "## Videos" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "video_output_file = \"output/videos_right1.csv\"" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "channels = channels[0:1]" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | 
"source": [ 102 | "all_videos = list()\n", 103 | "\n", 104 | "for channel in tqdm(channels):\n", 105 | " response = ytc.video.get_videos(channel.channel_uploads, youtube_client)\n", 106 | " next_page_token = response.get('nextPageToken')\n", 107 | " videos = ytc.video.convert_to_videos(response, youtube_client)\n", 108 | " all_videos.extend(videos)\n", 109 | " ytc.video.write_videos(videos, video_output_file)\n", 110 | " \n", 111 | " while next_page_token:\n", 112 | " response = ytc.video.get_more_videos(channel.channel_uploads, youtube_client, next_page_token) \n", 113 | " next_page_token = response.get('nextPageToken')\n", 114 | " videos = ytc.video.convert_to_videos(response, youtube_client)\n", 115 | " all_videos.extend(videos)\n", 116 | " ytc.video.write_videos(videos, video_output_file)" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "## Comments" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "comments_output_file = \"output/comments_right1.csv\"" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "all_videos = all_videos[1015:]\n", 142 | "\n", 143 | "all_comments = list()\n", 144 | "for video in tqdm(all_videos):\n", 145 | " response = ytc.comments.get_comments(video.video_id, youtube_client)\n", 146 | " comments = ytc.comments.convert_to_comments(response)\n", 147 | " all_comments.extend(comments)\n", 148 | " ytc.comments.write_comments(comments_output_file, comments)\n", 149 | " try:\n", 150 | " next_page_token = response.get('nextPageToken')\n", 151 | " except AttributeError:\n", 152 | " continue\n", 153 | " \n", 154 | " while next_page_token:\n", 155 | " response = ytc.comments.get_more_comments(video.video_id, youtube_client, next_page_token)\n", 156 | " try:\n", 157 | " next_page_token = response.get('nextPageToken') \n", 158 | " except AttributeError:\n", 159 | " continue\n", 160 | " comments = ytc.comments.convert_to_comments(response)\n", 161 | " all_comments.extend(comments)\n", 162 | " ytc.comments.write_comments(comments_output_file, comments)\n", 163 | " \n", 164 | " " 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "## Recommendations" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "recommendations_output_file = \"output/recommendations_right1.csv\"" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [ 189 | "video_to_recommendations = dict()\n", 190 | "for video in tqdm(all_videos, ):\n", 191 | " try:\n", 192 | " response = ytc.recommendations.get_recommendations(video.video_id, youtube_client)\n", 193 | " except rateLimitExceeded:\n", 194 | " youtube_client = ytc.youtube_client.create_youtube_client(\"./api.conf\")\n", 195 | " pass\n", 196 | " \n", 197 | " recommendations = ytc.recommendations.convert_to_recommendations(response, video.video_id)\n", 198 | " video_to_recommendations[video.video_id]=recommendations\n", 199 | " \n", 200 | " ytc.recommendations.write_recommendations(recommendations_output_file, recommendations)\n", 201 | " " 202 | ] 203 | }, 204 | { 205 | "cell_type": "markdown", 206 | "metadata": {}, 207 | "source": [ 208 | "## Transcripts" 209 | ] 210 | }, 
211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "transcripts_output_file = \"/home/dim/Documents/projecten/extremisme/youtube/yt/YouTubeExtremism/DataCollection/transcripts_right1.csv\"" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "metadata": {}, 224 | "outputs": [], 225 | "source": [ 226 | "ytc.transcripts.get_captions(all_videos)" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": null, 232 | "metadata": {}, 233 | "outputs": [], 234 | "source": [ 235 | "video_id_transcripts = ytc.transcripts.extract_transcripts(\"./*.vtt\")\n", 236 | "\n", 237 | "ytc.transcripts.write_transcripts(transcripts_output_file, video_id_transcripts)" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "metadata": {}, 244 | "outputs": [], 245 | "source": [ 246 | "import shutil\n", 247 | "import glob\n", 248 | "import os\n", 249 | "\n", 250 | "\n", 251 | "for filename in glob.glob('/home/dim/Documents/projecten/extremisme/youtube/yt/YouTubeExtremism/DataCollection/*vtt'):\n", 252 | " os.remove(filename)" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "metadata": {}, 259 | "outputs": [], 260 | "source": [] 261 | } 262 | ], 263 | "metadata": { 264 | "kernelspec": { 265 | "display_name": "correspondent", 266 | "language": "python", 267 | "name": "correspondent" 268 | }, 269 | "language_info": { 270 | "codemirror_mode": { 271 | "name": "ipython", 272 | "version": 3 273 | }, 274 | "file_extension": ".py", 275 | "mimetype": "text/x-python", 276 | "name": "python", 277 | "nbconvert_exporter": "python", 278 | "pygments_lexer": "ipython3", 279 | "version": "3.6.7" 280 | } 281 | }, 282 | "nbformat": 4, 283 | "nbformat_minor": 1 284 | } 285 | -------------------------------------------------------------------------------- /Notebooks/scenariofunctions.py: -------------------------------------------------------------------------------- 1 | # Functions needed 2 | 3 | # Clean socialblade data 4 | # Plot bars 5 | # Select topics in tags 6 | # Select topics in comments 7 | # Include commenters 8 | # Exclude commenters 9 | # Include channels 10 | # Excude channel 11 | 12 | import pandas as pd 13 | from tqdm import tqdm_notebook as tqdm 14 | import matplotlib.pyplot as plt 15 | import datetime as dt 16 | import scenariofunctions as sf 17 | import glob 18 | import csv 19 | import re 20 | import sys 21 | import os 22 | import config 23 | 24 | def socialblade_ranking(channels): 25 | '''takes a messy social blade dataframe 26 | and cleans it up''' 27 | 28 | channels['Source Url'] = channels['Source Url'].str.replace('https://socialblade.com/youtube/channel/', '') 29 | channels['Subscriber_Rank'] = channels['Subscriber_Rank'].replace('\D', '', regex=True).apply(pd.to_numeric) 30 | channels['Video_View_Rank'] = channels['Video_View_Rank'].replace('\D', '', regex=True).apply(pd.to_numeric) 31 | channels['Sb_Rank'] = channels['Sb_Rank'].replace('\D', '', regex=True).apply(pd.to_numeric) 32 | channels['earnings_low'], channels['earnings_high'] = channels['Estimated_Yearly_Earning'].str.split('-', 1).str 33 | channels['earnings_low'] = channels['earnings_low'].replace('st|th|rd|nd', '', regex=True) 34 | channels['earnings_high'] = channels['earnings_high'].replace('st|th|rd|nd', '', regex=True) 35 | channels = channels.rename(columns={'Source Url': 'channel_id', 36 | 'Subscriber_Rank': 'subscriber_rank', 37 | 
'Video_View_Rank': 'video_view_rank', 38 | 'Sb_Rank': 'sb_rank', 39 | 'Grade': 'grade' 40 | }) 41 | channels = channels[['channel_id', 'subscriber_rank', 'video_view_rank', 'sb_rank', 'grade']] 42 | 43 | return channels 44 | 45 | 46 | def socialblade_growth(channel_history): 47 | 48 | pattern = re.compile('(\d{4}-\d{2}-\d+,\d+)') 49 | channel_history['daily_views'] = channel_history['Date_Daily_Views'].str.findall(pattern) 50 | channel_history['daily_subs'] = channel_history['Date_Total_Subs'].str.findall(pattern) 51 | channel_history = channel_history.rename(columns={'User':'channel_id'}) 52 | daily_views = channel_history.set_index('channel_id') \ 53 | .daily_views.apply(pd.Series) \ 54 | .stack() \ 55 | .reset_index(level=-1, drop=True) \ 56 | .reset_index() 57 | 58 | daily_views['date'], daily_views['views'] = daily_views[0].str.split(',', 1).str 59 | daily_views = daily_views[['channel_id', 'date', 'views']] 60 | 61 | daily_subs = channel_history.set_index('channel_id') \ 62 | .daily_subs.apply(pd.Series) \ 63 | .stack() \ 64 | .reset_index(level=-1, drop=True) \ 65 | .reset_index() 66 | 67 | daily_subs['date'], daily_subs['subs'] = daily_subs[0].str.split(',', 1).str 68 | daily_subs = daily_subs[['channel_id', 'date', 'subs']] 69 | 70 | daily_stats = pd.merge(daily_subs, daily_views, how='left', left_on=['channel_id', 'date'], right_on = ['channel_id', 'date']) 71 | daily_stats['yearmonth'] = pd.to_datetime(daily_stats['date']).dt.to_period('M') 72 | 73 | return daily_stats 74 | 75 | 76 | def channel_filter(dataframe, selection): 77 | filtered_data = dataframe[dataframe['video_channel_title'].isin(selection)] 78 | print('deze selectie levert ' + str(len(filtered_data)) + ' videos op.') 79 | return filtered_data 80 | 81 | def channel_filter_exclude(dataframe, selection): 82 | filtered_data = dataframe[~dataframe['video_channel_title'].isin(selection)] 83 | print('deze selectie levert ' + str(len(filtered_data)) + ' videos op.') 84 | return filtered_data 85 | 86 | def add_years_months_to_videos(dataframe): 87 | dataframe.loc[:,('year')] = pd.to_datetime(dataframe.loc[:,('video_published')]).dt.to_period('Y') 88 | dataframe.loc[:,('yearmonth')] = pd.to_datetime(dataframe.loc[:,('video_published')]).dt.to_period('M') 89 | 90 | return dataframe 91 | 92 | 93 | def plot_views_per_year(dataframe): 94 | views_per_year = dataframe.groupby(['year'])['video_view_count'].agg('sum') 95 | fig = plt.figure(figsize=(10,5)) 96 | width = 0.4 97 | ax = fig.add_subplot(111) 98 | views_per_year.plot(kind='bar', color='red', width=width, grid=True) 99 | ax.set_ylabel('number of views') 100 | ax.set_xlabel('year') 101 | 102 | return plt.show() 103 | 104 | def plot_top_channels(dataframe): 105 | top_channels = dataframe['video_channel_title'].value_counts()[0:20] 106 | fig = plt.figure(figsize=(20,10)) # Create matplotlib figure 107 | width = 0.4 108 | ax = fig.add_subplot(111) 109 | top_channels.plot(kind='bar', color='red', width=width, grid=True) 110 | ax.set_ylabel('number of videos published') 111 | ax.set_xlabel('channels') 112 | 113 | return plt.show() 114 | 115 | def plot_users(dataframe): 116 | top_users = dataframe['author_display_name'].value_counts()[0:20] 117 | fig = plt.figure(figsize=(20,10)) # Create matplotlib figure 118 | width = 0.4 119 | ax = fig.add_subplot(111) 120 | top_users.plot(kind='bar', color='red', width=width, grid=True) 121 | ax.set_ylabel('number of comments') 122 | ax.set_xlabel('channels') 123 | 124 | return plt.show() 125 | 126 | 127 | def topic_filter(dataframe, query, 
query_topic): 128 | pattern = '|'.join([s for s in query]) 129 | mask = dataframe['video_tags'].str.contains(pattern, regex=True, case=False, na=False) 130 | topic = dataframe[mask] 131 | print('found ' + str(len(topic)) + ' videos with ' + query_topic) 132 | 133 | return topic 134 | 135 | def zoom_in_on_commenter(dataframe, name): 136 | result = dataframe[dataframe['author_display_name'] == name] 137 | return result 138 | 139 | 140 | def extract_tags(dataframe): 141 | vidtags = dataframe[['video_id', 'video_title', 'video_tags', 'year']] 142 | 143 | video_tags = vidtags['video_tags'].str.replace(r"\[|\]|\'|-", '') \ 144 | .str.lower() \ 145 | .str.split(', ', expand=True) \ 146 | .merge(vidtags, right_index = True, left_index = True) \ 147 | .drop(["video_tags"], axis = 1) \ 148 | .melt(id_vars = ['video_id', 'video_title', 'year'], value_name = "tag") \ 149 | .drop(['variable'], axis=1) \ 150 | .dropna() 151 | 152 | video_tags = video_tags[~video_tags['tag'].str.contains('not set')] 153 | video_tags.sort_values('tag', inplace=True) 154 | video_tags['tag'] = video_tags['tag'].str.replace('"', '') 155 | print('found ' + str(video_tags.tag.nunique()) + ' unique tags') 156 | return video_tags 157 | 158 | def tag_filter(dataframe, tag): 159 | result = dataframe[dataframe['tag'].str.contains(tag)] 160 | print('found ' + str(len(result)) + ' tags') 161 | return result 162 | 163 | def get_comments_by_video_id(query, sphere): 164 | if sphere == 'nl_right': 165 | path = config.PATH_NL 166 | if sphere == 'left': 167 | path = config.PATH_LEFT 168 | elif sphere == 'right': 169 | path = config.PATH_RIGHT 170 | else: 171 | print('sphere not found \n please try again') 172 | 173 | iter_csv = pd.read_csv(path + 'comments_' + sphere + '.csv', 174 | chunksize=1000000, 175 | sep='¶', 176 | quotechar='þ', 177 | engine='python') 178 | result = pd.concat([chunk[chunk['video_id'].isin(query)] for chunk in iter_csv]) 179 | result['sphere'] = sphere 180 | result.loc[:,('year')] = pd.to_datetime(result.loc[:,('comment_time')]).dt.to_period('Y') 181 | result = result[['video_id', 'comment_id', 'author_display_name', 'author_channel_id', 'comment_text', 'comment_time', 'year', 'sphere']] 182 | print('found ' + str(len(result)) + ' comments \n and ' + str(result.author_channel_id.nunique()) + ' unique commenters') 183 | return result 184 | 185 | def get_comments_by_author(query, sphere): 186 | if sphere == 'nl_right': 187 | path = config.PATH_NL 188 | if sphere == 'left': 189 | path = config.PATH_LEFT 190 | elif sphere == 'right': 191 | path = config.PATH_RIGHT 192 | else: 193 | print('sphere not found \n please try again') 194 | 195 | iter_csv = pd.read_csv(path + 'comments_' + sphere + '.csv', 196 | chunksize=1000000, 197 | sep='¶', 198 | quotechar='þ', 199 | engine='python') 200 | result = pd.concat([chunk[chunk['author_channel_id'].isin(query)] for chunk in iter_csv]) 201 | result['sphere'] = sphere 202 | result.loc[:,('year')] = pd.to_datetime(result.loc[:,('comment_time')]).dt.to_period('Y') 203 | result = result[['video_id', 'comment_id', 'author_display_name', 'author_channel_id', 'comment_text', 'comment_time', 'year', 'sphere']] 204 | print('found ' + str(len(result)) + ' comments \n and ' + str(result.author_channel_id.nunique()) + ' unique commenters') 205 | return result 206 | 207 | def get_comments_by_author_name(query, sphere): 208 | if sphere == 'nl_right': 209 | path = config.PATH_NL 210 | if sphere == 'left': 211 | path = config.PATH_LEFT 212 | elif sphere == 'right': 213 | path = config.PATH_RIGHT 
214 |     else:
215 |         raise ValueError('sphere not found, please try again')
216 | 
217 |     iter_csv = pd.read_csv(path + 'comments_' + sphere + '.csv',
218 |                            chunksize=1000000,
219 |                            sep='¶',
220 |                            quotechar='þ',
221 |                            engine='python')
222 |     result = pd.concat([chunk[chunk['author_display_name'].isin(query)] for chunk in iter_csv])
223 |     result['sphere'] = sphere
224 |     result.loc[:,('year')] = pd.to_datetime(result.loc[:,('comment_time')]).dt.to_period('Y')
225 |     result = result[['video_id', 'comment_id', 'author_display_name', 'author_channel_id', 'comment_text', 'comment_time', 'year', 'sphere']]
226 |     print('found ' + str(len(result)) + ' comments \n and ' + str(result.author_channel_id.nunique()) + ' unique commenters')
227 |     return result
228 | 
229 | def get_comments_by_topic(query, sphere):
230 |     if sphere == 'nl_right':
231 |         path = config.PATH_NL
232 |     elif sphere == 'left':
233 |         path = config.PATH_LEFT
234 |     elif sphere == 'right':
235 |         path = config.PATH_RIGHT
236 |     else:
237 |         raise ValueError('sphere not found, please try again')
238 | 
239 |     iter_csv = pd.read_csv(path + 'comments_' + sphere + '.csv',
240 |                            chunksize=1000000,
241 |                            sep='¶',
242 |                            quotechar='þ',
243 |                            engine='python')
244 |     result = pd.concat([chunk[chunk['comment_text'].astype(str).str.contains('{}'.format('|'.join([s for s in query])), na=False, case=False, regex=True)] for chunk in iter_csv])
245 |     result['sphere'] = sphere
246 |     result.loc[:,('year')] = pd.to_datetime(result.loc[:,('comment_time')]).dt.to_period('Y')
247 |     result = result[['video_id', 'comment_id', 'author_display_name', 'author_channel_id', 'comment_text', 'comment_time', 'year', 'sphere']]
248 |     print('found ' + str(len(result)) + ' comments \n and ' + str(result.author_channel_id.nunique()) + ' unique commenters')
249 |     return result
--------------------------------------------------------------------------------
/TopicModelling/TopicModelWrapper/main.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import time
4 | import datetime
5 | import string
6 | 
7 | # Gensim
8 | import gensim
9 | 
10 | # Plotting tools
11 | import pyLDAvis
12 | 
13 | # Wrappers
14 | from StreamingCorpus import StreamingCorpus
15 | from StreamingPreprocesser import StreamingPreprocesser
16 | from StreamingParser import StreamingParser
17 | 
18 | # Enable logging for gensim
19 | import logging
20 | import warnings
21 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)
22 | warnings.filterwarnings("ignore", category=DeprecationWarning)
23 | 
24 | def main(args):
25 |     # --------------------------------------------------
26 |     #
27 |     # Initialize parameters
28 |     #
29 |     # --------------------------------------------------
30 |     root = os.path.dirname(os.path.realpath(__file__))
31 | 
32 |     input_file = '{}/{}'.format(root, args.input)
33 |     # input_file = os.path.dirname(os.path.realpath(__file__)) + "/temp.json" # temp file for testing
34 | 
35 |     # Prepare stopwords and extend if applicable
36 |     stopwords_path = '{}/{}'.format(root, args.stopwords_file)
37 |     stopwords = open(stopwords_path, 'r').read().split('\n')
38 | 
39 |     # Add 'stopwords' manually; TODO: substitute with spacy lemmatiser
40 |     stopwords.extend(['know', 'think', 'like', 'thats', 'well', 'dont',
41 |                       'get', 'actually', 'would', 'say', 'yeah', 'want', 'going',
42 |                       'said', 'speech', 'theres', 'way', 'could', 'see', 'something',
43 |                       'people', 'really', 'okay', 'gonna', 'ive', 'mean', 'right',
44 |                       'got', 'thing', 'one',
'theyre', 'stuff', 'kind', 'lot', 45 | 'good', 'lot', 'things', 'saying', 'hes', 'even', 'much', 46 | 'guy', 'whatever', 'back', 'everything', 'life', 'love', 47 | 'guys', 'great', 'time', 'video', 'sort', 'cant', 'maybe', 48 | 'point', 'lets', 'take', 'talk', 'probably', 'might', 'put', 49 | 'years', 'new', 'two', 'need', 'yes', 'left', 'look', 'talking', 50 | 'anything', 'guess', 'make', 'interesting', 'someone', 'obviously', 51 | 'ill', 'still', 'also', 'whats', 'find', 'certain', 'course', 52 | 'weve', 'part', 'first', 'done', 'many', 'around', 'never', 53 | 'show', 'went', 'little', 'ever', 'big', 'look', 'give', 54 | 'last']) 55 | 56 | # 57 | # dict_min = 4 58 | # dict_max = 0.6 59 | 60 | topic_num = args.topic_num 61 | model_name = args.model_name 62 | model_path = "{}/models/{}_{}".format(root, model_name, topic_num) 63 | if not os.path.isdir(model_path): 64 | print('Model directory not found, creating directory: {}'.format(model_path)) 65 | os.mkdir(model_path) 66 | 67 | # Simple preprocesser 68 | parser = StreamingParser(input_file, 3, metadata=True) 69 | preprocessor = StreamingPreprocesser(stopwords=stopwords) 70 | 71 | corpus = StreamingCorpus(path=input_file, 72 | parse_strategy=parser, 73 | clean_strategy=preprocessor, 74 | dictionary=None, 75 | metadata=True) 76 | dictionary = corpus.get_dictionary() 77 | 78 | gensim.corpora.MmCorpus.serialize(os.path.join( 79 | model_path, '{}.mm'.format(model_name)), corpus, metadata=True) 80 | corpus = gensim.corpora.MmCorpus(os.path.join(model_path, '{}.mm'.format(model_name))) 81 | 82 | # dictionary.filter_extremes(dict_min, dict_max_relative) 83 | dictionary.save(os.path.join(model_path, '{}.dict'.format(model_name))) 84 | 85 | # -------------------------------------------------- 86 | # 87 | # LDA model training and serialization 88 | # 89 | # -------------------------------------------------- 90 | 91 | t1 = time.time() 92 | print('Starting generation of LDA model') 93 | 94 | lda = gensim.models.LdaMulticore(corpus=corpus, 95 | id2word=dictionary, 96 | num_topics=topic_num, 97 | random_state=100, 98 | # update_every=1, 99 | chunksize=100, 100 | passes=10, 101 | # alpha='auto', 102 | per_word_topics=True) 103 | lda.save('{}/{}.lda'.format(model_path, model_name)) 104 | 105 | t2 = time.time() 106 | print('LDA model generation successful! Time elapsed: {}\n'.format(t2 - t1)) 107 | 108 | # -------------------------------------------------- 109 | # 110 | # Visualisation with pyLDAvis 111 | # 112 | # -------------------------------------------------- 113 | 114 | # t1 = time.time() 115 | # print('Starting preparation of LDAvis visualisation') 116 | # 117 | # # Load gensim data to prepare for visualization 118 | # prepared_data = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False) 119 | # # Save visualisation to HTML file 120 | # pyLDAvis.save_html(prepared_data, os.path.join(model_path, '{}_LDAvis.html'.format(model_name))) 121 | # 122 | # t2 = time.time() 123 | # print('LDAvis visualisation successful! Time elapsed: {}\n'.format(t2 - t1)) 124 | 125 | # -------------------------------------------------- 126 | # 127 | # Compute model perplexity and coherence score 128 | # 129 | # -------------------------------------------------- 130 | 131 | t1 = time.time() 132 | print('\nStarting computation of perplexity score') 133 | 134 | perplexity_score = lda.log_perplexity(corpus) 135 | # A measure of how good the model generalises. Lower is better. 
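    # Note on the figure printed below: gensim's log_perplexity() returns the per-word
    # likelihood bound (log base 2), not the perplexity value itself. A minimal sketch of
    # the usual conversion, assuming gensim's own convention of perplexity = 2^(-bound)
    # (the variable name real_perplexity is illustrative only):
    #
    #     real_perplexity = 2 ** (-perplexity_score)   # e.g. a bound of -8.5 gives roughly 362
    #
    # Either way, a lower perplexity indicates better generalisation to held-out documents.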
136 | print('Perplexity: ', perplexity_score) 137 | 138 | t2 = time.time() 139 | print('Perplexitiy computed successfully! Time elapsed: {}\n'.format(t2 - t1)) 140 | 141 | t1 = time.time() 142 | print('\nStarting computation of coherence score') 143 | 144 | coherence_model_lda = gensim.models.CoherenceModel( 145 | model=lda, corpus=corpus, dictionary=dictionary, coherence='u_mass') 146 | coherence_lda = coherence_model_lda.get_coherence() 147 | print('Coherence Score: ', coherence_lda) 148 | 149 | t2 = time.time() 150 | print('Coherence score computed successfully! Time elapsed: {}\n'.format(t2 - t1)) 151 | 152 | # -------------------------------------------------- 153 | # 154 | # Saving parameters and scores to file 155 | # 156 | # -------------------------------------------------- 157 | 158 | print('Writing settings and results to file...') 159 | with open(os.path.join(model_path, '{}_parameters.txt'.format(model_name)), 'w') as file: 160 | file.write('Model name: {}\n date: {}\n'.format(model_name, datetime.datetime.now())) 161 | 162 | file.write('Corpus statistics:\n'.format()) 163 | file.write('\tNon-empty entries: {}\n'.format(len(corpus))) 164 | 165 | file.write('Model parameters: \n') 166 | file.write('\tNumber of topics: {}\n'.format(topic_num)) 167 | # file.write('\tDictionary min: {}\n'.format(dict_min)) 168 | # file.write('\tDictionary max (relative): {}\n'.format(dict_max_relative)) 169 | 170 | file.write('Model scores:\n') 171 | file.write('\tPerplexity score = {}\n'.format(perplexity_score)) 172 | file.write('\tCoherence score = {}\n'.format(coherence_lda)) 173 | file.write(''.format()) 174 | print('Done!') 175 | 176 | # Ngram models ------------------------------------------ 177 | 178 | # bigram_phrases = gensim.models.Phrases(data_tokens, min_count=5, threshold=100) 179 | # trigram_phrases = gensim.models.Phrases(bigram_phrases[data_tokens], threshold=100) 180 | # 181 | # bigram_model = gensim.models.phrases.Phraser(bigram_phrases) 182 | # trigram_model = gensim.models.phrases.Phraser(trigram_phrases) 183 | # 184 | # print(trigram_model[bigram_model[data_tokens[0]]]) 185 | 186 | # def make_bigrams(documents): 187 | # return [bigram_model[document] for document in documents] 188 | # 189 | # def make_trigrams(documents): 190 | # return [trigram_model[bigram_model[document]] for document in documents] 191 | # 192 | # t1 = time.time() 193 | # data_words_bigrams = make_bigrams(data_words_nostops) 194 | # t2 = time.time() 195 | # print('Bigrams created successfully! 
Time elapsed: {}'.format(t2 - t1)) 196 | 197 | # Build MALLET LDA model and test coherence scores ------------------------------------------ 198 | 199 | # mallet_path = 'path/to/mallet-2.0.8/bin/mallet' # update this path 200 | # ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=20, id2word=id2word) 201 | # 202 | # # Show Topics 203 | # pprint(ldamallet.show_topics(formatted=False)) 204 | # 205 | # # Compute Coherence Score 206 | # coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=data_words_nostop, dictionary=id2word, 207 | # coherence='c_v') 208 | # coherence_ldamallet = coherence_model_ldamallet.get_coherence() 209 | # print('\nCoherence Score: ', coherence_ldamallet) 210 | 211 | # Try different number of topics (k) and compare scores ------------------------------------------ 212 | 213 | # Find dominant topic for each document ------------------------------------------ 214 | 215 | # Find most representative document for each topic ------------------------------------------ 216 | # Topic inference methods? 217 | 218 | 219 | def parse_arguments(): 220 | parser = argparse.ArgumentParser(description=""" 221 | Wrapper for streaming topic model implementation by Gensim. 222 | TODO: make config.ini 223 | """) 224 | ##### Positional arguments ##### 225 | parser.add_argument("input", type=str, default="temp.json", 226 | help="File or directory containing the data to be processed. ") 227 | # parser.add_argument("dictionary", type=str) 228 | # parser.add_argument("output_file", type=str, help="Optional. WIP") 229 | 230 | ##### Preprocessing parameters ##### 231 | preproccesing_parameters = parser.add_argument_group('preprocessing parameters') 232 | preproccesing_parameters.add_argument("stopwords_file", type=str, 233 | default="stopwords.txt", 234 | help="Path to file containing stopwords to be removed") 235 | preproccesing_parameters.add_argument("-m", "--term_min_freq", type=int, 236 | help="remove all terms with specified frequency (or lower)") 237 | preproccesing_parameters.add_argument("-M", "--term_max_freq", type=int, 238 | help="remove all terms with specified frequency (or larger)") 239 | 240 | ##### Topic modeling parameters ##### 241 | topicmodel_parameters = parser.add_argument_group('topic modeling parameters') 242 | topicmodel_parameters.add_argument("model_name", type=str, 243 | help="The name of the model. I.e. the dataset name.") 244 | topicmodel_parameters.add_argument("topic_num", type=int, 245 | help="The name of the model. I.e. 
the dataset name.") 246 | 247 | return parser.parse_args() 248 | 249 | if __name__ == '__main__': 250 | main(parse_arguments()) 251 | -------------------------------------------------------------------------------- /RabbitHole/youtube-onderzoek-jan.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Guillaume Chaslot' 2 | 3 | """ 4 | This scripts starts from a search query on youtube and: 5 | 1) gets the N first search results 6 | 2) follows the first M recommendations 7 | 3) repeats step (2) P times 8 | 4) stores the results in a json file 9 | """ 10 | 11 | import urllib2 12 | import requests 13 | import cookielib 14 | from cookielib import MozillaCookieJar 15 | import re 16 | import json 17 | import sys 18 | import time 19 | import ssl 20 | import os 21 | import easygui 22 | import shutil 23 | 24 | 25 | from bs4 import BeautifulSoup 26 | 27 | RECOMMENDATIONS_PER_VIDEO = 1 28 | RESULTS_PER_SEARCH = 1 29 | 30 | # NUMBER OF MIN LIKES ON A VIDEO TO COMPUTE A LIKE RATIO 31 | MIN_LIKES_FOR_LIKE_RATIO = 5 32 | 33 | 34 | # Google session class by stackoverflow user alexislg 35 | class SessionGoogle: 36 | def __init__(self): 37 | self.ses = requests.session() 38 | self.ses.cookies = MozillaCookieJar('cookie.txt') 39 | self.user = None 40 | if not os.path.exists('cookie.txt'): 41 | if easygui.ynbox('Het bestand *cookie.txt* is niet gevonden. Wil je inloggen bij Google?', 'youtube-folower', ('Ja', 'Nee')): 42 | self.user = self.login() 43 | else: 44 | with open('cookie.txt', 'r') as cf: 45 | self.rawcookie = cf.read() 46 | if (self.rawcookie == ''): 47 | if easygui.ynbox('Het bestand *cookie.txt* is leeg. Wil je inloggen bij Google?', 'youtube-folower', ('Ja', 'Nee')): 48 | self.user = self.login() 49 | else: 50 | self.ses.cookies.load(ignore_discard = True, ignore_expires = True) 51 | self.user = self.check_login() 52 | if self.user == None: 53 | if easygui.ynbox('Het bestand *cookie.txt* is gevonden, maar geeft geen ingelogde gebruiker. Wil je inloggen bij Google?', 'youtube-folower', ('Ja', 'Nee')): 54 | self.user = self.login() 55 | 56 | if self.user == None: 57 | print 'Nog logged in. Continuing anonymously.' 58 | else: 59 | print 'Logged in as: %s.' 
% self.user 60 | 61 | self.save_cookie() 62 | 63 | def get(self, URL): 64 | return self.ses.get(URL).content 65 | 66 | def save_cookie(self): 67 | cj = self.ses.cookies 68 | cj.save(ignore_discard = True, ignore_expires = True) 69 | 70 | def login(self): 71 | while True: 72 | msg = "Vul de gegevens van je Google-account in:" 73 | title = "youtube-follower" 74 | fieldNames = ["Gebruikersnaam","Wachtwoord"] 75 | fieldValues = [] # we start with blanks for the values 76 | fieldValues = easygui.multpasswordbox(msg, title, fieldNames) 77 | 78 | while True: 79 | if fieldValues == None: break 80 | errmsg = "" 81 | for i in range(len(fieldNames)): 82 | if fieldValues[i].strip() == "": 83 | errmsg = errmsg + ('"%s" niet ingevuld.\n\n' % fieldNames[i]) 84 | if errmsg == "": break # no problems found 85 | fieldValues = easygui.multpasswordbox(errmsg, title, fieldNames, fieldValues) 86 | 87 | login_html = self.ses.get('https://accounts.google.com/ServiceLogin') 88 | soup_login = BeautifulSoup(login_html.content).find('form').find_all('input') 89 | my_dict = {} 90 | for u in soup_login: 91 | if u.has_attr('value'): 92 | my_dict[u['name']] = u['value'] 93 | # override the inputs without login and pwd: 94 | print my_dict 95 | my_dict['Email'] = fieldValues[0] 96 | my_dict['Passwd'] = fieldValues[1] 97 | self.ses.post('https://accounts.google.com/ServiceLoginAuth', data=my_dict) 98 | 99 | m = self.check_login() 100 | if m == None: 101 | if easygui.ynbox('Inloggen mislukt. Wil je het opnieuw proberen?', 'youtube-folower', ('Ja', 'Nee')): 102 | continue 103 | else: 104 | return m 105 | else: 106 | return m 107 | 108 | def check_login(self): 109 | r = self.get('https://accounts.google.com/ServiceLogin') 110 | m = re.search(r'Google Account: (.*?) \&\#10', r) 111 | if m == None: 112 | return m 113 | else: 114 | return m.group(1) 115 | 116 | class YoutubeFollower(): 117 | def __init__(self, session, verbose=False, name='', alltime=True, gl=None, language=None, recent=False, loopok=True): 118 | # Name 119 | self._name = name 120 | self._alltime = alltime 121 | self._verbose = verbose 122 | 123 | # Dict video_id to {'likes': , 124 | # 'dislikes': , 125 | # 'views': , 126 | # 'recommendations': []} 127 | self._video_infos = {} # self.try_to_load_video_infos() 128 | 129 | # Dict search terms to [video_ids] 130 | self._search_infos = {} 131 | self._gl = gl 132 | self._language = language 133 | self._recent=recent 134 | self._loopok=loopok 135 | self._session=session 136 | 137 | print ('Location = ' + repr(self._gl) + ' Language = ' + repr(self._language)) 138 | 139 | def clean_count(self, text_count): 140 | # Ignore non ascii 141 | ascii_count = text_count.encode('ascii', 'ignore') 142 | # Ignore non numbers 143 | p = re.compile('[\d,]+') 144 | return int(p.findall(ascii_count)[0].replace(',', '')) 145 | 146 | def get_search_results(self, search_terms, max_results, top_rated=False): 147 | assert max_results < 20, 'max_results was not implemented to be > 20' 148 | 149 | if self._verbose: 150 | print ('Searching for {}'.format(search_terms)) 151 | 152 | # Trying to get results from cache 153 | if search_terms in self._search_infos and len(self._search_infos[search_terms]) >= max_results: 154 | return self._search_infos[search_terms][0:max_results] 155 | 156 | # Escaping search terms for youtube 157 | escaped_search_terms = urllib2.quote(search_terms.encode('utf-8')) 158 | 159 | # We only want search results that are videos, filtered by viewcoung. 
160 | # This is achieved by using the youtube URI parameter: sp=CAMSAhAB 161 | if self._alltime: 162 | filter = "CAMSAhAB" 163 | else: 164 | if top_rated: 165 | filter = "CAE%253D" 166 | else: 167 | filter = "EgIQAQ%253D%253D" 168 | 169 | url = "https://www.youtube.com/results?sp=" + filter + "&q=" + escaped_search_terms 170 | if self._gl: 171 | url = url + '&gl=' + self._gl 172 | 173 | print ('Searching URL: ' + url) 174 | 175 | html = self._session.get(url) 176 | soup = BeautifulSoup(html, "html.parser") 177 | 178 | videos = [] 179 | for item_section in soup.findAll('div', {'class': 'yt-lockup-dismissable'}): 180 | video = item_section.contents[0].contents[0]['href'].split('=')[1] 181 | videos.append(video) 182 | 183 | self._search_infos[search_terms] = videos 184 | return videos[0:max_results] 185 | 186 | def get_recommendations(self, video_id, nb_recos_wanted, depth, key): 187 | if video_id in self._video_infos: 188 | # Updating the depth if this video was seen. 189 | #self._video_infos[video_id]['depth'] = min(self._video_infos[video_id]['depth'], depth) 190 | #print ('a video was seen at a lower depth') 191 | 192 | video = self._video_infos[video_id] 193 | recos_returned = [] 194 | for reco in video['recommendations']: 195 | # This line avoids to loop around the same videos: 196 | if reco not in self._video_infos or self._loopok: 197 | recos_returned.append(reco) 198 | if len(recos_returned) >= nb_recos_wanted: 199 | break 200 | if self._loopok: 201 | video['key'].append(key) 202 | print ('\n Following recommendations ' + repr(recos_returned) + '\n') 203 | return recos_returned 204 | 205 | url = "https://www.youtube.com/watch?v=" + video_id 206 | 207 | while True: 208 | try: 209 | html = urllib2.urlopen(url) 210 | break 211 | except urllib2.URLError: 212 | print 'error' 213 | time.sleep(1) 214 | soup = BeautifulSoup(html, "html.parser") 215 | 216 | # Views 217 | views = -1 218 | for watch_count in soup.findAll('div', {'class': 'watch-view-count'}): 219 | try: 220 | views = self.clean_count(watch_count.contents[0]) 221 | except IndexError: 222 | pass 223 | 224 | # Likes 225 | likes = -1 226 | for like_count in soup.findAll('button', {'class': 'like-button-renderer-like-button'}): 227 | try: 228 | likes = self.clean_count(like_count.contents[0].text) 229 | except IndexError: 230 | pass 231 | 232 | # Dislikes 233 | dislikes = -1 234 | for like_count in soup.findAll('button', {'class': 'like-button-renderer-dislike-button'}): 235 | try: 236 | dislikes = self.clean_count(like_count.contents[0].text) 237 | except IndexError: 238 | pass 239 | 240 | # Channel 241 | channel = '' 242 | for item_section in soup.findAll('a', {'class': 'yt-uix-sessionlink'}): 243 | if item_section['href'] and '/channel/' in item_section['href'] and item_section.contents[0] != '\n': 244 | channel = item_section.contents[0] 245 | channel_id = item_section['href'].split('/channel/')[1] 246 | break 247 | 248 | if channel == '': 249 | print ('WARNING: CHANNEL not found') 250 | 251 | # Recommendations 252 | recos = [] 253 | upnext = True 254 | for video_list in soup.findAll('ul', {'class': 'video-list'}): 255 | if upnext: 256 | # Up Next recommendation 257 | try: 258 | recos.append(video_list.contents[1].contents[1].contents[1]['href'].replace('/watch?v=', '')) 259 | except IndexError: 260 | print ('WARNING Could not get a up next recommendation because of malformed content') 261 | pass 262 | upnext = False 263 | else: 264 | # 19 Others 265 | for i in range(1, 19): 266 | try: 267 | 
recos.append(video_list.contents[i].contents[1].contents[1]['href'].replace('/watch?v=', '')) 268 | except IndexError: 269 | if self._verbose: 270 | print ('Could not get a recommendation because there are not enough') 271 | except AttributeError: 272 | if self._verbose: 273 | print ('WARNING Could not get a recommendation because of malformed content') 274 | 275 | title = '' 276 | for eow_title in soup.findAll('span', {'id': 'eow-title'}): 277 | title = eow_title.text.strip() 278 | 279 | if title == '': 280 | print ('WARNING: title not found') 281 | 282 | if video_id not in self._video_infos: 283 | self._video_infos[video_id] = {'views': views, 284 | 'likes': likes, 285 | 'dislikes': dislikes, 286 | 'recommendations': recos, 287 | 'title': title, 288 | 'depth': depth, 289 | 'id': video_id, 290 | 'channel': channel, 291 | 'key': []} 292 | if self._loopok: 293 | self._video_infos[video_id]['key'].append(key) 294 | 295 | video = self._video_infos[video_id] 296 | print (repr(video_id + ': ' + video['title'] + ' [' + channel + ']{' + repr(key) +'} ' + str(video['views']) + ' views , depth: ' + str(video['depth']))) 297 | # print (repr(video['recommendations'])) 298 | return recos[:nb_recos_wanted] 299 | 300 | def get_n_recommendations(self, seed, branching, depth, key): 301 | if depth is 0: 302 | return [seed] 303 | current_video = seed 304 | all_recos = [seed] 305 | index = 0 306 | for video in self.get_recommendations(current_video, branching, depth, key): 307 | code = chr(index + 97) 308 | all_recos.extend(self.get_n_recommendations(video, branching, depth - 1, key + code)) 309 | index = index + 1 310 | return all_recos 311 | 312 | def compute_all_recommendations_from_search(self, search_terms, search_results, branching, depth): 313 | search_results = self.get_search_results(search_terms, search_results) 314 | print ('Search results ' + repr(search_results)) 315 | 316 | all_recos = [] 317 | ind = 0 318 | for video in search_results: 319 | ind += 1 320 | all_recos.extend(self.get_n_recommendations(video, branching, depth, str(ind))) 321 | print ('\n\n\nNext search: ') 322 | all_recos.extend(search_results) 323 | return all_recos 324 | 325 | def count(self, iterator): 326 | counts = {} 327 | for video in iterator: 328 | counts[video] = counts.get(video, 0) + 1 329 | return counts 330 | 331 | def go_deeper_from(self, search_term, search_results, branching, depth): 332 | all_recos = self.compute_all_recommendations_from_search(search_term, search_results, branching, depth) 333 | counts = self.count(all_recos) 334 | print ('\n\n\nSearch term = ' + search_term + '\n') 335 | print ('counts: ' + repr(counts)) 336 | sorted_videos = sorted(counts, key=counts.get, reverse=True) 337 | return sorted_videos, counts 338 | 339 | def save_video_infos(self, keyword): 340 | print ('Wrote file:') 341 | date = time.strftime('%Y%m%d') 342 | with open('data/video-infos-' + keyword + '-' + date + '.json', 'w') as fp: 343 | json.dump(self._video_infos, fp) 344 | 345 | def try_to_load_video_infos(self): 346 | try: 347 | with open('data/video-infos-' + self._name + '.json', 'r') as fp: 348 | return json.load(fp) 349 | except Exception as e: 350 | print ('Failed to load from graph ' + repr(e)) 351 | return {} 352 | 353 | def count_recommendation_links(self): 354 | counts = {} 355 | for video_id in self._video_infos: 356 | for reco in self._video_infos[video_id]['recommendations']: 357 | counts[reco] = counts.get(reco, 0) + 1 358 | return counts 359 | 360 | def like_ratio_is_computed(self, video): 361 | return 
int(video['likes']) > MIN_LIKES_FOR_LIKE_RATIO
362 | 
363 |     def print_graph(self, links_per_video, only_mature_videos=True):
364 |         """
365 |         Prints a file with a graph containing all videos.
366 |         """
367 |         input_links_counts = self.count_recommendation_links()
368 |         graph = {}
369 |         nodes = []
370 |         links = []
371 |         for video_id in self._video_infos:
372 |             video = self._video_infos[video_id]
373 |             if self.like_ratio_is_computed(video):
374 |                 popularity = video['likes'] / float(video['likes'] + video['dislikes'])
375 |             else:
376 |                 popularity = 0
377 | 
378 |             nodes.append({'id': video_id, 'size': input_links_counts.get(video_id, 0), 'popularity': popularity, 'type': 'circle', 'likes': video['likes'], 'dislikes': video['dislikes'], 'views': video['views'], 'depth': video['depth']})
379 |             link = 0
380 |             for reco in self._video_infos[video_id]['recommendations']:
381 |                 if reco in self._video_infos:
382 |                     links.append({'source': video_id, 'target': reco, 'value': 1})
383 |                     link += 1
384 |                 if link >= links_per_video:
385 |                     break
386 |         graph['nodes'] = nodes
387 |         graph['links'] = links
388 |         with open('./graph-' + self._name + '.json', 'w') as fp:
389 |             json.dump(graph, fp)
390 |         date = time.strftime('%Y-%m-%d')
391 |         with open('./graph-' + self._name + '-' + date + '.json', 'w') as fp:
392 |             json.dump(graph, fp)
393 |         print ('Wrote graph as: ' + './graph-' + self._name + '-' + date + '.json')
394 | 
395 | 
396 |     def print_videos(self, videos, counts, max_length):
397 |         idx = 1
398 |         for video in videos[:max_length]:
399 |             try:
400 |                 current_title = self._video_infos[video]['title']
401 |                 print (str(idx) + ') Recommended ' + str(counts[video]) + ' times: '
402 |                        ' https://www.youtube.com/watch?v=' + video + ' , Title: ' + repr(current_title))
403 |                 if idx % 20 == 0:
404 |                     print ('')
405 |                 idx += 1
406 |             except KeyError:
407 |                 pass
408 | 
409 |     def get_top_videos(self, videos, counts, max_length_count):
410 |         video_infos = []
411 |         for video in videos:
412 |             try:
413 |                 video_infos.append(self._video_infos[video])
414 |                 video_infos[-1]['nb_recommendations'] = counts[video]
415 |             except KeyError:
416 |                 pass
417 | 
418 |         # Computing the average number of recommendations per video:
419 |         # The average is computed only over the top videos, so it is an underestimation of the actual average.
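        # A worked example with hypothetical counts (not taken from the data): if the top
        # videos were recommended 2, 4 and 6 times, avg below works out to 4.0 and their
        # 'mult' values become 0.5, 1.0 and 1.5, i.e. how many times more (or less) often
        # each video was recommended than the average top video.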
420 | if video_infos is []: 421 | return [] 422 | sum_recos = 0 423 | for video in video_infos: 424 | sum_recos += video['nb_recommendations'] 425 | avg = sum_recos / float(len(video_infos)) 426 | for video in video_infos: 427 | video['mult'] = video['nb_recommendations'] / avg 428 | return video_infos[:max_length_count] 429 | 430 | def compare_keywords(session, query, search_results, branching, depth, name, gl, language, recent, loopok): 431 | date = time.strftime('%Y-%m-%d') 432 | file_name = 'results/' + name + '-' + date + '.json' 433 | print ('Running, will save the resulting json to:' + file_name) 434 | top_videos = {} 435 | for keyword in query.split(','): 436 | yf = YoutubeFollower(session, verbose=True, name=keyword, alltime=False, gl=gl, language=language, recent=recent, loopok=loopok) 437 | top_recommended, counts = yf.go_deeper_from(keyword, 438 | search_results=search_results, 439 | branching=branching, 440 | depth=depth) 441 | top_videos[keyword] = yf.get_top_videos(top_recommended, counts, 1000) 442 | yf.print_videos(top_recommended, counts, 50) 443 | yf.save_video_infos(name + '-' + keyword) 444 | 445 | with open(file_name, 'w') as fp: 446 | json.dump(top_videos, fp) 447 | 448 | def main(): 449 | query = 'jared taylor' 450 | name = "taylor" 451 | searches = 2 452 | branch = 2 453 | depth = 10 454 | 455 | #Create (authenticated) Google session and log in to YouTube 456 | session = SessionGoogle() 457 | session.get('https://accounts.google.com/ServiceLogin?continue=https://youtube.com&service=youtube') 458 | 459 | 460 | #file = codecs.open("test.txt","w+", encoding = 'utf-8') 461 | #file.write('Met login testin\'') 462 | #file.write(session.get('https://accounts.google.com/ServiceLogin?continue=https://youtube.com&service=youtube')) 463 | 464 | 465 | #Seasygui.msgbox("Dank je wel voor meewerken aan dit onderzoek! De app zal een ongeveer een half uur nodig hebben om alle resultaten te verzamelen. Als we klaar zijn krijg je nogmaals een venster als deze te zien en staan de resultaten op dezelfde plek als waar je dit script hebt bewaard en aangeklikt.", title="DeCorrespondent Youtube Onderzoek") 466 | 467 | #os.chdir('') 468 | if not os.path.exists('results'): 469 | os.mkdir('results') 470 | print 'made results' 471 | if not os.path.exists('data'): 472 | os.mkdir('data') 473 | 474 | compare_keywords(session, query, searches, branch, depth, name, 'NL', 'NL', False, False) 475 | 476 | if os.path.exists('data'): 477 | shutil.rmtree('data', ignore_errors=True) 478 | 479 | file_name = name + '-' + time.strftime('%Y-%m-%d') + '.json' 480 | 481 | if os.path.exists('results/'+file_name): 482 | shutil.copyfile('results/'+file_name, file_name) 483 | shutil.rmtree('results') 484 | 485 | #easygui.msgbox("We zijn klaar. Dank je wel :)", title="DeCorrespondent Youtube Onderzoek") 486 | 487 | return 0 488 | 489 | if __name__ == "__main__": 490 | sys.exit(main()) 491 | --------------------------------------------------------------------------------
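A note on crawl size for the script above: get_n_recommendations() expands every seed video into a full b-ary tree of recommendations, so the number of watch pages fetched grows geometrically with depth. The sketch below is illustrative only and not part of the repository (crawl_size is a made-up helper name); it works out the arithmetic for the defaults hard-coded in main() (searches = 2, branch = 2, depth = 10), ignoring the caching that kicks in when the same video is reached more than once.

# Rough size of the recommendation crawl driven by get_n_recommendations():
# per seed video it visits 1 + b + b^2 + ... + b^d watch pages.
def crawl_size(branching, depth, search_results=1):
    per_seed = sum(branching ** level for level in range(depth + 1))
    return per_seed * search_results

# Defaults from main(): branch = 2, depth = 10, searches = 2
# -> (2 ** 11 - 1) * 2 = 2047 * 2 = 4094 watch-page requests per keyword (before caching).
print(crawl_size(branching=2, depth=10, search_results=2))  # 4094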