├── TopicModelling ├── TopicSimilarity │ └── readme.txt ├── arguing_lexicon │ ├── lexicon │ │ ├── en │ │ │ ├── patterns │ │ │ │ ├── authority.tff │ │ │ │ ├── wants.tff │ │ │ │ ├── causation.tff │ │ │ │ ├── inyourshoes.tff │ │ │ │ ├── structure.tff │ │ │ │ ├── rhetoricalquestion.tff │ │ │ │ ├── generalization.tff │ │ │ │ ├── possibility.tff │ │ │ │ ├── doubt.tff │ │ │ │ ├── priority.tff │ │ │ │ ├── contrast.tff │ │ │ │ ├── necessity.tff │ │ │ │ ├── inconsistency.tff │ │ │ │ ├── difficulty.tff │ │ │ │ ├── conditionals.tff │ │ │ │ ├── assessments.tff │ │ │ │ └── emphasis.tff │ │ │ ├── macros │ │ │ │ ├── spoken.tff │ │ │ │ ├── wordclasses.tff │ │ │ │ ├── pronoun.tff │ │ │ │ ├── intensifiers.tff │ │ │ │ └── modals.tff │ │ │ └── patterntest.txt │ │ ├── nl │ │ │ ├── macros │ │ │ │ ├── spoken.tff │ │ │ │ ├── wordclasses.tff │ │ │ │ ├── pronoun.tff │ │ │ │ ├── intensifiers.tff │ │ │ │ └── modals.tff │ │ │ └── patterns │ │ │ │ ├── authority.tff │ │ │ │ ├── wants.tff │ │ │ │ ├── structure.tff │ │ │ │ ├── inyourshoes.tff │ │ │ │ ├── causation.tff │ │ │ │ ├── rhetoricalquestion.tff │ │ │ │ ├── generalization.tff │ │ │ │ ├── contrast.tff │ │ │ │ ├── possibility.tff │ │ │ │ ├── necessity.tff │ │ │ │ ├── priority.tff │ │ │ │ ├── difficulty.tff │ │ │ │ ├── doubt.tff │ │ │ │ ├── inconsistency.tff │ │ │ │ ├── conditionals.tff │ │ │ │ ├── emphasis.tff │ │ │ │ └── assessments.tff │ │ └── README for Arguing Lexicon.pdf │ ├── environment.yml │ ├── README.md │ ├── lda.py │ ├── arguing_lexicon.py │ ├── arguing-lexicon-lda.ipynb │ └── arguing-lexicon-filter.ipynb ├── Captions preprocessing │ ├── README.md │ └── Preprocess leftwing.ipynb ├── TopicModelWrapper │ ├── stopwords │ │ ├── dutch │ │ └── english │ ├── visualisations │ │ └── pyLDAvisualisation.py │ ├── StreamingCorpus.py │ ├── StreamingPreprocesser.py │ ├── StreamingParser.py │ └── main.py ├── Top TfIdf │ ├── README.md │ └── Right - tfidf top words.ipynb ├── filterTranscripts.py ├── getWord2VecModel.py ├── TextLemma │ ├── filterTranscripts.py │ └── getTokens.py ├── getTokens.py └── language_detection │ └── spacy-language-detection.ipynb ├── DataCollection ├── tests │ └── youtubecollector │ │ ├── test_utils.py │ │ ├── resources │ │ ├── api_test.conf │ │ ├── comment_minimal.json │ │ ├── comment_full.json │ │ ├── video.json │ │ ├── recommendation.json │ │ ├── video_metadata.json │ │ ├── comment_with_reply.json │ │ ├── nullable_fields_channel_response.json │ │ └── full_channel_response.json │ │ ├── utils_for_test.py │ │ ├── test_recommendations.py │ │ ├── test_videos.py │ │ ├── test_channels.py │ │ └── test_comments.py ├── src │ └── youtubecollector │ │ ├── printer.py │ │ ├── __init__.py │ │ ├── util.py │ │ ├── youtube_client.py │ │ ├── transcripts.py │ │ ├── recommendations.py │ │ ├── channels.py │ │ ├── video.py │ │ └── comments.py ├── api.conf ├── requirements.txt ├── setup.cfg ├── setup.py ├── CONTRIBUTE.md ├── Makefile └── README.md ├── .gitignore ├── RabbitHole ├── config.py └── youtube-onderzoek-jan.py ├── README.md └── Notebooks ├── getting_started.ipynb └── scenariofunctions.py /TopicModelling/TopicSimilarity/readme.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /DataCollection/tests/youtubecollector/test_utils.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: 
-------------------------------------------------------------------------------- 1 | .idea 2 | *.egg-info/ 3 | __pycache__/ 4 | venv/ 5 | .coverage 6 | htmlcov/ -------------------------------------------------------------------------------- /DataCollection/src/youtubecollector/printer.py: -------------------------------------------------------------------------------- 1 | def print_text(): 2 | print("hello Worlds") 3 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/en/patterns/authority.tff: -------------------------------------------------------------------------------- 1 | #class="authority" 2 | according to 3 | -------------------------------------------------------------------------------- /DataCollection/api.conf: -------------------------------------------------------------------------------- 1 | youtube_api_service_name="youtube" 2 | youtube_api_version="v3" 3 | developer_key=None -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/en/macros/spoken.tff: -------------------------------------------------------------------------------- 1 | #class="spoken" 2 | @DYS={uh,um, mm-hmm, uh-huh, huh} -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/nl/macros/spoken.tff: -------------------------------------------------------------------------------- 1 | #class="spoken" 2 | @UH={uh,um, mm-hmm, uh-huh, huh, ehm, euhm, eh} 3 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/nl/patterns/authority.tff: -------------------------------------------------------------------------------- 1 | #class="authority" 2 | volgens 3 | naargelang 4 | overeenkomstig 5 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/nl/patterns/wants.tff: -------------------------------------------------------------------------------- 1 | #class="wants" 2 | (jij|wij|ik) (wil|willen) (niet|misschien|misschien niet) 3 | -------------------------------------------------------------------------------- /DataCollection/tests/youtubecollector/resources/api_test.conf: -------------------------------------------------------------------------------- 1 | youtube_api_service_name="youtube" 2 | youtube_api_version="v3" 3 | developer_key="123" -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/en/patterns/wants.tff: -------------------------------------------------------------------------------- 1 | #class="wants" 2 | (you|we|i) (don\'t )?(want|wanna) 3 | (you|we|i) might (not )?(want|wanna) 4 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/nl/patterns/structure.tff: -------------------------------------------------------------------------------- 1 | #class="structure" 2 | (ten )?eerste? 
3 | ten tweede 4 | verder 5 | (in de )?eerste plaats 6 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/en/patterns/causation.tff: -------------------------------------------------------------------------------- 1 | #class="causation" 2 | so 3 | therefore 4 | because 5 | hence 6 | as a result 7 | consequently -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/en/patterns/inyourshoes.tff: -------------------------------------------------------------------------------- 1 | #class="inyourshoes" 2 | what i would do 3 | if i were you 4 | i would not 5 | i wouldn\'t 6 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/en/patterns/structure.tff: -------------------------------------------------------------------------------- 1 | #class="structure" 2 | first 3 | secondly 4 | first place 5 | in the first place 6 | first of all 7 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/en/patterntest.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/decorrespondent/youtube_extremism/HEAD/TopicModelling/arguing_lexicon/lexicon/en/patterntest.txt -------------------------------------------------------------------------------- /RabbitHole/config.py: -------------------------------------------------------------------------------- 1 | PATH_RESULTS = '' 2 | PATH_DEMOGRAPHICS = '~' 3 | PATH_OUTPUT = '' 4 | DEVELOPER_KEY = '' 5 | YOUTUBE_API_SERVICE_NAME = "youtube" 6 | YOUTUBE_API_VERSION = "v3" 7 | -------------------------------------------------------------------------------- /DataCollection/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | numpy 3 | youtube_dl 4 | google-api-python-client 5 | webvtt-py 6 | jupyter 7 | requests 8 | matplotlib 9 | networkx 10 | seaborn 11 | tqdm 12 | docutils -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/README for Arguing Lexicon.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/decorrespondent/youtube_extremism/HEAD/TopicModelling/arguing_lexicon/lexicon/README for Arguing Lexicon.pdf -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/nl/patterns/inyourshoes.tff: -------------------------------------------------------------------------------- 1 | #class="inyourshoes" 2 | wat ik zou doen 3 | als ik jou was 4 | als ik in jouw schoenen stond 5 | ik zou in jouw (plaats|positie) 6 | ik zou niet 7 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/nl/patterns/causation.tff: -------------------------------------------------------------------------------- 1 | #class="causation" 2 | dus 3 | daarom 4 | omdat 5 | derhalve 6 | resultaat 7 | voortkomend 8 | voortvloeiend 9 | volgend 10 | bijgevolg 11 | zodoende 12 | -------------------------------------------------------------------------------- /DataCollection/src/youtubecollector/__init__.py: -------------------------------------------------------------------------------- 1 | from . import channels 2 | from . import comments 3 | from . 
import recommendations 4 | from . import transcripts 5 | from . import video 6 | from . import youtube_client 7 | from . import printer 8 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/en/patterns/rhetoricalquestion.tff: -------------------------------------------------------------------------------- 1 | #class="rhetoricalquestion" 2 | do (we|you) (actually|really|still) (need|want) 3 | why not 4 | why don\'t (we|you) 5 | what if 6 | (and )?who (wouldn\'t|doesn\'t) (@EMO1V) 7 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/en/patterns/generalization.tff: -------------------------------------------------------------------------------- 1 | #class="generalization" 2 | (everybody|everything|anybody|anything|nobody|nothing) (else|at all) 3 | in the (world|universe) 4 | of all times 5 | in recent memory 6 | in living history 7 | -------------------------------------------------------------------------------- /DataCollection/setup.cfg: -------------------------------------------------------------------------------- 1 | [aliases] 2 | test=pytest 3 | 4 | [tool:pytest] 5 | addopts = --verbose 6 | python_files = tests/*.py 7 | 8 | [coverage:run] 9 | branch=True 10 | source = . 11 | 12 | [coverage:report] 13 | omit = 14 | tests/* 15 | include = 16 | src/youtubecollector/* -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/en/patterns/possibility.tff: -------------------------------------------------------------------------------- 1 | #class="possibility" 2 | you can 3 | we can 4 | you can\'t 5 | you cannot 6 | we can\'t 7 | we cannot 8 | you could 9 | we could 10 | (@BE) able to 11 | there\'s no way (that|for|of|to)? 12 | any way (that|for|of|to)? 13 | no way 14 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/nl/patterns/rhetoricalquestion.tff: -------------------------------------------------------------------------------- 1 | #class="rhetoricalquestion" 2 | (wil|willen) (jij|je|we) (eigenlijk|echt|nog steeds) 3 | (heb|hebben) (jij|je|we) (eigenlijk|echt|nog steeds) nodig 4 | waarom niet 5 | waarom (doe|doen) (jij|je|jullie) niet 6 | wat als 7 | wie (@EMO1V) (nu )? niet 8 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/en/patterns/doubt.tff: -------------------------------------------------------------------------------- 1 | #class="doubt" 2 | (i am|i\'m) not (sure|convinced) 3 | i (don\'t|can\'t|do not|cannot) see how 4 | it (is not|isn\'t) (clear|evident|obvious) (that)? 5 | it\'s not (clear|evident|obvious) (that)? 6 | (we|i) doubt (that)? 
7 | (we|i) (am|are) doubtful 8 | (we\'re|i\'m) doubtful -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/en/macros/wordclasses.tff: -------------------------------------------------------------------------------- 1 | #class="wordclasses" 2 | #emo1=stative, relational, positive 3 | #emo2=stative, relational, negative 4 | @EMO1V={like, adore, want, prefer, love, enjoy} 5 | @EMO1N={like, adoration, want, preference, love, enjoyment} 6 | @EMO2V={hate, dislike, disprefer} 7 | @EMO2N={hate, dislike, dispreference} 8 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/en/patterns/priority.tff: -------------------------------------------------------------------------------- 1 | #class="priority" 2 | important 3 | crucial 4 | key 5 | essential 6 | critical 7 | fundamental 8 | key 9 | major 10 | vital 11 | first and foremost 12 | (now )?remember (that)? 13 | keep in mind (that)? 14 | don\'t forget (that)? 15 | let\'s not forget 16 | let\'s keep in mind 17 | let\'s remember 18 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/nl/patterns/generalization.tff: -------------------------------------------------------------------------------- 1 | #class="generalization" 2 | (iedereen|alles|iemand|iets|niemand|niets) anders 3 | helemaal (niemand|niets) 4 | (in|op) de wereld 5 | in het universum 6 | op aarde 7 | altijd 8 | te allen tijde 9 | van alle tijden 10 | in de recente geschiedenis 11 | recentelijk 12 | in de jongste geschiedenis 13 | sinds mensenheugenis 14 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/en/macros/pronoun.tff: -------------------------------------------------------------------------------- 1 | #class="pronoun" 2 | @I={i, i\'m, i\'ve, i\'ll, i\'d,me} 3 | @YOU={you, you\'re, you\'ve, you\'ll, you\'d} 4 | @HE={he, he\'s, he\'ll, he\'d,him} 5 | @SHE={she, she\'s, she\'ll, she\'d,her} 6 | @WE={we, we\'re, we\'ll, we\'d, we\'ve,us} 7 | @THEY={they, they\'re, they\'ll, they\'d, they\'ve,them} 8 | @PRONSUBJ={i, you, he, she, it, we, they} -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/nl/macros/wordclasses.tff: -------------------------------------------------------------------------------- 1 | #class="wordclasses" 2 | #emo1=subjectief,relationeel, positief 3 | #emo2=subjectief,relationeel, negatief 4 | @EMO1V={adoreert, wilt, prefereert, houdt, geniet} 5 | @EMO1N={leuk, adoreren, willen, voorkeuren, houden van, genieten} 6 | @EMO2V={haten, niet leuk, niet de voorkeur, afkeuren} 7 | @EMO2N={haten, niet leuk, niet de voorkeur, afkeuren} 8 | -------------------------------------------------------------------------------- /TopicModelling/Captions preprocessing/README.md: -------------------------------------------------------------------------------- 1 | # Preprocessing of captions 2 | 3 | The attached ipython notebook cleans the csv files that were created straight from the .vtt files. They contain duplicates, sometimes combined in one line. Furthermore, since .csv does not support the python lists, these are saved as strings. 4 | 5 | **TODO:** Put the cleaning script straight in the script that reads the .vtt files. 
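For reference, reading such a stringified list column back into real Python lists can be done with `ast.literal_eval`, which is the same approach used elsewhere in this repo (e.g. `TopicModelling/filterTranscripts.py`). A minimal sketch; the file name and column name below are placeholders and may differ from the actual captions csv:

```python
import ast

import pandas as pd

# hypothetical file/column names; adjust to the actual captions csv
df = pd.read_csv("captions.csv", encoding="utf-8")

# each cell holds a list that was written to csv as its string representation,
# e.g. "['first caption line', 'second caption line']"
df["transcript"] = df["transcript"].apply(ast.literal_eval)
```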
-------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/en/patterns/contrast.tff: -------------------------------------------------------------------------------- 1 | #class="contrast" 2 | really 3 | actually 4 | as opposed to 5 | instead of 6 | rather than 7 | there (are|is) ([\w]+[ \,]*){1,4} and (then )?there (are|is) 8 | (is|that\'s|it\'s) a whole nother issue 9 | (is|are|that\'s|it\'s) (very|quite|completely|totally )?different 10 | whole new ballgame 11 | (is|that\'s|it\'s) a (separate|different) (issue|question) 12 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/nl/patterns/contrast.tff: -------------------------------------------------------------------------------- 1 | #class="contrast" 2 | er (is|zijn) ([\w]+[ \,]*){1,4} en er (is|zijn|dan is er) 3 | (dat|het) is een (heel)? andere? (probleem|zaak|vraag|issue) 4 | (dat |het )?is (heel |best wel |compleet |totaal |helemaal )?anders 5 | zijn (heel |best wel |compleet |totaal |helemaal )?anders 6 | (compleet|helemaal|heel) anders 7 | is een (aparte|andere kwestie|situatie) 8 | (dat |het )?is een (aparte|andere) (kwestie|situatie) 9 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/en/patterns/necessity.tff: -------------------------------------------------------------------------------- 1 | #class="necessity" 2 | a must 3 | must 4 | essential 5 | indispensable 6 | necessary 7 | (@BE) a necessity 8 | needed 9 | required 10 | requirement 11 | can\'t do without 12 | got to 13 | gotta 14 | had better 15 | hafta 16 | have to 17 | has to 18 | need to 19 | needs to 20 | ought to 21 | oughta 22 | should 23 | (@PRONSUBJ) better 24 | (necesssitates|necessitated|necessitating|necessitate) -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/environment.yml: -------------------------------------------------------------------------------- 1 | name: ml 2 | channels: 3 | - defaults 4 | dependencies: 5 | - ipython 6 | - jupyter_client 7 | - jupyter_core 8 | - nb_conda 9 | - nb_conda_kernels 10 | - notebook 11 | - python=3.6.6 12 | - pip: 13 | - autopep8==1.4 14 | - ipdb==0.10.3 15 | - jupyter==1.0.0 16 | - jupyter-console==5.1.0 17 | - numpy==1.14.2 18 | - pandas==0.19.2 19 | - spacy==2.0.11 20 | - spacy-arguing-lexicon==0.0.2 21 | 22 | -------------------------------------------------------------------------------- /DataCollection/src/youtubecollector/util.py: -------------------------------------------------------------------------------- 1 | import os as _os 2 | 3 | 4 | def is_empty_file(filename: str): 5 | if not isinstance(filename, str): 6 | raise Exception(f"filename should be a string of an existing file") 7 | 8 | if not _os.path.exists(filename): 9 | raise Exception(f"{filename} doesn't exists") 10 | 11 | return _os.stat(filename).st_size == 0 12 | 13 | 14 | def convert_to_dictionary(obj): 15 | return {field: getattr(obj, field) for field in obj._fields} -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/nl/patterns/possibility.tff: -------------------------------------------------------------------------------- 1 | #class="possibility" 2 | (je|jij) (kan|kunt)( niet)? 3 | (we|wij) kunnen( niet)? 
4 | (je|jij) zou kunnen 5 | (we|wij) zouden kunnen 6 | (@ZIJN) in staat tot 7 | in staat zijn tot 8 | (kunnen|kun|kan) 9 | er is geen mogelijkheid (dat|voor|om|tot) 10 | er is geen wijze (om|voor) 11 | er geen manier (dat|voor|om) 12 | een mogelijkheid (dat|voor|om|tot) 13 | een manier (dat|voor|om) 14 | een wijze (om|voor) 15 | geen (manier|mogelijkheid|wijze) 16 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/en/patterns/inconsistency.tff: -------------------------------------------------------------------------------- 1 | #class="inconsistency" 2 | except that 3 | except for 4 | with the exception of 5 | however 6 | nevertheless 7 | that said 8 | that having been said 9 | that being said 10 | despite 11 | in spite of 12 | even so 13 | at the same time 14 | still 15 | wait a minute 16 | hold on a second 17 | hold on a sec 18 | it\'s just that 19 | all well and good 20 | as far as it goes 21 | you might think (that)? 22 | you may think (that)? 23 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/nl/patterns/necessity.tff: -------------------------------------------------------------------------------- 1 | #class="necessity" 2 | een (must|voorwaarde|vereiste) 3 | essentieel 4 | (onvervangbaar|onontbeerlijk) 5 | noodzakelijk 6 | (@ZIJN) (een )?(noodzaak|vereiste|een vereiste|noodzakelijk) 7 | nodig 8 | verplicht 9 | (eis|vereiste|voorwaarde) 10 | kan niet (zijn )?zonder 11 | (moet|moeten|eis) 12 | (hadden|had|konden|kon) beter 13 | nodig hebben 14 | (behoort te|zou moeten) 15 | (zou|zouden) 16 | (@OND) beter 17 | (noodzaken|noodzakelijk|dwingen|noodzakelijk maken) 18 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/nl/macros/pronoun.tff: -------------------------------------------------------------------------------- 1 | #class="pronoun" 2 | @IK={ik, ik ben, ik heb, ik zal, mijn, me, mij} 3 | @JIJ={jij, je, jij bent, je bent, jij hebt, je hebt, jij zal, je zal, jij zult, je zult, jouw} 4 | @HIJ={hij, hij is, hij heeft, hij zal, hem} 5 | @ZE={zij, ze, zij is, ze is, zij heeft, ze heeft, zij zal, ze zal, haar} 6 | @WE={we, wij, we zijn, wij zijn, we hebben, wij hebben, we zullen, wij zullen, ons} 7 | @ZIJ={zij, zij zijn, zij hebben, zij zullen, hun} 8 | @OND={ik, jij, je, hij, zij, het, we, wij, hen, hun} 9 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/nl/patterns/priority.tff: -------------------------------------------------------------------------------- 1 | #class="priority" 2 | (belangrijk|van belang|gewichtig) 3 | cruciaal 4 | sleutelrol 5 | essentieel 6 | kritiek 7 | fundamenteel 8 | (enorm|groot|zeer groot|belangrijk|belangrijkste|voornaamste) 9 | (vitaal|wezenlijk|doorslaggevend) 10 | (in de eerste plaats|allereerst) 11 | onthoud( dat)? 12 | houd in gedachte( dat)? 13 | vergeet (dat )?niet( dat)? 
14 | laten we niet vergeten 15 | laten we in gedachte houden 16 | laten we onthouden 17 | laten we herinneren 18 | herinner je 19 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/en/macros/intensifiers.tff: -------------------------------------------------------------------------------- 1 | #class="intensifier_adv" 2 | @INTENSADV1={absolutely, absurdly, resoundingly, amazingly, awfully, extremely, completely, highly, incredibly, perfectly, quite, really, strikingly, surprisingly, terribly, totally, unbelievably, hugely, unnaturally, unusually, utterly, very, tremendously, spectacularly} 3 | @INTENSADJ1={absolute, extreme, incredible, perfect, phenomenal, spectacular, huge, major, tremendous, complete, considerable, real, terrible, total, unbelievable, utter, great, resounding} 4 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/nl/patterns/difficulty.tff: -------------------------------------------------------------------------------- 1 | #class="difficulty" 2 | (@ZIJN) (@INTENSBIJW)? ?makkelijk 3 | (@ZIJN) (@INTENSBIJW)? ?gemakkelijk 4 | in een zucht 5 | (@ZIJN) (een )?appeltje eitje 6 | (@ZIJN) snel gedaan 7 | (@ZIJN) (@INTENSBIJW)? ?kinderspel 8 | (@ZIJN) (@INTENSBIJW)? ?(lastig|verradelijk|moeilijk|vervelend) 9 | (@ZIJN) niet gemakkelijk 10 | (@ZIJN) moeilijk 11 | (@ZIJN) (@INTENSBIJW)? zwaar 12 | (@ZIJN) een (@INTENSBIJV)? ?uitdaging 13 | (@ZIJN) (@INTENSBIJW)? ?uitdagend 14 | (@HEB) een (@INTENSBIJW)? ?(moeilijke|zware) tijd 15 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/nl/patterns/doubt.tff: -------------------------------------------------------------------------------- 1 | #class="doubt" 2 | ik ben niet overtuigd 3 | ik ben er niet van overtuigd 4 | ik ben niet zeker 5 | ik ben er niet zeker van 6 | ik (weet|zie) niet hoe 7 | ik (kan|zou) niet (zien|weten) hoe 8 | het is niet (helder|duidelijk|overduidelijk|evident)( dat)? 9 | is het niet (helder|duidelijk|overduidelijk|evident)( dat)? 10 | ik betwijfel (dat|ten zeerste|ten zeerste dat)? 11 | (we|wij) betwijfelen (dat|ten zeerste|ten zeerste dat)? 12 | ik (ben in )?twijfel 13 | we twijfelen 14 | ik ben twijfelachtig 15 | (we|wij) zijn twijfelachtig 16 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/nl/patterns/inconsistency.tff: -------------------------------------------------------------------------------- 1 | #class="inconsistency" 2 | behalve (dan )?dat 3 | behalve 4 | met (uitsluiting|uitzondering) van 5 | echter|maar 6 | (niettemin|desondanks|noch|niettegenstaande) 7 | dat gezegd (hebbende|zijnde) 8 | dat zeggende 9 | nu dat gezegd is 10 | ondanks 11 | ongeacht 12 | zelfs dan 13 | tegelijkertijd 14 | nog steeds 15 | wacht (eens )?even 16 | even wachten 17 | het is alleen( dat)? 18 | (allemaal|alles) goed en wel 19 | (zo ver als het gaat|voor zo ver het gaat) 20 | je denkt (dat )?misschien 21 | misschien denk je( dat)? 22 | zou kunnen denken( dat)? 
23 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/en/patterns/difficulty.tff: -------------------------------------------------------------------------------- 1 | #class="difficulty" 2 | (@BE) (@INTENSADV1)?easy 3 | (@BE) a (@INTENSADJ1)?breeze 4 | (@BE) a (@INTENSADJ1)?walk in the park 5 | (@BE) a (@INTENSADJ1)?piece of cake 6 | (@BE) a (@INTENSADJ1)?snap 7 | (@BE) a (@INTENSADJ1)?cinch 8 | (@BE) (@INTENSADJ1)?child's play 9 | (@BE) (@INTENSADV1)?difficult 10 | (@BE) a (@INTENSADJ1)?pain 11 | (@BE) a (@INTENSADJ1)?pain in the (butt|neck|ass) 12 | (@BE) a (@INTENSADJ1)?(bitch|bastard) to 13 | (@BE) no picnic 14 | (@BE) (@INTENSADV1)?tricky 15 | (@BE) (@INTENSADV1)?arduous 16 | (@BE) a (@INTENSADJ1)?challenge 17 | (@BE) (@INTENSADV1)?challenging 18 | (@HAVE) a (@INTENSADV1)?(hard|difficult) time 19 | -------------------------------------------------------------------------------- /DataCollection/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | with open("requirements.txt") as handle: 4 | project_requirements = [line.strip() for line in handle.readlines()] 5 | 6 | test_requirements = ["pytest-runner", "pytest", "coverage"] 7 | 8 | setup(name="youtubecollector", 9 | version="0.1.0", 10 | description="Module for getting data from youtube", 11 | url="https://github.com/CorrespondentData/YouTubeExtremism", 12 | author="De Correspondent", 13 | packages=['youtubecollector'], 14 | package_dir={'': 'src'}, 15 | install_requires=project_requirements, 16 | tests_require=test_requirements, 17 | extras_require={ 18 | "dev": test_requirements 19 | }, 20 | test_suite="tests", 21 | python_requires='>=3' 22 | ) 23 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/en/patterns/conditionals.tff: -------------------------------------------------------------------------------- 1 | #class="conditionals" 2 | if (we|you) want to ([\w]+[ \,]+){1,7}(we|you) (need to|must|have to) 3 | (we|you) ([\w ,]+) (must|have to|need to) ([\w]+[ \,]+){1,7}if (you|we) want to 4 | it would be ([\w]+[ \,]+){0,2}nice if 5 | wouldn\'t it be ([\w]+[ \,]+){0,2}nice if 6 | if ([\w]+[ \,]+){3,8} that would be ([\w]+[ \,]+){0,2}nice 7 | (cannot|will not|won\'t|can\'t) ([\w]+[ \,]+){1,7}(if|unless) 8 | (if|unless) ([\w]+[ \,]+){3,10}(cannot|will not|won\'t|can\'t) 9 | (need|needs|must|has to|have to) ([\w]+[ \,]+){3,10}(in order )to 10 | (in order )?to ([\w]+[ \,]+){3,10}(need|needs|must|has to|have to) 11 | as long as (we|you) ([\w]+[ \,]+){3,10}(will|can|able|should|[a-zA-Z]+\'ll) 12 | ([a-zA-Z]\'ll|will|can|able|should) ([\w]+[ \,]+){3,10}as long as (we|you) 13 | (you|he|we) better ([\w]+[ \,]+){3,10}or 14 | otherwise -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/nl/macros/intensifiers.tff: -------------------------------------------------------------------------------- 1 | #class="intensifier_adv" 2 | @INTENSBIJW={absoluut, zeker weten, extreem, absurd, laiwaaiig, luidruchtig, fenomenaal, spectaculair, enorm, groot, overweldigend, compleet, aanzienlijk, echt, verbazingwekkend, verschrikkelijk, vreselijk, totaal, ongelooflijk, volledig, groots, daverend, vervelend, compleet, hoog, ongelooflijk, perfect, treffend, sprekend, erg, ongelooflijk, onnatuurlijk, ongewoon, finaal, gigantisch, geweldig, gek genoeg, een beetje, enigszins, 
helemaal} 3 | @INTENSBIJV={absolute, zekere, extreme, absurde, laiwaaiige, luidruchtige, fenomenale, spectaculaire, enorme, grote, overweldigende, complete, aanzienlijke, echte, verbazingwekkende, verschrikkelijke, vreselijke, totale, ongelooflijke, volledige, grootse, daverende, vervelende, complete, hoge, ongelooflijke, perfecte, treffende, sprekende, erge, ongelooflijke, onnatuurlijke, finale, gigantische, geweldige} 4 | 5 | 6 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/README.md: -------------------------------------------------------------------------------- 1 | Arguing Lexicon Filter 2 | ====================== 3 | 4 | Filters transcripts to only include text that contains argument lexicon. 5 | For details about this lexicon please [read here](https://github.com/fako/spacy_arguing_lexicon#how-it-works). 6 | 7 | 8 | Prerequisites 9 | ------------- 10 | 11 | * Conda 12 | 13 | 14 | Installation 15 | ------------ 16 | 17 | Make sure you are with a terminal inside ```arguing_lexicon```. Then setup your environment with: 18 | 19 | ```conda env create -f environment.yml``` 20 | 21 | Original data was the [captions_metadata.csv](https://drive.google.com/drive/folders/13f2fYPIsiednDBTMhd7rvCyikD_R6405) from the right_wing folder. 22 | Place the data you want to work with inside the ```data``` folder and make sure it uses the same columns as the original data. 23 | Note that it simply copies most columns and only really needs the ```content``` column. 24 | 25 | After that you should be able to start a Jupyter session with: 26 | 27 | ```jupyter notebook``` 28 | 29 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/nl/patterns/conditionals.tff: -------------------------------------------------------------------------------- 1 | #class="conditionals" 2 | als (we|jij|je) (willen|wil) ([\w]+[ \,]+){1,7} (moeten|moet|zullen|zal) (we|jij|je) (moeten)? 3 | (we|jij|je)([\w ,]+) (moeten|moet|zullen|zal) ([\w]+[ \,]+){1,7} als (we|jij|je) (willen|wil|wilt) 4 | het zou ([\w]+[ \,]+){0,2} (fijn|prettig) zijn 5 | zou het niet ([\w]+[ \,]+){0,2} (fijn|prettig) zijn 6 | (als|wanneer) ([\w]+[ \,]+){2,8} dat zou ([\w]+[ \,]+){0,2} (fijn|prettig) zijn 7 | (kan niet|zal niet)([\w]+[ \,]+){1,7} (als|tenzij|mits|behalve als) 8 | (als|tenzij|mits|behalve)([\w]+[ \,]+){2,10} (kan|zal) (het|dat|dit) niet 9 | (moet|moeten|zal moeten|moet hebben|zal moeten hebben) ([\w]+[ \,]+){3,10} om (te)? 10 | om te ([\w]+[ \,]+){3,10} (moet|moeten|zal moeten) 11 | zo lang als (we|jij|je) ([\w]+[ \,]+){3,10} (zullen|zal|kunnen|kan|in staat zijn tot|zouden|zou) 12 | (zal|zullen|zou|zouden|kan|kunnen|zou|zouden) in staat zijn tot ([\w]+[ \,]+){3,10} zo lang als (we|jij) 13 | (jij|jij|hij|we) (zou|zouden) beter ([\w]+[ \,]+){3,10}of 14 | anders dan 15 | -------------------------------------------------------------------------------- /DataCollection/CONTRIBUTE.md: -------------------------------------------------------------------------------- 1 | ## Coding style 2 | 1. try to stick to **PEP-8** code guidelines 3 | 1. 
To keep the api surface as small as possible make functions and import private (using `_` prefix) 4 | 5 | ## Todo 6 | 7 | [ ] After an error halfway through a given list, the code should be able to 8 | restart after the last successful 9 | [ ] When an api key hits it limits, the next possible key should be used 10 | [ ] Add tests to the package 11 | [ ] Add google api config to readme 12 | [ ] Add documentation on domain and getting started using documentation generation 13 | * **Implementation note** we could use [sphinx](http://www.sphinx-doc.org/en/stable/) 14 | 15 | [ ] Provide a better user experience with progress bar 16 | * **Implementation note**: we could use the [tqdm package](https://github.com/tqdm/tqdm) 17 | 18 | [ ] Be able to rerun the scraping and only add the new finds 19 | * **Implementation note** requires a way to perform a delta, possible filter solution space using a last run date 20 | 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /TopicModelling/TopicModelWrapper/stopwords/dutch: -------------------------------------------------------------------------------- 1 | de 2 | en 3 | van 4 | ik 5 | te 6 | dat 7 | die 8 | in 9 | een 10 | hij 11 | het 12 | niet 13 | zijn 14 | is 15 | was 16 | op 17 | aan 18 | met 19 | als 20 | voor 21 | had 22 | er 23 | maar 24 | om 25 | hem 26 | dan 27 | zou 28 | of 29 | wat 30 | mijn 31 | men 32 | dit 33 | zo 34 | door 35 | over 36 | ze 37 | zich 38 | bij 39 | ook 40 | tot 41 | je 42 | mij 43 | uit 44 | der 45 | daar 46 | haar 47 | naar 48 | heb 49 | hoe 50 | heeft 51 | hebben 52 | deze 53 | u 54 | want 55 | nog 56 | zal 57 | me 58 | zij 59 | nu 60 | ge 61 | geen 62 | omdat 63 | iets 64 | worden 65 | toch 66 | al 67 | waren 68 | veel 69 | meer 70 | doen 71 | toen 72 | moet 73 | ben 74 | zonder 75 | kan 76 | hun 77 | dus 78 | alles 79 | onder 80 | ja 81 | eens 82 | hier 83 | wie 84 | werd 85 | altijd 86 | doch 87 | wordt 88 | wezen 89 | kunnen 90 | ons 91 | zelf 92 | tegen 93 | na 94 | reeds 95 | wil 96 | kon 97 | niets 98 | uw 99 | iemand 100 | geweest 101 | andere 102 | -------------------------------------------------------------------------------- /DataCollection/tests/youtubecollector/utils_for_test.py: -------------------------------------------------------------------------------- 1 | import json 2 | from os.path import join, split 3 | 4 | from googleapiclient.discovery import build 5 | from googleapiclient.http import HttpMockSequence 6 | 7 | 8 | def get_file_from_test_resource(filename): 9 | return join(split(__file__)[0], "resources", filename) 10 | 11 | 12 | def get_content_from_file(filename): 13 | with open(get_file_from_test_resource(filename)) as handle: 14 | return handle.read() 15 | 16 | 17 | def read_json_from_file(filename): 18 | with open(get_file_from_test_resource(filename)) as json_file: 19 | return json.loads(json_file.read()) 20 | 21 | 22 | def create_test_client_with_response(response_json_file, status_code): 23 | test_response = get_content_from_file(response_json_file) 24 | service_json = get_content_from_file("youtube_service.json") 25 | url = HttpMockSequence([ 26 | ({'status': '200'}, service_json), 27 | ({'status': str(status_code)}, test_response) 28 | ]) 29 | 30 | return build("youtube", "v3", http=url, developerKey="key") 31 | -------------------------------------------------------------------------------- /DataCollection/src/youtubecollector/youtube_client.py: -------------------------------------------------------------------------------- 1 | import 
getpass as _getpass 2 | import json as _json 3 | 4 | from googleapiclient.discovery import build as _build 5 | from googleapiclient.errors import HttpError 6 | 7 | 8 | def create_youtube_client(api_config_filename): 9 | youtube_api_service_name, youtube_api_version,developer_key = _get_api_config(api_config_filename) 10 | if developer_key is None: 11 | developer_key = _getpass.getpass("Google Developer Api key: ") 12 | try: 13 | return _build(youtube_api_service_name, youtube_api_version, developerKey=developer_key) 14 | except HttpError as e: 15 | print(f"Failed to connect due to {_json.loads(e.content)['error']['errors'][0]['reason']}") 16 | 17 | 18 | def _get_api_config(api_config_filename): 19 | with open(api_config_filename) as handle: 20 | config_local_vars = {} 21 | exec(handle.read(), {},config_local_vars) 22 | return (config_local_vars['youtube_api_service_name'], 23 | config_local_vars['youtube_api_version'], 24 | config_local_vars['developer_key']) 25 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/en/patterns/assessments.tff: -------------------------------------------------------------------------------- 1 | #class="assessments" 2 | (our|my) (opinion|understanding) (is|was) that 3 | it (is|was) (our|my) (opinion|understanding) (that)? 4 | in (our|my) opinion 5 | (our|my) take on 6 | it (seems|seemed) to (us|me) (that)? 7 | it (seems|seemed) (that)? 8 | it would seem to (us|me)? 9 | it would appear to (us|me)? 10 | it appears to (us|me)? 11 | (the|my|our) ([\w]+[ ])?point is (that)? 12 | (the|my|our) ([\w]+[ \,]*){1,2} point is (that)? 13 | it (looks|looked) to (us|me) (as if|like) 14 | it (looks|looked) (as if|like|that way) 15 | (we|i) (have|get|got) the impression (that)? 16 | (our|my) impression (was|is) (that)? 17 | in (our|my) book 18 | to (our|my) mind 19 | to (our|my) way of thinking 20 | as far as (I am|I was|we are|we were) concerned 21 | if you ask (me|us) 22 | (our|my) feeling (is|was|would be) 23 | from where (I\'m|I am) (standing|sitting) 24 | (we|I) (don\'t)? think (that)? 25 | all (we\'re|I\'m) saying is 26 | what (I\'m|we\'re) saying is 27 | (we\'re|I\'m) (not)? saying that 28 | what (we\'re|i\'m) trying to say is 29 | what (we|i) mean is (that)? -------------------------------------------------------------------------------- /DataCollection/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: help guard-virtual-env basedependencies env clean 2 | 3 | guard-virtual-env: ## Exits if python virtual environment is not set 4 | @ if [ "${VIRTUAL_ENV}" = "" ]; then \ 5 | echo "Virtual environment is not active. Activate with 'source venv/bin/activate'"; \ 6 | exit 1; \ 7 | fi 8 | 9 | clean: ## Removes all build artifacts 10 | for build_dir_glob in \ 11 | build \ 12 | dist \ 13 | .eggs \ 14 | *.egg-info \ 15 | *.pyc \ 16 | *.pyo \ 17 | *~ \ 18 | __pycache__ \ 19 | .pytest_cache \ 20 | .coverage \ 21 | htmlcov; \ 22 | do find . 
-name "$$build_dir_glob" -exec rm -fr {} +; done 23 | 24 | basedependencies: guard-virtual-env clean 25 | pip3 install --upgrade pip setuptools wheel 26 | 27 | env: 28 | pip3 install --upgrade virtualenv 29 | virtualenv venv 30 | 31 | dependencies: basedependencies 32 | pip3 install -r requirements.txt 33 | 34 | development: dependencies 35 | pip3 install -e .[dev] 36 | 37 | test: clean development 38 | python3 setup.py test 39 | 40 | coverage: clean development 41 | coverage run -m pytest 42 | coverage report 43 | coverage html 44 | open htmlcov/index.html 45 | 46 | install: dependencies 47 | python3 setup.py install 48 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/nl/patterns/emphasis.tff: -------------------------------------------------------------------------------- 1 | #class="emphasis" 2 | (duidelijk|helder) 3 | (klaarblijkelijk|overduidelijk|vanzelfsprekend|kennelijk|onomstreden) 4 | blijkbaar 5 | als je er (echt )?over nadenkt 6 | het is ((echt|aardig|best wel) )?(duidelijk|evident|helder|vanzelfsprekend)( dat)? 7 | (definitief|absoluut|beslist) 8 | ik moet (zeggen|bekennen) 9 | ik zou moeten (zeggen|bekennen) 10 | (zeker|stellig) 11 | (zeker weten|ongetwijfeld|stellig) 12 | (@ZIJN) (zeker|stellig overtuigd|zelfverzekerd)( dat)? 13 | natuurlijk 14 | geen twijfel mogelijk 15 | zonder twijfel over 16 | ongetwijfeld 17 | zonder (enige )?twijfel 18 | ik weet zeker( dat)? 19 | ik (twijfel|betwijfel) niet( dat)? 20 | wedden( dat)? 21 | ik wed( dat)? 22 | (het|de) enige (ding|probleem|punt|vraag) (@MISS)? ?(@ZIJN)( dat)? 23 | mijn (gevoel|intuïtie|intuitie) zegt( dat)? 24 | daarom 25 | dat is waarom 26 | het idee (hier )?is( dat)? 27 | (mijn|het) (hele punt|vraag) is 28 | wat je moet doen is 29 | de reden( hiervoor)? Is( dat)? 30 | dit is wat 31 | hier is wat 32 | exact 33 | precies 34 | (@GA) 35 | (@GANEG) 36 | (@GANEGPER) 37 | (@GAPER) 38 | wat (er )?gaat gebeuren is 39 | wat gebeurt is 40 | wat zal gaan gebeuren is 41 | ik wil (highlighten|benadrukken|onderstrepen) 42 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/en/patterns/emphasis.tff: -------------------------------------------------------------------------------- 1 | #class="emphasis" 2 | clearly 3 | obviously 4 | patently 5 | when you (really )?think about it 6 | (it is|it\'s) ((really|pretty) )?(obvious|evident|clear) (that)? 7 | definitely 8 | i have to say 9 | i\'ve got to say 10 | i\'ve gotta say 11 | i should say 12 | surely 13 | for sure 14 | (@BE) ((sure)|(certain)|(confident)) (that)? 15 | of course 16 | no doubt about it 17 | doubtless 18 | without a doubt 19 | I have no doubt (that)? 20 | I bet (that)? 21 | (@BE) bound to 22 | no two ways about it 23 | there ((is)|(are)) no two ways about it 24 | there\'s no two ways about it 25 | ((the)|(one)) ((thing)|(issue)|(question)|(problem)) (@MODAL )?(@BE) (that)? 26 | my feeling is (that)? 27 | that\'s why 28 | that is why 29 | the idea (here )?is (that)? 30 | ((my)|(the)) whole ((point)|(question)) is 31 | what you have to do is 32 | the reason is (that)? 
33 | here\'s what 34 | here is what 35 | exactly 36 | precisely 37 | (@GONNA) 38 | (@GONNANEG) 39 | (@GONNANEGCL) 40 | (@GONNACL) 41 | what will happen is 42 | what\'ll happen is 43 | what\'s ((gonna)|(going to)) happen is 44 | what is ((gonna)|(going to)) happen is 45 | i want to (highlight|emphasize|underscore) 46 | 47 | -------------------------------------------------------------------------------- /TopicModelling/Top TfIdf/README.md: -------------------------------------------------------------------------------- 1 | # Top TfIdf terms per channel/per year 2 | 3 | > Inspiration/example: https://pudding.cool/2017/09/hip-hop-words/ 4 | 5 | IPython notebook in this folder takes a cleaned csv of transcripts and merges the texts per channel per year into one document. Then it simply takes the top TfIdf words for each new document, so the channels can be compared over time. 6 | 7 | ## TfIdf options 8 | 9 | - cutoff point was chosen on occurrence in one in 50, because channels span a lot of topics. 10 | - Instead of linear term frequency (10 occurrences -> tf = 10), I followed the pudding in using sublinear term frequency (10 occurrences -> tf = 1 + log(9)). The basic idea is that a linear increase in use of a term does not linearly increase their importance. In terms of results, the linear term frequency yields a list of stop words per document. The sublinear tf returns a much more meaningful list. 11 | - **TODO:** Lemmatize the words. I didn't have it available at the time, but knew someone else was working on it. As a result, I waited for the lemmatized set. 12 | 13 | ## Notebook contents 14 | 15 | 1. Top 10 per document 16 | 2. Top 100 in order to build networks of similarity. The networks this yields, however, are rather heavy, so that still needs finetuning. -------------------------------------------------------------------------------- /DataCollection/tests/youtubecollector/test_recommendations.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from utils_for_test import read_json_from_file 4 | from youtubecollector.recommendations import convert_to_recommendations, recommendation 5 | 6 | 7 | class RecommendationsTest(TestCase): 8 | 9 | def test_get_full_recommendation(self): 10 | response = read_json_from_file("recommendation.json") 11 | actual = convert_to_recommendations(response,"id_of_video") 12 | 13 | expected = [ 14 | recommendation(video_id="id_of_video", 15 | target_video_id="id of first target video", 16 | published_at='2018-01-01T01:01:01.000Z', 17 | channel_id='channel Id of first video', 18 | video_title='title of first video', 19 | video_description='Description of first recommendation'), 20 | recommendation(video_id='id_of_video', 21 | target_video_id='id of second target video', 22 | published_at='2018-10-10T10:10:10.000Z', 23 | channel_id='channel id of second video', 24 | video_title='title of second video', 25 | video_description='description of second video') 26 | ] 27 | self.assertEqual(actual, expected) -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/nl/patterns/assessments.tff: -------------------------------------------------------------------------------- 1 | #class="assessments" 2 | (onze|ons|mijn) (mening|idee|inzicht) (is|was) dat 3 | het (is|was) (mijn|onze|ons) (mening|idee|inzicht) (dat)? 4 | naar (onze|mijn|ons) (mening|idee) 5 | (vanuit)? (ons|mijn) standpunt 6 | het (lijkt|leek) (ons|mij) (dat)? 
7 | het (lijkt|leek) (dat)? 8 | het oogt alsof 9 | het komt op (ons|mij) over 10 | het zou (ons|mij) lijken 11 | het lijkt (ons|mij) 12 | het lijkt (erop)? 13 | (het|mijn|ons)([\w]+[ ])?punt is (dat)? 14 | (het|mijn|ons)([\w]+[ \,]*){1,2} punt is (dat)? 15 | het (lijkt|leek) (mij|ons) (dat|alsof) 16 | het (lijkt|leek) (alsof|dat|net alsof) 17 | (we|wij|ik)(hebben|heb|krijgen|krijg|hadden|had|kregen|kreeg) (het idee|het gevoel|de indruk|de impressie) (dat)? 18 | (onze|mijn|ons) (indruk|idee|impressie|gevoel) (was|is) (dat)? 19 | volgens (onze|mijn) normen 20 | in (onze|mijn) ogen 21 | mijns? inziens 22 | naar (ons|onze|mijn) (idee|mening) 23 | voor zo ver (het)? (mij|ons) (aangaat|betreft) 24 | wat (mij|ons) betreft 25 | als je het (mij|ons) vraagt 26 | (wij|we|ik) (hebben|heb) het gevoel (dat)? 27 | vanuit (mijn|ons|onze) (standpunt|oogpunt|visie) 28 | (we|wij|ik) (denken|denk) niet (dat)? 29 | het enige (wat|dat) (we|wij|ik) (zeggen|zeg|bedoelen|bedoel) is 30 | wat (we|wij|ik) (zeggen|zeg|bedoelen|bedoel) is 31 | (we|wij|ik) (zeggen|zeg) dat 32 | (we|wij|ik) (zeggen|zeg) niet dat 33 | wat (we|wij|ik) (proberen|probeer) te zeggen is 34 | wat (we|wij|ik) (bedoelen|bedoel) is (dat)? 35 | -------------------------------------------------------------------------------- /DataCollection/tests/youtubecollector/resources/comment_minimal.json: -------------------------------------------------------------------------------- 1 | { 2 | "kind": "youtube#commentThreadListResponse", 3 | "etag": "\"XpPGQXPnxQJhLgs6enD_n8JR4Qk/M13kILBSDXHZmf82KpKIUu78oro\"", 4 | "pageInfo": { 5 | "totalResults": 5, 6 | "resultsPerPage": 100 7 | }, 8 | "items": [ 9 | { 10 | "kind": "youtube#commentThread", 11 | "etag": "\"XpPGQXPnxQJhLgs6enD_n8JR4Qk/6WkG1Db9T-En4DWmNTTBdki5WTk\"", 12 | "id": "The comment id that is used", 13 | "snippet": { 14 | "videoId": "the video id", 15 | "topLevelComment": { 16 | "kind": "youtube#comment", 17 | "etag": "\"XpPGQXPnxQJhLgs6enD_n8JR4Qk/RUV3jLsQcA6PsDjkr3q5GkpAOmI\"", 18 | "id": "Also the comment id but not used", 19 | "snippet": { 20 | "authorDisplayName": "Author name", 21 | "authorProfileImageUrl": "example.com/photo.jpg", 22 | "authorChannelUrl": "http://www.youtube.com/channel/someone", 23 | "videoId": "a video id", 24 | "textDisplay": "The text that is displayed", 25 | "textOriginal": "text that is not shown,", 26 | "canRate": true, 27 | "viewerRating": "none", 28 | "likeCount": 4, 29 | "publishedAt": "2017-11-02T19:25:12.000Z", 30 | "updatedAt": "2017-11-02T19:25:12.000Z" 31 | } 32 | }, 33 | "canReply": true, 34 | "totalReplyCount": 0, 35 | "isPublic": true 36 | } 37 | } 38 | ] 39 | } -------------------------------------------------------------------------------- /TopicModelling/TopicModelWrapper/visualisations/pyLDAvisualisation.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | # Gensim 5 | from gensim import corpora, models, similarities 6 | from gensim.corpora import Dictionary 7 | 8 | # Plotting tools 9 | import pyLDAvis 10 | import pyLDAvis.gensim 11 | 12 | # Enable logging for gensim - optional 13 | import logging 14 | import warnings 15 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR) 16 | warnings.filterwarnings("ignore", category=DeprecationWarning) 17 | 18 | 19 | def main(): 20 | root = os.getcwd() 21 | model_name = 'captions_right' 22 | topic_num = 50 23 | 24 | model_path = '{}/models/{}_{}'.format(root, model_name, topic_num) 25 | 26 | corpus = 
corpora.MmCorpus('{}/{}.mm'.format(model_path, model_name)) 27 | lda = models.LdaMulticore.load('{}/{}.lda'.format(model_path, model_name)) 28 | dictionary = Dictionary.load('{}/{}.dict'.format(model_path, model_name)) 29 | 30 | t1 = time.time() 31 | print('Starting preparation of LDAvis visualisation') 32 | 33 | # # Load gensim data to prepare for visualization 34 | prepared_data = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False) 35 | 36 | # Save visualisation to HTML file 37 | pyLDAvis.save_html(prepared_data, os.path.join(model_path, '{}_LDAvis.html'.format(model_name))) 38 | 39 | t2 = time.time() 40 | print('LDAvis visualisation successful! Time elapsed: {}\n'.format(t2 - t1)) 41 | 42 | 43 | if __name__ == '__main__': 44 | main() 45 | -------------------------------------------------------------------------------- /DataCollection/tests/youtubecollector/resources/comment_full.json: -------------------------------------------------------------------------------- 1 | { 2 | "kind": "youtube#commentThreadListResponse", 3 | "etag": "\"XpPGQXPnxQJhLgs6enD_n8JR4Qk/M13kILBSDXHZmf82KpKIUu78oro\"", 4 | "pageInfo": { 5 | "totalResults": 5, 6 | "resultsPerPage": 100 7 | }, 8 | "items": [ 9 | { 10 | "kind": "youtube#commentThread", 11 | "etag": "\"XpPGQXPnxQJhLgs6enD_n8JR4Qk/6WkG1Db9T-En4DWmNTTBdki5WTk\"", 12 | "id": "The comment id that is used", 13 | "snippet": { 14 | "videoId": "the video id", 15 | "topLevelComment": { 16 | "kind": "youtube#comment", 17 | "etag": "\"XpPGQXPnxQJhLgs6enD_n8JR4Qk/RUV3jLsQcA6PsDjkr3q5GkpAOmI\"", 18 | "id": "comment id that is not used", 19 | "snippet": { 20 | "authorDisplayName": "Author name", 21 | "authorProfileImageUrl": "example.com/photo.jpg", 22 | "authorChannelUrl": "http://www.youtube.com/channel/someone", 23 | "authorChannelId": { 24 | "value": "someone" 25 | }, 26 | "videoId": "some video id", 27 | "textDisplay": "The text that is displayed", 28 | "textOriginal": "text that is not shown", 29 | "canRate": true, 30 | "viewerRating": "none", 31 | "likeCount": 4, 32 | "disLikeCount": 2, 33 | "publishedAt": "2017-11-02T19:25:12.000Z", 34 | "updatedAt": "2017-11-02T19:25:12.000Z" 35 | } 36 | }, 37 | "canReply": true, 38 | "totalReplyCount": 0, 39 | "isPublic": true 40 | } 41 | } 42 | ] 43 | } -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lda.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append("../") 3 | 4 | import pickle 5 | 6 | import pandas as pd 7 | from sklearn.feature_extraction.text import CountVectorizer 8 | from sklearn.decomposition import LatentDirichletAllocation 9 | 10 | # Lazy data reader into DataFrame 11 | def read_argument_captions(): 12 | transcripts_reader = pd.read_csv("data/captions_arguments.csv", chunksize=10) 13 | for batch in transcripts_reader: 14 | for ix, caption in batch.iterrows(): 15 | text = "" 16 | for fragment, argument_label in zip(str(caption["content"]).split("\n"), str(caption["argument_labels"]).split("\n")): 17 | if argument_label: 18 | text += fragment + " " 19 | yield text 20 | 21 | 22 | with open("models/vectorizer.pkl", "rb") as count_file: 23 | vectorizer = pickle.load(count_file) 24 | with open("models/vectorizer_matrix.pkl", "rb") as matrix_file: 25 | matrix = pickle.load(matrix_file) 26 | 27 | lda_model = LatentDirichletAllocation(n_topics=50, max_iter=500, verbose=3, n_jobs=-1, learning_method="online") 28 | lda_model.fit(matrix) 29 | 30 | # Saving progress 31 | 
with open("models/lda.50.pkl", "wb") as lda_file: 32 | pickle.dump(lda_model, lda_file) 33 | 34 | def print_top_words(model, feature_names, n_top_words): 35 | for topic_idx, topic in enumerate(model.components_): 36 | print("Topic #%d:" % topic_idx) 37 | print(" | ".join([feature_names[i] 38 | for i in topic.argsort()[:-n_top_words - 1:-1]])) 39 | print() 40 | print() 41 | print() 42 | 43 | print_top_words(lda_model, feature_names, 50) 44 | -------------------------------------------------------------------------------- /DataCollection/tests/youtubecollector/test_videos.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from utils_for_test import read_json_from_file, create_test_client_with_response 4 | from youtubecollector.video import convert_to_videos, video 5 | 6 | 7 | class VideoTest(TestCase): 8 | 9 | def test_get_video(self): 10 | client = create_test_client_with_response("video_metadata.json", "200") 11 | 12 | response = read_json_from_file("video.json") 13 | actual = convert_to_videos(response, client) 14 | 15 | expected = [ 16 | video(video_id='The id of the video', 17 | video_published='2015-01-01T01:01:01.000Z', 18 | channel_id='Id of the channel', 19 | video_title='The title of the video', 20 | video_description='Description of the video', 21 | video_channel_title='Title of the channel', 22 | video_tags=['tag one', 'tag two'], 23 | video_category_id='1', 24 | video_default_language='not set', 25 | video_duration='PT3M3S', 26 | video_view_count='120', 27 | video_comment_count='564', 28 | video_likes_count='231', 29 | video_dislikes_count='342', 30 | video_topic_ids=[ 31 | 'Relevant topics ids 1', 32 | 'Relevant topics ids 2' 33 | ], 34 | video_topic_categories=['https://en.wikipedia.org/wiki/Television_program', 35 | 'https://en.wikipedia.org/wiki/Society']) 36 | ] 37 | 38 | self.assertEqual(actual, expected) 39 | -------------------------------------------------------------------------------- /TopicModelling/filterTranscripts.py: -------------------------------------------------------------------------------- 1 | import getTokens 2 | import pandas as pd 3 | import ast 4 | import re 5 | import getWord2VecModel 6 | import numpy as np 7 | from gensim.models import Word2Vec 8 | 9 | def trainW2vTranscripts(): 10 | """ train a column of strings to a word2vec model""" 11 | df = pd.read_csv('data/captions-filtered.csv', encoding='utf-8') 12 | model = getWord2VecModel.getWord2Vec(train=df['transcript_clean']) 13 | model.most_similar(positive=['muslim']) 14 | 15 | def cleanTranscripts(): 16 | """ filter the transcripts by removing stopwords and stemming """ 17 | df = pd.read_csv('data/captions-clean.csv', encoding='utf-8') 18 | df['transcript_clean'] = np.nan 19 | datalength = len(df) 20 | print(df.head()) 21 | li_transcripts = ['n'] * len(df) 22 | for index, transcript in enumerate(df['transcript']): 23 | transcript_clean = ast.literal_eval(transcript) 24 | transcript_clean = getTokens.getTokens(li_strings=(ast.literal_eval(transcript)), lemmatizing=True) 25 | li_transcripts[index] = transcript_clean 26 | if index % 200 == 0: 27 | df['transcript_clean'] = li_transcripts 28 | df.to_csv('data/captions-filtered.csv', encoding='utf-8') 29 | print('Completed video ' + str(index) + '/' + str(datalength)) 30 | 31 | def removeDuplicateEntries(): 32 | """ remove the duplicate transcript entries """ 33 | df = pd.read_csv('data/captions.csv', encoding='utf-8') 34 | df.columns = ['id', 'transcript'] 35 | 36 | 
li_transcripts = [] 37 | for transcript in df['transcript']: 38 | li_transcript = ast.literal_eval(transcript) 39 | li_transcript = li_transcript[0::3] 40 | li_transcripts.append(li_transcript) 41 | df['transcript'] = li_transcripts 42 | 43 | df.to_csv('data/captions-clean.csv', encoding='utf-8') -------------------------------------------------------------------------------- /DataCollection/tests/youtubecollector/resources/video.json: -------------------------------------------------------------------------------- 1 | { 2 | "kind": "youtube#playlistItemListResponse", 3 | "etag": "etag", 4 | "prevPageToken": "token", 5 | "pageInfo": { 6 | "totalResults": 258, 7 | "resultsPerPage": 50 8 | }, 9 | "items": [ 10 | { 11 | "kind": "youtube#playlistItem", 12 | "etag": "Etag", 13 | "id": "An id that isn't used", 14 | "snippet": { 15 | "publishedAt": "2015-01-01T01:01:01.000Z", 16 | "channelId": "Id of the channel", 17 | "title": "The title of the video", 18 | "description": "Description of the video", 19 | "thumbnails": { 20 | "default": { 21 | "url": "https://image.jpg", 22 | "width": 120, 23 | "height": 90 24 | }, 25 | "medium": { 26 | "url": "https:image.jpg", 27 | "width": 320, 28 | "height": 180 29 | }, 30 | "high": { 31 | "url": "https://image.jpg", 32 | "width": 480, 33 | "height": 360 34 | }, 35 | "standard": { 36 | "url": "https://image.jpg", 37 | "width": 640, 38 | "height": 480 39 | }, 40 | "maxres": { 41 | "url": "https://image.jpg", 42 | "width": 1280, 43 | "height": 720 44 | } 45 | }, 46 | "channelTitle": "Title of the channel", 47 | "playlistId": "id of playlist", 48 | "position": 250, 49 | "resourceId": { 50 | "kind": "youtube#video", 51 | "videoId": "video Id that isn't used" 52 | } 53 | }, 54 | "contentDetails": { 55 | "videoId": "The id of the video", 56 | "videoPublishedAt": "2015-01-01T10:01:10.000Z" 57 | } 58 | } 59 | ] 60 | } -------------------------------------------------------------------------------- /TopicModelling/TopicModelWrapper/StreamingCorpus.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import tarfile 4 | from gensim import corpora 5 | from gensim.corpora import TextCorpus 6 | from StreamingPreprocesser import StreamingPreprocesser 7 | 8 | DOCUMENT_MIN_TOKENS = 5 9 | TOKEN_MIN_LEN = 2 # less than; not inclusive 10 | TOKEN_MAX_LEN = 15 # equal to or larger tan 11 | 12 | 13 | class StreamingCorpus(TextCorpus): 14 | """ 15 | TextCorpus class 16 | """ 17 | def __init__(self, path, parse_strategy=None, clean_strategy=None, dictionary=None, metadata=False): 18 | self.path = path # path to index file or main folder of docs 19 | self.metadata = metadata 20 | 21 | self.streaming_parser = parse_strategy if parse_strategy is not None else StreamingParser(self.path, 1, metadata=True) 22 | self.streaming_cleaner = clean_strategy if clean_strategy is not None else StreamingPreprocesser() 23 | 24 | self.dictionary = dictionary or corpora.Dictionary() 25 | 26 | def get_dictionary(self): 27 | return self.dictionary 28 | 29 | def get_texts(self): 30 | 31 | for tokens, metadata in self.process_entries(): 32 | if self.metadata: 33 | yield tokens, metadata 34 | else: 35 | yield tokens 36 | 37 | def process_entries(self): 38 | 39 | for sources_texts, metadata in self.streaming_parser: 40 | 41 | # Clean the texts from all sources 42 | cleaned_text = [] 43 | for token in self.streaming_cleaner.process(sources_texts): # includes tokenizer 44 | cleaned_text.append(token) 45 | 46 | if len(cleaned_text) > 1: 47 | 
self.dictionary.add_documents([cleaned_text]) 48 | yield cleaned_text, metadata 49 | else: 50 | continue 51 | -------------------------------------------------------------------------------- /TopicModelling/TopicModelWrapper/stopwords/english: -------------------------------------------------------------------------------- 1 | i 2 | me 3 | my 4 | myself 5 | we 6 | our 7 | ours 8 | ourselves 9 | you 10 | youre 11 | youve 12 | youll 13 | youd 14 | your 15 | yours 16 | yourself 17 | yourselves 18 | he 19 | him 20 | his 21 | himself 22 | she 23 | shes 24 | her 25 | hers 26 | herself 27 | it 28 | its 29 | its 30 | itself 31 | they 32 | them 33 | their 34 | theirs 35 | themselves 36 | what 37 | which 38 | who 39 | whom 40 | this 41 | that 42 | thatll 43 | these 44 | those 45 | am 46 | is 47 | are 48 | was 49 | were 50 | be 51 | been 52 | being 53 | have 54 | has 55 | had 56 | having 57 | do 58 | does 59 | did 60 | doing 61 | a 62 | an 63 | the 64 | and 65 | but 66 | if 67 | or 68 | because 69 | as 70 | until 71 | while 72 | of 73 | at 74 | by 75 | for 76 | with 77 | about 78 | against 79 | between 80 | into 81 | through 82 | during 83 | before 84 | after 85 | above 86 | below 87 | to 88 | from 89 | up 90 | down 91 | in 92 | out 93 | on 94 | off 95 | over 96 | under 97 | again 98 | further 99 | then 100 | once 101 | here 102 | there 103 | when 104 | where 105 | why 106 | how 107 | all 108 | any 109 | both 110 | each 111 | few 112 | more 113 | most 114 | other 115 | some 116 | such 117 | no 118 | nor 119 | not 120 | only 121 | own 122 | same 123 | so 124 | than 125 | too 126 | very 127 | s 128 | t 129 | can 130 | will 131 | just 132 | don 133 | dont 134 | should 135 | shouldve 136 | now 137 | d 138 | ll 139 | m 140 | o 141 | re 142 | ve 143 | y 144 | ain 145 | aren 146 | arent 147 | couldn 148 | couldnt 149 | didn 150 | didnt 151 | doesn 152 | doesnt 153 | hadn 154 | hadnt 155 | hasn 156 | hasnt 157 | haven 158 | havent 159 | isn 160 | isnt 161 | ma 162 | mightn 163 | mightnt 164 | mustn 165 | mustnt 166 | needn 167 | neednt 168 | shan 169 | shant 170 | shouldn 171 | shouldnt 172 | wasn 173 | wasnt 174 | weren 175 | werent 176 | won 177 | wont 178 | wouldn 179 | wouldnt 180 | -------------------------------------------------------------------------------- /TopicModelling/getWord2VecModel.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import ast 3 | import pickle as p 4 | from gensim.models import Word2Vec 5 | 6 | def getW2vModel(train='', load='', modelname='', min_word=200): 7 | """ 8 | Trains or loads a word2vec model. Input must be a list of strings. 9 | Keyword arguments: 10 | train -- when provided, trains, saved (in binary) and returns a model 11 | load -- when provided, loads and returns a model (usually stored in .model.bin) 12 | modelname -- name of the saved model 13 | min_word -- the minimum amount of occurances of words to be included in the model. Useful for filtering out bloat. 14 | """ 15 | 16 | if train != '': 17 | print('Training ' + modelname) 18 | # train model 19 | # neighbourhood? 
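# gensim's Word2Vec expects `train` to be an iterable of tokenised sentences (lists of tokens);
# a plain string is iterated character by character, so make sure the input holds token lists.
# Only min_count is set in the call below, so vector size, window and training algorithm
# fall back to gensim's defaults.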
20 | model = Word2Vec(train, min_count=min_word) 21 | # pickle the entire model to disk, so we can load & resume training later 22 | model.save(modelname + '.model') 23 | # store the learned weights, in a format the original C tool understands 24 | model.wv.save_word2vec_format(modelname + '.model.bin', binary=True) 25 | return model 26 | elif load != '': 27 | model = Word2Vec.load(load) 28 | return model 29 | 30 | def getStrings(): 31 | df = pd.read_csv('data/captions-filtered-final.csv', encoding='utf-8') 32 | li_transcripts = df['transcript_clean'] 33 | li_str_transcripts = [] 34 | 35 | for full_transcript in li_transcripts: 36 | li_full_transcript = ast.literal_eval(full_transcript) 37 | for sent_transcript in li_full_transcript: 38 | #li_transcript = ast.literal_eval(sent_transcript) 39 | str_transcript = ' '.join(sent_transcript) 40 | li_str_transcripts.append(str_transcript) 41 | print(li_str_transcripts[:10]) 42 | p.dump(li_str_transcripts, open('li_str_transcripts.p', 'wb')) 43 | return li_str_transcripts 44 | 45 | li_str_transcripts = getStrings() 46 | model = getW2vModel(train=li_str_transcripts, modelname='youtube-transcripts') 47 | print(model.most_similar) -------------------------------------------------------------------------------- /DataCollection/README.md: -------------------------------------------------------------------------------- 1 | ## Module for collecting data from YouTube 2 | 3 | ### Setup: 4 | 5 | #### Step 1: Set up your Python environment 6 | It is recommended to set up a virtual environment and activate it. 7 | This keeps the module limited to this project instead of becoming part of your global installation. 8 | 9 | ```commandline 10 | $ pip3 install virtualenv 11 | $ virtualenv venv 12 | $ source venv/bin/activate 13 | ``` 14 | 15 | You should now have a command line that starts like this: 16 | 17 | ```commandline 18 | (venv) $ 19 | ``` 20 | 21 | #### Step 2: Install the youtubecollector package 22 | 23 | Now you can install the package: 24 | ```commandline 25 | (venv) $ make install 26 | ``` 27 | 28 | If you want to use this in a Jupyter notebook (like notebooks/getting_started.ipynb), 29 | you have to start the Jupyter server from within the virtual env: 30 | ```bash 31 | (venv) $ jupyter notebook 32 | ``` 33 | 34 | You can now import the module like any other package: 35 | ```python 36 | import youtubecollector 37 | ``` 38 | 39 | If this fails, check that you are using the right Python kernel via 40 | ```python 41 | import sys 42 | sys.executable 43 | ``` 44 | 45 | This should result in a path that ends in `venv/bin/python3.6`. 46 | 47 | #### Step 3: Get a developer key for the API 48 | 49 | You will need a Google account. 50 | The next steps are described here: [google api setup documentation](https://support.google.com/googleapi/answer/6158862) 51 | 52 | #### Getting started 53 | To see an example of the complete pipeline, check `getting_started.ipynb`. 54 | This notebook makes use of `tqdm`, which generates nice progress bars so you can track the progress.
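The rough shape of that pipeline is sketched below. This is a minimal sketch, not the notebook itself: the seed channel id and output file names are placeholders, the API client is built directly with `google-api-python-client` (the package's `youtube_client` module presumably wraps something similar), and paging through additional result pages (`get_more_videos`, `get_more_comments`) is left out.

```python
import pandas as pd
from googleapiclient.discovery import build

from youtubecollector import channels, video, comments, recommendations

# Build a YouTube Data API client with the developer key from step 3
youtube = build("youtube", "v3", developerKey="YOUR_DEVELOPER_KEY")

# Seed channels: a dataframe with a channel_id column
seeds = pd.DataFrame([{"channel_id": "UC_some_channel_id"}])

# 1. Channels
found_channels = channels.get_channels(seeds, youtube)
channels.write_channels(found_channels, "channels.csv")

# 2. Videos: every channel record carries the id of its uploads playlist
all_videos = []
for chan in found_channels:
    response = video.get_videos(chan.channel_uploads, youtube)
    all_videos.extend(video.convert_to_videos(response, youtube))
video.write_videos(all_videos, "videos.csv")

# 3. Comments and recommendations for every collected video
for vid in all_videos:
    comment_response = comments.get_comments(vid.video_id, youtube)
    comments.write_comments("comments.csv", comments.convert_to_comments(comment_response))

    rec_response = recommendations.get_recommendations(vid.video_id, youtube)
    rec_rows = recommendations.convert_to_recommendations(rec_response, vid.video_id)
    recommendations.write_recommendations("recommendations.csv", rec_rows)
```

In the notebook, loops like these are wrapped in `tqdm` so you get the progress bars mentioned above.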
55 | 56 | To enable these visualisations run: 57 | ```commandline 58 | (venv) $ jupyter nbextension enable --py widgetsnbextension 59 | ``` 60 | 61 | #### Development note 62 | If you wish to work on the package install the package with 63 | ```bash 64 | (venv) $ make development 65 | ``` 66 | In combination with the `autoreload` extension you can quickly test changes to the package in a notebook 67 | ```ipnbpython 68 | %load_ext autoreload 69 | %autoreload 2 70 | ``` 71 | 72 | -------------------------------------------------------------------------------- /DataCollection/src/youtubecollector/transcripts.py: -------------------------------------------------------------------------------- 1 | import os as _os 2 | import glob as _glob 3 | import csv as _csv 4 | import webvtt as _webvtt 5 | from .util import is_empty_file as _is_empty_file 6 | import youtube_dl as _youtube_dl 7 | 8 | 9 | def _get_captions_header(): 10 | return 'videoId', 'transcript' 11 | 12 | 13 | # TODO(OMeuwese) suppress messages and choose output dir 14 | def get_captions(videos): 15 | ydl_opts = { 16 | 'writeautomaticsub': True, 17 | 'skip_download': True, 18 | 'nocheckcertificate': True, 19 | 'verbose': False # doesn't seem to work 20 | 21 | } 22 | with _youtube_dl.YoutubeDL(ydl_opts) as ydl: 23 | for video in videos: 24 | try: 25 | video_url = 'https://www.youtube.com/watch?v={}'.format(video.video_id) 26 | ydl.download([video_url]) 27 | except: 28 | continue 29 | 30 | 31 | # TODO(OMeuwese) provide folder as argument and extract all vtt_files from given folder 32 | def extract_transcripts(vtt_folder): 33 | """:param vtt_folder should be location string ending in *.vtt to get all .vtt files like "files/output/*.vtt" """ 34 | 35 | video_ids = [] 36 | transcripts = [] 37 | 38 | for filename in _glob.glob(vtt_folder): 39 | ids = _get_ids_from_filename(filename) 40 | video_ids.append(ids) 41 | 42 | try: 43 | words = [] 44 | for caption in _webvtt.read(filename): 45 | words.append(caption.text) 46 | transcripts.append(words) 47 | except: 48 | pass 49 | return list(zip(video_ids, transcripts)) 50 | 51 | 52 | def write_transcripts(captions_filename, video_id_transcript_list): 53 | with open(captions_filename, 'a') as csv_file: 54 | writer = _csv.writer(csv_file, delimiter=',') 55 | 56 | if _is_empty_file(captions_filename): 57 | writer.writerow(_get_captions_header()) 58 | 59 | writer.writerows(video_id_transcript_list) 60 | 61 | 62 | def _get_ids_from_filename(filename): 63 | ids = _os.path.basename(filename) 64 | ids = ids[-18:-7] 65 | return ids 66 | -------------------------------------------------------------------------------- /DataCollection/src/youtubecollector/recommendations.py: -------------------------------------------------------------------------------- 1 | import csv as _csv 2 | from collections import namedtuple as _namedtuple 3 | 4 | from .util import is_empty_file as _is_empty_file 5 | from .util import convert_to_dictionary as _convert_to_dictionary 6 | 7 | recommendation = _namedtuple("recommendation", ('video_id', 8 | 'target_video_id', 9 | 'published_at', 10 | 'channel_id', 11 | 'video_title', 12 | 'video_description')) 13 | 14 | 15 | def _get_recommendations_header(): 16 | return recommendation._fields 17 | 18 | 19 | def get_recommendations(video_id, youtube_client, max_results=50): 20 | return youtube_client.search().list( 21 | part='snippet', 22 | type='video', 23 | relatedToVideoId=video_id, 24 | maxResults=max_results 25 | ).execute() 26 | 27 | 28 | def convert_to_recommendations(response, 
video_id): 29 | recommendations = list() 30 | for data in response['items']: 31 | next_recommendation = recommendation(video_id=video_id, 32 | target_video_id=data['id']['videoId'], 33 | published_at=data['snippet']['publishedAt'], 34 | channel_id=data['snippet']['channelId'], 35 | video_title=data['snippet']['title'], 36 | video_description=data['snippet']['description']) 37 | 38 | recommendations.append(next_recommendation) 39 | 40 | return recommendations 41 | 42 | 43 | def write_recommendations(recommendations_file, recommendations): 44 | header = _get_recommendations_header() 45 | 46 | with open(recommendations_file, 'a') as csv_file: 47 | writer = _csv.DictWriter(csv_file, fieldnames=header) 48 | 49 | if _is_empty_file(recommendations_file): 50 | writer.writeheader() 51 | 52 | for recommendation_row in recommendations: 53 | writer.writerow(_convert_to_dictionary(recommendation_row)) 54 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # youtube_extremism 2 | 3 | This is a repository for research into radical and extremist infospheres on YouTube. We have used this code for a series of stories at de Volkskrant ([link to stories](https://volkskrant.nl/youtube)) and de Correspondent ([link to stories](https://decorrespondent.nl/collectie/extreme-politieke-bewegingen)). 4 | 5 | The code consists of several modules, packages and collections of scripts. 6 | 7 | ## DataCollection 8 | 9 | DataCollection contains a library for, well, large-scale data collection. The code takes a list of channels and collects, through the YouTube API, the following data types: 10 | 1. Channel information (basic statistics, relevant playlist ids and more) 11 | 2. Videos (statistics and descriptions) 12 | 3. Comments (all comments of the videos) 13 | 4. Recommendations (all recommendations for the gathered videos) 14 | 5. Transcripts (English transcripts of the videos, if available, gathered with the [youtube-dl library](https://rg3.github.io/youtube-dl/)) 15 | 16 | You'll find additional documentation in the [DataCollection folder](https://github.com/dtokmetzis/youtube_extremism/tree/master/DataCollection). 17 | 18 | ## RabbitHole 19 | 20 | Contains scripts and notebooks to gather and analyse data we used for an experiment on the recommendation system of YouTube. This code still needs a lot of work. 21 | 22 | ## Notebooks 23 | 24 | Contains some notebooks used for the analysis of the data on right- and left-wing 'infospheres.' They just scratch the surface of possible analyses, but they can help you along. 25 | 26 | ## TopicModelling 27 | 28 | Contains a lot of scripts, data and ideas for natural language processing. The transcripts are a real treasure. During two hackathons we've written code to get a grip on this data. There is still a lot that needs to be done, so please consider these scripts as suggestions. 29 | 30 | ## Finally 31 | 32 | If you are interested in the data (we have gathered around 100GB, or 500,000 videos of far-right and far-left content), please drop me a line. We won't share our comment data without a clear agreement on how to process it safely, because it is really sensitive. 33 | 34 | All code is written in Python 3. 35 | 36 | Please let me know what we can do better. And please share your findings with us.
37 | -------------------------------------------------------------------------------- /DataCollection/tests/youtubecollector/test_channels.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | import pandas as pd 4 | 5 | from utils_for_test import create_test_client_with_response 6 | from youtubecollector.channels import get_channels, channel 7 | 8 | 9 | class ChannelTest(TestCase): 10 | 11 | def test_get_full_channel(self): 12 | expected = [ 13 | channel(channel_id='Some_ID', channel_title='The test channel', 14 | channel_description='The Official YouTube Channel for testing', 15 | channel_default_language='en', channel_country='US', 16 | channel_uploads='UU_8WUrPbi8clO6sWt_FDvuA', channel_viewcount='2640735', 17 | channel_commentcount='0', channel_subscribercount='9779', channel_videocount='258', 18 | channel_topic_ids=['topic1', 'topic2', 'topic3'], 19 | channel_topic_categories=['https://en.wikipedia.org/wiki/Society', 20 | 'https://en.wikipedia.org/wiki/Politics'], 21 | channel_branding_keywords='"Testing is fun", "More Testing"') 22 | ] 23 | channel_seed = pd.DataFrame([{"channel_id": "Some_ID"}]) 24 | 25 | client = create_test_client_with_response("full_channel_response.json", "200") 26 | actual = get_channels(channel_seed, client) 27 | 28 | self.assertEqual(expected, actual) 29 | 30 | def test_get_minimal_channel(self): 31 | expected = [ 32 | channel(channel_id='Some_ID', channel_title='The test channel', 33 | channel_description='The Official YouTube Channel for testing', 34 | channel_default_language='not set', channel_country='not set', 35 | channel_uploads='', channel_viewcount='2640735', 36 | channel_commentcount='0', channel_subscribercount='9779', channel_videocount='258', 37 | channel_topic_ids="not set", 38 | channel_topic_categories="not set", 39 | channel_branding_keywords="not set") 40 | ] 41 | 42 | channel_seed = pd.DataFrame([{"channel_id": "Some_ID"}]) 43 | 44 | client = create_test_client_with_response("nullable_fields_channel_response.json", "200") 45 | actual = get_channels(channel_seed, client) 46 | 47 | self.assertEqual(expected, actual) 48 | -------------------------------------------------------------------------------- /DataCollection/tests/youtubecollector/resources/recommendation.json: -------------------------------------------------------------------------------- 1 | { 2 | "kind": "youtube#searchListResponse", 3 | "etag": "some identifier", 4 | "nextPageToken": "nextPageToken", 5 | "regionCode": "NL", 6 | "pageInfo": { 7 | "totalResults": 330, 8 | "resultsPerPage": 50 9 | }, 10 | "items": [ 11 | { 12 | "kind": "youtube#searchResult", 13 | "etag": "\"XpPGQXPnxQJhLgs6enD_n8JR4Qk/XMTNvc3BBTTBo2SWSuj2nscMDmw\"", 14 | "id": { 15 | "kind": "youtube#video", 16 | "videoId": "id of first target video" 17 | }, 18 | "snippet": { 19 | "publishedAt": "2018-01-01T01:01:01.000Z", 20 | "channelId": "channel Id of first video", 21 | "title": "title of first video", 22 | "description": "Description of first recommendation", 23 | "thumbnails": { 24 | "default": { 25 | "url": "https://someimage.jpg", 26 | "width": 120, 27 | "height": 90 28 | }, 29 | "medium": { 30 | "url": "https://someimage.jpg", 31 | "width": 320, 32 | "height": 180 33 | }, 34 | "high": { 35 | "url": "https://someimage.jpg", 36 | "width": 480, 37 | "height": 360 38 | } 39 | }, 40 | "channelTitle": "Title of channel", 41 | "liveBroadcastContent": "none" 42 | } 43 | }, 44 | { 45 | "kind": "youtube#searchResult", 46 | "etag": "some 
identifier", 47 | "id": { 48 | "kind": "youtube#video", 49 | "videoId": "id of second target video" 50 | }, 51 | "snippet": { 52 | "publishedAt": "2018-10-10T10:10:10.000Z", 53 | "channelId": "channel id of second video", 54 | "title": "title of second video", 55 | "description": "description of second video", 56 | "thumbnails": { 57 | "default": { 58 | "url": "https://image.jpg", 59 | "width": 120, 60 | "height": 90 61 | }, 62 | "medium": { 63 | "url": "https://image.jpg", 64 | "width": 320, 65 | "height": 180 66 | }, 67 | "high": { 68 | "url": "https://image.jpg", 69 | "width": 480, 70 | "height": 360 71 | } 72 | }, 73 | "channelTitle": "Title of second channel", 74 | "liveBroadcastContent": "none" 75 | } 76 | } 77 | ] 78 | } -------------------------------------------------------------------------------- /DataCollection/tests/youtubecollector/resources/video_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "kind": "youtube#videoListResponse", 3 | "etag": "\"XpPGQXPnxQJhLgs6enD_n8JR4Qk/b7mZ8rGMG1BFDpd9jh6TQbAlDik\"", 4 | "pageInfo": { 5 | "totalResults": 1, 6 | "resultsPerPage": 1 7 | }, 8 | "items": [ 9 | { 10 | "kind": "youtube#video", 11 | "etag": "etage", 12 | "id": "HIjfgBATDfs", 13 | "snippet": { 14 | "publishedAt": "2008-10-18T18:42:29.000Z", 15 | "channelId": "UC_0dwPeY0vQSJGVfRpFvGUg", 16 | "title": "title in metadata", 17 | "description": "Description in metadata", 18 | "thumbnails": { 19 | "default": { 20 | "url": "https://image.jpg", 21 | "width": 120, 22 | "height": 90 23 | }, 24 | "medium": { 25 | "url": "https://image.jpg", 26 | "width": 320, 27 | "height": 180 28 | }, 29 | "high": { 30 | "url": "https://image.jpg", 31 | "width": 480, 32 | "height": 360 33 | } 34 | }, 35 | "channelTitle": "Channel title of metadata", 36 | "tags": [ 37 | "tag one", 38 | "tag two" 39 | ], 40 | "categoryId": "1", 41 | "liveBroadcastContent": "none", 42 | "localized": { 43 | "title": "Localized title of video in metadata", 44 | "description": "localized Description in metadata" 45 | } 46 | }, 47 | "contentDetails": { 48 | "duration": "PT3M3S", 49 | "dimension": "2d", 50 | "definition": "sd", 51 | "caption": "false", 52 | "licensedContent": false, 53 | "regionRestriction": { 54 | "blocked": [ 55 | "GP", 56 | "DE", 57 | "GB" 58 | ] 59 | }, 60 | "projection": "rectangular" 61 | }, 62 | "statistics": { 63 | "viewCount": "120", 64 | "likeCount": "231", 65 | "dislikeCount": "342", 66 | "favoriteCount": "453", 67 | "commentCount": "564" 68 | }, 69 | "topicDetails": { 70 | "topicIds": [ 71 | "topic Id" 72 | ], 73 | "relevantTopicIds": [ 74 | "Relevant topics ids 1", 75 | "Relevant topics ids 2" 76 | ], 77 | "topicCategories": [ 78 | "https://en.wikipedia.org/wiki/Television_program", 79 | "https://en.wikipedia.org/wiki/Society" 80 | ] 81 | } 82 | } 83 | ] 84 | } -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/nl/macros/modals.tff: -------------------------------------------------------------------------------- 1 | #class="modals" 2 | @ZIJN={zijn, is, ben, waren, was} 3 | @ZIJNPER={hij is, ik ben, het is, zij is, ze is, zij zijn, we zijn, wij zijn, jij bent, je bent, dat is} 4 | @ZIJNNEG={zijn niet, is niet, was niet, waren niet, ben niet, is niet} 5 | @MISS={misschien, mogelijk, zou kunnen, kan, zouden, zou, zal, zullen} 6 | @MOG={mogelijk, misschien, mogelijkheid tot} 7 | @MOGNEG={mogelijk niet, misschien niet, geen mogelijkheid tot} 8 | @KAN={kan, kunnen, kon, konden, zou 
kunnen, zouden kunnen} 9 | @KANNEG={kan niet, kunnen niet, kon niet, konden niet} 10 | @KANHEB={kan hebben, kunnen hebben, kon hebben, konden hebben, zou kunnen hebben, zouden kunnen hebben} 11 | @KANHEBNEG={kan niet hebben, kunnen niet hebben, zal, niet hebben, heeft misschien niet, hebben misschien niet, zal misschien niet} 12 | 13 | @INSTAAT={in staat tot, niet in staat tot} 14 | @ISNIET={is niet} 15 | @ZOUPER={jij zou, hij zou, ik zou, het zou, zij zou, ze zou, zij zouden, wij zouden, we zouden, dat zou, dit zou} 16 | @HEBPER={ik heb, jij hebt, je hebt, zij heeft, hij heeft, we hebben, wij hebben, zij hebben, het heeft, dat heeft, dit heeft} 17 | @ZAL={zal, zouden} 18 | @ZALNEG={zal niet, zouden niet} 19 | @ZALPER={ik zal, jij zal, zij zal, ze zal, hij zal, dat zal, dit zal} 20 | @DURF={durven, durf} 21 | @DURFNEG={durven niet, durf niet} 22 | @HEB={hebben, heb, hebt, heeft, had} 23 | @HEBNEG={heeft niet, had niet, heb niet, hebt niet} 24 | @MOET={moet, moeten, zal moeten, zullen moeten} 25 | @HEEFTNEG={heeft geen, heeft niet, hebben geen, hebben niet, hoeft niet, hoeven niet} 26 | @MOETPER={het moet, het zal moeten, hij moet, hij zal moeten, zij moet, zij zal moeten, ik moet, ik zal moeten, we moeten, we zullen moeten, zij moeten, zij zullen moeten, jij moet, je moet, jij zal moeten, je zal moeten, dat moet, dat zal moeten, dit moet, dit zal moeten} 27 | 28 | @MOETNEG={moet niet, moeten niet} 29 | @ZALMOET={zal moeten} 30 | @MOETNEGHEB={moet niet hebben, moeten niet hebben, zal niet moeten hebben} 31 | @WAS={was, vroeger, in het verleden} 32 | 33 | @MOETHEB={moet hebben, moeten hebben} 34 | 35 | @ZOUMOET={zou moeten} 36 | @ZOUNEGMOET={zou niet moeten} 37 | @ZOUMOETHEB={zou moeten hebben} 38 | @DOE={doe, doen, doet} 39 | @DOENEG={doet niet, doen niet, doet het niet, doen het niet} 40 | @GA={ga, gaat, gaan, zal gaan, zullen gaan} 41 | @GANEG={ga niet, gaat niet, gaan niet, zal niet gaan, zullen niet gaan} 42 | @GAPER={ik ga, zij gaan, zij gaat, het gaat, we gaan, jij|je gaat, het gaat, dat gaat} 43 | @GANEGPER={ik ga niet, zij gaan niet, zij gaat niet, het gaat niet, we gaan niet, jij|je gaat niet, het gaat niet, dat gaat niet} 44 | -------------------------------------------------------------------------------- /DataCollection/tests/youtubecollector/resources/comment_with_reply.json: -------------------------------------------------------------------------------- 1 | { 2 | "kind": "youtube#commentThreadListResponse", 3 | "etag": "\"XpPGQXPnxQJhLgs6enD_n8JR4Qk/M13kILBSDXHZmf82KpKIUu78oro\"", 4 | "pageInfo": { 5 | "totalResults": 5, 6 | "resultsPerPage": 100 7 | }, 8 | "items": [ 9 | { 10 | "kind": "youtube#commentThread", 11 | "etag": "\"XpPGQXPnxQJhLgs6enD_n8JR4Qk/6WkG1Db9T-En4DWmNTTBdki5WTk\"", 12 | "id": "The comment id that is used", 13 | "snippet": { 14 | "videoId": "the video id", 15 | "topLevelComment": { 16 | "kind": "youtube#comment", 17 | "etag": "\"XpPGQXPnxQJhLgs6enD_n8JR4Qk/RUV3jLsQcA6PsDjkr3q5GkpAOmI\"", 18 | "id": "this comment id is not used", 19 | "snippet": { 20 | "authorDisplayName": "Author name", 21 | "authorProfileImageUrl": "somewhere.com/photo.jpg", 22 | "authorChannelUrl": "http://www.youtube.com/channel/someone", 23 | "authorChannelId": { 24 | "value": "someone" 25 | }, 26 | "videoId": "a video id", 27 | "textDisplay": "The text that is displayed", 28 | "textOriginal": "Would be amazing to see an Austin Petersen endorsement as well. 
He would stand right up there with Massie, Amash, and the rest of the liberty coalition in congress.", 29 | "canRate": true, 30 | "viewerRating": "none", 31 | "likeCount": 4, 32 | "disLikeCount": 2, 33 | "publishedAt": "2017-11-02T19:25:12.000Z", 34 | "updatedAt": "2017-11-02T19:25:12.000Z" 35 | } 36 | }, 37 | "canReply": true, 38 | "totalReplyCount": 0, 39 | "isPublic": true 40 | }, 41 | "replies": { 42 | "comments": [ 43 | { 44 | "kind": "youtube#comment", 45 | "etag": "\"XpPGQXPnxQJhLgs6enD_n8JR4Qk/TAYF2haOdXkzVaoBDUpJkoLXQSQ\"", 46 | "id": "The parent id.the reply id", 47 | "snippet": { 48 | "authorDisplayName": "Responder", 49 | "authorProfileImageUrl": "example2.com/photo.jpg", 50 | "authorChannelUrl": "http://www.youtube.com/channel/responder", 51 | "authorChannelId": { 52 | "value": "responder channel id" 53 | }, 54 | "videoId": "some video id", 55 | "textDisplay": "The response text", 56 | "textOriginal": "Different if something was changed", 57 | "parentId": "the parent id", 58 | "canRate": true, 59 | "viewerRating": "none", 60 | "likeCount": 1, 61 | "publishedAt": "2017-11-02T19:55:27.000Z", 62 | "updatedAt": "2017-11-02T19:55:27.000Z" 63 | } 64 | } 65 | ] 66 | } 67 | } 68 | ] 69 | } -------------------------------------------------------------------------------- /TopicModelling/Captions preprocessing/Preprocess leftwing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import ast\n", 11 | "\n", 12 | "raw_captions = 'C:/hackathon/captions.csv'\n", 13 | "raw_videolist = 'C:/hackathon/videolists_lefty.csv'\n", 14 | "preprocessed_captions = 'C:/hackathon/left_captions.csv'\n", 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "### Text cleaning\n", 22 | "\n", 23 | "#### Input format:\n", 24 | "List to csv -> string representation of list saved and then read. Contains duplicates, sometimes within the same list. Then separated by newline (\\n)" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 4, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "def clean_caption(caption_string):\n", 34 | " caption = ast.literal_eval(caption_string) #list representation to actual list\n", 35 | " caption = [newline for oldline in caption for newline in oldline.split('\\n') if newline != ' '] #split lines by newline,\n", 36 | " #unnest the result and keep if line is not empty (in this case just a space)\n", 37 | " result = [] #Initialise empty list to store non-duplicates in. 
Does not use set because lines can be identical,\n", 38 | " #so the criterion is to only drop duplicates if they follow eachother\n", 39 | " prevline = ''\n", 40 | " for line in caption:\n", 41 | " if line == prevline:\n", 42 | " continue\n", 43 | " result.append(line)\n", 44 | " prevline = line\n", 45 | " return '\\n'.join(result)" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 6, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "df = pd.read_csv(raw_captions, names = ['yt_id','captions'])\n", 55 | "df['captions'] = df['captions'].apply(clean_caption)\n", 56 | "\n", 57 | "df_videolist = pd.read_csv(raw_videolist, names = ['yt_id','timestamp','title','channel_id','channel_name'])\n", 58 | "df = df.merge(df_videolist, how = 'left', on = 'yt_id')\n", 59 | "\n", 60 | "df.to_csv(preprocessed_captions,index=False)" 61 | ] 62 | } 63 | ], 64 | "metadata": { 65 | "kernelspec": { 66 | "display_name": "Python 3", 67 | "language": "python", 68 | "name": "python3" 69 | }, 70 | "language_info": { 71 | "codemirror_mode": { 72 | "name": "ipython", 73 | "version": 3 74 | }, 75 | "file_extension": ".py", 76 | "mimetype": "text/x-python", 77 | "name": "python", 78 | "nbconvert_exporter": "python", 79 | "pygments_lexer": "ipython3", 80 | "version": "3.6.3" 81 | } 82 | }, 83 | "nbformat": 4, 84 | "nbformat_minor": 2 85 | } 86 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/lexicon/en/macros/modals.tff: -------------------------------------------------------------------------------- 1 | #class="modals" 2 | @BE={be,is,am,are,were,was,been,being} 3 | @BECL={he\'s,I\'m,it\'s,she\'s,they\'re,we\'re,you\'re, that\'s} 4 | @BENEG={aren\'t,isn\'t,wasn\'t,weren\'t, am not, is not, are not, was not, were not} 5 | @MODAL={might,may,could,can,should,shall,will,would} 6 | @MAY={may,might} 7 | @MAYNEG={may not,might not,mightn\'t, mayn\'t} 8 | @MAYHAVE={may have, might have, might\'ve, may\'ve}\ 9 | @MAYNEGHAVE={may not have, might not have, mightn\'t have, mayn\'t have} 10 | @CAN={can,could} 11 | @CANNEG={cannot,can\'t,couldn\'t, could not} 12 | @CANHAVE={can have, could have, could\'ve} 13 | @SHALL={shall,should, shalt} 14 | @SHALLHAVE={shall have, should have} 15 | @SHALLNEG={shan't, shouldn\'t, shall not, should not} 16 | @SHALLNEGHAVE={shall not have, should not have, shouldn\'t have} 17 | @ABLE={able,unable} 18 | @AINT={ain\'t} 19 | @WOULDCL={you\'d,he\'d,I\'d,it\'d,she\'d,they\'d,we\'d, that\'d, this\'d} 20 | @HAVECL={i\'ve, you\'ve, she\'s, he\'s, we\'ve, they\'ve, it\'s, that\'s, this\'s} 21 | @WILL={will, would} 22 | @WILLNEG={wouldn\'t,won\'t, will not, would not} 23 | @WILLCL={i\'ll, you\'ll, she\'ll, he\'ll,that\'ll, this\'ll} 24 | @DARE={dare} 25 | @DARENEG={daren\'t, dare not} 26 | @HAVE={have,has,had,having} 27 | @HAVENEG={haven\'t,hasn\'t,hadn\'t, had not, has not, have not} 28 | @HAVEGOTTA={has got to, has gotta, have got to, have gotta, gotta} 29 | @HAVEGOTTANEG={hasn\'t got to, hasn\'t gotta, haven\'t got to, haven\'t gotta, ain\'t gotta, ain\'t got to} 30 | @HAVEGOTTACL={it\'s gotta, it\'s got to, he\'s got to, he\'s gotta, she\'s got to, she\'s gotta, i\'ve gotta, i\'ve got to, we\'ve gotta, we\'ve got to, they've got to, they've gotta, you\'ve got to, you\'ve gotta, that\'s got to, that\'s gotta, this\'s gotta, this\'s got to} 31 | @MUST={must} 32 | @MUSTNEG={mustn\'t} 33 | @MUSTHAVE={must have, must\'ve} 34 | @MUSTNEGHAVE={must not have, mustn\'t have, mustn\'t \'ve} 35 | @USEDTO={used to} 36 | 
@NEED={need} 37 | @NEEDNEG={needn\'t} 38 | @NEEDHAVE={need\'ve, need have} 39 | @NEEDHAVENEG={needn\'t have, needn\'t\'ve, need not have} 40 | @OUGHT={ought to, oughta} 41 | @OUGHTNEG={oughtn\'t to} 42 | @OUGHTHAVE={ought to have, oughta have} 43 | @DO={do, does} 44 | @DONEG={doesn\'t,don\'t} 45 | @GONNA={am going to, are going to, is going to, am gonna, are gonna, is gonna} 46 | @GONNANEG={am not going to, are not going to, is not going to, am not gonna, are not gonna, is not gonna, ain\'t gonna, isn\'t gonna, aren\'t gonna} 47 | @GONNACL={i\'m going to, they\'re going to, she\'s going to, it\'s going to, we\'re going to, i\'m gonna, you\'re gonna, i\'m gonna, you\'re gonna, he\'s gonna, she\'s gonna, it\'s gonna, we\'re gonna, they\'re gonna, that\'s gonna} 48 | @GONNANEGCL={i\'m not going to, they\'re not going to, she\'s not going to, it\'s not going to, we\'re not going to, i\'m not gonna, you\'re not gonna, he\'s not gonna, she\'s not gonna, it\'s not gonna, we\'re not gonna, they\'re not gonna, that\'s not gonna} 49 | -------------------------------------------------------------------------------- /DataCollection/tests/youtubecollector/test_comments.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from utils_for_test import read_json_from_file 4 | from youtubecollector.comments import convert_to_comments, comment 5 | 6 | 7 | class CommentTest(TestCase): 8 | 9 | def test_get_full_comment(self): 10 | response = read_json_from_file("comment_full.json") 11 | actual = convert_to_comments(response) 12 | 13 | expected = [ 14 | comment(video_id='the video id', comment_id='The comment id that is used', 15 | author_display_name='Author name', 16 | author_channel_url='http://www.youtube.com/channel/someone', 17 | author_channel_id='someone', 18 | comment_text='The text that is displayed', 19 | comment_like_count=4, 20 | comment_dislike_count=2, 21 | comment_time='2017-11-02T19:25:12.000Z', 22 | reply_count=0) 23 | ] 24 | 25 | self.assertEqual(actual, expected) 26 | 27 | def test_get_minimal_comment(self): 28 | response = read_json_from_file("comment_minimal.json") 29 | actual = convert_to_comments(response) 30 | 31 | expected = [ 32 | comment(video_id='the video id', comment_id='The comment id that is used', 33 | author_display_name='Author name', 34 | author_channel_url='http://www.youtube.com/channel/someone', 35 | author_channel_id='not set', 36 | comment_text='The text that is displayed', 37 | comment_like_count=4, 38 | comment_dislike_count=0, 39 | comment_time='2017-11-02T19:25:12.000Z', 40 | reply_count=0) 41 | ] 42 | 43 | self.assertEqual(actual, expected) 44 | 45 | def test_get_comments_with_replies(self): 46 | response = read_json_from_file("comment_with_reply.json") 47 | actual = convert_to_comments(response) 48 | 49 | expected = [ 50 | comment(video_id='the video id', comment_id='The comment id that is used', 51 | author_display_name='Author name', 52 | author_channel_url='http://www.youtube.com/channel/someone', 53 | author_channel_id='someone', 54 | comment_text='The text that is displayed', 55 | comment_like_count=4, 56 | comment_dislike_count=2, 57 | comment_time='2017-11-02T19:25:12.000Z', 58 | reply_count=0), 59 | comment(video_id='some video id', comment_id='The parent id.the reply id', 60 | author_display_name='Responder', 61 | author_channel_url='http://www.youtube.com/channel/responder', 62 | author_channel_id='responder channel id', 63 | comment_text='The response text', 64 | 
comment_like_count=1, 65 | comment_dislike_count='', 66 | comment_time='2017-11-02T19:55:27.000Z', 67 | reply_count='') 68 | ] 69 | 70 | self.assertEqual(actual, expected) 71 | -------------------------------------------------------------------------------- /TopicModelling/TopicModelWrapper/StreamingPreprocesser.py: -------------------------------------------------------------------------------- 1 | import string 2 | 3 | 4 | class StreamingPreprocesser(object): 5 | 6 | def __init__(self, stopwords=None, processes=None): 7 | self.source_generator = None 8 | 9 | self.stopwords = stopwords if stopwords is not None else open('stopwords-nl.txt', 'r').read().split('\n') 10 | 11 | punctuation = string.punctuation 12 | punctuation += '`’‘”“' 13 | self.punctuation_table = str.maketrans('', '', punctuation) 14 | 15 | spacers = '\n\r\t' 16 | self.spacers_table = str.maketrans(spacers, ' ' * len(spacers)) 17 | 18 | self.processes = processes 19 | if self.processes is None: 20 | self.processes = [ 21 | # self.process_string, 22 | self.encode_doc_to_ascii, 23 | self.remove_punctuation, 24 | self.remove_spacers, 25 | self.to_lower_case, 26 | self.tokenise, 27 | self.remove_stopwords, 28 | self.remove_digit_terms, 29 | self.remove_min_len 30 | ] 31 | 32 | def __iter__(self): 33 | for tokens in self.process(self.source_generator): 34 | yield tokens 35 | 36 | def add_processor(self, process): 37 | if callable(process): 38 | self.processes.append(process) 39 | 40 | def process(self, text): 41 | pipeline = text 42 | pipeline = [pipeline] if type(pipeline) == str else pipeline 43 | for processor in self.processes: 44 | pipeline = processor(pipeline) 45 | return pipeline 46 | 47 | def encode_doc_to_ascii(self, texts): 48 | for text in texts: 49 | yield text.encode('ascii', errors='ignore').decode('utf8') 50 | # List all non ascii characters and translate/filter them out 51 | 52 | def remove_punctuation(self, texts): 53 | for text in texts: 54 | yield text.translate(self.punctuation_table) 55 | 56 | def remove_spacers(self, texts): 57 | for text in texts: 58 | yield text.translate(self.spacers_table) 59 | 60 | def to_lower_case(self, texts): 61 | for text in texts: 62 | yield text.lower() 63 | 64 | def tokenise(self, texts): 65 | for text in texts: 66 | for token in text.split(): 67 | yield token 68 | 69 | def remove_stopwords(self, tokens): 70 | for token in tokens: 71 | if token not in self.stopwords: 72 | yield token 73 | else: 74 | continue 75 | 76 | def remove_digit_terms(self, tokens): 77 | for token in tokens: 78 | if not token.isdigit(): 79 | yield token 80 | else: 81 | continue 82 | 83 | def remove_min_len(self, tokens): 84 | for token in tokens: 85 | if len(token) > 2: # self.token_min 86 | yield token 87 | else: 88 | continue 89 | -------------------------------------------------------------------------------- /TopicModelling/TextLemma/filterTranscripts.py: -------------------------------------------------------------------------------- 1 | import getTokens 2 | import pandas as pd 3 | import ast 4 | import re 5 | import numpy as np 6 | from gensim.models import Word2Vec 7 | from joblib import Parallel, delayed 8 | import multiprocessing 9 | from math import sqrt 10 | from collections import defaultdict 11 | 12 | 13 | class CallBack(object): 14 | completed = defaultdict(int) 15 | 16 | def __init__(self, index, parallel): 17 | self.index = index 18 | self.parallel = parallel 19 | 20 | def __call__(self, index): 21 | CallBack.completed[self.parallel] += 1 22 | print("done with 
{}".format(CallBack.completed[self.parallel])) 23 | if self.parallel._original_iterable: 24 | self.parallel.dispatch_next() 25 | 26 | import joblib.parallel 27 | joblib.parallel.CallBack = CallBack 28 | 29 | def trainW2vTranscripts(): 30 | """ train a column of strings to a word2vec model""" 31 | df = pd.read_csv('data/captions-filtered.csv', encoding='utf-8') 32 | model = getWord2VecModel.getWord2Vec(train=df['transcript_clean']) 33 | model.most_similar(positive=['muslim']) 34 | 35 | def callable(df): 36 | df['transcript_clean'] = np.nan 37 | datalength = len(df) 38 | print(df.head()) 39 | li_transcripts = ['n'] * len(df) 40 | for index, transcript in enumerate(df['transcript']): 41 | transcript_clean = ast.literal_eval(transcript) 42 | transcript_clean = getTokens.getTokens(li_strings=(ast.literal_eval(transcript)), lemmatizing=True) 43 | li_transcripts[index] = transcript_clean 44 | df['transcript_clean'] = li_transcripts 45 | return df 46 | 47 | def cleanTranscripts(): 48 | """ filter the transcripts by removing stopwords and stemming """ 49 | dfs = pd.read_csv('data/captions-clean.csv', encoding='utf-8', chunksize=500, nrows=1, skiprows=range(1,40000)) 50 | parallel = Parallel(n_jobs=multiprocessing.cpu_count()) 51 | retlist = parallel(delayed(callable)(i) for i in dfs) 52 | df = pd.concat(retlist) 53 | # df['transcript_clean'] = np.nan 54 | # datalength = len(df) 55 | # print(df.head()) 56 | # li_transcripts = ['n'] * len(df) 57 | # for index, transcript in enumerate(df['transcript']): 58 | # transcript_clean = ast.literal_eval(transcript) 59 | # transcript_clean = getTokens.getTokens(li_strings=(ast.literal_eval(transcript)), lemmatizing=True) 60 | # li_transcripts[index] = transcript_clean 61 | # if index % 200 == 0: 62 | # df['transcript_clean'] = li_transcripts 63 | # df.to_csv('data/captions-filtered.csv', encoding='utf-8') 64 | # print('Completed video ' + str(index) + '/' + str(datalength)) 65 | # df['transcript_clean'] = li_transcripts 66 | df.to_csv('data/captions-filtered.csv', encoding='utf-8') 67 | # print('Completed video ' + str(index) + '/' + str(datalength)) 68 | 69 | def removeDuplicateEntries(): 70 | """ remove the duplicate transcript entries """ 71 | df = pd.read_csv('data/captions.csv', encoding='utf-8') 72 | df.columns = ['id', 'transcript'] 73 | 74 | li_transcripts = [] 75 | for transcript in df['transcript']: 76 | li_transcript = ast.literal_eval(transcript) 77 | li_transcript = li_transcript[0::3] 78 | li_transcripts.append(li_transcript) 79 | df['transcript'] = li_transcripts 80 | 81 | df.to_csv('data/captions-clean.csv', encoding='utf-8') 82 | 83 | cleanTranscripts() -------------------------------------------------------------------------------- /DataCollection/src/youtubecollector/channels.py: -------------------------------------------------------------------------------- 1 | import csv as _csv 2 | from collections import namedtuple as _namedtuple 3 | from .util import is_empty_file as _is_empty_file 4 | from .util import convert_to_dictionary as _convert_to_dictionary 5 | 6 | channel = _namedtuple("channel", ('channel_id', 7 | 'channel_title', 8 | 'channel_description', 9 | 'channel_default_language', 10 | 'channel_country', 11 | 'channel_uploads', 12 | 'channel_viewcount', 13 | 'channel_commentcount', 14 | 'channel_subscribercount', 15 | 'channel_videocount', 16 | 'channel_topic_ids', 17 | 'channel_topic_categories', 18 | 'channel_branding_keywords')) 19 | 20 | 21 | def _get_channel_header(): 22 | return channel._fields 23 | 24 | 25 | def 
_get_channel(channel_id, youtube_client): 26 | """Queries the youtube API and gets a json in return""" 27 | 28 | return youtube_client.channels().list( 29 | part='snippet,contentDetails,topicDetails,statistics,brandingSettings', 30 | id=channel_id 31 | ).execute() 32 | 33 | 34 | def _convert_to_channel(response) -> channel: 35 | """Extracts the needed variables from the returned json""" 36 | response_channel = response['items'][0] 37 | return channel(channel_id=response_channel['id'], 38 | channel_title=response_channel['snippet']['title'], 39 | channel_description=response_channel['snippet']['description'], 40 | channel_default_language=response_channel['snippet'].get('defaultLanguage', 'not set'), 41 | channel_country=response_channel['snippet'].get('country', 'not set'), 42 | channel_uploads=response_channel['contentDetails']['relatedPlaylists'].get('uploads', ''), 43 | channel_viewcount=response_channel['statistics']['viewCount'], 44 | channel_commentcount=response_channel['statistics']['commentCount'], 45 | channel_subscribercount=response_channel['statistics']['subscriberCount'], 46 | channel_videocount=response_channel['statistics']['videoCount'], 47 | channel_topic_ids=response_channel['topicDetails'].get('topicIds', 'not set'), 48 | channel_topic_categories=response_channel['topicDetails'].get('topicCategories', 'not set'), 49 | channel_branding_keywords=response_channel['brandingSettings']['channel'].get('keywords', 'not set') 50 | ) 51 | 52 | 53 | def _is_empty(response): 54 | return len(response['items']) == 0 55 | 56 | 57 | def get_channels(channel_seeds, youtube_client): 58 | channels = list() 59 | for channel_id in channel_seeds['channel_id']: 60 | response = _get_channel(channel_id, youtube_client) 61 | if _is_empty(response): 62 | print(f"Channel with channel_id {channel_id} returns empty") 63 | continue 64 | else: 65 | next_channel = _convert_to_channel(response) 66 | channels.append(next_channel) 67 | print(channel_id) 68 | 69 | return channels 70 | 71 | 72 | def write_channels(channels, channel_filename): 73 | with open(channel_filename, "a") as csv_file: 74 | writer = _csv.DictWriter(csv_file, fieldnames=_get_channel_header()) 75 | if _is_empty_file(channel_filename): 76 | writer.writeheader() 77 | 78 | for channel_row in channels: 79 | writer.writerow(_convert_to_dictionary(channel_row)) 80 | -------------------------------------------------------------------------------- /TopicModelling/TextLemma/getTokens.py: -------------------------------------------------------------------------------- 1 | import re 2 | import pandas as pd 3 | import numpy as np 4 | import re 5 | import pickle 6 | import operator 7 | #import glove_python 8 | from matplotlib.font_manager import FontProperties 9 | from nltk.stem.snowball import SnowballStemmer 10 | from nltk.stem.wordnet import WordNetLemmatizer 11 | from nltk.corpus import stopwords 12 | 13 | def getTokens(li_strings='', stemming=False, lemmatizing=False): 14 | if stemming: 15 | global di_stems 16 | di_stems = pickle.load(open('di_stems.p', 'rb')) 17 | 18 | # print('imported') 19 | #do some cleanup: only alphabetic characters, no stopwords 20 | # create separate stemmed tokens, to which the full strings will be compared to: 21 | li_comments_stemmed = [] 22 | len_comments = len(li_strings) 23 | # print(len(li_strings)) 24 | # print('Creating list of tokens per monthly document') 25 | for index, comment in enumerate(li_strings): 26 | #create list of list for comments and tokens 27 | if isinstance(comment, str): 28 | 
li_comment_stemmed = [] 29 | li_comment_stemmed = getFilteredText(comment, stemming=stemming, lemmatizing=lemmatizing) 30 | li_comments_stemmed.append(li_comment_stemmed) 31 | #if index % 1000 == 0: 32 | #print('Stemming/tokenising finished for string ' + str(index) + '/' + str(len_comments)) 33 | # print(len(li_comments_stemmed)) 34 | 35 | if stemming: 36 | pickle.dump(di_stems, open('di_stems.p', 'wb')) 37 | df_stems = pd.DataFrame.from_dict(di_stems, orient='index') 38 | df_stems.to_csv('di_stems_dataframe.csv', encoding='utf-8') 39 | 40 | return li_comments_stemmed 41 | 42 | def getFilteredText(string, stemming=False, lemmatizing=False): 43 | #first, remove urls 44 | if 'http' in string: 45 | string = re.sub(r'https?:\/\/.*[\r\n]*', ' ', string) 46 | if 'www.' in string: 47 | string = re.sub(r'www.*[\r\n]*', ' ', string) 48 | 49 | #use nltk's tokeniser to get a list of words 50 | # from nltk.tokeimport TreebankWordTokenizer 51 | # tokenizer = TreebankWordTokenizer() 52 | # tokenizer.PARENS_BRACKETS = [] 53 | # tokens = [word.lower() for sent in nltk.sent_tokenize(string) for word in tokenizer.tokenize(sent)] 54 | tokens = re.findall("[a-zA-Z\-\)\(]{3,50}", string) 55 | stemmer = SnowballStemmer("english") 56 | #list with tokens further processed 57 | li_filtered_tokens = [] 58 | # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation) 59 | for token in tokens: 60 | token = token.lower() 61 | #print(len(tokens)) 62 | #only alphabetic characters, keep '(' and ')' symbols for echo brackets, only tokens with three or more characters 63 | #if re.search('[a-zA-Z\-\)\(]{3,50}', token): 64 | if re.match('[a-zA-Z\-\)\(]{3,50}', token) is not None: 65 | #no stopwords 66 | if token not in stopwords.words('english'): 67 | #token = token.lower() 68 | #shorten word if it's longer than 20 characters (e.g. 
'reeeeeeeeeeeeeeeeeeeeeeeee') 69 | if len(token) >= 20: 70 | token = token[:20] 71 | #stem if indicated it should be stemmed 72 | if stemming: 73 | token_stemmed = stemmer.stem(token) 74 | li_filtered_tokens.append(token_stemmed) 75 | 76 | #update lookup dict with token and stemmed token 77 | #lookup dict is dict of stemmed words as keys and lists as full tokens 78 | if token_stemmed in di_stems: 79 | if token not in di_stems[token_stemmed]: 80 | di_stems[token_stemmed].append(token) 81 | else: 82 | di_stems[token_stemmed] = [] 83 | di_stems[token_stemmed].append(token) 84 | #if lemmatizing is used instead 85 | elif lemmatizing: 86 | lemmatizer = WordNetLemmatizer() 87 | token = lemmatizer.lemmatize(token) 88 | li_filtered_tokens.append(token) 89 | else: 90 | li_filtered_tokens.append(token) 91 | return li_filtered_tokens -------------------------------------------------------------------------------- /DataCollection/src/youtubecollector/video.py: -------------------------------------------------------------------------------- 1 | import csv as _csv 2 | from collections import namedtuple as _namedtuple 3 | 4 | from .util import is_empty_file as _is_empty_file 5 | from .util import convert_to_dictionary as _convert_to_dictionary 6 | 7 | video = _namedtuple('video', ('video_id', 8 | 'video_published', 9 | 'channel_id', 10 | 'video_title', 11 | 'video_description', 12 | 'video_channel_title', 13 | 'video_tags', 14 | 'video_category_id', 15 | 'video_default_language', 16 | 'video_duration', 17 | 'video_view_count', 18 | 'video_comment_count', 19 | 'video_likes_count', 20 | 'video_dislikes_count', 21 | 'video_topic_ids', 22 | 'video_topic_categories' 23 | )) 24 | 25 | 26 | def _get_video_header(): 27 | return video._fields 28 | 29 | 30 | def get_more_videos(channel_uploads, youtube_client, next_page_token, max_results=None): 31 | """takes the id of the uploads_playlist 32 | in channel data""" 33 | 34 | return youtube_client.playlistItems().list( 35 | part='snippet,contentDetails', 36 | playlistId=channel_uploads, 37 | maxResults=50, 38 | pageToken=next_page_token 39 | ).execute() 40 | 41 | 42 | def get_videos(channel_uploads, youtube_client, max_results=None): 43 | return youtube_client.playlistItems().list( 44 | part='snippet,contentDetails', 45 | playlistId=channel_uploads, 46 | maxResults=50 47 | ).execute() 48 | 49 | 50 | def _get_video_metadata(video_id, youtube_client): 51 | return youtube_client.videos().list( 52 | part='snippet,contentDetails,statistics,topicDetails', 53 | id=video_id 54 | ).execute() 55 | 56 | 57 | def _get_topic_ids(metadata): 58 | if "topicDetails" in metadata: 59 | return metadata['topicDetails'].get('relevantTopicIds', "not set") 60 | else: 61 | return "not set" 62 | 63 | 64 | def _get_topic_categories(metadata): 65 | if "topicDetails" in metadata: 66 | return metadata['topicDetails'].get('topicCategories', "not set") 67 | else: 68 | return "not set" 69 | 70 | 71 | def convert_to_videos(response, youtube_client): 72 | videos = list() 73 | for data in response['items']: 74 | video_id = data['contentDetails']['videoId'] 75 | video_metadata = _get_video_metadata(video_id, youtube_client) 76 | metadata = video_metadata['items'][0] 77 | 78 | next_video = video(video_id=video_id, 79 | video_published=data['snippet']['publishedAt'], 80 | channel_id=data['snippet']['channelId'], 81 | video_title=data['snippet']['title'], 82 | video_description=data['snippet'].get('description', 'not set'), 83 | video_channel_title=data['snippet']['channelTitle'], 84 | 
video_tags=metadata['snippet'].get('tags', 'not set'), 85 | video_category_id=metadata['snippet'].get('categoryId', 'not set'), 86 | video_default_language=metadata['snippet'].get('defaultLanguage', 'not set'), 87 | video_duration=metadata['contentDetails']['duration'], 88 | video_view_count=metadata['statistics'].get('viewCount', 0), 89 | video_comment_count=metadata['statistics'].get('commentCount', 0), 90 | video_likes_count=metadata['statistics'].get('likeCount', 0), 91 | video_dislikes_count=metadata['statistics'].get('dislikeCount', 0), 92 | video_topic_ids=_get_topic_ids(metadata), 93 | video_topic_categories=_get_topic_categories(metadata) 94 | ) 95 | videos.append(next_video) 96 | 97 | return videos 98 | 99 | 100 | def write_videos(videos, video_file): 101 | with open(video_file, "a") as csv_file: 102 | writer = _csv.DictWriter(csv_file, fieldnames=_get_video_header()) 103 | if _is_empty_file(video_file): 104 | writer.writeheader() 105 | 106 | for video_row in videos: 107 | writer.writerow(_convert_to_dictionary(video_row)) 108 | -------------------------------------------------------------------------------- /TopicModelling/getTokens.py: -------------------------------------------------------------------------------- 1 | import re 2 | import sqlite3 3 | import pandas as pd 4 | import numpy as np 5 | import matplotlib.pyplot as plt, mpld3 6 | import time 7 | import re 8 | import os 9 | import nltk 10 | import pickle 11 | import operator 12 | #import glove_python 13 | from matplotlib.font_manager import FontProperties 14 | from nltk.stem.snowball import SnowballStemmer 15 | from nltk.stem.wordnet import WordNetLemmatizer 16 | from nltk.corpus import stopwords 17 | from scipy.interpolate import spline 18 | from datetime import datetime, timedelta 19 | from collections import OrderedDict 20 | from sklearn.feature_extraction.text import TfidfVectorizer 21 | from sklearn.metrics.pairwise import cosine_similarity 22 | from sklearn.cluster import KMeans 23 | from sklearn.externals import joblib 24 | from sklearn.manifold import MDS 25 | from sklearn.manifold import TSNE 26 | from sklearn.decomposition import PCA 27 | from gensim.models import Word2Vec 28 | from gensim.models import fasttext 29 | from gensim.scripts.word2vec2tensor import word2vec2tensor 30 | from matplotlib import pyplot 31 | from adjustText import adjust_text 32 | 33 | def getTokens(li_strings='', stemming=False, lemmatizing=False): 34 | if stemming: 35 | global di_stems 36 | di_stems = pickle.load(open('di_stems.p', 'rb')) 37 | 38 | # print('imported') 39 | #do some cleanup: only alphabetic characters, no stopwords 40 | # create separate stemmed tokens, to which the full strings will be compared to: 41 | li_comments_stemmed = [] 42 | len_comments = len(li_strings) 43 | # print(len(li_strings)) 44 | # print('Creating list of tokens per monthly document') 45 | for index, comment in enumerate(li_strings): 46 | #create list of list for comments and tokens 47 | if isinstance(comment, str): 48 | li_comment_stemmed = [] 49 | li_comment_stemmed = getFilteredText(comment, stemming=stemming, lemmatizing=lemmatizing) 50 | li_comments_stemmed.append(li_comment_stemmed) 51 | #if index % 1000 == 0: 52 | #print('Stemming/tokenising finished for string ' + str(index) + '/' + str(len_comments)) 53 | # print(len(li_comments_stemmed)) 54 | 55 | if stemming: 56 | pickle.dump(di_stems, open('di_stems.p', 'wb')) 57 | df_stems = pd.DataFrame.from_dict(di_stems, orient='index') 58 | df_stems.to_csv('di_stems_dataframe.csv', encoding='utf-8') 
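# di_stems maps every stemmed token to the list of raw tokens it was derived from;
# persisting it as a pickle and a CSV makes it possible to translate stems back
# into readable words in later analysis steps.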
59 | 60 | return li_comments_stemmed 61 | 62 | def getFilteredText(string, stemming=False, lemmatizing=False): 63 | #first, remove urls 64 | if 'http' in string: 65 | string = re.sub(r'https?:\/\/.*[\r\n]*', ' ', string) 66 | if 'www.' in string: 67 | string = re.sub(r'www.*[\r\n]*', ' ', string) 68 | 69 | #use nltk's tokeniser to get a list of words 70 | # from nltk.tokeimport TreebankWordTokenizer 71 | # tokenizer = TreebankWordTokenizer() 72 | # tokenizer.PARENS_BRACKETS = [] 73 | # tokens = [word.lower() for sent in nltk.sent_tokenize(string) for word in tokenizer.tokenize(sent)] 74 | tokens = re.findall("[a-zA-Z\-\)\(]{3,50}", string) 75 | stemmer = SnowballStemmer("english") 76 | #list with tokens further processed 77 | li_filtered_tokens = [] 78 | # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation) 79 | for token in tokens: 80 | token = token.lower() 81 | #print(len(tokens)) 82 | #only alphabetic characters, keep '(' and ')' symbols for echo brackets, only tokens with three or more characters 83 | #if re.search('[a-zA-Z\-\)\(]{3,50}', token): 84 | if re.match('[a-zA-Z\-\)\(]{3,50}', token) is not None: 85 | #no stopwords 86 | if token not in stopwords.words('english'): 87 | #token = token.lower() 88 | #shorten word if it's longer than 20 characters (e.g. 'reeeeeeeeeeeeeeeeeeeeeeeee') 89 | if len(token) >= 20: 90 | token = token[:20] 91 | #stem if indicated it should be stemmed 92 | if stemming: 93 | token_stemmed = stemmer.stem(token) 94 | li_filtered_tokens.append(token_stemmed) 95 | 96 | #update lookup dict with token and stemmed token 97 | #lookup dict is dict of stemmed words as keys and lists as full tokens 98 | if token_stemmed in di_stems: 99 | if token not in di_stems[token_stemmed]: 100 | di_stems[token_stemmed].append(token) 101 | else: 102 | di_stems[token_stemmed] = [] 103 | di_stems[token_stemmed].append(token) 104 | #if lemmatizing is used instead 105 | elif lemmatizing: 106 | lemmatizer = WordNetLemmatizer() 107 | token = lemmatizer.lemmatize(token) 108 | li_filtered_tokens.append(token) 109 | else: 110 | li_filtered_tokens.append(token) 111 | return li_filtered_tokens -------------------------------------------------------------------------------- /DataCollection/src/youtubecollector/comments.py: -------------------------------------------------------------------------------- 1 | import csv as _csv 2 | from collections import namedtuple as _namedtuple 3 | 4 | from googleapiclient.errors import HttpError 5 | 6 | from .util import is_empty_file as _is_empty_file 7 | from .util import convert_to_dictionary as _convert_to_dictionary 8 | 9 | comment = _namedtuple("comment", ('video_id', 10 | 'comment_id', 11 | 'author_display_name', 12 | 'author_channel_url', 13 | 'author_channel_id', 14 | 'comment_text', 15 | 'comment_like_count', 16 | 'comment_dislike_count', 17 | 'comment_time', 18 | 'reply_count')) 19 | 20 | 21 | def _get_comment_header(): 22 | return comment._fields 23 | 24 | 25 | def get_comments(video_id, youtube_client): 26 | try: 27 | return youtube_client.commentThreads().list( 28 | videoId=video_id, 29 | part='snippet,replies', 30 | maxResults=100 31 | ).execute() 32 | except HttpError: 33 | return 34 | 35 | 36 | def get_more_comments(video_id, youtube_client, next_page_token): 37 | try: 38 | return youtube_client.commentThreads().list( 39 | videoId=video_id, 40 | part='snippet,replies', 41 | pageToken=next_page_token, 42 | maxResults=100 43 | ).execute() 44 | except HttpError: 45 | return 46 | 47 | 48 | def 
_get_author_channel_id(data): 49 | if "authorChannelId" in data['snippet']['topLevelComment']['snippet']: 50 | return data['snippet']['topLevelComment']['snippet']['authorChannelId'].get("value", 'not set') 51 | else: 52 | return "not set" 53 | 54 | 55 | def convert_to_comments(response): 56 | if response is None: 57 | return list() 58 | 59 | comments = list() 60 | for data in response['items']: 61 | comments.append(comment(comment_id=data['id'], 62 | video_id=data['snippet']['videoId'], 63 | author_display_name=data['snippet']['topLevelComment']['snippet']['authorDisplayName'], 64 | author_channel_url=data['snippet']['topLevelComment']['snippet']['authorChannelUrl'], 65 | author_channel_id=_get_author_channel_id(data), 66 | comment_text=data['snippet']['topLevelComment']['snippet']['textDisplay'], 67 | comment_like_count=data['snippet']['topLevelComment']['snippet']['likeCount'], 68 | comment_dislike_count=data['snippet']['topLevelComment']['snippet'].get('disLikeCount', 0), 69 | comment_time=data['snippet']['topLevelComment']['snippet']['publishedAt'], 70 | reply_count=data['snippet']['totalReplyCount']) 71 | ) 72 | if 'replies' in data: 73 | for reply in data['replies']['comments']: 74 | # Replies can be recognized by the format of their id: 75 | # The id is made out of two elements: {parent_comment_id}.{reply_id} 76 | # TODO[Olaf]: Do we want to add a boolean field if something is an reply 77 | 78 | comments.append(comment(comment_id=reply['id'], 79 | video_id=reply['snippet']['videoId'], 80 | author_display_name=reply['snippet']['authorDisplayName'], 81 | author_channel_url=reply['snippet']['authorChannelUrl'], 82 | author_channel_id=reply['snippet']['authorChannelId']['value'], 83 | comment_text=reply['snippet']['textDisplay'], 84 | comment_like_count=reply['snippet']['likeCount'], 85 | comment_dislike_count='', 86 | comment_time=reply['snippet']['publishedAt'], 87 | reply_count='')) 88 | 89 | return comments 90 | 91 | 92 | def write_comments(comments_file, comments): 93 | with open(comments_file, 'a') as csv_file: 94 | writer = _csv.DictWriter(csv_file, fieldnames=_get_comment_header()) 95 | if _is_empty_file(comments_file): 96 | writer.writeheader() 97 | 98 | for comment_row in comments: 99 | writer.writerow(_convert_to_dictionary(comment_row)) 100 | -------------------------------------------------------------------------------- /TopicModelling/language_detection/spacy-language-detection.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pickle\n", 10 | "from collections import namedtuple\n", 11 | "from tqdm import tqdm_notebook as tqdm\n", 12 | "\n", 13 | "import pandas as pd\n", 14 | "import spacy\n", 15 | "from spacy_cld import LanguageDetector\n" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "nlp = spacy.load('en_core_web_sm')\n", 32 | "language_detector = LanguageDetector()\n", 33 | "nlp.add_pipe(language_detector)" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "# Util function to write lazy chunks back to disk\n", 43 | "output_columns = [\"id\", \"hash1\", \"hash2\", \"user\", 
\"user_pic\", \"channel_url\", \"channel_id\", \"comment\", \n", 44 | " \"depth\", \"timestamp\", \"language\"]\n", 45 | "comments_path = \"data/comments_language.csv\"\n", 46 | "def write_to_disk(chunk):\n", 47 | " with open(comments_path, \"a\", encoding=\"utf-8\") as file:\n", 48 | " chunk.to_csv(file, index=False, header=False)\n", 49 | "\n", 50 | "# Lazy data reader into DataFrame\n", 51 | "transcripts_reader = pd.read_csv(\"data/comments.csv\", chunksize=500, names=output_columns[:-1])\n", 52 | "# Reader to pick up where we ended\n", 53 | "completed_reader = pd.read_csv(\"data/comments_language.csv\", chunksize=500, names=output_columns)" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 9, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "# Run this to move the transcripts_reader iterator to where we stopped last time\n", 63 | "for _ in completed_reader:\n", 64 | " next(transcripts_reader)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "for ix, transcripts in enumerate(tqdm(transcripts_reader)):\n", 74 | " languages = []\n", 75 | " for ix, transcript in transcripts.iterrows():\n", 76 | " content = str(transcript[7])\n", 77 | " doc = nlp(content)\n", 78 | " try:\n", 79 | " language = doc._.languages[0]\n", 80 | " except IndexError:\n", 81 | " language = \"?\"\n", 82 | " languages.append(language)\n", 83 | " transcripts[\"language\"] = languages\n", 84 | " write_to_disk(transcripts)" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 10, 90 | "metadata": {}, 91 | "outputs": [ 92 | { 93 | "data": { 94 | "application/vnd.jupyter.widget-view+json": { 95 | "model_id": "66fbb3d0704343cb8d9eb501f7dab277", 96 | "version_major": 2, 97 | "version_minor": 0 98 | }, 99 | "text/plain": [ 100 | "HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))" 101 | ] 102 | }, 103 | "metadata": {}, 104 | "output_type": "display_data" 105 | } 106 | ], 107 | "source": [ 108 | "for ix, transcripts in enumerate(tqdm(transcripts_reader)):\n", 109 | " pass" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [] 118 | } 119 | ], 120 | "metadata": { 121 | "kernelspec": { 122 | "display_name": "Python 3", 123 | "language": "python", 124 | "name": "python3" 125 | }, 126 | "language_info": { 127 | "codemirror_mode": { 128 | "name": "ipython", 129 | "version": 3 130 | }, 131 | "file_extension": ".py", 132 | "mimetype": "text/x-python", 133 | "name": "python", 134 | "nbconvert_exporter": "python", 135 | "pygments_lexer": "ipython3", 136 | "version": "3.6.7" 137 | }, 138 | "varInspector": { 139 | "cols": { 140 | "lenName": 16, 141 | "lenType": 16, 142 | "lenVar": 40 143 | }, 144 | "kernels_config": { 145 | "python": { 146 | "delete_cmd_postfix": "", 147 | "delete_cmd_prefix": "del ", 148 | "library": "var_list.py", 149 | "varRefreshCmd": "print(var_dic_list())" 150 | }, 151 | "r": { 152 | "delete_cmd_postfix": ") ", 153 | "delete_cmd_prefix": "rm(", 154 | "library": "var_list.r", 155 | "varRefreshCmd": "cat(var_dic_list()) " 156 | } 157 | }, 158 | "types_to_exclude": [ 159 | "module", 160 | "function", 161 | "builtin_function_or_method", 162 | "instance", 163 | "_Feature" 164 | ], 165 | "window_display": false 166 | } 167 | }, 168 | "nbformat": 4, 169 | "nbformat_minor": 1 170 | } 171 | 
-------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/arguing_lexicon.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | 4 | from spacy.tokens import Doc 5 | from spacy_arguing_lexicon.arguments import ArgumentTexts 6 | from spacy_arguing_lexicon.exceptions import LexiconMissingError 7 | 8 | 9 | class ArguingLexiconParser(object): 10 | 11 | MACROS_PATH = os.path.join(os.path.dirname(__file__), "lexicon", "{}", "macros") 12 | PATTERNS_PATH = os.path.join(os.path.dirname(__file__), "lexicon", "{}", "patterns") 13 | 14 | MACRO_PATTERN = re.compile("(@[A-Z0-9]+)") 15 | 16 | MACROS = {} 17 | PATTERNS = {} 18 | 19 | def package_check(self, lang): 20 | if not os.path.exists(self.MACROS_PATH.format(lang)): 21 | raise LexiconMissingError( 22 | "Trying to load Arguing Lexicon without macros file for language {}".format(lang) 23 | ) 24 | if not os.path.exists(self.PATTERNS_PATH.format(lang)): 25 | raise LexiconMissingError( 26 | "Trying to load Arguing Lexicon without patterns file for language {}".format(lang) 27 | ) 28 | 29 | def load_macros(self, lang): 30 | for entry in os.listdir(self.MACROS_PATH.format(lang)): 31 | if not entry.endswith(".tff"): 32 | continue 33 | with open(os.path.join(self.MACROS_PATH.format(lang), entry)) as macro_file: 34 | for macro_line in macro_file.readlines(): 35 | # Skip empty lines, class definitions and comments 36 | if not macro_line.strip(): 37 | continue 38 | if macro_line.startswith("#"): 39 | continue 40 | # Add macros 41 | macro_label, macro_definition = self.preprocess_pattern(macro_line).split("=") 42 | macro = [mcr.strip() for mcr in macro_definition.strip().strip("{}").split(",")] 43 | self.MACROS[macro_label] = macro 44 | 45 | def preprocess_pattern(self, pattern): 46 | stripped_pattern = pattern.replace("\\'", "'").strip() 47 | return "{}\\b".format(stripped_pattern) # the \b makes sure that a match ends with a non-word token 48 | 49 | def compile_pattern(self, pattern): 50 | macro_match = self.MACRO_PATTERN.search(pattern) 51 | if macro_match is None: 52 | yield re.compile(self.preprocess_pattern(pattern), flags=re.IGNORECASE) 53 | else: 54 | macro = macro_match.group(0) 55 | macro_replacement = "|".join(self.MACROS[macro]) 56 | replaced_pattern = pattern.replace(macro, macro_replacement) 57 | for preprocessed_pattern in self.compile_pattern(replaced_pattern): 58 | yield preprocessed_pattern 59 | 60 | def load_patterns(self, lang): 61 | for entry in os.listdir(self.PATTERNS_PATH.format(lang)): 62 | if not entry.endswith(".tff"): 63 | continue 64 | with open(os.path.join(self.PATTERNS_PATH.format(lang), entry)) as patterns_file: 65 | pattern_class = None 66 | for pattern_line in patterns_file.readlines(): 67 | # Skip empty lines and comments 68 | if not pattern_line.strip(): 69 | continue 70 | if pattern_line.startswith("#") and pattern_class: 71 | continue 72 | # Read pattern class 73 | elif pattern_line.startswith("#"): 74 | trash, pattern_class = pattern_line.replace('"', "").split("=") 75 | pattern_class = pattern_class.strip() 76 | self.PATTERNS[pattern_class] = [] 77 | continue 78 | # Add patterns 79 | for preprocessed_patterns in self.compile_pattern(pattern_line): 80 | self.PATTERNS[pattern_class].append(preprocessed_patterns) 81 | 82 | def get_arguing_matches(self, doc): 83 | for arguing_label, arguing_patterns in self.PATTERNS.items(): 84 | for arguing_pattern in arguing_patterns: 85 | match = 
arguing_pattern.search(doc.text) 86 | if match is not None: 87 | yield arguing_label, match 88 | 89 | def get_lexicon_vocabulary(self): 90 | vocabulary = set() 91 | for label, patterns in self.PATTERNS.items(): 92 | for compiled in patterns: 93 | words = "".join([char if char.isalnum() or char == "'" else " " for char in compiled.pattern]) 94 | for word in words.split(" "): 95 | if len(word) <= 1 and not word == "I": 96 | continue 97 | vocabulary.add(word) 98 | return vocabulary 99 | 100 | def __init__(self, lang="en"): 101 | super().__init__() 102 | self.package_check(lang) 103 | self.load_macros(lang) 104 | self.load_patterns(lang) 105 | if not Doc.has_extension('arguments'): 106 | Doc.set_extension('arguments', getter=ArgumentTexts(self), force=True) 107 | else: 108 | default, method, getter, setter = Doc.get_extension('arguments') 109 | assert isinstance(getter, ArgumentTexts), \ 110 | "Expected 'arguments' extension to be of type ArgumentTexts " \ 111 | "but found {}. Namespace clash?".format(type(Doc.get_extension('arguments'))) 112 | 113 | def __call__(self, doc): 114 | # All parsing is lazy 115 | return doc 116 | -------------------------------------------------------------------------------- /DataCollection/tests/youtubecollector/resources/nullable_fields_channel_response.json: -------------------------------------------------------------------------------- 1 | { 2 | "kind": "youtube#channelListResponse", 3 | "etag": "\"XpPGQXPnxQJhLgs6enD_n8JR4Qk/OeMJN9Cnx6jGn9D5tetafViyG0U\"", 4 | "pageInfo": { 5 | "totalResults": 1, 6 | "resultsPerPage": 1 7 | }, 8 | "items": [ 9 | { 10 | "kind": "youtube#channel", 11 | "etag": "\"XpPGQXPnxQJhLgs6enD_n8JR4Qk/3t1aM54tFd_H-BQTFGM9M4ZNq1g\"", 12 | "id": "Some_ID", 13 | "snippet": { 14 | "title": "The test channel", 15 | "description": "The Official YouTube Channel for testing", 16 | "publishedAt": "2015-04-03T17:02:57.000Z", 17 | "thumbnails": { 18 | "default": { 19 | "url": "https://yt3.ggpht.com/a-/AAuE7mCD6Gw54_JFbHqq-6fIZynVB3B7grol7PQtYA=s88-mo-c-c0xffffffff-rj-k-no", 20 | "width": 88, 21 | "height": 88 22 | }, 23 | "medium": { 24 | "url": "https://yt3.ggpht.com/a-/AAuE7mCD6Gw54_JFbHqq-6fIZynVB3B7grol7PQtYA=s240-mo-c-c0xffffffff-rj-k-no", 25 | "width": 240, 26 | "height": 240 27 | }, 28 | "high": { 29 | "url": "https://yt3.ggpht.com/a-/AAuE7mCD6Gw54_JFbHqq-6fIZynVB3B7grol7PQtYA=s800-mo-c-c0xffffffff-rj-k-no", 30 | "width": 800, 31 | "height": 800 32 | } 33 | }, 34 | "localized": { 35 | "title": "Rand Paul", 36 | "description": "The Official YouTube Channel of Rand Paul" 37 | } 38 | }, 39 | "contentDetails": { 40 | "relatedPlaylists": { 41 | "watchHistory": "HL", 42 | "watchLater": "WL" 43 | } 44 | }, 45 | "statistics": { 46 | "viewCount": "2640735", 47 | "commentCount": "0", 48 | "subscriberCount": "9779", 49 | "hiddenSubscriberCount": false, 50 | "videoCount": "258" 51 | }, 52 | "topicDetails": { 53 | }, 54 | "brandingSettings": { 55 | "channel": { 56 | "title": "Rand Paul", 57 | "description": "The Official YouTube Channel of Rand Paul", 58 | "defaultTab": "Featured", 59 | "trackingAnalyticsAccountId": "UA-57201184-2", 60 | "showRelatedChannels": true, 61 | "showBrowseView": true, 62 | "unsubscribedTrailer": "KJnn2oi8e7A", 63 | "profileColor": "#000000", 64 | "country": "US" 65 | }, 66 | "image": { 67 | "bannerImageUrl": "https://yt3.ggpht.com/hywTtjeycRX2taWlatnyi1KqC-sVVmNS18aKETNxZQij9aIGeA05djEeihEH_J3hia65H7Vo=w1060-fcrop64=1,00005a57ffffa5a8-nd-c0xffffffff-rj-k-no", 68 | "bannerMobileImageUrl": 
"https://yt3.ggpht.com/hywTtjeycRX2taWlatnyi1KqC-sVVmNS18aKETNxZQij9aIGeA05djEeihEH_J3hia65H7Vo=w640-fcrop64=1,32b75a57cd48a5a8-nd-c0xffffffff-rj-k-no", 69 | "bannerTabletLowImageUrl": "https://yt3.ggpht.com/hywTtjeycRX2taWlatnyi1KqC-sVVmNS18aKETNxZQij9aIGeA05djEeihEH_J3hia65H7Vo=w1138-fcrop64=1,00005a57ffffa5a8-nd-c0xffffffff-rj-k-no", 70 | "bannerTabletImageUrl": "https://yt3.ggpht.com/hywTtjeycRX2taWlatnyi1KqC-sVVmNS18aKETNxZQij9aIGeA05djEeihEH_J3hia65H7Vo=w1707-fcrop64=1,00005a57ffffa5a8-nd-c0xffffffff-rj-k-no", 71 | "bannerTabletHdImageUrl": "https://yt3.ggpht.com/hywTtjeycRX2taWlatnyi1KqC-sVVmNS18aKETNxZQij9aIGeA05djEeihEH_J3hia65H7Vo=w2276-fcrop64=1,00005a57ffffa5a8-nd-c0xffffffff-rj-k-no", 72 | "bannerTabletExtraHdImageUrl": "https://yt3.ggpht.com/hywTtjeycRX2taWlatnyi1KqC-sVVmNS18aKETNxZQij9aIGeA05djEeihEH_J3hia65H7Vo=w2560-fcrop64=1,00005a57ffffa5a8-nd-c0xffffffff-rj-k-no", 73 | "bannerMobileLowImageUrl": "https://yt3.ggpht.com/hywTtjeycRX2taWlatnyi1KqC-sVVmNS18aKETNxZQij9aIGeA05djEeihEH_J3hia65H7Vo=w320-fcrop64=1,32b75a57cd48a5a8-nd-c0xffffffff-rj-k-no", 74 | "bannerMobileMediumHdImageUrl": "https://yt3.ggpht.com/hywTtjeycRX2taWlatnyi1KqC-sVVmNS18aKETNxZQij9aIGeA05djEeihEH_J3hia65H7Vo=w960-fcrop64=1,32b75a57cd48a5a8-nd-c0xffffffff-rj-k-no", 75 | "bannerMobileHdImageUrl": "https://yt3.ggpht.com/hywTtjeycRX2taWlatnyi1KqC-sVVmNS18aKETNxZQij9aIGeA05djEeihEH_J3hia65H7Vo=w1280-fcrop64=1,32b75a57cd48a5a8-nd-c0xffffffff-rj-k-no", 76 | "bannerMobileExtraHdImageUrl": "https://yt3.ggpht.com/hywTtjeycRX2taWlatnyi1KqC-sVVmNS18aKETNxZQij9aIGeA05djEeihEH_J3hia65H7Vo=w1440-fcrop64=1,32b75a57cd48a5a8-nd-c0xffffffff-rj-k-no", 77 | "bannerTvImageUrl": "https://yt3.ggpht.com/hywTtjeycRX2taWlatnyi1KqC-sVVmNS18aKETNxZQij9aIGeA05djEeihEH_J3hia65H7Vo=w2120-fcrop64=1,00000000ffffffff-nd-c0xffffffff-rj-k-no", 78 | "bannerTvLowImageUrl": "https://yt3.ggpht.com/hywTtjeycRX2taWlatnyi1KqC-sVVmNS18aKETNxZQij9aIGeA05djEeihEH_J3hia65H7Vo=w854-fcrop64=1,00000000ffffffff-nd-c0xffffffff-rj-k-no", 79 | "bannerTvMediumImageUrl": "https://yt3.ggpht.com/hywTtjeycRX2taWlatnyi1KqC-sVVmNS18aKETNxZQij9aIGeA05djEeihEH_J3hia65H7Vo=w1280-fcrop64=1,00000000ffffffff-nd-c0xffffffff-rj-k-no", 80 | "bannerTvHighImageUrl": "https://yt3.ggpht.com/hywTtjeycRX2taWlatnyi1KqC-sVVmNS18aKETNxZQij9aIGeA05djEeihEH_J3hia65H7Vo=w1920-fcrop64=1,00000000ffffffff-nd-c0xffffffff-rj-k-no" 81 | }, 82 | "hints": [ 83 | { 84 | "property": "channel.banner.mobile.medium.image.url", 85 | "value": "https://yt3.ggpht.com/hywTtjeycRX2taWlatnyi1KqC-sVVmNS18aKETNxZQij9aIGeA05djEeihEH_J3hia65H7Vo=w640-fcrop64=1,32b75a57cd48a5a8-nd-c0xffffffff-rj-k-no" 86 | }, 87 | { 88 | "property": "channel.featured_tab.template.string", 89 | "value": "Everything" 90 | }, 91 | { 92 | "property": "channel.modules.show_comments.bool", 93 | "value": "True" 94 | } 95 | ] 96 | } 97 | } 98 | ] 99 | } -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/arguing-lexicon-lda.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%load_ext autoreload\n", 10 | "%autoreload 2\n", 11 | "\n", 12 | "import sys\n", 13 | "sys.path.append(\"../\")\n" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "import pickle\n", 23 | "\n", 24 | "import pandas as pd\n", 25 | "from 
sklearn.feature_extraction.text import CountVectorizer\n", 26 | "from sklearn.decomposition import LatentDirichletAllocation" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "# Lazy data reader into DataFrame\n", 36 | "def read_argument_captions():\n", 37 | " transcripts_reader = pd.read_csv(\"data/captions_arguments.csv\", chunksize=10)\n", 38 | " for batch in transcripts_reader:\n", 39 | " for ix, caption in batch.iterrows():\n", 40 | " text = \"\"\n", 41 | " for fragment, argument_label in zip(str(caption[\"content\"]).split(\"\\n\"), str(caption[\"argument_labels\"]).split(\"\\n\")):\n", 42 | " if argument_label:\n", 43 | " text += fragment + \" \"\n", 44 | " yield text\n" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "# Training a tfidf vectorizer\n", 54 | "vectorizer = CountVectorizer(stop_words=\"english\")\n", 55 | "matrix = vectorizer.fit_transform(read_argument_captions())\n", 56 | "feature_names = vectorizer.get_feature_names()" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "with open(\"models/vectorizer.pkl\", \"rb\") as count_file:\n", 66 | " vectorizer = pickle.load(count_file)\n", 67 | "with open(\"models/vectorizer_matrix.pkl\", \"rb\") as matrix_file:\n", 68 | " matrix = pickle.load(matrix_file)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "# Saving progress\n", 78 | "with open(\"models/vectorizer.pkl\", \"wb\") as count_file:\n", 79 | " pickle.dump(vectorizer, count_file)\n", 80 | "with open(\"models/vectorizer_matrix.pkl\", \"wb\") as matrix_file:\n", 81 | " pickle.dump(matrix, matrix_file)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "# Training the LDA model\n", 91 | "lda_model = LatentDirichletAllocation(n_topics=50, max_iter=500, verbose=3, n_jobs=-1, learning_method=\"online\")\n", 92 | "lda_model.fit(matrix)" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "# Saving progress\n", 102 | "with open(\"models/lda.50.pkl\", \"wb\") as lda_file:\n", 103 | " pickle.dump(lda_model, lda_file)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "def print_top_words(model, feature_names, n_top_words):\n", 113 | " for topic_idx, topic in enumerate(model.components_):\n", 114 | " print(\"Topic #%d:\" % topic_idx)\n", 115 | " print(\" | \".join([feature_names[i]\n", 116 | " for i in topic.argsort()[:-n_top_words - 1:-1]]))\n", 117 | " print()\n", 118 | " print()\n", 119 | " print()" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "print_top_words(lda_model, feature_names, 50)" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [] 137 | } 138 | ], 139 | "metadata": { 140 | "kernelspec": { 141 | "display_name": "Python [conda env:ml]", 142 | "language": "python", 143 | "name": "conda-env-ml-py" 144 | }, 145 | "language_info": { 146 | 
"codemirror_mode": { 147 | "name": "ipython", 148 | "version": 3 149 | }, 150 | "file_extension": ".py", 151 | "mimetype": "text/x-python", 152 | "name": "python", 153 | "nbconvert_exporter": "python", 154 | "pygments_lexer": "ipython3", 155 | "version": "3.6.6" 156 | }, 157 | "varInspector": { 158 | "cols": { 159 | "lenName": 16, 160 | "lenType": 16, 161 | "lenVar": 40 162 | }, 163 | "kernels_config": { 164 | "python": { 165 | "delete_cmd_postfix": "", 166 | "delete_cmd_prefix": "del ", 167 | "library": "var_list.py", 168 | "varRefreshCmd": "print(var_dic_list())" 169 | }, 170 | "r": { 171 | "delete_cmd_postfix": ") ", 172 | "delete_cmd_prefix": "rm(", 173 | "library": "var_list.r", 174 | "varRefreshCmd": "cat(var_dic_list()) " 175 | } 176 | }, 177 | "types_to_exclude": [ 178 | "module", 179 | "function", 180 | "builtin_function_or_method", 181 | "instance", 182 | "_Feature" 183 | ], 184 | "window_display": false 185 | } 186 | }, 187 | "nbformat": 4, 188 | "nbformat_minor": 2 189 | } 190 | -------------------------------------------------------------------------------- /TopicModelling/arguing_lexicon/arguing-lexicon-filter.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pickle\n", 10 | "from collections import namedtuple\n", 11 | "\n", 12 | "import pandas as pd\n", 13 | "from arguing_lexicon import ArguingLexiconParser\n", 14 | "from tqdm import tqdm_notebook as tqdm\n", 15 | "\n", 16 | "input_data_csv = \"data/captions_metadata.csv\"" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 2, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "# Lazy data reader into DataFrame\n", 26 | "transcripts_reader = pd.read_csv(\"data/captions_metadata.csv\", chunksize=10)\n", 27 | "\n", 28 | "# Dummy class for convenience and speed\n", 29 | "Doc = namedtuple(\"Doc\", [\"text\"])\n", 30 | "\n", 31 | "# Util function to write lazy chunks back to disk\n", 32 | "output_columns = [\"id\", \"content\", \"date\", \"title\", \"unknown\", \"channel\", \"fragments\", \"argument_fragments\",\n", 33 | " \"argument_labels\", \"argument_content\"]\n", 34 | "arguments_path = \"data/captions_arguments.csv\"\n", 35 | "def write_to_disk(chunk):\n", 36 | " with open(arguments_path, \"a\", encoding=\"utf-8\") as arguments_file:\n", 37 | " chunk.to_csv(arguments_file, index=False)\n", 38 | "\n", 39 | "# Arguing lexixon parser\n", 40 | "parser = ArguingLexiconParser()\n" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [ 48 | { 49 | "data": { 50 | "application/vnd.jupyter.widget-view+json": { 51 | "model_id": "bbd877813eba4fe1b5db68f79ebdb966", 52 | "version_major": 2, 53 | "version_minor": 0 54 | }, 55 | "text/plain": [ 56 | "HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))" 57 | ] 58 | }, 59 | "metadata": {}, 60 | "output_type": "display_data" 61 | }, 62 | { 63 | "name": "stdout", 64 | "output_type": "stream", 65 | "text": [ 66 | "\n" 67 | ] 68 | } 69 | ], 70 | "source": [ 71 | "for ix, transcripts in enumerate(tqdm(transcripts_reader)):\n", 72 | " if ix <= 1183:\n", 73 | " continue\n", 74 | " arguments_frame = pd.DataFrame(columns=output_columns)\n", 75 | " for ix, transcript in transcripts.iterrows():\n", 76 | " content = str(transcript[\"content\"]).split(\"\\n\")\n", 77 | " labels = []\n", 78 | " arguments = 
[]\n", 79 | " argument_fragments = 0\n", 80 | " for con in content:\n", 81 | " doc = Doc(con)\n", 82 | " matches = list(parser.get_arguing_matches(doc))\n", 83 | " if len(matches):\n", 84 | " argument_fragments += 1\n", 85 | " lbls = []\n", 86 | " args = []\n", 87 | " for label, match in matches:\n", 88 | " lbls.append(label)\n", 89 | " args.append(match.group(0))\n", 90 | " labels.append(\"\\t\".join(lbls))\n", 91 | " arguments.append(\"\\t\".join(args))\n", 92 | " argument_serie = pd.Series(data={\n", 93 | " \"id\": transcript[\"id\"],\n", 94 | " \"content\": transcript[\"content\"],\n", 95 | " \"date\": transcript[\"date\"],\n", 96 | " \"title\": transcript[\"title\"],\n", 97 | " \"unknown\": transcript[\"unknown\"],\n", 98 | " \"channel\": transcript[\"channel\"],\n", 99 | " \"fragments\": len(content),\n", 100 | " \"argument_fragments\": argument_fragments,\n", 101 | " \"argument_labels\": \"\\n\".join(labels),\n", 102 | " \"argument_content\": \"\\n\".join(arguments)\n", 103 | " })\n", 104 | "\n", 105 | " arguments_frame = arguments_frame.append(argument_serie, ignore_index=True)\n", 106 | " write_to_disk(arguments_frame)" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [] 115 | } 116 | ], 117 | "metadata": { 118 | "kernelspec": { 119 | "display_name": "Python [conda env:ml]", 120 | "language": "python", 121 | "name": "conda-env-ml-py" 122 | }, 123 | "language_info": { 124 | "codemirror_mode": { 125 | "name": "ipython", 126 | "version": 3 127 | }, 128 | "file_extension": ".py", 129 | "mimetype": "text/x-python", 130 | "name": "python", 131 | "nbconvert_exporter": "python", 132 | "pygments_lexer": "ipython3", 133 | "version": "3.6.6" 134 | }, 135 | "varInspector": { 136 | "cols": { 137 | "lenName": 16, 138 | "lenType": 16, 139 | "lenVar": 40 140 | }, 141 | "kernels_config": { 142 | "python": { 143 | "delete_cmd_postfix": "", 144 | "delete_cmd_prefix": "del ", 145 | "library": "var_list.py", 146 | "varRefreshCmd": "print(var_dic_list())" 147 | }, 148 | "r": { 149 | "delete_cmd_postfix": ") ", 150 | "delete_cmd_prefix": "rm(", 151 | "library": "var_list.r", 152 | "varRefreshCmd": "cat(var_dic_list()) " 153 | } 154 | }, 155 | "types_to_exclude": [ 156 | "module", 157 | "function", 158 | "builtin_function_or_method", 159 | "instance", 160 | "_Feature" 161 | ], 162 | "window_display": false 163 | } 164 | }, 165 | "nbformat": 4, 166 | "nbformat_minor": 1 167 | } 168 | -------------------------------------------------------------------------------- /DataCollection/tests/youtubecollector/resources/full_channel_response.json: -------------------------------------------------------------------------------- 1 | { 2 | "kind": "youtube#channelListResponse", 3 | "etag": "\"XpPGQXPnxQJhLgs6enD_n8JR4Qk/OeMJN9Cnx6jGn9D5tetafViyG0U\"", 4 | "pageInfo": { 5 | "totalResults": 1, 6 | "resultsPerPage": 1 7 | }, 8 | "items": [ 9 | { 10 | "kind": "youtube#channel", 11 | "etag": "\"XpPGQXPnxQJhLgs6enD_n8JR4Qk/3t1aM54tFd_H-BQTFGM9M4ZNq1g\"", 12 | "id": "Some_ID", 13 | "snippet": { 14 | "title": "The test channel", 15 | "description": "The Official YouTube Channel for testing", 16 | "publishedAt": "2015-04-03T17:02:57.000Z", 17 | "defaultLanguage": "en", 18 | "thumbnails": { 19 | "default": { 20 | "url": "https://yt3.ggpht.com/a-/AAuE7mCD6Gw54_JFbHqq-6fIZynVB3B7grol7PQtYA=s88-mo-c-c0xffffffff-rj-k-no", 21 | "width": 88, 22 | "height": 88 23 | }, 24 | "medium": { 25 | "url": 
"https://yt3.ggpht.com/a-/AAuE7mCD6Gw54_JFbHqq-6fIZynVB3B7grol7PQtYA=s240-mo-c-c0xffffffff-rj-k-no", 26 | "width": 240, 27 | "height": 240 28 | }, 29 | "high": { 30 | "url": "https://yt3.ggpht.com/a-/AAuE7mCD6Gw54_JFbHqq-6fIZynVB3B7grol7PQtYA=s800-mo-c-c0xffffffff-rj-k-no", 31 | "width": 800, 32 | "height": 800 33 | } 34 | }, 35 | "localized": { 36 | "title": "Rand Paul", 37 | "description": "The Official YouTube Channel of Rand Paul" 38 | }, 39 | "country": "US" 40 | }, 41 | "contentDetails": { 42 | "relatedPlaylists": { 43 | "uploads": "UU_8WUrPbi8clO6sWt_FDvuA", 44 | "watchHistory": "HL", 45 | "watchLater": "WL" 46 | } 47 | }, 48 | "statistics": { 49 | "viewCount": "2640735", 50 | "commentCount": "0", 51 | "subscriberCount": "9779", 52 | "hiddenSubscriberCount": false, 53 | "videoCount": "258" 54 | }, 55 | "topicDetails": { 56 | "topicIds": [ 57 | "topic1", 58 | "topic2", 59 | "topic3" 60 | ], 61 | "topicCategories": [ 62 | "https://en.wikipedia.org/wiki/Society", 63 | "https://en.wikipedia.org/wiki/Politics" 64 | ] 65 | }, 66 | "brandingSettings": { 67 | "channel": { 68 | "title": "Rand Paul", 69 | "description": "The Official YouTube Channel of Rand Paul", 70 | "keywords": "\"Testing is fun\", \"More Testing\"", 71 | "defaultTab": "Featured", 72 | "trackingAnalyticsAccountId": "UA-57201184-2", 73 | "showRelatedChannels": true, 74 | "showBrowseView": true, 75 | "unsubscribedTrailer": "KJnn2oi8e7A", 76 | "profileColor": "#000000", 77 | "country": "US" 78 | }, 79 | "image": { 80 | "bannerImageUrl": "https://yt3.ggpht.com/hywTtjeycRX2taWlatnyi1KqC-sVVmNS18aKETNxZQij9aIGeA05djEeihEH_J3hia65H7Vo=w1060-fcrop64=1,00005a57ffffa5a8-nd-c0xffffffff-rj-k-no", 81 | "bannerMobileImageUrl": "https://yt3.ggpht.com/hywTtjeycRX2taWlatnyi1KqC-sVVmNS18aKETNxZQij9aIGeA05djEeihEH_J3hia65H7Vo=w640-fcrop64=1,32b75a57cd48a5a8-nd-c0xffffffff-rj-k-no", 82 | "bannerTabletLowImageUrl": "https://yt3.ggpht.com/hywTtjeycRX2taWlatnyi1KqC-sVVmNS18aKETNxZQij9aIGeA05djEeihEH_J3hia65H7Vo=w1138-fcrop64=1,00005a57ffffa5a8-nd-c0xffffffff-rj-k-no", 83 | "bannerTabletImageUrl": "https://yt3.ggpht.com/hywTtjeycRX2taWlatnyi1KqC-sVVmNS18aKETNxZQij9aIGeA05djEeihEH_J3hia65H7Vo=w1707-fcrop64=1,00005a57ffffa5a8-nd-c0xffffffff-rj-k-no", 84 | "bannerTabletHdImageUrl": "https://yt3.ggpht.com/hywTtjeycRX2taWlatnyi1KqC-sVVmNS18aKETNxZQij9aIGeA05djEeihEH_J3hia65H7Vo=w2276-fcrop64=1,00005a57ffffa5a8-nd-c0xffffffff-rj-k-no", 85 | "bannerTabletExtraHdImageUrl": "https://yt3.ggpht.com/hywTtjeycRX2taWlatnyi1KqC-sVVmNS18aKETNxZQij9aIGeA05djEeihEH_J3hia65H7Vo=w2560-fcrop64=1,00005a57ffffa5a8-nd-c0xffffffff-rj-k-no", 86 | "bannerMobileLowImageUrl": "https://yt3.ggpht.com/hywTtjeycRX2taWlatnyi1KqC-sVVmNS18aKETNxZQij9aIGeA05djEeihEH_J3hia65H7Vo=w320-fcrop64=1,32b75a57cd48a5a8-nd-c0xffffffff-rj-k-no", 87 | "bannerMobileMediumHdImageUrl": "https://yt3.ggpht.com/hywTtjeycRX2taWlatnyi1KqC-sVVmNS18aKETNxZQij9aIGeA05djEeihEH_J3hia65H7Vo=w960-fcrop64=1,32b75a57cd48a5a8-nd-c0xffffffff-rj-k-no", 88 | "bannerMobileHdImageUrl": "https://yt3.ggpht.com/hywTtjeycRX2taWlatnyi1KqC-sVVmNS18aKETNxZQij9aIGeA05djEeihEH_J3hia65H7Vo=w1280-fcrop64=1,32b75a57cd48a5a8-nd-c0xffffffff-rj-k-no", 89 | "bannerMobileExtraHdImageUrl": "https://yt3.ggpht.com/hywTtjeycRX2taWlatnyi1KqC-sVVmNS18aKETNxZQij9aIGeA05djEeihEH_J3hia65H7Vo=w1440-fcrop64=1,32b75a57cd48a5a8-nd-c0xffffffff-rj-k-no", 90 | "bannerTvImageUrl": "https://yt3.ggpht.com/hywTtjeycRX2taWlatnyi1KqC-sVVmNS18aKETNxZQij9aIGeA05djEeihEH_J3hia65H7Vo=w2120-fcrop64=1,00000000ffffffff-nd-c0xffffffff-rj-k-no", 91 | 
"bannerTvLowImageUrl": "https://yt3.ggpht.com/hywTtjeycRX2taWlatnyi1KqC-sVVmNS18aKETNxZQij9aIGeA05djEeihEH_J3hia65H7Vo=w854-fcrop64=1,00000000ffffffff-nd-c0xffffffff-rj-k-no", 92 | "bannerTvMediumImageUrl": "https://yt3.ggpht.com/hywTtjeycRX2taWlatnyi1KqC-sVVmNS18aKETNxZQij9aIGeA05djEeihEH_J3hia65H7Vo=w1280-fcrop64=1,00000000ffffffff-nd-c0xffffffff-rj-k-no", 93 | "bannerTvHighImageUrl": "https://yt3.ggpht.com/hywTtjeycRX2taWlatnyi1KqC-sVVmNS18aKETNxZQij9aIGeA05djEeihEH_J3hia65H7Vo=w1920-fcrop64=1,00000000ffffffff-nd-c0xffffffff-rj-k-no" 94 | }, 95 | "hints": [ 96 | { 97 | "property": "channel.banner.mobile.medium.image.url", 98 | "value": "https://yt3.ggpht.com/hywTtjeycRX2taWlatnyi1KqC-sVVmNS18aKETNxZQij9aIGeA05djEeihEH_J3hia65H7Vo=w640-fcrop64=1,32b75a57cd48a5a8-nd-c0xffffffff-rj-k-no" 99 | }, 100 | { 101 | "property": "channel.featured_tab.template.string", 102 | "value": "Everything" 103 | }, 104 | { 105 | "property": "channel.modules.show_comments.bool", 106 | "value": "True" 107 | } 108 | ] 109 | } 110 | } 111 | ] 112 | } -------------------------------------------------------------------------------- /TopicModelling/TopicModelWrapper/StreamingParser.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import csv 3 | import json 4 | import sys 5 | 6 | 7 | class StreamingParser(object): 8 | """ 9 | Wrapper class for different approaches to loading texts. 10 | 11 | Included approaches: 12 | - Directory iteration 13 | - JSON iteration (evaluate per line, search for certain keys) 14 | """ 15 | def __init__(self, file_path, iter_methods_index, metadata=False): 16 | self.path = file_path 17 | iter_methods = [self.directory_iterator, self.json_iterator, self.frog_iterator, self.csv_iterator] 18 | self.iter_method = iter_methods[iter_methods_index] 19 | self.metadata = metadata 20 | self.empty_counter = 0 21 | 22 | def __iter__(self): 23 | if self.metadata: 24 | for text, metadata in self.iter_method(): 25 | yield text, metadata 26 | else: 27 | for text in self.iter_method(): 28 | yield text 29 | 30 | def directory_iterator(self): 31 | """ 32 | Iterable object (generator) for aggregating plain text files in a given directory. 33 | """ 34 | for filename in os.listdir(self.path): 35 | with open(os.path.join(self.path, filename), 'r') as file: 36 | # date = file.readline() 37 | yield file.read() 38 | 39 | def frog_iterator(self): 40 | """ 41 | Parser method for parsing frog tar.gz archives. 42 | """ 43 | print("Loading input from Frog file") 44 | 45 | with tarfile.open(self.path, 'r:gz') as tf: 46 | for i, entry in enumerate(tf): 47 | print(i) 48 | if not entry.isdir(): 49 | _id = os.path.basename(entry.name) 50 | 51 | file_path = '{}{}{}'.format(self.path, '/extracted_data/docs/', _id) 52 | with open(file_path, 'r') as f: 53 | _id = f.readline() 54 | _name = f.readline() 55 | _collection = f.readline() 56 | _type = f.readline() 57 | _classification = f.readline() 58 | _date = f.readline() 59 | 60 | entry_string = [] 61 | for line in tf.extractfile(entry): 62 | line = line.decode('utf-8').split('\t') 63 | if line[0] is not '\n': 64 | if line[4][0] == 'N': 65 | entry_string.append(line[2]) 66 | yield ' '.join(entry_string), (_id, _name, _collection, _type, _classification, _date) 67 | 68 | def json_iterator(self): 69 | """ 70 | Iterable object (generator) for aggregations of ORI (Elasticsearch) data. 71 | The aggregations are in JSON format, with each line containing one entry. 
72 | The StreamingJSON object iterates over all lines contained in the file that was 73 | passed as a parameter. 74 | Iter yields only the raw text from the object, in this case 75 | the description field per source. If more than one source is found, 76 | Iter concatenates the results to one string. This string is then returned, 77 | and control is yielded to the caller. If no description is found, the 78 | KeyError exception is caught and a message is printed to the console. 79 | """ 80 | print("Loading input as JSON formatted file") 81 | 82 | with open(self.path) as json_file: 83 | for index, line in enumerate(json_file): 84 | print("extracting line {}".format(index)) 85 | json_data = json.loads(line) 86 | 87 | # Extract all descriptions of the sources and append them to the main data list 88 | doc_data = '' 89 | 90 | try: 91 | _id = json_data['_id'] 92 | _name = json_data["_source"].get('name', "").replace('\n', '').replace('\r', '').replace(',', '') 93 | _collection = json_data["_source"].get('meta', {}).get('collection', "No Collection available") 94 | _type = json_data['_type'] 95 | _classification = json_data["_source"].get('classification', "No classification in data") 96 | _date = json_data["_source"].get('end_date', "No end_date in data") 97 | 98 | for source in json_data['_source']['sources']: 99 | # Add description of data as input 100 | doc_data = ' '.join([doc_data, source['description']]) 101 | 102 | if self.metadata: 103 | yield doc_data, (_id, _name, _collection, _type, _classification, _date) 104 | else: 105 | yield doc_data 106 | except KeyError: 107 | print("No sources key detected!") 108 | self.empty_counter += 1 109 | 110 | def csv_iterator(self): 111 | """ 112 | Parser of CorrespondentEx cleaned csv files. 113 | """ 114 | print("Loading input as JSON formatted file") 115 | with open(self.path) as csv_file: 116 | csv_data = csv.reader(csv_file) 117 | 118 | # Skip the column names 119 | next(csv_data) 120 | # Increase the csv max field size 121 | csv.field_size_limit(sys.maxsize) 122 | 123 | for index, row in enumerate(csv_data): 124 | # if not index % 1000: 125 | print("extracting line {}".format(index)) 126 | 127 | # For some reason the index got duplicated, hence counting from 1 (blasphemy!) 128 | _id = row[1] 129 | _text = ast.literal_eval(row[2]) 130 | 131 | terms = [] 132 | for caption in _text: 133 | for term in caption.split(): 134 | terms.append(term) 135 | 136 | if self.metadata: 137 | yield terms, (_id,) 138 | else: 139 | yield ''.join(terms) 140 | -------------------------------------------------------------------------------- /TopicModelling/Top TfIdf/Right - tfidf top words.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Top TfIdf words for channels\n", 8 | "\n", 9 | "Methodology: similar to https://pudding.cool/2017/09/hip-hop-words/\n", 10 | "\n", 11 | "Merge all videos for each channel for every year and see what makes that channel distinctive and if it changes over time.\n", 12 | "\n", 13 | "Method:\n", 14 | "1. Import cleaned captions\n", 15 | "2. 
Group them by channel and year" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 1, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import pandas as pd\n", 25 | "import datetime as dt\n", 26 | "from spacy.lang.en import English\n", 27 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 28 | "import numpy as np\n", 29 | "import networkx as nx" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "\n", 39 | "\n", 40 | "captions = '/home/dim/Downloads/captions_right.csv'\n", 41 | "videos = '/home/dim/Documents/projecten/extremisme/youtube/data/temp/bubble/right/videos_right.csv'\n", 42 | "\n", 43 | "columns = ['video_id', 'text']\n", 44 | "\n", 45 | "df1 = pd.read_csv(captions, names=columns, low_memory=False)\n", 46 | "df2 = pd.read_csv(videos, low_memory=False)\n", 47 | "df = pd.merge(df1, df2, on='video_id', how='left')\n", 48 | "\n", 49 | "del df1, df2" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 3, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "\n", 59 | "df['video_published'] = pd.to_datetime(df['video_published'])\n", 60 | "df['year'] = df['video_published'].dt.year\n", 61 | "\n", 62 | "df = df.groupby(['video_channel_title', 'year'])['text'].apply(lambda x: x.sum())" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 9, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "df = df.reset_index()" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "###Optional: Lemmatize\n", 81 | "\n", 82 | "tokenizer = English().Defaults.create_tokenizer()\n", 83 | "\n", 84 | "df.text = df.text.apply(lambda x: ' '.join([tok.lemma_ for tok in tokenizer(x)]))" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "## Tfidf values\n", 92 | "\n", 93 | "### Parameter choices\n", 94 | "Followed the pudding hiphop blog. Terms have to appear in at least one in 50 channels (lower than with the pudding, who use one in 10, because we have a very diverse and large set of channels with topics probably changing a lot over time). Used sublinear term frequency (not 10, but 1 + log(9)), because otherwise stop words appear." 
95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "vec = TfidfVectorizer(min_df=.02,sublinear_tf = True)\n", 104 | "res = vec.fit_transform(merged_df.text)\n", 105 | "vocab = {value:key for key,value in vec.vocabulary_.items()}" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": { 112 | "scrolled": true 113 | }, 114 | "outputs": [], 115 | "source": [ 116 | "results = []\n", 117 | "for index in merged_df.index:\n", 118 | " top10words = [vocab[j] for i,j in sorted(zip(res[index].data,res[index].indices),reverse=True)[:10]]\n", 119 | " if len(top10words) < 10:\n", 120 | " continue\n", 121 | " meta = {'year':merged_df.year[index],'channel':merged_df.channel[index],'channel_id':merged_df.channel_id[index]}\n", 122 | " words = ({'word{no}'.format(no=i+1):top10words[i] for i in range(10)})\n", 123 | " results.append({k: v for d in [meta, words] for k, v in d.items()})\n", 124 | "top10words_df = pd.DataFrame(results)\n", 125 | "top10words_df = top10words_df[['year','channel','channel_id']+['word'+str(no) for no in range(1,11)]]\n", 126 | "top10words_df.to_csv('C:/hackathon/top10tfidf_per_channel.csv',index=False)" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": {}, 132 | "source": [ 133 | "## Tfidf top 100 words (for similarity)\n", 134 | "\n", 135 | "Parameter choices same as above, but with json output to preserve list structure" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "results = []\n", 145 | "for index in merged_df.index:\n", 146 | " top100words = [vocab[j] for i,j in sorted(zip(res[index].data,res[index].indices),reverse=True)[:100]]\n", 147 | " if len(top100words) < 100:\n", 148 | " continue\n", 149 | " results.append({'year':merged_df.year[index],\n", 150 | " 'channel':merged_df.channel[index],\n", 151 | " 'channel_id':merged_df.channel_id[index], \n", 152 | " 'words':top100words})\n", 153 | "top100words_df = pd.DataFrame(results)\n", 154 | "top100words_df = top100words_df[['year','channel','channel_id','words']]\n", 155 | "top100words_df.to_json('C:/hackathon/top100tfidf.json')" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": {}, 161 | "source": [ 162 | "## 'Overlap' matrix tfidf" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "\n", 172 | "channel_id = {i:{'year':top100words_df.year[i],\n", 173 | " 'channel':top100words_df.channel[i],\n", 174 | " 'channel_id':top100words_df.channel_id[i]} for i in top100words_df.index}\n", 175 | "top100words_df.words = top100words_df.words.apply(set)\n", 176 | "distance_matrix = np.ones((len(channel_id),len(channel_id)))\n", 177 | "\n", 178 | "for i in range(len(channel_id)):\n", 179 | " for j in range(len(channel_id)):\n", 180 | " if i == j:\n", 181 | " continue\n", 182 | " elif i > j:\n", 183 | " distance_matrix[i,j] = distance_matrix[j,i]\n", 184 | " else:\n", 185 | " distance_matrix[i,j] = len(top100words_df.words[i] & top100words_df.words[j])/100\n", 186 | "\n", 187 | "distance_matrix[distance_matrix < .05] = 0" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [ 196 | "\n", 197 | "G = nx.from_numpy_matrix(distance_matrix)\n", 198 | "\n", 199 | "for i in 
range(len(channel_id)):\n", 200 | " G.node[i].update(channel_id[i])\n", 201 | "#nx.write_gexf(G,'C:/hackathon/tfidf_graph.gexf')" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": null, 207 | "metadata": {}, 208 | "outputs": [], 209 | "source": [ 210 | "nx.write_gexf(G,'C:/hackathon/tfidf_graph.gexf')" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": {}, 217 | "outputs": [], 218 | "source": [ 219 | "merged_df.to_csv('C:/hackathon/merged_right.csv',index = False)" 220 | ] 221 | } 222 | ], 223 | "metadata": { 224 | "kernelspec": { 225 | "display_name": "Python 3", 226 | "language": "python", 227 | "name": "python3" 228 | }, 229 | "language_info": { 230 | "codemirror_mode": { 231 | "name": "ipython", 232 | "version": 3 233 | }, 234 | "file_extension": ".py", 235 | "mimetype": "text/x-python", 236 | "name": "python", 237 | "nbconvert_exporter": "python", 238 | "pygments_lexer": "ipython3", 239 | "version": "3.6.6" 240 | } 241 | }, 242 | "nbformat": 4, 243 | "nbformat_minor": 2 244 | } 245 | -------------------------------------------------------------------------------- /Notebooks/getting_started.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%load_ext autoreload\n", 10 | "%autoreload 2" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "import youtubecollector as ytc\n", 20 | "import pandas as pd\n", 21 | "from tqdm import tqdm_notebook as tqdm" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "## Youtube client setup" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "youtube_client = ytc.youtube_client.create_youtube_client(\"./api.conf\")" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "## Channel Seed\n", 45 | "The pipeline starts with a list of channels for which all videos are checked, for which all comments, recommendations and captations are collected" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "channel_seed_filename = \"input/seeds_right.csv\"\n", 55 | "channel_outputfile = \"output/channels_right.csv\"" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "channel_seed_df = pd.read_csv(channel_seed_filename)\n", 65 | "\n", 66 | "channels = ytc.channels.get_channels(channel_seed_df.loc[0:], youtube_client)\n", 67 | "\n", 68 | "ytc.channels.write_channels(channels, channel_outputfile)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "## Videos" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "video_output_file = \"output/videos_right1.csv\"" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "channels = channels[0:1]" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | 
"source": [ 102 | "all_videos = list()\n", 103 | "\n", 104 | "for channel in tqdm(channels):\n", 105 | " response = ytc.video.get_videos(channel.channel_uploads, youtube_client)\n", 106 | " next_page_token = response.get('nextPageToken')\n", 107 | " videos = ytc.video.convert_to_videos(response, youtube_client)\n", 108 | " all_videos.extend(videos)\n", 109 | " ytc.video.write_videos(videos, video_output_file)\n", 110 | " \n", 111 | " while next_page_token:\n", 112 | " response = ytc.video.get_more_videos(channel.channel_uploads, youtube_client, next_page_token) \n", 113 | " next_page_token = response.get('nextPageToken')\n", 114 | " videos = ytc.video.convert_to_videos(response, youtube_client)\n", 115 | " all_videos.extend(videos)\n", 116 | " ytc.video.write_videos(videos, video_output_file)" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "## Comments" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "comments_output_file = \"output/comments_right1.csv\"" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "all_videos = all_videos[1015:]\n", 142 | "\n", 143 | "all_comments = list()\n", 144 | "for video in tqdm(all_videos):\n", 145 | " response = ytc.comments.get_comments(video.video_id, youtube_client)\n", 146 | " comments = ytc.comments.convert_to_comments(response)\n", 147 | " all_comments.extend(comments)\n", 148 | " ytc.comments.write_comments(comments_output_file, comments)\n", 149 | " try:\n", 150 | " next_page_token = response.get('nextPageToken')\n", 151 | " except AttributeError:\n", 152 | " continue\n", 153 | " \n", 154 | " while next_page_token:\n", 155 | " response = ytc.comments.get_more_comments(video.video_id, youtube_client, next_page_token)\n", 156 | " try:\n", 157 | " next_page_token = response.get('nextPageToken') \n", 158 | " except AttributeError:\n", 159 | " continue\n", 160 | " comments = ytc.comments.convert_to_comments(response)\n", 161 | " all_comments.extend(comments)\n", 162 | " ytc.comments.write_comments(comments_output_file, comments)\n", 163 | " \n", 164 | " " 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "## Recommendations" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "recommendations_output_file = \"output/recommendations_right1.csv\"" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [ 189 | "video_to_recommendations = dict()\n", 190 | "for video in tqdm(all_videos, ):\n", 191 | " try:\n", 192 | " response = ytc.recommendations.get_recommendations(video.video_id, youtube_client)\n", 193 | " except rateLimitExceeded:\n", 194 | " youtube_client = ytc.youtube_client.create_youtube_client(\"./api.conf\")\n", 195 | " pass\n", 196 | " \n", 197 | " recommendations = ytc.recommendations.convert_to_recommendations(response, video.video_id)\n", 198 | " video_to_recommendations[video.video_id]=recommendations\n", 199 | " \n", 200 | " ytc.recommendations.write_recommendations(recommendations_output_file, recommendations)\n", 201 | " " 202 | ] 203 | }, 204 | { 205 | "cell_type": "markdown", 206 | "metadata": {}, 207 | "source": [ 208 | "## Transcripts" 209 | ] 210 | }, 
211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "transcripts_output_file = \"/home/dim/Documents/projecten/extremisme/youtube/yt/YouTubeExtremism/DataCollection/transcripts_right1.csv\"" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "metadata": {}, 224 | "outputs": [], 225 | "source": [ 226 | "ytc.transcripts.get_captions(all_videos)" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": null, 232 | "metadata": {}, 233 | "outputs": [], 234 | "source": [ 235 | "video_id_transcripts = ytc.transcripts.extract_transcripts(\"./*.vtt\")\n", 236 | "\n", 237 | "ytc.transcripts.write_transcripts(transcripts_output_file, video_id_transcripts)" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "metadata": {}, 244 | "outputs": [], 245 | "source": [ 246 | "import shutil\n", 247 | "import glob\n", 248 | "import os\n", 249 | "\n", 250 | "\n", 251 | "for filename in glob.glob('/home/dim/Documents/projecten/extremisme/youtube/yt/YouTubeExtremism/DataCollection/*vtt'):\n", 252 | " os.remove(filename)" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "metadata": {}, 259 | "outputs": [], 260 | "source": [] 261 | } 262 | ], 263 | "metadata": { 264 | "kernelspec": { 265 | "display_name": "correspondent", 266 | "language": "python", 267 | "name": "correspondent" 268 | }, 269 | "language_info": { 270 | "codemirror_mode": { 271 | "name": "ipython", 272 | "version": 3 273 | }, 274 | "file_extension": ".py", 275 | "mimetype": "text/x-python", 276 | "name": "python", 277 | "nbconvert_exporter": "python", 278 | "pygments_lexer": "ipython3", 279 | "version": "3.6.7" 280 | } 281 | }, 282 | "nbformat": 4, 283 | "nbformat_minor": 1 284 | } 285 | -------------------------------------------------------------------------------- /Notebooks/scenariofunctions.py: -------------------------------------------------------------------------------- 1 | # Functions needed 2 | 3 | # Clean socialblade data 4 | # Plot bars 5 | # Select topics in tags 6 | # Select topics in comments 7 | # Include commenters 8 | # Exclude commenters 9 | # Include channels 10 | # Excude channel 11 | 12 | import pandas as pd 13 | from tqdm import tqdm_notebook as tqdm 14 | import matplotlib.pyplot as plt 15 | import datetime as dt 16 | import scenariofunctions as sf 17 | import glob 18 | import csv 19 | import re 20 | import sys 21 | import os 22 | import config 23 | 24 | def socialblade_ranking(channels): 25 | '''takes a messy social blade dataframe 26 | and cleans it up''' 27 | 28 | channels['Source Url'] = channels['Source Url'].str.replace('https://socialblade.com/youtube/channel/', '') 29 | channels['Subscriber_Rank'] = channels['Subscriber_Rank'].replace('\D', '', regex=True).apply(pd.to_numeric) 30 | channels['Video_View_Rank'] = channels['Video_View_Rank'].replace('\D', '', regex=True).apply(pd.to_numeric) 31 | channels['Sb_Rank'] = channels['Sb_Rank'].replace('\D', '', regex=True).apply(pd.to_numeric) 32 | channels['earnings_low'], channels['earnings_high'] = channels['Estimated_Yearly_Earning'].str.split('-', 1).str 33 | channels['earnings_low'] = channels['earnings_low'].replace('st|th|rd|nd', '', regex=True) 34 | channels['earnings_high'] = channels['earnings_high'].replace('st|th|rd|nd', '', regex=True) 35 | channels = channels.rename(columns={'Source Url': 'channel_id', 36 | 'Subscriber_Rank': 'subscriber_rank', 37 | 
'Video_View_Rank': 'video_view_rank', 38 | 'Sb_Rank': 'sb_rank', 39 | 'Grade': 'grade' 40 | }) 41 | channels = channels[['channel_id', 'subscriber_rank', 'video_view_rank', 'sb_rank', 'grade']] 42 | 43 | return channels 44 | 45 | 46 | def socialblade_growth(channel_history): 47 | 48 | pattern = re.compile('(\d{4}-\d{2}-\d+,\d+)') 49 | channel_history['daily_views'] = channel_history['Date_Daily_Views'].str.findall(pattern) 50 | channel_history['daily_subs'] = channel_history['Date_Total_Subs'].str.findall(pattern) 51 | channel_history = channel_history.rename(columns={'User':'channel_id'}) 52 | daily_views = channel_history.set_index('channel_id') \ 53 | .daily_views.apply(pd.Series) \ 54 | .stack() \ 55 | .reset_index(level=-1, drop=True) \ 56 | .reset_index() 57 | 58 | daily_views['date'], daily_views['views'] = daily_views[0].str.split(',', 1).str 59 | daily_views = daily_views[['channel_id', 'date', 'views']] 60 | 61 | daily_subs = channel_history.set_index('channel_id') \ 62 | .daily_subs.apply(pd.Series) \ 63 | .stack() \ 64 | .reset_index(level=-1, drop=True) \ 65 | .reset_index() 66 | 67 | daily_subs['date'], daily_subs['subs'] = daily_subs[0].str.split(',', 1).str 68 | daily_subs = daily_subs[['channel_id', 'date', 'subs']] 69 | 70 | daily_stats = pd.merge(daily_subs, daily_views, how='left', left_on=['channel_id', 'date'], right_on = ['channel_id', 'date']) 71 | daily_stats['yearmonth'] = pd.to_datetime(daily_stats['date']).dt.to_period('M') 72 | 73 | return daily_stats 74 | 75 | 76 | def channel_filter(dataframe, selection): 77 | filtered_data = dataframe[dataframe['video_channel_title'].isin(selection)] 78 | print('deze selectie levert ' + str(len(filtered_data)) + ' videos op.') 79 | return filtered_data 80 | 81 | def channel_filter_exclude(dataframe, selection): 82 | filtered_data = dataframe[~dataframe['video_channel_title'].isin(selection)] 83 | print('deze selectie levert ' + str(len(filtered_data)) + ' videos op.') 84 | return filtered_data 85 | 86 | def add_years_months_to_videos(dataframe): 87 | dataframe.loc[:,('year')] = pd.to_datetime(dataframe.loc[:,('video_published')]).dt.to_period('Y') 88 | dataframe.loc[:,('yearmonth')] = pd.to_datetime(dataframe.loc[:,('video_published')]).dt.to_period('M') 89 | 90 | return dataframe 91 | 92 | 93 | def plot_views_per_year(dataframe): 94 | views_per_year = dataframe.groupby(['year'])['video_view_count'].agg('sum') 95 | fig = plt.figure(figsize=(10,5)) 96 | width = 0.4 97 | ax = fig.add_subplot(111) 98 | views_per_year.plot(kind='bar', color='red', width=width, grid=True) 99 | ax.set_ylabel('number of views') 100 | ax.set_xlabel('year') 101 | 102 | return plt.show() 103 | 104 | def plot_top_channels(dataframe): 105 | top_channels = dataframe['video_channel_title'].value_counts()[0:20] 106 | fig = plt.figure(figsize=(20,10)) # Create matplotlib figure 107 | width = 0.4 108 | ax = fig.add_subplot(111) 109 | top_channels.plot(kind='bar', color='red', width=width, grid=True) 110 | ax.set_ylabel('number of videos published') 111 | ax.set_xlabel('channels') 112 | 113 | return plt.show() 114 | 115 | def plot_users(dataframe): 116 | top_users = dataframe['author_display_name'].value_counts()[0:20] 117 | fig = plt.figure(figsize=(20,10)) # Create matplotlib figure 118 | width = 0.4 119 | ax = fig.add_subplot(111) 120 | top_users.plot(kind='bar', color='red', width=width, grid=True) 121 | ax.set_ylabel('number of comments') 122 | ax.set_xlabel('channels') 123 | 124 | return plt.show() 125 | 126 | 127 | def topic_filter(dataframe, query, 
query_topic): 128 | pattern = '|'.join([s for s in query]) 129 | mask = dataframe['video_tags'].str.contains(pattern, regex=True, case=False, na=False) 130 | topic = dataframe[mask] 131 | print('found ' + str(len(topic)) + ' videos with ' + query_topic) 132 | 133 | return topic 134 | 135 | def zoom_in_on_commenter(dataframe, name): 136 | result = dataframe[dataframe['author_display_name'] == name] 137 | return result 138 | 139 | 140 | def extract_tags(dataframe): 141 | vidtags = dataframe[['video_id', 'video_title', 'video_tags', 'year']] 142 | 143 | video_tags = vidtags['video_tags'].str.replace(r"\[|\]|\'|-", '') \ 144 | .str.lower() \ 145 | .str.split(', ', expand=True) \ 146 | .merge(vidtags, right_index = True, left_index = True) \ 147 | .drop(["video_tags"], axis = 1) \ 148 | .melt(id_vars = ['video_id', 'video_title', 'year'], value_name = "tag") \ 149 | .drop(['variable'], axis=1) \ 150 | .dropna() 151 | 152 | video_tags = video_tags[~video_tags['tag'].str.contains('not set')] 153 | video_tags.sort_values('tag', inplace=True) 154 | video_tags['tag'] = video_tags['tag'].str.replace('"', '') 155 | print('found ' + str(video_tags.tag.nunique()) + ' unique tags') 156 | return video_tags 157 | 158 | def tag_filter(dataframe, tag): 159 | result = dataframe[dataframe['tag'].str.contains(tag)] 160 | print('found ' + str(len(result)) + ' tags') 161 | return result 162 | 163 | def get_comments_by_video_id(query, sphere): 164 | if sphere == 'nl_right': 165 | path = config.PATH_NL 166 | if sphere == 'left': 167 | path = config.PATH_LEFT 168 | elif sphere == 'right': 169 | path = config.PATH_RIGHT 170 | else: 171 | print('sphere not found \n please try again') 172 | 173 | iter_csv = pd.read_csv(path + 'comments_' + sphere + '.csv', 174 | chunksize=1000000, 175 | sep='¶', 176 | quotechar='þ', 177 | engine='python') 178 | result = pd.concat([chunk[chunk['video_id'].isin(query)] for chunk in iter_csv]) 179 | result['sphere'] = sphere 180 | result.loc[:,('year')] = pd.to_datetime(result.loc[:,('comment_time')]).dt.to_period('Y') 181 | result = result[['video_id', 'comment_id', 'author_display_name', 'author_channel_id', 'comment_text', 'comment_time', 'year', 'sphere']] 182 | print('found ' + str(len(result)) + ' comments \n and ' + str(result.author_channel_id.nunique()) + ' unique commenters') 183 | return result 184 | 185 | def get_comments_by_author(query, sphere): 186 | if sphere == 'nl_right': 187 | path = config.PATH_NL 188 | if sphere == 'left': 189 | path = config.PATH_LEFT 190 | elif sphere == 'right': 191 | path = config.PATH_RIGHT 192 | else: 193 | print('sphere not found \n please try again') 194 | 195 | iter_csv = pd.read_csv(path + 'comments_' + sphere + '.csv', 196 | chunksize=1000000, 197 | sep='¶', 198 | quotechar='þ', 199 | engine='python') 200 | result = pd.concat([chunk[chunk['author_channel_id'].isin(query)] for chunk in iter_csv]) 201 | result['sphere'] = sphere 202 | result.loc[:,('year')] = pd.to_datetime(result.loc[:,('comment_time')]).dt.to_period('Y') 203 | result = result[['video_id', 'comment_id', 'author_display_name', 'author_channel_id', 'comment_text', 'comment_time', 'year', 'sphere']] 204 | print('found ' + str(len(result)) + ' comments \n and ' + str(result.author_channel_id.nunique()) + ' unique commenters') 205 | return result 206 | 207 | def get_comments_by_author_name(query, sphere): 208 | if sphere == 'nl_right': 209 | path = config.PATH_NL 210 | if sphere == 'left': 211 | path = config.PATH_LEFT 212 | elif sphere == 'right': 213 | path = config.PATH_RIGHT 
214 |     else:
215 |         raise ValueError('sphere not found, please try again')
216 | 
217 |     iter_csv = pd.read_csv(path + 'comments_' + sphere + '.csv',
218 |                            chunksize=1000000,
219 |                            sep='¶',
220 |                            quotechar='þ',
221 |                            engine='python')
222 |     result = pd.concat([chunk[chunk['author_display_name'].isin(query)] for chunk in iter_csv])
223 |     result['sphere'] = sphere
224 |     result.loc[:,('year')] = pd.to_datetime(result.loc[:,('comment_time')]).dt.to_period('Y')
225 |     result = result[['video_id', 'comment_id', 'author_display_name', 'author_channel_id', 'comment_text', 'comment_time', 'year', 'sphere']]
226 |     print('found ' + str(len(result)) + ' comments \n and ' + str(result.author_channel_id.nunique()) + ' unique commenters')
227 |     return result
228 | 
229 | def get_comments_by_topic(query, sphere):
230 |     if sphere == 'nl_right':
231 |         path = config.PATH_NL
232 |     elif sphere == 'left':
233 |         path = config.PATH_LEFT
234 |     elif sphere == 'right':
235 |         path = config.PATH_RIGHT
236 |     else:
237 |         raise ValueError('sphere not found, please try again')
238 | 
239 |     iter_csv = pd.read_csv(path + 'comments_' + sphere + '.csv',
240 |                            chunksize=1000000,
241 |                            sep='¶',
242 |                            quotechar='þ',
243 |                            engine='python')
244 |     result = pd.concat([chunk[chunk['comment_text'].astype(str).str.contains('{}'.format('|'.join([s for s in query])), na=False, case=False, regex=True)] for chunk in iter_csv])
245 |     result['sphere'] = sphere
246 |     result.loc[:,('year')] = pd.to_datetime(result.loc[:,('comment_time')]).dt.to_period('Y')
247 |     result = result[['video_id', 'comment_id', 'author_display_name', 'author_channel_id', 'comment_text', 'comment_time', 'year', 'sphere']]
248 |     print('found ' + str(len(result)) + ' comments \n and ' + str(result.author_channel_id.nunique()) + ' unique commenters')
249 |     return result
--------------------------------------------------------------------------------
/TopicModelling/TopicModelWrapper/main.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import time
4 | import datetime
5 | import string
6 | 
7 | # Gensim
8 | import gensim
9 | 
10 | # Plotting tools
11 | import pyLDAvis
12 | 
13 | # Wrappers
14 | from StreamingCorpus import StreamingCorpus
15 | from StreamingPreprocesser import StreamingPreprocesser
16 | from StreamingParser import StreamingParser
17 | 
18 | # Enable logging for gensim
19 | import logging
20 | import warnings
21 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)
22 | warnings.filterwarnings("ignore", category=DeprecationWarning)
23 | 
24 | def main(args):
25 |     # --------------------------------------------------
26 |     #
27 |     # Initialize parameters
28 |     #
29 |     # --------------------------------------------------
30 |     root = os.path.dirname(os.path.realpath(__file__))
31 | 
32 |     input_file = '{}/{}'.format(root, args.input)
33 |     # input_file = os.path.dirname(os.path.realpath(__file__)) + "/temp.json" # temp file for testing
34 | 
35 |     # Prepare stopwords and extend if applicable
36 |     stopwords_path = '{}/{}'.format(root, args.stopwords_file)
37 |     stopwords = open(stopwords_path, 'r').read().split('\n')
38 | 
39 |     # Add 'stopwords' manually; TODO: substitute with spacy lemmatiser
40 |     stopwords.extend(['know', 'think', 'like', 'thats', 'well', 'dont',
41 |                       'get', 'actually', 'would', 'say', 'yeah', 'want', 'going',
42 |                       'said', 'speech', 'theres', 'way', 'could', 'see', 'something',
43 |                       'people', 'really', 'okay', 'gonna', 'ive', 'mean', 'right',
44 |                       'got', 'thing', 'one',
'theyre', 'stuff', 'kind', 'lot', 45 | 'good', 'lot', 'things', 'saying', 'hes', 'even', 'much', 46 | 'guy', 'whatever', 'back', 'everything', 'life', 'love', 47 | 'guys', 'great', 'time', 'video', 'sort', 'cant', 'maybe', 48 | 'point', 'lets', 'take', 'talk', 'probably', 'might', 'put', 49 | 'years', 'new', 'two', 'need', 'yes', 'left', 'look', 'talking', 50 | 'anything', 'guess', 'make', 'interesting', 'someone', 'obviously', 51 | 'ill', 'still', 'also', 'whats', 'find', 'certain', 'course', 52 | 'weve', 'part', 'first', 'done', 'many', 'around', 'never', 53 | 'show', 'went', 'little', 'ever', 'big', 'look', 'give', 54 | 'last']) 55 | 56 | # 57 | # dict_min = 4 58 | # dict_max = 0.6 59 | 60 | topic_num = args.topic_num 61 | model_name = args.model_name 62 | model_path = "{}/models/{}_{}".format(root, model_name, topic_num) 63 | if not os.path.isdir(model_path): 64 | print('Model directory not found, creating directory: {}'.format(model_path)) 65 | os.mkdir(model_path) 66 | 67 | # Simple preprocesser 68 | parser = StreamingParser(input_file, 3, metadata=True) 69 | preprocessor = StreamingPreprocesser(stopwords=stopwords) 70 | 71 | corpus = StreamingCorpus(path=input_file, 72 | parse_strategy=parser, 73 | clean_strategy=preprocessor, 74 | dictionary=None, 75 | metadata=True) 76 | dictionary = corpus.get_dictionary() 77 | 78 | gensim.corpora.MmCorpus.serialize(os.path.join( 79 | model_path, '{}.mm'.format(model_name)), corpus, metadata=True) 80 | corpus = gensim.corpora.MmCorpus(os.path.join(model_path, '{}.mm'.format(model_name))) 81 | 82 | # dictionary.filter_extremes(dict_min, dict_max_relative) 83 | dictionary.save(os.path.join(model_path, '{}.dict'.format(model_name))) 84 | 85 | # -------------------------------------------------- 86 | # 87 | # LDA model training and serialization 88 | # 89 | # -------------------------------------------------- 90 | 91 | t1 = time.time() 92 | print('Starting generation of LDA model') 93 | 94 | lda = gensim.models.LdaMulticore(corpus=corpus, 95 | id2word=dictionary, 96 | num_topics=topic_num, 97 | random_state=100, 98 | # update_every=1, 99 | chunksize=100, 100 | passes=10, 101 | # alpha='auto', 102 | per_word_topics=True) 103 | lda.save('{}/{}.lda'.format(model_path, model_name)) 104 | 105 | t2 = time.time() 106 | print('LDA model generation successful! Time elapsed: {}\n'.format(t2 - t1)) 107 | 108 | # -------------------------------------------------- 109 | # 110 | # Visualisation with pyLDAvis 111 | # 112 | # -------------------------------------------------- 113 | 114 | # t1 = time.time() 115 | # print('Starting preparation of LDAvis visualisation') 116 | # 117 | # # Load gensim data to prepare for visualization 118 | # prepared_data = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False) 119 | # # Save visualisation to HTML file 120 | # pyLDAvis.save_html(prepared_data, os.path.join(model_path, '{}_LDAvis.html'.format(model_name))) 121 | # 122 | # t2 = time.time() 123 | # print('LDAvis visualisation successful! Time elapsed: {}\n'.format(t2 - t1)) 124 | 125 | # -------------------------------------------------- 126 | # 127 | # Compute model perplexity and coherence score 128 | # 129 | # -------------------------------------------------- 130 | 131 | t1 = time.time() 132 | print('\nStarting computation of perplexity score') 133 | 134 | perplexity_score = lda.log_perplexity(corpus) 135 | # A measure of how good the model generalises. Lower is better. 
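    # Note on the figure printed below: gensim's log_perplexity() returns the per-word
    # likelihood bound (log base 2), not the perplexity value itself. A minimal sketch of
    # the usual conversion, assuming gensim's own convention of perplexity = 2^(-bound)
    # (the variable name real_perplexity is illustrative only):
    #
    #     real_perplexity = 2 ** (-perplexity_score)   # e.g. a bound of -8.5 gives roughly 362
    #
    # Either way, a lower perplexity indicates better generalisation to held-out documents.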
136 | print('Perplexity: ', perplexity_score) 137 | 138 | t2 = time.time() 139 | print('Perplexitiy computed successfully! Time elapsed: {}\n'.format(t2 - t1)) 140 | 141 | t1 = time.time() 142 | print('\nStarting computation of coherence score') 143 | 144 | coherence_model_lda = gensim.models.CoherenceModel( 145 | model=lda, corpus=corpus, dictionary=dictionary, coherence='u_mass') 146 | coherence_lda = coherence_model_lda.get_coherence() 147 | print('Coherence Score: ', coherence_lda) 148 | 149 | t2 = time.time() 150 | print('Coherence score computed successfully! Time elapsed: {}\n'.format(t2 - t1)) 151 | 152 | # -------------------------------------------------- 153 | # 154 | # Saving parameters and scores to file 155 | # 156 | # -------------------------------------------------- 157 | 158 | print('Writing settings and results to file...') 159 | with open(os.path.join(model_path, '{}_parameters.txt'.format(model_name)), 'w') as file: 160 | file.write('Model name: {}\n date: {}\n'.format(model_name, datetime.datetime.now())) 161 | 162 | file.write('Corpus statistics:\n'.format()) 163 | file.write('\tNon-empty entries: {}\n'.format(len(corpus))) 164 | 165 | file.write('Model parameters: \n') 166 | file.write('\tNumber of topics: {}\n'.format(topic_num)) 167 | # file.write('\tDictionary min: {}\n'.format(dict_min)) 168 | # file.write('\tDictionary max (relative): {}\n'.format(dict_max_relative)) 169 | 170 | file.write('Model scores:\n') 171 | file.write('\tPerplexity score = {}\n'.format(perplexity_score)) 172 | file.write('\tCoherence score = {}\n'.format(coherence_lda)) 173 | file.write(''.format()) 174 | print('Done!') 175 | 176 | # Ngram models ------------------------------------------ 177 | 178 | # bigram_phrases = gensim.models.Phrases(data_tokens, min_count=5, threshold=100) 179 | # trigram_phrases = gensim.models.Phrases(bigram_phrases[data_tokens], threshold=100) 180 | # 181 | # bigram_model = gensim.models.phrases.Phraser(bigram_phrases) 182 | # trigram_model = gensim.models.phrases.Phraser(trigram_phrases) 183 | # 184 | # print(trigram_model[bigram_model[data_tokens[0]]]) 185 | 186 | # def make_bigrams(documents): 187 | # return [bigram_model[document] for document in documents] 188 | # 189 | # def make_trigrams(documents): 190 | # return [trigram_model[bigram_model[document]] for document in documents] 191 | # 192 | # t1 = time.time() 193 | # data_words_bigrams = make_bigrams(data_words_nostops) 194 | # t2 = time.time() 195 | # print('Bigrams created successfully! 
Time elapsed: {}'.format(t2 - t1)) 196 | 197 | # Build MALLET LDA model and test coherence scores ------------------------------------------ 198 | 199 | # mallet_path = 'path/to/mallet-2.0.8/bin/mallet' # update this path 200 | # ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=20, id2word=id2word) 201 | # 202 | # # Show Topics 203 | # pprint(ldamallet.show_topics(formatted=False)) 204 | # 205 | # # Compute Coherence Score 206 | # coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=data_words_nostop, dictionary=id2word, 207 | # coherence='c_v') 208 | # coherence_ldamallet = coherence_model_ldamallet.get_coherence() 209 | # print('\nCoherence Score: ', coherence_ldamallet) 210 | 211 | # Try different number of topics (k) and compare scores ------------------------------------------ 212 | 213 | # Find dominant topic for each document ------------------------------------------ 214 | 215 | # Find most representative document for each topic ------------------------------------------ 216 | # Topic inference methods? 217 | 218 | 219 | def parse_arguments(): 220 | parser = argparse.ArgumentParser(description=""" 221 | Wrapper for streaming topic model implementation by Gensim. 222 | TODO: make config.ini 223 | """) 224 | ##### Positional arguments ##### 225 | parser.add_argument("input", type=str, default="temp.json", 226 | help="File or directory containing the data to be processed. ") 227 | # parser.add_argument("dictionary", type=str) 228 | # parser.add_argument("output_file", type=str, help="Optional. WIP") 229 | 230 | ##### Preprocessing parameters ##### 231 | preproccesing_parameters = parser.add_argument_group('preprocessing parameters') 232 | preproccesing_parameters.add_argument("stopwords_file", type=str, 233 | default="stopwords.txt", 234 | help="Path to file containing stopwords to be removed") 235 | preproccesing_parameters.add_argument("-m", "--term_min_freq", type=int, 236 | help="remove all terms with specified frequency (or lower)") 237 | preproccesing_parameters.add_argument("-M", "--term_max_freq", type=int, 238 | help="remove all terms with specified frequency (or larger)") 239 | 240 | ##### Topic modeling parameters ##### 241 | topicmodel_parameters = parser.add_argument_group('topic modeling parameters') 242 | topicmodel_parameters.add_argument("model_name", type=str, 243 | help="The name of the model. I.e. the dataset name.") 244 | topicmodel_parameters.add_argument("topic_num", type=int, 245 | help="The name of the model. I.e. 
the dataset name.") 246 | 247 | return parser.parse_args() 248 | 249 | if __name__ == '__main__': 250 | main(parse_arguments()) 251 | -------------------------------------------------------------------------------- /RabbitHole/youtube-onderzoek-jan.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Guillaume Chaslot' 2 | 3 | """ 4 | This scripts starts from a search query on youtube and: 5 | 1) gets the N first search results 6 | 2) follows the first M recommendations 7 | 3) repeats step (2) P times 8 | 4) stores the results in a json file 9 | """ 10 | 11 | import urllib2 12 | import requests 13 | import cookielib 14 | from cookielib import MozillaCookieJar 15 | import re 16 | import json 17 | import sys 18 | import time 19 | import ssl 20 | import os 21 | import easygui 22 | import shutil 23 | 24 | 25 | from bs4 import BeautifulSoup 26 | 27 | RECOMMENDATIONS_PER_VIDEO = 1 28 | RESULTS_PER_SEARCH = 1 29 | 30 | # NUMBER OF MIN LIKES ON A VIDEO TO COMPUTE A LIKE RATIO 31 | MIN_LIKES_FOR_LIKE_RATIO = 5 32 | 33 | 34 | # Google session class by stackoverflow user alexislg 35 | class SessionGoogle: 36 | def __init__(self): 37 | self.ses = requests.session() 38 | self.ses.cookies = MozillaCookieJar('cookie.txt') 39 | self.user = None 40 | if not os.path.exists('cookie.txt'): 41 | if easygui.ynbox('Het bestand *cookie.txt* is niet gevonden. Wil je inloggen bij Google?', 'youtube-folower', ('Ja', 'Nee')): 42 | self.user = self.login() 43 | else: 44 | with open('cookie.txt', 'r') as cf: 45 | self.rawcookie = cf.read() 46 | if (self.rawcookie == ''): 47 | if easygui.ynbox('Het bestand *cookie.txt* is leeg. Wil je inloggen bij Google?', 'youtube-folower', ('Ja', 'Nee')): 48 | self.user = self.login() 49 | else: 50 | self.ses.cookies.load(ignore_discard = True, ignore_expires = True) 51 | self.user = self.check_login() 52 | if self.user == None: 53 | if easygui.ynbox('Het bestand *cookie.txt* is gevonden, maar geeft geen ingelogde gebruiker. Wil je inloggen bij Google?', 'youtube-folower', ('Ja', 'Nee')): 54 | self.user = self.login() 55 | 56 | if self.user == None: 57 | print 'Nog logged in. Continuing anonymously.' 58 | else: 59 | print 'Logged in as: %s.' 
% self.user 60 | 61 | self.save_cookie() 62 | 63 | def get(self, URL): 64 | return self.ses.get(URL).content 65 | 66 | def save_cookie(self): 67 | cj = self.ses.cookies 68 | cj.save(ignore_discard = True, ignore_expires = True) 69 | 70 | def login(self): 71 | while True: 72 | msg = "Vul de gegevens van je Google-account in:" 73 | title = "youtube-follower" 74 | fieldNames = ["Gebruikersnaam","Wachtwoord"] 75 | fieldValues = [] # we start with blanks for the values 76 | fieldValues = easygui.multpasswordbox(msg, title, fieldNames) 77 | 78 | while True: 79 | if fieldValues == None: break 80 | errmsg = "" 81 | for i in range(len(fieldNames)): 82 | if fieldValues[i].strip() == "": 83 | errmsg = errmsg + ('"%s" niet ingevuld.\n\n' % fieldNames[i]) 84 | if errmsg == "": break # no problems found 85 | fieldValues = easygui.multpasswordbox(errmsg, title, fieldNames, fieldValues) 86 | 87 | login_html = self.ses.get('https://accounts.google.com/ServiceLogin') 88 | soup_login = BeautifulSoup(login_html.content).find('form').find_all('input') 89 | my_dict = {} 90 | for u in soup_login: 91 | if u.has_attr('value'): 92 | my_dict[u['name']] = u['value'] 93 | # override the inputs without login and pwd: 94 | print my_dict 95 | my_dict['Email'] = fieldValues[0] 96 | my_dict['Passwd'] = fieldValues[1] 97 | self.ses.post('https://accounts.google.com/ServiceLoginAuth', data=my_dict) 98 | 99 | m = self.check_login() 100 | if m == None: 101 | if easygui.ynbox('Inloggen mislukt. Wil je het opnieuw proberen?', 'youtube-folower', ('Ja', 'Nee')): 102 | continue 103 | else: 104 | return m 105 | else: 106 | return m 107 | 108 | def check_login(self): 109 | r = self.get('https://accounts.google.com/ServiceLogin') 110 | m = re.search(r'Google Account: (.*?) \&\#10', r) 111 | if m == None: 112 | return m 113 | else: 114 | return m.group(1) 115 | 116 | class YoutubeFollower(): 117 | def __init__(self, session, verbose=False, name='', alltime=True, gl=None, language=None, recent=False, loopok=True): 118 | # Name 119 | self._name = name 120 | self._alltime = alltime 121 | self._verbose = verbose 122 | 123 | # Dict video_id to {'likes': , 124 | # 'dislikes': , 125 | # 'views': , 126 | # 'recommendations': []} 127 | self._video_infos = {} # self.try_to_load_video_infos() 128 | 129 | # Dict search terms to [video_ids] 130 | self._search_infos = {} 131 | self._gl = gl 132 | self._language = language 133 | self._recent=recent 134 | self._loopok=loopok 135 | self._session=session 136 | 137 | print ('Location = ' + repr(self._gl) + ' Language = ' + repr(self._language)) 138 | 139 | def clean_count(self, text_count): 140 | # Ignore non ascii 141 | ascii_count = text_count.encode('ascii', 'ignore') 142 | # Ignore non numbers 143 | p = re.compile('[\d,]+') 144 | return int(p.findall(ascii_count)[0].replace(',', '')) 145 | 146 | def get_search_results(self, search_terms, max_results, top_rated=False): 147 | assert max_results < 20, 'max_results was not implemented to be > 20' 148 | 149 | if self._verbose: 150 | print ('Searching for {}'.format(search_terms)) 151 | 152 | # Trying to get results from cache 153 | if search_terms in self._search_infos and len(self._search_infos[search_terms]) >= max_results: 154 | return self._search_infos[search_terms][0:max_results] 155 | 156 | # Escaping search terms for youtube 157 | escaped_search_terms = urllib2.quote(search_terms.encode('utf-8')) 158 | 159 | # We only want search results that are videos, filtered by viewcoung. 
160 | # This is achieved by using the youtube URI parameter: sp=CAMSAhAB 161 | if self._alltime: 162 | filter = "CAMSAhAB" 163 | else: 164 | if top_rated: 165 | filter = "CAE%253D" 166 | else: 167 | filter = "EgIQAQ%253D%253D" 168 | 169 | url = "https://www.youtube.com/results?sp=" + filter + "&q=" + escaped_search_terms 170 | if self._gl: 171 | url = url + '&gl=' + self._gl 172 | 173 | print ('Searching URL: ' + url) 174 | 175 | html = self._session.get(url) 176 | soup = BeautifulSoup(html, "html.parser") 177 | 178 | videos = [] 179 | for item_section in soup.findAll('div', {'class': 'yt-lockup-dismissable'}): 180 | video = item_section.contents[0].contents[0]['href'].split('=')[1] 181 | videos.append(video) 182 | 183 | self._search_infos[search_terms] = videos 184 | return videos[0:max_results] 185 | 186 | def get_recommendations(self, video_id, nb_recos_wanted, depth, key): 187 | if video_id in self._video_infos: 188 | # Updating the depth if this video was seen. 189 | #self._video_infos[video_id]['depth'] = min(self._video_infos[video_id]['depth'], depth) 190 | #print ('a video was seen at a lower depth') 191 | 192 | video = self._video_infos[video_id] 193 | recos_returned = [] 194 | for reco in video['recommendations']: 195 | # This line avoids to loop around the same videos: 196 | if reco not in self._video_infos or self._loopok: 197 | recos_returned.append(reco) 198 | if len(recos_returned) >= nb_recos_wanted: 199 | break 200 | if self._loopok: 201 | video['key'].append(key) 202 | print ('\n Following recommendations ' + repr(recos_returned) + '\n') 203 | return recos_returned 204 | 205 | url = "https://www.youtube.com/watch?v=" + video_id 206 | 207 | while True: 208 | try: 209 | html = urllib2.urlopen(url) 210 | break 211 | except urllib2.URLError: 212 | print 'error' 213 | time.sleep(1) 214 | soup = BeautifulSoup(html, "html.parser") 215 | 216 | # Views 217 | views = -1 218 | for watch_count in soup.findAll('div', {'class': 'watch-view-count'}): 219 | try: 220 | views = self.clean_count(watch_count.contents[0]) 221 | except IndexError: 222 | pass 223 | 224 | # Likes 225 | likes = -1 226 | for like_count in soup.findAll('button', {'class': 'like-button-renderer-like-button'}): 227 | try: 228 | likes = self.clean_count(like_count.contents[0].text) 229 | except IndexError: 230 | pass 231 | 232 | # Dislikes 233 | dislikes = -1 234 | for like_count in soup.findAll('button', {'class': 'like-button-renderer-dislike-button'}): 235 | try: 236 | dislikes = self.clean_count(like_count.contents[0].text) 237 | except IndexError: 238 | pass 239 | 240 | # Channel 241 | channel = '' 242 | for item_section in soup.findAll('a', {'class': 'yt-uix-sessionlink'}): 243 | if item_section['href'] and '/channel/' in item_section['href'] and item_section.contents[0] != '\n': 244 | channel = item_section.contents[0] 245 | channel_id = item_section['href'].split('/channel/')[1] 246 | break 247 | 248 | if channel == '': 249 | print ('WARNING: CHANNEL not found') 250 | 251 | # Recommendations 252 | recos = [] 253 | upnext = True 254 | for video_list in soup.findAll('ul', {'class': 'video-list'}): 255 | if upnext: 256 | # Up Next recommendation 257 | try: 258 | recos.append(video_list.contents[1].contents[1].contents[1]['href'].replace('/watch?v=', '')) 259 | except IndexError: 260 | print ('WARNING Could not get a up next recommendation because of malformed content') 261 | pass 262 | upnext = False 263 | else: 264 | # 19 Others 265 | for i in range(1, 19): 266 | try: 267 | 
recos.append(video_list.contents[i].contents[1].contents[1]['href'].replace('/watch?v=', '')) 268 | except IndexError: 269 | if self._verbose: 270 | print ('Could not get a recommendation because there are not enough') 271 | except AttributeError: 272 | if self._verbose: 273 | print ('WARNING Could not get a recommendation because of malformed content') 274 | 275 | title = '' 276 | for eow_title in soup.findAll('span', {'id': 'eow-title'}): 277 | title = eow_title.text.strip() 278 | 279 | if title == '': 280 | print ('WARNING: title not found') 281 | 282 | if video_id not in self._video_infos: 283 | self._video_infos[video_id] = {'views': views, 284 | 'likes': likes, 285 | 'dislikes': dislikes, 286 | 'recommendations': recos, 287 | 'title': title, 288 | 'depth': depth, 289 | 'id': video_id, 290 | 'channel': channel, 291 | 'key': []} 292 | if self._loopok: 293 | self._video_infos[video_id]['key'].append(key) 294 | 295 | video = self._video_infos[video_id] 296 | print (repr(video_id + ': ' + video['title'] + ' [' + channel + ']{' + repr(key) +'} ' + str(video['views']) + ' views , depth: ' + str(video['depth']))) 297 | # print (repr(video['recommendations'])) 298 | return recos[:nb_recos_wanted] 299 | 300 | def get_n_recommendations(self, seed, branching, depth, key): 301 | if depth is 0: 302 | return [seed] 303 | current_video = seed 304 | all_recos = [seed] 305 | index = 0 306 | for video in self.get_recommendations(current_video, branching, depth, key): 307 | code = chr(index + 97) 308 | all_recos.extend(self.get_n_recommendations(video, branching, depth - 1, key + code)) 309 | index = index + 1 310 | return all_recos 311 | 312 | def compute_all_recommendations_from_search(self, search_terms, search_results, branching, depth): 313 | search_results = self.get_search_results(search_terms, search_results) 314 | print ('Search results ' + repr(search_results)) 315 | 316 | all_recos = [] 317 | ind = 0 318 | for video in search_results: 319 | ind += 1 320 | all_recos.extend(self.get_n_recommendations(video, branching, depth, str(ind))) 321 | print ('\n\n\nNext search: ') 322 | all_recos.extend(search_results) 323 | return all_recos 324 | 325 | def count(self, iterator): 326 | counts = {} 327 | for video in iterator: 328 | counts[video] = counts.get(video, 0) + 1 329 | return counts 330 | 331 | def go_deeper_from(self, search_term, search_results, branching, depth): 332 | all_recos = self.compute_all_recommendations_from_search(search_term, search_results, branching, depth) 333 | counts = self.count(all_recos) 334 | print ('\n\n\nSearch term = ' + search_term + '\n') 335 | print ('counts: ' + repr(counts)) 336 | sorted_videos = sorted(counts, key=counts.get, reverse=True) 337 | return sorted_videos, counts 338 | 339 | def save_video_infos(self, keyword): 340 | print ('Wrote file:') 341 | date = time.strftime('%Y%m%d') 342 | with open('data/video-infos-' + keyword + '-' + date + '.json', 'w') as fp: 343 | json.dump(self._video_infos, fp) 344 | 345 | def try_to_load_video_infos(self): 346 | try: 347 | with open('data/video-infos-' + self._name + '.json', 'r') as fp: 348 | return json.load(fp) 349 | except Exception as e: 350 | print ('Failed to load from graph ' + repr(e)) 351 | return {} 352 | 353 | def count_recommendation_links(self): 354 | counts = {} 355 | for video_id in self._video_infos: 356 | for reco in self._video_infos[video_id]['recommendations']: 357 | counts[reco] = counts.get(reco, 0) + 1 358 | return counts 359 | 360 | def like_ratio_is_computed(self, video): 361 | return 
int(video['likes']) > MIN_LIKES_FOR_LIKE_RATIO
362 | 
363 |     def print_graph(self, links_per_video, only_mature_videos=True):
364 |         """
365 |         Prints a file with a graph containing all videos.
366 |         """
367 |         input_links_counts = self.count_recommendation_links()
368 |         graph = {}
369 |         nodes = []
370 |         links = []
371 |         for video_id in self._video_infos:
372 |             video = self._video_infos[video_id]
373 |             if self.like_ratio_is_computed(video):
374 |                 popularity = video['likes'] / float(video['likes'] + video['dislikes'])
375 |             else:
376 |                 popularity = 0
377 | 
378 |             nodes.append({'id': video_id, 'size': input_links_counts.get(video_id, 0), 'popularity': popularity, 'type': 'circle', 'likes': video['likes'], 'dislikes': video['dislikes'], 'views': video['views'], 'depth': video['depth']})
379 |             link = 0
380 |             for reco in self._video_infos[video_id]['recommendations']:
381 |                 if reco in self._video_infos:
382 |                     links.append({'source': video_id, 'target': reco, 'value': 1})
383 |                     link += 1
384 |                 if link >= links_per_video:
385 |                     break
386 |         graph['nodes'] = nodes
387 |         graph['links'] = links
388 |         with open('./graph-' + self._name + '.json', 'w') as fp:
389 |             json.dump(graph, fp)
390 |         date = time.strftime('%Y-%m-%d')
391 |         with open('./graph-' + self._name + '-' + date + '.json', 'w') as fp:
392 |             json.dump(graph, fp)
393 |         print ('Wrote graph as: ' + './graph-' + self._name + '-' + date + '.json')
394 | 
395 | 
396 |     def print_videos(self, videos, counts, max_length):
397 |         idx = 1
398 |         for video in videos[:max_length]:
399 |             try:
400 |                 current_title = self._video_infos[video]['title']
401 |                 print (str(idx) + ') Recommended ' + str(counts[video]) + ' times: '
402 |                        ' https://www.youtube.com/watch?v=' + video + ' , Title: ' + repr(current_title))
403 |                 if idx % 20 == 0:
404 |                     print ('')
405 |                 idx += 1
406 |             except KeyError:
407 |                 pass
408 | 
409 |     def get_top_videos(self, videos, counts, max_length_count):
410 |         video_infos = []
411 |         for video in videos:
412 |             try:
413 |                 video_infos.append(self._video_infos[video])
414 |                 video_infos[-1]['nb_recommendations'] = counts[video]
415 |             except KeyError:
416 |                 pass
417 | 
418 |         # Computing the average number of recommendations per video:
419 |         # The average is computed only over the top videos, so it is an underestimation of the actual average.
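        # A worked example with hypothetical counts (not taken from the data): if the top
        # videos were recommended 2, 4 and 6 times, avg below works out to 4.0 and their
        # 'mult' values become 0.5, 1.0 and 1.5, i.e. how many times more (or less) often
        # each video was recommended than the average top video.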
420 | if video_infos is []: 421 | return [] 422 | sum_recos = 0 423 | for video in video_infos: 424 | sum_recos += video['nb_recommendations'] 425 | avg = sum_recos / float(len(video_infos)) 426 | for video in video_infos: 427 | video['mult'] = video['nb_recommendations'] / avg 428 | return video_infos[:max_length_count] 429 | 430 | def compare_keywords(session, query, search_results, branching, depth, name, gl, language, recent, loopok): 431 | date = time.strftime('%Y-%m-%d') 432 | file_name = 'results/' + name + '-' + date + '.json' 433 | print ('Running, will save the resulting json to:' + file_name) 434 | top_videos = {} 435 | for keyword in query.split(','): 436 | yf = YoutubeFollower(session, verbose=True, name=keyword, alltime=False, gl=gl, language=language, recent=recent, loopok=loopok) 437 | top_recommended, counts = yf.go_deeper_from(keyword, 438 | search_results=search_results, 439 | branching=branching, 440 | depth=depth) 441 | top_videos[keyword] = yf.get_top_videos(top_recommended, counts, 1000) 442 | yf.print_videos(top_recommended, counts, 50) 443 | yf.save_video_infos(name + '-' + keyword) 444 | 445 | with open(file_name, 'w') as fp: 446 | json.dump(top_videos, fp) 447 | 448 | def main(): 449 | query = 'jared taylor' 450 | name = "taylor" 451 | searches = 2 452 | branch = 2 453 | depth = 10 454 | 455 | #Create (authenticated) Google session and log in to YouTube 456 | session = SessionGoogle() 457 | session.get('https://accounts.google.com/ServiceLogin?continue=https://youtube.com&service=youtube') 458 | 459 | 460 | #file = codecs.open("test.txt","w+", encoding = 'utf-8') 461 | #file.write('Met login testin\'') 462 | #file.write(session.get('https://accounts.google.com/ServiceLogin?continue=https://youtube.com&service=youtube')) 463 | 464 | 465 | #Seasygui.msgbox("Dank je wel voor meewerken aan dit onderzoek! De app zal een ongeveer een half uur nodig hebben om alle resultaten te verzamelen. Als we klaar zijn krijg je nogmaals een venster als deze te zien en staan de resultaten op dezelfde plek als waar je dit script hebt bewaard en aangeklikt.", title="DeCorrespondent Youtube Onderzoek") 466 | 467 | #os.chdir('') 468 | if not os.path.exists('results'): 469 | os.mkdir('results') 470 | print 'made results' 471 | if not os.path.exists('data'): 472 | os.mkdir('data') 473 | 474 | compare_keywords(session, query, searches, branch, depth, name, 'NL', 'NL', False, False) 475 | 476 | if os.path.exists('data'): 477 | shutil.rmtree('data', ignore_errors=True) 478 | 479 | file_name = name + '-' + time.strftime('%Y-%m-%d') + '.json' 480 | 481 | if os.path.exists('results/'+file_name): 482 | shutil.copyfile('results/'+file_name, file_name) 483 | shutil.rmtree('results') 484 | 485 | #easygui.msgbox("We zijn klaar. Dank je wel :)", title="DeCorrespondent Youtube Onderzoek") 486 | 487 | return 0 488 | 489 | if __name__ == "__main__": 490 | sys.exit(main()) 491 | --------------------------------------------------------------------------------
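A note on crawl size for the script above: get_n_recommendations() expands every seed video into a full b-ary tree of recommendations, so the number of watch pages fetched grows geometrically with depth. The sketch below is illustrative only and not part of the repository (crawl_size is a made-up helper name); it works out the arithmetic for the defaults hard-coded in main() (searches = 2, branch = 2, depth = 10), ignoring the caching that kicks in when the same video is reached more than once.

# Rough size of the recommendation crawl driven by get_n_recommendations():
# per seed video it visits 1 + b + b^2 + ... + b^d watch pages.
def crawl_size(branching, depth, search_results=1):
    per_seed = sum(branching ** level for level in range(depth + 1))
    return per_seed * search_results

# Defaults from main(): branch = 2, depth = 10, searches = 2
# -> (2 ** 11 - 1) * 2 = 2047 * 2 = 4094 watch-page requests per keyword (before caching).
print(crawl_size(branching=2, depth=10, search_results=2))  # 4094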