├── .gitattributes
├── .gitignore
├── CONTRIBUTORS.md
├── LICENSE
├── README.md
├── api
│ ├── README.md
│ ├── english
│ │ ├── README.md
│ │ ├── __init__.py
│ │ ├── db
│ │ │ ├── config.py
│ │ │ └── mongoqueries.py
│ │ ├── endpoints
│ │ │ └── outlet_stats.py
│ │ ├── gunicorn_conf.py
│ │ ├── logging.conf
│ │ ├── main.py
│ │ ├── schemas
│ │ │ ├── stats_by_date.py
│ │ │ └── stats_weekly.py
│ │ ├── tests
│ │ │ ├── __init__.py
│ │ │ ├── test_mock_outlet_stats.py
│ │ │ └── test_outlet_stats.py
│ │ └── utils
│ │   ├── dateutils.py
│ │   └── logger.py
│ ├── french
│ │ ├── README.md
│ │ ├── __init__.py
│ │ ├── db
│ │ │ ├── config.py
│ │ │ └── mongoqueries.py
│ │ ├── endpoints
│ │ │ └── outlet_stats.py
│ │ ├── gunicorn_conf.py
│ │ ├── logging.conf
│ │ ├── main.py
│ │ ├── schemas
│ │ │ ├── stats_by_date.py
│ │ │ └── stats_weekly.py
│ │ ├── tests
│ │ │ ├── __init__.py
│ │ │ ├── test_mock_outlet_stats.py
│ │ │ └── test_outlet_stats.py
│ │ └── utils
│ │   ├── dateutils.py
│ │   └── logger.py
│ └── requirements.txt
├── nlp
│ ├── english
│ │ ├── README.md
│ │ ├── config.py
│ │ ├── entity_gender_annotator.py
│ │ ├── evaluation
│ │ │ ├── README.md
│ │ │ └── src
│ │ │   ├── README.md
│ │ │   ├── evaluate.py
│ │ │   ├── evaluate_quotes.py
│ │ │   └── run_predictions.py
│ │ ├── gender_predictor.py
│ │ ├── img
│ │ │ └── concurrent.png
│ │ ├── merge_collections.py
│ │ ├── quote_extractor.py
│ │ ├── requirements.txt
│ │ ├── rules
│ │ │ ├── README.md
│ │ │ ├── author_blocklist.txt
│ │ │ ├── name_patterns.jsonl
│ │ │ └── quote_verb_list.txt
│ │ ├── topic_model
│ │ │ ├── .gitignore
│ │ │ ├── README.md
│ │ │ ├── config.py
│ │ │ ├── corpus_analysis
│ │ │ │ ├── README.md
│ │ │ │ ├── analyze.py
│ │ │ │ ├── config.py
│ │ │ │ ├── download_articles.py
│ │ │ │ ├── requirements.txt
│ │ │ │ ├── spacyLemmas
│ │ │ │ │ └── spacy_english_lemmas.txt
│ │ │ │ └── test_corpus_functions.py
│ │ │ ├── img
│ │ │ │ ├── example_divergent_heatmap.png
│ │ │ │ ├── example_heatmap.png
│ │ │ │ └── example_wordcloud.png
│ │ │ ├── preproc.py
│ │ │ ├── preproc_cc.py
│ │ │ ├── requirements.txt
│ │ │ ├── spacyLemmas
│ │ │ │ ├── README.md
│ │ │ │ ├── convert_spacy_lemmas.py
│ │ │ │ └── spacy_english_lemmas.txt
│ │ │ ├── stopwords
│ │ │ │ ├── README.md
│ │ │ │ ├── create_stopword_list.py
│ │ │ │ ├── nltk_curated.txt
│ │ │ │ └── stopwords.txt
│ │ │ ├── train.py
│ │ │ ├── train_cc.py
│ │ │ └── vis.py
│ │ └── utils.py
│ └── french
│   ├── README.md
│   ├── config.py
│   ├── data_statistics.py
│   ├── entity_gender_annotator.py
│   ├── entity_merger.py
│   ├── evaluation
│   │ ├── README.md
│   │ └── src
│   │   ├── README.md
│   │   ├── evaluate.py
│   │   ├── evaluate_quotes.py
│   │   └── run_predictions.py
│   ├── gender_predictor.py
│   ├── merge_collections.py
│   ├── quote_extractor.py
│   ├── quote_highlighter.py
│   ├── quote_merger.py
│   ├── requirements.txt
│   ├── rules
│   │ ├── author_blocklist.txt
│   │ ├── name_patterns.jsonl
│   │ └── quote_verb_list.txt
│   └── utils.py
├── research_dashboard
│ ├── README.md
│ ├── admin
│ │ ├── apps
│ │ │ ├── topiclabels.py
│ │ │ ├── topsources.py
│ │ │ ├── unknownsources.py
│ │ │ └── updatecache.py
│ │ ├── assets
│ │ │ └── style.css
│ │ ├── auth.py
│ │ ├── config.py
│ │ ├── run.py
│ │ ├── server.py
│ │ └── static
│ │   ├── SFULogo.png
│ │   └── discourse-lab-logo.jpeg
│ ├── aliases.txt
│ ├── apps
│ │ ├── __init__.py
│ │ ├── articlecounts.py
│ │ ├── dailywomenenglish.py
│ │ ├── textanalyzer.py
│ │ ├── topicmodel.py
│ │ ├── topsources.py
│ │ └── topsourcetrends.py
│ ├── assets
│ │ ├── favicon.ico
│ │ └── style.css
│ ├── config.py
│ ├── run.py
│ ├── server.py
│ └── static
│   ├── GGT_topic_model_technical_report.pdf
│   ├── SFULogo.png
│   ├── discourse-lab-logo.jpeg
│   ├── sfu_discourse_thumbnail.png
│   ├── topic-pipeline-flowchart-1.png
│   └── topic-pipeline-flowchart-2.png
├── scraper
│ ├── README.md
│ ├── config.py
│ ├── mediaCollectors.py
│ ├── requirements.txt
│ └── util.py
└── statistics
  ├── README.md
  ├── config.py
  ├── daily_pipeline
  │ ├── README.md
  │ ├── config.py
  │ ├── daily_article_counts.py
  │ └── media_daily.py
  ├── monthly_pipeline
  │ ├── README.md
  │ ├── config.py
  │ ├── monthly_top_sources.py
  │ └── monthly_top_sources_timeseries.py
  ├── queries.py
  ├── requirements.txt
  └── run.py
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.ipynb linguist-vendored
2 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 |
75 | # pyenv
76 | .python-version
77 |
78 | # celery beat schedule file
79 | celerybeat-schedule
80 |
81 | # SageMath parsed files
82 | *.sage.py
83 |
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 |
97 | # Rope project settings
98 | .ropeproject
99 |
100 | # mkdocs documentation
101 | /site
102 |
103 | # mypy
104 | .mypy_cache/
105 |
106 | # Custom
107 | *.DS_Store
108 | logs
109 |
110 | # Generated Data
111 | NLP/experiments/MDS_Capstone/validation/annotatedData/
112 | NLP/experiments/MDS_Capstone/validation/results/
113 | NLP/experiments/MDS_Capstone/validation/V4.0/
114 |
--------------------------------------------------------------------------------
/CONTRIBUTORS.md:
--------------------------------------------------------------------------------
1 | ## Contributors
2 |
3 | * [Professor Maite Taboada](https://www.sfu.ca/~mtaboada/): Principal Investigator and Director, Discourse Processing Lab
4 | * [Dr. Fatemeh Torabi Asr](https://ftasr.github.io/): Postdoctoral Fellow and Project Manager
5 | * [Alexandre Lopes](https://github.com/aleaugustoplus): Data Scientist and Database Manager
6 | * [Mohammad Mazraeh](https://github.com/MohMaz): Software Developer and Machine Learning Engineer
7 | * [Vagrant Gautam](https://dippedrusk.com/): Computational Linguist
8 | * [Junette Dianne Gonzales](http://www.sfu.ca/linguistics/events/2020/08/junette-gonzales-sda-minor.html): Language Data Annotator
9 | * [Lucas Chambers](https://www.sfu.ca/linguistics/events/2019/10/lucas-chambers.html): Linguist and Topic Label Annotator
10 | * [Jillian Anderson](https://github.com/jillianderson8): Big Data Developer
11 | * [Prashanth Rao](https://github.com/prrao87): Data Scientist and Software Developer
12 | * [Philipp Eibl](https://philippnoah.github.io): Data Scientist and Software Developer (French NLP)
13 | * [Valentin-Gabriel Soumah](https://github.com/Pantalaymon): Data Scientist and Software Developer (French NLP)
14 |
15 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018-2023 Maite Taboada
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | __Status: V7.0__ (Code provided as-is; only sporadic updates expected)
2 |
3 | # Measuring gender bias in media
4 |
5 | We present the code and framework for two bodies of work in this repo:
6 |
7 | 1. [The Gender Gap Tracker](https://gendergaptracker.informedopinions.org/) (GGT) for English news articles
8 | 2. [Radar de Parité](https://radardeparite.femmesexpertes.ca/) (RdP) for French news articles
9 |
10 | The GGT and RdP are automated systems that measure men and women’s voices on mainstream Canadian news outlets in real time. We analyze articles from six English outlets (for the GGT) and six French outlets (for the RdP) in Canada using Natural Language Processing (NLP), and quantify the discrepancy in proportions of men and women quoted. Our larger goals through this project are to enhance awareness of women’s portrayal in public discourse through hard evidence, and to encourage news organizations to provide a more diverse set of voices in their reporting.
11 |
12 | The Gender Gap Tracker is a collaboration between [Informed Opinions](https://informedopinions.org/), a non-profit dedicated to amplifying the voices of women and gender-diverse people in media, and Simon Fraser University, through the [Discourse Processing Lab](https://www.sfu.ca/discourse-lab.html) and the [Big Data Initiative](https://www.sfu.ca/big-data/big-data-sfu).
13 |
14 | ## Publications
15 | 1. Asr FT, Mazraeh M, Lopes A, Gautam V, Gonzales J, Rao P, Taboada M. (2021) The Gender Gap Tracker: Using Natural Language Processing to measure gender bias in media. *PLoS ONE 16(1): e0245533*. https://doi.org/10.1371/journal.pone.0245533
16 | 2. Rao P, Taboada M. (2021) Gender bias in the news: A scalable topic modelling and visualization framework. *Frontiers in Artificial Intelligence, 4(82)*. https://doi.org/10.3389/frai.2021.664737
17 | 3. Soumah, V.-G., Rao, P., Eibl, P., & Taboada, M. (2023). Radar de Parité: An NLP system to measure gender representation in French news stories. *Proceedings of the Canadian Conference on Artificial Intelligence*. https://doi.org/10.21428/594757db.b6f3c89e
18 |
19 |
20 | ## Contributors
21 |
22 | See [CONTRIBUTORS.md](CONTRIBUTORS.md)
23 | ## Contents of this repo
24 |
25 | * `scraper`: Modules for scraping English and French news articles from various Canadian news organizations' websites and RSS feeds.
26 | * `nlp`: NLP modules for performing quote extraction and entity gender annotation on both English and French news articles.
27 | * `api`: FastAPI code base exposing endpoints that serve our daily statistics to public-facing dashboards: [Gender Gap Tracker](https://gendergaptracker.informedopinions.org) and [Radar de Parité](https://radardeparite.femmesexpertes.ca)
28 | * `research_dashboard`: [A multi-page, extensible dashboard](https://gendergaptracker.research.sfu.ca/) built in Plotly Dash that allows us to explore the GGT data in more detail.
29 | * `statistics`: Scripts for running batch queries on our MongoDB database to retrieve source/gender statistics.
30 |
31 | ## Data
32 |
33 | Both the English and French datasets were downloaded from public and subscription websites of newspapers, under the ‘fair dealing’ provision in Canada’s Copyright Act. This means that the data can be made available (upon signing a licence agreement) **only** for non-commercial and/or research purposes.
34 |
35 | ## Future directions
36 |
37 | In future versions of the software, we are planning to visualize more fine-grained information about who is being quoted, separating politicians, witnesses and/or victims, from experts (as informed sources of analysis, context and opinion). We are also looking into different ways of separating wire copy from the original publication of each news outlet in order to provide a clearer view of the gender gap in Canadian media, produced by the news outlets themselves.
38 |
39 | From a research perspective, questions of salience and space arise, i.e., whether quotes by men are presented more prominently in an article, and whether men are given more space on average (perhaps counted in number of words). More nuanced questions that involve language analysis include whether the quotes are presented differently in terms of endorsement or distance from the content of the quote (*stated* vs. *claimed*). Analyses of transitivity structure in clauses can yield further insights about the types of roles women are portrayed in, complementing some of our studies' findings via dependency analyses.
40 |
41 | We are mindful of the relative lack of NLP work in languages other than English. We believe we have played at least a small role here, through the Radar de Parité's analyses of French news articles, though much more remains to be done in this domain. Our hope is that further work of this kind will not only yield interesting methodological insights, but also reveal broader patterns of gender disparity in other regions of the world. While we are actively pursuing such additional areas of inquiry, we also invite other researchers to join in this effort!
42 |
43 |
44 | ## Contact
45 |
46 | For more information about the research methodology and for questions regarding collaboration, please contact Dr. Maite Taboada.
47 |
48 | > **Maite Taboada**
49 | mtaboada@sfu.ca
50 | Director, Discourse Processing Lab
51 | Simon Fraser University
52 | Burnaby, British Columbia, Canada
53 |
--------------------------------------------------------------------------------
/api/README.md:
--------------------------------------------------------------------------------
1 | # APIs for public-facing dashboards
2 |
3 | This section hosts code for the backend APIs that serve our public-facing dashboards for our partner organization, Informed Opinions.
4 |
5 | We have two APIs: one each serving the English and French dashboards (for the Gender Gap Tracker and the Radar de Parité, respectively).
6 |
7 | ## Dashboards
8 | * English: https://gendergaptracker.informedopinions.org
9 | * French: https://radardeparite.femmesexpertes.ca
10 |
11 | ### Front end code
12 |
13 | For a clearer separation of roles and responsibilities, the front end code base is hosted externally on GitLab.
14 |
15 | * English: [Kanopi_GGT/Gender Gap Tracker](https://gitlab.com/client-transfer-group/gender-gap-tracker)
16 | * French: [Kanopi_GGT/RDP](https://gitlab.com/client-transfer-group/rdp)
17 |
18 | Access to these repos is restricted, so please reach out to mtaboada@sfu.ca to get access to the code, if required.
19 |
20 | ## Setup
21 |
22 | Both APIs are written using [FastAPI](https://fastapi.tiangolo.com/), a high-performance web framework for building APIs in Python.
23 |
24 | This code base has been tested on Python 3.9, but it should also work on newer Python versions.
25 |
26 | Install the required dependencies via `requirements.txt` as follows.
27 |
28 | Create a new virtual environment if one does not already exist, and install the dependencies into it:
29 | ```sh
30 | $ python3.9 -m venv api_venv
31 | $ api_venv/bin/python -m pip install -r requirements.txt
32 | ```
33 |
34 | For subsequent use, activate the virtual environment:
35 |
36 | ```sh
37 | $ source api_venv/bin/activate
38 | ```
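For local development, either API can then be run directly with `uvicorn`. A quick sketch, assuming the virtual environment is active, MongoDB is reachable as configured in `db/config.py`, and the front end's static directory referenced in `main.py` is present:

```sh
# Run the English API locally; the French API under api/french works the same way
$ cd /path_to_repo/api/english
$ python main.py
# or start uvicorn explicitly with hot reload
$ uvicorn main:app --host 0.0.0.0 --port 8000 --reload
```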
39 |
40 |
41 |
--------------------------------------------------------------------------------
/api/english/README.md:
--------------------------------------------------------------------------------
1 | # Gender Gap Tracker: API
2 |
3 | This section contains the code for the API that serves the [Gender Gap Tracker public dashboard](https://gendergaptracker.informedopinions.org/). The dashboard itself is hosted externally, and its front end code is hosted on this [GitLab repo](https://gitlab.com/client-transfer-group/gender-gap-tracker).
4 |
5 | ## API docs
6 |
7 | The docs can be accessed in one of two ways:
8 |
9 | * Swagger: https://gendergaptracker.informedopinions.org/docs
10 | * Useful to test out the API interactively on the browser
11 | * Redoc: https://gendergaptracker.informedopinions.org/redoc
12 | * Clean, modern UI to see the API structure in a responsive format
13 |
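As a quick smoke test, the `/expertWomen/info_by_date` endpoint that backs the dashboard can also be queried directly. Below is a minimal sketch using the `requests` library (the `begin`/`end` parameters and the response fields are the same ones documented in Swagger):

```python
import requests

# Fetch total and per-outlet gender statistics for a one-week window
resp = requests.get(
    "https://gendergaptracker.informedopinions.org/expertWomen/info_by_date",
    params={"begin": "2022-01-02", "end": "2022-01-08"},
)
resp.raise_for_status()
stats = resp.json()
print(stats["perFemales"], stats["perMales"], stats["perUnknowns"])
```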
14 |
15 | ## Run tests
16 |
17 | Tests are run via `pytest`. Set up an ssh tunnel on a Unix shell to forward the MongoDB host connection to the local machine on port 27017 as follows. In the example below, `vm12` is the alias for the primary node of the MongoDB cluster.
18 |
19 | ```
20 | $ ssh vm12 -f -N -L 27017:localhost:27017
21 | ```
22 | Run the tests:
23 |
24 | ```sh
25 | $ cd /path_to_repo/api/english
26 | $ python -m pytest -v
27 | ```
28 |
29 | ## Extensibility
30 |
31 | The code base is written so that future developers can add endpoints for new functionality, which can potentially serve other dashboards.
32 |
33 | * `db`: Contains MongoDB-specific code (config and queries) that helps interact with the GGT data in our MongoDB database
34 | * `endpoints`: Add new functionality to process and serve results via RESTful API endpoints
35 | * `schemas`: Perform response data validation so that the JSON results from the endpoint are formatted properly in the docs
36 | * `utils`: Add utility functions that support data manipulation within the routers
37 | * `tests`: Add tests to check that data from the endpoints are as expected for the front end
38 | * `gunicorn_conf.py`: Contains deployment-specific instructions for the web server, explained below.
39 |
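As an illustration, a new set of endpoints would typically live in its own module under `endpoints/` with its own `APIRouter`, mirroring `outlet_stats.py`. A minimal sketch (the `topic_stats` module, route, and collection name below are hypothetical):

```python
# endpoints/topic_stats.py (hypothetical example)
from fastapi import APIRouter, Request

topic_router = APIRouter()


@topic_router.get("/topics_by_date")
def topics_by_date(request: Request, begin: str, end: str) -> dict:
    # Reuse the MongoDB connection attached to the app in main.py
    collection = request.app.connection["topicModel"]  # hypothetical collection name
    # ... run an aggregation query and validate the response via a schema ...
    return {"begin": begin, "end": end}
```

The router would then be attached in `main.py` via `app.include_router(topic_router, prefix="/expertWomen", tags=["topics"])`, alongside the existing outlet router.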
40 | ## Deployment
41 |
42 | We perform a standard deployment of FastAPI in production, as per the best practices [shown in this blog post](https://www.vultr.com/docs/how-to-deploy-fastapi-applications-with-gunicorn-and-nginx-on-ubuntu-20-04/).
43 |
44 | * `uvicorn` is used as an async web server (compatible with the `gunicorn` web server for production apps)
45 | * We set `uvicorn` to use `uvloop` instead of `asyncio` to handle async coroutines under the hood (due to a bug with `asyncio` on CentOS)
46 | * `gunicorn` works as a process manager that starts multiple `uvicorn` processes via the `uvicorn.workers.UvicornWorker` class
47 | * `nginx` is used as a reverse proxy
48 |
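With the settings in `gunicorn_conf.py`, starting the web server amounts to something along these lines (a sketch; the actual service is provisioned by RCG, and the worker class, socket path and worker count are picked up from the config file):

```sh
$ cd /path_to_repo/api/english
$ gunicorn main:app -c gunicorn_conf.py
```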
49 | The deployment and maintenance of the web server are carried out by SFU's Research Computing Group (RCG).
50 |
51 |
52 |
53 |
--------------------------------------------------------------------------------
/api/english/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sfu-discourse-lab/GenderGapTracker/5501de31e8598e18ace47982220e91961ca5460a/api/english/__init__.py
--------------------------------------------------------------------------------
/api/english/db/config.py:
--------------------------------------------------------------------------------
1 | host = ["mongo0", "mongo1", "mongo2"]
2 | # host = "localhost"
3 | is_direct_connection = (host == "localhost")
4 |
5 | config = {
6 | "MONGO_HOST": host,
7 | "MONGO_PORT": 27017,
8 | "MONGO_ARGS": {
9 | "authSource": "admin",
10 | "readPreference": "primaryPreferred",
11 | "username": "username",
12 | "password": "password",
13 | "directConnection": is_direct_connection,
14 | },
15 | "DB_NAME": "mediaTracker",
16 | "LOGS_DIR": "logs/",
17 | }
18 |
19 |
--------------------------------------------------------------------------------
/api/english/db/mongoqueries.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 |
3 |
4 | def agg_total_per_outlet(begin_date: datetime, end_date: datetime):
5 | query = [
6 | {"$match": {"publishedAt": {"$gte": begin_date, "$lte": end_date}}},
7 | {
8 | "$group": {
9 | "_id": "$outlet",
10 | "totalArticles": {"$sum": "$totalArticles"},
11 | "totalFemales": {"$sum": "$totalFemales"},
12 | "totalMales": {"$sum": "$totalMales"},
13 | "totalUnknowns": {"$sum": "$totalUnknowns"},
14 | }
15 | },
16 | ]
17 | return query
18 |
19 |
20 | def agg_total_by_week(begin_date: datetime, end_date: datetime):
21 | query = [
22 | {"$match": {"publishedAt": {"$gte": begin_date, "$lte": end_date}}},
23 | {
24 | "$group": {
25 | "_id": {
26 | "outlet": "$outlet",
27 | "week": {"$week": "$publishedAt"},
28 | "year": {"$year": "$publishedAt"},
29 | },
30 | "totalFemales": {"$sum": "$totalFemales"},
31 | "totalMales": {"$sum": "$totalMales"},
32 | "totalUnknowns": {"$sum": "$totalUnknowns"},
33 | }
34 | },
35 | ]
36 | return query
37 |
--------------------------------------------------------------------------------
/api/english/endpoints/outlet_stats.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from fastapi import APIRouter, HTTPException, Query, Request
3 |
4 | import utils.dateutils as dateutils
5 | from db.mongoqueries import agg_total_by_week, agg_total_per_outlet
6 | from schemas.stats_by_date import TotalStatsByDate
7 | from schemas.stats_weekly import TotalStatsByWeek
8 | from utils.logger import get_logger
9 |
10 | outlet_router = APIRouter()
11 | COLLECTION_NAME = "mediaDaily"
12 | LOWER_BOUND_START_DATE = "2018-09-29"  # Set slightly earlier than the actual data start date (2018-10-01) for the pytest suite
13 | ID_MAPPING = {"Huffington Post": "HuffPost Canada"}
14 |
15 | logger = get_logger("g-tracker-fastapi-en")
16 |
17 |
18 | @outlet_router.get(
19 | "/info_by_date",
20 | response_model=TotalStatsByDate,
21 | response_description="Get total and per outlet gender statistics for English outlets between two dates",
22 | )
23 | def expertwomen_info_by_date(
24 | request: Request,
25 | begin: str = Query(description="Start date in yyyy-mm-dd format"),
26 | end: str = Query(description="End date in yyyy-mm-dd format"),
27 | ) -> TotalStatsByDate:
28 | if not dateutils.is_valid_date_range(begin, end, LOWER_BOUND_START_DATE):
29 | raise HTTPException(
30 | status_code=416,
31 | detail=f"Date range error: Should be between {LOWER_BOUND_START_DATE} and tomorrow's date",
32 | )
33 | result = _expertwomen_info_by_date(request, begin, end)
34 | logger.info("Obtained info by date for English outlets between %s and %s" % (begin, end))
35 | return result
36 |
37 |
38 | @outlet_router.get(
39 | "/weekly_info",
40 | response_model=TotalStatsByWeek,
41 | response_description="Get gender statistics per English outlet aggregated WEEKLY between two dates",
42 | )
43 | def expertwomen_weekly_info(
44 | request: Request,
45 | begin: str = Query(description="Start date in yyyy-mm-dd format"),
46 | end: str = Query(description="End date in yyyy-mm-dd format"),
47 | ) -> TotalStatsByWeek:
48 | if not dateutils.is_valid_date_range(begin, end, LOWER_BOUND_START_DATE):
49 | raise HTTPException(
50 | status_code=416,
51 | detail=f"Date range error: Should be between {LOWER_BOUND_START_DATE} and tomorrow's date",
52 | )
53 | result = _expertwomen_weekly_info(request, begin, end)
54 | logger.info("Obtained weekly info for English outlets between %s and %s" % (begin, end))
55 | return result
56 |
57 |
58 | def _expertwomen_info_by_date(request: Request, begin: str, end: str) -> TotalStatsByDate:
59 | """
60 | Run aggregation query on MongoDB data to obtain total stats within a specified date range
61 | """
62 | begin_date = dateutils.convert_date(begin)
63 | end_date = dateutils.convert_date(end)
64 |
65 | query = agg_total_per_outlet(begin_date, end_date)
66 | response = request.app.connection[COLLECTION_NAME].aggregate(query)
67 | # Work with the data in pandas
68 | source_stats = list(response)
69 | if not source_stats:
70 | logger.error("No data found for date range %s to %s" % (begin, end))
71 | df = pd.DataFrame.from_dict(source_stats)
72 | df["totalGenders"] = df["totalFemales"] + df["totalMales"] + df["totalUnknowns"]
73 | # Replace outlet names if necessary
74 | df["_id"] = df["_id"].replace(ID_MAPPING)
75 | # Take sums of total males, females, unknowns and articles and convert to dict
76 | result = df.drop("_id", axis=1).sum().to_dict()
77 | # Compute per outlet stats
78 | df["perFemales"] = df["totalFemales"] / df["totalGenders"]
79 | df["perMales"] = df["totalMales"] / df["totalGenders"]
80 | df["perUnknowns"] = df["totalUnknowns"] / df["totalGenders"]
81 | df["perArticles"] = df["totalArticles"] / result["totalArticles"]
82 | # Convert dataframe to dict prior to JSON serialization
83 | result["sources"] = df.to_dict("records")
84 | result["perFemales"] = result["totalFemales"] / result["totalGenders"]
85 | result["perMales"] = result["totalMales"] / result["totalGenders"]
86 | result["perUnknowns"] = result["totalUnknowns"] / result["totalGenders"]
87 | return result
88 |
89 |
90 | def _expertwomen_weekly_info(request: Request, begin: str, end: str) -> TotalStatsByWeek:
91 | """
92 | Run aggregation query on MongoDB data to obtain weekly stats within a specified date range
93 | """
94 | begin_date = dateutils.convert_date(begin)
95 | end_date = dateutils.convert_date(end)
96 |
97 | query = agg_total_by_week(begin_date, end_date)
98 | response = request.app.connection[COLLECTION_NAME].aggregate(query)
99 | source_stats = list(response)
100 | if not source_stats:
101 | logger.error("No data found for date range %s to %s" % (begin, end))
102 | # Work with the data in pandas
103 | df = pd.json_normalize(source_stats, max_level=1).sort_values(by="_id.outlet").reset_index(drop=True)
104 | df.rename(
105 | columns={
106 | "_id.outlet": "outlet",
107 | "_id.week": "week",
108 | "_id.year": "year",
109 | },
110 | inplace=True,
111 | )
112 | # Replace outlet names if necessary
113 | df["outlet"] = df["outlet"].replace(ID_MAPPING)
114 | # Construct DataFrame and handle begin/end dates as datetimes for summing by week
115 | df["w_begin"] = df.apply(lambda row: dateutils.get_week_bound(row["year"], row["week"], 0), axis=1)
116 | df["w_end"] = df.apply(lambda row: dateutils.get_week_bound(row["year"], row["week"], 6), axis=1)
117 | df["w_begin"], df["w_end"] = zip(
118 | *df.apply(lambda row: (pd.to_datetime(row["w_begin"]), pd.to_datetime(row["w_end"])), axis=1)
119 | )
120 | df = df.drop(columns=["week", "year"], axis=1).sort_values(by=["outlet", "w_begin"])
121 |     # Earlier versions returned partial weekly stats for weeks that span a year boundary
122 |     # (i.e., when the last week of one year continues into the next year)
123 |     # To address this, we sum the stats by week in pandas so that no partial weekly stats are passed to the front end
124 | df = df.groupby(["outlet", "w_begin", "w_end"]).sum().reset_index()
125 | df["totalGenders"] = df["totalFemales"] + df["totalMales"] + df["totalUnknowns"]
126 | df["perFemales"] = df["totalFemales"] / df["totalGenders"]
127 | df["perMales"] = df["totalMales"] / df["totalGenders"]
128 | df["perUnknowns"] = df["totalUnknowns"] / df["totalGenders"]
129 | # Convert datetimes back to string for JSON serialization
130 | df["w_begin"] = df["w_begin"].dt.strftime("%Y-%m-%d")
131 | df["w_end"] = df["w_end"].dt.strftime("%Y-%m-%d")
132 | df = df.drop(columns=["totalGenders", "totalFemales", "totalMales", "totalUnknowns"], axis=1)
133 |
134 | # Convert dataframe to dict prior to JSON serialization
135 | weekly_data = dict()
136 | for outlet in df["outlet"]:
137 | per_outlet_data = df[df["outlet"] == outlet].to_dict(orient="records")
138 | # Remove the outlet key from weekly_data
139 | [item.pop("outlet") for item in per_outlet_data]
140 | weekly_data[outlet] = per_outlet_data
141 | output = TotalStatsByWeek(outlets=weekly_data)
142 | return output
143 |
--------------------------------------------------------------------------------
/api/english/gunicorn_conf.py:
--------------------------------------------------------------------------------
1 | # gunicorn_conf.py to point gunicorn to the uvicorn workers
2 | from multiprocessing import cpu_count
3 |
4 | # Socket path
5 | bind = 'unix:/g-tracker/WomenInMedia/api/english/g-tracker.sock'
6 |
7 | # Worker Options
8 | workers = cpu_count() - 1
9 | worker_class = 'uvicorn.workers.UvicornWorker'
10 |
11 | # Logging Options
12 | loglevel = 'debug'
13 | accesslog = '/g-tracker/WomenInMedia/api/english/access_log'
14 | errorlog = '/g-tracker/WomenInMedia/api/english/error_log'
15 |
--------------------------------------------------------------------------------
/api/english/logging.conf:
--------------------------------------------------------------------------------
1 | [loggers]
2 | keys=root, gunicorn.error, gunicorn.access
3 |
4 | [handlers]
5 | keys=console, error_file, access_file
6 |
7 | [formatters]
8 | keys=generic, access
9 |
10 | [logger_root]
11 | level=INFO
12 | handlers=console
13 |
14 | [logger_gunicorn.error]
15 | level=INFO
16 | handlers=error_file
17 | propagate=1
18 | qualname=gunicorn.error
19 |
20 | [logger_gunicorn.access]
21 | level=INFO
22 | handlers=access_file
23 | propagate=0
24 | qualname=gunicorn.access
25 |
26 | [handler_console]
27 | class=StreamHandler
28 | formatter=generic
29 | args=(sys.stdout, )
30 |
31 | [handler_error_file]
32 | class=logging.FileHandler
33 | formatter=generic
34 | args=('/var/log/gunicorn/error.log',)
35 |
36 | [handler_access_file]
37 | class=logging.FileHandler
38 | formatter=access
39 | args=('/var/log/gunicorn/access.log',)
40 |
41 | [formatter_generic]
42 | format=%(asctime)s [%(process)d] [%(levelname)s] %(message)s
43 | datefmt=%Y-%m-%d %H:%M:%S
44 | class=logging.Formatter
45 |
46 | [formatter_access]
47 | format=%(message)s
48 | class=logging.Formatter
49 |
--------------------------------------------------------------------------------
/api/english/main.py:
--------------------------------------------------------------------------------
1 | from contextlib import asynccontextmanager
2 | from collections.abc import AsyncGenerator
3 | from pathlib import Path
4 |
5 | from fastapi import FastAPI
6 | from fastapi.responses import HTMLResponse
7 | from fastapi.staticfiles import StaticFiles
8 | from pymongo import MongoClient
9 |
10 | from db.config import config
11 | from endpoints.outlet_stats import outlet_router
12 |
13 | # Constants
14 | HOST = config["MONGO_HOST"]
15 | PORT = config["MONGO_PORT"]
16 | MONGO_ARGS = config["MONGO_ARGS"]
17 | DB = config["DB_NAME"]
18 | STATIC_PATH = "gender-gap-tracker"
19 | STATIC_HTML = "tracker.html"
20 |
21 |
22 | @asynccontextmanager
23 | async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
24 | """Async context manager for MongoDB connection."""
25 | app.mongodb_client = MongoClient(HOST, PORT, **MONGO_ARGS)
26 | app.connection = app.mongodb_client[DB]
27 | print("Successfully connected to MongoDB")
28 | yield
29 | app.mongodb_client.close()
30 | print("Successfully closed MongoDB connection")
31 |
32 |
33 | app = FastAPI(
34 | title="Gender Gap Tracker",
35 | description="RESTful API for the Gender Gap Tracker public-facing dashboard",
36 | version="1.1.4",
37 | lifespan=lifespan,
38 | )
39 |
40 |
41 | @app.get("/", include_in_schema=False)
42 | async def root() -> HTMLResponse:
43 | with open(Path(f"{STATIC_PATH}") / STATIC_HTML, "r") as f:
44 | html_content = f.read()
45 | return HTMLResponse(content=html_content, media_type="text/html")
46 |
47 |
48 | # Attach routes
49 | app.include_router(outlet_router, prefix="/expertWomen", tags=["info"])
50 | # Add additional routers here for future endpoints
51 | # ...
52 |
53 | # Serve static files for front end from directory specified as STATIC_PATH
54 | app.mount("/", StaticFiles(directory=STATIC_PATH), name="static")
55 |
56 |
57 | if __name__ == "__main__":
58 | import uvicorn
59 | uvicorn.run("main:app", host="0.0.0.0", port=8000, loop="uvloop", reload=True)
60 |
--------------------------------------------------------------------------------
/api/english/schemas/stats_by_date.py:
--------------------------------------------------------------------------------
1 | from math import isnan
2 | from typing import List
3 |
4 | from pydantic import BaseModel, Field, root_validator
5 |
6 |
7 | def valid_percentage(_, values):
8 | """Avoid NaNs by setting them to 0.0"""
9 | for key in ["perFemales", "perMales", "perUnknowns"]:
10 | if isnan(values[key]):
11 | values[key] = 0.0
12 | return values
13 |
14 |
15 | class OutletStatsByDate(BaseModel):
16 | # In Pydantic, the underscore prefix of a field like `_id` is treated as a private attribute
17 | # We thus define an alias so that the `_id` field can be referenced as is.
18 | id: str = Field(alias="_id")
19 | totalArticles: int
20 | totalFemales: int
21 | totalMales: int
22 | totalUnknowns: int
23 | totalGenders: int
24 | perFemales: float
25 | perMales: float
26 | perUnknowns: float
27 | perArticles: float
28 |
29 | # validators
30 | _avoid_nans = root_validator(allow_reuse=True)(valid_percentage)
31 |
32 | class TotalStatsByDate(BaseModel):
33 | totalArticles: int
34 | totalFemales: int
35 | totalMales: int
36 | totalUnknowns: int
37 | totalGenders: int
38 | perFemales: float
39 | perMales: float
40 | perUnknowns: float
41 | sources: List[OutletStatsByDate]
42 |
43 | # validators
44 | _avoid_nans = root_validator(allow_reuse=True)(valid_percentage)
45 |
46 |
--------------------------------------------------------------------------------
/api/english/schemas/stats_weekly.py:
--------------------------------------------------------------------------------
1 | from datetime import date, datetime
2 | from math import isnan
3 | from typing import Dict, List
4 |
5 | from pydantic import BaseModel, root_validator, validator
6 |
7 |
8 | class OutletStatsByWeek(BaseModel):
9 | w_begin: date
10 | w_end: date
11 | perFemales: float
12 | perMales: float
13 | perUnknowns: float
14 |
15 | # validation
16 | @validator("w_begin", "w_end", pre=True, always=True)
17 | def valid_date(dateval):
18 | """Validate a date string to be of the format yyyy-mm-dd"""
19 | if isinstance(dateval, str):
20 | return datetime.strptime(dateval, "%Y-%m-%d").strftime("%Y-%m-%d")
21 | return dateval
22 |
23 | @root_validator
24 | def _valid_percentage(cls, values):
25 | """Avoid NaNs by setting them to 0.0"""
26 | for key in ["perFemales", "perMales", "perUnknowns"]:
27 | if isnan(values[key]):
28 | values[key] = 0.0
29 | return values
30 |
31 |
32 | class TotalStatsByWeek(BaseModel):
33 | outlets: Dict[str, List[OutletStatsByWeek]]
34 |
35 | class Config:
36 | schema_extra = {
37 | "example": {
38 | "outlets": {
39 | "Outlet 1": [
40 | {
41 | "w_begin": "2021-12-26",
42 | "w_end": "2022-01-01",
43 | "perFemales": 0.3915470494417863,
44 | "perMales": 0.6052631578947368,
45 | "perUnknowns": 0.003189792663476874,
46 | },
47 | {
48 | "w_begin": "2022-01-02",
49 | "w_end": "2022-01-08",
50 | "perFemales": 0.39904862579281186,
51 | "perMales": 0.6004228329809725,
52 | "perUnknowns": 0.0005285412262156448,
53 | },
54 | ],
55 | "Outlet 2": [
56 | {
57 | "w_begin": "2021-12-26",
58 | "w_end": "2022-01-01",
59 | "perFemales": 0.34763636363636363,
60 | "perMales": 0.648,
61 | "perUnknowns": 0.004363636363636364,
62 | },
63 | {
64 | "w_begin": "2022-01-02",
65 | "w_end": "2022-01-08",
66 | "perFemales": 0.0,
67 | "perMales": 0.0,
68 | "perUnknowns": 0.0,
69 | },
70 | ],
71 | }
72 | }
73 | }
74 |
--------------------------------------------------------------------------------
/api/english/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sfu-discourse-lab/GenderGapTracker/5501de31e8598e18ace47982220e91961ca5460a/api/english/tests/__init__.py
--------------------------------------------------------------------------------
/api/english/tests/test_mock_outlet_stats.py:
--------------------------------------------------------------------------------
1 | from fastapi.testclient import TestClient
2 |
3 | from english.main import app
4 | from endpoints.outlet_stats import ID_MAPPING
5 |
6 | PREFIX = "expertWomen"
7 |
8 |
9 | def test_get_info_by_date():
10 | with TestClient(app) as client:
11 | # We test mock data in a date range outside that specified in outlet_stats.py
12 | begin = "2018-09-29"
13 | end = "2018-09-30"
14 | response = client.get(f"/{PREFIX}/info_by_date?begin={begin}&end={end}")
15 | assert response.status_code == 200
16 | body = response.json()
17 | # Ensure there are no NaN values due to DivisionByZero when no sources exist
18 | assert body.get("perFemales") >= 0
19 | assert body.get("perMales") >= 0
20 | assert body.get("perUnknowns") >= 0
21 | assert isinstance(body.get("sources"), list)
22 | for obj in body.get("sources"):
23 | assert isinstance(obj.get("_id"), str)
24 | assert obj.get("perFemales") >= 0
25 | assert obj.get("perMales") >= 0
26 | assert obj.get("perUnknowns") >= 0
27 |
28 |
29 | def test_get_info_outlet_name_mapping_in_list():
30 | with TestClient(app) as client:
31 | begin = "2018-09-29"
32 | end = "2018-09-30"
33 | response = client.get(f"/{PREFIX}/info_by_date?begin={begin}&end={end}")
34 | outlet_list = [item.get("_id") for item in response.json().get("sources")]
35 | for outlet in ID_MAPPING:
36 | assert ID_MAPPING[outlet] in outlet_list
37 |
38 |
39 | def test_weekly_info_outlet_name_mapping_in_list():
40 | with TestClient(app) as client:
41 | begin = "2018-09-29"
42 | end = "2018-09-30"
43 | response = client.get(f"/{PREFIX}/weekly_info?begin={begin}&end={end}")
44 | outlet_list = [k for k, _ in response.json().get("outlets").items()]
45 | for outlet in ID_MAPPING:
46 | assert ID_MAPPING[outlet] in outlet_list
--------------------------------------------------------------------------------
/api/english/tests/test_outlet_stats.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timedelta
2 | from fastapi.testclient import TestClient
3 |
4 | from english.main import app
5 | from endpoints.outlet_stats import LOWER_BOUND_START_DATE
6 |
7 | PREFIX = "expertWomen"
8 |
9 |
10 | def test_read_main():
11 | with TestClient(app) as client:
12 | response = client.get("/")
13 | assert response.status_code == 200
14 |
15 |
16 | def test_get_info_by_date():
17 | with TestClient(app) as client:
18 | # Choose a date range that is in the recent past
19 | begin = datetime.today().date() - timedelta(days=7)
20 | end = datetime.today().date() - timedelta(days=3)
21 | response = client.get(f"/{PREFIX}/info_by_date?begin={begin}&end={end}")
22 | assert response.status_code == 200
23 | body = response.json()
24 | assert body.get("perFemales") >= 0
25 | assert body.get("perMales") >= 0
26 | assert body.get("perUnknowns") >= 0
27 | assert isinstance(body.get("sources"), list)
28 | for obj in body.get("sources"):
29 | assert isinstance(obj.get("_id"), str)
30 | assert obj.get("perFemales") >= 0
31 | assert obj.get("perMales") >= 0
32 | assert obj.get("perUnknowns") >= 0
33 |
34 |
35 | def test_get_info_by_date_invalid_date_range():
36 | with TestClient(app) as client:
37 | lower_bound_date = datetime.fromisoformat(LOWER_BOUND_START_DATE).date()
38 | past = lower_bound_date - timedelta(days=2)
39 | response = client.get(f"/{PREFIX}/info_by_date?begin={past}&end={lower_bound_date}")
40 | assert (
41 | response.status_code == 416
42 | ), "English articles start on 2018-10-01, so start date should be 2018-10-01 or later"
43 | today = datetime.today().date()
44 | future = today + timedelta(days=2)
45 | response = client.get(f"/{PREFIX}/info_by_date?begin={today}&end={future}")
46 | assert response.status_code == 416, "Cannot request stats for dates in the future"
47 |
48 |
49 | def test_get_weekly_info():
50 | with TestClient(app) as client:
51 | # Choose a date range that is in the recent past
52 | begin = datetime.today().date() - timedelta(days=7)
53 | end = datetime.today().date() - timedelta(days=3)
54 | response = client.get(f"/{PREFIX}/weekly_info?begin={begin}&end={end}")
55 | assert response.status_code == 200
56 | body = response.json().get("outlets")
57 | assert len(body) > 0
58 | for _, stats in body.items():
59 | for week_id in stats:
60 | assert isinstance(week_id.get("w_begin"), str)
61 | assert isinstance(week_id.get("w_end"), str)
62 | assert week_id.get("perFemales") >= 0
63 | assert week_id.get("perMales") >= 0
64 | assert week_id.get("perUnknowns") >= 0
65 |
66 |
67 | def test_get_weekly_info_invalid_date_range():
68 | with TestClient(app) as client:
69 | lower_bound_date = datetime.fromisoformat(LOWER_BOUND_START_DATE).date()
70 | past = lower_bound_date - timedelta(days=2)
71 | response = client.get(f"/{PREFIX}/weekly_info?begin={past}&end={lower_bound_date}")
72 | assert (
73 | response.status_code == 416
74 | ), "English articles start on 2018-10-01, so start date should be 2018-10-01 or later"
75 | today = datetime.today().date()
76 | future = today + timedelta(days=2)
77 | response = client.get(f"/{PREFIX}/weekly_info?begin={today}&end={future}")
78 | assert response.status_code == 416, "Cannot request stats for dates in the future"
--------------------------------------------------------------------------------
/api/english/utils/dateutils.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timedelta
2 |
3 |
4 | def is_valid_date_range(start_date: str, end_date: str, lower_bound: str) -> bool:
5 |     tomorrow = datetime.today() + timedelta(days=1)
6 |     if (tomorrow >= convert_date(end_date)) and (
7 | convert_date(start_date) >= convert_date(lower_bound)
8 | ):
9 | return True
10 | else:
11 | return False
12 |
13 |
14 | def convert_date(date_str: str) -> datetime:
15 | return datetime.strptime(date_str, "%Y-%m-%d")
16 |
17 |
18 | def get_week_bound(year: int, week: int, day_of_week: int) -> str:
19 | """
20 | Get begin or end date for a week of the year as a string YYYY-MM-DD
21 | - Start of week is Sunday
22 | - For start of week, set `day_of_week` to 0
23 | - For end of week, set `day_of_week` to 6
24 | """
25 | w_bound = datetime.strptime(f"{year}-{week}-{day_of_week}", "%Y-%U-%w")
26 | w_bound_str = w_bound.strftime("%Y-%m-%d")
27 | return w_bound_str
28 |
--------------------------------------------------------------------------------
/api/english/utils/logger.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import sys
3 | from logging.handlers import TimedRotatingFileHandler
4 | from pathlib import Path
5 |
6 |
7 | def get_logger(filename: str = "g-tracker-fastapi") -> logging.Logger:
8 | filename = f"{filename}.log" if not filename.endswith(".log") else filename
9 | Path("logs").mkdir(parents=True, exist_ok=True)
10 | log = logging.getLogger(filename)
11 | log.setLevel(logging.INFO)
12 | format = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
13 | rotateHandler = TimedRotatingFileHandler(
14 | Path("logs") / filename,
15 | when="midnight",
16 | backupCount=3,
17 | )
18 | rotateHandler.setFormatter(format)
19 | stream = logging.StreamHandler(sys.stdout)
20 | stream.setFormatter(format)
21 | log.addHandler(rotateHandler)
22 | log.addHandler(stream)
23 | return log
24 |
--------------------------------------------------------------------------------
/api/french/README.md:
--------------------------------------------------------------------------------
1 | # Radar de Parité: API
2 |
3 | This section contains the FastAPI code base for the API that serves the [Radar de Parité public dashboard](https://radardeparite.femmesexpertes.ca/). The dashboard itself is hosted externally, and its front end code is hosted on this [GitLab repo](https://gitlab.com/client-transfer-group/rdp).
4 |
5 | ## API docs
6 |
7 | The docs can be accessed in one of two ways:
8 |
9 | * Swagger: https://radardeparite.femmesexpertes.ca/docs
10 | * Useful to test out the API interactively on the browser
11 | * Redoc: https://radardeparite.femmesexpertes.ca/redoc
12 | * Clean, modern UI to see the API structure in a responsive format
13 |
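As a quick check, the `/femmesExpertes/info_by_date` endpoint behind the dashboard can also be queried directly, for example with `curl` (a sketch; RdP data starts in the fall of 2021, so choose dates on or after 2021-10-01):

```sh
$ curl "https://radardeparite.femmesexpertes.ca/femmesExpertes/info_by_date?begin=2022-01-02&end=2022-01-08"
```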
14 | ## Run tests
15 |
16 | Tests are run via `pytest`. Set up an ssh tunnel on a Unix shell to forward the MongoDB host connection to the local machine on port 27017 as follows. In the example below, `vm12` is the alias for the primary node of the MongoDB cluster.
17 |
18 | ```
19 | $ ssh vm12 -f -N -L 27017:localhost:27017
20 | ```
21 | Run the tests:
22 |
23 | ```sh
24 | $ cd /path_to_repo/api/french
25 | $ python -m pytest -v
26 | ```
27 |
28 | ## Extensibility
29 |
30 | The code base is written so that future developers can add endpoints for new functionality, which can potentially serve other dashboards.
31 |
32 | * `db`: Contains MongoDB-specific code (config and queries) that helps interact with the RdP data in our MongoDB database
33 | * `endpoints`: Add new functionality to process and serve results via RESTful API endpoints
34 | * `schemas`: Perform response data validation so that the JSON results from the endpoint are formatted properly in the docs
35 | * `utils`: Add utility functions that support data manipulation within the routers
36 | * `tests`: Add tests to check that data from the endpoints are as expected for the front end
37 | * `gunicorn_conf.py`: Contains deployment-specific instructions for the web server, explained below.
38 |
39 | ## Deployment
40 |
41 | We perform a standard deployment of FastAPI in production, as per the best practices [shown in this blog post](https://www.vultr.com/docs/how-to-deploy-fastapi-applications-with-gunicorn-and-nginx-on-ubuntu-20-04/).
42 |
43 | * `uvicorn` is used as an async web server (compatible with the `gunicorn` web server for production apps)
44 | * We set `uvicorn` to use `uvloop` instead of `asyncio` to handle async coroutines under the hood (due to a bug with `asyncio` on CentOS)
45 | * `gunicorn` works as a process manager that starts multiple `uvicorn` processes via the `uvicorn.workers.UvicornWorker` class
46 | * `nginx` is used as a reverse proxy
47 |
48 | The deployment and maintenance of the web server are carried out by SFU's Research Computing Group (RCG).
49 |
50 |
51 |
--------------------------------------------------------------------------------
/api/french/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sfu-discourse-lab/GenderGapTracker/5501de31e8598e18ace47982220e91961ca5460a/api/french/__init__.py
--------------------------------------------------------------------------------
/api/french/db/config.py:
--------------------------------------------------------------------------------
1 | host = ["mongo0", "mongo1", "mongo2"]
2 | # host = "localhost"
3 | is_direct_connection = (host == "localhost")
4 |
5 | config = {
6 | "MONGO_HOST": host,
7 | "MONGO_PORT": 27017,
8 | "MONGO_ARGS": {
9 | "authSource": "admin",
10 | "readPreference": "primaryPreferred",
11 | "username": "username",
12 | "password": "password",
13 | "directConnection": is_direct_connection,
14 | },
15 | "DB_NAME": "mediaTracker",
16 | "LOGS_DIR": "logs/",
17 | }
18 |
--------------------------------------------------------------------------------
/api/french/db/mongoqueries.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 |
3 | def agg_total_per_outlet(begin_date: datetime, end_date: datetime):
4 | query = [
5 | {"$match": {"publishedAt": {"$gte": begin_date, "$lte": end_date}}},
6 | {
7 | "$group": {
8 | "_id": "$outlet",
9 | "totalArticles": {"$sum": "$totalArticles"},
10 | "totalFemales": {"$sum": "$totalFemales"},
11 | "totalMales": {"$sum": "$totalMales"},
12 | "totalUnknowns": {"$sum": "$totalUnknowns"},
13 | }
14 | },
15 | ]
16 | return query
17 |
18 |
19 | def agg_total_by_week(begin_date: datetime, end_date: datetime):
20 | query = [
21 | {"$match": {"publishedAt": {"$gte": begin_date, "$lte": end_date}}},
22 | {
23 | "$group": {
24 | "_id": {
25 | "outlet": "$outlet",
26 | "week": {"$week": "$publishedAt"},
27 | "year": {"$year": "$publishedAt"},
28 | },
29 | "totalFemales": {"$sum": "$totalFemales"},
30 | "totalMales": {"$sum": "$totalMales"},
31 | "totalUnknowns": {"$sum": "$totalUnknowns"},
32 | }
33 | },
34 | ]
35 | return query
36 |
--------------------------------------------------------------------------------
/api/french/endpoints/outlet_stats.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from fastapi import APIRouter, HTTPException, Query, Request
3 |
4 | import utils.dateutils as dateutils
5 | from db.mongoqueries import agg_total_by_week, agg_total_per_outlet
6 | from schemas.stats_by_date import TotalStatsByDate
7 | from schemas.stats_weekly import TotalStatsByWeek
8 | from utils.logger import get_logger
9 | from typing import Any
10 |
11 | outlet_router = APIRouter()
12 | COLLECTION_NAME = "mediaDailyFrench"
13 | LOWER_BOUND_START_DATE = "2021-09-29"  # Set slightly earlier than the actual data start date (2021-10-01) for the pytest suite
14 | ID_MAPPING = {
15 | "Journal De Montreal": "Le Journal de Montréal",
16 | "TVA News": "TVA Nouvelles",
17 | "Radio Canada": "Radio-Canada",
18 | }
19 |
20 | logger = get_logger("g-tracker-fastapi-fr")
21 |
22 |
23 | @outlet_router.get(
24 | "/info_by_date",
25 | response_model=TotalStatsByDate,
26 | response_description="Get total and per outlet gender statistics for French outlets between two dates",
27 | )
28 | def femmesexpertes_info_by_date(
29 | request: Request,
30 | begin: str = Query(description="Start date in yyyy-mm-dd format"),
31 | end: str = Query(description="End date in yyyy-mm-dd format"),
32 | ) -> TotalStatsByDate:
33 | if not dateutils.is_valid_date_range(begin, end, LOWER_BOUND_START_DATE):
34 | raise HTTPException(
35 | status_code=416,
36 | detail=f"Date range error: Should be between {LOWER_BOUND_START_DATE} and tomorrow's date",
37 | )
38 | result = _femmesexpertes_info_by_date(request, begin, end)
39 | logger.info("Obtained info by date for French outlets between %s and %s" % (begin, end))
40 | return result
41 |
42 |
43 | @outlet_router.get(
44 | "/weekly_info",
45 | response_model=TotalStatsByWeek,
46 | response_description="Get gender statistics per French outlet aggregated WEEKLY between two dates",
47 | )
48 | def femmesexpertes_weekly_info(
49 | request: Request,
50 | begin: str = Query(description="Start date in yyyy-mm-dd format"),
51 | end: str = Query(description="End date in yyyy-mm-dd format"),
52 | ) -> TotalStatsByWeek:
53 | if not dateutils.is_valid_date_range(begin, end, LOWER_BOUND_START_DATE):
54 | raise HTTPException(
55 | status_code=416,
56 | detail=f"Date range error: Should be between {LOWER_BOUND_START_DATE} and tomorrow's date",
57 | )
58 | result = _femmesexpertes_weekly_info(request, begin, end)
59 | logger.info("Obtained weekly info for French outlets between %s and %s" % (begin, end))
60 | return result
61 |
62 |
63 | def _femmesexpertes_info_by_date(request: Request, begin: str, end: str) -> TotalStatsByDate:
64 | """
65 | Run aggregation query on MongoDB data to obtain total stats within a specified date range
66 | """
67 | begin_date = dateutils.convert_date(begin)
68 | end_date = dateutils.convert_date(end)
69 |
70 | query = agg_total_per_outlet(begin_date, end_date)
71 | response = request.app.connection[COLLECTION_NAME].aggregate(query)
72 | # Work with the data in pandas
73 | source_stats = list(response)
74 | if not source_stats:
75 | logger.error("No data found for date range %s to %s" % (begin, end))
76 | df = pd.DataFrame.from_dict(source_stats)
77 | df["totalGenders"] = df["totalFemales"] + df["totalMales"] + df["totalUnknowns"]
78 | # Replace outlet names if necessary
79 | df["_id"] = df["_id"].replace(ID_MAPPING)
80 | # Take sums of total males, females, unknowns and articles and convert to dict
81 | result = df.drop("_id", axis=1).sum().to_dict()
82 | # Compute per outlet stats
83 | df["perFemales"] = df["totalFemales"] / df["totalGenders"]
84 | df["perMales"] = df["totalMales"] / df["totalGenders"]
85 | df["perUnknowns"] = df["totalUnknowns"] / df["totalGenders"]
86 | df["perArticles"] = df["totalArticles"] / result["totalArticles"]
87 | # Convert dataframe to dict prior to JSON serialization
88 | result["sources"] = df.to_dict("records")
89 | result["perFemales"] = result["totalFemales"] / result["totalGenders"]
90 | result["perMales"] = result["totalMales"] / result["totalGenders"]
91 | result["perUnknowns"] = result["totalUnknowns"] / result["totalGenders"]
92 | return result
93 |
94 |
95 | def _femmesexpertes_weekly_info(request: Request, begin: str, end: str) -> TotalStatsByWeek:
96 | """
97 | Run aggregation query on MongoDB data to obtain weekly stats within a specified date range
98 | """
99 | begin_date = dateutils.convert_date(begin)
100 | end_date = dateutils.convert_date(end)
101 |
102 | query = agg_total_by_week(begin_date, end_date)
103 | response = request.app.connection[COLLECTION_NAME].aggregate(query)
104 | source_stats = list(response)
105 | if not source_stats:
106 | logger.error("No data found for date range %s to %s" % (begin, end))
107 | # Work with the data in pandas
108 | df = pd.json_normalize(source_stats, max_level=1).sort_values(by="_id.outlet").reset_index(drop=True)
109 | df.rename(
110 | columns={
111 | "_id.outlet": "outlet",
112 | "_id.week": "week",
113 | "_id.year": "year",
114 | },
115 | inplace=True,
116 | )
117 | # Replace outlet names if necessary
118 | df["outlet"] = df["outlet"].replace(ID_MAPPING)
119 | # Construct DataFrame and handle begin/end dates as datetimes for summing by week
120 | df["w_begin"] = df.apply(lambda row: dateutils.get_week_bound(row["year"], row["week"], 0), axis=1)
121 | df["w_end"] = df.apply(lambda row: dateutils.get_week_bound(row["year"], row["week"], 6), axis=1)
122 | df["w_begin"], df["w_end"] = zip(
123 | *df.apply(lambda row: (pd.to_datetime(row["w_begin"]), pd.to_datetime(row["w_end"])), axis=1)
124 | )
125 | df = df.drop(columns=["week", "year"], axis=1).sort_values(by=["outlet", "w_begin"])
126 |     # Earlier versions returned partial weekly stats for weeks that span a year boundary
127 |     # (i.e., when the last week of one year continues into the next year)
128 |     # To address this, we sum the stats by week in pandas so that no partial weekly stats are passed to the front end
129 | df = df.groupby(["outlet", "w_begin", "w_end"]).sum().reset_index()
130 | df["totalGenders"] = df["totalFemales"] + df["totalMales"] + df["totalUnknowns"]
131 | df["perFemales"] = df["totalFemales"] / df["totalGenders"]
132 | df["perMales"] = df["totalMales"] / df["totalGenders"]
133 | df["perUnknowns"] = df["totalUnknowns"] / df["totalGenders"]
134 | # Convert datetimes back to string for JSON serialization
135 | df["w_begin"] = df["w_begin"].dt.strftime("%Y-%m-%d")
136 | df["w_end"] = df["w_end"].dt.strftime("%Y-%m-%d")
137 | df = df.drop(columns=["totalGenders", "totalFemales", "totalMales", "totalUnknowns"], axis=1)
138 |
139 | # Convert dataframe to dict prior to JSON serialization
140 | weekly_data = dict()
141 | for outlet in df["outlet"]:
142 | per_outlet_data = df[df["outlet"] == outlet].to_dict(orient="records")
143 | # Remove the outlet key from weekly_data
144 | [item.pop("outlet") for item in per_outlet_data]
145 | weekly_data[outlet] = per_outlet_data
146 | output = TotalStatsByWeek(outlets=weekly_data)
147 | return output
148 |
--------------------------------------------------------------------------------
/api/french/gunicorn_conf.py:
--------------------------------------------------------------------------------
1 | # gunicorn_conf.py to point gunicorn to the uvicorn workers
2 | from multiprocessing import cpu_count
3 |
4 | # Socket path
5 | bind = 'unix:/g-tracker/WomenInMedia/api/french/g-tracker-fr.sock'
6 |
7 | # Worker Options
8 | workers = cpu_count() - 1
9 | worker_class = 'uvicorn.workers.UvicornWorker'
10 |
11 | # Logging Options
12 | loglevel = 'debug'
13 | accesslog = '/g-tracker/WomenInMedia/api/french/access_log'
14 | errorlog = '/g-tracker/WomenInMedia/api/french/error_log'
15 |
--------------------------------------------------------------------------------
/api/french/logging.conf:
--------------------------------------------------------------------------------
1 | [loggers]
2 | keys=root, gunicorn.error, gunicorn.access
3 |
4 | [handlers]
5 | keys=console, error_file, access_file
6 |
7 | [formatters]
8 | keys=generic, access
9 |
10 | [logger_root]
11 | level=INFO
12 | handlers=console
13 |
14 | [logger_gunicorn.error]
15 | level=INFO
16 | handlers=error_file
17 | propagate=1
18 | qualname=gunicorn.error
19 |
20 | [logger_gunicorn.access]
21 | level=INFO
22 | handlers=access_file
23 | propagate=0
24 | qualname=gunicorn.access
25 |
26 | [handler_console]
27 | class=StreamHandler
28 | formatter=generic
29 | args=(sys.stdout, )
30 |
31 | [handler_error_file]
32 | class=logging.FileHandler
33 | formatter=generic
34 | args=('/var/log/gunicorn/error.log',)
35 |
36 | [handler_access_file]
37 | class=logging.FileHandler
38 | formatter=access
39 | args=('/var/log/gunicorn/access.log',)
40 |
41 | [formatter_generic]
42 | format=%(asctime)s [%(process)d] [%(levelname)s] %(message)s
43 | datefmt=%Y-%m-%d %H:%M:%S
44 | class=logging.Formatter
45 |
46 | [formatter_access]
47 | format=%(message)s
48 | class=logging.Formatter
49 |
--------------------------------------------------------------------------------
/api/french/main.py:
--------------------------------------------------------------------------------
1 | from contextlib import asynccontextmanager
2 | from collections.abc import AsyncGenerator
3 | from pathlib import Path
4 |
5 | from fastapi import FastAPI
6 | from fastapi.responses import HTMLResponse
7 | from fastapi.staticfiles import StaticFiles
8 | from pymongo import MongoClient
9 |
10 | from db.config import config
11 | from endpoints.outlet_stats import outlet_router
12 |
13 | # Constants
14 | HOST = config["MONGO_HOST"]
15 | PORT = config["MONGO_PORT"]
16 | MONGO_ARGS = config["MONGO_ARGS"]
17 | DB = config["DB_NAME"]
18 | STATIC_PATH = "rdp"
19 | STATIC_HTML = "tracker.html"
20 |
21 |
22 | @asynccontextmanager
23 | async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
24 | """Async context manager for MongoDB connection."""
25 | app.mongodb_client = MongoClient(HOST, PORT, **MONGO_ARGS)
26 | app.connection = app.mongodb_client[DB]
27 | print("Successfully connected to MongoDB")
28 | yield
29 | app.mongodb_client.close()
30 | print("Successfully closed MongoDB connection")
31 |
32 |
33 | app = FastAPI(
34 | title="Radar de Parité",
35 | description="RESTful API for the Radar de Parité public-facing dashboard",
36 | version="1.1.4",
37 | lifespan=lifespan,
38 | )
39 |
40 |
41 | @app.get("/", include_in_schema=False)
42 | async def root() -> HTMLResponse:
43 | with open(Path(f"{STATIC_PATH}") / STATIC_HTML, "r") as f:
44 | html_content = f.read()
45 | return HTMLResponse(content=html_content, media_type="text/html")
46 |
47 |
48 | # Attach routes
49 | app.include_router(outlet_router, prefix="/femmesExpertes", tags=["info"])
50 | # Add additional routers here for future endpoints
51 | # ...
52 |
53 | # Serve static files for front end from directory specified as STATIC_PATH
54 | app.mount("/", StaticFiles(directory=STATIC_PATH), name="static")
55 |
56 |
57 | if __name__ == "__main__":
58 | import uvicorn
59 | uvicorn.run("main:app", host="0.0.0.0", port=8000, loop="uvloop", reload=True)
60 |
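For a quick local smoke test of the app defined above, a minimal sketch using `requests` (already listed in `api/requirements.txt`) could look like the following. It assumes the server is running locally on port 8000 (e.g. via the `__main__` block above) and that data exists for the chosen dates; the date range is purely illustrative.

```python
# Illustrative smoke test only: endpoint prefix and query parameters match the
# routes attached above; the date range is a placeholder.
import requests

resp = requests.get(
    "http://localhost:8000/femmesExpertes/info_by_date",
    params={"begin": "2021-09-29", "end": "2021-09-30"},
)
resp.raise_for_status()
body = resp.json()
print(body["perFemales"], body["perMales"], body["perUnknowns"])
```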
--------------------------------------------------------------------------------
/api/french/schemas/stats_by_date.py:
--------------------------------------------------------------------------------
1 | from math import isnan
2 | from typing import List
3 |
4 | from pydantic import BaseModel, Field, root_validator
5 |
6 |
7 | def valid_percentage(_, values):
8 | """Avoid NaNs by setting them to 0.0"""
9 | for key in ["perFemales", "perMales", "perUnknowns"]:
10 | if isnan(values[key]):
11 | values[key] = 0.0
12 | return values
13 |
14 |
15 | class OutletStatsByDate(BaseModel):
16 |     # Pydantic treats fields with a leading underscore (like `_id`) as private attributes.
17 |     # We therefore define an alias so that the raw `_id` key from MongoDB can populate this field.
18 | id: str = Field(alias="_id")
19 | totalArticles: int
20 | totalFemales: int
21 | totalMales: int
22 | totalUnknowns: int
23 | totalGenders: int
24 | perFemales: float
25 | perMales: float
26 | perUnknowns: float
27 | perArticles: float
28 |
29 | # validators
30 | _avoid_nans = root_validator(allow_reuse=True)(valid_percentage)
31 |
32 |
33 | class TotalStatsByDate(BaseModel):
34 | totalArticles: int
35 | totalFemales: int
36 | totalMales: int
37 | totalUnknowns: int
38 | totalGenders: int
39 | perFemales: float
40 | perMales: float
41 | perUnknowns: float
42 | sources: List[OutletStatsByDate]
43 |
44 | # validators
45 | _avoid_nans = root_validator(allow_reuse=True)(valid_percentage)
46 |
47 |
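As a rough illustration of the `_id` alias and the NaN guard defined above, the following hypothetical record (outlet name and counts are made up) shows the raw `_id` key populating the `id` field and NaN percentages being reset to 0.0:

```python
# Hypothetical usage of OutletStatsByDate; all values below are placeholders.
from math import nan

record = {
    "_id": "Outlet 1",
    "totalArticles": 0, "totalFemales": 0, "totalMales": 0,
    "totalUnknowns": 0, "totalGenders": 0,
    "perFemales": nan, "perMales": nan, "perUnknowns": nan,
    "perArticles": 0.0,
}
stats = OutletStatsByDate(**record)
print(stats.id, stats.perFemales)  # -> Outlet 1 0.0
```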
--------------------------------------------------------------------------------
/api/french/schemas/stats_weekly.py:
--------------------------------------------------------------------------------
1 | from datetime import date, datetime
2 | from math import isnan
3 | from typing import Dict, List
4 |
5 | from pydantic import BaseModel, root_validator, validator
6 |
7 |
8 | class OutletStatsByWeek(BaseModel):
9 | w_begin: date
10 | w_end: date
11 | perFemales: float
12 | perMales: float
13 | perUnknowns: float
14 |
15 | # validation
16 | @validator("w_begin", "w_end", pre=True, always=True)
17 | def valid_date(dateval):
18 | """Validate a date string to be of the format yyyy-mm-dd"""
19 | if isinstance(dateval, str):
20 | return datetime.strptime(dateval, "%Y-%m-%d").strftime("%Y-%m-%d")
21 | return dateval
22 |
23 | @root_validator
24 | def _valid_percentage(cls, values):
25 | """Avoid NaNs by setting them to 0.0"""
26 | for key in ["perFemales", "perMales", "perUnknowns"]:
27 | if isnan(values[key]):
28 | values[key] = 0.0
29 | return values
30 |
31 |
32 | class TotalStatsByWeek(BaseModel):
33 | outlets: Dict[str, List[OutletStatsByWeek]]
34 |
35 | class Config:
36 | schema_extra = {
37 | "example": {
38 | "outlets": {
39 | "Outlet 1": [
40 | {
41 | "w_begin": "2021-12-26",
42 | "w_end": "2022-01-01",
43 | "perFemales": 0.3915470494417863,
44 | "perMales": 0.6052631578947368,
45 | "perUnknowns": 0.003189792663476874,
46 | },
47 | {
48 | "w_begin": "2022-01-02",
49 | "w_end": "2022-01-08",
50 | "perFemales": 0.39904862579281186,
51 | "perMales": 0.6004228329809725,
52 | "perUnknowns": 0.0005285412262156448,
53 | },
54 | ],
55 | "Outlet 2": [
56 | {
57 | "w_begin": "2021-12-26",
58 | "w_end": "2022-01-01",
59 | "perFemales": 0.34763636363636363,
60 | "perMales": 0.648,
61 | "perUnknowns": 0.004363636363636364,
62 | },
63 | {
64 | "w_begin": "2022-01-02",
65 | "w_end": "2022-01-08",
66 | "perFemales": 0.0,
67 | "perMales": 0.0,
68 | "perUnknowns": 0.0,
69 | },
70 | ],
71 | }
72 | }
73 | }
74 |
--------------------------------------------------------------------------------
/api/french/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sfu-discourse-lab/GenderGapTracker/5501de31e8598e18ace47982220e91961ca5460a/api/french/tests/__init__.py
--------------------------------------------------------------------------------
/api/french/tests/test_mock_outlet_stats.py:
--------------------------------------------------------------------------------
1 | from fastapi.testclient import TestClient
2 |
3 | from french.main import app
4 | from endpoints.outlet_stats import ID_MAPPING
5 |
6 | PREFIX = "femmesExpertes"
7 |
8 |
9 | def test_get_info_by_date():
10 | with TestClient(app) as client:
11 | begin = "2021-09-29"
12 | end = "2021-09-30"
13 | response = client.get(f"/{PREFIX}/info_by_date?begin={begin}&end={end}")
14 | assert response.status_code == 200
15 | body = response.json()
16 | # Ensure there are no NaN values due to DivisionByZero when no sources exist
17 | assert body.get("perFemales") >= 0
18 | assert body.get("perMales") >= 0
19 | assert body.get("perUnknowns") >= 0
20 | assert isinstance(body.get("sources"), list)
21 | for obj in body.get("sources"):
22 | assert isinstance(obj.get("_id"), str)
23 | assert obj.get("perFemales") >= 0
24 | assert obj.get("perMales") >= 0
25 | assert obj.get("perUnknowns") >= 0
26 |
27 |
28 | def test_get_info_outlet_name_mapping_in_list():
29 | with TestClient(app) as client:
30 | begin = "2021-09-29"
31 | end = "2021-09-30"
32 | response = client.get(f"/{PREFIX}/info_by_date?begin={begin}&end={end}")
33 | outlet_list = [item.get("_id") for item in response.json().get("sources")]
34 | for outlet in ID_MAPPING:
35 | assert ID_MAPPING[outlet] in outlet_list
36 |
37 |
38 | def test_weekly_info_outlet_name_mapping_in_list():
39 | with TestClient(app) as client:
40 | begin = "2021-09-29"
41 | end = "2021-09-30"
42 | response = client.get(f"/{PREFIX}/weekly_info?begin={begin}&end={end}")
43 | outlet_list = [k for k, _ in response.json().get("outlets").items()]
44 | for outlet in ID_MAPPING:
45 | assert ID_MAPPING[outlet] in outlet_list
--------------------------------------------------------------------------------
/api/french/tests/test_outlet_stats.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timedelta
2 | from fastapi.testclient import TestClient
3 |
4 | from french.main import app
5 | from endpoints.outlet_stats import LOWER_BOUND_START_DATE
6 |
7 | PREFIX = "femmesExpertes"
8 |
9 |
10 | def test_read_main():
11 | with TestClient(app) as client:
12 | response = client.get("/")
13 | assert response.status_code == 200
14 |
15 |
16 | def test_get_info_by_date():
17 | with TestClient(app) as client:
18 | # Choose a date range that is in the recent past
19 | begin = datetime.today().date() - timedelta(days=7)
20 | end = datetime.today().date() - timedelta(days=3)
21 | response = client.get(f"/{PREFIX}/info_by_date?begin={begin}&end={end}")
22 | assert response.status_code == 200
23 | body = response.json()
24 | assert body.get("perFemales") >= 0
25 | assert body.get("perMales") >= 0
26 | assert body.get("perUnknowns") >= 0
27 | assert isinstance(body.get("sources"), list)
28 | for obj in body.get("sources"):
29 | assert isinstance(obj.get("_id"), str)
30 | assert obj.get("perFemales") >= 0
31 | assert obj.get("perMales") >= 0
32 | assert obj.get("perUnknowns") >= 0
33 |
34 |
35 | def test_get_info_by_date_invalid_date_range():
36 | with TestClient(app) as client:
37 | lower_bound_date = datetime.fromisoformat(LOWER_BOUND_START_DATE).date()
38 | past = lower_bound_date - timedelta(days=2)
39 | response = client.get(f"/{PREFIX}/info_by_date?begin={past}&end={lower_bound_date}")
40 | assert (
41 | response.status_code == 416
42 | ), "English articles start on 2018-10-01, so start date should be 2018-10-01 or later"
43 | today = datetime.today().date()
44 | future = today + timedelta(days=2)
45 | response = client.get(f"/{PREFIX}/info_by_date?begin={today}&end={future}")
46 | assert response.status_code == 416, "Cannot request stats for dates in the future"
47 |
48 |
49 | def test_get_weekly_info():
50 | with TestClient(app) as client:
51 | # Choose a date range that is in the recent past
52 | begin = datetime.today().date() - timedelta(days=7)
53 | end = datetime.today().date() - timedelta(days=3)
54 | response = client.get(f"/{PREFIX}/weekly_info?begin={begin}&end={end}")
55 | assert response.status_code == 200
56 | body = response.json().get("outlets")
57 | assert len(body) > 0
58 | for _, stats in body.items():
59 | for week_id in stats:
60 | assert isinstance(week_id.get("w_begin"), str)
61 | assert isinstance(week_id.get("w_end"), str)
62 | assert week_id.get("perFemales") >= 0
63 | assert week_id.get("perMales") >= 0
64 | assert week_id.get("perUnknowns") >= 0
65 |
66 |
67 | def test_get_weekly_info_invalid_date_range():
68 | with TestClient(app) as client:
69 | lower_bound_date = datetime.fromisoformat(LOWER_BOUND_START_DATE).date()
70 | past = lower_bound_date - timedelta(days=2)
71 | response = client.get(f"/{PREFIX}/weekly_info?begin={past}&end={lower_bound_date}")
72 | assert (
73 | response.status_code == 416
74 | ), "English articles start on 2018-10-01, so start date should be 2018-10-01 or later"
75 | today = datetime.today().date()
76 | future = today + timedelta(days=2)
77 | response = client.get(f"/{PREFIX}/weekly_info?begin={today}&end={future}")
78 | assert response.status_code == 416, "Cannot request stats for dates in the future"
--------------------------------------------------------------------------------
/api/french/utils/dateutils.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timedelta
2 |
3 |
4 | def is_valid_date_range(start_date: str, end_date: str, lower_bound: str) -> bool:
5 |     tomorrow = datetime.today() + timedelta(days=1)
6 |     if (tomorrow >= convert_date(end_date)) and (
7 | convert_date(start_date) >= convert_date(lower_bound)
8 | ):
9 | return True
10 | else:
11 | return False
12 |
13 |
14 | def convert_date(date_str: str) -> datetime:
15 | return datetime.strptime(date_str, "%Y-%m-%d")
16 |
17 |
18 | def get_week_bound(year: int, week: int, day_of_week: int) -> str:
19 | """
20 | Get begin or end date for a week of the year as a string YYYY-MM-DD
21 | - Start of week is Sunday
22 | - For start of week, set `day_of_week` to 0
23 | - For end of week, set `day_of_week` to 6
24 | """
25 | w_bound = datetime.strptime(f"{year}-{week}-{day_of_week}", "%Y-%U-%w")
26 | w_bound_str = w_bound.strftime("%Y-%m-%d")
27 | return w_bound_str
28 |
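For reference, a small illustrative sketch of how `get_week_bound` resolves a (year, week) pair via the `%Y-%U-%w` directives, where weeks start on Sunday:

```python
# Illustrative only: with %U, week 1 of 2022 starts on the first Sunday of the
# year, so it runs from Sunday 2022-01-02 to Saturday 2022-01-08.
print(get_week_bound(2022, 1, 0))  # 2022-01-02
print(get_week_bound(2022, 1, 6))  # 2022-01-08
```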
--------------------------------------------------------------------------------
/api/french/utils/logger.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import sys
3 | from logging.handlers import TimedRotatingFileHandler
4 | from pathlib import Path
5 |
6 |
7 | def get_logger(filename: str = "g-tracker-fastapi") -> logging.Logger:
8 | filename = f"{filename}.log" if not filename.endswith(".log") else filename
9 | Path("logs").mkdir(parents=True, exist_ok=True)
10 | log = logging.getLogger(filename)
11 | log.setLevel(logging.INFO)
12 | format = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
13 | rotateHandler = TimedRotatingFileHandler(
14 | Path("logs") / filename,
15 | when="midnight",
16 | backupCount=3,
17 | )
18 | rotateHandler.setFormatter(format)
19 | stream = logging.StreamHandler(sys.stdout)
20 | stream.setFormatter(format)
21 | log.addHandler(rotateHandler)
22 | log.addHandler(stream)
23 | return log
24 |
--------------------------------------------------------------------------------
/api/requirements.txt:
--------------------------------------------------------------------------------
1 | requests>=2.28.2
2 | pandas>=1.5.3,<1.6.0
3 | pymongo<4.0.0
4 | pydantic<2.0.0
5 | httpx>=0.23.0, <0.24.0
6 | fastapi>=0.94.0,<0.95.0
7 | gunicorn>=20.1.0,<20.2.0
8 | uvicorn>=0.20.0,<0.21.0
9 | uvloop==0.17.0
10 | pytest>=7.2.1
--------------------------------------------------------------------------------
/nlp/english/config.py:
--------------------------------------------------------------------------------
1 | host = ["mongo0", "mongo1", "mongo2"]
2 | # host = "localhost"
3 | prefix = "." if (host == "localhost") else "/path_to_code/GenderGapTracker/nlp/english"
4 |
5 | config = {
6 | "MONGO_ARGS": {
7 | "host": host,
8 | "port": 27017,
9 | "username": "username",
10 | "password": "password",
11 | "authSource": "admin",
12 | "readPreference": "nearest"
13 | },
14 | "GENDER_RECOGNITION": {
15 | "GENDERIZE_ENABLED": False,
16 | "GENDERAPI_ENABLED": True,
17 | "GENDERAPI_TOKEN": "JSON_AUTH_TOKEN",
18 | "MANUAL_CACHE": "manual",
19 | "GENDERAPI_CACHE": "genderAPICleaned",
20 | "GENDERIZE_CACHE": "genderizeCleaned",
21 | "FIRSTNAME_CACHE": "firstNamesCleaned",
22 | },
23 | "NLP": {
24 | "MAX_BODY_LENGTH": 20000,
25 | "AUTHOR_BLOCKLIST": f"{prefix}/rules/author_blocklist.txt",
26 | "NAME_PATTERNS": f"{prefix}/rules/name_patterns.jsonl",
27 | "QUOTE_VERBS": f"{prefix}/rules/quote_verb_list.txt"
28 | }
29 | }
30 |
--------------------------------------------------------------------------------
/nlp/english/evaluation/README.md:
--------------------------------------------------------------------------------
1 | # English GGT Evaluation
2 | This folder contains methodology and code for evaluating the results of the English pipeline.
3 |
4 | For consistent and reproducible results, make sure any evaluation run locally uses the **same Python environment that is running in production**.
5 |
6 | ## Download Data
7 | The raw text data containing news article text, as well as the human-annotated data, is made available upon request (please contact Maite Taboada at [mtaboada@sfu.ca](mailto:mtaboada@sfu.ca)).
8 | Obtain the directories named `humanAnnotations` and `rawtexts` and place them in their respective paths as per the structure below.
9 |
10 | ```sh
11 | ├── .
12 | | ├── src
13 | | | ├── rawtexts
14 | | | ├── eval
15 | | | | └── humanAnnotations
16 | | | └── evaluate.py
17 | | | └── run_predictions.py
18 | ```
19 |
20 | ## Set Up Environment
21 | This section assumes that the English NLP environment in `../nlp/english` has already been set up, as the evaluation scripts depend on the English NLP modules, specifically the entity gender annotator for NER and coreference resolution. **Just like the English NLP pipeline**, the evaluation code requires Python 3.6 for legacy reasons -- it uses spaCy 2.1.3 and `neuralcoref` for coreference resolution, which, unfortunately, cannot be installed on newer versions of spaCy or Python.
22 |
23 |
24 | If not done already, create a virtual environment and install the dependencies from the `requirements.txt` in the `../nlp/english` directory of this repo.
25 |
26 | ```sh
27 | cd /path_to_code/GenderGapTracker/nlp/english
28 | python3 -m venv GRIM-EN # python3 -> python3.6 for legacy reasons (neuralcoref)
29 | source GRIM-EN/bin/activate
30 | python3 -m pip install -U pip wheel # Upgrade pip and install latest wheel package first
31 | python3 -m pip install -r requirements.txt
32 | ```
33 |
34 | #### `spaCy` language model
35 | **First, make sure that the spaCy version shown in `requirements.txt` is the same as the one running in production**.
36 |
37 | Manually download spaCy's large English language model for the quote extraction pipeline - this is a one-time step for this specific virtual environment.
38 | ```sh
39 | python3 -m spacy download en_core_web_lg
40 | ```
41 |
--------------------------------------------------------------------------------
/nlp/english/evaluation/src/README.md:
--------------------------------------------------------------------------------
1 | # Extracting quotes, named entities and gender
2 |
3 | This directory stores the scripts and methodology used to evaluate the quote extraction, named entity identification and gender annotation performed by the English NLP pipeline.
4 |
5 | ## Prerequisite: Obtain ssh tunnel to the MongoDB database
6 | To run these scripts locally, you first need to set up an ssh tunnel that forwards the database connection to the local machine. This step is essential for completing the evaluation because we host a gender lookup cache on our database, which allows us to retrieve existing names and their associated genders.
7 |
8 | Set up the database tunnel on a Unix shell as follows. In the example below, `vm12` is the primary database on which the gender cache is hosted. We simply forward the connection from port 27017 on the remote database to the same port on our local machine.
9 |
10 | ```sh
11 | ssh vm12 -f -N -L 27017:localhost:27017
12 | ```
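Before kicking off the predictions, it may help to confirm that the forwarded connection is actually live. A minimal sketch using `pymongo` (no specific database assumed) could look like this:

```python
# Quick sanity check that the forwarded MongoDB port is reachable locally.
from pymongo import MongoClient

client = MongoClient("localhost", 27017, serverSelectionTimeoutMS=5000)
client.admin.command("ping")  # raises ServerSelectionTimeoutError if the tunnel is down
print("MongoDB reachable via the ssh tunnel")
```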
13 |
14 | If database connectivity is not possible, the gender service can be rewritten to perform only name-based lookups via external gender APIs. In that case, however, the results might vary from those shown below.
15 | ## 1. Produce the annotations
16 | Before evaluating the annotations made by the system, you'll need to produce those annotations. The gender annotation pipeline can be broken down into two successive steps:
17 | - Quote Extraction
18 | - Entity Gender Annotation
19 |
20 | The entity gender annotation step takes the output of the quote extraction step as input.
21 | In order to evaluate the performance of each part of the pipeline individually, `run_predictions.py` can run each part of the pipeline using the fully accurate input for each step (which is why the target annotations must be passed to the script).
22 | It can also run the whole NLP pipeline.
23 |
24 | ### Optional Arguments
25 | ```sh
26 | python3 run_predictions.py --help
27 | usage: run_predictions.py [-h] [--in_dir IN_DIR] [--out_dir OUT_DIR] [--target_dir TARGET_DIR] [--quote_extraction] [--gender_annotation] [--all] [--spacy_model SPACY_MODEL] [--poolsize POOLSIZE] [--chunksize CHUNKSIZE]
28 |
29 | Evaluation of all the steps of the gender annotation pipeline
30 |
31 | optional arguments:
32 | -h, --help show this help message and exit
33 | --in_dir IN_DIR Path to read input text files from this directory.
34 | --out_dir OUT_DIR Path to dir to output all predictions
35 | --target_dir TARGET_DIR
36 | Path to json target files. Serve as anchor for intermediate steps of the pipeline.
37 | --quote_extraction run quote extractor on text input files
38 |   --gender_annotation   run the whole pipeline on the text input files
39 | --all compute all metrics
40 | --spacy_model SPACY_MODEL
41 | spacy language model
42 | --poolsize POOLSIZE Size of the concurrent process pool for the given task
43 | --chunksize CHUNKSIZE
44 | Number of articles per chunk being processed concurrently
45 | ```
46 |
47 | ### Example run command
48 | For V7.0, this is the command used to generate all the needed outputs.
49 | ```sh
50 | python3 run_predictions.py --in_dir ./rawtexts/ --target_dir ./eval/humanAnnotations/ --out_dir ./eval/systemAnnotations/V7.0/ --all
51 | ```
52 | This dumps out 98 JSON files containing the respective system output into each of these directories: `./eval/systemAnnotations/V7.0/quotes/extracted_quotes` and `./eval/systemAnnotations/V7.0/gender_annotation/entire_pipeline`.
53 |
54 | ## 2. Get the metrics
55 |
56 | The script `evaluate.py` must be run after `run_predictions.py` has been run.
57 | Metrics can only be computed for predictions that have already been generated (for instance, do not specify `--gender_annotation` in `evaluate.py` if this argument was not passed to `run_predictions.py`).
58 |
59 | For more details regarding the way the metrics are computed, see the readme in the `./eval` directory.
60 |
61 |
62 | ### Optional Arguments
63 | ```sh
64 | python3 evaluate.py --help
65 | usage: evaluate.py [-h] [--target_dir TARGET_DIR] [--pred_dir PRED_DIR] [--quote_extraction] [--gender_annotation] [--gender_ratio] [--all]
66 |
67 | evaluation of all the steps of the gender annotation pipeline
68 |
69 | optional arguments:
70 | -h, --help show this help message and exit
71 | --target_dir TARGET_DIR
72 | Path to read input text files from this directory.
73 | --pred_dir PRED_DIR Path to write JSON quotes to this directory.
74 | --quote_extraction compute metrics on the quote extractor output
75 | --gender_annotation compute metrics on the gender annotator on the whole pipeline
76 | --gender_ratio compare overall gender ratios between target and output of whole pipeline
77 | --all compute all metrics
78 | ```
79 |
80 | ### Example run command
81 | For V7.0, this is the command used to display the metrics for all parts of the pipeline:
82 | ```sh
83 | python3 evaluate.py --target_dir eval/humanAnnotations/ --pred_dir eval/systemAnnotations/V7.0/ --all
84 | ```
85 | Our latest (best) evaluation produced the metrics shown below.
86 |
87 | ```
88 | Quote Extraction
89 | ----------------------------------------
90 | Precision (%) Recall (%) F1-Score (%) Accuracy (%)
91 | Quotes: 0.3 84.647 82.719 83.672 -
92 | Speaker match: 0.3 - - - 86.478
93 | Verb match: 0.3 - - - 92.065
94 | Quotes: 0.8 76.971 75.218 76.084 -
95 | Speaker match: 0.8 - - - 87.444
96 | Verb match: 0.8 - - - 93.321
97 | Speakers (indep): 80.672 97.595 88.33 -
98 | Verbs (indep): 83.027 88.11 85.493 -
99 |
100 |
101 | Gender Annotation
102 | ----------------------------------------
103 | Precision (%) Recall (%) F1-Score (%)
104 | peopleFemale 71.939 77.049 74.406
105 | peopleMale 78.361 92.278 84.752
106 | peopleUnknown N/A 0.0 N/A
107 | sourcesFemale 94.643 64.634 76.812
108 | sourcesMale 87.805 76.923 82.005
109 | sourcesUnknown N/A 0.0 N/A
110 |
111 |
112 | Gender Ratio: People
113 | ----------------------------------------
114 | Male Female Unknown
115 | Human annotations 0.738 0.261 0.001
116 | System V7.0 0.758 0.242 0.0
117 |
118 |
119 |
120 | Gender Ratio: Sources
121 | ----------------------------------------
122 | Male Female Unknown
123 | Human annotations 0.738 0.259 0.003
124 | System V7.0 0.785 0.215 0.0
125 | ```
126 |
--------------------------------------------------------------------------------
/nlp/english/evaluation/src/run_predictions.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import sys
4 | import json
5 | from pathlib import Path
6 | from multiprocessing import Pool, cpu_count
7 |
8 | import requests
9 | import spacy
10 | from spacy.pipeline import EntityRuler
11 | import neuralcoref
12 | from tqdm import tqdm
13 |
14 | sys.path.insert(1, os.path.realpath(Path(__file__).resolve().parents[2]))
15 |
16 | from quote_extractor import QuoteExtractor
17 | from entity_gender_annotator import EntityGenderAnnotator
18 | from config import config
19 | import utils
20 | """
21 | Runs several predictions on the annotated data
22 | This script must be run before evaluate.py
23 | """
24 |
25 |
26 | def get_rawtexts_from_file(filename):
27 | with open(filename, "r") as f:
28 | return f.read()
29 |
30 |
31 | def get_data_from_json(filename):
32 | with open(filename, "r") as f:
33 | return json.load(f)
34 |
35 |
36 | def dir_empty(dir_path):
37 | return not next(os.scandir(dir_path), None)
38 |
39 |
40 | def chunker(iterable, chunksize):
41 | """Yield a smaller chunk of a large iterable"""
42 | for i in range(0, len(iterable), chunksize):
43 | yield iterable[i: i + chunksize]
44 |
45 |
46 | def process_chunks(chunk):
47 | db_client = utils.init_client(config["MONGO_ARGS"])
48 | for idx in chunk:
49 | rawtext = get_rawtexts_from_file(Path(IN_DIR) / f"{idx}.txt")
50 | text = utils.preprocess_text(rawtext)
51 | doc = nlp(text)
52 | if QUOTE_EXTRACTION:
53 | pred_extracted_quotes = quote_extractor.extract_quotes(doc)
54 | json.dump(
55 | pred_extracted_quotes,
56 | open(os.path.join(extracted_quotes_dir, idx + ".json"), "w"),
57 | )
58 | print(f"Processed quotes for {idx}")
59 | if GENDER_ANNOTATION:
60 | pred_extracted_quotes = quote_extractor.extract_quotes(doc)
61 | json.dump(
62 | pred_extracted_quotes,
63 | open(os.path.join(extracted_quotes_dir, idx + ".json"), "w"),
64 | )
65 | print(f"Processed quotes for {idx}")
66 | pred_annotation = entity_gender_annotator.run(
67 | db_client, text, [], pred_extracted_quotes, []
68 | )
69 | pred_annotation["lastModified"] = pred_annotation["lastModified"].strftime(
70 | "%m/%d/%Y, %H:%M:%S"
71 | )
72 | json.dump(
73 | pred_annotation,
74 | open(os.path.join(gender_annotation_dir, idx + ".json"), "w"),
75 | )
76 | print(f"Processed entity genders for {idx}")
77 |
78 |
79 | def run_predictions():
80 | """
81 | Make predictions on quote extraction and entity gender annotation for comparison with gold test set
82 | """
83 | num_files = len(common_ids)
84 | num_chunks = len(list(chunker(common_ids, chunksize=CHUNKSIZE)))
85 | print(f"Organized {num_files} files into {num_chunks} chunks for concurrent processing...")
86 | # Process files using a pool of executors
87 | with Pool(processes=POOLSIZE) as pool:
88 | for _ in tqdm(pool.imap(process_chunks, chunker(common_ids, chunksize=CHUNKSIZE)), total=num_chunks):
89 | pass
90 |
91 |
92 | if __name__ == "__main__":
93 | parser = argparse.ArgumentParser(description="Evaluation of all the steps of the gender annotation pipeline")
94 | parser.add_argument("--in_dir", type=str, default="./rawtexts/", help="Path to read input text files from this directory.")
95 | parser.add_argument("--out_dir", type=str, default="./eval/systemAnnotations/V7.0/", help="Path to dir to output all predictions")
96 | parser.add_argument("--target_dir", type=str, default="./eval/humanAnnotations/", help="Path to json target files. Serve as anchor for intermediate steps of the pipeline.")
97 | parser.add_argument('--quote_extraction', action='store_true', help="run quote extractor on text input files")
98 |     parser.add_argument('--gender_annotation', action='store_true', help="run the whole pipeline on the text input files")
99 | parser.add_argument('--all', action='store_true', help="compute all metrics")
100 | parser.add_argument('--spacy_model', type=str, default="en_core_web_lg", help="spacy language model")
101 | parser.add_argument("--poolsize", type=int, default=cpu_count(), help="Size of the concurrent process pool for the given task")
102 | parser.add_argument("--chunksize", type=int, default=5, help="Number of articles per chunk being processed concurrently")
103 | args = vars(parser.parse_args())
104 | IN_DIR = args["in_dir"]
105 | TARGET_DIR = args["target_dir"]
106 | PRED_DIR = args["out_dir"]
107 | QUOTE_EXTRACTION = args["quote_extraction"]
108 | GENDER_ANNOTATION = args["gender_annotation"]
109 | POOLSIZE = args["poolsize"]
110 | CHUNKSIZE = args["chunksize"]
111 | if args["all"]:
112 |         QUOTE_EXTRACTION = False  # Quote extraction already runs (and saves its output) inside the gender annotation branch
113 | GENDER_ANNOTATION = True
114 |
115 | config["NLP"]["QUOTE_VERBS"] = "../../rules/quote_verb_list.txt"
116 | config["NLP"]["AUTHOR_BLOCKLIST"] = "../../rules/author_blocklist.txt"
117 | config["NLP"]["NAME_PATTERNS"] = "../../rules/name_patterns.jsonl"
118 | config["MONGO_ARGS"]["host"] = "localhost"
119 |     # Load spaCy language model and attach custom entity ruler and neuralcoref pipes downstream
120 | print(f"Loading spaCy language model: {args['spacy_model']}...")
121 | nlp = spacy.load(args["spacy_model"])
122 | # Add custom named entity rules for non-standard person names that spaCy doesn't automatically identify
123 | ruler = EntityRuler(nlp, overwrite_ents=True).from_disk(
124 | config["NLP"]["NAME_PATTERNS"]
125 | )
126 | nlp.add_pipe(ruler)
127 | coref = neuralcoref.NeuralCoref(nlp.vocab, max_dist=200)
128 | nlp.add_pipe(coref, name="neuralcoref")
129 | print("Finished loading")
130 |
131 | args["spacy_lang"] = nlp
132 | session = requests.Session()
133 | args["session"] = session
134 | config = {**args, **config}
135 |
136 | quote_extractor = QuoteExtractor(config)
137 | entity_gender_annotator = EntityGenderAnnotator(config)
138 |
139 | txt_files = [f for f in Path(IN_DIR).glob("*.txt")]
140 | target_files = [f for f in Path(TARGET_DIR).glob("*.json")]
141 | common_ids = list(set([p.stem for p in txt_files]) & set([p.stem for p in target_files]))
142 |
143 | extracted_quotes_dir = os.path.join(PRED_DIR, "quotes", "extracted_quotes")
144 | os.makedirs(extracted_quotes_dir, exist_ok=True)
145 | gender_annotation_dir = os.path.join(
146 | PRED_DIR, "gender_annotation", "entire_pipeline"
147 | )
148 | os.makedirs(gender_annotation_dir, exist_ok=True)
149 | run_predictions()
150 |
--------------------------------------------------------------------------------
/nlp/english/img/concurrent.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sfu-discourse-lab/GenderGapTracker/5501de31e8598e18ace47982220e91961ca5460a/nlp/english/img/concurrent.png
--------------------------------------------------------------------------------
/nlp/english/merge_collections.py:
--------------------------------------------------------------------------------
1 | """
2 | This script merges the results from the newly created collection from the entity
3 | gender annotation script (when the user specifies the `writecol` argument) with
4 | the original collection.
5 |
6 | Only the fields specified in this file are merged with (i.e. overwrite) the original
7 | `media` collection - the remaining fields in the original collection are left untouched.
8 | """
9 | import argparse
10 | from multiprocessing import Pool, cpu_count
11 | from config import config
12 | import utils
13 |
14 |
15 | def update_field(existing_collection, new_collection, idx):
16 | """Overwrite existing collection's fields with new collection's fields (except IDs)"""
17 | new_id = idx['_id']
18 | existing_id = idx['currentId']
19 | doc = new_collection.find_one({'_id': new_id}, no_cursor_timeout=True)
20 |     existing_collection.update_one(
21 |         {'_id': existing_id},
22 |         {'$set': filter_dict(doc)}
23 |     )
24 |
25 |
26 | def filter_dict(dict_obj):
27 | """Return a dictionary that has the same keys/values as the original dictionary,
28 | except for a few select keys that are to be excluded.
29 | """
30 | ignore_keys = ['_id', 'currentId']
31 | new_dict = {key: dict_obj[key] for key in dict_obj if key not in ignore_keys}
32 | return new_dict
33 |
34 |
35 | def chunker(iterable, chunksize):
36 | """Yield a smaller chunk of a large iterable"""
37 | for i in range(0, len(iterable), chunksize):
38 | yield iterable[i:i + chunksize]
39 |
40 |
41 | def parse_chunks(chunk):
42 | """Pass through a chunk of document IDs and update fields"""
43 | db_client = utils.init_client(MONGO_ARGS)
44 | existing_collection = db_client[DB_NAME][EXISTING_COL]
45 | new_collection = db_client[DB_NAME][NEW_COL]
46 | for idx in chunk:
47 | update_field(existing_collection, new_collection, idx)
48 |
49 |
50 | def run_pool(poolsize, chunksize):
51 | """Concurrently run independent operations on multiple cores"""
52 | db_client = utils.init_client(MONGO_ARGS)
53 | # Get list of new and old IDs from new collection
54 | new_col = db_client[DB_NAME][NEW_COL]
55 | new_old_ids = list(new_col.find({}, {'_id': 1, 'currentId': 1}))
56 | print('Obtained ID list of length {}.'.format(len(new_old_ids)))
57 | # Process quotes using a pool of executors
58 | pool = Pool(processes=poolsize)
59 | pool.map(parse_chunks, chunker(new_old_ids, chunksize=chunksize))
60 | pool.close()
61 |
62 |
63 | if __name__ == '__main__':
64 | parser = argparse.ArgumentParser()
65 | parser.add_argument('--db', type=str, default='mediaTracker', help="Database name")
66 | parser.add_argument('--oldcol', type=str, default='media', help="Existing collection name")
67 | parser.add_argument('--newcol', type=str, default='entitiesAnnotated', help="New collection name")
68 | parser.add_argument("--poolsize", type=int, default=cpu_count() + 1, help="Size of the concurrent process pool for the given task")
69 | parser.add_argument("--chunksize", type=int, default=100, help="Number of articles IDs per chunk being processed concurrently")
70 | args = vars(parser.parse_args())
71 |
72 | # From config
73 | MONGO_ARGS = config['MONGO_ARGS']
74 | # Parse arguments
75 | DB_NAME = args['db']
76 | EXISTING_COL = args['oldcol']
77 | NEW_COL = args['newcol']
78 | poolsize = args['poolsize']
79 | chunksize = args['chunksize']
80 |
81 | run_pool(poolsize, chunksize)
82 | print("Finished merging collections!")
83 |
--------------------------------------------------------------------------------
/nlp/english/requirements.txt:
--------------------------------------------------------------------------------
1 | requests>=2.27.1
2 | pandas>=1.1.5
3 | spacy==2.1.3
4 | neuralcoref==4.0
5 | pymongo>=3.10.0,<4.0.0
6 | dash==2.15.0
7 | dash_bootstrap_components==1.2.1
8 | dash_auth==1.4.1
9 | statsmodels>=0.12.2
--------------------------------------------------------------------------------
/nlp/english/rules/author_blocklist.txt:
--------------------------------------------------------------------------------
1 | Cbc
2 | Ctv
3 | News
4 | Associated
5 | Afp
6 | Radio
7 | Reuters
8 | Bloomberg
9 | Canada
10 | Canadian
11 | Média
12 | Follow
13 | Twitter
14 | Agence
15 | Http
16 | Https
17 | National
18 | Online
19 | Journalist
20 | Staff
21 | Reporter
22 | Report
23 | Reporting
24 | Washington
25 | Starmetro
26 | Thestar
27 | Vancouver
28 | Times
29 | Bureau
30 | Tribune
31 | Sports
32 | Presse
33 | Canadienne
34 | Special
35 | Edmonton
36 | Calgary
37 | Halifax
38 | Vancouver
39 | Ottawa
40 | Breaking
41 | Opens
42 | Hours
43 | Newsletter
44 | Columnist
45 | Digital
46 | Www.Facebook.Com
47 | Facebook
48 | Photo
49 | Photography
50 | Video
51 | Share
52 | Getty
53 | Images
54 | Pages
55 | File
56 | Studio
57 | TV
58 | Tva
59 | cbc
60 | ctv
61 | Business
62 | University
63 | Now
64 | Movies
65 | Games
66 | Pictures
67 | Classics
68 | Abroad
69 | Politics
70 | Covered
71 | Mail
72 | Gmail
73 | Transportation
74 | Critic
75 | Story
76 | Le Droit
77 | Le Soleil
78 | Journaliste
79 | Postmedia
80 | Day Ago
81 | Updated
82 | Remember
83 | Brother
84 | Sister
85 | Mother
86 | Father
87 | Ont.
88 | Pm
89 | Am
90 | Ap
91 | Edt
92 | Edtlast
93 |
--------------------------------------------------------------------------------
/nlp/english/rules/quote_verb_list.txt:
--------------------------------------------------------------------------------
1 | accept
2 | accepted
3 | acclaim
4 | acclaimed
5 | acclaiming
6 | acclaims
7 | acknowledge
8 | acknowledged
9 | acknowledges
10 | acknowledging
11 | add
12 | added
13 | adding
14 | adds
15 | admit
16 | admits
17 | admitted
18 | admitting
19 | advise
20 | advised
21 | advises
22 | advising
23 | announce
24 | announced
25 | announces
26 | announcing
27 | argue
28 | argued
29 | argues
30 | arguing
31 | assert
32 | asserted
33 | asserting
34 | asserts
35 | assure
36 | assured
37 | assures
38 | assuring
39 | claim
40 | claimed
41 | claiming
42 | claims
43 | clarified
44 | clarifies
45 | clarify
46 | clarifying
47 | comment
48 | commented
49 | commenting
50 | comments
51 | conclude
52 | concluded
53 | concludes
54 | concluding
55 | confirm
56 | confirmed
57 | confirming
58 | confirms
59 | continue
60 | continued
61 | continues
62 | continuing
63 | convince
64 | convinced
65 | convinces
66 | convincing
67 | criticize
68 | criticized
69 | criticizes
70 | criticizing
71 | declaim
72 | declaimed
73 | declaiming
74 | declaims
75 | declare
76 | declared
77 | declares
78 | declaring
79 | decried
80 | decries
81 | decry
82 | decrying
83 | demonstrate
84 | demonstrated
85 | demonstrates
86 | demonstrating
87 | denounce
88 | denounced
89 | denounces
90 | denouncing
91 | describe
92 | described
93 | describes
94 | describing
95 | disclaim
96 | disclaimed
97 | disclaiming
98 | disclaims
99 | dispute
100 | disputed
101 | disputes
102 | disputing
103 | ensure
104 | ensured
105 | ensures
106 | ensuring
107 | estimated
108 | estimates
109 | exclaim
110 | exclaimed
111 | exclaiming
112 | exclaims
113 | explain
114 | explained
115 | explaining
116 | explains
117 | finding
118 | finds
119 | highlight
120 | highlighted
121 | highlighting
122 | highlights
123 | illustrate
124 | illustrated
125 | illustrates
126 | illustrating
127 | indicate
128 | indicated
129 | indicates
130 | indicating
131 | inform
132 | informed
133 | informing
134 | informs
135 | insist
136 | insisted
137 | insisting
138 | insists
139 | mention
140 | mentioned
141 | mentioning
142 | mentions
143 | note
144 | noted
145 | notes
146 | notified
147 | notifies
148 | notify
149 | notifying
150 | noting
151 | persist
152 | persisted
153 | persisting
154 | persists
155 | point
156 | pointed
157 | pointing
158 | points
159 | preach
160 | preached
161 | preaches
162 | preaching
163 | predict
164 | predicted
165 | predicting
166 | predicts
167 | present
168 | presenting
169 | presents
170 | proclaim
171 | proclaimed
172 | proclaiming
173 | proclaims
174 | rave
175 | raved
176 | raves
177 | raving
178 | reassert
179 | reasserted
180 | reasserting
181 | reasserts
182 | reassure
183 | reassured
184 | reassures
185 | reassuring
186 | reckon
187 | reckoned
188 | reckoning
189 | reckons
190 | reconfirm
191 | reconfirmed
192 | reconfirming
193 | reconfirms
194 | release
195 | released
196 | releases
197 | releasing
198 | remind
199 | reminded
200 | reminding
201 | reminds
202 | replied
203 | replies
204 | reply
205 | replying
206 | report
207 | reported
208 | reporting
209 | reports
210 | respond
211 | responded
212 | responding
213 | responds
214 | restate
215 | restated
216 | restates
217 | restating
218 | retell
219 | retelling
220 | retells
221 | retold
222 | said
223 | say
224 | saying
225 | says
226 | state
227 | stated
228 | states
229 | stating
230 | suggest
231 | suggested
232 | suggesting
233 | suggests
234 | tell
235 | telling
236 | tells
237 | told
238 | testified
239 | testifies
240 | testify
241 | testifying
242 | think
243 | thinking
244 | thinks
245 | thought
246 | tweet
247 | tweeted
248 | tweeting
249 | tweets
250 | warn
251 | warned
252 | warning
253 | warns
254 | write
255 | writes
256 | writing
257 | wrote
258 |
--------------------------------------------------------------------------------
/nlp/english/topic_model/.gitignore:
--------------------------------------------------------------------------------
1 | *.json
2 | spark-topic-modeling
3 | test.csv
4 | sample.json
5 |
--------------------------------------------------------------------------------
/nlp/english/topic_model/config.py:
--------------------------------------------------------------------------------
1 | config = {
2 | 'MONGO_ARGS': {
3 | 'host': ['mongo0', 'mongo1', 'mongo2'],
4 | 'port': 27017,
5 | 'username': 'username',
6 | 'password': 'password',
7 | 'authSource': 'admin',
8 | 'readPreference': 'primaryPreferred'
9 | },
10 | 'DB': {
11 | 'READ_DB': 'mediaTracker',
12 | 'READ_COL': 'media',
13 | 'WRITE_DB': 'topicModel',
14 | 'WRITE_COL': 'topicResults'
15 | },
16 | 'MODEL': {
17 | 'OUTLETS': [
18 | 'National Post', 'The Globe And Mail', 'The Star',
19 | 'Global News', 'CTV News', 'CBC News'
20 | ],
21 | 'STOPWORDS': 'stopwords/stopwords.txt',
22 | 'LEMMAS': 'spacyLemmas/spacy_english_lemmas.txt'
23 | }
24 | }
25 |
--------------------------------------------------------------------------------
/nlp/english/topic_model/corpus_analysis/config.py:
--------------------------------------------------------------------------------
1 | config = {
2 | 'MONGO_ARGS': {
3 | 'host': ['mongo0', 'mongo1', 'mongo2'],
4 | 'port': 27017,
5 | 'username': 'username',
6 | 'password': 'password',
7 | 'authSource': 'admin',
8 | 'readPreference': 'primaryPreferred'
9 | },
10 | 'DB': {
11 | 'MEDIA_DB': 'mediaTracker',
12 | 'MEDIA_COL': 'media',
13 | 'TOPIC_DB': 'topicModel',
14 | 'TOPIC_COL': 'topicResults'
15 | },
16 | }
17 |
--------------------------------------------------------------------------------
/nlp/english/topic_model/corpus_analysis/download_articles.py:
--------------------------------------------------------------------------------
1 | """
2 | Download specific articles that have high values of a particular topic's weights
3 | (t1, t2, etc.). Based on a user's input topic, we rank the article IDs in descending
4 | order of that topic's weights.
5 |
6 | The top 200 (or any other desired number of) article
7 | bodies are downloaded and stored as individual text files, following which we can perform
8 | keyness or other corpus-based linguistic analyses.
9 | """
10 | import argparse
11 | import os
12 | from pymongo import MongoClient
13 | from bson import ObjectId
14 | import pandas as pd
15 | from config import config
16 |
17 |
18 | def make_dirs(dirpath):
19 | """ Make directories for output if they don't exist. """
20 | if not os.path.exists(dirpath):
21 | os.makedirs(dirpath)
22 |
23 |
24 | def init_client(MONGO_ARGS):
25 | """ Initialize a MongoDB client. """
26 | _db_client = MongoClient(**MONGO_ARGS)
27 | return _db_client
28 |
29 |
30 | def download_articles(root_dir, collection, doc_id_list, case='female'):
31 | """ Download a document object and export its body content to a file.
32 | """
33 | doc_obj = [ObjectId(doc_id.strip()) for doc_id in doc_id_list]
34 | for idx in doc_obj:
35 | doc = collection.find_one(
36 | {'_id': idx},
37 | {'_id': 1, 'body': 1},
38 | no_cursor_timeout=True
39 | )
40 | make_dirs(f"{root_dir}/{TOPIC}/{case}")
41 | with open(f"{root_dir}/{TOPIC}/{case}/{str(idx)}.txt", 'w') as f:
42 | f.write(doc['body'])
43 |
44 |
45 | def read_data(filepath):
46 | """ Read topic-split data from CSV """
47 | df = pd.read_csv(filepath, header=0, parse_dates=['publishedAt'],
48 | index_col='_id')
49 | print(f"Obtained {df.shape[0]} articles in total")
50 | return df
51 |
52 |
53 | def get_gender_splitDF(df):
54 | """ Split the given Dataframe into two smaller Dataframes that each
55 | represent articles that are female or male source-dominated.
56 | """
57 | female = df.loc[df['sourcesFemaleCount'] > df['sourcesMaleCount']]
58 | male = df.loc[df['sourcesFemaleCount'] < df['sourcesMaleCount']]
59 | print(f"Found {female.shape[0]} articles dominated by female sources.")
60 | print(f"Found {male.shape[0]} articles dominated by male sources.")
61 | return female, male
62 |
63 |
64 | def top100_per_gender_and_topic(female, male, topic):
65 | """ Collect top 100 articles sorted by topic weight for a particular
66 | topic (The topic names are t1-t15 by default in the CSV).
67 | """
68 | t_female = female.sort_values(by=topic, ascending=False).iloc[:LIMIT, :]
69 | t_male = male.sort_values(by=topic, ascending=False).iloc[:LIMIT, :]
70 | return t_female, t_male
71 |
72 |
73 | def get_ids(filepath, topic):
74 | """ Obtain article ID lists for female/male source-dominated articles. """
75 | df = read_data(filepath)
76 | female, male = get_gender_splitDF(df)
77 | t_female, t_male = top100_per_gender_and_topic(female, male, topic)
78 | female_ids, male_ids = list(t_female.index), list(t_male.index)
79 | return female_ids, male_ids
80 |
81 |
82 | def main(filepath, topic='t1'):
83 | """ Download articles using main pipeline """
84 | female_ids, male_ids = get_ids(filepath, topic)
85 | client = init_client(MONGO_ARGS)
86 | collection = client[DB_NAME][COL_NAME]
87 | # Make root directory before downloading files
88 | root_dir = FILENAME.split('/')[-1].replace(".csv", "")
89 | download_articles(root_dir, collection, female_ids, case='female')
90 | download_articles(root_dir, collection, male_ids, case='male')
91 |
92 |
93 | if __name__ == "__main__":
94 | parser = argparse.ArgumentParser()
95 | parser.add_argument('--db', '-d', type=str, default='mediaTracker', help="Database name")
96 | parser.add_argument('--col', '-c', type=str, default='media', help="Existing collection name")
97 | parser.add_argument('--topic', '-t', type=str, default='t1', help="Topic (t1, t2, etc.) to extract articles for")
98 | parser.add_argument('--file', '-f', type=str, required=True, help="CSV file containing topic splits")
99 | parser.add_argument('--limit', '-l', type=int, default=200, help="Max. number of articles to consider")
100 | args = parser.parse_args()
101 |
102 | # Config settings
103 | MONGO_ARGS = config['MONGO_ARGS']
104 | # Parse args
105 | DB_NAME = args.db
106 | COL_NAME = args.col
107 | TOPIC = args.topic
108 | FILENAME = args.file
109 | LIMIT = args.limit
110 |
111 | main(FILENAME, topic=TOPIC)
112 |
113 |
114 |
--------------------------------------------------------------------------------
/nlp/english/topic_model/corpus_analysis/requirements.txt:
--------------------------------------------------------------------------------
1 | pandas==1.0.3
2 | matplotlib==3.1.0
3 | seaborn==0.10.0
4 | pyspark==2.4.0
5 | wordcloud==1.6.0
6 | pymongo==3.8.0
7 | tqdm==4.32.1
8 | spacy==2.3.2
9 | corpus_toolkit==0.29
10 |
--------------------------------------------------------------------------------
/nlp/english/topic_model/img/example_divergent_heatmap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sfu-discourse-lab/GenderGapTracker/5501de31e8598e18ace47982220e91961ca5460a/nlp/english/topic_model/img/example_divergent_heatmap.png
--------------------------------------------------------------------------------
/nlp/english/topic_model/img/example_heatmap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sfu-discourse-lab/GenderGapTracker/5501de31e8598e18ace47982220e91961ca5460a/nlp/english/topic_model/img/example_heatmap.png
--------------------------------------------------------------------------------
/nlp/english/topic_model/img/example_wordcloud.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sfu-discourse-lab/GenderGapTracker/5501de31e8598e18ace47982220e91961ca5460a/nlp/english/topic_model/img/example_wordcloud.png
--------------------------------------------------------------------------------
/nlp/english/topic_model/preproc.py:
--------------------------------------------------------------------------------
1 | """
2 | Test script to directly pull data from Mongo database and convert to Spark DataFrame.
3 | (Not used in the pipeline) - this script is purely for testing the DB connection with Spark.
4 | """
5 | import argparse
6 | import datetime
7 | from pyspark.sql import SparkSession
8 | from pymongo import MongoClient
9 | from config import config
10 |
11 |
12 | def convert_date(date_str):
13 | return datetime.datetime.strptime(date_str, '%Y-%m-%d')
14 |
15 |
16 | if __name__ == "__main__":
17 | parser = argparse.ArgumentParser()
18 | parser.add_argument("--partitions", type=int, default=100, help="Number of shuffle partitions in PySpark")
19 | parser.add_argument("--begin_date", type=str, default='2020-04-28', help="Begin date format YYYY-MM-DD")
20 | parser.add_argument("--end_date", type=str, default='2020-04-30', help="End date format YYYY-MM-DD")
21 |
22 | args = parser.parse_args()
23 |
24 | begin_date = convert_date(args.begin_date)
25 | end_date = convert_date(args.end_date)
26 |
27 | # Read config
28 | MONGO_ARGS = config['MONGO_ARGS']
29 |     DB_NAME = config['DB']['READ_DB']
30 |     COLLECTION_NAME = config['DB']['READ_COL']
31 | OUTLETS = config['MODEL']['OUTLETS']
32 |
33 | with MongoClient(**MONGO_ARGS) as connection:
34 | collection = connection[DB_NAME][COLLECTION_NAME]
35 | articles = collection.aggregate([
36 | {"$match": {
37 | "outlet": {"$in": OUTLETS},
38 | "publishedAt": {"$gte": begin_date, "$lte": end_date}
39 | }},
40 | {"$project": {
41 | '_id': {'$toString': '$_id'}, 'url': 1, 'publishedAt': 1,
42 | 'outlet': 1, 'title': 1, 'body': 1,
43 | 'peopleFemaleCount': 1, 'peopleMaleCount': 1,
44 | 'sourcesFemaleCount': 1, 'sourcesMaleCount': 1}}
45 | ])
46 |
47 | spark = SparkSession.builder.appName("Cleanup for GGT MongoDB Data Dump") \
48 | .config("spark.shuffle.io.maxRetries", 20) \
49 | .config("spark.shuffle.io.retryWait", "20s") \
50 | .config("spark.buffer.pageSize", "2m") \
51 | .config("spark.sql.shuffle.partitions", args.partitions) \
52 | .getOrCreate()
53 |
54 | # Specify timezone as UTC to match with raw data on MongoDB!
55 | spark.conf.set("spark.sql.session.timeZone", "UTC")
56 | df_articles = spark.createDataFrame(list(articles))
57 | num_articles = df_articles.count()
58 | dtypes = df_articles.dtypes
59 |
60 | print("\n\n***\nObtained {} articles after filtering".format(num_articles))
61 | print("\n\n***\nThe below columns are output to new Parquet files:\n{}".format(dtypes))
62 | print("\n\n***\nEarliest timestamp article in data: {}\nLatest timestamp article in data: {}\n".format(begin_date, end_date))
63 |
64 | df_articles.show()
65 | spark.stop()
66 |
--------------------------------------------------------------------------------
/nlp/english/topic_model/preproc_cc.py:
--------------------------------------------------------------------------------
1 | """
2 | Prepare Data for Topic Modelling:
3 |
4 | Since the raw dump from MongoDB has data in an undesirable format,
5 | we clean it up and filter the relevant subset for our needs in topic modelling.
6 | """
7 | import argparse
8 | import os
9 | from pyspark.sql import SparkSession
10 | import pyspark.sql.functions as f
11 | import pyspark.sql.types as t
12 | from config import config
13 |
14 | # root_dir = "./"
15 | root_dir = "/home/pprao/projects/ctb-popowich/ggt"
16 | dataloc = os.path.join(root_dir, '21-04-2020-ggt.parquet')
17 |
18 |
19 | @f.udf(t.StringType())
20 | def get_ids(_id):
21 | return _id[0]
22 |
23 |
24 | def filter_raw_data(df):
25 | """Extract only relevant columns of data we require for topic modelling.
26 | NOTE: The unix timestamp from MongoDB is divided by 1000 here because of the
27 | extra 3 zeros at the end (we don't need milliseconds).
28 | """
29 | dataDF = df.select('_id', 'publishedAt', 'outlet', 'url', 'title', 'body', 'peopleFemaleCount',
30 | 'peopleMaleCount', 'sourcesFemaleCount', 'sourcesMaleCount') \
31 | .withColumn('id', get_ids(f.col('_id'))) \
32 | .withColumn('unix_timestamp', f.get_json_object(df.publishedAt, "$.$date") / 1000) \
33 | .withColumn('string_timestamp', f.from_unixtime(f.col('unix_timestamp'))) \
34 | .withColumn('timestamp', f.col('string_timestamp').cast(t.TimestampType())) \
35 | .drop('_id', 'publishedAt', 'unix_timestamp', 'string_timestamp')
36 | return dataDF
37 |
38 |
39 | def get_english_by_timestamp(df):
40 | """Extract English articles only within the given date range"""
41 | englishArticleDF = df.where(f.col('outlet').isin(OUTLETS))
42 | # Use timestamps for the first and last minute of the start/end days respectively
43 | start = "{} 00:00:00".format(begin_date)
44 | end = "{} 23:59:59".format(end_date)
45 | filteredDF = englishArticleDF.filter(f.col("timestamp") > f.unix_timestamp(
46 | f.lit(start)).cast('timestamp')) \
47 | .filter(f.col("timestamp") < f.unix_timestamp(
48 | f.lit(end)).cast('timestamp'))
49 | return filteredDF
50 |
51 |
52 | def get_articles_with_sources(df):
53 | """Ignore articles for which the `sourcesFemaleCount` and `sourcesMaleCount` fields are
54 | null (this means that the full NLP pipeline wasn't run on these articles).
55 | Zero sources in the article are possible, and these are not filtered out.
56 | """
57 | sourcesDF = df.filter('sourcesFemaleCount is not NULL and sourcesMaleCount is not NULL')
58 | return sourcesDF
59 |
60 |
61 | def get_date_range(df, colname='timestamp'):
62 | """Sanity check to verify that the minimum and maximum dates make sense
63 | (after running the filtering and cleanup steps).
64 | """
65 |     min_date = f.date_format(f.min(colname), 'yyyy-MM-dd HH:mm:ss')
66 |     max_date = f.date_format(f.max(colname), 'yyyy-MM-dd HH:mm:ss')
67 | min_date, max_date = df.select(min_date, max_date).first()
68 | print("Earliest timestamp in data: {}".format(min_date))
69 | print("Latest timestamp in data: {}".format(max_date))
70 | return min_date, max_date
71 |
72 |
73 | def write_output_parquet(df, output_dir):
74 | df.write.mode('overwrite').parquet(output_dir)
75 |
76 |
77 | def make_dir(dirpath):
78 | if not os.path.exists(dirpath):
79 | os.makedirs(dirpath)
80 |
81 |
82 | def run_cleanup():
83 | df = spark.read.parquet(dataloc)
84 | dataDF = filter_raw_data(df)
85 | filteredDF = get_english_by_timestamp(dataDF)
86 | sourcesDF = get_articles_with_sources(filteredDF)
87 | sourcesReordered = sourcesDF.select('id', 'timestamp', 'outlet', 'url', 'title', 'body',
88 | 'peopleFemaleCount', 'peopleMaleCount',
89 | 'sourcesFemaleCount', 'sourcesMaleCount',
90 | )
91 | return sourcesReordered
92 |
93 |
94 | if __name__ == "__main__":
95 | parser = argparse.ArgumentParser()
96 | parser.add_argument("--partitions", type=int, default=200, help="Number of shuffle partitions in PySpark")
97 | parser.add_argument("--begin_date", type=str, default='2018-10-01', help="Begin date format YYYY-MM-DD")
98 | parser.add_argument("--end_date", type=str, default='2020-04-20', help="End date format YYYY-MM-DD")
99 | args = parser.parse_args()
100 |
101 |     # Parse args
102 | begin_date = args.begin_date
103 | end_date = args.end_date
104 |
105 | # Read config
106 | OUTLETS = config['MODEL']['OUTLETS']
107 |
108 | spark = SparkSession.builder.appName("Cleanup for GGT MongoDB Data Dump") \
109 | .config("spark.shuffle.io.maxRetries", 20) \
110 | .config("spark.shuffle.io.retryWait", "20s") \
111 | .config("spark.buffer.pageSize", "2m") \
112 | .config("spark.sql.shuffle.partitions", args.partitions) \
113 | .getOrCreate()
114 | # Specify timezone as UTC to match with raw data on MongoDB!
115 | spark.conf.set("spark.sql.session.timeZone", "UTC")
116 | # Create output directory
117 | output_dir = "{}/ggt_english_{}_{}".format(root_dir, begin_date, end_date)
118 | make_dir(output_dir)
119 |
120 | existSourcesDF = run_cleanup()
121 | num_articles = existSourcesDF.count()
122 | dtypes = existSourcesDF.dtypes
123 | # Show minimum and maximum timestamps in the filtered data
124 | min_date, max_date = get_date_range(existSourcesDF, 'timestamp')
125 | # Write data to output directory
126 | write_output_parquet(existSourcesDF, output_dir)
127 |
128 | print("\n\n***\nObtained {} articles after filtering".format(num_articles))
129 | print("\n\n***\nThe below columns are output to new Parquet files:\n{}".format(dtypes))
130 | print("\n\n***\nEarliest timestamp article in data: {}\nLatest timestamp article in data: {}\n".format(min_date, max_date))
131 |
132 | spark.stop()
133 |
--------------------------------------------------------------------------------
/nlp/english/topic_model/requirements.txt:
--------------------------------------------------------------------------------
1 | matplotlib==3.3.4
2 | wordcloud==1.8.1
3 | pandas==1.1.5
4 | py4j==0.10.7
5 | pymongo==3.11.3
6 | pyspark==2.4.5
7 | scipy==1.10.0
8 | seaborn==0.11.1
9 | tqdm==4.59.0
10 |
--------------------------------------------------------------------------------
/nlp/english/topic_model/spacyLemmas/README.md:
--------------------------------------------------------------------------------
1 | # Lemmatization workflow
2 | We lemmatize all terms in each document prior to topic modelling in Spark. In our initial experiments, we observed that the lemmatizer provided by Spark NLP (the third-party library we use for lemmatization in Spark) was not of the requisite quality for our purposes. As a result, we chose to use spaCy's [lemma lookup data available on GitHub](https://github.com/explosion/spacy-lookups-data/tree/master/spacy_lookups_data/data).
3 |
4 | ## Formatting
5 | spaCy's lookup data is available as JSON, specified as `{word: lemma}` pairs, where each key is an inflected word and the value is its lemma (the conversion script below inverts this mapping, grouping each lemma with all of the words that share it). In addition, many of the entries contain extraneous symbols and punctuation, which we know are cleaned in an upstream step of our topic modelling pipeline. As a result, we don't need to include such entries, because they will never be looked up in our topic model pipeline.
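
For illustration, a minimal sketch (with made-up entries, not the actual download) of the inversion from the raw `{word: lemma}` JSON to the grouped form:

```python
from collections import defaultdict

# Hypothetical excerpt of the raw word -> lemma JSON, for illustration only
raw = {"colonies": "colony", "colonised": "colonise", "colonising": "colonise"}

# Group all words that share a lemma, as done in convert_spacy_lemmas.py
grouped = defaultdict(list)
for word, lemma in raw.items():
    grouped[lemma].append(word)

print(dict(grouped))  # {'colony': ['colonies'], 'colonise': ['colonised', 'colonising']}
```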
6 |
7 | Spark NLP expects lemmas to be in the following format -- note that it uses space-separated words in a flat file format (no JSON).
8 |
9 | ```
10 | colony -> colony colonies
11 | colonisation -> colonisation colonisations
12 | colonise -> colonise colonised colonises colonising
13 | coloniser -> coloniser colonisers
14 | colonist -> colonist colonists
15 | colonization -> colonization colonizations
16 | colonize -> colonize colonized colonizes colonizing
17 | colonizer -> colonizer colonizers
18 | ```
19 |
20 | When we load the lemma lookup table shown above into Spark, we specify the separator symbol (`->`), which indicates to Spark that the lemma is on the left of the separator and the words that share that lemma are on the right.
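
A minimal sketch of how such a file could be attached to a Spark NLP `Lemmatizer` is shown below -- the file path and column names are assumptions for illustration, and it assumes `sparknlp` is installed and that a Tokenizer elsewhere in the pipeline produces the `token` column.

```python
import sparknlp
from sparknlp.annotator import Lemmatizer

# Start a Spark session with Spark NLP on the classpath (sketch only)
spark = sparknlp.start()

lemmatizer = (
    Lemmatizer()
    .setInputCols(["token"])
    .setOutputCol("lemma")
    # key_delimiter separates the lemma (left) from its word forms (right);
    # value_delimiter separates the individual word forms from one another
    .setDictionary(
        "spacyLemmas/spacy_english_lemmas.txt",
        key_delimiter="->",
        value_delimiter=" ",
    )
)
```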
21 |
22 | ## Preparing the lemma lookup file
23 | In our experiments over many months of real-world data, we observed that certain words like "data", which occur very regularly in the news, were reduced to "datum" during lemmatization. This is not ideal when interpreting topic keywords to label the topics. As a result, we define a "ban list" of lemmas in the file `convert_spacy_lemmas.py`, currently consisting of just the lemma "datum". By specifying this list, we exclude these entries from the generated lookup file for Spark, so that when the model encounters words like "data", it does not reduce them to their lemma form (they are kept as "data").
24 |
25 | The order of steps in generating an up-to-date lemma lookup table for Spark via spaCy is below.
26 |
27 | 1. In `convert_spacy_lemmas.py`, define a ban list of lemmas that shouldn't be considered during lemmatization. Words that have one of these as their lemma in spaCy's English lemma lookup are therefore not lemmatized.
28 | 2. Run the file `convert_spacy_lemmas.py` (a one-time process each time we want to update the lemma list) -- this downloads the latest English lemma lookup JSON from spaCy's GitHub repo, formats it, and removes the banned lemmas defined in the script.
29 | 3. Commit both `convert_spacy_lemmas.py` and the generated text file `spacy_english_lemmas.txt` to GitHub. Pull the latest code on the topic modelling VM to ensure that the latest lemma list is in use for our monthly pipeline.
--------------------------------------------------------------------------------
/nlp/english/topic_model/spacyLemmas/convert_spacy_lemmas.py:
--------------------------------------------------------------------------------
1 | """
2 | Convert the most recent spaCy lemma dictionary to a format that can be read
3 | by Spark-NLP.
4 | """
5 | import json
6 | from urllib.request import urlopen
7 |
8 |
9 | def get_ban_list():
10 | """List of lemmas that we don't want from spaCy's default lookup list"""
11 | banned_lemmas = ["datum"]
12 | return banned_lemmas
13 |
14 |
15 | def get_spacy_lemmas():
16 | """Download most recent spaCy lemma dictionary from their GitHub repo."""
17 | spacy_lemma_url = "https://raw.githubusercontent.com/explosion/spacy-lookups-data/master/spacy_lookups_data/data/en_lemma_lookup.json"
18 | with urlopen(spacy_lemma_url) as response:
19 | lemmas = response.read()
20 | return json.loads(lemmas)
21 |
22 |
23 | def get_same_value_keys(spacy_lemmas):
24 |     """Invert the word -> lemma dict: map each lemma to the list of words that share it."""
25 | same_value_keys = {}
26 | for key, value in spacy_lemmas.items():
27 | if value not in same_value_keys:
28 | same_value_keys[value] = [key]
29 | else:
30 | same_value_keys[value].append(key)
31 | return same_value_keys
32 |
33 |
34 | def write_sparknlp_lemmas(spacy_lemmas):
35 | """Write out the lemmas as per Spark NLP's format:
36 | https://stackoverflow.com/a/57873365/1194761
37 | """
38 | ban_list = get_ban_list()
39 | same_value_keys = get_same_value_keys(spacy_lemmas)
40 | with open('spacy_english_lemmas.txt', "w") as f:
41 | for key, values in same_value_keys.items():
42 | print(key, " -->", values)
43 | if key in ban_list:
44 | # Prevent lemmas that we banned from being included in the output lemma list for Spark
45 | pass
46 | else:
47 | # Only output values without special characters
48 | alphabet_values = [val.lower() for val in values if val.isalpha()]
49 | if key.isalpha():
50 | f.write("{0} -> {0} {1}\n".format(key.lower(), ' '.join(list(alphabet_values))))
51 |
52 |
53 | def main():
54 | spacy_lemmas = get_spacy_lemmas()
55 | write_sparknlp_lemmas(spacy_lemmas)
56 |
57 |
58 | if __name__ == "__main__":
59 | main()
60 |
--------------------------------------------------------------------------------
/nlp/english/topic_model/stopwords/README.md:
--------------------------------------------------------------------------------
1 | # Stopwords for Topic Modelling
2 | Choosing the right stopwords for topic modelling is an iterative process [[1]](https://databricks.com/blog/2015/09/22/large-scale-topic-modeling-improvements-to-lda-on-apache-spark.html). Based on the news outlet vocabulary in our corpus, certain common verbs can hinder the interpretation of topics. Most nouns, however, are useful for interpreting topics as they offer some context to the news categories being covered.
3 |
4 | The below lists of words are combined to produce the overall stopword list used in topic modelling.
5 |
6 | ## NLTK (curated)
7 | From past projects at the discourse processing lab, the default NLTK stopword list was curated, and some additional common characters/symbols/digits were added to it. This list of words is in the file `nltk_curated.txt`.
8 |
9 | ## Light verbs
10 | These are [verbs with little semantic content of their own](https://en.wikipedia.org/wiki/Light_verb), such as *do, give, make, take*. The list of light verbs relevant to the GGT news corpus is extended and customized (with some trial and error based on intermediate topic model results) and added to the file `create_stopword_list.py`.
11 |
12 | **NOTE**: In the Python file, just the verb roots are specified manually. The full list of verbs (in present/past tenses) is obtained by looking up each lemma's alternate forms from spaCy's lemma dictionary.
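
As a rough sketch of that lookup (using a tiny hand-made excerpt of the word-to-lemma dictionary rather than the real download), expanding the root *give* looks roughly like this:

```python
# Hypothetical excerpt of spaCy's word -> lemma lookup, for illustration only
spacy_lemmas = {"gave": "give", "given": "give", "gives": "give", "giving": "give", "gift": "gift"}

root = "give"
# Collect every word whose lemma is the root, plus the root itself,
# mirroring lookup_verbs() in create_stopword_list.py
forms = [word for word, lemma in spacy_lemmas.items() if lemma == root] + [root]
print(sorted(forms))  # ['gave', 'give', 'given', 'gives', 'giving']
```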
13 |
14 | ## Custom words
15 | Initially, an additional list of "general" nouns, or [signalling nouns](https://books.google.ca/books/about/Signalling_Nouns_in_Academic_English.html?id=3f-XoAEACAAJ&redir_esc=y), or [shell nouns](https://www.aclweb.org/anthology/W13-2314/) was considered. These include certain abstract nouns like "problem", "fact" or "result" - i.e. nouns with non-specific meaning when considered in isolation. It was found that most of these nouns are actually very useful in interpreting topics, which in itself is a task where words (especially nouns) are looked at in isolation.
16 |
17 | As a result, general/signalling/shell nouns are **not** used in this task.
18 |
19 | However, based on the initial topic modelling experiments, a separate list of custom words that hinder topic interpretability was created manually. The below words are included in the file `create_stopword_list.py`.
20 |
21 | * **Social media-related**: *post, sign, like, love, tag, star, call, group, video, photo, pic, inbox*
22 | * **URL and embed terms**: *http, https, href, ref, com, cbc, ctv, src, twsrc, 5etfw*
23 | * **Frequently occurring common nouns**: *people, man, woman, life, family, friend, news, report, press, page, story*
24 | * **Time of the day/week**: *morning, afternoon, evening, today, yesterday, tomorrow*
25 | * **Time periods**: *day, week, month, year*
26 | * **Time zones**: *edt, pst*
27 | * **Day of the week**: *monday, tuesday, wednesday, thursday, friday, saturday, sunday*
28 | * **Months of the year**: *january, february, march, ..., october, november, december*
29 | * **Year**: *2018, 2019, 2020, 2021*
30 |
31 | ## Generate a final list of stopwords
32 | The included Python file is run as follows.
33 | ```
34 | python3 create_stopword_list.py
35 | ```
36 |
37 | This concatenates words from the above lists into a single, de-duplicated set and sorts them in alphabetical order, producing a final stopword file `stopwords.txt`.
38 |
39 | ## References
40 | [1] [Large Scale Topic Modeling: Improvements to LDA on Apache Spark](https://databricks.com/blog/2015/09/22/large-scale-topic-modeling-improvements-to-lda-on-apache-spark.html)
--------------------------------------------------------------------------------
/nlp/english/topic_model/stopwords/create_stopword_list.py:
--------------------------------------------------------------------------------
1 | """
2 | Script to generate a custom list of stopwords that extend upon existing word lists.
3 | """
4 | import json
5 | from urllib.request import urlopen
6 | from itertools import chain
7 |
8 |
9 | def combine(*lists):
10 | "Combine an arbitrary number of lists into a single list"
11 | return list(chain(*lists))
12 |
13 |
14 | def get_spacy_lemmas():
15 | "Read in spaCy lemma dict from the raw GitHub source"
16 | spacy_lemma_url = "https://raw.githubusercontent.com/explosion/spacy-lookups-data/master/spacy_lookups_data/data/en_lemma_lookup.json"
17 | with urlopen(spacy_lemma_url) as response:
18 | lemmas = response.read()
19 | return json.loads(lemmas)
20 |
21 |
22 | def get_words(filename):
23 | "Read in a list of words from a stopword list"
24 | words = []
25 | with open(filename) as f:
26 | for word in f:
27 | words.append(word.strip())
28 | return words
29 |
30 |
31 | def lookup_verbs(roots, spacy_lemmas):
32 |     """Return a full list of light verbs and all their forms (present, past tense, etc.)"""
33 |
34 | def flatten(list_of_lists):
35 | "Return a flattened list of a list of lists"
36 | return [item for sublist in list_of_lists for item in sublist]
37 |
38 | verblist = []
39 | for root in roots:
40 | verbs = [key for key in spacy_lemmas if spacy_lemmas[key] == root]
41 | verbs.append(root)
42 | verblist.append(verbs)
43 | return flatten(verblist)
44 |
45 |
46 | if __name__ == "__main__":
47 | # We first get the NLTK curated word list
48 | nltk_stopwords = set(get_words('nltk_curated.txt'))
49 | # Obtain spaCy lemma dictionary for retrieving light verb full forms
50 | spacy_lemmas = get_spacy_lemmas()
51 |
52 | # Create custom word lists depending on the class of words seen in the data
53 | url_terms = ['href', 'http', 'https', 'src', 'twsrc', '5etfw', 'ref', 'com', 'cbc',
54 | 'ctv', 'star', '5127en', 'httpstco', 'www']
55 | # Don't take 'wed', 'sat' and 'sun' because they are also normal words
56 | days_of_the_week = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday',
57 | 'saturday', 'sunday', 'mon', 'tue', 'thu', 'fri']
58 | months_of_the_year = ['january', 'february', 'march', 'april', 'may', 'june', 'july',
59 | 'august', 'september', 'october', 'november', 'december', 'jan',
60 | 'feb', 'mar', 'apr', 'jun', 'jul', 'aug', 'sep', 'sept', 'oct',
61 | 'nov', 'dec']
62 | years = ["2018", "2019", "2020", "2021", "2022", "2023"]
63 | time_periods = ['minute', 'minutes', 'hour', 'hours', 'day', 'days', 'week', 'weeks',
64 | 'month', 'months', 'year', 'years']
65 | time_related = ['yesterday', 'today', 'tomorrow', 'day', 'night', 'morning',
66 | 'afternoon', 'evening', 'edt', 'est', 'pst', 'pdt', 'time', 'times']
67 | common_words = ['press', 'news', 'report', 'page', 'user', 'reuters', 'email', 'browser',
68 | 'file', 'files', 'video', 'pic', 'photo', 'online', 'social', 'media', 'article',
69 | 'group', 'inbox', 'item', 'advertisement', 'world', 'store', 'story',
70 | 'life', 'family', 'people', 'man', 'woman', 'friend', 'friends']
71 | social_media = ['twitter', 'facebook', 'google', 'gmail', 'linkedin', 'pinterest', 'getty',
72 | 'video', 'photo', 'image', 'images', 'user', 'social', 'media', 'newsletter',
73 | 'subscribe', 'page', 'online', 'live', 'stream', 'post', 'app', 'postmedia',
74 | 'apnews']
75 | light_verb_roots = ['call', 'click', 'continue', 'comment', 'do', 'feel', 'find',
76 | 'give', 'get', 'have', 'include', 'like', 'live', 'love', 'make',
77 | 'post', 'read', 'say', 'speak', 'send', 'share', 'show', 'sign',
78 | 'tag', 'take', 'tell', 'think', 'update', 'work', 'write', 'join',
79 | 'view', 'load']
80 |
81 |     # Convert light verb roots to all their forms using the lemma lookup
82 | light_verbs_full = lookup_verbs(light_verb_roots, spacy_lemmas)
83 |
84 | # Combine into a single list of stopwords
85 | add_stopwords = set(
86 | combine(
87 | nltk_stopwords, url_terms, days_of_the_week, months_of_the_year, years,
88 | time_periods, time_related, common_words, social_media, light_verbs_full
89 | )
90 | )
91 |
92 | # Combine all stopwords into one list and export to text file
93 | combined_stopwords = nltk_stopwords.union(add_stopwords)
94 | stopword_list = sorted(list(combined_stopwords))
95 | # Write out stopwords to file
96 | with open('stopwords.txt', 'w') as f:
97 | for word in stopword_list:
98 | f.write(word + '\n')
99 |
100 | print(f"Exported {len(stopword_list)} words to stopword list.")
101 |
--------------------------------------------------------------------------------
/nlp/english/topic_model/stopwords/nltk_curated.txt:
--------------------------------------------------------------------------------
1 | -
2 | 0
3 | 000
4 | 1
5 | 2
6 | 3
7 | 4
8 | 5
9 | 6
10 | 7
11 | 8
12 | 9
13 | a
14 | a's
15 | able
16 | about
17 | above
18 | according
19 | accordingly
20 | across
21 | actually
22 | after
23 | afterwards
24 | again
25 | against
26 | ain
27 | ain't
28 | all
29 | allow
30 | allows
31 | almost
32 | alone
33 | along
34 | already
35 | also
36 | although
37 | always
38 | am
39 | among
40 | amongst
41 | an
42 | and
43 | another
44 | any
45 | anybody
46 | anyhow
47 | anyone
48 | anything
49 | anyway
50 | anyways
51 | anywhere
52 | apart
53 | appear
54 | appreciate
55 | appropriate
56 | are
57 | area
58 | aren
59 | aren't
60 | around
61 | as
62 | aside
63 | ask
64 | asking
65 | associated
66 | at
67 | available
68 | away
69 | awfully
70 | b
71 | back
72 | be
73 | became
74 | because
75 | become
76 | becomes
77 | becoming
78 | been
79 | before
80 | beforehand
81 | behind
82 | being
83 | believe
84 | below
85 | beside
86 | besides
87 | best
88 | better
89 | between
90 | beyond
91 | big
92 | bit
93 | both
94 | brief
95 | but
96 | by
97 | c
98 | c'mon
99 | c's
100 | came
101 | can
102 | canada
103 | canadian
104 | can't
105 | cannot
106 | cant
107 | cause
108 | causes
109 | cent
110 | certain
111 | certainly
112 | changes
113 | city
114 | clearly
115 | co
116 | com
117 | come
118 | comes
119 | concerning
120 | consequently
121 | consider
122 | considering
123 | contain
124 | containing
125 | contains
126 | continue
127 | corresponding
128 | could
129 | couldn
130 | couldn't
131 | course
132 | currently
133 | d
134 | day
135 | definitely
136 | described
137 | despite
138 | did
139 | didn
140 | didn't
141 | different
142 | do
143 | does
144 | doesn
145 | doesn't
146 | doing
147 | don
148 | don't
149 | done
150 | down
151 | downwards
152 | during
153 | e
154 | each
155 | edu
156 | eg
157 | eight
158 | either
159 | else
160 | elsewhere
161 | end
162 | enough
163 | entirely
164 | especially
165 | et
166 | etc
167 | even
168 | ever
169 | every
170 | everybody
171 | everyone
172 | everything
173 | everywhere
174 | ex
175 | exactly
176 | example
177 | except
178 | f
179 | far
180 | feel
181 | few
182 | fifth
183 | find
184 | first
185 | five
186 | followed
187 | following
188 | follows
189 | for
190 | former
191 | formerly
192 | forth
193 | four
194 | from
195 | further
196 | furthermore
197 | g
198 | get
199 | gets
200 | getting
201 | given
202 | gives
203 | go
204 | goes
205 | going
206 | gone
207 | got
208 | gotten
209 | greetings
210 | h
211 | had
212 | hadn
213 | hadn't
214 | happens
215 | hardly
216 | has
217 | hasn
218 | hasn't
219 | have
220 | haven
221 | haven't
222 | having
223 | he
224 | he's
225 | he'se
226 | hello
227 | help
228 | hence
229 | her
230 | here
231 | here's
232 | hereafter
233 | hereby
234 | herein
235 | hereupon
236 | hers
237 | herself
238 | hi
239 | him
240 | himself
241 | his
242 | hither
243 | hopefully
244 | how
245 | howbeit
246 | however
247 | i
248 | i'd
249 | i'll
250 | i'm
251 | i've
252 | ie
253 | if
254 | ignored
255 | immediate
256 | in
257 | inasmuch
258 | inc
259 | indeed
260 | indicate
261 | indicated
262 | indicates
263 | inner
264 | insofar
265 | instead
266 | into
267 | inward
268 | is
269 | isn
270 | isn't
271 | it
272 | it'd
273 | it'll
274 | it's
275 | it,
276 | its
277 | itself
278 | j
279 | just
280 | k
281 | keep
282 | keeps
283 | kept
284 | know
285 | known
286 | knows
287 | l
288 | last
289 | lately
290 | later
291 | latter
292 | latterly
293 | least
294 | less
295 | lest
296 | let
297 | let's
298 | like
299 | liked
300 | likely
301 | little
302 | ll
303 | look
304 | looking
305 | looks
306 | lot
307 | ltd
308 | m
309 | ma
310 | made
311 | mainly
312 | man
313 | many
314 | may
315 | maybe
316 | me
317 | mean
318 | meanwhile
319 | merely
320 | might
321 | mightn
322 | mightn't
323 | more
324 | moreover
325 | most
326 | mostly
327 | much
328 | must
329 | mustn
330 | mustn't
331 | my
332 | myself
333 | n
334 | name
335 | namely
336 | nd
337 | near
338 | nearly
339 | necessary
340 | need
341 | needn
342 | needn't
343 | needs
344 | neither
345 | never
346 | nevertheless
347 | new
348 | next
349 | nine
350 | no
351 | nobody
352 | non
353 | none
354 | noone
355 | nor
356 | normally
357 | not
358 | nothing
359 | novel
360 | now
361 | nowhere
362 | o
363 | obviously
364 | of
365 | off
366 | often
367 | oh
368 | ok
369 | okay
370 | old
371 | on
372 | once
373 | one
374 | ones
375 | only
376 | onto
377 | or
378 | other
379 | others
380 | otherwise
381 | ought
382 | our
383 | ours
384 | ourselves
385 | out
386 | outside
387 | over
388 | overall
389 | own
390 | p
391 | particular
392 | particularly
393 | per
394 | perhaps
395 | place
396 | placed
397 | play
398 | please
399 | plus
400 | possible
401 | presumably
402 | probably
403 | provides
404 | put
405 | q
406 | que
407 | quite
408 | qv
409 | r
410 | rather
411 | rd
412 | re
413 | really
414 | reasonably
415 | regarding
416 | regardless
417 | regards
418 | relatively
419 | respectively
420 | right
421 | s
422 | said
423 | same
424 | saw
425 | say
426 | saying
427 | says
428 | second
429 | secondly
430 | see
431 | seeing
432 | seem
433 | seemed
434 | seeming
435 | seems
436 | seen
437 | self
438 | selves
439 | sensible
440 | sent
441 | serious
442 | seriously
443 | service
444 | seven
445 | several
446 | shall
447 | shan
448 | shan't
449 | she
450 | she's
451 | should
452 | should've
453 | shouldn
454 | shouldn't
455 | since
456 | six
457 | so
458 | some
459 | somebody
460 | somehow
461 | someone
462 | something
463 | sometime
464 | sometimes
465 | somewhat
466 | somewhere
467 | soon
468 | sorry
469 | specified
470 | specify
471 | specifying
472 | start
473 | still
474 | sub
475 | such
476 | sup
477 | sure
478 | t
479 | t's
480 | take
481 | taken
482 | tell
483 | tends
484 | th
485 | than
486 | thank
487 | thanks
488 | thanx
489 | that
490 | that'll
491 | that's
492 | thats
493 | the
494 | their
495 | theirs
496 | them
497 | themselves
498 | then
499 | thence
500 | there
501 | there's
502 | thereafter
503 | thereby
504 | therefore
505 | therein
506 | theres
507 | thereupon
508 | these
509 | they
510 | they'd
511 | they'll
512 | they're
513 | they've
514 | things
515 | think
516 | third
517 | this
518 | thorough
519 | thoroughly
520 | those
521 | though
522 | three
523 | through
524 | throughout
525 | thru
526 | thus
527 | to
528 | together
529 | too
530 | took
531 | toward
532 | towards
533 | tried
534 | tries
535 | truly
536 | try
537 | trying
538 | twice
539 | two
540 | u
541 | un
542 | under
543 | unfortunately
544 | unless
545 | unlikely
546 | until
547 | unto
548 | up
549 | upon
550 | us
551 | use
552 | used
553 | useful
554 | uses
555 | using
556 | usually
557 | uucp
558 | v
559 | value
560 | various
561 | ve
562 | very
563 | via
564 | viz
565 | vs
566 | w
567 | want
568 | wants
569 | was
570 | wasn
571 | wasn't
572 | way
573 | we
574 | we'd
575 | we'll
576 | we're
577 | we've
578 | welcome
579 | well
580 | went
581 | were
582 | weren
583 | weren't
584 | what
585 | what's
586 | whatever
587 | when
588 | whence
589 | whenever
590 | where
591 | where's
592 | whereafter
593 | whereas
594 | whereby
595 | wherein
596 | whereupon
597 | wherever
598 | whether
599 | which
600 | while
601 | whither
602 | who
603 | who's
604 | whoever
605 | whole
606 | whom
607 | whose
608 | why
609 | will
610 | willing
611 | wish
612 | with
613 | within
614 | without
615 | won
616 | won't
617 | wonder
618 | working
619 | would
620 | wouldn
621 | wouldn't
622 | x
623 | y
624 | years
625 | yes
626 | yet
627 | you
628 | you'd
629 | you'll
630 | you're
631 | you've
632 | your
633 | yours
634 | yourself
635 | yourselves
636 | z
637 | zero
638 |
--------------------------------------------------------------------------------
/nlp/french/README.md:
--------------------------------------------------------------------------------
1 | # French NLP pipeline
2 | ## Set up environment
3 | The French NLP pipeline uses a third party coreference resolution library named [coreferee](https://github.com/explosion/coreferee), which requires the use of Python 3.9. It is assumed that Python 3.9 exists on the system on which the French NLP code runs.
4 |
5 | Make sure that `gcc`, `build-essential` and `python3.9-devel` (on Red Hat/CentOS) or `python3.9-dev` (on Ubuntu) are installed on the system. Also, install `python3.9-venv` for managing virtual environments, and ensure `wheel` is installed prior to installing the dependencies (as shown below).
6 |
7 | ```sh
8 | python3.9 -m venv GRIM-FR
9 | ```
10 |
11 | Activate the environment and install the dependencies:
12 |
13 | ```
14 | source GRIM-FR/bin/activate
15 | python3.9 -m pip install -U pip wheel # Upgrade pip and install the wheel package first
16 | python3.9 -m pip install -r requirements.txt
17 | ```
18 |
19 | ## Quote extractor
20 | Extract `quotes` from the database or from local files. Save the output locally, or update the database directly.
21 |
22 | ### Default mode
23 | By default, the quote extractor only works on articles that weren't processed earlier (i.e., new articles that are freshly scraped with `lastModifier = mediaCollectors`).
24 |
25 | ```sh
26 | python3.9 quote_extractor.py --db mediaTracker --readcol media --limit 0
27 | ```
28 | `--limit 0` (which is the default setting) means no limitation, and the script runs on all documents in the database.
29 |
30 | ### Force update
31 | To force-update the results and overwrite existing data for all articles, use the `--force_update` argument.
32 | ```sh
33 | python3.9 quote_extractor.py --db mediaTracker --readcol media --force_update --limit 10
34 | ```
35 | `--limit 10` means that the script will process just 10 documents, which is useful during testing.
36 |
37 | ### Specify time period
38 | We can easily limit the quote extraction process to only articles from a specified time period.
39 |
40 | ```sh
41 | python3.9 quote_extractor.py --db mediaTracker --readcol media --force_update --begin_date 2021-12-01 --end_date 2021-12-31
42 | ```
43 |
44 | For the full list of optional arguments, type the following:
45 |
46 | ```sh
47 | python3.9 quote_extractor.py --help
48 | ```
49 | ## Quote highlighter
50 | Takes an input text, a set of corresponding predicted `quotes` (usually output from the quote extractor), and optionally a set of target `quotes` to compare against, and outputs HTML files highlighting the quotes and speakers in the text.
51 |
52 | Example commands:
53 | ```
54 | python3.9 quote_highlighter.py --text-base=./input/ --prediction-base=./predictions/ --no-target --html-base=./html/
55 | ```
56 | Optional arguments:
57 | ```
58 | -h, --help show this help message and exit
59 | --text-base TEXT_BASE
60 | Where the text which the quotes were extracted from is stored.
61 | --html-base HTML_BASE
62 | Where to store the output HTML.
63 | --target-base TARGET_BASE
64 | Where the (annotated) target quotes are stored.
65 | --prediction-base PREDICTION_BASE
66 | Where the predicted quotes are stored.
67 | --no-target, -n Don't highlight target quotes/speakers
68 | ```
69 |
70 | ---
71 | ## Entity gender annotator
72 |
73 | Once the quotes have been extracted and written to the DB, we can then run the entity gender annotation script. This script utilizes the quotes (stored as a list) from each article, performs NER on them, and then merges the extracted named entities with the speakers of the quotes. In addition, we also perform quote merging to match the merged named entities to the speaker of a quote, wherever possible.
74 |
75 | ### Default mode
76 | Just like the quote extractor, the entity gender annotator by default only works on articles that weren't processed earlier (i.e., articles that were just processed by the quote extractor, with `lastModifier = quote_extractor`).
77 |
78 | ```sh
79 | python3.9 entity_gender_annotator.py --db mediaTracker --readcol media
80 | ```
81 |
82 | ### Force update
83 |
84 | To force-update the results and overwrite existing data for all articles, use the `--force_update` argument.
85 |
86 | ```sh
87 | python3.9 entity_gender_annotator.py --db mediaTracker --readcol media --force_update
88 | ```
89 |
90 | ### Specify write collection
91 | **It is strongly recommended** to use the `--writecol` argument when running the script on a large collection. This way, even if the NLP operations take many days to run, the database statistics do not pick up partially completed results; once the run finishes, we run the `merge_collections.py` script to move the NLP results from the `newmedia` collection into the `media` collection (see the example after the snippet below).
92 |
93 | ```sh
94 | python3.9 entity_gender_annotator.py --force_update --db mediaTracker --readcol media --writecol newmedia
95 | ```
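
Once the annotation run completes, merge the results back into the main collection. An example invocation (using the argument names defined in `merge_collections.py`; adjust the collection names to match your run):

```sh
python3.9 merge_collections.py --db mediaTracker --oldcol media --newcol newmedia
```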
96 |
97 |
98 | ### Specify time period
99 | We can easily limit the entity gender annotation process to only articles from a specified time period.
100 |
101 | ```sh
102 | python3.9 entity_gender_annotator.py --db mediaTracker --readcol media --force_update --begin_date 2020-01-01 --end_date 2020-01-31
103 | ```
104 |
105 | For further help options, type the following:
106 |
107 | ```sh
108 | python3.9 entity_gender_annotator.py --help
109 | ```
110 |
111 | ## Note on multiprocessing
112 | As of spaCy 3.2.x and coreferee 1.3.1, multiprocessing is **not** supported (due to the inability of coreferee to share data between forked processes). As a result, we are unable to speed up the performance of the French entity gender annotator by dividing the computation across separate processes -- **this might change in a future version** when there are updates to the coreference algorithm within base spaCy.
--------------------------------------------------------------------------------
/nlp/french/config.py:
--------------------------------------------------------------------------------
1 | host = ["mongo0", "mongo1", "mongo2"]
2 | # host = "localhost"
3 | prefix = "." if (host == "localhost") else "/path_to_code/GenderGapTracker/nlp/french"
4 |
5 | config = {
6 | "MONGO_ARGS": {
7 | "host": host,
8 | "port": 27017,
9 | "authSource": "admin",
10 | "readPreference": "primaryPreferred",
11 | "username": "username",
12 | "password": "password",
13 | },
14 | "GENDER_RECOGNITION": {
15 | "GENDERIZE_ENABLED": False,
16 | "GENDERAPI_ENABLED": True,
17 | "GENDERAPI_TOKEN": "JSON_AUTH_TOKEN",
18 | "MANUAL_CACHE": "manual",
19 | "GENDERAPI_CACHE": "genderAPICleaned",
20 | "GENDERIZE_CACHE": "genderizeCleaned",
21 | "FIRSTNAME_CACHE": "firstNamesCleaned",
22 | },
23 | "NLP": {
24 | "MAX_BODY_LENGTH": 20000,
25 | "QUOTE_VERBS": f"{prefix}/rules/quote_verb_list.txt",
26 | "AUTHOR_BLOCKLIST": f"{prefix}/rules/author_blocklist.txt",
27 | "NAME_PATTERNS": f"{prefix}/rules/name_patterns.jsonl",
28 | },
29 | }
30 |
--------------------------------------------------------------------------------
/nlp/french/data_statistics.py:
--------------------------------------------------------------------------------
1 | import os, json, re
2 | import argparse
3 | from ast import literal_eval
4 |
5 | import pandas as pd
6 | import Levenshtein as lev
7 | import spacy
8 | from spacy.language import Language
9 | from spacy.tokens import Doc, Span
10 | from coreferee.rules import RulesAnalyzerFactory
11 | from coreferee.data_model import Mention
12 |
13 | import utils
14 |
15 |
16 | def compute_statistics(text_dir, target_dir, output_file=None):
17 | files = utils.get_files_from_folder(text_dir)
18 | files_data = []
19 | files_indexes = []
20 | for i, doc_name in enumerate(files):
21 | # print(doc_name)
22 | text = utils.preprocess_text(files[doc_name])
23 | json_file = target_dir + doc_name + ".json"
24 | if not os.path.exists(json_file):
25 | continue
26 | quote_objects = json.load(open(json_file, encoding="mac-roman"))
27 | file_data = get_file_stats(text, quote_objects)
28 |
29 | files_data.append(file_data)
30 | files_indexes.append(doc_name)
31 | # print(files_data)
32 | return process_results(files_data, files_indexes, output_file)
33 |
34 |
35 | def process_results(files_data, files_indexes, output_file):
36 | columns = [
37 | "nouns",
38 | "proper nouns",
39 | "other nouns",
40 | "anaphora",
41 | "non-covered speakers",
42 | "speakerless quotes",
43 | "verbless quotes",
44 | "unknown speaker's gender",
45 | "referenceless quotes",
46 | "self-evident_references",
47 | "plural speaker",
48 | "total_quotes",
49 | ]
50 | df = pd.DataFrame(files_data, index=files_indexes, columns=columns)
51 | total = df.sum(numeric_only=True, axis=0)
52 | mean = df.mean(numeric_only=True, axis=0)
53 | median = df.median(numeric_only=True, axis=0)
54 | standard_deviation = df.std(numeric_only=True, axis=0)
55 | total_proportion = total / total["total_quotes"]
56 |
57 | df.loc["Mean"] = mean
58 | df.loc["Median"] = median
59 | df.loc["Standard_Deviation"] = standard_deviation
60 | df.loc["Total"] = total
61 | df.loc["Total_Proportion"] = total_proportion
62 |
63 | print(df)
64 | if output_file:
65 | df.to_csv(output_file, sep=";")
66 | return df
67 |
68 |
69 | def get_file_stats(text, quote_objects):
70 | doc = NLP(text)
71 |     independent_nouns = substantives = proper_n = anaphora = 0
72 |     uncovered_mention = speakerless = verbless = genderless = 0
73 |     referenceless = evident_references = plural_speakers = quotes = 0
84 |
85 | for quote_object in quote_objects:
86 | speaker_index = quote_object["speaker_index"]
87 | speaker = quote_object["speaker"]
88 | reference = quote_object["reference"]
89 | speaker_gender = quote_object["speaker_gender"]
90 | verb = quote_object["verb"]
91 | if not verb:
92 | verbless += 1
93 | if speaker_gender == "unknown":
94 | genderless += 1
95 | if reference:
96 | pass
97 | else:
98 | referenceless += 1
99 |
100 | if speaker_index:
101 | start, end = literal_eval(speaker_index)
102 | speaker_span = doc.char_span(start, end, alignment_mode="expand")
103 | speaker_root = speaker_span.root
104 | is_mention = False
105 | if RULES_ANALYZER.is_independent_noun(speaker_root):
106 | is_mention = True
107 | independent_nouns += 1
108 | if speaker_root.pos_ == "PROPN":
109 | proper_n += 1
110 | else:
111 | substantives += 1
112 | elif RULES_ANALYZER.is_potential_anaphor(speaker_root):
113 | is_mention = True
114 | anaphora += 1
115 | else:
116 | infos_root = [
117 | speaker_root,
118 | speaker_root.pos_,
119 | speaker_root.dep_,
120 | speaker_root.morph,
121 | ]
122 | print(
123 | "NOT COVERED :",
124 | speaker,
125 | (start, end, speaker_span.start, speaker_span.end),
126 | infos_root,
127 | )
128 | uncovered_mention += 1
129 | if RULES_ANALYZER.is_independent_noun(
130 | speaker_root
131 | ) and RULES_ANALYZER.is_potential_anaphor(speaker_root):
132 | print(
133 | "DOUBLE",
134 | speaker,
135 | speaker_root,
136 | speaker_root.pos_,
137 | speaker_root.dep_,
138 | speaker_root.morph,
139 | sep="|",
140 | )
141 |
142 | if reference and lev.distance(speaker.lower(), reference.lower()) <= 2:
143 | evident_references += 1
144 |
145 | masc, fem, sing, plur = RULES_ANALYZER.get_gender_number_info(speaker_root)
146 | siblings = RULES_ANALYZER.get_dependent_siblings(speaker_root)
147 | if is_mention and (
148 | (plur and not sing) or (siblings and siblings[-1].idx <= end)
149 | ):
150 | # print("PLURAL :", speaker)
151 | plural_speakers += 1
152 | else:
153 | speakerless += 1
154 | quotes += 1
155 | data = (
156 | independent_nouns,
157 | proper_n,
158 | substantives,
159 | anaphora,
160 | uncovered_mention,
161 | speakerless,
162 | verbless,
163 | genderless,
164 | referenceless,
165 | evident_references,
166 | plural_speakers,
167 | quotes,
168 | )
169 | return data
170 |
171 |
172 | NLP = spacy.load("fr_core_news_lg")
173 | RULES_ANALYZER = RulesAnalyzerFactory.get_rules_analyzer(NLP)
174 |
175 | if __name__ == "__main__":
176 |     parser = argparse.ArgumentParser(description="Compute statistics about the quotes and their speakers and write them to CSV")
177 | parser.add_argument("--text_dir", type=str, help="Path to the texts directory")
178 | parser.add_argument("--target_dir", type=str, help="Path to the target directory")
179 | parser.add_argument("--output_file", type=str, default="", help="Path to the output csv file")
180 | args = parser.parse_args()
181 | TEXT_DIR = args.text_dir
182 | TARGET_DIR = args.target_dir
183 | OUTPUT_FILE = args.output_file
184 | compute_statistics(TEXT_DIR, TARGET_DIR, OUTPUT_FILE)
185 |
--------------------------------------------------------------------------------
/nlp/french/evaluation/README.md:
--------------------------------------------------------------------------------
1 | # French GGT Evaluation
2 | This folder contains methodology and code for evaluating the results of the French pipeline.
3 |
4 | For consistent and reproducible results, make sure any evaluation run locally uses the **same Python environment that is running in production**.
5 |
6 | ## Download Data
7 | The raw text data containing news article text, as well as the human-annotated data, is made available upon request (please contact Maite Taboada at [mtaboada@sfu.ca](mailto:mtaboada@sfu.ca)).
8 | Obtain the directories named `humanAnnotations` and `rawtexts` and place them in their respective paths as per the structure below.
9 |
10 | ```sh
11 | ├── .
12 | | ├── src
13 | | | ├── rawtexts
14 | | | ├── eval
15 | | | | └── humanAnnotations
16 | | | └── evaluate.py
17 | | | └── run_predictions.py
18 | ```
19 |
20 | ## Set Up Environment
21 | This section assumes that the virtual environment for French NLP has already been set up. The French NLP pipeline uses a third party coreference resolution library named [coreferee](https://github.com/explosion/coreferee), which requires the use of Python 3.9. It is assumed that Python 3.9 exists on the system on which the French NLP code runs.
22 |
23 | Make sure that `gcc`, `build-essential` and `python3.9-devel` (on Red Hat/CentOS) or `python3.9-dev` (on Ubuntu) are installed on the system. Also, install `python3.9-venv` for managing virtual environments, and ensure `wheel` is installed prior to installing the dependencies (as shown below).
24 |
25 |
26 | If not done already, create a virtual environment and install the dependencies from the `requirements.txt` in the `nlp/french` directory of this repo.
27 |
28 | ```sh
29 | cd /path_to_code/GenderGapTracker/nlp/french
30 | python3.9 -m venv GRIM-FR
31 | source GRIM-FR/bin/activate
32 | python3.9 -m pip install -U pip wheel # Upgrade pip and install the wheel package first
33 | python3.9 -m pip install -r requirements.txt
34 | ```
35 |
36 | This installs the correct versions of spaCy, its associated language model, as well as coreferee (for coreference resolution).
37 |
--------------------------------------------------------------------------------
/nlp/french/merge_collections.py:
--------------------------------------------------------------------------------
1 | """
2 | This script merges the results from the newly created collection from the entity
3 | gender annotation script (when the user specifies the `writecol` argument) with
4 | the original collection.
5 |
6 | Only the fields specified in this file are merged with (i.e. overwrite) the original
7 | `media` collection - the remaining fields in the original collection are left untouched.
8 | """
9 | import argparse
10 | from multiprocessing import Pool, cpu_count
11 | from config import config
12 | import utils
13 |
14 |
15 | def update_field(existing_collection, new_collection, idx):
16 | """Overwrite existing collection's fields with new collection's fields (except IDs)"""
17 | new_id = idx['_id']
18 | existing_id = idx['currentId']
19 | doc = new_collection.find_one({'_id': new_id})
20 | existing_collection.update_one(
21 | {'_id': existing_id},
22 | {'$set': filter_dict(doc)}
23 | )
24 |
25 |
26 | def filter_dict(dict_obj):
27 | """Return a dictionary that has the same keys/values as the original dictionary,
28 | except for a few select keys that are to be excluded.
29 | """
30 | ignore_keys = ['_id', 'currentId']
31 | new_dict = {key: dict_obj[key] for key in dict_obj if key not in ignore_keys}
32 | return new_dict
33 |
34 |
35 | def chunker(iterable, chunksize):
36 | """Yield a smaller chunk of a large iterable"""
37 | for i in range(0, len(iterable), chunksize):
38 | yield iterable[i:i + chunksize]
39 |
40 |
41 | def parse_chunks(chunk):
42 | """Pass through a chunk of document IDs and update fields"""
43 | db_client = utils.init_client(MONGO_ARGS)
44 | existing_collection = db_client[DB_NAME][EXISTING_COL]
45 | new_collection = db_client[DB_NAME][NEW_COL]
46 | for idx in chunk:
47 | update_field(existing_collection, new_collection, idx)
48 |
49 |
50 | def run_pool(poolsize, chunksize):
51 | """Concurrently run independent operations on multiple cores"""
52 | db_client = utils.init_client(MONGO_ARGS)
53 | # Get list of new and old IDs from new collection
54 | new_col = db_client[DB_NAME][NEW_COL]
55 | new_old_ids = list(new_col.find({}, {'_id': 1, 'currentId': 1}))
56 | print('Obtained ID list of length {}.'.format(len(new_old_ids)))
57 | # Process quotes using a pool of executors
58 | pool = Pool(processes=poolsize)
59 | pool.map(parse_chunks, chunker(new_old_ids, chunksize=chunksize))
60 | pool.close()
61 |
62 |
63 | if __name__ == '__main__':
64 | parser = argparse.ArgumentParser()
65 | parser.add_argument('--db', type=str, default='mediaTracker', help="Database name")
66 | parser.add_argument('--oldcol', type=str, default='media', help="Existing collection name")
67 | parser.add_argument('--newcol', type=str, default='entitiesAnnotated', help="New collection name")
68 | parser.add_argument("--poolsize", type=int, default=cpu_count() + 1, help="Size of the concurrent process pool for the given task")
69 | parser.add_argument("--chunksize", type=int, default=100, help="Number of articles IDs per chunk being processed concurrently")
70 | args = vars(parser.parse_args())
71 |
72 | # From config
73 | MONGO_ARGS = config['MONGO_ARGS']
74 | # Parse arguments
75 | DB_NAME = args['db']
76 | EXISTING_COL = args['oldcol']
77 | NEW_COL = args['newcol']
78 | poolsize = args['poolsize']
79 | chunksize = args['chunksize']
80 |
81 | run_pool(poolsize, chunksize)
82 | print("Finished merging collections!")
83 |
--------------------------------------------------------------------------------
/nlp/french/requirements.txt:
--------------------------------------------------------------------------------
1 | requests>=2.28.1
2 | Levenshtein>=0.16.0
3 | pandas>=1.5.3,<1.6.0
4 | pymongo>=3.12.0,<4.0.0
5 | pydantic<2.0.0
6 | spacy==3.2.5
7 | fr-core-news-lg @ https://github.com/explosion/spacy-models/releases/download/fr_core_news_lg-3.2.0/fr_core_news_lg-3.2.0-py3-none-any.whl
8 | coreferee==1.3.1
9 | coreferee-model-fr @ https://github.com/msg-systems/coreferee/raw/master/models/coreferee_model_fr.zip
10 | statsmodels>=0.12.2
11 |
--------------------------------------------------------------------------------
/nlp/french/rules/author_blocklist.txt:
--------------------------------------------------------------------------------
1 | Janvier
2 | Février
3 | Fevrier
4 | Mars
5 | Avril
6 | Mai
7 | Juin
8 | Juillet
9 | Août
10 | Aout
11 | Septembre
12 | Octobre
13 | Novembre
14 | Décembre
15 | Decembre
16 | Lundi
17 | Mardi
18 | Mercredi
19 | Jeudi
20 | Vendredi
21 | Samedi
22 | Dimanche
23 | Mise À Jour
24 | Mis À Jour
25 | Agence France-Presse
26 | Afp
27 | Associated Press
28 | La Presse Canadienne
29 | La Presse
30 | Et
31 | Est
32 | Que Je Vous Souhaite
33 | À
34 | Tva Nouvelles
35 | Journal De Montréal
36 | Journal De Montreal
37 | Le Droit
38 | Le Soleil
39 | Agence Qmi
40 | Le Quotidien
41 | À Ottawa
42 | Correspondante
43 | Professeur
44 | Linguiste
45 | Les
46 | Comme
47 | Publié
48 | Nombre
49 | Commentaires
50 | Capture D'Écran
51 | Ici.Radio-Canada.Ca
52 | Radio-Canada.Ca
53 | Radio
54 | Canada.ca
55 | Canada.Ca
56 | De Vie
57 | Où
58 |
--------------------------------------------------------------------------------
/nlp/french/rules/name_patterns.jsonl:
--------------------------------------------------------------------------------
1 | {"label": "LOC", "pattern": "Niagara Falls"}
2 | {"label": "LOC", "pattern": "Rogers Place"}
3 | {"label": "LOC", "pattern": "Preah Sihanouk"}
4 | {"label": "LOC", "pattern": "Nova Scotia"}
5 | {"label": "LOC", "pattern": "Don Mills"}
6 | {"label": "LOC", "pattern": "Maple Ridge"}
7 | {"label": "LOC", "pattern": "Kneehill County"}
8 | {"label": "LOC", "pattern": "La Loche"}
9 | {"label": "LOC", "pattern": "Alberta Parks"}
10 | {"label": "LOC", "pattern": "Sioux Lookout"}
11 | {"label": "LOC", "pattern": "Rio de Janeiro"}
12 | {"label": "LOC", "pattern": "Quintana Roo"}
13 | {"label": "LOC", "pattern": "High Level"}
14 | {"label": "LOC", "pattern": "Red Deer"}
15 | {"label": "LOC", "pattern": [{"LOWER": "saint"}, {"LOWER": "john"}]}
16 | {"label": "LOC", "pattern": "Yves Paradis de la Cour du Québec"}
17 | {"label": "LOC", "pattern": "Québec"}
18 | {"label": "LOC", "pattern": "Ouje Bougoumou"}
19 | {"label": "ORG", "pattern": "OC Transpo"}
20 | {"label": "ORG", "pattern": "Rystad Energy"}
21 | {"label": "ORG", "pattern": "San Marcos"}
22 | {"label": "ORG", "pattern": "Yeni Safak"}
23 | {"label": "ORG", "pattern": "New Yorker"}
24 | {"label": "ORG", "pattern": "Maple Leafs"}
25 | {"label": "ORG", "pattern": "Canada Goose"}
26 | {"label": "ORG", "pattern": "Moose Jaw"}
27 | {"label": "ORG", "pattern": "Tim Hortons"}
28 | {"label": "ORG", "pattern": "Irving Oil"}
29 | {"label": "ORG", "pattern": "Kinder Morgan"}
30 | {"label": "ORG", "pattern": "Der Spiegel"}
31 | {"label": "ORG", "pattern": "Husky Energy"}
32 | {"label": "ORG", "pattern": "Nesbitt Burns"}
33 | {"label": "ORG", "pattern": "Royal LePage"}
34 | {"label": "ORG", "pattern": "Royal Lepage"}
35 | {"label": "ORG", "pattern": "Accueil Bonneau"}
36 | {"label": "ORG", "pattern": "Dickinson Wright"}
37 | {"label": "ORG", "pattern": "Taquan Air"}
38 | {"label": "ORG", "pattern": "Salmar Theatres"}
39 | {"label": "ORG", "pattern": "Walt Disney"}
40 | {"label": "ORG", "pattern": "McCarthy Tétrault"}
41 | {"label": "ORG", "pattern": "Helsingin Sanomat"}
42 | {"label": "ORG", "pattern": "Tk'emlúps te Secwépemc"}
43 | {"label": "ORG", "pattern": "Tk emlúps te Secwepemc"}
44 | {"label": "ORG", "pattern": "Tk emlups te Secwepemc First Nation"}
45 | {"label": "ORG", "pattern": "Tk'emlups te Secwepemc First Nation"}
46 | {"label": "ORG", "pattern": [{"LOWER": "la presse"}]}
47 | {"label": "ORG", "pattern": [{"LOWER": "la presse canadienne"}]}
48 | {"label": "ORG", "pattern": [{"LOWER": "ctvnews"}]}
49 | {"label": "ORG", "pattern": [{"LOWER": "ctvnews.ca"}]}
50 | {"label": "ORG", "pattern": [{"LOWER": "b"}, {"LOWER": "nai"}, {"LOWER": "brith"}]}
51 | {"label": "ORG", "pattern": [{"LOWER": "b'nai"}, {"LOWER": "brith"}]}
52 | {"label": "ORG", "pattern": "Cordé Électrique"}
53 | {"label": "ORG", "pattern": "Cogeco Connexion"}
54 | {"label": "ORG", "pattern": "Delpharm Industrie"}
55 | {"label": "ORG", "pattern": "Unither Bioélectronique"}
56 | {"label": "ORG", "pattern": "Systèmes Danfreight"}
57 | {"label": "ORG", "pattern": "Résilience Montréal"}
58 | {"label": "ORG", "pattern": "Altshuler Berzon"}
59 | {"label": "ORG", "pattern": "Odgers Berndtson"}
60 | {"label": "ORG", "pattern": "Puamun Meshkenu"}
61 | {"label": "ORG", "pattern": "Guylaine Desforges"}
62 | {"label": "ORG", "pattern": [{"LOWER": "affaire"}, {"TEXT": {"REGEX": "[A-Za-z]+"}}]}
63 | {"label": "ORG", "pattern": [{"LOWER": "vkousno"}, {"LOWER": "i"}, {"LOWER": "totchka"}]}
64 | {"label": "MISC", "pattern": [{"LOWER": "wakanda"}, {"LOWER": "forever"}]}
65 | {"label": "MISC", "pattern": [{"LOWER": "mi'kmaw"}]}
66 | {"label": "MISC", "pattern": "Bergdorf Goodman"}
67 | {"label": "MISC", "pattern": "Yuk Yuk"}
68 | {"label": "MISC", "pattern": [{"LOWER": "wet'suwet'en"}]}
69 | {"label": "MISC", "pattern": "Vuntut Gwitchin"}
70 | {"label": "MISC", "pattern": "Wahbung Abinoonjiiag"}
71 | {"label": "MISC", "pattern": "Manitoba Keewatinowi Okimakanak"}
72 | {"label": "MISC", "pattern": "Keewatinowi Okimakanak"}
73 | {"label": "MISC", "pattern": "Selon"}
74 | {"label": "MISC", "pattern": "Brent de la mer du Nord"}
75 | {"label": "MISC", "pattern": "Père Noël"}
76 | {"label": "PER", "pattern": "Caroline"}
77 | {"label": "PER", "pattern": "Virginie"}
78 | {"label": "PER", "pattern": "Georges Washington"}
--------------------------------------------------------------------------------
/research_dashboard/admin/apps/topsources.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import dash
3 | from dash import dcc, html
4 | from dash.dependencies import Input, Output, State
5 | from pymongo import MongoClient
6 | from server import app
7 | from config import config
8 |
9 | MONGO_ARGS = config['MONGO_ARGS']
10 | SOURCES_DB = config['DB']['SOURCES_DB']
11 | SOURCES_COL = config['DB']['SOURCES_COL']
12 |
13 |
14 | # ========== Functions ================
15 | def get_doc_ids_from_db():
16 | with MongoClient(**MONGO_ARGS) as connection:
17 | read_collection = connection[SOURCES_DB][SOURCES_COL]
18 | _ids = read_collection.find().distinct('_id')
19 | return sorted(_ids)
20 |
21 |
22 | def num2str_month(date_str):
23 | date_obj = datetime.datetime.strptime(date_str, "%Y%m")
24 | date_string = datetime.datetime.strftime(date_obj, "%B %Y")
25 | return date_string
26 |
27 |
28 | def format_dates(_ids):
29 | date_strings = [num2str_month(_id) for _id in _ids]
30 | return date_strings
31 |
32 |
33 | def get_top_n_words(topic_dict, n=5):
34 | """Return a list of top-n words for each topic. This list can
35 | then be used as an axis label if required.
36 | """
37 | top_words = []
38 | for num, data in topic_dict.items():
39 | sorted_words = {k: v for k, v in sorted(data['words'].items(),
40 | key=lambda x: x[1],
41 | reverse=True
42 | )}
43 | words = sorted_words.keys()
44 | top_n_words = list(words)[:n]
45 | top_words.append(', '.join(top_n_words))
46 | return top_words
47 |
48 |
49 | def list_topic_words(topic_dict):
50 | """Return a full list of words for a particular topic"""
51 | sorted_words = {k: v for k, v in sorted(topic_dict['words'].items(),
52 | key=lambda x: x[1],
53 | reverse=True
54 | )}
55 | words = sorted_words.keys()
56 | top_n_words = list(words)
57 | top_words = ', '.join(top_n_words)
58 | return top_words
59 |
60 |
61 | # ========== App Layout ================
62 |
63 | def layout():
64 | """Dynamically serve a layout based on updated DB values (for dropdown menu)"""
65 | # Needs db connection! (Set up tunnel if testing app locally)
66 | _ids = get_doc_ids_from_db()
67 | dropdown_dates = {num2str_month(_id): _id for _id in _ids}
68 |
69 | children_list = [
70 | html.Div([
71 | html.Div([
72 | html.H3('Write observations for monthly top sources by gender'),
73 | dcc.Markdown('''
74 | This app allows a user to write observations and comments for a particular month's top quoted
75 |                     sources. The text that is written is then saved to the database, and displayed on the [top sources
76 | dashboard app](https://gendergaptracker.research.sfu.ca/apps/topsources).
77 | '''),
78 | ]),
79 | html.H4('Topic month'),
80 | html.P('''
81 | Select the topic month from the dropdown to inspect/update the word distributions for
82 | that month.
83 | '''),
84 | html.Div(
85 | dcc.Loading(
86 | id='load-data-progress',
87 | children=[
88 | dcc.Store(id='top-sources-stats'),
89 | ])
90 | ),
91 | dcc.Dropdown(
92 | id='date-dropdown',
93 | options=[
94 | {'label': date_str, 'value': date_num}
95 | for date_str, date_num in dropdown_dates.items()
96 | ],
97 | value=_ids[-1],
98 | style={'text-align': 'center'}
99 | ),
100 | html.Br(),
101 | html.Label([
102 | html.A('Markdown syntax', href='https://www.markdownguide.org/basic-syntax/'),
103 | ]),
104 | html.P('''
105 | The text box below accepts Markdown syntax for embedding URLs: [Highlighted text](https://example.com).
106 |                     Make sure external URLs include the 'http' or 'https' prefix, as shown in the
107 | example.
108 | '''),
109 |
110 | html.Div(id='create-text-input'),
111 | html.Div([html.Button(id='write-button', n_clicks=0, children='Save entries')],
112 | style={'display': 'flex', 'justifyContent': 'center'}),
113 | dcc.Loading(
114 | id='write-progress',
115 | children=[
116 | html.P(id='push-comment-fields')
117 | ], type='default'
118 | )
119 | ])
120 | ]
121 | return children_list
122 |
123 |
124 | # ========== Callbacks ================
125 | @app.callback(Output('top-sources-stats', 'data'), [Input('date-dropdown', 'value')])
126 | def get_monthly_stats(value):
127 | with MongoClient(**MONGO_ARGS) as connection:
128 | read_collection = connection[SOURCES_DB][SOURCES_COL]
129 | stats = read_collection.find({'_id': value})
130 | # Collect top sources stats
131 | stats = list(stats)[0]
132 | return stats
133 |
134 |
135 | @app.callback(Output('create-text-input', 'children'), [Input('top-sources-stats', 'data')])
136 | def create_text_input(stats):
137 | comment = stats['comment']
138 | # Return the text area with existing comment (if any)
139 | inp_box = html.Div(
140 | dcc.Textarea(
141 | id='text-input',
142 | placeholder="Enter your comments/observations for the selected month's top sources",
143 | value=comment,
144 | className='textarea',
145 | style={
146 | 'width': '100%', 'height': 350, 'verticalAlign': 'top',
147 | 'fontFamily': 'Arial', 'fontColor': '#515151',
148 | }
149 | ),
150 | style={'display': 'flex', 'justifyContent': 'center'}
151 |     )
152 | return inp_box
153 |
154 |
155 | @app.callback(Output('push-comment-fields', 'children'),
156 | [Input('write-button', 'n_clicks'),
157 | Input('date-dropdown', 'value'),
158 | Input('top-sources-stats', 'data')],
159 | [State('text-input', 'value')])
160 | def update_db(n_clicks, date_id, stats, comment):
161 | """Check if write-button is clicked, only then update DB"""
162 | ctx = dash.callback_context
163 | if "write-button" in ctx.triggered[0]["prop_id"]:
164 | with MongoClient(**MONGO_ARGS) as connection:
165 | collection = connection[SOURCES_DB][SOURCES_COL]
166 | # Overwrite existing topic names with new user-entered names
167 | stats['comment'] = comment
168 | # Write topics
169 | collection.find_one_and_update({'_id': date_id}, {'$set': stats})
170 |         return "Updated user comments/observations in the database."
171 |
--------------------------------------------------------------------------------
/research_dashboard/admin/apps/unknownsources.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import pandas as pd
3 | from dash import dcc, html, dash_table
4 | from dash.dependencies import Input, Output
5 | from pymongo import MongoClient
6 | from server import app, logger
7 | from config import config
8 |
9 | MONGO_ARGS = config['MONGO_ARGS']
10 | SOURCES_DB = config['DB']['SOURCES_DB']
11 | SOURCES_COL = config['DB']['SOURCES_COL']
12 | NUM_SOURCES_TO_SHOW = 20
13 |
14 |
15 | # ========== Functions ================
16 |
17 | def get_doc_ids_from_db():
18 | with MongoClient(**MONGO_ARGS) as connection:
19 | read_collection = connection[SOURCES_DB][SOURCES_COL]
20 | _ids = read_collection.find().distinct('_id')
21 | return sorted(_ids)
22 |
23 |
24 | def num2str_month(date_str):
25 | date_obj = datetime.datetime.strptime(date_str, "%Y%m")
26 | date_string = datetime.datetime.strftime(date_obj, "%B %Y")
27 | return date_string
28 |
29 |
30 | def format_dates(_ids):
31 | date_strings = [num2str_month(_id) for _id in _ids]
32 | return date_strings
33 |
34 |
35 | def get_unknown_sources(stats):
36 | """Convert JSON object of top sources to pandas DataFrame"""
37 | top_unknown = pd.DataFrame(stats['topUnknownSources'])
38 | top_unknown.columns = ['unknown_count', 'unknown_names']
39 | top_unknown['unknown_count'] = top_unknown['unknown_count'].astype('int')
40 | df = (top_unknown.sort_values(by='unknown_count', ascending=False)
41 | .iloc[:NUM_SOURCES_TO_SHOW, :]
42 | .reset_index(drop=True))
43 | output = df.to_dict(orient='records')
44 | return output
45 |
46 |
47 | # ========== App Layout ================
48 |
49 | def layout():
50 | """Dynamically serve a layout based on updated DB values (for dropdown menu)"""
51 | # Needs db connection! (Set up tunnel if testing app locally)
52 | _ids = get_doc_ids_from_db()
53 | dropdown_dates = {num2str_month(_id): _id for _id in _ids}
54 |
55 | children_list = [
56 | html.Div([
57 | html.Div([
58 | html.H3('View unknown sources'),
59 | dcc.Markdown('''
60 | This app allows a user to inspect the top unknown sources extracted for a
61 | particular month. The reason we obtain unknown sources is twofold—sometimes,
62 | spaCy incorrectly tags an organization or geopolitical entity (i.e., location) as
63 | a person, leading to the gender service erring on the side of caution and not
64 | assigning a gender. In other cases, a person's name is ambiguous, or is non-standard
65 | (i.e., non-western or non-anglicized), so the gender services we use are unaware of
66 | these names' genders.
67 |
68 | Inspect the list of unknown sources for a given month by choosing a
69 | month from the dropdown menu.
70 | '''),
71 | ]),
72 | dcc.Dropdown(
73 | id='date-dropdown',
74 | options=[
75 | {'label': date_str, 'value': date_num}
76 | for date_str, date_num in dropdown_dates.items()
77 | ],
78 | value=_ids[-1],
79 | style={'text-align': 'center'}
80 | ),
81 | html.Div(dcc.Store(id='top-sources-stats-2')),
82 | html.Br(),
83 | html.Div(
84 | dash_table.DataTable(
85 | id='unknown-sources-table',
86 | columns=[
87 | {'name': 'Count', 'id': 'unknown_count'},
88 | {'name': 'Unknown sources', 'id': 'unknown_names'},
89 | ],
90 | style_table={'overflowX': 'auto'},
91 | style_cell={
92 | 'backgroundColor': 'rgba(102, 204, 204, 0.05)',
93 | 'textAlign': 'left',
94 | 'font_family': 'Arial',
95 | },
96 | style_data={'height': 'auto', 'lineHeight': '30px'},
97 | style_cell_conditional=[
98 | {
99 | 'if': {'column_id': 'unknown_count'},
100 | 'minWidth': '100px',
101 | 'width': '100px',
102 | 'maxWidth': '100px',
103 | },
104 | ],
105 | style_header={
106 | 'backgroundColor': 'rgb(255, 255, 255)',
107 | 'text-align': 'left',
108 | },
109 | style_as_list_view=True,
110 | )
111 | ),
112 | dcc.Markdown('''
113 | #### 1. Fix spaCy NER rules
114 | To address incorrect spaCy tags, we add a rule to the below file:
115 | [`WomenInMedia/NLP/main/rules/name_patterns.jsonl`](https://github.com/maitetaboada/WomenInMedia/blob/master/NLP/main/rules/name_patterns.jsonl)
116 |
117 | The below tags are defined for now (others can be added as required):
118 | * `GPE`: Countries, cities, states, famous landmarks
119 | * `ORG`: Companies, agencies, institutions, etc.
120 | * `FAC`: Buildings, airports, highways, bridges, etc.
121 | * `NORP`: Nationalities or religious or political groups.
122 | * `EVENT`: Named hurricanes, battles, wars, sports events, etc.
123 |
124 | For a full list of tags, see the [spaCy documentation](https://spacy.io/api/annotation#named-entities).
125 |
126 | #### 2. Update manual gender cache
127 |         Alternatively, for names that do belong to a person but are ambiguous, we can update the
128 | manual gender cache (`genderCache/manual`). This is done by populating a CSV file
129 | with the correct gender for each person's name and running the manual cache update script:
130 | [`WomenInMedia/NLP/experiments/genderCache/manual_cache`](https://github.com/maitetaboada/WomenInMedia/tree/master/NLP/experiments/genderCache/manual_cache)
131 | ''')
132 | ])
133 | ]
134 | return children_list
135 |
136 |
137 | # ========== Callbacks ================
138 |
139 | @app.callback(Output('top-sources-stats-2', 'data'), [Input('date-dropdown', 'value')])
140 | def get_monthly_stats(value):
141 | with MongoClient(**MONGO_ARGS) as connection:
142 | read_collection = connection[SOURCES_DB][SOURCES_COL]
143 | stats = read_collection.find({'_id': value})
144 | # Collect top sources stats
145 | stats = list(stats)[0]
146 | return stats
147 |
148 |
149 | @app.callback(Output('unknown-sources-table', 'data'), [Input('top-sources-stats-2', 'data')])
150 | def get_unknown_sources_data(stats):
151 | try:
152 | output = get_unknown_sources(stats)
153 | logger.info(f'Obtained unknown sources of length {len(output)}')
154 | except Exception as e:
155 |         logger.error(f"Unknown sources app error: {e}")
156 | output = []
157 | return output
158 |
159 |
160 |
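
The help text in this app's layout refers to adding rules to `name_patterns.jsonl`. In spaCy's `EntityRuler` format (the same format loaded in `research_dashboard/server.py`), each rule is a single JSON object per line with a `label` and a `pattern`. The entries below are illustrative examples only, not taken from the project's actual rules file:

```sh
# Append illustrative EntityRuler rules (made-up example strings) to the patterns file
echo '{"label": "ORG", "pattern": "Public Health Agency of Canada"}' >> name_patterns.jsonl
echo '{"label": "GPE", "pattern": "Nunavut"}' >> name_patterns.jsonl
```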
--------------------------------------------------------------------------------
/research_dashboard/admin/auth.py:
--------------------------------------------------------------------------------
1 | credentials = {
2 | 'admin': 'admin_password'
3 | }
4 |
--------------------------------------------------------------------------------
/research_dashboard/admin/config.py:
--------------------------------------------------------------------------------
1 | config = {
2 | 'MONGO_ARGS': {
3 | 'host': ['mongo0', 'mongo1', 'mongo2'],
4 | # 'host': 'localhost',
5 | 'port': 27017,
6 | 'username': 'username',
7 | 'password': 'password',
8 | 'authSource': 'admin',
9 | 'readPreference': 'primaryPreferred'
10 | },
11 | 'DB': {
12 | 'READ_DB': 'topicModel',
13 | 'READ_COL': 'topicResults',
14 | 'SOURCES_DB': 'mediaTracker',
15 | 'SOURCES_COL': 'monthlySources',
16 | 'GENDER_DB': 'genderCache',
17 | 'MANUAL_NAME_COL': 'manual',
18 | 'FIRST_NAME_COL': 'firstNamesCleaned',
19 | }
20 | }
--------------------------------------------------------------------------------
/research_dashboard/admin/run.py:
--------------------------------------------------------------------------------
1 | from dash import dcc, html
2 | from dash.dependencies import Input, Output
3 | from server import app, server
4 | from apps import topiclabels, topsources, unknownsources, updatecache
5 |
6 |
7 | box_style = {
8 | 'padding': '10px 10px 5px 5px',
9 | 'marginLeft': 'auto', 'marginRight': 'auto',
10 | }
11 |
12 | # Define the main app's layout
13 | app.layout = html.Div([
14 | dcc.Location(id='url', refresh=False),
15 | html.Div(id='page-content')
16 | ])
17 |
18 | # Layout for text on home page
19 | home_page = [
20 | html.Div(children=[
21 | html.H2("Write to production database"),
22 | dcc.Markdown("""
23 | This is an admin dashboard that allows write access to our production MongoDB database
24 |         containing data from the Gender Gap Tracker. Any GUI-based service that allows a user to
25 |         write to the database can be included as a separate application through this dashboard
26 |         structure. Extend the available functionality by adding new apps to the menu shown above.
27 |
28 | Contact: Maite Taboada, [mtaboada@sfu.ca](mailto:mtaboada@sfu.ca)
29 | """
30 | ),
31 | html.P(['© 2021 ', html.A('Discourse Processing Lab.', href='https://www.sfu.ca/discourse-lab')],
32 | style={'font-size': '0.8em', 'color': '#a0a0a0'}
33 | )
34 | ])
35 | ]
36 |
37 |
38 | def get_page_divs(page_layout):
39 | page = html.Div(children=[
40 | html.Div(
41 | children=[html.Table(
42 | html.Tr(
43 | [html.Td(html.Img(src="/static/SFULogo.png", style={'padding': '10px 10px 5px 5px', 'height': '50px', 'width': '165px'}))] +
44 | [html.Td(html.Img(src="/static/discourse-lab-logo.jpeg", style={'padding': '10px 10px 5px 5px', 'height': '100px', 'width': '165px'}))] +
45 | [html.Td(html.H2("Measuring gender bias in media"))]
46 | )
47 | )], className='mainheader'),
48 | html.Br(),
49 | html.Div(
50 | children=[
51 | html.Div([
52 | dcc.Link('Home', href='/'),
53 | dcc.Link('Topic Model Labelling', href='/apps/topiclabels'),
54 | dcc.Link('Top sources: Comments', href='/apps/topsources'),
55 | dcc.Link('Unknown gender sources', href='/apps/unknownsources'),
56 | dcc.Link('Update gender cache', href='/apps/updatecache'),
57 | ], className='menu')
58 | ]),
59 | html.Div(children=page_layout, className='main', style={'text-align': 'justify'}),
60 | ], className='container')
61 | return page
62 |
63 |
64 | @app.callback(Output('page-content', 'children'),
65 | [Input('url', 'pathname')])
66 | def display_page(pathname):
67 | if pathname == '/apps/topiclabels':
68 | return get_page_divs(topiclabels.layout())
69 | elif pathname == '/apps/topsources':
70 | return get_page_divs(topsources.layout())
71 | elif pathname == '/apps/unknownsources':
72 | return get_page_divs(unknownsources.layout())
73 | elif pathname == '/apps/updatecache':
74 | return get_page_divs(updatecache.layout())
75 | else:
76 | return get_page_divs(home_page)
77 |
78 |
79 | if __name__ == '__main__':
80 | # app.run_server(host='0.0.0.0', port=8050, dev_tools_ui=False, threaded=True, debug=True)
81 | app.run_server(host='0.0.0.0', port=8050, debug=True)
--------------------------------------------------------------------------------
/research_dashboard/admin/server.py:
--------------------------------------------------------------------------------
1 | import flask
2 | import os
3 | import sys
4 | import dash
5 | # Logging
6 | import logging
7 | from logging.handlers import RotatingFileHandler
8 | # auth.py simply contains a dictionary {'username': 'password'} that is used
9 | # for basic HTTP authentication
10 | import dash_auth
11 | from auth import credentials
12 |
13 | server = flask.Flask(__name__)
14 | server.secret_key = os.urandom(24)
15 |
16 | app = dash.Dash(__name__, server=server, suppress_callback_exceptions=True)
17 | app.css.config.serve_locally = True
18 | app.scripts.config.serve_locally = True
19 | app.title = "Write data to research dashboard - GGT"
20 | # authentication
21 | authorize = dash_auth.BasicAuth(app, credentials)
22 |
23 |
24 | def create_app_logger(filename):
25 | """Logger format and timed handling"""
26 | logger = logging.getLogger(filename)
27 | logger.setLevel(logging.DEBUG)
28 | formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
29 | os.makedirs("logs", exist_ok=True)
30 | rotateHandler = RotatingFileHandler('logs/' + "g-tracker-admin-api.log",
31 | mode='a', maxBytes=1000, backupCount=3)
32 | rotateHandler.setFormatter(formatter)
33 | stream = logging.StreamHandler(sys.stdout)
34 | stream.setFormatter(formatter)
35 |
36 | logger.addHandler(rotateHandler)
37 | logger.addHandler(stream)
38 | return logger
39 |
40 |
41 | logger = create_app_logger('adminDashLogger')
--------------------------------------------------------------------------------
/research_dashboard/admin/static/SFULogo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sfu-discourse-lab/GenderGapTracker/5501de31e8598e18ace47982220e91961ca5460a/research_dashboard/admin/static/SFULogo.png
--------------------------------------------------------------------------------
/research_dashboard/admin/static/discourse-lab-logo.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sfu-discourse-lab/GenderGapTracker/5501de31e8598e18ace47982220e91961ca5460a/research_dashboard/admin/static/discourse-lab-logo.jpeg
--------------------------------------------------------------------------------
/research_dashboard/aliases.txt:
--------------------------------------------------------------------------------
1 | Primary name, alias1, alias2, ...
2 | Rahaf Mohammed al Qunun, Rahaf Mohammed Alqunun, Rahaf Mohammed
3 | Queen Elizabeth II, Queen Elizabeth
4 | Sarah Huckabee Sanders, Sarah Sanders
5 | Michelle Rempel Garner, Michelle Rempel
6 | Francois Philippe Champagne, Francois Philippe
7 | Bill de Blasio, Mayor Bill de Blasio
8 | Volodymyr Zelensky, Volodymyr Zelenskiy, Volodymyr Zelenskyy
9 | Svetlana Tikhanovskaya, Sviatlana Tsikhanouskaya
10 |
--------------------------------------------------------------------------------
/research_dashboard/apps/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sfu-discourse-lab/GenderGapTracker/5501de31e8598e18ace47982220e91961ca5460a/research_dashboard/apps/__init__.py
--------------------------------------------------------------------------------
/research_dashboard/assets/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sfu-discourse-lab/GenderGapTracker/5501de31e8598e18ace47982220e91961ca5460a/research_dashboard/assets/favicon.ico
--------------------------------------------------------------------------------
/research_dashboard/config.py:
--------------------------------------------------------------------------------
1 | host = ["mongo0", "mongo1", "mongo2"]
2 | # host = "localhost"
3 |
4 | config = {
5 | "MONGO_ARGS": {
6 | "host": host,
7 | "port": 27017,
8 | "username": "username",
9 | "password": "password",
10 | "authSource": "admin",
11 | "readPreference": "primaryPreferred",
12 | },
13 | "DB": {
14 | "READ_DB": "topicModel",
15 | "READ_COL": "topicResults",
16 | "SOURCES_DB": "mediaTracker",
17 | "SOURCES_COL": "monthlySources",
18 | "SOURCES_TIME_SERIES_COL": "monthlySourcesTimeSeries",
19 | },
20 | "GENDER_RECOGNITION": {
21 | "GENDERIZE_ENABLED": False,
22 | "GENDERAPI_ENABLED": True,
23 | "GENDERAPI_TOKEN": "JSON_AUTH_TOKEN",
24 | "MANUAL_CACHE": "manual",
25 | "GENDERAPI_CACHE": "genderAPICleaned",
26 | "GENDERIZE_CACHE": "genderizeCleaned",
27 | "FIRSTNAME_CACHE": "firstNamesCleaned",
28 | },
29 | "NLP": {
30 | "MAX_BODY_LENGTH": 20000,
31 | "AUTHOR_BLOCKLIST": "../nlp/english/rules/author_blocklist.txt",
32 | "NAME_PATTERNS": "../nlp/english/rules/name_patterns.jsonl",
33 | "QUOTE_VERBS": "../nlp/english/rules/quote_verb_list.txt",
34 | },
35 | "ENGLISH_OUTLETS": [
36 | "CBC News",
37 | "CTV News",
38 | "Global News",
39 | "Huffington Post",
40 | "National Post",
41 | "The Globe And Mail",
42 | "The Star",
43 | ],
44 | "FRENCH_OUTLETS": [
45 | "Journal De Montreal",
46 | "La Presse",
47 | "Le Devoir",
48 | "Le Droit",
49 | "Radio Canada",
50 | "TVA News",
51 | ],
52 | }
53 |
--------------------------------------------------------------------------------
/research_dashboard/run.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 | from dash import dcc, html
3 | from dash.dependencies import Input, Output
4 |
5 | from server import app, server
6 | from apps import (
7 | textanalyzer,
8 | topicmodel,
9 | topsources,
10 | topsourcetrends,
11 | articlecounts,
12 | dailywomenenglish,
13 | )
14 |
15 | box_style = {
16 | 'padding': '10px 10px 5px 5px',
17 | 'marginLeft': 'auto', 'marginRight': 'auto',
18 | }
19 |
20 | # Define the main app's layout
21 | app.layout = html.Div([
22 | dcc.Location(id='url', refresh=False),
23 | html.Div(id='page-content')
24 | ])
25 |
26 | # Layout for text on home page
27 | home_page = [
28 | html.Div(children=[
29 | html.Br(),
30 | dcc.Markdown(
31 | """
32 | This research dashboard showcases results from our study on gender bias in
33 | the media. We present the [Gender Gap Tracker
34 | (GGT)](https://gendergaptracker.informedopinions.org/), an automated system
35 | that measures men and women’s voices on seven major Canadian news outlets in
36 | real time. We analyze the rich information in news articles using Natural
37 | Language Processing (NLP) and quantify the discrepancy in proportions of men
38 | and women quoted. Our larger goals through this project are to enhance
39 | awareness of women’s portrayal in public discourse through hard evidence,
40 | and to encourage news organizations to provide a more diverse set of voices
41 | in their reporting.
42 |
43 | The Gender Gap Tracker is a collaboration between [Informed
44 | Opinions](https://informedopinions.org/), a non-profit dedicated to
45 |             amplifying women’s voices in media, and Simon Fraser University, through the
46 |             [Discourse Processing Lab](https://www.sfu.ca/discourse-lab.html) and the
47 | [Big Data Initiative](https://www.sfu.ca/big-data/big-data-sfu).
48 |
49 | See our peer-reviewed publications for more detailed technical information
50 | on our methodology:
51 |
52 | 1. Asr FT, Mazraeh M, Lopes A, Gautam V, Gonzales J, Rao P, Taboada M.
53 | (2021) The Gender Gap Tracker: Using Natural Language Processing to
54 | measure gender bias in media. *PLoS ONE 16(1):e0245533*.
55 | https://doi.org/10.1371/journal.pone.0245533
56 | 2. Rao P, Taboada M. (2021), Gender bias in the news: A scalable topic
57 | modelling and visualization framework. *Frontiers in Artificial
58 | Intelligence, 4(82)*. https://doi.org/10.3389/frai.2021.664737
59 |
60 | All of our code for scraping, NLP, topic modelling and data visualization is
61 | publicly available on GitHub so that others can benefit from the
62 | methodology:
63 | https://github.com/sfu-discourse-lab/GenderGapTracker
64 |
65 | For more information about the research methodology and for questions
66 | regarding collaboration, please contact Maite Taboada at
67 | [mtaboada@sfu.ca](mailto:mtaboada@sfu.ca).
68 | """
69 | ),
70 | html.P(
71 | [
72 | f'© {datetime.today().year} ',
73 | html.A('Discourse Processing Lab',
74 | href='https://www.sfu.ca/discourse-lab'),
75 | ], style={'font-size': '0.8em', 'color': '#a0a0a0'}
76 | )
77 | ])
78 | ]
79 |
80 |
81 | def get_page_divs(page_layout, enable_footer=True):
82 | page = html.Div(children=[
83 | html.Div(
84 | children=[html.Table(
85 | html.Tr(
86 | [html.Td(html.Img(src="/static/SFULogo.png", style={'padding': '10px 10px 5px 5px', 'height': '50px', 'width': '165px'}))] +
87 | [html.Td(html.Img(src="/static/discourse-lab-logo.jpeg", style={'padding': '10px 10px 5px 5px', 'height': '100px', 'width': '165px'}))] +
88 | [html.Td(html.H3("Measuring gender bias in media"))]
89 | )
90 | )], className='mainheader'),
91 | html.Br(),
92 | html.Div(
93 | children=[
94 | html.Div([
95 | dcc.Link('Home', href='/'),
96 | dcc.Link('Text analyzer', href='/apps/textanalyzer'),
97 | dcc.Link('Topic models', href='/apps/topicmodel'),
98 | dcc.Link('Top women and men quoted', href='/apps/topsources'),
99 | dcc.Link('Monthly trends: People quoted', href='/apps/topsourcetrends'),
100 | dcc.Link('Daily % women quoted', href='/apps/dailywomenenglish'),
101 | dcc.Link('Weekly article counts', href='/apps/articlecounts'),
102 | ], className='menu')
103 | ]),
104 | html.Div(children=page_layout, className='main'),
105 | html.Div(children=case_footer(enable_footer))
106 | ], className='container')
107 | return page
108 |
109 |
110 | def case_footer(enable_footer):
111 | if enable_footer:
112 | footer = html.Div(
113 | children=[html.Table(
114 | html.Tr(
115 | [html.Td(html.Img(src="/static/SFULogo.png", style={'height': '30px', 'width': '120px'}))] +
116 | [html.Td(html.Img(src="/static/discourse-lab-logo.jpeg", style={'height': '60px', 'width': '100px'}))] +
117 | [html.Td(html.Div(html.P([f"© {datetime.today().year} Discourse Processing Lab."])))]
118 | )
119 | )
120 |             ], className='mainfooter')
121 | else:
122 | footer = html.Div([])
123 | return footer
124 |
125 |
126 | @app.callback(Output('page-content', 'children'),
127 | [Input('url', 'pathname')])
128 | def display_page(pathname):
129 | if pathname == '/apps/textanalyzer':
130 | return get_page_divs(textanalyzer.layout())
131 | elif pathname == '/apps/topicmodel':
132 | return get_page_divs(topicmodel.layout())
133 | elif pathname == '/apps/topsources':
134 | return get_page_divs(topsources.layout())
135 | elif pathname == '/apps/topsourcetrends':
136 | return get_page_divs(topsourcetrends.layout())
137 | elif pathname == '/apps/dailywomenenglish':
138 | return get_page_divs(dailywomenenglish.layout())
139 | elif pathname == '/apps/articlecounts':
140 | return get_page_divs(articlecounts.layout())
141 | else:
142 | return get_page_divs(home_page, enable_footer=False)
143 |
144 |
145 | if __name__ == '__main__':
146 | app.run_server(host='0.0.0.0', port=8050, debug=True)
147 |
--------------------------------------------------------------------------------
/research_dashboard/server.py:
--------------------------------------------------------------------------------
1 | import flask
2 | import os
3 | import dash
4 | import dash_bootstrap_components as dbc
5 | # For language model and loggers
6 | import sys
7 | import spacy
8 | import neuralcoref
9 | from spacy.pipeline import EntityRuler
10 | import logging
11 | from logging.handlers import RotatingFileHandler
12 |
13 | server = flask.Flask(__name__)
14 | server.secret_key = os.urandom(24)
15 |
16 | app = dash.Dash(
17 | __name__,
18 | server=server,
19 | suppress_callback_exceptions=True,
20 | external_stylesheets=[dbc.themes.BOOTSTRAP],
21 | meta_tags=[
22 | {
23 | 'name': 'Measuring gender bias in media - SFU',
24 | 'content': 'A dashboard to analyze gender discrepancies in mainstream Canadian news media.'
25 | },
26 | {
27 | 'property': 'og:image',
28 | 'content': 'https://www.sfu.ca/content/sfu/discourse-lab/jcr:content/main_content/image_0.img.2000.high.jpg/1499291765186.jpeg',
29 | }
30 | ],
31 | )
32 | app.title = "Measuring gender bias in media - SFU"
33 | # Serve JS and CSS locally
34 | app.css.config.serve_locally = True
35 | app.scripts.config.serve_locally = True
36 |
37 |
38 | def create_app_logger(filename):
39 | """Logger format and timed handling"""
40 | logger = logging.getLogger(filename)
41 | logger.setLevel(logging.DEBUG)
42 | formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
43 | os.makedirs("logs", exist_ok=True)
44 | rotateHandler = RotatingFileHandler('logs/' + "g-tracker-research-api.log",
45 | mode='a', maxBytes=1_000_000, backupCount=3)
46 | rotateHandler.setFormatter(formatter)
47 | stream = logging.StreamHandler(sys.stdout)
48 | stream.setFormatter(formatter)
49 |
50 | logger.addHandler(rotateHandler)
51 | logger.addHandler(stream)
52 | return logger
53 |
54 |
55 | def load_spacy_lang(lang='en_core_web_sm'):
56 | """Return a specific spaCy language model for the NLP module"""
57 | logger.info(f"Loading spaCy language model: '{lang}'")
58 | nlp = spacy.load(lang)
59 | logger.info("Done...")
60 | # Add neuralcoref pipe
61 | coref = neuralcoref.NeuralCoref(nlp.vocab, max_dist=200)
62 | nlp.add_pipe(coref, name='neuralcoref')
63 | return nlp
64 |
65 |
66 | logger = create_app_logger('userInputDashLogger')
67 | # Load spaCy Model
68 | print('Loading spaCy language model...')
69 | spacy_lang = spacy.load('en_core_web_sm')
70 | # Add custom named entity rules for non-standard person names that spaCy doesn't automatically identify
71 | ruler = EntityRuler(spacy_lang, overwrite_ents=True).from_disk('../nlp/english/rules/name_patterns.jsonl')
72 | spacy_lang.add_pipe(ruler)
73 | # Add neuralcoref pipe
74 | coref = neuralcoref.NeuralCoref(spacy_lang.vocab, max_dist=200)
75 | spacy_lang.add_pipe(coref, name='neuralcoref')
76 | print('Finished loading.')
77 |
78 |
--------------------------------------------------------------------------------
/research_dashboard/static/GGT_topic_model_technical_report.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sfu-discourse-lab/GenderGapTracker/5501de31e8598e18ace47982220e91961ca5460a/research_dashboard/static/GGT_topic_model_technical_report.pdf
--------------------------------------------------------------------------------
/research_dashboard/static/SFULogo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sfu-discourse-lab/GenderGapTracker/5501de31e8598e18ace47982220e91961ca5460a/research_dashboard/static/SFULogo.png
--------------------------------------------------------------------------------
/research_dashboard/static/discourse-lab-logo.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sfu-discourse-lab/GenderGapTracker/5501de31e8598e18ace47982220e91961ca5460a/research_dashboard/static/discourse-lab-logo.jpeg
--------------------------------------------------------------------------------
/research_dashboard/static/sfu_discourse_thumbnail.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sfu-discourse-lab/GenderGapTracker/5501de31e8598e18ace47982220e91961ca5460a/research_dashboard/static/sfu_discourse_thumbnail.png
--------------------------------------------------------------------------------
/research_dashboard/static/topic-pipeline-flowchart-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sfu-discourse-lab/GenderGapTracker/5501de31e8598e18ace47982220e91961ca5460a/research_dashboard/static/topic-pipeline-flowchart-1.png
--------------------------------------------------------------------------------
/research_dashboard/static/topic-pipeline-flowchart-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sfu-discourse-lab/GenderGapTracker/5501de31e8598e18ace47982220e91961ca5460a/research_dashboard/static/topic-pipeline-flowchart-2.png
--------------------------------------------------------------------------------
/scraper/README.md:
--------------------------------------------------------------------------------
1 | # Scraping
2 | This section contains the code we used for scraping news article content from various Canadian outlets. Note that we store all our data in a MongoDB database, so the scraper code shown in this repo can be modified accordingly if using any other database downstream. The code in this directory was tested on Python 3.6, but should be valid for higher versions.
3 |
4 | ## Required installations for scraping and data storage
5 | * MongoDB: Installation instructions [here](https://docs.mongodb.com/manual/tutorial/install-mongodb-on-ubuntu/).
6 | * Install Python 3.6+ and follow the below instructions to prepare the Python environment. Make sure that `gcc`, `build-essential` and `python3-devel` (on Red Hat/CentOS) or `python3-dev` (on Ubuntu) are installed on the system. Also, install `python3-venv` for managing virtual environments.
7 | * Newspaper3k: We use our own [custom fork of the newspaper library](https://github.com/aleaugustoplus/newspaper) to help collect data from news websites.
8 | * Install the customized newspaper library into a Python virtual environment by running `pip install -r requirements.txt` against the requirements file provided in this directory, [which is obtained from the source repo](https://github.com/aleaugustoplus/newspaper/blob/master/requirements.txt) (a minimal setup is sketched just below this list).
9 |
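As referenced in the list above, the environment setup might look as follows; the virtual environment name `scraper-env` is an arbitrary placeholder:

```sh
# Create and activate a virtual environment, then install the scraper dependencies
python3 -m venv scraper-env
source scraper-env/bin/activate
python3 -m pip install --upgrade pip
python3 -m pip install -r requirements.txt
```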
10 |
11 | ## News Sources
12 | We scrape news articles from the following Canadian news organizations' websites. The articles in our database date back to October 2018.
13 |
14 | #### English
15 | 1. CBC News
16 | 2. CTV News
17 | 3. Global News
18 | 4. HuffPost Canada*
19 | 5. National Post
20 | 6. The Globe And Mail
21 | 7. The Star
22 |
23 | > \*HuffPost Canada stopped publishing articles in March 2021. As a result, our database only contains articles from this outlet until February 2021.
24 |
25 | #### French
26 | 1. Journal De Montreal
27 | 2. La Presse
28 | 3. Le Devoir
29 | 4. Le Droit
30 | 5. Radio Canada
31 | 6. TVA News
32 |
33 | Each outlet's news content is retrieved from its RSS feeds by running the required media collectors. Some examples of usage are shown below.
34 |
35 | ### Example of usage
36 |
37 | Run the `mediaCollectors.py` script with a positional argument specifying the (case-sensitive) news outlet name, as follows.
38 |
39 | ```sh
40 | python3 WomenInMedia/scraper/mediaCollectors.py "Huffington Post"
41 | python3 WomenInMedia/scraper/mediaCollectors.py "Journal De Montreal"
42 | ```
43 |
44 |
45 | ### `config.py` parameters
46 | Adjust the config settings below to set up the database connection and write scraped articles.
47 |
48 | ```python
49 | # Production config
50 | MONGODB_HOST = ["mongo0", "mongo1", "mongo2"]
51 | MONGODB_PORT = 27017
52 | MONGO_ARGS = {
53 | "readPreference": "primary",
54 | "username": USERNAME,
55 | "password": PASSWORD,
56 | }
57 | DBS_NAME = 'mediaTracker'
58 | COLLECTION_NAME = 'media'
59 | COLLECTION_INVALID_NAME = 'mediaInvalid'
60 | LOGS_DIR = "logs/"
61 | EMAIL_SERVER = 'xxxx@smtp.gmail.com'
62 | EMAIL = "youremail@gmail.com"
63 | EMAIL_ACCOUNT = ""
64 | EMAIL_PASSWORD = ""
65 | EMAIL_DESTINATION = ""
66 | ```
67 |
--------------------------------------------------------------------------------
/scraper/config.py:
--------------------------------------------------------------------------------
1 | # Production config
2 | MONGODB_HOST = ["mongo0", "mongo1", "mongo2"]
3 | MONGODB_PORT = 27017
4 | MONGO_ARGS = {
5 | "readPreference": "primary",
6 | "username": "USERNAME",
7 | "password": "PASSWORD",
8 | }
9 | DBS_NAME = 'mediaTracker'
10 | COLLECTION_NAME = 'media'
11 | COLLECTION_INVALID_NAME = 'mediaInvalid'
12 | LOGS_DIR = "logs/"
13 | EMAIL_SERVER = 'xxxx@smtp.gmail.com'
14 | EMAIL = "youremail@gmail.com"
15 | EMAIL_ACCOUNT = ""
16 | EMAIL_PASSWORD = ""
17 | EMAIL_DESTINATION = ""
18 |
19 |
20 |
21 |
--------------------------------------------------------------------------------
/scraper/requirements.txt:
--------------------------------------------------------------------------------
1 | beautifulsoup4>=4.4.1
2 | Pillow>=3.3.0
3 | PyYAML>=3.11
4 | cssselect>=0.9.2
5 | lxml>=3.6.0
6 | nltk>=3.2.1
7 | requests>=2.10.0
8 | feedparser>=5.2.1
9 | tldextract>=2.0.1
10 | feedfinder2>=0.0.4
11 | jieba3k>=0.35.1
12 | python-dateutil>=2.5.3
13 | tinysegmenter==0.3 # TODO(codelucas): Investigate making this >=0.3
--------------------------------------------------------------------------------
/scraper/util.py:
--------------------------------------------------------------------------------
1 | # This module aggregates helper functions used for scraping news outlets' RSS feeds
2 | # Date created: 2018/07/19
3 | import re
4 | import os
5 | import heapq
6 | import smtplib
7 | import logging as log
8 | from datetime import datetime
9 | from logging.handlers import TimedRotatingFileHandler, BufferingHandler
10 |
11 |
12 | def str_or_empty_str(tag):
13 | return "" if tag is None or tag.string is None else tag.string
14 |
15 |
16 | def text_or_empty_str(tag):
17 | return "" if tag is None or tag.text is None else tag.text
18 |
19 |
20 | def clean_text(text):
21 | # Regex to remove non printable chars
22 | return re.sub(r"[\x00-\x1F]+", " ", text).rstrip().strip().lower()
23 |
24 |
25 | def enable_debug_http():
26 | try:
27 | import http.client as http_client
28 | except ImportError:
29 | # Python 2
30 | import httplib as http_client
31 | http_client.HTTPConnection.debuglevel = 1
32 |
33 | # DEBUG
34 | log.basicConfig()
35 | log.getLogger().setLevel(log.DEBUG)
36 | requests_log = log.getLogger("requests.packages.urllib3")
37 | requests_log.setLevel(log.DEBUG)
38 | requests_log.propagate = True
39 |
40 |
41 | def conv_str2date(strDate):
42 |
43 | strDate = (
44 | strDate.replace("GMT", "")
45 | .replace("-0400", "")
46 | .replace("EDT", "")
47 | .replace("EST", "")
48 | .replace("+0000", "")
49 | .replace("-0300", "")
50 | .replace("-0700", "")
51 | .replace("-0600", "")
52 | .replace("-0500", "")
53 | .replace("-0001 ", "")
54 | .replace(".000", "")
55 | .strip()
56 | )
57 | try:
58 | try:
59 | convDate = datetime.strptime(strDate, "%a, %d %b %Y %H:%M:%S")
60 | except ValueError:
61 | try:
62 | convDate = datetime.strptime(strDate, "%Y-%m-%d %H:%M:%S")
63 | except ValueError:
64 | convDate = datetime.strptime(strDate, "%d %b %Y %H:%M:%S")
65 |
66 | # log.info("Converted: %s", convDate)
67 | except Exception as ex:
68 | log.exception("Exception: %s", ex)
69 | convDate = datetime.utcnow()
70 |
71 | return convDate
72 |
73 |
74 | # Partially Extracted from: https://gist.github.com/anonymous/1379446
75 | class BufferingSMTPHandler(BufferingHandler):
76 | def __init__(self, mailhost, fromaddr, toaddrs, subject, capacity=1024 * 10, credentials=None):
77 |
78 | BufferingHandler.__init__(self, capacity)
79 | self.mailhost = mailhost
80 | self.mailport = None
81 | self.fromaddr = fromaddr
82 | self.toaddrs = toaddrs
83 | self.subject = subject
84 | self.credentials = credentials
85 |
86 | def flush(self):
87 | if len(self.buffer) > 0:
88 | try:
89 | smtp = smtplib.SMTP_SSL(self.mailhost, 465)
90 | smtp.ehlo()
91 | smtp.login(self.credentials[0], self.credentials[1])
92 | body = ""
93 | for record in self.buffer:
94 | s = self.format(record)
95 | body += s + "\n"
96 |
97 | msg = "From: %s\nSubject: %s\n%s" % (self.fromaddr, self.subject, body)
98 |
99 | smtp.sendmail(self.fromaddr, self.toaddrs, msg.encode("utf-8"))
100 | smtp.quit()
101 | except:
102 | self.handleError(None) # no particular record
103 | self.buffer = []
104 |
105 | def close(self):
106 | self.flush()
107 |
108 |
109 | def get_filename(filename):
110 | # Get logs directory
111 | log_directory = os.path.split(filename)[0]
112 |
113 |     # Get the file extension (which is also the rotation suffix value, e.g. ".20181231") without the dot
114 | date = os.path.splitext(filename)[0]
115 | # date = os.path.splitext(tmp)[1][1:]
116 |
117 | # Create new file name
118 | filename = os.path.join(log_directory, date)
119 |
120 | # I don't want to add index if only one log file will exists for date
121 | if not os.path.exists("{}.log".format(filename)):
122 | return "{}.log".format(filename)
123 |
124 | # Create new file name with index
125 | index = 0
126 | f = "{}.{}.log".format(filename, index)
127 | while os.path.exists(f):
128 | index += 1
129 | f = "{}.{}.log".format(filename, index)
130 | return f
131 |
132 |
133 | class CustomTimedRotatingFileHandler(TimedRotatingFileHandler):
134 | def __init__(
135 | self,
136 | filename,
137 | when="S",
138 | interval=1,
139 | backupCount=20,
140 | encoding=None,
141 | delay=False,
142 | utc=False,
143 | atTime=None,
144 | ):
145 | TimedRotatingFileHandler.__init__(
146 | self, filename, when, interval, backupCount, encoding, delay, utc, atTime
147 | )
148 | self.namer = get_filename
149 |
150 | def doRollover(self):
151 |
152 | TimedRotatingFileHandler.doRollover(self)
153 |
154 | if os.stat(self.baseFilename).st_size <= 0:
155 | os.remove(self.baseFilename)
156 |
157 |
158 | class PrioritySet(object):
159 | def __init__(self):
160 | self.heap = []
161 |
162 | def add(self, d):
163 | heapq.heappush(self.heap, (d.priority, d))
164 |
165 | def get(self):
166 | pri, d = heapq.heappop(self.heap)
167 | return d
168 |
169 | def __len__(self):
170 | return len(self.heap)
171 |
--------------------------------------------------------------------------------
/statistics/config.py:
--------------------------------------------------------------------------------
1 | config = {
2 | 'MONGO_ARGS': {
3 | 'host': ['mongo0', 'mongo1', 'mongo2'],
4 | 'port': 27017,
5 | 'username': 'username',
6 | 'password': 'password',
7 | 'authSource': 'admin',
8 | 'readPreference': 'nearest',
9 | }
10 | }
--------------------------------------------------------------------------------
/statistics/daily_pipeline/README.md:
--------------------------------------------------------------------------------
1 | # Daily aggregate statistics
2 |
3 |
4 | ## Daily article counts per outlet
5 | To keep track of whether our news article scrapers are performing as expected, an additional app is added to [our research dashboard](https://gendergaptracker.research.sfu.ca/). We plot daily counts of articles for all news outlets in English and French over a given time period. To do this, we run a daily aggregator script that counts the number of sources and articles for each outlet each day and writes these counts to the `mediaDaily` collection on the DB. The charts on the dashboard then query the data from the last 180 days, so that we can see if there is an abrupt decline in daily article counts per outlet over a sustained period -- this could be an indication that a particular scraper is out of date and that we need to inspect its source code more closely.
6 |
7 | #### Run the daily article/source aggregator script
8 | This script aggregates the number of articles and sources per gender, per outlet, and writes them to the `mediaDaily` collection in the database. By default, this runs over all articles published within the last 90 days (roughly 3 months). Alternatively, a custom date range over which the daily counts should be computed can be specified as follows.
9 |
10 | ```sh
11 | cd daily_pipeline
12 | python3 media_daily.py --begin_date 2021-10-01 --end_date 2021-10-31
13 | ```
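
To keep the `mediaDaily` collection current, the aggregator needs to run once a day. A hypothetical crontab entry for this (the repository path, interpreter and log file name are placeholders, not taken from this repo) could look as follows:

```sh
# Hypothetical crontab entry: run the daily aggregator at 02:30 every day
30 2 * * * cd /path/to/GenderGapTracker/statistics/daily_pipeline && python3 media_daily.py >> "$HOME/media_daily_cron.log" 2>&1
```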
14 |
--------------------------------------------------------------------------------
/statistics/daily_pipeline/config.py:
--------------------------------------------------------------------------------
1 | config = {
2 | 'MONGO_ARGS': {
3 | 'host': ['mongo0', 'mongo1', 'mongo2'],
4 | 'port': 27017,
5 | 'username': 'username',
6 | 'password': 'password',
7 | 'authSource': 'admin',
8 | 'readPreference': 'nearest',
9 | }
10 | }
11 |
--------------------------------------------------------------------------------
/statistics/daily_pipeline/daily_article_counts.py:
--------------------------------------------------------------------------------
1 | """
2 | This script outputs daily counts of articles per outlet to a separate collection on the
3 | `mediaTracker database`. Following this, the daily counts can be plotted on a chart to
4 | track whether the scrapers are performing as intended.
5 | """
6 | import argparse
7 | from datetime import timedelta, datetime
8 | from pymongo import MongoClient
9 | from config import config
10 |
11 |
12 | def get_connection():
13 | _db_client = MongoClient(**MONGO_ARGS)
14 | return _db_client
15 |
16 |
17 | def format_date(date_str):
18 | dateFormat = '%Y-%m-%d'
19 | return datetime.strptime(date_str, dateFormat)
20 |
21 |
22 | def get_past_date_as_str(days_ago=1):
23 | today = datetime.today().date() - timedelta(days=days_ago)
24 | return today.strftime("%Y-%m-%d")
25 |
26 |
27 | def daily_article_counts(start_date, end_date):
28 | """
29 | Returns the daily counts for articles published by each outlet between two specified dates
30 | """
31 | query = [
32 | {
33 | "$match": {
34 | "body": {"$ne": ""},
35 | "outlet": {"$in": args["outlets"]},
36 | "publishedAt": {
37 | "$gte": start_date,
38 | "$lt": end_date,
39 | },
40 | }
41 | },
42 | {
43 | "$project": {
44 | "publishedAt": {
45 | "$dateToString": {"format": "%Y-%m-%d", "date": "$publishedAt"}
46 | },
47 | "outlet": 1.0,
48 | }
49 | },
50 | {
51 | "$group": {
52 | "_id": {"publishedAt": "$publishedAt", "outlet": "$outlet"},
53 | "totalArticles": {"$sum": 1.0},
54 | }
55 | },
56 | # Final projection: Extract the date (from string) and the outlet name, along with article counts
57 | {
58 | "$project": {
59 | "_id": 0.0,
60 | "publishedAt": {
61 | "$dateFromString": {
62 | "dateString": "$_id.publishedAt",
63 | "format": "%Y-%m-%d",
64 | }
65 | },
66 | "outlet": "$_id.outlet",
67 | "totalArticles": 1.0,
68 | }
69 | },
70 | ]
71 | return query
72 |
73 |
74 | def update_db(collection, payload):
75 | """
76 | Insert aggregated stats of daily per-outlet article counts to the specified
77 | collection in the DB
78 | """
79 | try:
80 | # Find and upsert stats based on the date string value and outlet name
81 | # To avoid duplicates, we match on BOTH the name of the outlet and the date string
82 | for item in payload:
83 | collection.update_one(
84 | {
85 | "$and": [
86 | {"outlet": item["outlet"]},
87 | {"publishedAt": item["publishedAt"]},
88 | ]
89 | },
90 | {"$set": {"totalArticles": item["totalArticles"]}},
91 | upsert=True,
92 | )
93 | except Exception as e:
94 | print(f"Error: {e}")
95 |
96 |
97 | def main():
98 | """Run query and write the daily per-outlet article counts to the database."""
99 | daily_counts = read_collection.aggregate(daily_article_counts(start_date, end_date))
100 | # Write daily article counts per outlet to DB for the given date range
101 | update_db(write_collection, daily_counts)
102 |
103 |
104 | if __name__ == "__main__":
105 | parser = argparse.ArgumentParser()
106 | parser.add_argument("--db", type=str, default="mediaTracker", help="Database name")
107 | parser.add_argument("--readcol", type=str, default="media", help="Read collection name")
108 | parser.add_argument("--writecol", type=str, default="articleCountsDaily", help="Write collection name")
109 | parser.add_argument("--begin_date", type=str, default=get_past_date_as_str(days_ago=90), help="Start date in the format YYYY-MM-DD")
110 | parser.add_argument("--end_date", type=str, default=get_past_date_as_str(days_ago=1), help="End date in the format YYYY-MM-DD")
111 | parser.add_argument("--outlets", type=str, help="Comma-separated list of news outlets to consider in query scope")
112 | args = vars(parser.parse_args())
113 |
114 | start_date = format_date(args["begin_date"])
115 | end_date = format_date(args["end_date"]) + timedelta(days=1)
116 |
117 | # Import config settings
118 | MONGO_ARGS = config["MONGO_ARGS"]
119 |
120 | if not args["outlets"]:
121 | # Consider all English and French outlets by default
122 | args["outlets"] = [
123 | "National Post",
124 | "The Globe And Mail",
125 | "The Star",
126 | "Huffington Post",
127 | "Global News",
128 | "CTV News",
129 | "CBC News",
130 | "Journal De Montreal",
131 | "La Presse",
132 | "Le Devoir",
133 | "Le Droit",
134 | "Radio Canada",
135 | "TVA News",
136 | ]
137 | else:
138 | # Format outlets as a list of strings
139 | args["outlets"] = args["outlets"].split(",")
140 |
141 | # Connect to database
142 | _client = get_connection()
143 | read_collection = _client[args["db"]][args["readcol"]]
144 | write_collection = _client[args["db"]][args["writecol"]]
145 |
146 | main()
--------------------------------------------------------------------------------
/statistics/daily_pipeline/media_daily.py:
--------------------------------------------------------------------------------
1 | """
2 | This script is designed to be a replacement for the tools.py script that calculates
3 | daily article/source counts per outlet. The aim is to speed up the computation (the
4 | earlier version used vanilla Python) using native mongo objects and queries.
5 |
6 | By default, this script is run for all articles published within the last 3 months. Even
7 | though this is partly redundant, it is necessary because, in some cases, the scrapers
8 | can populate the DB with new articles from a past date. This is why it makes sense to
9 | re-check up to 3 months back on a daily basis.
10 | """
11 | import argparse
12 | from datetime import timedelta, datetime
13 | from pymongo import MongoClient
14 | from config import config
15 |
16 |
17 | def get_connection():
18 | _db_client = MongoClient(**MONGO_ARGS)
19 | return _db_client
20 |
21 |
22 | def format_date(date_str):
23 | dateFormat = '%Y-%m-%d'
24 | return datetime.strptime(date_str, dateFormat)
25 |
26 |
27 | def get_past_date_as_str(days_ago=1):
28 | today = datetime.today().date() - timedelta(days=days_ago)
29 | return today.strftime("%Y-%m-%d")
30 |
31 |
32 | def daily_article_counts(start_date, end_date):
33 | """
34 | Returns the daily counts for articles and sources by gender, as published by each
35 | outlet between two specified dates
36 | """
37 | query = [
38 | {
39 | "$match": {
40 | "body": {"$ne": ""},
41 | "outlet": {"$in": args["outlets"]},
42 | "publishedAt": {
43 | "$gte": start_date,
44 | "$lt": end_date,
45 | },
46 | }
47 | },
48 | {
49 | "$project": {
50 | "publishedAt": {
51 | "$dateToString": {"format": "%Y-%m-%d", "date": "$publishedAt"}
52 | },
53 | "outlet": 1.0,
54 | "sourcesFemaleCount": 1.0,
55 | "sourcesMaleCount": 1.0,
56 | "sourcesUnknownCount": 1.0,
57 | }
58 | },
59 | {
60 | "$group": {
61 | "_id": {
62 | "publishedAt": "$publishedAt",
63 | "outlet": "$outlet",
64 | },
65 | "totalArticles": {"$sum": 1.0},
66 | "totalFemales": {"$sum": "$sourcesFemaleCount"},
67 | "totalMales": {"$sum": "$sourcesMaleCount"},
68 | "totalUnknowns": {"$sum": "$sourcesUnknownCount"},
69 | }
70 | },
71 | # Final projection: Extract the date (from string) and the outlet name, along with article counts
72 | {
73 | "$project": {
74 | "_id": 0.0,
75 | "publishedAt": {
76 | "$dateFromString": {
77 | "dateString": "$_id.publishedAt",
78 | "format": "%Y-%m-%d",
79 | }
80 | },
81 | "outlet": "$_id.outlet",
82 | "totalArticles": 1.0,
83 | "totalFemales": 1.0,
84 | "totalMales": 1.0,
85 | "totalUnknowns": 1.0,
86 | }
87 | },
88 | ]
89 | return query
90 |
91 |
92 | def update_db(collection, payload):
93 | """
94 | Insert aggregated stats of daily per-outlet article and source counts to the
95 | specified collection in the DB
96 | """
97 | try:
98 | # Find and upsert stats based on the date string value and outlet name
99 | # To avoid duplicates, we match on BOTH the name of the outlet and the date string
100 | for item in payload:
101 | collection.update_one(
102 | {
103 | "$and": [
104 | {"outlet": item["outlet"]},
105 | {"publishedAt": item["publishedAt"]},
106 | ]
107 | },
108 | {
109 | "$set": {
110 | "totalArticles": item["totalArticles"],
111 | "totalFemales": item["totalFemales"],
112 | "totalMales": item["totalMales"],
113 | "totalUnknowns": item["totalUnknowns"],
114 | }
115 | },
116 | upsert=True,
117 | )
118 | except Exception as e:
119 | print(f"Error: {e}")
120 |
121 |
122 | def main():
123 | """Run query and write the daily per-outlet article counts to the database."""
124 | daily_counts = read_collection.aggregate(daily_article_counts(start_date, end_date))
125 | # Write daily article counts per outlet to DB for the given date range
126 | update_db(write_collection, daily_counts)
127 |
128 |
129 | if __name__ == "__main__":
130 | parser = argparse.ArgumentParser()
131 | parser.add_argument("--db", type=str, default="mediaTracker", help="Database name")
132 | parser.add_argument("--readcol", type=str, default="media", help="Read collection name")
133 | parser.add_argument("--writecol", type=str, default="mediaDaily", help="Write collection name")
134 | parser.add_argument("--begin_date", type=str, default=get_past_date_as_str(days_ago=90), help="Start date in the string format YYYY-MM-DD")
135 | parser.add_argument("--end_date", type=str, default=get_past_date_as_str(days_ago=3), help="End date in the string format YYYY-MM-DD")
136 | parser.add_argument("--outlets", type=str, help="Comma-separated list of news outlets to consider in query scope")
137 | args = vars(parser.parse_args())
138 |
139 | start_date = format_date(args["begin_date"])
140 | end_date = format_date(args["end_date"]) + timedelta(days=1)
141 |
142 | # Import config settings
143 | MONGO_ARGS = config["MONGO_ARGS"]
144 |
145 | if not args["outlets"]:
146 | # English outlets
147 | args["outlets"] = [
148 | "National Post",
149 | "The Globe And Mail",
150 | "The Star",
151 | "Huffington Post",
152 | "Global News",
153 | "CTV News",
154 | "CBC News",
155 | ]
156 | else:
157 | # Format outlets as a list of strings
158 | args["outlets"] = args["outlets"].split(",")
159 |
160 | # Connect to database
161 | _client = get_connection()
162 | read_collection = _client[args["db"]][args["readcol"]]
163 | write_collection = _client[args["db"]][args["writecol"]]
164 |
165 | main()
--------------------------------------------------------------------------------
/statistics/monthly_pipeline/README.md:
--------------------------------------------------------------------------------
1 | # Monthly aggregate statistics
2 |
3 | For [our research dashboard](https://gendergaptracker.research.sfu.ca/), we aggregate our results on a monthly basis. This is primarily for us to study trends in our topic models each month, as well as to analyze the top quoted men and women over time.
4 |
5 | Calculate the top 50 quoted men and women for a particular month by specifying the month and year as follows:
6 |
7 | ```sh
8 | cd monthly_pipeline
9 | # Calculate top 50 male & female sources for all outlets for November and December 2020
10 | python3 monthly_top_sources.py --year 2020 --month 11
11 | python3 monthly_top_sources.py --year 2020 --month 12
12 | ```
13 |
14 | Similarly, we can calculate the top 50 quoted men and women each month to study their quote counts as a time series. We limit the calculation to just the top 50 to keep the time-series lookups efficient. Each month's calculation is run one at a time, sequentially, as follows (a convenience loop for running several months back-to-back is sketched below the example).
15 |
16 | ```sh
17 | cd monthly_pipeline
18 | # Calculate the quote counts for each of the top 50 male & female sources for all outlets for April, May and June 2020
19 | python3 monthly_top_sources_timeseries.py --year 2020 --month 4
20 | python3 monthly_top_sources_timeseries.py --year 2020 --month 5
21 | python3 monthly_top_sources_timeseries.py --year 2020 --month 6
22 | ```
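
As noted above, a simple shell loop can drive several months back-to-back; this is only a convenience sketch using the same script and flags shown in the example:

```sh
# Run the time-series aggregation for April through June 2020, one month at a time
for month in 4 5 6; do
    python3 monthly_top_sources_timeseries.py --year 2020 --month "$month"
done
```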
23 |
--------------------------------------------------------------------------------
/statistics/monthly_pipeline/config.py:
--------------------------------------------------------------------------------
1 | config = {
2 | 'MONGO_ARGS': {
3 | 'host': ['mongo0', 'mongo1', 'mongo2'],
4 | 'port': 27017,
5 | 'username': 'username',
6 | 'password': 'password',
7 | 'authSource': 'admin',
8 | 'readPreference': 'nearest',
9 | }
10 | }
--------------------------------------------------------------------------------
/statistics/requirements.txt:
--------------------------------------------------------------------------------
1 | requests>=2.27.1
2 | pymongo>=3.10.0,<4.0.0
3 | pandas>=1.1.5
4 |
5 |
--------------------------------------------------------------------------------
/statistics/run.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import logging
3 | import time
4 | import os
5 | import sys
6 | import pandas as pd
7 | from datetime import datetime
8 | from logging.handlers import TimedRotatingFileHandler
9 | from pymongo import MongoClient
10 | # config
11 | from config import config
12 | # User-created queries
13 | import queries
14 |
15 |
16 | def get_connection():
17 | connection = MongoClient(**MONGO_ARGS)
18 | return connection
19 |
20 |
21 | def format_date(date_str):
22 | dateFormat = '%Y-%m-%d'
23 | return datetime.strptime(date_str, dateFormat)
24 |
25 |
26 | def create_app_logger(filename):
27 | """Logger format and timed handling"""
28 | logger = logging.getLogger(filename)
29 | logger.setLevel(logging.DEBUG)
30 | formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
31 | rotateHandler = TimedRotatingFileHandler(os.path.join("logs", "g-statistics.log"),
32 | when="midnight")
33 | rotateHandler.setFormatter(formatter)
34 | stream = logging.StreamHandler(sys.stdout)
35 | stream.setFormatter(formatter)
36 |
37 | logger.addHandler(rotateHandler)
38 | logger.addHandler(stream)
39 | return logger
40 |
41 |
42 | def run_aggregation_queries():
43 | """Collect aggregation query methods from queries.py and run them."""
44 | query_list = []
45 | for method_name in args.keys():
46 | requested = args[method_name]
47 | if requested and isinstance(requested, bool):
48 | # Only those args supplied as boolean flags will run as queries
49 | # getattr(foo, 'bar') equals foo.bar
50 | query_list.append(getattr(queries, method_name))
51 |
52 | # Run multiple aggregation queries between specified start/end dates
53 | for query in query_list:
54 | logger.info(f"Query: '{query.__name__}', date range: ({start_date}, {end_date})")
55 | start_time = time.time()
56 | result = collection.aggregate(query(args))
57 |
58 | # Export CSV
59 | filename = f"{query.__name__}_{start_date}_to_{end_date}.csv"
60 | df = pd.DataFrame.from_dict(result)
61 | df.to_csv(filename, index=False)
62 |
63 | logger.info(f"{query.__name__} query completed in {time.time() - start_time:.3f} seconds.")
64 |
65 |
66 | if __name__ == "__main__":
67 | parser = argparse.ArgumentParser()
68 | parser.add_argument('--db', type=str, default='mediaTracker', help="Database name")
69 | parser.add_argument('--col', type=str, default='media', help="Read collection name")
70 | parser.add_argument("--begin_date", type=str, default="2020-04-29", help="Start date in the format YYYY-MM-DD")
71 | parser.add_argument("--end_date", type=str, default="2020-04-30", help="End date in the format YYYY-MM-DD")
72 | parser.add_argument("--outlets", type=str, help="Comma-separated list of news outlets to consider in query scope")
73 | parser.add_argument("--limit", type=int, default=100, help="Number of results to limit to")
74 | parser.add_argument("--sort", type=str, default='desc', help="Sort results in ascending or descending order")
75 | # Query name args (specified as booleans)
76 | parser.add_argument("--db_stats", action='store_true', help="Run query to calculate overall gender stats (sources, people, authors)")
77 | parser.add_argument("--outlet_stats", action='store_true', help="Run query to calculate gender stats (sources, people, authors) per outlet")
78 | parser.add_argument("--top_sources_female", action='store_true', help="Run query to calculate top N female sources")
79 | parser.add_argument("--top_sources_male", action='store_true', help="Run query to calculate top N male sources")
80 | parser.add_argument("--top_sources_unknown", action='store_true', help="Run query to calculate top N unknown sources")
81 | parser.add_argument("--top_sources_all", action='store_true', help="Run query to calculate top N overall sources (male or female)")
82 | parser.add_argument("--female_author_sources", action='store_true', help="Run query to cross-tabulate female author sources vs. source gender counts")
83 | parser.add_argument("--male_author_sources", action='store_true', help="Run query to cross-tabulate male author sources vs. source gender counts")
84 | parser.add_argument("--mixed_author_sources", action='store_true', help="Run query to cross-tabulate both gender (male & female) author sources vs. source gender counts")
85 | parser.add_argument("--unknown_author_sources", action='store_true', help="Run query to cross-tabulate unknown author sources vs. source gender counts")
86 | parser.add_argument("--daily_article_counts", action='store_true', help="Run query to get a tally of daily article counts")
87 | args = vars(parser.parse_args())
88 |
89 | # Import config settings
90 | MONGO_ARGS = config['MONGO_ARGS']
91 |
92 | if not args['outlets']:
93 | # Consider all seven English-language outlets by default
94 | args['outlets'] = [
95 | 'National Post', 'The Globe And Mail', 'The Star',
96 | 'Huffington Post', 'Global News', 'CTV News', 'CBC News'
97 | ]
98 | else:
99 | # Format outlets as a list of strings
100 | args['outlets'] = args['outlets'].split(",")
101 |
102 | # Convert sort value to float for pymongo (1.0 is ascending, -1.0 is descending)
103 | args['sort'] = 1.0 if args['sort'] == 'asc' else -1.0
104 |
105 | # Store dates as strings for file naming
106 | start_date = args['begin_date']
107 | end_date = args['end_date']
108 | # Format dates as datetime objects for pymongo
109 | args['begin_date'] = format_date(args['begin_date'])
110 | args['end_date'] = format_date(args['end_date'])
111 |
112 | # Create logs
113 | os.makedirs("logs", exist_ok=True)
114 | logger = create_app_logger('statisticsLog')
115 |
116 | # Connect to database
117 | connection = get_connection()
118 | collection = connection[args['db']][args['col']]
119 |
120 | run_aggregation_queries()
121 |
122 |
--------------------------------------------------------------------------------