├── .gitattributes ├── .gitignore ├── CONTRIBUTORS.md ├── LICENSE ├── README.md ├── api ├── README.md ├── english │ ├── README.md │ ├── __init__.py │ ├── db │ │ ├── config.py │ │ └── mongoqueries.py │ ├── endpoints │ │ └── outlet_stats.py │ ├── gunicorn_conf.py │ ├── logging.conf │ ├── main.py │ ├── schemas │ │ ├── stats_by_date.py │ │ └── stats_weekly.py │ ├── tests │ │ ├── __init__.py │ │ ├── test_mock_outlet_stats.py │ │ └── test_outlet_stats.py │ └── utils │ │ ├── dateutils.py │ │ └── logger.py ├── french │ ├── README.md │ ├── __init__.py │ ├── db │ │ ├── config.py │ │ └── mongoqueries.py │ ├── endpoints │ │ └── outlet_stats.py │ ├── gunicorn_conf.py │ ├── logging.conf │ ├── main.py │ ├── schemas │ │ ├── stats_by_date.py │ │ └── stats_weekly.py │ ├── tests │ │ ├── __init__.py │ │ ├── test_mock_outlet_stats.py │ │ └── test_outlet_stats.py │ └── utils │ │ ├── dateutils.py │ │ └── logger.py └── requirements.txt ├── nlp ├── english │ ├── README.md │ ├── config.py │ ├── entity_gender_annotator.py │ ├── evaluation │ │ ├── README.md │ │ └── src │ │ │ ├── README.md │ │ │ ├── evaluate.py │ │ │ ├── evaluate_quotes.py │ │ │ └── run_predictions.py │ ├── gender_predictor.py │ ├── img │ │ └── concurrent.png │ ├── merge_collections.py │ ├── quote_extractor.py │ ├── requirements.txt │ ├── rules │ │ ├── README.md │ │ ├── author_blocklist.txt │ │ ├── name_patterns.jsonl │ │ └── quote_verb_list.txt │ ├── topic_model │ │ ├── .gitignore │ │ ├── README.md │ │ ├── config.py │ │ ├── corpus_analysis │ │ │ ├── README.md │ │ │ ├── analyze.py │ │ │ ├── config.py │ │ │ ├── download_articles.py │ │ │ ├── requirements.txt │ │ │ ├── spacyLemmas │ │ │ │ └── spacy_english_lemmas.txt │ │ │ └── test_corpus_functions.py │ │ ├── img │ │ │ ├── example_divergent_heatmap.png │ │ │ ├── example_heatmap.png │ │ │ └── example_wordcloud.png │ │ ├── preproc.py │ │ ├── preproc_cc.py │ │ ├── requirements.txt │ │ ├── spacyLemmas │ │ │ ├── README.md │ │ │ ├── convert_spacy_lemmas.py │ │ │ └── spacy_english_lemmas.txt │ │ ├── stopwords │ │ │ ├── README.md │ │ │ ├── create_stopword_list.py │ │ │ ├── nltk_curated.txt │ │ │ └── stopwords.txt │ │ ├── train.py │ │ ├── train_cc.py │ │ └── vis.py │ └── utils.py └── french │ ├── README.md │ ├── config.py │ ├── data_statistics.py │ ├── entity_gender_annotator.py │ ├── entity_merger.py │ ├── evaluation │ ├── README.md │ └── src │ │ ├── README.md │ │ ├── evaluate.py │ │ ├── evaluate_quotes.py │ │ └── run_predictions.py │ ├── gender_predictor.py │ ├── merge_collections.py │ ├── quote_extractor.py │ ├── quote_highlighter.py │ ├── quote_merger.py │ ├── requirements.txt │ ├── rules │ ├── author_blocklist.txt │ ├── name_patterns.jsonl │ └── quote_verb_list.txt │ └── utils.py ├── research_dashboard ├── README.md ├── admin │ ├── apps │ │ ├── topiclabels.py │ │ ├── topsources.py │ │ ├── unknownsources.py │ │ └── updatecache.py │ ├── assets │ │ └── style.css │ ├── auth.py │ ├── config.py │ ├── run.py │ ├── server.py │ └── static │ │ ├── SFULogo.png │ │ └── discourse-lab-logo.jpeg ├── aliases.txt ├── apps │ ├── __init__.py │ ├── articlecounts.py │ ├── dailywomenenglish.py │ ├── textanalyzer.py │ ├── topicmodel.py │ ├── topsources.py │ └── topsourcetrends.py ├── assets │ ├── favicon.ico │ └── style.css ├── config.py ├── run.py ├── server.py └── static │ ├── GGT_topic_model_technical_report.pdf │ ├── SFULogo.png │ ├── discourse-lab-logo.jpeg │ ├── sfu_discourse_thumbnail.png │ ├── topic-pipeline-flowchart-1.png │ └── topic-pipeline-flowchart-2.png ├── scraper ├── README.md ├── config.py ├── mediaCollectors.py 
├── requirements.txt └── util.py └── statistics ├── README.md ├── config.py ├── daily_pipeline ├── README.md ├── config.py ├── daily_article_counts.py └── media_daily.py ├── monthly_pipeline ├── README.md ├── config.py ├── monthly_top_sources.py └── monthly_top_sources_timeseries.py ├── queries.py ├── requirements.txt └── run.py /.gitattributes: -------------------------------------------------------------------------------- 1 | *.ipynb linguist-vendored 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # Custom 107 | *.DS_Store 108 | logs 109 | 110 | # Generated Data 111 | NLP/experiments/MDS_Capstone/validation/annotatedData/ 112 | NLP/experiments/MDS_Capstone/validation/results/ 113 | NLP/experiments/MDS_Capstone/validation/V4.0/ 114 | -------------------------------------------------------------------------------- /CONTRIBUTORS.md: -------------------------------------------------------------------------------- 1 | ## Contributors 2 | 3 | * [Professor Maite Taboada](https://www.sfu.ca/~mtaboada/): Principal Investigator and Director, Discourse Processing Lab 4 | * [Dr. 
Fatemeh Torabi Asr](https://ftasr.github.io/): Postdoctoral Fellow and Project Manager 5 | * [Alexandre Lopes](https://github.com/aleaugustoplus): Data Scientist and Database Manager 6 | * [Mohammad Mazraeh](https://github.com/MohMaz): Software Developer and Machine Learning Engineer 7 | * [Vagrant Gautam](https://dippedrusk.com/): Computational Linguist 8 | * [Junette Dianne Gonzales](http://www.sfu.ca/linguistics/events/2020/08/junette-gonzales-sda-minor.html): Language Data Annotator 9 | * [Lucas Chambers](https://www.sfu.ca/linguistics/events/2019/10/lucas-chambers.html): Linguist and Topic Label Annotator 10 | * [Jillian Anderson](https://github.com/jillianderson8): Big Data Developer 11 | * [Prashanth Rao](https://github.com/prrao87): Data Scientist and Software Developer 12 | * [Philipp Eibl](https://philippnoah.github.io): Data Scientist and Software Developer (French NLP) 13 | * [Valentin-Gabriel Soumah](https://github.com/Pantalaymon): Data Scientist and Software Developer (French NLP) 14 | 15 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018-2023 Maite Taboada 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | __Status: V7.0__ (Code provided as-is; only sporadic updates expected) 2 | 3 | # Measuring gender bias in media 4 | 5 | We present the code and framework for two bodies of work in this repo: 6 | 7 | 1. [The Gender Gap Tracker](https://gendergaptracker.informedopinions.org/) (GGT) for English news articles 8 | 2. [Radar de Parité](https://radardeparite.femmesexpertes.ca/) (RdP) for French news articles 9 | 10 | The GGT and RdP are automated systems that measure men and women’s voices on mainstream Canadian news outlets in real time. We analyze articles from six English outlets (for the GGT) and six French outlets (for the RdP) in Canada using Natural Language Processing (NLP), and quantify the discrepancy in proportions of men and women quoted. Our larger goals through this project are to enhance awareness of women’s portrayal in public discourse through hard evidence, and to encourage news organizations to provide a more diverse set of voices in their reporting. 
11 | 12 | The Gender Gap Tracker is a collaboration between [Informed Opinions](https://informedopinions.org/), a non-profit dedicated to amplifying the voices of women and gender-diverse people in media and Simon Fraser University, through the [Discourse Processing Lab](https://www.sfu.ca/discourse-lab.html) and the [Big Data Initiative](https://www.sfu.ca/big-data/big-data-sfu). 13 | 14 | ## Publications 15 | 1. Asr FT, Mazraeh M, Lopes A, Gautam V, Gonzales J, Rao P, Taboada M. (2021) The Gender Gap Tracker: Using Natural Language Processing to measure gender bias in media. *PLoS ONE 16(1): e0245533*. https://doi.org/10.1371/journal.pone.0245533 16 | 2. Rao P, Taboada M. (2021), Gender bias in the news: A scalable topic modelling and visualization framework. *Frontiers in Artificial Intelligence, 4(82)*. https://doi.org/10.3389/frai.2021.664737 17 | 3. Soumah, V.-G., Rao, P., Eibl, P., & Taboada, M. (2023). Radar de Parité: An NLP system to measure gender representation in French news stories. *Proceedings of the Canadian Conference on Artificial Intelligence*. https://doi.org/10.21428/594757db.b6f3c89e 18 | 19 | 20 | ## Contributors 21 | 22 | See [CONTRIBUTORS.md](CONTRIBUTORS.md) 23 | ## Contents of this repo 24 | 25 | * `scraper`: Modules for scraping English and French news articles from various Canadian news organizations' websites and RSS feeds. 26 | * `nlp`: NLP modules for performing quote extraction and entity gender annotation on both English and French news articles. 27 | * `api`: FastAPI code base exposing endpoints that serve our daily statistics to public-facing dashboards: [Gender Gap Tracker](https://gendergaptracker.informedopinions.org) and [Radar de Parité](https://radardeparite.femmesexpertes.ca) 28 | * `research_dashboard`: [A multi-page, extensible dashboard](https://gendergaptracker.research.sfu.ca/) built in Plotly Dash that allows us to explore the GGT data in more detail. 29 | * `statistics`: Scripts for running batch queries on our MongoDB database to retrieve source/gender statistics. 30 | 31 | ## Data 32 | 33 | Both the English and French datasets were downloaded from public and subscription websites of newspapers, under the ‘fair dealing’ provision in Canada’s Copyright Act. This means that the data can be made available (upon signing a licence agreement) **only** for non-commercial and/or research purposes. 34 | 35 | ## Future directions 36 | 37 | In future versions of the software, we are planning to visualize more fine-grained information about who is being quoted, separating politicians, witnesses and/or victims, from experts (as informed sources of analysis, context and opinion). We are also looking into different ways of separating wire copy from the original publication of each news outlet in order to provide a clearer view of the gender gap in Canadian media, produced by the news outlets themselves. 38 | 39 | From a research perspective, questions of salience and space arise, i.e., whether quotes by men are presented more prominently in an article, and whether men are given more space in average (perhaps counted in number of words). More nuanced questions that involve language analysis include whether the quotes are presented differently in terms of endorsement or distance from the content of the quote (*stated* vs. *claimed*). Analyses of transitivity structure in clauses can yield further insights about the type of roles women are portrayed in, complementing some of our studies' findings via dependency analyses. 
40 | 41 | We are mindful of and acknowledge the relative lack of work in NLP in languages other than English. We believe that we have played at least a small role here, through our analyses in the Radar de Parité on French news articles, though there still remains a lot more to be done in this domain. Our hope is that further such work will yield not only interesting methodological insights, but also reveal larger similarities in gender disparities in other regions of the world. While we are actively pursuing such additional areas of inquiry, we also invite other researchers to join in this effort! 42 | 43 | 44 | ## Contact 45 | 46 | For more information about the research methodology and for questions regarding collaboration, please contact Dr. Maite Taboada. 47 | 48 | > **Maite Taboada** 49 | mtaboada@sfu.ca 50 | Director, Discourse Processing Lab 51 | Simon Fraser University 52 | Burnaby, British Columbia, Canada 53 | -------------------------------------------------------------------------------- /api/README.md: -------------------------------------------------------------------------------- 1 | # APIs for public-facing dashboards 2 | 3 | This section hosts code for the backend APIs that serve our public-facing dashboards for our partner organization, Informed Opinions. 4 | 5 | We have two APIs: one each serving the English and French dashboards (for the Gender Gap Tracker and the Radar de Parité, respectively). 6 | 7 | ## Dashboards 8 | * English: https://gendergaptracker.informedopinions.org 9 | * French: https://radardeparite.femmesexpertes.ca 10 | 11 | ### Front end code 12 | 13 | The front end code base, for clearer separation of roles and responsibilities, is hosted externally on GitLab. 14 | 15 | * English: [Kanopi_GGT/Gender Gap Tracker](https://gitlab.com/client-transfer-group/gender-gap-tracker) 16 | * French: [Kanopi_GGT/RDP](https://gitlab.com/client-transfer-group/rdp) 17 | 18 | Access to these repos is restricted, so please reach out to mtaboada@sfu.ca to get access to the code, if required. 19 | 20 | ## Setup 21 | 22 | Both APIs are written using [FastAPI](https://fastapi.tiangolo.com/), a high-performance web framework for building APIs in Python. 23 | 24 | This code base has been tested in Python 3.9, but newer Python versions should also work with minimal changes. 25 | 26 | Install the required dependencies via `requirements.txt` as follows. 27 | 28 | Create a new virtual environment if it does not already exist, and install the dependencies into it: 29 | ```sh 30 | $ python3.9 -m venv api_venv 31 | $ api_venv/bin/python -m pip install -r requirements.txt 32 | ``` 33 | 34 | For further use, activate the virtual environment: 35 | 36 | ```sh 37 | $ source api_venv/bin/activate 38 | ``` 39 | 40 | 41 | -------------------------------------------------------------------------------- /api/english/README.md: -------------------------------------------------------------------------------- 1 | # Gender Gap Tracker: API 2 | 3 | This section contains the code for the API that serves the [Gender Gap Tracker public dashboard](https://gendergaptracker.informedopinions.org/). The dashboard itself is hosted externally, and its front end code is hosted on this [GitLab repo](https://gitlab.com/client-transfer-group/gender-gap-tracker).
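For local development, the app can also be started directly via `main.py`, which launches `uvicorn` with auto-reload on port 8000 (a sketch based on the `__main__` block of `main.py`; it assumes the dependencies from `api/requirements.txt` are installed and that MongoDB is reachable, e.g. through the ssh tunnel described under "Run tests" below):

```sh
$ cd /path_to_repo/api/english
$ python main.py
```

In production, the app is instead served through `gunicorn` with `uvicorn` workers, as described under "Deployment" below.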
4 | 5 | ## API docs 6 | 7 | The docs can be accessed in one of two ways: 8 | 9 | * Swagger: https://gendergaptracker.informedopinions.org/docs 10 | * Useful to test out the API interactively on the browser 11 | * Redoc: https://gendergaptracker.informedopinions.org/redoc 12 | * Clean, modern UI to see the API structure in a responsive format 13 | 14 | 15 | ## Run tests 16 | 17 | Tests are run via `pytest`. Set up an ssh tunnel on a Unix shell to forward the MongoDB host connection to the local machine on port 27017 as follows. In the example below, `vm12` is the alias for the primary node of the MongoDB cluster. 18 | 19 | ``` 20 | $ ssh vm12 -f -N -L 27017:localhost:27017 21 | ``` 22 | Run the tests: 23 | 24 | ```sh 25 | $ cd /path_to_repo/api/english 26 | $ python -m pytest -v 27 | ``` 28 | 29 | ## Extensibility 30 | 31 | The code base has been written with the intention that future developers can add endpoints for other functionality that can potentially serve other dashboards. 32 | 33 | * `db`: Contains MongoDB-specific code (config and queries) that help interact with the GGT data on our MongoDB database 34 | * `endpoints`: Add new functionality to process and serve results via RESTful API endpoints 35 | * `schemas`: Perform response data validation so that the JSON results from the endpoint are formatted properly in the docs 36 | * `utils`: Add utility functions that support data manipulation within the routers 37 | * `tests`: Add tests to check that data from the endpoints are as expected for the front end 38 | * `gunicorn_conf.py`: Contains deployment-specific instructions for the web server, explained below. 39 | 40 | ## Deployment 41 | 42 | We perform a standard deployment of FastAPI in production, as per the best practices [shown in this blog post](https://www.vultr.com/docs/how-to-deploy-fastapi-applications-with-gunicorn-and-nginx-on-ubuntu-20-04/). 43 | 44 | * `uvicorn` is used as an async web server (compatible with the `gunicorn` web server for production apps) 45 | * We set `uvicorn` to use `uvloop` instead of `asyncio` to handle async coroutines under the hood (due to a bug with `asyncio` on CentOS) 46 | * `gunicorn` works as a process manager that starts multiple `uvicorn` processes via the `uvicorn.workers.UvicornWorker` class 47 | * `nginx` is used as a reverse proxy 48 | 49 | The deployment and maintenance of the web server is carried out by SFU's Research Computing Group (RCG).
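As a quick reference for consumers of this API, here is a hedged usage sketch of the two routes defined in `endpoints/outlet_stats.py` and mounted under the `/expertWomen` prefix in `main.py`. The base URL assumes the public deployment; point it at a local instance if you are running the app yourself.

```python
# Illustrative only: query the info_by_date and weekly_info endpoints with `requests`.
import requests

BASE_URL = "https://gendergaptracker.informedopinions.org"  # assumed public deployment
params = {"begin": "2022-01-02", "end": "2022-01-08"}  # yyyy-mm-dd, on or after 2018-10-01

# Aggregate and per-outlet stats between the two dates (TotalStatsByDate schema)
totals = requests.get(f"{BASE_URL}/expertWomen/info_by_date", params=params).json()
print(totals["perFemales"], totals["perMales"], totals["perUnknowns"])

# Weekly stats per outlet between the two dates (TotalStatsByWeek schema)
weekly = requests.get(f"{BASE_URL}/expertWomen/weekly_info", params=params).json()
for outlet, weeks in weekly["outlets"].items():
    print(outlet, [(w["w_begin"], round(w["perFemales"], 3)) for w in weeks])
```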
50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /api/english/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sfu-discourse-lab/GenderGapTracker/5501de31e8598e18ace47982220e91961ca5460a/api/english/__init__.py -------------------------------------------------------------------------------- /api/english/db/config.py: -------------------------------------------------------------------------------- 1 | host = ["mongo0", "mongo1", "mongo2"] 2 | # host = "localhost" 3 | is_direct_connection = True if (host == "localhost") else False 4 | 5 | config = { 6 | "MONGO_HOST": host, 7 | "MONGO_PORT": 27017, 8 | "MONGO_ARGS": { 9 | "authSource": "admin", 10 | "readPreference": "primaryPreferred", 11 | "username": "username", 12 | "password": "password", 13 | "directConnection": is_direct_connection, 14 | }, 15 | "DB_NAME": "mediaTracker", 16 | "LOGS_DIR": "logs/", 17 | } 18 | 19 | -------------------------------------------------------------------------------- /api/english/db/mongoqueries.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | 4 | def agg_total_per_outlet(begin_date: datetime, end_date: datetime): 5 | query = [ 6 | {"$match": {"publishedAt": {"$gte": begin_date, "$lte": end_date}}}, 7 | { 8 | "$group": { 9 | "_id": "$outlet", 10 | "totalArticles": {"$sum": "$totalArticles"}, 11 | "totalFemales": {"$sum": "$totalFemales"}, 12 | "totalMales": {"$sum": "$totalMales"}, 13 | "totalUnknowns": {"$sum": "$totalUnknowns"}, 14 | } 15 | }, 16 | ] 17 | return query 18 | 19 | 20 | def agg_total_by_week(begin_date: datetime, end_date: datetime): 21 | query = [ 22 | {"$match": {"publishedAt": {"$gte": begin_date, "$lte": end_date}}}, 23 | { 24 | "$group": { 25 | "_id": { 26 | "outlet": "$outlet", 27 | "week": {"$week": "$publishedAt"}, 28 | "year": {"$year": "$publishedAt"}, 29 | }, 30 | "totalFemales": {"$sum": "$totalFemales"}, 31 | "totalMales": {"$sum": "$totalMales"}, 32 | "totalUnknowns": {"$sum": "$totalUnknowns"}, 33 | } 34 | }, 35 | ] 36 | return query 37 | -------------------------------------------------------------------------------- /api/english/endpoints/outlet_stats.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from fastapi import APIRouter, HTTPException, Query, Request 3 | 4 | import utils.dateutils as dateutils 5 | from db.mongoqueries import agg_total_by_week, agg_total_per_outlet 6 | from schemas.stats_by_date import TotalStatsByDate 7 | from schemas.stats_weekly import TotalStatsByWeek 8 | from utils.logger import get_logger 9 | 10 | outlet_router = APIRouter() 11 | COLLECTION_NAME = "mediaDaily" 12 | LOWER_BOUND_START_DATE = "2018-09-29" # Specify start date slightly earlier 2018-10-01 for pytest suite 13 | ID_MAPPING = {"Huffington Post": "HuffPost Canada"} 14 | 15 | logger = get_logger("g-tracker-fastapi-en") 16 | 17 | 18 | @outlet_router.get( 19 | "/info_by_date", 20 | response_model=TotalStatsByDate, 21 | response_description="Get total and per outlet gender statistics for English outlets between two dates", 22 | ) 23 | def expertwomen_info_by_date( 24 | request: Request, 25 | begin: str = Query(description="Start date in yyyy-mm-dd format"), 26 | end: str = Query(description="End date in yyyy-mm-dd format"), 27 | ) -> TotalStatsByDate: 28 | if not dateutils.is_valid_date_range(begin, end, 
LOWER_BOUND_START_DATE): 29 | raise HTTPException( 30 | status_code=416, 31 | detail=f"Date range error: Should be between {LOWER_BOUND_START_DATE} and tomorrow's date", 32 | ) 33 | result = _expertwomen_info_by_date(request, begin, end) 34 | logger.info("Obtained info by date for English outlets between %s and %s" % (begin, end)) 35 | return result 36 | 37 | 38 | @outlet_router.get( 39 | "/weekly_info", 40 | response_model=TotalStatsByWeek, 41 | response_description="Get gender statistics per English outlet aggregated WEEKLY between two dates", 42 | ) 43 | def expertwomen_weekly_info( 44 | request: Request, 45 | begin: str = Query(description="Start date in yyyy-mm-dd format"), 46 | end: str = Query(description="End date in yyyy-mm-dd format"), 47 | ) -> TotalStatsByWeek: 48 | if not dateutils.is_valid_date_range(begin, end, LOWER_BOUND_START_DATE): 49 | raise HTTPException( 50 | status_code=416, 51 | detail=f"Date range error: Should be between {LOWER_BOUND_START_DATE} and tomorrow's date", 52 | ) 53 | result = _expertwomen_weekly_info(request, begin, end) 54 | logger.info("Obtained weekly info for English outlets between %s and %s" % (begin, end)) 55 | return result 56 | 57 | 58 | def _expertwomen_info_by_date(request: Request, begin: str, end: str) -> TotalStatsByDate: 59 | """ 60 | Run aggregation query on MongoDB data to obtain total stats within a specified date range 61 | """ 62 | begin_date = dateutils.convert_date(begin) 63 | end_date = dateutils.convert_date(end) 64 | 65 | query = agg_total_per_outlet(begin_date, end_date) 66 | response = request.app.connection[COLLECTION_NAME].aggregate(query) 67 | # Work with the data in pandas 68 | source_stats = list(response) 69 | if not source_stats: 70 | logger.error("No data found for date range %s to %s" % (begin, end)) 71 | df = pd.DataFrame.from_dict(source_stats) 72 | df["totalGenders"] = df["totalFemales"] + df["totalMales"] + df["totalUnknowns"] 73 | # Replace outlet names if necessary 74 | df["_id"] = df["_id"].replace(ID_MAPPING) 75 | # Take sums of total males, females, unknowns and articles and convert to dict 76 | result = df.drop("_id", axis=1).sum().to_dict() 77 | # Compute per outlet stats 78 | df["perFemales"] = df["totalFemales"] / df["totalGenders"] 79 | df["perMales"] = df["totalMales"] / df["totalGenders"] 80 | df["perUnknowns"] = df["totalUnknowns"] / df["totalGenders"] 81 | df["perArticles"] = df["totalArticles"] / result["totalArticles"] 82 | # Convert dataframe to dict prior to JSON serialization 83 | result["sources"] = df.to_dict("records") 84 | result["perFemales"] = result["totalFemales"] / result["totalGenders"] 85 | result["perMales"] = result["totalMales"] / result["totalGenders"] 86 | result["perUnknowns"] = result["totalUnknowns"] / result["totalGenders"] 87 | return result 88 | 89 | 90 | def _expertwomen_weekly_info(request: Request, begin: str, end: str) -> TotalStatsByWeek: 91 | """ 92 | Run aggregation query on MongoDB data to obtain weekly stats within a specified date range 93 | """ 94 | begin_date = dateutils.convert_date(begin) 95 | end_date = dateutils.convert_date(end) 96 | 97 | query = agg_total_by_week(begin_date, end_date) 98 | response = request.app.connection[COLLECTION_NAME].aggregate(query) 99 | source_stats = list(response) 100 | if not source_stats: 101 | logger.error("No data found for date range %s to %s" % (begin, end)) 102 | # Work with the data in pandas 103 | df = pd.json_normalize(source_stats, max_level=1).sort_values(by="_id.outlet").reset_index(drop=True) 104 | df.rename( 105 | 
columns={ 106 | "_id.outlet": "outlet", 107 | "_id.week": "week", 108 | "_id.year": "year", 109 | }, 110 | inplace=True, 111 | ) 112 | # Replace outlet names if necessary 113 | df["outlet"] = df["outlet"].replace(ID_MAPPING) 114 | # Construct DataFrame and handle begin/end dates as datetimes for summing by week 115 | df["w_begin"] = df.apply(lambda row: dateutils.get_week_bound(row["year"], row["week"], 0), axis=1) 116 | df["w_end"] = df.apply(lambda row: dateutils.get_week_bound(row["year"], row["week"], 6), axis=1) 117 | df["w_begin"], df["w_end"] = zip( 118 | *df.apply(lambda row: (pd.to_datetime(row["w_begin"]), pd.to_datetime(row["w_end"])), axis=1) 119 | ) 120 | df = df.drop(columns=["week", "year"], axis=1).sort_values(by=["outlet", "w_begin"]) 121 | # In earlier versions, there was a bug due to which we returned partial weekly information for the same week that spanned across years 122 | # This bug only occurred when the last week of one year spanned into the next year (partial week across a year boundary) 123 | # To address this, we perform summation of stats by week in pandas to avoid partial stats per week being passed to the front end 124 | df = df.groupby(["outlet", "w_begin", "w_end"]).sum().reset_index() 125 | df["totalGenders"] = df["totalFemales"] + df["totalMales"] + df["totalUnknowns"] 126 | df["perFemales"] = df["totalFemales"] / df["totalGenders"] 127 | df["perMales"] = df["totalMales"] / df["totalGenders"] 128 | df["perUnknowns"] = df["totalUnknowns"] / df["totalGenders"] 129 | # Convert datetimes back to string for JSON serialization 130 | df["w_begin"] = df["w_begin"].dt.strftime("%Y-%m-%d") 131 | df["w_end"] = df["w_end"].dt.strftime("%Y-%m-%d") 132 | df = df.drop(columns=["totalGenders", "totalFemales", "totalMales", "totalUnknowns"], axis=1) 133 | 134 | # Convert dataframe to dict prior to JSON serialization 135 | weekly_data = dict() 136 | for outlet in df["outlet"]: 137 | per_outlet_data = df[df["outlet"] == outlet].to_dict(orient="records") 138 | # Remove the outlet key from weekly_data 139 | [item.pop("outlet") for item in per_outlet_data] 140 | weekly_data[outlet] = per_outlet_data 141 | output = TotalStatsByWeek(outlets=weekly_data) 142 | return output 143 | -------------------------------------------------------------------------------- /api/english/gunicorn_conf.py: -------------------------------------------------------------------------------- 1 | # gunicorn_conf.py to point gunicorn to the uvicorn workers 2 | from multiprocessing import cpu_count 3 | 4 | # Socket path 5 | bind = 'unix:/g-tracker/WomenInMedia/api/english/g-tracker.sock' 6 | 7 | # Worker Options 8 | workers = cpu_count() - 1 9 | worker_class = 'uvicorn.workers.UvicornWorker' 10 | 11 | # Logging Options 12 | loglevel = 'debug' 13 | accesslog = '/g-tracker/WomenInMedia/api/english/access_log' 14 | errorlog = '/g-tracker/WomenInMedia/api/english/error_log' 15 | -------------------------------------------------------------------------------- /api/english/logging.conf: -------------------------------------------------------------------------------- 1 | [loggers] 2 | keys=root, gunicorn.error, gunicorn.access 3 | 4 | [handlers] 5 | keys=console, error_file, access_file 6 | 7 | [formatters] 8 | keys=generic, access 9 | 10 | [logger_root] 11 | level=INFO 12 | handlers=console 13 | 14 | [logger_gunicorn.error] 15 | level=INFO 16 | handlers=error_file 17 | propagate=1 18 | qualname=gunicorn.error 19 | 20 | [logger_gunicorn.access] 21 | level=INFO 22 | handlers=access_file 23 | propagate=0 24 | 
qualname=gunicorn.access 25 | 26 | [handler_console] 27 | class=StreamHandler 28 | formatter=generic 29 | args=(sys.stdout, ) 30 | 31 | [handler_error_file] 32 | class=logging.FileHandler 33 | formatter=generic 34 | args=('/var/log/gunicorn/error.log',) 35 | 36 | [handler_access_file] 37 | class=logging.FileHandler 38 | formatter=access 39 | args=('/var/log/gunicorn/access.log',) 40 | 41 | [formatter_generic] 42 | format=%(asctime)s [%(process)d] [%(levelname)s] %(message)s 43 | datefmt=%Y-%m-%d %H:%M:%S 44 | class=logging.Formatter 45 | 46 | [formatter_access] 47 | format=%(message)s 48 | class=logging.Formatter 49 | -------------------------------------------------------------------------------- /api/english/main.py: -------------------------------------------------------------------------------- 1 | from contextlib import asynccontextmanager 2 | from collections.abc import AsyncGenerator 3 | from pathlib import Path 4 | 5 | from fastapi import FastAPI 6 | from fastapi.responses import HTMLResponse 7 | from fastapi.staticfiles import StaticFiles 8 | from pymongo import MongoClient 9 | 10 | from db.config import config 11 | from endpoints.outlet_stats import outlet_router 12 | 13 | # Constants 14 | HOST = config["MONGO_HOST"] 15 | PORT = config["MONGO_PORT"] 16 | MONGO_ARGS = config["MONGO_ARGS"] 17 | DB = config["DB_NAME"] 18 | STATIC_PATH = "gender-gap-tracker" 19 | STATIC_HTML = "tracker.html" 20 | 21 | 22 | @asynccontextmanager 23 | async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]: 24 | """Async context manager for MongoDB connection.""" 25 | app.mongodb_client = MongoClient(HOST, PORT, **MONGO_ARGS) 26 | app.connection = app.mongodb_client[DB] 27 | print("Successfully connected to MongoDB") 28 | yield 29 | app.mongodb_client.close() 30 | print("Successfully closed MongoDB connection") 31 | 32 | 33 | app = FastAPI( 34 | title="Gender Gap Tracker", 35 | description="RESTful API for the Gender Gap Tracker public-facing dashboard", 36 | version="1.1.4", 37 | lifespan=lifespan, 38 | ) 39 | 40 | 41 | @app.get("/", include_in_schema=False) 42 | async def root() -> HTMLResponse: 43 | with open(Path(f"{STATIC_PATH}") / STATIC_HTML, "r") as f: 44 | html_content = f.read() 45 | return HTMLResponse(content=html_content, media_type="text/html") 46 | 47 | 48 | # Attach routes 49 | app.include_router(outlet_router, prefix="/expertWomen", tags=["info"]) 50 | # Add additional routers here for future endpoints 51 | # ... 52 | 53 | # Serve static files for front end from directory specified as STATIC_PATH 54 | app.mount("/", StaticFiles(directory=STATIC_PATH), name="static") 55 | 56 | 57 | if __name__ == "__main__": 58 | import uvicorn 59 | uvicorn.run("main:app", host="0.0.0.0", port=8000, loop="uvloop", reload=True) 60 | -------------------------------------------------------------------------------- /api/english/schemas/stats_by_date.py: -------------------------------------------------------------------------------- 1 | from math import isnan 2 | from typing import List 3 | 4 | from pydantic import BaseModel, Field, root_validator 5 | 6 | 7 | def valid_percentage(_, values): 8 | """Avoid NaNs by setting them to 0.0""" 9 | for key in ["perFemales", "perMales", "perUnknowns"]: 10 | if isnan(values[key]): 11 | values[key] = 0.0 12 | return values 13 | 14 | 15 | class OutletStatsByDate(BaseModel): 16 | # In Pydantic, the underscore prefix of a field like `_id` is treated as a private attribute 17 | # We thus define an alias so that the `_id` field can be referenced as is. 
18 | id: str = Field(alias="_id") 19 | totalArticles: int 20 | totalFemales: int 21 | totalMales: int 22 | totalUnknowns: int 23 | totalGenders: int 24 | perFemales: float 25 | perMales: float 26 | perUnknowns: float 27 | perArticles: float 28 | 29 | # validators 30 | _avoid_nans = root_validator(allow_reuse=True)(valid_percentage) 31 | 32 | class TotalStatsByDate(BaseModel): 33 | totalArticles: int 34 | totalFemales: int 35 | totalMales: int 36 | totalUnknowns: int 37 | totalGenders: int 38 | perFemales: float 39 | perMales: float 40 | perUnknowns: float 41 | sources: List[OutletStatsByDate] 42 | 43 | # validators 44 | _avoid_nans = root_validator(allow_reuse=True)(valid_percentage) 45 | 46 | -------------------------------------------------------------------------------- /api/english/schemas/stats_weekly.py: -------------------------------------------------------------------------------- 1 | from datetime import date, datetime 2 | from math import isnan 3 | from typing import Dict, List 4 | 5 | from pydantic import BaseModel, root_validator, validator 6 | 7 | 8 | class OutletStatsByWeek(BaseModel): 9 | w_begin: date 10 | w_end: date 11 | perFemales: float 12 | perMales: float 13 | perUnknowns: float 14 | 15 | # validation 16 | @validator("w_begin", "w_end", pre=True, always=True) 17 | def valid_date(dateval): 18 | """Validate a date string to be of the format yyyy-mm-dd""" 19 | if isinstance(dateval, str): 20 | return datetime.strptime(dateval, "%Y-%m-%d").strftime("%Y-%m-%d") 21 | return dateval 22 | 23 | @root_validator 24 | def _valid_percentage(cls, values): 25 | """Avoid NaNs by setting them to 0.0""" 26 | for key in ["perFemales", "perMales", "perUnknowns"]: 27 | if isnan(values[key]): 28 | values[key] = 0.0 29 | return values 30 | 31 | 32 | class TotalStatsByWeek(BaseModel): 33 | outlets: Dict[str, List[OutletStatsByWeek]] 34 | 35 | class Config: 36 | schema_extra = { 37 | "example": { 38 | "outlets": { 39 | "Outlet 1": [ 40 | { 41 | "w_begin": "2021-12-26", 42 | "w_end": "2022-01-01", 43 | "perFemales": 0.3915470494417863, 44 | "perMales": 0.6052631578947368, 45 | "perUnknowns": 0.003189792663476874, 46 | }, 47 | { 48 | "w_begin": "2022-01-02", 49 | "w_end": "2022-01-08", 50 | "perFemales": 0.39904862579281186, 51 | "perMales": 0.6004228329809725, 52 | "perUnknowns": 0.0005285412262156448, 53 | }, 54 | ], 55 | "Outlet 2": [ 56 | { 57 | "w_begin": "2021-12-26", 58 | "w_end": "2022-01-01", 59 | "perFemales": 0.34763636363636363, 60 | "perMales": 0.648, 61 | "perUnknowns": 0.004363636363636364, 62 | }, 63 | { 64 | "w_begin": "2022-01-02", 65 | "w_end": "2022-01-08", 66 | "perFemales": 0.0, 67 | "perMales": 0.0, 68 | "perUnknowns": 0.0, 69 | }, 70 | ], 71 | } 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /api/english/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sfu-discourse-lab/GenderGapTracker/5501de31e8598e18ace47982220e91961ca5460a/api/english/tests/__init__.py -------------------------------------------------------------------------------- /api/english/tests/test_mock_outlet_stats.py: -------------------------------------------------------------------------------- 1 | from fastapi.testclient import TestClient 2 | 3 | from english.main import app 4 | from endpoints.outlet_stats import ID_MAPPING 5 | 6 | PREFIX = "expertWomen" 7 | 8 | 9 | def test_get_info_by_date(): 10 | with TestClient(app) as client: 11 | # We test mock data in a date 
range outside that specified in outlet_stats.py 12 | begin = "2018-09-29" 13 | end = "2018-09-30" 14 | response = client.get(f"/{PREFIX}/info_by_date?begin={begin}&end={end}") 15 | assert response.status_code == 200 16 | body = response.json() 17 | # Ensure there are no NaN values due to DivisionByZero when no sources exist 18 | assert body.get("perFemales") >= 0 19 | assert body.get("perMales") >= 0 20 | assert body.get("perUnknowns") >= 0 21 | assert isinstance(body.get("sources"), list) 22 | for obj in body.get("sources"): 23 | assert isinstance(obj.get("_id"), str) 24 | assert obj.get("perFemales") >= 0 25 | assert obj.get("perMales") >= 0 26 | assert obj.get("perUnknowns") >= 0 27 | 28 | 29 | def test_get_info_outlet_name_mapping_in_list(): 30 | with TestClient(app) as client: 31 | begin = "2018-09-29" 32 | end = "2018-09-30" 33 | response = client.get(f"/{PREFIX}/info_by_date?begin={begin}&end={end}") 34 | outlet_list = [item.get("_id") for item in response.json().get("sources")] 35 | for outlet in ID_MAPPING: 36 | assert ID_MAPPING[outlet] in outlet_list 37 | 38 | 39 | def test_weekly_info_outlet_name_mapping_in_list(): 40 | with TestClient(app) as client: 41 | begin = "2018-09-29" 42 | end = "2018-09-30" 43 | response = client.get(f"/{PREFIX}/weekly_info?begin={begin}&end={end}") 44 | outlet_list = [k for k, _ in response.json().get("outlets").items()] 45 | for outlet in ID_MAPPING: 46 | assert ID_MAPPING[outlet] in outlet_list -------------------------------------------------------------------------------- /api/english/tests/test_outlet_stats.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | from fastapi.testclient import TestClient 3 | 4 | from english.main import app 5 | from endpoints.outlet_stats import LOWER_BOUND_START_DATE 6 | 7 | PREFIX = "expertWomen" 8 | 9 | 10 | def test_read_main(): 11 | with TestClient(app) as client: 12 | response = client.get("/") 13 | assert response.status_code == 200 14 | 15 | 16 | def test_get_info_by_date(): 17 | with TestClient(app) as client: 18 | # Choose a date range that is in the recent past 19 | begin = datetime.today().date() - timedelta(days=7) 20 | end = datetime.today().date() - timedelta(days=3) 21 | response = client.get(f"/{PREFIX}/info_by_date?begin={begin}&end={end}") 22 | assert response.status_code == 200 23 | body = response.json() 24 | assert body.get("perFemales") >= 0 25 | assert body.get("perMales") >= 0 26 | assert body.get("perUnknowns") >= 0 27 | assert isinstance(body.get("sources"), list) 28 | for obj in body.get("sources"): 29 | assert isinstance(obj.get("_id"), str) 30 | assert obj.get("perFemales") >= 0 31 | assert obj.get("perMales") >= 0 32 | assert obj.get("perUnknowns") >= 0 33 | 34 | 35 | def test_get_info_by_date_invalid_date_range(): 36 | with TestClient(app) as client: 37 | lower_bound_date = datetime.fromisoformat(LOWER_BOUND_START_DATE).date() 38 | past = lower_bound_date - timedelta(days=2) 39 | response = client.get(f"/{PREFIX}/info_by_date?begin={past}&end={lower_bound_date}") 40 | assert ( 41 | response.status_code == 416 42 | ), "English articles start on 2018-10-01, so start date should be 2018-10-01 or later" 43 | today = datetime.today().date() 44 | future = today + timedelta(days=2) 45 | response = client.get(f"/{PREFIX}/info_by_date?begin={today}&end={future}") 46 | assert response.status_code == 416, "Cannot request stats for dates in the future" 47 | 48 | 49 | def test_get_weekly_info(): 50 | with 
TestClient(app) as client: 51 | # Choose a date range that is in the recent past 52 | begin = datetime.today().date() - timedelta(days=7) 53 | end = datetime.today().date() - timedelta(days=3) 54 | response = client.get(f"/{PREFIX}/weekly_info?begin={begin}&end={end}") 55 | assert response.status_code == 200 56 | body = response.json().get("outlets") 57 | assert len(body) > 0 58 | for _, stats in body.items(): 59 | for week_id in stats: 60 | assert isinstance(week_id.get("w_begin"), str) 61 | assert isinstance(week_id.get("w_end"), str) 62 | assert week_id.get("perFemales") >= 0 63 | assert week_id.get("perMales") >= 0 64 | assert week_id.get("perUnknowns") >= 0 65 | 66 | 67 | def test_get_weekly_info_invalid_date_range(): 68 | with TestClient(app) as client: 69 | lower_bound_date = datetime.fromisoformat(LOWER_BOUND_START_DATE).date() 70 | past = lower_bound_date - timedelta(days=2) 71 | response = client.get(f"/{PREFIX}/weekly_info?begin={past}&end={lower_bound_date}") 72 | assert ( 73 | response.status_code == 416 74 | ), "English articles start on 2018-10-01, so start date should be 2018-10-01 or later" 75 | today = datetime.today().date() 76 | future = today + timedelta(days=2) 77 | response = client.get(f"/{PREFIX}/weekly_info?begin={today}&end={future}") 78 | assert response.status_code == 416, "Cannot request stats for dates in the future" -------------------------------------------------------------------------------- /api/english/utils/dateutils.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | 4 | def is_valid_date_range(start_date: str, end_date: str, lower_bound: str) -> bool: 5 | tommorrow = datetime.today() + timedelta(days=1) 6 | if (tommorrow >= convert_date(end_date)) and ( 7 | convert_date(start_date) >= convert_date(lower_bound) 8 | ): 9 | return True 10 | else: 11 | return False 12 | 13 | 14 | def convert_date(date_str: str) -> datetime: 15 | return datetime.strptime(date_str, "%Y-%m-%d") 16 | 17 | 18 | def get_week_bound(year: int, week: int, day_of_week: int) -> str: 19 | """ 20 | Get begin or end date for a week of the year as a string YYYY-MM-DD 21 | - Start of week is Sunday 22 | - For start of week, set `day_of_week` to 0 23 | - For end of week, set `day_of_week` to 6 24 | """ 25 | w_bound = datetime.strptime(f"{year}-{week}-{day_of_week}", "%Y-%U-%w") 26 | w_bound_str = w_bound.strftime("%Y-%m-%d") 27 | return w_bound_str 28 | -------------------------------------------------------------------------------- /api/english/utils/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sys 3 | from logging.handlers import TimedRotatingFileHandler 4 | from pathlib import Path 5 | 6 | 7 | def get_logger(filename: str = "g-tracker-fastapi") -> logging.Logger: 8 | filename = f"{filename}.log" if not filename.endswith(".log") else filename 9 | Path("logs").mkdir(parents=True, exist_ok=True) 10 | log = logging.getLogger(filename) 11 | log.setLevel(logging.INFO) 12 | format = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s") 13 | rotateHandler = TimedRotatingFileHandler( 14 | Path("logs") / filename, 15 | when="midnight", 16 | backupCount=3, 17 | ) 18 | rotateHandler.setFormatter(format) 19 | stream = logging.StreamHandler(sys.stdout) 20 | stream.setFormatter(format) 21 | log.addHandler(rotateHandler) 22 | log.addHandler(stream) 23 | return log 24 | 
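To illustrate the date helpers in `utils/dateutils.py` above, here is a small usage sketch (not part of the repo; it assumes it is run from the `api/english` directory so that `utils` is importable). The expected values follow from the `%Y-%U-%w` format used in `get_week_bound`, which treats Sunday as the start of the week, and they match the example weeks shown in `schemas/stats_weekly.py`:

```python
# Illustrative only: exercise the helpers defined in utils/dateutils.py
from datetime import datetime
from utils.dateutils import convert_date, get_week_bound, is_valid_date_range

# Week 1 of 2022 (Sunday-based, per strptime's %U directive) runs Jan 2 to Jan 8
assert get_week_bound(2022, 1, 0) == "2022-01-02"  # day_of_week=0 -> Sunday (week start)
assert get_week_bound(2022, 1, 6) == "2022-01-08"  # day_of_week=6 -> Saturday (week end)

# convert_date parses yyyy-mm-dd strings into naive datetimes
assert convert_date("2022-01-02") == datetime(2022, 1, 2)

# A past range at or after the lower bound is accepted; ranges ending after tomorrow are not
assert is_valid_date_range("2022-01-02", "2022-01-08", "2018-09-29")
```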
-------------------------------------------------------------------------------- /api/french/README.md: -------------------------------------------------------------------------------- 1 | # Radar de Parité: API 2 | 3 | FastAPI code base for the API that serves the [Radar de Parité public dashboard](https://radardeparite.femmesexpertes.ca/). The dashboard itself is hosted externally, and its front end code is hosted on this [GitLab repo](https://gitlab.com/client-transfer-group/rdp). 4 | 5 | ## API docs 6 | 7 | The docs can be accessed in one of two ways: 8 | 9 | * Swagger: https://radardeparite.femmesexpertes.ca/docs 10 | * Useful to test out the API interactively on the browser 11 | * Redoc: https://radardeparite.femmesexpertes.ca/redoc 12 | * Clean, modern UI to see the API structure in a responsive format 13 | 14 | ## Run tests 15 | 16 | Tests are run via `pytest`. Set up an ssh tunnel on a Unix shell to forward the MongoDB host connection to the local machine on port 27017 as follows. In the example below, `vm12` is the alias for the primary node of the MongoDB cluster. 17 | 18 | ``` 19 | $ ssh vm12 -f -N -L 27017:localhost:27017 20 | ``` 21 | Run the tests: 22 | 23 | ```sh 24 | $ cd /path_to_repo/api/french 25 | $ python -m pytest -v 26 | ``` 27 | 28 | ## Extensibility 29 | 30 | The code base has been written with the intention that future developers can add endpoints for other functionality that can potentially serve other dashboards. 31 | 32 | * `db`: Contains MongoDB-specific code (config and queries) that help interact with the RdP data on our MongoDB database 33 | * `endpoints`: Add new functionality to process and serve results via RESTful API endpoints 34 | * `schemas`: Perform response data validation so that the JSON results from the endpoint are formatted properly in the docs 35 | * `utils`: Add utility functions that support data manipulation within the routers 36 | * `tests`: Add tests to check that data from the endpoints are as expected for the front end 37 | * `gunicorn_conf.py`: Contains deployment-specific instructions for the web server, explained below. 38 | 39 | ## Deployment 40 | 41 | We perform a standard deployment of FastAPI in production, as per the best practices [shown in this blog post](https://www.vultr.com/docs/how-to-deploy-fastapi-applications-with-gunicorn-and-nginx-on-ubuntu-20-04/). 42 | 43 | * `uvicorn` is used as an async web server (compatible with the `gunicorn` web server for production apps) 44 | * We set `uvicorn` to use `uvloop` instead of `asyncio` to handle async coroutines under the hood (due to a bug with `asyncio` on CentOS) 45 | * `gunicorn` works as a process manager that starts multiple `uvicorn` processes via the `uvicorn.workers.UvicornWorker` class 46 | * `nginx` is used as a reverse proxy 47 | 48 | The deployment and maintenance of the web server is carried out by SFU's Research Computing Group (RCG).
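The exact service configuration (systemd units, nginx) is managed by RCG and is not part of this repo, but a typical way to launch the app that is consistent with the provided `gunicorn_conf.py` would look roughly like this (a sketch; the working directory is assumed from the socket and log paths in that file):

```sh
$ cd /g-tracker/WomenInMedia/api/french
$ gunicorn -c gunicorn_conf.py main:app
```

`gunicorn_conf.py` binds the server to a unix socket (`g-tracker-fr.sock`), sets the worker class to `uvicorn.workers.UvicornWorker` and writes access/error logs next to the app, so no further flags are needed on the command line.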
49 | 50 | 51 | -------------------------------------------------------------------------------- /api/french/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sfu-discourse-lab/GenderGapTracker/5501de31e8598e18ace47982220e91961ca5460a/api/french/__init__.py -------------------------------------------------------------------------------- /api/french/db/config.py: -------------------------------------------------------------------------------- 1 | host = ["mongo0", "mongo1", "mongo2"] 2 | # host = "localhost" 3 | is_direct_connection = True if (host == "localhost") else False 4 | 5 | config = { 6 | "MONGO_HOST": host, 7 | "MONGO_PORT": 27017, 8 | "MONGO_ARGS": { 9 | "authSource": "admin", 10 | "readPreference": "primaryPreferred", 11 | "username": "username", 12 | "password": "password", 13 | "directConnection": is_direct_connection, 14 | }, 15 | "DB_NAME": "mediaTracker", 16 | "LOGS_DIR": "logs/", 17 | } 18 | -------------------------------------------------------------------------------- /api/french/db/mongoqueries.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | def agg_total_per_outlet(begin_date: datetime, end_date: datetime): 4 | query = [ 5 | {"$match": {"publishedAt": {"$gte": begin_date, "$lte": end_date}}}, 6 | { 7 | "$group": { 8 | "_id": "$outlet", 9 | "totalArticles": {"$sum": "$totalArticles"}, 10 | "totalFemales": {"$sum": "$totalFemales"}, 11 | "totalMales": {"$sum": "$totalMales"}, 12 | "totalUnknowns": {"$sum": "$totalUnknowns"}, 13 | } 14 | }, 15 | ] 16 | return query 17 | 18 | 19 | def agg_total_by_week(begin_date: datetime, end_date: datetime): 20 | query = [ 21 | {"$match": {"publishedAt": {"$gte": begin_date, "$lte": end_date}}}, 22 | { 23 | "$group": { 24 | "_id": { 25 | "outlet": "$outlet", 26 | "week": {"$week": "$publishedAt"}, 27 | "year": {"$year": "$publishedAt"}, 28 | }, 29 | "totalFemales": {"$sum": "$totalFemales"}, 30 | "totalMales": {"$sum": "$totalMales"}, 31 | "totalUnknowns": {"$sum": "$totalUnknowns"}, 32 | } 33 | }, 34 | ] 35 | return query 36 | -------------------------------------------------------------------------------- /api/french/endpoints/outlet_stats.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from fastapi import APIRouter, HTTPException, Query, Request 3 | 4 | import utils.dateutils as dateutils 5 | from db.mongoqueries import agg_total_by_week, agg_total_per_outlet 6 | from schemas.stats_by_date import TotalStatsByDate 7 | from schemas.stats_weekly import TotalStatsByWeek 8 | from utils.logger import get_logger 9 | from typing import Any 10 | 11 | outlet_router = APIRouter() 12 | COLLECTION_NAME = "mediaDailyFrench" 13 | LOWER_BOUND_START_DATE = "2021-09-29" # Specify start date slightly earlier 2021-10-01 for pytest suite 14 | ID_MAPPING = { 15 | "Journal De Montreal": "Le Journal de Montréal", 16 | "TVA News": "TVA Nouvelles", 17 | "Radio Canada": "Radio-Canada", 18 | } 19 | 20 | logger = get_logger("g-tracker-fastapi-fr") 21 | 22 | 23 | @outlet_router.get( 24 | "/info_by_date", 25 | response_model=TotalStatsByDate, 26 | response_description="Get total and per outlet gender statistics for French outlets between two dates", 27 | ) 28 | def femmesexpertes_info_by_date( 29 | request: Request, 30 | begin: str = Query(description="Start date in yyyy-mm-dd format"), 31 | end: str = Query(description="End date in yyyy-mm-dd 
format"), 32 | ) -> TotalStatsByDate: 33 | if not dateutils.is_valid_date_range(begin, end, LOWER_BOUND_START_DATE): 34 | raise HTTPException( 35 | status_code=416, 36 | detail=f"Date range error: Should be between {LOWER_BOUND_START_DATE} and tomorrow's date", 37 | ) 38 | result = _femmesexpertes_info_by_date(request, begin, end) 39 | logger.info("Obtained info by date for French outlets between %s and %s" % (begin, end)) 40 | return result 41 | 42 | 43 | @outlet_router.get( 44 | "/weekly_info", 45 | response_model=TotalStatsByWeek, 46 | response_description="Get gender statistics per French outlet aggregated WEEKLY between two dates", 47 | ) 48 | def femmesexpertes_weekly_info( 49 | request: Request, 50 | begin: str = Query(description="Start date in yyyy-mm-dd format"), 51 | end: str = Query(description="End date in yyyy-mm-dd format"), 52 | ) -> TotalStatsByWeek: 53 | if not dateutils.is_valid_date_range(begin, end, LOWER_BOUND_START_DATE): 54 | raise HTTPException( 55 | status_code=416, 56 | detail=f"Date range error: Should be between {LOWER_BOUND_START_DATE} and tomorrow's date", 57 | ) 58 | result = _femmesexpertes_weekly_info(request, begin, end) 59 | logger.info("Obtained weekly info for French outlets between %s and %s" % (begin, end)) 60 | return result 61 | 62 | 63 | def _femmesexpertes_info_by_date(request: Request, begin: str, end: str) -> TotalStatsByDate: 64 | """ 65 | Run aggregation query on MongoDB data to obtain total stats within a specified date range 66 | """ 67 | begin_date = dateutils.convert_date(begin) 68 | end_date = dateutils.convert_date(end) 69 | 70 | query = agg_total_per_outlet(begin_date, end_date) 71 | response = request.app.connection[COLLECTION_NAME].aggregate(query) 72 | # Work with the data in pandas 73 | source_stats = list(response) 74 | if not source_stats: 75 | logger.error("No data found for date range %s to %s" % (begin, end)) 76 | df = pd.DataFrame.from_dict(source_stats) 77 | df["totalGenders"] = df["totalFemales"] + df["totalMales"] + df["totalUnknowns"] 78 | # Replace outlet names if necessary 79 | df["_id"] = df["_id"].replace(ID_MAPPING) 80 | # Take sums of total males, females, unknowns and articles and convert to dict 81 | result = df.drop("_id", axis=1).sum().to_dict() 82 | # Compute per outlet stats 83 | df["perFemales"] = df["totalFemales"] / df["totalGenders"] 84 | df["perMales"] = df["totalMales"] / df["totalGenders"] 85 | df["perUnknowns"] = df["totalUnknowns"] / df["totalGenders"] 86 | df["perArticles"] = df["totalArticles"] / result["totalArticles"] 87 | # Convert dataframe to dict prior to JSON serialization 88 | result["sources"] = df.to_dict("records") 89 | result["perFemales"] = result["totalFemales"] / result["totalGenders"] 90 | result["perMales"] = result["totalMales"] / result["totalGenders"] 91 | result["perUnknowns"] = result["totalUnknowns"] / result["totalGenders"] 92 | return result 93 | 94 | 95 | def _femmesexpertes_weekly_info(request: Request, begin: str, end: str) -> TotalStatsByWeek: 96 | """ 97 | Run aggregation query on MongoDB data to obtain weekly stats within a specified date range 98 | """ 99 | begin_date = dateutils.convert_date(begin) 100 | end_date = dateutils.convert_date(end) 101 | 102 | query = agg_total_by_week(begin_date, end_date) 103 | response = request.app.connection[COLLECTION_NAME].aggregate(query) 104 | source_stats = list(response) 105 | if not source_stats: 106 | logger.error("No data found for date range %s to %s" % (begin, end)) 107 | # Work with the data in pandas 108 | df = 
pd.json_normalize(source_stats, max_level=1).sort_values(by="_id.outlet").reset_index(drop=True) 109 | df.rename( 110 | columns={ 111 | "_id.outlet": "outlet", 112 | "_id.week": "week", 113 | "_id.year": "year", 114 | }, 115 | inplace=True, 116 | ) 117 | # Replace outlet names if necessary 118 | df["outlet"] = df["outlet"].replace(ID_MAPPING) 119 | # Construct DataFrame and handle begin/end dates as datetimes for summing by week 120 | df["w_begin"] = df.apply(lambda row: dateutils.get_week_bound(row["year"], row["week"], 0), axis=1) 121 | df["w_end"] = df.apply(lambda row: dateutils.get_week_bound(row["year"], row["week"], 6), axis=1) 122 | df["w_begin"], df["w_end"] = zip( 123 | *df.apply(lambda row: (pd.to_datetime(row["w_begin"]), pd.to_datetime(row["w_end"])), axis=1) 124 | ) 125 | df = df.drop(columns=["week", "year"], axis=1).sort_values(by=["outlet", "w_begin"]) 126 | # In earlier versions, there was a bug due to which we returned partial weekly information for the same week that spanned across years 127 | # This bug only occurred when the last week of one year spanned into the next year (partial week across a year boundary) 128 | # To address this, we perform summation of stats by week in pandas to avoid partial stats per week being passed to the front end 129 | df = df.groupby(["outlet", "w_begin", "w_end"]).sum().reset_index() 130 | df["totalGenders"] = df["totalFemales"] + df["totalMales"] + df["totalUnknowns"] 131 | df["perFemales"] = df["totalFemales"] / df["totalGenders"] 132 | df["perMales"] = df["totalMales"] / df["totalGenders"] 133 | df["perUnknowns"] = df["totalUnknowns"] / df["totalGenders"] 134 | # Convert datetimes back to string for JSON serialization 135 | df["w_begin"] = df["w_begin"].dt.strftime("%Y-%m-%d") 136 | df["w_end"] = df["w_end"].dt.strftime("%Y-%m-%d") 137 | df = df.drop(columns=["totalGenders", "totalFemales", "totalMales", "totalUnknowns"], axis=1) 138 | 139 | # Convert dataframe to dict prior to JSON serialization 140 | weekly_data = dict() 141 | for outlet in df["outlet"]: 142 | per_outlet_data = df[df["outlet"] == outlet].to_dict(orient="records") 143 | # Remove the outlet key from weekly_data 144 | [item.pop("outlet") for item in per_outlet_data] 145 | weekly_data[outlet] = per_outlet_data 146 | output = TotalStatsByWeek(outlets=weekly_data) 147 | return output 148 | -------------------------------------------------------------------------------- /api/french/gunicorn_conf.py: -------------------------------------------------------------------------------- 1 | # gunicorn_conf.py to point gunicorn to the uvicorn workers 2 | from multiprocessing import cpu_count 3 | 4 | # Socket path 5 | bind = 'unix:/g-tracker/WomenInMedia/api/french/g-tracker-fr.sock' 6 | 7 | # Worker Options 8 | workers = cpu_count() - 1 9 | worker_class = 'uvicorn.workers.UvicornWorker' 10 | 11 | # Logging Options 12 | loglevel = 'debug' 13 | accesslog = '/g-tracker/WomenInMedia/api/french/access_log' 14 | errorlog = '/g-tracker/WomenInMedia/api/french/error_log' 15 | -------------------------------------------------------------------------------- /api/french/logging.conf: -------------------------------------------------------------------------------- 1 | [loggers] 2 | keys=root, gunicorn.error, gunicorn.access 3 | 4 | [handlers] 5 | keys=console, error_file, access_file 6 | 7 | [formatters] 8 | keys=generic, access 9 | 10 | [logger_root] 11 | level=INFO 12 | handlers=console 13 | 14 | [logger_gunicorn.error] 15 | level=INFO 16 | handlers=error_file 17 | propagate=1 18 | 
qualname=gunicorn.error 19 | 20 | [logger_gunicorn.access] 21 | level=INFO 22 | handlers=access_file 23 | propagate=0 24 | qualname=gunicorn.access 25 | 26 | [handler_console] 27 | class=StreamHandler 28 | formatter=generic 29 | args=(sys.stdout, ) 30 | 31 | [handler_error_file] 32 | class=logging.FileHandler 33 | formatter=generic 34 | args=('/var/log/gunicorn/error.log',) 35 | 36 | [handler_access_file] 37 | class=logging.FileHandler 38 | formatter=access 39 | args=('/var/log/gunicorn/access.log',) 40 | 41 | [formatter_generic] 42 | format=%(asctime)s [%(process)d] [%(levelname)s] %(message)s 43 | datefmt=%Y-%m-%d %H:%M:%S 44 | class=logging.Formatter 45 | 46 | [formatter_access] 47 | format=%(message)s 48 | class=logging.Formatter 49 | -------------------------------------------------------------------------------- /api/french/main.py: -------------------------------------------------------------------------------- 1 | from contextlib import asynccontextmanager 2 | from collections.abc import AsyncGenerator 3 | from pathlib import Path 4 | 5 | from fastapi import FastAPI 6 | from fastapi.responses import HTMLResponse 7 | from fastapi.staticfiles import StaticFiles 8 | from pymongo import MongoClient 9 | 10 | from db.config import config 11 | from endpoints.outlet_stats import outlet_router 12 | 13 | # Constants 14 | HOST = config["MONGO_HOST"] 15 | PORT = config["MONGO_PORT"] 16 | MONGO_ARGS = config["MONGO_ARGS"] 17 | DB = config["DB_NAME"] 18 | STATIC_PATH = "rdp" 19 | STATIC_HTML = "tracker.html" 20 | 21 | 22 | @asynccontextmanager 23 | async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]: 24 | """Async context manager for MongoDB connection.""" 25 | app.mongodb_client = MongoClient(HOST, PORT, **MONGO_ARGS) 26 | app.connection = app.mongodb_client[DB] 27 | print("Successfully connected to MongoDB") 28 | yield 29 | app.mongodb_client.close() 30 | print("Successfully closed MongoDB connection") 31 | 32 | 33 | app = FastAPI( 34 | title="Radar de Parité", 35 | description="RESTful API for the Radar de Parité public-facing dashboard", 36 | version="1.1.4", 37 | lifespan=lifespan, 38 | ) 39 | 40 | 41 | @app.get("/", include_in_schema=False) 42 | async def root() -> HTMLResponse: 43 | with open(Path(f"{STATIC_PATH}") / STATIC_HTML, "r") as f: 44 | html_content = f.read() 45 | return HTMLResponse(content=html_content, media_type="text/html") 46 | 47 | 48 | # Attach routes 49 | app.include_router(outlet_router, prefix="/femmesExpertes", tags=["info"]) 50 | # Add additional routers here for future endpoints 51 | # ... 
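# A hypothetical example (the router name below is illustrative only) of how a future
# endpoint module could be attached, mirroring the `outlet_router` call above:
# app.include_router(another_router, prefix="/anotherFeature", tags=["info"])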
52 | 53 | # Serve static files for front end from directory specified as STATIC_PATH 54 | app.mount("/", StaticFiles(directory=STATIC_PATH), name="static") 55 | 56 | 57 | if __name__ == "__main__": 58 | import uvicorn 59 | uvicorn.run("main:app", host="0.0.0.0", port=8000, loop="uvloop", reload=True) 60 | -------------------------------------------------------------------------------- /api/french/schemas/stats_by_date.py: -------------------------------------------------------------------------------- 1 | from math import isnan 2 | from typing import List 3 | 4 | from pydantic import BaseModel, Field, root_validator 5 | 6 | 7 | def valid_percentage(_, values): 8 | """Avoid NaNs by setting them to 0.0""" 9 | for key in ["perFemales", "perMales", "perUnknowns"]: 10 | if isnan(values[key]): 11 | values[key] = 0.0 12 | return values 13 | 14 | 15 | class OutletStatsByDate(BaseModel): 16 | # In Pydantic, the underscore prefix of a field like `_id` is treated as a private attribute 17 | # We thus define an alias so that the `_id` field can be referenced as is. 18 | id: str = Field(alias="_id") 19 | totalArticles: int 20 | totalFemales: int 21 | totalMales: int 22 | totalUnknowns: int 23 | totalGenders: int 24 | perFemales: float 25 | perMales: float 26 | perUnknowns: float 27 | perArticles: float 28 | 29 | # validators 30 | _avoid_nans = root_validator(allow_reuse=True)(valid_percentage) 31 | 32 | 33 | class TotalStatsByDate(BaseModel): 34 | totalArticles: int 35 | totalFemales: int 36 | totalMales: int 37 | totalUnknowns: int 38 | totalGenders: int 39 | perFemales: float 40 | perMales: float 41 | perUnknowns: float 42 | sources: List[OutletStatsByDate] 43 | 44 | # validators 45 | _avoid_nans = root_validator(allow_reuse=True)(valid_percentage) 46 | 47 | -------------------------------------------------------------------------------- /api/french/schemas/stats_weekly.py: -------------------------------------------------------------------------------- 1 | from datetime import date, datetime 2 | from math import isnan 3 | from typing import Dict, List 4 | 5 | from pydantic import BaseModel, root_validator, validator 6 | 7 | 8 | class OutletStatsByWeek(BaseModel): 9 | w_begin: date 10 | w_end: date 11 | perFemales: float 12 | perMales: float 13 | perUnknowns: float 14 | 15 | # validation 16 | @validator("w_begin", "w_end", pre=True, always=True) 17 | def valid_date(dateval): 18 | """Validate a date string to be of the format yyyy-mm-dd""" 19 | if isinstance(dateval, str): 20 | return datetime.strptime(dateval, "%Y-%m-%d").strftime("%Y-%m-%d") 21 | return dateval 22 | 23 | @root_validator 24 | def _valid_percentage(cls, values): 25 | """Avoid NaNs by setting them to 0.0""" 26 | for key in ["perFemales", "perMales", "perUnknowns"]: 27 | if isnan(values[key]): 28 | values[key] = 0.0 29 | return values 30 | 31 | 32 | class TotalStatsByWeek(BaseModel): 33 | outlets: Dict[str, List[OutletStatsByWeek]] 34 | 35 | class Config: 36 | schema_extra = { 37 | "example": { 38 | "outlets": { 39 | "Outlet 1": [ 40 | { 41 | "w_begin": "2021-12-26", 42 | "w_end": "2022-01-01", 43 | "perFemales": 0.3915470494417863, 44 | "perMales": 0.6052631578947368, 45 | "perUnknowns": 0.003189792663476874, 46 | }, 47 | { 48 | "w_begin": "2022-01-02", 49 | "w_end": "2022-01-08", 50 | "perFemales": 0.39904862579281186, 51 | "perMales": 0.6004228329809725, 52 | "perUnknowns": 0.0005285412262156448, 53 | }, 54 | ], 55 | "Outlet 2": [ 56 | { 57 | "w_begin": "2021-12-26", 58 | "w_end": "2022-01-01", 59 | "perFemales": 0.34763636363636363, 
60 | "perMales": 0.648, 61 | "perUnknowns": 0.004363636363636364, 62 | }, 63 | { 64 | "w_begin": "2022-01-02", 65 | "w_end": "2022-01-08", 66 | "perFemales": 0.0, 67 | "perMales": 0.0, 68 | "perUnknowns": 0.0, 69 | }, 70 | ], 71 | } 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /api/french/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sfu-discourse-lab/GenderGapTracker/5501de31e8598e18ace47982220e91961ca5460a/api/french/tests/__init__.py -------------------------------------------------------------------------------- /api/french/tests/test_mock_outlet_stats.py: -------------------------------------------------------------------------------- 1 | from fastapi.testclient import TestClient 2 | 3 | from french.main import app 4 | from endpoints.outlet_stats import ID_MAPPING 5 | 6 | PREFIX = "femmesExpertes" 7 | 8 | 9 | def test_get_info_by_date(): 10 | with TestClient(app) as client: 11 | begin = "2021-09-29" 12 | end = "2021-09-30" 13 | response = client.get(f"/{PREFIX}/info_by_date?begin={begin}&end={end}") 14 | assert response.status_code == 200 15 | body = response.json() 16 | # Ensure there are no NaN values due to DivisionByZero when no sources exist 17 | assert body.get("perFemales") >= 0 18 | assert body.get("perMales") >= 0 19 | assert body.get("perUnknowns") >= 0 20 | assert isinstance(body.get("sources"), list) 21 | for obj in body.get("sources"): 22 | assert isinstance(obj.get("_id"), str) 23 | assert obj.get("perFemales") >= 0 24 | assert obj.get("perMales") >= 0 25 | assert obj.get("perUnknowns") >= 0 26 | 27 | 28 | def test_get_info_outlet_name_mapping_in_list(): 29 | with TestClient(app) as client: 30 | begin = "2021-09-29" 31 | end = "2021-09-30" 32 | response = client.get(f"/{PREFIX}/info_by_date?begin={begin}&end={end}") 33 | outlet_list = [item.get("_id") for item in response.json().get("sources")] 34 | for outlet in ID_MAPPING: 35 | assert ID_MAPPING[outlet] in outlet_list 36 | 37 | 38 | def test_weekly_info_outlet_name_mapping_in_list(): 39 | with TestClient(app) as client: 40 | begin = "2021-09-29" 41 | end = "2021-09-30" 42 | response = client.get(f"/{PREFIX}/weekly_info?begin={begin}&end={end}") 43 | outlet_list = [k for k, _ in response.json().get("outlets").items()] 44 | for outlet in ID_MAPPING: 45 | assert ID_MAPPING[outlet] in outlet_list -------------------------------------------------------------------------------- /api/french/tests/test_outlet_stats.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | from fastapi.testclient import TestClient 3 | 4 | from french.main import app 5 | from endpoints.outlet_stats import LOWER_BOUND_START_DATE 6 | 7 | PREFIX = "femmesExpertes" 8 | 9 | 10 | def test_read_main(): 11 | with TestClient(app) as client: 12 | response = client.get("/") 13 | assert response.status_code == 200 14 | 15 | 16 | def test_get_info_by_date(): 17 | with TestClient(app) as client: 18 | # Choose a date range that is in the recent past 19 | begin = datetime.today().date() - timedelta(days=7) 20 | end = datetime.today().date() - timedelta(days=3) 21 | response = client.get(f"/{PREFIX}/info_by_date?begin={begin}&end={end}") 22 | assert response.status_code == 200 23 | body = response.json() 24 | assert body.get("perFemales") >= 0 25 | assert body.get("perMales") >= 0 26 | assert body.get("perUnknowns") >= 0 27 | assert 
isinstance(body.get("sources"), list) 28 | for obj in body.get("sources"): 29 | assert isinstance(obj.get("_id"), str) 30 | assert obj.get("perFemales") >= 0 31 | assert obj.get("perMales") >= 0 32 | assert obj.get("perUnknowns") >= 0 33 | 34 | 35 | def test_get_info_by_date_invalid_date_range(): 36 | with TestClient(app) as client: 37 | lower_bound_date = datetime.fromisoformat(LOWER_BOUND_START_DATE).date() 38 | past = lower_bound_date - timedelta(days=2) 39 | response = client.get(f"/{PREFIX}/info_by_date?begin={past}&end={lower_bound_date}") 40 | assert ( 41 | response.status_code == 416 42 | ), "French articles only start on LOWER_BOUND_START_DATE, so the start date should be on or after it" 43 | today = datetime.today().date() 44 | future = today + timedelta(days=2) 45 | response = client.get(f"/{PREFIX}/info_by_date?begin={today}&end={future}") 46 | assert response.status_code == 416, "Cannot request stats for dates in the future" 47 | 48 | 49 | def test_get_weekly_info(): 50 | with TestClient(app) as client: 51 | # Choose a date range that is in the recent past 52 | begin = datetime.today().date() - timedelta(days=7) 53 | end = datetime.today().date() - timedelta(days=3) 54 | response = client.get(f"/{PREFIX}/weekly_info?begin={begin}&end={end}") 55 | assert response.status_code == 200 56 | body = response.json().get("outlets") 57 | assert len(body) > 0 58 | for _, stats in body.items(): 59 | for week_id in stats: 60 | assert isinstance(week_id.get("w_begin"), str) 61 | assert isinstance(week_id.get("w_end"), str) 62 | assert week_id.get("perFemales") >= 0 63 | assert week_id.get("perMales") >= 0 64 | assert week_id.get("perUnknowns") >= 0 65 | 66 | 67 | def test_get_weekly_info_invalid_date_range(): 68 | with TestClient(app) as client: 69 | lower_bound_date = datetime.fromisoformat(LOWER_BOUND_START_DATE).date() 70 | past = lower_bound_date - timedelta(days=2) 71 | response = client.get(f"/{PREFIX}/weekly_info?begin={past}&end={lower_bound_date}") 72 | assert ( 73 | response.status_code == 416 74 | ), "French articles only start on LOWER_BOUND_START_DATE, so the start date should be on or after it" 75 | today = datetime.today().date() 76 | future = today + timedelta(days=2) 77 | response = client.get(f"/{PREFIX}/weekly_info?begin={today}&end={future}") 78 | assert response.status_code == 416, "Cannot request stats for dates in the future" -------------------------------------------------------------------------------- /api/french/utils/dateutils.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | 4 | def is_valid_date_range(start_date: str, end_date: str, lower_bound: str) -> bool: 5 | tomorrow = datetime.today() + timedelta(days=1) 6 | if (tomorrow >= convert_date(end_date)) and ( 7 | convert_date(start_date) >= convert_date(lower_bound) 8 | ): 9 | return True 10 | else: 11 | return False 12 | 13 | 14 | def convert_date(date_str: str) -> datetime: 15 | return datetime.strptime(date_str, "%Y-%m-%d") 16 | 17 | 18 | def get_week_bound(year: int, week: int, day_of_week: int) -> str: 19 | """ 20 | Get begin or end date for a week of the year as a string YYYY-MM-DD 21 | - Start of week is Sunday 22 | - For start of week, set `day_of_week` to 0 23 | - For end of week, set `day_of_week` to 6 24 | """ 25 | w_bound = datetime.strptime(f"{year}-{week}-{day_of_week}", "%Y-%U-%w") 26 | w_bound_str = w_bound.strftime("%Y-%m-%d") 27 | return w_bound_str 28 | --------------------------------------------------------------------------------
/api/french/utils/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sys 3 | from logging.handlers import TimedRotatingFileHandler 4 | from pathlib import Path 5 | 6 | 7 | def get_logger(filename: str = "g-tracker-fastapi") -> logging.Logger: 8 | filename = f"{filename}.log" if not filename.endswith(".log") else filename 9 | Path("logs").mkdir(parents=True, exist_ok=True) 10 | log = logging.getLogger(filename) 11 | log.setLevel(logging.INFO) 12 | format = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s") 13 | rotateHandler = TimedRotatingFileHandler( 14 | Path("logs") / filename, 15 | when="midnight", 16 | backupCount=3, 17 | ) 18 | rotateHandler.setFormatter(format) 19 | stream = logging.StreamHandler(sys.stdout) 20 | stream.setFormatter(format) 21 | log.addHandler(rotateHandler) 22 | log.addHandler(stream) 23 | return log 24 | -------------------------------------------------------------------------------- /api/requirements.txt: -------------------------------------------------------------------------------- 1 | requests>=2.28.2 2 | pandas>=1.5.3,<1.6.0 3 | pymongo<4.0.0 4 | pydantic<2.0.0 5 | httpx>=0.23.0, <0.24.0 6 | fastapi>=0.94.0,<0.95.0 7 | gunicorn>=20.1.0,<20.2.0 8 | uvicorn>=0.20.0,<0.21.0 9 | uvloop==0.17.0 10 | pytest>=7.2.1 -------------------------------------------------------------------------------- /nlp/english/config.py: -------------------------------------------------------------------------------- 1 | host = ["mongo0", "mongo1", "mongo2"] 2 | # host = "localhost" 3 | prefix = "." if (host == "localhost") else "/path_to_code/GenderGapTracker/nlp/english" 4 | 5 | config = { 6 | "MONGO_ARGS": { 7 | "host": host, 8 | "port": 27017, 9 | "username": "username", 10 | "password": "password", 11 | "authSource": "admin", 12 | "readPreference": "nearest" 13 | }, 14 | "GENDER_RECOGNITION": { 15 | "GENDERIZE_ENABLED": False, 16 | "GENDERAPI_ENABLED": True, 17 | "GENDERAPI_TOKEN": "JSON_AUTH_TOKEN", 18 | "MANUAL_CACHE": "manual", 19 | "GENDERAPI_CACHE": "genderAPICleaned", 20 | "GENDERIZE_CACHE": "genderizeCleaned", 21 | "FIRSTNAME_CACHE": "firstNamesCleaned", 22 | }, 23 | "NLP": { 24 | "MAX_BODY_LENGTH": 20000, 25 | "AUTHOR_BLOCKLIST": f"{prefix}/rules/author_blocklist.txt", 26 | "NAME_PATTERNS": f"{prefix}/rules/name_patterns.jsonl", 27 | "QUOTE_VERBS": f"{prefix}/rules/quote_verb_list.txt" 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /nlp/english/evaluation/README.md: -------------------------------------------------------------------------------- 1 | # English GGT Evaluation 2 | This folder contains methodology and code for evaluating the results of the English pipeline. 3 | 4 | For consistent and reproducible results, make sure any evaluation run locally uses the **same Python environment that is running in production**. 5 | 6 | ## Download Data 7 | The raw text data containing news article text, as well as the human-annotated data, is made available upon request (please contact Maite Taboada at [mtaboada@sfu.ca](mailto:mtaboada@sfu.ca)). 8 | Obtain the directories named `humanAnnotations` and `rawtext` and place them in their respective paths as per the structure below. 9 | 10 | ```sh 11 | ├── . 
12 | | ├── src 13 | | | ├── rawtexts 14 | | | ├── eval 15 | | | | └── humanAnnotations 16 | | | └── evaluate.py 17 | | | └── run_predictions.py 18 | ``` 19 | 20 | ## Set Up Environment 21 | This section assumes that the English NLP environment in `../nlp/english` has already been set up, as the evaluation scripts depend on the English NLP modules, specifically the entity gender annotator for NER and coreference resolution. **Just like the English NLP pipeline**, these scripts require Python 3.6 for legacy reasons -- they use spaCy 2.1.3 and `neuralcoref` for coreference resolution, which, unfortunately, cannot be installed on newer versions of spaCy or Python. 22 | 23 | 24 | If not done already, create a virtual environment using the `requirements.txt` from the `../nlp/english` directory in this repo. 25 | 26 | ```sh 27 | cd /path_to_code/GenderGapTracker/nlp/english 28 | python3 -m venv GRIM-EN # python3 -> python3.6 for legacy reasons (neuralcoref) 29 | source GRIM-EN/bin/activate 30 | python3 -m pip install -U pip wheel # Upgrade pip and install latest wheel package first 31 | python3 -m pip install -r requirements.txt 32 | ``` 33 | 34 | #### `spaCy` language model 35 | **First, make sure that the spaCy version shown in `requirements.txt` is the same as the one running in production**. 36 | 37 | Manually download spaCy's large English language model for the quote extraction pipeline - this is a one-time step for this specific virtual environment. 38 | ```sh 39 | python3 -m spacy download en_core_web_lg 40 | ``` 41 | -------------------------------------------------------------------------------- /nlp/english/evaluation/src/README.md: -------------------------------------------------------------------------------- 1 | # Extracting quotes, named entities and gender 2 | 3 | This directory stores the scripts and methodology used to evaluate the quote extraction, named entity identification and gender annotation performed by the English NLP pipeline. 4 | 5 | ## Prerequisite: Obtain ssh tunnel to the MongoDB database 6 | To run these scripts locally, first set up an ssh tunnel that forwards the database connection to the local machine. This step is essential to complete the evaluation because we host a gender lookup cache on our database, which allows us to retrieve existing names and their associated genders. 7 | 8 | Set up the database tunnel on a Unix shell as follows. In the example below, `vm12` is the primary database on which the gender cache is hosted. We simply forward the connection from port 27017 on the remote database to the same port on our local machine. 9 | 10 | ```sh 11 | ssh vm12 -f -N -L 27017:localhost:27017 12 | ``` 13 | 14 | In case database connectivity is not possible, the gender service can be rewritten to only perform name-based lookups via external gender APIs. However, in such a case, the results might vary from those shown below. 15 | ## 1. Produce the annotations 16 | Before evaluating the annotations made by the system, you'll need to produce those annotations. The gender annotation pipeline can be broken down into two successive steps: 17 | - Quote Extraction 18 | - Entity Gender Annotation 19 | 20 | The entity gender annotation step takes the output of the quote extraction step as input.
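As a rough illustration of that dependency between the two steps, the snippet below mirrors the calls made in `run_predictions.py` (shown later in this directory). It is a sketch, not a standalone script: it assumes the spaCy `nlp` pipeline, `quote_extractor`, `entity_gender_annotator` and Mongo `db_client` objects that `run_predictions.py` constructs.

```python
# Sketch only: the quote extractor runs on a spaCy Doc, and its output is then
# fed to the entity gender annotator, as in process_chunks() of run_predictions.py.
doc = nlp(text)                               # spaCy Doc for one raw article text
quotes = quote_extractor.extract_quotes(doc)  # step 1: quote extraction
annotation = entity_gender_annotator.run(     # step 2: entity gender annotation
    db_client, text, [], quotes, []
)
```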
21 | In order to evaluate the performance of each part the pipeline individually, ```run_predictions.py``` can run each part of the pipeline by using the fully accurate input for each step (which is why the target annotations must be passed to the script). 22 | It can also run the whole NLP pipeline. 23 | 24 | ### Optional Arguments 25 | ```sh 26 | python3 run_predictions.py --help 27 | usage: run_predictions.py [-h] [--in_dir IN_DIR] [--out_dir OUT_DIR] [--target_dir TARGET_DIR] [--quote_extraction] [--gender_annotation] [--all] [--spacy_model SPACY_MODEL] [--poolsize POOLSIZE] [--chunksize CHUNKSIZE] 28 | 29 | Evaluation of all the steps of the gender annotation pipeline 30 | 31 | optional arguments: 32 | -h, --help show this help message and exit 33 | --in_dir IN_DIR Path to read input text files from this directory. 34 | --out_dir OUT_DIR Path to dir to output all predictions 35 | --target_dir TARGET_DIR 36 | Path to json target files. Serve as anchor for intermediate steps of the pipeline. 37 | --quote_extraction run quote extractor on text input files 38 | --gender_annotation run whole the whole pipeline on text on text input files 39 | --all compute all metrics 40 | --spacy_model SPACY_MODEL 41 | spacy language model 42 | --poolsize POOLSIZE Size of the concurrent process pool for the given task 43 | --chunksize CHUNKSIZE 44 | Number of articles per chunk being processed concurrently 45 | ``` 46 | 47 | ### Example run command 48 | For V7.0, this is the command used to generate all the needed outputs. 49 | ```sh 50 | python3 run_predictions.py --in_dir ./rawtexts/ --target_dir ./eval/humanAnnotations/ --out_dir ./eval/systemAnnotations/V7.0/ --all 51 | ``` 52 | This dumps out 98 JSON files containing the respective system output in each of these directories : `./eval/systemAnnotations/V7.0/quotes/extracted_quotes`, `./eval/systemAnnotations/V7.0/gender_annotation/entire_pipeline` 53 | 54 | ## 2. Get the metrics 55 | 56 | The script `evaluate.py` must be run after the script `run_predictions.py` has been run. 57 | It is only possible to get the metrics for the predictions that have already been run (for instance, do not specify --gender_annotation in `evaluate.py` if this argument was not specified in `run_predictions.py`) 58 | 59 | For more details regarding the way the metrics are computed, see the readme in the `./eval` directory. 60 | 61 | 62 | ### Optional Arguments 63 | ```sh 64 | python3 evaluate.py --help 65 | usage: evaluate.py [-h] [--target_dir TARGET_DIR] [--pred_dir PRED_DIR] [--quote_extraction] [--gender_annotation] [--gender_ratio] [--all] 66 | 67 | evaluation of all the steps of the gender annotation pipeline 68 | 69 | optional arguments: 70 | -h, --help show this help message and exit 71 | --target_dir TARGET_DIR 72 | Path to read input text files from this directory. 73 | --pred_dir PRED_DIR Path to write JSON quotes to this directory. 74 | --quote_extraction compute metrics on the quote extractor output 75 | --gender_annotation compute metrics on the gender annotator on the whole pipeline 76 | --gender_ratio compare overall gender ratios between target and output of whole pipeline 77 | --all compute all metrics 78 | ``` 79 | 80 | ### Example run command 81 | For V7.0, this is the command used to display the metrics for all parts of the pipeline 82 | ```sh 83 | python3 evaluate.py --target_dir eval/humanAnnotations/ --pred_dir eval/systemAnnotations/V7.0/ --all 84 | ``` 85 | Our latest (best) evaluation produced the metrics shown below. 
86 | 87 | ``` 88 | Quote Extraction 89 | ---------------------------------------- 90 | Precision (%) Recall (%) F1-Score (%) Accuracy (%) 91 | Quotes: 0.3 84.647 82.719 83.672 - 92 | Speaker match: 0.3 - - - 86.478 93 | Verb match: 0.3 - - - 92.065 94 | Quotes: 0.8 76.971 75.218 76.084 - 95 | Speaker match: 0.8 - - - 87.444 96 | Verb match: 0.8 - - - 93.321 97 | Speakers (indep): 80.672 97.595 88.33 - 98 | Verbs (indep): 83.027 88.11 85.493 - 99 | 100 | 101 | Gender Annotation 102 | ---------------------------------------- 103 | Precision (%) Recall (%) F1-Score (%) 104 | peopleFemale 71.939 77.049 74.406 105 | peopleMale 78.361 92.278 84.752 106 | peopleUnknown N/A 0.0 N/A 107 | sourcesFemale 94.643 64.634 76.812 108 | sourcesMale 87.805 76.923 82.005 109 | sourcesUnknown N/A 0.0 N/A 110 | 111 | 112 | Gender Ratio: People 113 | ---------------------------------------- 114 | Male Female Unknown 115 | Human annotations 0.738 0.261 0.001 116 | System V7.0 0.758 0.242 0.0 117 | 118 | 119 | 120 | Gender Ratio: Sources 121 | ---------------------------------------- 122 | Male Female Unknown 123 | Human annotations 0.738 0.259 0.003 124 | System V7.0 0.785 0.215 0.0 125 | ``` 126 | -------------------------------------------------------------------------------- /nlp/english/evaluation/src/run_predictions.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | import json 5 | from pathlib import Path 6 | from multiprocessing import Pool, cpu_count 7 | from pathlib import Path 8 | import requests 9 | import spacy 10 | from spacy.pipeline import EntityRuler 11 | import neuralcoref 12 | from tqdm import tqdm 13 | 14 | sys.path.insert(1, os.path.realpath(Path(__file__).resolve().parents[2])) 15 | 16 | from quote_extractor import QuoteExtractor 17 | from entity_gender_annotator import EntityGenderAnnotator 18 | from config import config 19 | import utils 20 | """ 21 | Runs several predictions on the annotated data 22 | This script must be run before evaluate.py 23 | """ 24 | 25 | 26 | def get_rawtexts_from_file(filename): 27 | with open(filename, "r") as f: 28 | return f.read() 29 | 30 | 31 | def get_data_from_json(filename): 32 | with open(filename, "r") as f: 33 | return json.load(f) 34 | 35 | 36 | def dir_empty(dir_path): 37 | return not next(os.scandir(dir_path), None) 38 | 39 | 40 | def chunker(iterable, chunksize): 41 | """Yield a smaller chunk of a large iterable""" 42 | for i in range(0, len(iterable), chunksize): 43 | yield iterable[i: i + chunksize] 44 | 45 | 46 | def process_chunks(chunk): 47 | db_client = utils.init_client(config["MONGO_ARGS"]) 48 | for idx in chunk: 49 | rawtext = get_rawtexts_from_file(Path(IN_DIR) / f"{idx}.txt") 50 | text = utils.preprocess_text(rawtext) 51 | doc = nlp(text) 52 | if QUOTE_EXTRACTION: 53 | pred_extracted_quotes = quote_extractor.extract_quotes(doc) 54 | json.dump( 55 | pred_extracted_quotes, 56 | open(os.path.join(extracted_quotes_dir, idx + ".json"), "w"), 57 | ) 58 | print(f"Processed quotes for {idx}") 59 | if GENDER_ANNOTATION: 60 | pred_extracted_quotes = quote_extractor.extract_quotes(doc) 61 | json.dump( 62 | pred_extracted_quotes, 63 | open(os.path.join(extracted_quotes_dir, idx + ".json"), "w"), 64 | ) 65 | print(f"Processed quotes for {idx}") 66 | pred_annotation = entity_gender_annotator.run( 67 | db_client, text, [], pred_extracted_quotes, [] 68 | ) 69 | pred_annotation["lastModified"] = pred_annotation["lastModified"].strftime( 70 | "%m/%d/%Y, %H:%M:%S" 71 | ) 
72 | json.dump( 73 | pred_annotation, 74 | open(os.path.join(gender_annotation_dir, idx + ".json"), "w"), 75 | ) 76 | print(f"Processed entity genders for {idx}") 77 | 78 | 79 | def run_predictions(): 80 | """ 81 | Make predictions on quote extraction and entity gender annotation for comparison with gold test set 82 | """ 83 | num_files = len(common_ids) 84 | num_chunks = len(list(chunker(common_ids, chunksize=CHUNKSIZE))) 85 | print(f"Organized {num_files} files into {num_chunks} chunks for concurrent processing...") 86 | # Process files using a pool of executors 87 | with Pool(processes=POOLSIZE) as pool: 88 | for _ in tqdm(pool.imap(process_chunks, chunker(common_ids, chunksize=CHUNKSIZE)), total=num_chunks): 89 | pass 90 | 91 | 92 | if __name__ == "__main__": 93 | parser = argparse.ArgumentParser(description="Evaluation of all the steps of the gender annotation pipeline") 94 | parser.add_argument("--in_dir", type=str, default="./rawtexts/", help="Path to read input text files from this directory.") 95 | parser.add_argument("--out_dir", type=str, default="./eval/systemAnnotations/V7.0/", help="Path to dir to output all predictions") 96 | parser.add_argument("--target_dir", type=str, default="./eval/humanAnnotations/", help="Path to json target files. Serve as anchor for intermediate steps of the pipeline.") 97 | parser.add_argument('--quote_extraction', action='store_true', help="run quote extractor on text input files") 98 | parser.add_argument('--gender_annotation', action='store_true', help="run whole the whole pipeline on text on text input files") 99 | parser.add_argument('--all', action='store_true', help="compute all metrics") 100 | parser.add_argument('--spacy_model', type=str, default="en_core_web_lg", help="spacy language model") 101 | parser.add_argument("--poolsize", type=int, default=cpu_count(), help="Size of the concurrent process pool for the given task") 102 | parser.add_argument("--chunksize", type=int, default=5, help="Number of articles per chunk being processed concurrently") 103 | args = vars(parser.parse_args()) 104 | IN_DIR = args["in_dir"] 105 | TARGET_DIR = args["target_dir"] 106 | PRED_DIR = args["out_dir"] 107 | QUOTE_EXTRACTION = args["quote_extraction"] 108 | GENDER_ANNOTATION = args["gender_annotation"] 109 | POOLSIZE = args["poolsize"] 110 | CHUNKSIZE = args["chunksize"] 111 | if args["all"]: 112 | QUOTE_EXTRACTION = False # No need to run quote extraction if we're running the whole pipeline 113 | GENDER_ANNOTATION = True 114 | 115 | config["NLP"]["QUOTE_VERBS"] = "../../rules/quote_verb_list.txt" 116 | config["NLP"]["AUTHOR_BLOCKLIST"] = "../../rules/author_blocklist.txt" 117 | config["NLP"]["NAME_PATTERNS"] = "../../rules/name_patterns.jsonl" 118 | config["MONGO_ARGS"]["host"] = "localhost" 119 | # Load spaCy language model and attach custom entity ruler and coreferee pipes downstream 120 | print(f"Loading spaCy language model: {args['spacy_model']}...") 121 | nlp = spacy.load(args["spacy_model"]) 122 | # Add custom named entity rules for non-standard person names that spaCy doesn't automatically identify 123 | ruler = EntityRuler(nlp, overwrite_ents=True).from_disk( 124 | config["NLP"]["NAME_PATTERNS"] 125 | ) 126 | nlp.add_pipe(ruler) 127 | coref = neuralcoref.NeuralCoref(nlp.vocab, max_dist=200) 128 | nlp.add_pipe(coref, name="neuralcoref") 129 | print("Finished loading") 130 | 131 | args["spacy_lang"] = nlp 132 | session = requests.Session() 133 | args["session"] = session 134 | config = {**args, **config} 135 | 136 | quote_extractor = 
QuoteExtractor(config) 137 | entity_gender_annotator = EntityGenderAnnotator(config) 138 | 139 | txt_files = [f for f in Path(IN_DIR).glob("*.txt")] 140 | target_files = [f for f in Path(TARGET_DIR).glob("*.json")] 141 | common_ids = list(set([p.stem for p in txt_files]) & set([p.stem for p in target_files])) 142 | 143 | extracted_quotes_dir = os.path.join(PRED_DIR, "quotes", "extracted_quotes") 144 | os.makedirs(extracted_quotes_dir, exist_ok=True) 145 | gender_annotation_dir = os.path.join( 146 | PRED_DIR, "gender_annotation", "entire_pipeline" 147 | ) 148 | os.makedirs(gender_annotation_dir, exist_ok=True) 149 | run_predictions() 150 | -------------------------------------------------------------------------------- /nlp/english/img/concurrent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sfu-discourse-lab/GenderGapTracker/5501de31e8598e18ace47982220e91961ca5460a/nlp/english/img/concurrent.png -------------------------------------------------------------------------------- /nlp/english/merge_collections.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script merges the results from the newly created collection from the entity 3 | gender annotation script (when the user specifies the `writecol` argument) with 4 | the original collection. 5 | 6 | Only the fields specified in this file are merged with (i.e. overwrite) the original 7 | `media` collection - the remaining fields in the original collection are left untouched. 8 | """ 9 | import argparse 10 | from multiprocessing import Pool, cpu_count 11 | from config import config 12 | import utils 13 | 14 | 15 | def update_field(existing_collection, new_collection, idx): 16 | """Overwrite existing collection's fields with new collection's fields (except IDs)""" 17 | new_id = idx['_id'] 18 | existing_id = idx['currentId'] 19 | doc = new_collection.find_one({'_id': new_id}, no_cursor_timeout=True) 20 | existing_collection.update( 21 | {'_id': existing_id}, 22 | {'$set': filter_dict(doc)} 23 | ) 24 | 25 | 26 | def filter_dict(dict_obj): 27 | """Return a dictionary that has the same keys/values as the original dictionary, 28 | except for a few select keys that are to be excluded. 
29 | """ 30 | ignore_keys = ['_id', 'currentId'] 31 | new_dict = {key: dict_obj[key] for key in dict_obj if key not in ignore_keys} 32 | return new_dict 33 | 34 | 35 | def chunker(iterable, chunksize): 36 | """Yield a smaller chunk of a large iterable""" 37 | for i in range(0, len(iterable), chunksize): 38 | yield iterable[i:i + chunksize] 39 | 40 | 41 | def parse_chunks(chunk): 42 | """Pass through a chunk of document IDs and update fields""" 43 | db_client = utils.init_client(MONGO_ARGS) 44 | existing_collection = db_client[DB_NAME][EXISTING_COL] 45 | new_collection = db_client[DB_NAME][NEW_COL] 46 | for idx in chunk: 47 | update_field(existing_collection, new_collection, idx) 48 | 49 | 50 | def run_pool(poolsize, chunksize): 51 | """Concurrently run independent operations on multiple cores""" 52 | db_client = utils.init_client(MONGO_ARGS) 53 | # Get list of new and old IDs from new collection 54 | new_col = db_client[DB_NAME][NEW_COL] 55 | new_old_ids = list(new_col.find({}, {'_id': 1, 'currentId': 1})) 56 | print('Obtained ID list of length {}.'.format(len(new_old_ids))) 57 | # Process quotes using a pool of executors 58 | pool = Pool(processes=poolsize) 59 | pool.map(parse_chunks, chunker(new_old_ids, chunksize=chunksize)) 60 | pool.close() 61 | 62 | 63 | if __name__ == '__main__': 64 | parser = argparse.ArgumentParser() 65 | parser.add_argument('--db', type=str, default='mediaTracker', help="Database name") 66 | parser.add_argument('--oldcol', type=str, default='media', help="Existing collection name") 67 | parser.add_argument('--newcol', type=str, default='entitiesAnnotated', help="New collection name") 68 | parser.add_argument("--poolsize", type=int, default=cpu_count() + 1, help="Size of the concurrent process pool for the given task") 69 | parser.add_argument("--chunksize", type=int, default=100, help="Number of articles IDs per chunk being processed concurrently") 70 | args = vars(parser.parse_args()) 71 | 72 | # From config 73 | MONGO_ARGS = config['MONGO_ARGS'] 74 | # Parse arguments 75 | DB_NAME = args['db'] 76 | EXISTING_COL = args['oldcol'] 77 | NEW_COL = args['newcol'] 78 | poolsize = args['poolsize'] 79 | chunksize = args['chunksize'] 80 | 81 | run_pool(poolsize, chunksize) 82 | print("Finished merging collections!") 83 | -------------------------------------------------------------------------------- /nlp/english/requirements.txt: -------------------------------------------------------------------------------- 1 | requests>=2.27.1 2 | pandas>=1.1.5 3 | spacy==2.1.3 4 | neuralcoref==4.0 5 | pymongo>=3.10.0,<4.0.0 6 | dash==2.15.0 7 | dash_bootstrap_components==1.2.1 8 | dash_auth==1.4.1 9 | statsmodels>=0.12.2 -------------------------------------------------------------------------------- /nlp/english/rules/author_blocklist.txt: -------------------------------------------------------------------------------- 1 | Cbc 2 | Ctv 3 | News 4 | Associated 5 | Afp 6 | Radio 7 | Reuters 8 | Bloomberg 9 | Canada 10 | Canadian 11 | Média 12 | Follow 13 | Twitter 14 | Agence 15 | Http 16 | Https 17 | National 18 | Online 19 | Journalist 20 | Staff 21 | Reporter 22 | Report 23 | Reporting 24 | Washington 25 | Starmetro 26 | Thestar 27 | Vancouver 28 | Times 29 | Bureau 30 | Tribune 31 | Sports 32 | Presse 33 | Canadienne 34 | Special 35 | Edmonton 36 | Calgary 37 | Halifax 38 | Vancouver 39 | Ottawa 40 | Breaking 41 | Opens 42 | Hours 43 | Newsletter 44 | Columnist 45 | Digital 46 | Www.Facebook.Com 47 | Facebook 48 | Photo 49 | Photography 50 | Video 51 | Share 52 | Getty 53 | Images 
54 | Pages 55 | File 56 | Studio 57 | TV 58 | Tva 59 | cbc 60 | ctv 61 | Business 62 | University 63 | Now 64 | Movies 65 | Games 66 | Pictures 67 | Classics 68 | Abroad 69 | Politics 70 | Covered 71 | Mail 72 | Gmail 73 | Transportation 74 | Critic 75 | Story 76 | Le Droit 77 | Le Soleil 78 | Journaliste 79 | Postmedia 80 | Day Ago 81 | Updated 82 | Remember 83 | Brother 84 | Sister 85 | Mother 86 | Father 87 | Ont. 88 | Pm 89 | Am 90 | Ap 91 | Edt 92 | Edtlast 93 | -------------------------------------------------------------------------------- /nlp/english/rules/quote_verb_list.txt: -------------------------------------------------------------------------------- 1 | accept 2 | accepted 3 | acclaim 4 | acclaimed 5 | acclaiming 6 | acclaims 7 | acknowledge 8 | acknowledged 9 | acknowledges 10 | acknowledging 11 | add 12 | added 13 | adding 14 | adds 15 | admit 16 | admits 17 | admitted 18 | admitting 19 | advise 20 | advised 21 | advises 22 | advising 23 | announce 24 | announced 25 | announces 26 | announcing 27 | argue 28 | argued 29 | argues 30 | arguing 31 | assert 32 | asserted 33 | asserting 34 | asserts 35 | assure 36 | assured 37 | assures 38 | assuring 39 | claim 40 | claimed 41 | claiming 42 | claims 43 | clarified 44 | clarifies 45 | clarify 46 | clarifying 47 | comment 48 | commented 49 | commenting 50 | comments 51 | conclude 52 | concluded 53 | concludes 54 | concluding 55 | confirm 56 | confirmed 57 | confirming 58 | confirms 59 | continue 60 | continued 61 | continues 62 | continuing 63 | convince 64 | convinced 65 | convinces 66 | convincing 67 | criticize 68 | criticized 69 | criticizes 70 | criticizing 71 | declaim 72 | declaimed 73 | declaiming 74 | declaims 75 | declare 76 | declared 77 | declares 78 | declaring 79 | decried 80 | decries 81 | decry 82 | decrying 83 | demonstrate 84 | demonstrated 85 | demonstrates 86 | demonstrating 87 | denounce 88 | denounced 89 | denounces 90 | denouncing 91 | describe 92 | described 93 | describes 94 | describing 95 | disclaim 96 | disclaimed 97 | disclaiming 98 | disclaims 99 | dispute 100 | disputed 101 | disputes 102 | disputing 103 | ensure 104 | ensured 105 | ensures 106 | ensuring 107 | estimated 108 | estimates 109 | exclaim 110 | exclaimed 111 | exclaiming 112 | exclaims 113 | explain 114 | explained 115 | explaining 116 | explains 117 | finding 118 | finds 119 | highlight 120 | highlighted 121 | highlighting 122 | highlights 123 | illustrate 124 | illustrated 125 | illustrates 126 | illustrating 127 | indicate 128 | indicated 129 | indicates 130 | indicating 131 | inform 132 | informed 133 | informing 134 | informs 135 | insist 136 | insisted 137 | insisting 138 | insists 139 | mention 140 | mentioned 141 | mentioning 142 | mentions 143 | note 144 | noted 145 | notes 146 | notified 147 | notifies 148 | notify 149 | notifying 150 | noting 151 | persist 152 | persisted 153 | persisting 154 | persists 155 | point 156 | pointed 157 | pointing 158 | points 159 | preach 160 | preached 161 | preaches 162 | preaching 163 | predict 164 | predicted 165 | predicting 166 | predicts 167 | present 168 | presenting 169 | presents 170 | proclaim 171 | proclaimed 172 | proclaiming 173 | proclaims 174 | rave 175 | raved 176 | raves 177 | raving 178 | reassert 179 | reasserted 180 | reasserting 181 | reasserts 182 | reassure 183 | reassured 184 | reassures 185 | reassuring 186 | reckon 187 | reckoned 188 | reckoning 189 | reckons 190 | reconfirm 191 | reconfirmed 192 | reconfirming 193 | reconfirms 194 | release 195 | released 196 | 
releases 197 | releasing 198 | remind 199 | reminded 200 | reminding 201 | reminds 202 | replied 203 | replies 204 | reply 205 | replying 206 | report 207 | reported 208 | reporting 209 | reports 210 | respond 211 | responded 212 | responding 213 | responds 214 | restate 215 | restated 216 | restates 217 | restating 218 | retell 219 | retelling 220 | retells 221 | retold 222 | said 223 | say 224 | saying 225 | says 226 | state 227 | stated 228 | states 229 | stating 230 | suggest 231 | suggested 232 | suggesting 233 | suggests 234 | tell 235 | telling 236 | tells 237 | told 238 | testified 239 | testifies 240 | testify 241 | testifying 242 | think 243 | thinking 244 | thinks 245 | thought 246 | tweet 247 | tweeted 248 | tweeting 249 | tweets 250 | warn 251 | warned 252 | warning 253 | warns 254 | write 255 | writes 256 | writing 257 | wrote 258 | -------------------------------------------------------------------------------- /nlp/english/topic_model/.gitignore: -------------------------------------------------------------------------------- 1 | *.json 2 | spark-topic-modeling 3 | test.csv 4 | sample.json 5 | -------------------------------------------------------------------------------- /nlp/english/topic_model/config.py: -------------------------------------------------------------------------------- 1 | config = { 2 | 'MONGO_ARGS': { 3 | 'host': ['mongo0', 'mongo1', 'mongo2'], 4 | 'port': 27017, 5 | 'username': 'username', 6 | 'password': 'password', 7 | 'authSource': 'admin', 8 | 'readPreference': 'primaryPreferred' 9 | }, 10 | 'DB': { 11 | 'READ_DB': 'mediaTracker', 12 | 'READ_COL': 'media', 13 | 'WRITE_DB': 'topicModel', 14 | 'WRITE_COL': 'topicResults' 15 | }, 16 | 'MODEL': { 17 | 'OUTLETS': [ 18 | 'National Post', 'The Globe And Mail', 'The Star', 19 | 'Global News', 'CTV News', 'CBC News' 20 | ], 21 | 'STOPWORDS': 'stopwords/stopwords.txt', 22 | 'LEMMAS': 'spacyLemmas/spacy_english_lemmas.txt' 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /nlp/english/topic_model/corpus_analysis/config.py: -------------------------------------------------------------------------------- 1 | config = { 2 | 'MONGO_ARGS': { 3 | 'host': ['mongo0', 'mongo1', 'mongo2'], 4 | 'port': 27017, 5 | 'username': 'username', 6 | 'password': 'password', 7 | 'authSource': 'admin', 8 | 'readPreference': 'primaryPreferred' 9 | }, 10 | 'DB': { 11 | 'MEDIA_DB': 'mediaTracker', 12 | 'MEDIA_COL': 'media', 13 | 'TOPIC_DB': 'topicModel', 14 | 'TOPIC_COL': 'topicResults' 15 | }, 16 | } 17 | -------------------------------------------------------------------------------- /nlp/english/topic_model/corpus_analysis/download_articles.py: -------------------------------------------------------------------------------- 1 | """ 2 | Download specific articles that are have high values of a particular topic's weights 3 | (t1, t2, etc.). Based on a user's input topic, we rank the article IDs in descending 4 | order of that topic's weights. 5 | 6 | The top 200 (or any other desired number of) article 7 | bodies are downloaded and stored to individual text files, following which we can perform 8 | keyness or other corpus-based linguistic analyses methods. 9 | """ 10 | import argparse 11 | import os 12 | from pymongo import MongoClient 13 | from bson import ObjectId 14 | import pandas as pd 15 | from config import config 16 | 17 | 18 | def make_dirs(dirpath): 19 | """ Make directories for output if they don't exist. 
""" 20 | if not os.path.exists(dirpath): 21 | os.makedirs(dirpath) 22 | 23 | 24 | def init_client(MONGO_ARGS): 25 | """ Initialize a MongoDB client. """ 26 | _db_client = MongoClient(**MONGO_ARGS) 27 | return _db_client 28 | 29 | 30 | def download_articles(root_dir, collection, doc_id_list, case='female'): 31 | """ Download a document object and export its body content to a file. 32 | """ 33 | doc_obj = [ObjectId(doc_id.strip()) for doc_id in doc_id_list] 34 | for idx in doc_obj: 35 | doc = collection.find_one( 36 | {'_id': idx}, 37 | {'_id': 1, 'body': 1}, 38 | no_cursor_timeout=True 39 | ) 40 | make_dirs(f"{root_dir}/{TOPIC}/{case}") 41 | with open(f"{root_dir}/{TOPIC}/{case}/{str(idx)}.txt", 'w') as f: 42 | f.write(doc['body']) 43 | 44 | 45 | def read_data(filepath): 46 | """ Read topic-split data from CSV """ 47 | df = pd.read_csv(filepath, header=0, parse_dates=['publishedAt'], 48 | index_col='_id') 49 | print(f"Obtained {df.shape[0]} articles in total") 50 | return df 51 | 52 | 53 | def get_gender_splitDF(df): 54 | """ Split the given Dataframe into two smaller Dataframes that each 55 | represent articles that are female or male source-dominated. 56 | """ 57 | female = df.loc[df['sourcesFemaleCount'] > df['sourcesMaleCount']] 58 | male = df.loc[df['sourcesFemaleCount'] < df['sourcesMaleCount']] 59 | print(f"Found {female.shape[0]} articles dominated by female sources.") 60 | print(f"Found {male.shape[0]} articles dominated by male sources.") 61 | return female, male 62 | 63 | 64 | def top100_per_gender_and_topic(female, male, topic): 65 | """ Collect top 100 articles sorted by topic weight for a particular 66 | topic (The topic names are t1-t15 by default in the CSV). 67 | """ 68 | t_female = female.sort_values(by=topic, ascending=False).iloc[:LIMIT, :] 69 | t_male = male.sort_values(by=topic, ascending=False).iloc[:LIMIT, :] 70 | return t_female, t_male 71 | 72 | 73 | def get_ids(filepath, topic): 74 | """ Obtain article ID lists for female/male source-dominated articles. """ 75 | df = read_data(filepath) 76 | female, male = get_gender_splitDF(df) 77 | t_female, t_male = top100_per_gender_and_topic(female, male, topic) 78 | female_ids, male_ids = list(t_female.index), list(t_male.index) 79 | return female_ids, male_ids 80 | 81 | 82 | def main(filepath, topic='t1'): 83 | """ Download articles using main pipeline """ 84 | female_ids, male_ids = get_ids(filepath, topic) 85 | client = init_client(MONGO_ARGS) 86 | collection = client[DB_NAME][COL_NAME] 87 | # Make root directory before downloading files 88 | root_dir = FILENAME.split('/')[-1].replace(".csv", "") 89 | download_articles(root_dir, collection, female_ids, case='female') 90 | download_articles(root_dir, collection, male_ids, case='male') 91 | 92 | 93 | if __name__ == "__main__": 94 | parser = argparse.ArgumentParser() 95 | parser.add_argument('--db', '-d', type=str, default='mediaTracker', help="Database name") 96 | parser.add_argument('--col', '-c', type=str, default='media', help="Existing collection name") 97 | parser.add_argument('--topic', '-t', type=str, default='t1', help="Topic (t1, t2, etc.) to extract articles for") 98 | parser.add_argument('--file', '-f', type=str, required=True, help="CSV file containing topic splits") 99 | parser.add_argument('--limit', '-l', type=int, default=200, help="Max. 
number of articles to consider") 100 | args = parser.parse_args() 101 | 102 | # Config settings 103 | MONGO_ARGS = config['MONGO_ARGS'] 104 | # Parse args 105 | DB_NAME = args.db 106 | COL_NAME = args.col 107 | TOPIC = args.topic 108 | FILENAME = args.file 109 | LIMIT = args.limit 110 | 111 | main(FILENAME, topic=TOPIC) 112 | 113 | 114 | -------------------------------------------------------------------------------- /nlp/english/topic_model/corpus_analysis/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas==1.0.3 2 | matplotlib==3.1.0 3 | seaborn==0.10.0 4 | pyspark==2.4.0 5 | wordcloud==1.6.0 6 | pymongo==3.8.0 7 | tqdm==4.32.1 8 | spacy==2.3.2 9 | corpus_toolkit==0.29 10 | -------------------------------------------------------------------------------- /nlp/english/topic_model/img/example_divergent_heatmap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sfu-discourse-lab/GenderGapTracker/5501de31e8598e18ace47982220e91961ca5460a/nlp/english/topic_model/img/example_divergent_heatmap.png -------------------------------------------------------------------------------- /nlp/english/topic_model/img/example_heatmap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sfu-discourse-lab/GenderGapTracker/5501de31e8598e18ace47982220e91961ca5460a/nlp/english/topic_model/img/example_heatmap.png -------------------------------------------------------------------------------- /nlp/english/topic_model/img/example_wordcloud.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sfu-discourse-lab/GenderGapTracker/5501de31e8598e18ace47982220e91961ca5460a/nlp/english/topic_model/img/example_wordcloud.png -------------------------------------------------------------------------------- /nlp/english/topic_model/preproc.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test script to directly pull data from Mongo database and convert to Spark DataFrame. 3 | (Not used in the pipeline) - this script is purely for testing the DB connection with Spark. 
4 | """ 5 | import argparse 6 | import datetime 7 | from pyspark.sql import SparkSession 8 | from pymongo import MongoClient 9 | from config import config 10 | 11 | 12 | def convert_date(date_str): 13 | return datetime.datetime.strptime(date_str, '%Y-%m-%d') 14 | 15 | 16 | if __name__ == "__main__": 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument("--partitions", type=int, default=100, help="Number of shuffle partitions in PySpark") 19 | parser.add_argument("--begin_date", type=str, default='2020-04-28', help="Begin date format YYYY-MM-DD") 20 | parser.add_argument("--end_date", type=str, default='2020-04-30', help="End date format YYYY-MM-DD") 21 | 22 | args = parser.parse_args() 23 | 24 | begin_date = convert_date(args.begin_date) 25 | end_date = convert_date(args.end_date) 26 | 27 | # Read config 28 | MONGO_ARGS = config['MONGO_ARGS'] 29 | DB_NAME = config['DB']['DB_NAME'] 30 | COLLECTION_NAME = config['DB']['COLLECTION_NAME'] 31 | OUTLETS = config['MODEL']['OUTLETS'] 32 | 33 | with MongoClient(**MONGO_ARGS) as connection: 34 | collection = connection[DB_NAME][COLLECTION_NAME] 35 | articles = collection.aggregate([ 36 | {"$match": { 37 | "outlet": {"$in": OUTLETS}, 38 | "publishedAt": {"$gte": begin_date, "$lte": end_date} 39 | }}, 40 | {"$project": { 41 | '_id': {'$toString': '$_id'}, 'url': 1, 'publishedAt': 1, 42 | 'outlet': 1, 'title': 1, 'body': 1, 43 | 'peopleFemaleCount': 1, 'peopleMaleCount': 1, 44 | 'sourcesFemaleCount': 1, 'sourcesMaleCount': 1}} 45 | ]) 46 | 47 | spark = SparkSession.builder.appName("Cleanup for GGT MongoDB Data Dump") \ 48 | .config("spark.shuffle.io.maxRetries", 20) \ 49 | .config("spark.shuffle.io.retryWait", "20s") \ 50 | .config("spark.buffer.pageSize", "2m") \ 51 | .config("spark.sql.shuffle.partitions", args.partitions) \ 52 | .getOrCreate() 53 | 54 | # Specify timezone as UTC to match with raw data on MongoDB! 55 | spark.conf.set("spark.sql.session.timeZone", "UTC") 56 | df_articles = spark.createDataFrame(list(articles)) 57 | num_articles = df_articles.count() 58 | dtypes = df_articles.dtypes 59 | 60 | print("\n\n***\nObtained {} articles after filtering".format(num_articles)) 61 | print("\n\n***\nThe below columns are output to new Parquet files:\n{}".format(dtypes)) 62 | print("\n\n***\nEarliest timestamp article in data: {}\nLatest timestamp article in data: {}\n".format(begin_date, end_date)) 63 | 64 | df_articles.show() 65 | spark.stop() 66 | -------------------------------------------------------------------------------- /nlp/english/topic_model/preproc_cc.py: -------------------------------------------------------------------------------- 1 | """ 2 | Prepare Data for Topic Modelling: 3 | 4 | Since the raw dump from MongoDB has data in an undesirable format, 5 | we clean it up and filter the relevant subset for our needs in topic modelling. 6 | """ 7 | import argparse 8 | import os 9 | from pyspark.sql import SparkSession 10 | import pyspark.sql.functions as f 11 | import pyspark.sql.types as t 12 | from config import config 13 | 14 | # root_dit = "./" 15 | root_dir = "/home/pprao/projects/ctb-popowich/ggt" 16 | dataloc = os.path.join(root_dir, '21-04-2020-ggt.parquet') 17 | 18 | 19 | @f.udf(t.StringType()) 20 | def get_ids(_id): 21 | return _id[0] 22 | 23 | 24 | def filter_raw_data(df): 25 | """Extract only relevant columns of data we require for topic modelling. 26 | NOTE: The unix timestamp from MongoDB is divided by 1000 here because of the 27 | extra 3 zeros at the end (we don't need milliseconds). 
28 | """ 29 | dataDF = df.select('_id', 'publishedAt', 'outlet', 'url', 'title', 'body', 'peopleFemaleCount', 30 | 'peopleMaleCount', 'sourcesFemaleCount', 'sourcesMaleCount') \ 31 | .withColumn('id', get_ids(f.col('_id'))) \ 32 | .withColumn('unix_timestamp', f.get_json_object(df.publishedAt, "$.$date") / 1000) \ 33 | .withColumn('string_timestamp', f.from_unixtime(f.col('unix_timestamp'))) \ 34 | .withColumn('timestamp', f.col('string_timestamp').cast(t.TimestampType())) \ 35 | .drop('_id', 'publishedAt', 'unix_timestamp', 'string_timestamp') 36 | return dataDF 37 | 38 | 39 | def get_english_by_timestamp(df): 40 | """Extract English articles only within the given date range""" 41 | englishArticleDF = df.where(f.col('outlet').isin(OUTLETS)) 42 | # Use timestamps for the first and last minute of the start/end days respectively 43 | start = "{} 00:00:00".format(begin_date) 44 | end = "{} 23:59:59".format(end_date) 45 | filteredDF = englishArticleDF.filter(f.col("timestamp") > f.unix_timestamp( 46 | f.lit(start)).cast('timestamp')) \ 47 | .filter(f.col("timestamp") < f.unix_timestamp( 48 | f.lit(end)).cast('timestamp')) 49 | return filteredDF 50 | 51 | 52 | def get_articles_with_sources(df): 53 | """Ignore articles for which the `sourcesFemaleCount` and `sourcesMaleCount` fields are 54 | null (this means that the full NLP pipeline wasn't run on these articles). 55 | Zero sources in the article are possible, and these are not filtered out. 56 | """ 57 | sourcesDF = df.filter('sourcesFemaleCount is not NULL and sourcesMaleCount is not NULL') 58 | return sourcesDF 59 | 60 | 61 | def get_date_range(df, colname='timestamp'): 62 | """Sanity check to verify that the minimum and maximum dates make sense 63 | (after running the filtering and cleanup steps). 64 | """ 65 | min_date = f.date_format(f.min(colname), 'YYYY-MM-dd HH:mm:ss') 66 | max_date = f.date_format(f.max(colname), 'YYYY-MM-dd HH:mm:ss') 67 | min_date, max_date = df.select(min_date, max_date).first() 68 | print("Earliest timestamp in data: {}".format(min_date)) 69 | print("Latest timestamp in data: {}".format(max_date)) 70 | return min_date, max_date 71 | 72 | 73 | def write_output_parquet(df, output_dir): 74 | df.write.mode('overwrite').parquet(output_dir) 75 | 76 | 77 | def make_dir(dirpath): 78 | if not os.path.exists(dirpath): 79 | os.makedirs(dirpath) 80 | 81 | 82 | def run_cleanup(): 83 | df = spark.read.parquet(dataloc) 84 | dataDF = filter_raw_data(df) 85 | filteredDF = get_english_by_timestamp(dataDF) 86 | sourcesDF = get_articles_with_sources(filteredDF) 87 | sourcesReordered = sourcesDF.select('id', 'timestamp', 'outlet', 'url', 'title', 'body', 88 | 'peopleFemaleCount', 'peopleMaleCount', 89 | 'sourcesFemaleCount', 'sourcesMaleCount', 90 | ) 91 | return sourcesReordered 92 | 93 | 94 | if __name__ == "__main__": 95 | parser = argparse.ArgumentParser() 96 | parser.add_argument("--partitions", type=int, default=200, help="Number of shuffle partitions in PySpark") 97 | parser.add_argument("--begin_date", type=str, default='2018-10-01', help="Begin date format YYYY-MM-DD") 98 | parser.add_argument("--end_date", type=str, default='2020-04-20', help="End date format YYYY-MM-DD") 99 | args = parser.parse_args() 100 | 101 | # Parse arge 102 | begin_date = args.begin_date 103 | end_date = args.end_date 104 | 105 | # Read config 106 | OUTLETS = config['MODEL']['OUTLETS'] 107 | 108 | spark = SparkSession.builder.appName("Cleanup for GGT MongoDB Data Dump") \ 109 | .config("spark.shuffle.io.maxRetries", 20) \ 110 | 
.config("spark.shuffle.io.retryWait", "20s") \ 111 | .config("spark.buffer.pageSize", "2m") \ 112 | .config("spark.sql.shuffle.partitions", args.partitions) \ 113 | .getOrCreate() 114 | # Specify timezone as UTC to match with raw data on MongoDB! 115 | spark.conf.set("spark.sql.session.timeZone", "UTC") 116 | # Create output directory 117 | output_dir = "{}/ggt_english_{}_{}".format(root_dir, begin_date, end_date) 118 | make_dir(output_dir) 119 | 120 | existSourcesDF = run_cleanup() 121 | num_articles = existSourcesDF.count() 122 | dtypes = existSourcesDF.dtypes 123 | # Show minimum and maximum timestamps in the filtered data 124 | min_date, max_date = get_date_range(existSourcesDF, 'timestamp') 125 | # Write data to output directory 126 | write_output_parquet(existSourcesDF, output_dir) 127 | 128 | print("\n\n***\nObtained {} articles after filtering".format(num_articles)) 129 | print("\n\n***\nThe below columns are output to new Parquet files:\n{}".format(dtypes)) 130 | print("\n\n***\nEarliest timestamp article in data: {}\nLatest timestamp article in data: {}\n".format(min_date, max_date)) 131 | 132 | spark.stop() 133 | -------------------------------------------------------------------------------- /nlp/english/topic_model/requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib==3.3.4 2 | wordcloud==1.8.1 3 | pandas==1.1.5 4 | py4j==0.10.7 5 | pymongo==3.11.3 6 | pyspark==2.4.5 7 | scipy==1.10.0 8 | seaborn==0.11.1 9 | tqdm==4.59.0 10 | -------------------------------------------------------------------------------- /nlp/english/topic_model/spacyLemmas/README.md: -------------------------------------------------------------------------------- 1 | # Lemmatization workflow 2 | We lemmatize all terms in each document prior to topic modelling in Spark. In our initial experiments, we observed that the lemmatizer used by Spark NLP (the third party used for lemmatization in Spark) was not of the requisite quality for our purposes. As a result, we chose to use spaCy's [lemma lookup data available on GitHub](https://github.com/explosion/spacy-lookups-data/tree/master/spacy_lookups_data/data). 3 | 4 | ## Formatting 5 | The lemmas from spaCy's lookup data are available as JSON, specified as `{lemma: [word1, word2, ...]}` where each key is the lemma, and the value is a list of words that share that lemma . In addition, a lot of the lemma keys contain extraneous symbols and punctuation, which we know are cleaned in an upstream step in our topic modelling pipeline. As a result, we don't need to include such entries with symbols and punctuation, because they will never be looked up in our topic model pipeline. 6 | 7 | Spark NLP expects lemmas to be in the following format -- note that it uses space-separated words in a flat file format (no JSON). 8 | 9 | ``` 10 | colony -> colony colonies 11 | colonisation -> colonisation colonisations 12 | colonise -> colonise colonised colonises colonising 13 | coloniser -> coloniser colonisers 14 | colonist -> colonist colonists 15 | colonization -> colonization colonizations 16 | colonize -> colonize colonized colonizes colonizing 17 | colonizer -> colonizer colonizers 18 | ``` 19 | 20 | When we load in the lemma lookup table as shown above to Spark, we can specify the separator symbol (`-->`), that indicates to Spark that the lemma is on the left and the words that share that lemma are on the right of that separator. 
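As a concrete (hypothetical) sketch of that loading step, the snippet below assumes Spark NLP's `Lemmatizer` annotator and its `setDictionary(path, key_delimiter, value_delimiter)` arguments; the exact call used in our training scripts may differ.

```python
# Hypothetical sketch: load the flat lemma file into a Spark NLP Lemmatizer.
# Assumes spark-nlp is installed and an upstream Tokenizer already outputs a "token" column.
from sparknlp.annotator import Lemmatizer

lemmatizer = (
    Lemmatizer()
    .setInputCols(["token"])
    .setOutputCol("lemma")
    # Delimiters match the file format above: "lemma -> word1 word2 ..."
    .setDictionary("spacyLemmas/spacy_english_lemmas.txt", "->", " ")
)
```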
21 | 22 | ## Preparing the lemma lookup file 23 | In our experiments over many months of real-world data, we observed that certain words like "data", which occur very regularly in the news, were reduced to "datum" during lemmatization. This is not ideal when interpreting topic keywords to label the topics. As a result, we define a "ban list" of lemmas in the file `convert_spacy_lemmas.py`, currently consisting of just the lemma "datum". By specifying this list, we exclude these lemmas from the generated lookup file for Spark, so that when the model encounters words like "data", they are not reduced to their lemma form (the word is kept as "data"). 24 | 25 | The steps for generating an up-to-date lemma lookup table for Spark via spaCy are as follows. 26 | 27 | 1. In `convert_spacy_lemmas.py`, define a ban list of lemmas that shouldn't be considered during lemmatization. Words that have one of these as a lemma in spaCy's English lemma lookup are not lemmatized as a result. 28 | 2. Run the file `convert_spacy_lemmas.py` (a one-time process each time we want to update the lemma list) -- this downloads the latest English lemma lookup JSON from spaCy's GitHub repo, formats it, and removes the unnecessary lemmas defined in the script. 29 | 3. Commit both `convert_spacy_lemmas.py` and the generated text file `spacy_english_lemmas.txt` to GitHub. Pull the latest code on the topic modelling VM to ensure that the latest lemma list is in use for our monthly pipeline. -------------------------------------------------------------------------------- /nlp/english/topic_model/spacyLemmas/convert_spacy_lemmas.py: -------------------------------------------------------------------------------- 1 | """ 2 | Convert the most recent spaCy lemma dictionary to a format that can be read 3 | by Spark-NLP.
4 | """ 5 | import json 6 | from urllib.request import urlopen 7 | 8 | 9 | def get_ban_list(): 10 | """List of lemmas that we don't want from spaCy's default lookup list""" 11 | banned_lemmas = ["datum"] 12 | return banned_lemmas 13 | 14 | 15 | def get_spacy_lemmas(): 16 | """Download most recent spaCy lemma dictionary from their GitHub repo.""" 17 | spacy_lemma_url = "https://raw.githubusercontent.com/explosion/spacy-lookups-data/master/spacy_lookups_data/data/en_lemma_lookup.json" 18 | with urlopen(spacy_lemma_url) as response: 19 | lemmas = response.read() 20 | return json.loads(lemmas) 21 | 22 | 23 | def get_same_value_keys(spacy_lemmas): 24 | """Map all duplicate values in the lemma dict to the key that they point to.""" 25 | same_value_keys = {} 26 | for key, value in spacy_lemmas.items(): 27 | if value not in same_value_keys: 28 | same_value_keys[value] = [key] 29 | else: 30 | same_value_keys[value].append(key) 31 | return same_value_keys 32 | 33 | 34 | def write_sparknlp_lemmas(spacy_lemmas): 35 | """Write out the lemmas as per Spark NLP's format: 36 | https://stackoverflow.com/a/57873365/1194761 37 | """ 38 | ban_list = get_ban_list() 39 | same_value_keys = get_same_value_keys(spacy_lemmas) 40 | with open('spacy_english_lemmas.txt', "w") as f: 41 | for key, values in same_value_keys.items(): 42 | print(key, " -->", values) 43 | if key in ban_list: 44 | # Prevent lemmas that we banned from being included in the output lemma list for Spark 45 | pass 46 | else: 47 | # Only output values without special characters 48 | alphabet_values = [val.lower() for val in values if val.isalpha()] 49 | if key.isalpha(): 50 | f.write("{0} -> {0} {1}\n".format(key.lower(), ' '.join(list(alphabet_values)))) 51 | 52 | 53 | def main(): 54 | spacy_lemmas = get_spacy_lemmas() 55 | write_sparknlp_lemmas(spacy_lemmas) 56 | 57 | 58 | if __name__ == "__main__": 59 | main() 60 | -------------------------------------------------------------------------------- /nlp/english/topic_model/stopwords/README.md: -------------------------------------------------------------------------------- 1 | # Stopwords for Topic Modelling 2 | Choosing the right stopwords for topic modelling is an iterative process [[1]](https://databricks.com/blog/2015/09/22/large-scale-topic-modeling-improvements-to-lda-on-apache-spark.html). Based on the news outlet vocabulary in our corpus, certain common verbs can hinder the interpretation of topics. Most nouns, however, are useful for interpreting topics as they offer some context to the news categories being covered. 3 | 4 | The below lists of words are combined to produce the overall stopword list used in topic modelling. 5 | 6 | ## NLTK (curated) 7 | From past projects at the discourse processing lab, the default NLTK stopword list was curated and some additional common charactes/symbols/digits added to this list. This list of words is in the file `nltk_curated.txt`. 8 | 9 | ## Light verbs 10 | These are [verbs with little semantic content of their own](https://en.wikipedia.org/wiki/Light_verb), such as *do, give, make, take*. The list of light verbs relevant to the GGT news corpus is extended and customized (with some trial and error based on intermediate topic model results) and added to the file `create_stopword_list.py`. 11 | 12 | **NOTE**: In the Python file, just the verb roots are specified manually. The full list of verbs (in present/past tenses) is obtained by looking up each lemma's alternate forms from spaCy's lemma dictionary. 
13 | 14 | ## Custom words 15 | Initially, an additional list of "general" nouns, or [signalling nouns](https://books.google.ca/books/about/Signalling_Nouns_in_Academic_English.html?id=3f-XoAEACAAJ&redir_esc=y), or [shell nouns](https://www.aclweb.org/anthology/W13-2314/) was considered. These include certain abstract nouns like "problem", "fact" or "result" - i.e. nouns with non-specific meaning when considered in isolation. It was found that most of these nouns are actually very useful in interpreting topics, which in itself is a task where words (especially nouns) are looked at in isolation. 16 | 17 | As a result, general/signalling/shell nouns are **not** used in this task. 18 | 19 | However, based on the initial topic modelling experiments run, a separate list of custom words that hinder topic interpretability were created manually. The below words were included in the file `create_stopword_list.py`. 20 | 21 | * **Social media-related**: *post, sign, like, love, tag, star, call, group, video, photo, pic, inbox* 22 | * **URL and embed terms**: *http, https, href, ref, com, cbc, ctv, src, twsrc, 5etfw* 23 | * **Frequently occurring common nouns**: *people, man, woman, life, family, friend, news, report, press, page, story* 24 | * **Time of the day/week**: *morning, afternoon, evening, today, yesterday, tomorrow* 25 | * **Time periods**: *day, week, month, year* 26 | * **Time zones**: *edt, pst* 27 | * **Day of the week**: *monday, tuesday, wednesday, thursday, friday, saturday, sunday* 28 | * **Months of the year**: *january, february, march, ..., october, november, december* 29 | * **Year**: *2018, 2019, 2020, 2021* 30 | 31 | ## Generate a final list of stopwords 32 | The included Python file is run as follows. 33 | ``` 34 | python3 create_stopword_list.py 35 | ``` 36 | 37 | This concatenates words from the above lists into a single, de-duplicated set and sorts them in alphabetical order, producing a final stopword file `stopwords.txt`. 38 | 39 | ## References 40 | [1] [Large Scale Topic Modeling: Improvements to LDA on Apache Spark](https://databricks.com/blog/2015/09/22/large-scale-topic-modeling-improvements-to-lda-on-apache-spark.html) -------------------------------------------------------------------------------- /nlp/english/topic_model/stopwords/create_stopword_list.py: -------------------------------------------------------------------------------- 1 | """ 2 | Script to generate a custom list of stopwords that extend upon existing word lists. 
3 | """ 4 | import json 5 | from urllib.request import urlopen 6 | from itertools import chain 7 | 8 | 9 | def combine(*lists): 10 | "Combine an arbitrary number of lists into a single list" 11 | return list(chain(*lists)) 12 | 13 | 14 | def get_spacy_lemmas(): 15 | "Read in spaCy lemma dict from the raw GitHub source" 16 | spacy_lemma_url = "https://raw.githubusercontent.com/explosion/spacy-lookups-data/master/spacy_lookups_data/data/en_lemma_lookup.json" 17 | with urlopen(spacy_lemma_url) as response: 18 | lemmas = response.read() 19 | return json.loads(lemmas) 20 | 21 | 22 | def get_words(filename): 23 | "Read in a list of words from a stopword list" 24 | words = [] 25 | with open(filename) as f: 26 | for word in f: 27 | words.append(word.strip()) 28 | return words 29 | 30 | 31 | def lookup_verbs(roots, spacy_lemmas): 32 | """Return a full of list light verbs and all its forms (present, past tense, etc.)""" 33 | 34 | def flatten(list_of_lists): 35 | "Return a flattened list of a list of lists" 36 | return [item for sublist in list_of_lists for item in sublist] 37 | 38 | verblist = [] 39 | for root in roots: 40 | verbs = [key for key in spacy_lemmas if spacy_lemmas[key] == root] 41 | verbs.append(root) 42 | verblist.append(verbs) 43 | return flatten(verblist) 44 | 45 | 46 | if __name__ == "__main__": 47 | # We first get the NLTK curated word list 48 | nltk_stopwords = set(get_words('nltk_curated.txt')) 49 | # Obtain spaCy lemma dictionary for retrieving light verb full forms 50 | spacy_lemmas = get_spacy_lemmas() 51 | 52 | # Create custom word lists depending on the class of words seen in the data 53 | url_terms = ['href', 'http', 'https', 'src', 'twsrc', '5etfw', 'ref', 'com', 'cbc', 54 | 'ctv', 'star', '5127en', 'httpstco', 'www'] 55 | # Don't take 'wed', 'sat' and 'sun' because they are also normal words 56 | days_of_the_week = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 57 | 'saturday', 'sunday', 'mon', 'tue', 'thu', 'fri'] 58 | months_of_the_year = ['january', 'february', 'march', 'april', 'may', 'june', 'july', 59 | 'august', 'september', 'october', 'november', 'december', 'jan', 60 | 'feb', 'mar', 'apr', 'jun', 'jul', 'aug', 'sep', 'sept', 'oct', 61 | 'nov', 'dec'] 62 | years = ["2018", "2019", "2020", "2021", "2022", "2023"] 63 | time_periods = ['minute', 'minutes', 'hour', 'hours', 'day', 'days', 'week', 'weeks', 64 | 'month', 'months', 'year', 'years'] 65 | time_related = ['yesterday', 'today', 'tomorrow', 'day', 'night', 'morning', 66 | 'afternoon', 'evening', 'edt', 'est', 'pst', 'pdt', 'time', 'times'] 67 | common_words = ['press', 'news', 'report', 'page', 'user', 'reuters', 'email', 'browser', 68 | 'file', 'files', 'video', 'pic', 'photo', 'online', 'social', 'media', 'article', 69 | 'group', 'inbox', 'item', 'advertisement', 'world', 'store', 'story', 70 | 'life', 'family', 'people', 'man', 'woman', 'friend', 'friends'] 71 | social_media = ['twitter', 'facebook', 'google', 'gmail', 'linkedin', 'pinterest', 'getty', 72 | 'video', 'photo', 'image', 'images', 'user', 'social', 'media', 'newsletter', 73 | 'subscribe', 'page', 'online', 'live', 'stream', 'post', 'app', 'postmedia', 74 | 'apnews'] 75 | light_verb_roots = ['call', 'click', 'continue', 'comment', 'do', 'feel', 'find', 76 | 'give', 'get', 'have', 'include', 'like', 'live', 'love', 'make', 77 | 'post', 'read', 'say', 'speak', 'send', 'share', 'show', 'sign', 78 | 'tag', 'take', 'tell', 'think', 'update', 'work', 'write', 'join', 79 | 'view', 'load'] 80 | 81 | # Convert light verb roots to all its forms 
using lemma lookup 82 | light_verbs_full = lookup_verbs(light_verb_roots, spacy_lemmas) 83 | 84 | # Combine into a single list of stopwords 85 | add_stopwords = set( 86 | combine( 87 | nltk_stopwords, url_terms, days_of_the_week, months_of_the_year, years, 88 | time_periods, time_related, common_words, social_media, light_verbs_full 89 | ) 90 | ) 91 | 92 | # Combine all stopwords into one list and export to text file 93 | combined_stopwords = nltk_stopwords.union(add_stopwords) 94 | stopword_list = sorted(list(combined_stopwords)) 95 | # Write out stopwords to file 96 | with open('stopwords.txt', 'w') as f: 97 | for word in stopword_list: 98 | f.write(word + '\n') 99 | 100 | print(f"Exported {len(stopword_list)} words to stopword list.") 101 | -------------------------------------------------------------------------------- /nlp/english/topic_model/stopwords/nltk_curated.txt: -------------------------------------------------------------------------------- 1 | - 2 | 0 3 | 000 4 | 1 5 | 2 6 | 3 7 | 4 8 | 5 9 | 6 10 | 7 11 | 8 12 | 9 13 | a 14 | a's 15 | able 16 | about 17 | above 18 | according 19 | accordingly 20 | across 21 | actually 22 | after 23 | afterwards 24 | again 25 | against 26 | ain 27 | ain't 28 | all 29 | allow 30 | allows 31 | almost 32 | alone 33 | along 34 | already 35 | also 36 | although 37 | always 38 | am 39 | among 40 | amongst 41 | an 42 | and 43 | another 44 | any 45 | anybody 46 | anyhow 47 | anyone 48 | anything 49 | anyway 50 | anyways 51 | anywhere 52 | apart 53 | appear 54 | appreciate 55 | appropriate 56 | are 57 | area 58 | aren 59 | aren't 60 | around 61 | as 62 | aside 63 | ask 64 | asking 65 | associated 66 | at 67 | available 68 | away 69 | awfully 70 | b 71 | back 72 | be 73 | became 74 | because 75 | become 76 | becomes 77 | becoming 78 | been 79 | before 80 | beforehand 81 | behind 82 | being 83 | believe 84 | below 85 | beside 86 | besides 87 | best 88 | better 89 | between 90 | beyond 91 | big 92 | bit 93 | both 94 | brief 95 | but 96 | by 97 | c 98 | c'mon 99 | c's 100 | came 101 | can 102 | canada 103 | canadian 104 | can't 105 | cannot 106 | cant 107 | cause 108 | causes 109 | cent 110 | certain 111 | certainly 112 | changes 113 | city 114 | clearly 115 | co 116 | com 117 | come 118 | comes 119 | concerning 120 | consequently 121 | consider 122 | considering 123 | contain 124 | containing 125 | contains 126 | continue 127 | corresponding 128 | could 129 | couldn 130 | couldn't 131 | course 132 | currently 133 | d 134 | day 135 | definitely 136 | described 137 | despite 138 | did 139 | didn 140 | didn't 141 | different 142 | do 143 | does 144 | doesn 145 | doesn't 146 | doing 147 | don 148 | don't 149 | done 150 | down 151 | downwards 152 | during 153 | e 154 | each 155 | edu 156 | eg 157 | eight 158 | either 159 | else 160 | elsewhere 161 | end 162 | enough 163 | entirely 164 | especially 165 | et 166 | etc 167 | even 168 | ever 169 | every 170 | everybody 171 | everyone 172 | everything 173 | everywhere 174 | ex 175 | exactly 176 | example 177 | except 178 | f 179 | far 180 | feel 181 | few 182 | fifth 183 | find 184 | first 185 | five 186 | followed 187 | following 188 | follows 189 | for 190 | former 191 | formerly 192 | forth 193 | four 194 | from 195 | further 196 | furthermore 197 | g 198 | get 199 | gets 200 | getting 201 | given 202 | gives 203 | go 204 | goes 205 | going 206 | gone 207 | got 208 | gotten 209 | greetings 210 | h 211 | had 212 | hadn 213 | hadn't 214 | happens 215 | hardly 216 | has 217 | hasn 218 | hasn't 219 | have 220 | 
haven 221 | haven't 222 | having 223 | he 224 | he's 225 | he'se 226 | hello 227 | help 228 | hence 229 | her 230 | here 231 | here's 232 | hereafter 233 | hereby 234 | herein 235 | hereupon 236 | hers 237 | herself 238 | hi 239 | him 240 | himself 241 | his 242 | hither 243 | hopefully 244 | how 245 | howbeit 246 | however 247 | i 248 | i'd 249 | i'll 250 | i'm 251 | i've 252 | ie 253 | if 254 | ignored 255 | immediate 256 | in 257 | inasmuch 258 | inc 259 | indeed 260 | indicate 261 | indicated 262 | indicates 263 | inner 264 | insofar 265 | instead 266 | into 267 | inward 268 | is 269 | isn 270 | isn't 271 | it 272 | it'd 273 | it'll 274 | it's 275 | it, 276 | its 277 | itself 278 | j 279 | just 280 | k 281 | keep 282 | keeps 283 | kept 284 | know 285 | known 286 | knows 287 | l 288 | last 289 | lately 290 | later 291 | latter 292 | latterly 293 | least 294 | less 295 | lest 296 | let 297 | let's 298 | like 299 | liked 300 | likely 301 | little 302 | ll 303 | look 304 | looking 305 | looks 306 | lot 307 | ltd 308 | m 309 | ma 310 | made 311 | mainly 312 | man 313 | many 314 | may 315 | maybe 316 | me 317 | mean 318 | meanwhile 319 | merely 320 | might 321 | mightn 322 | mightn't 323 | more 324 | moreover 325 | most 326 | mostly 327 | much 328 | must 329 | mustn 330 | mustn't 331 | my 332 | myself 333 | n 334 | name 335 | namely 336 | nd 337 | near 338 | nearly 339 | necessary 340 | need 341 | needn 342 | needn't 343 | needs 344 | neither 345 | never 346 | nevertheless 347 | new 348 | next 349 | nine 350 | no 351 | nobody 352 | non 353 | none 354 | noone 355 | nor 356 | normally 357 | not 358 | nothing 359 | novel 360 | now 361 | nowhere 362 | o 363 | obviously 364 | of 365 | off 366 | often 367 | oh 368 | ok 369 | okay 370 | old 371 | on 372 | once 373 | one 374 | ones 375 | only 376 | onto 377 | or 378 | other 379 | others 380 | otherwise 381 | ought 382 | our 383 | ours 384 | ourselves 385 | out 386 | outside 387 | over 388 | overall 389 | own 390 | p 391 | particular 392 | particularly 393 | per 394 | perhaps 395 | place 396 | placed 397 | play 398 | please 399 | plus 400 | possible 401 | presumably 402 | probably 403 | provides 404 | put 405 | q 406 | que 407 | quite 408 | qv 409 | r 410 | rather 411 | rd 412 | re 413 | really 414 | reasonably 415 | regarding 416 | regardless 417 | regards 418 | relatively 419 | respectively 420 | right 421 | s 422 | said 423 | same 424 | saw 425 | say 426 | saying 427 | says 428 | second 429 | secondly 430 | see 431 | seeing 432 | seem 433 | seemed 434 | seeming 435 | seems 436 | seen 437 | self 438 | selves 439 | sensible 440 | sent 441 | serious 442 | seriously 443 | service 444 | seven 445 | several 446 | shall 447 | shan 448 | shan't 449 | she 450 | she's 451 | should 452 | should've 453 | shouldn 454 | shouldn't 455 | since 456 | six 457 | so 458 | some 459 | somebody 460 | somehow 461 | someone 462 | something 463 | sometime 464 | sometimes 465 | somewhat 466 | somewhere 467 | soon 468 | sorry 469 | specified 470 | specify 471 | specifying 472 | start 473 | still 474 | sub 475 | such 476 | sup 477 | sure 478 | t 479 | t's 480 | take 481 | taken 482 | tell 483 | tends 484 | th 485 | than 486 | thank 487 | thanks 488 | thanx 489 | that 490 | that'll 491 | that's 492 | thats 493 | the 494 | their 495 | theirs 496 | them 497 | themselves 498 | then 499 | thence 500 | there 501 | there's 502 | thereafter 503 | thereby 504 | therefore 505 | therein 506 | theres 507 | thereupon 508 | these 509 | they 510 | they'd 511 | they'll 512 | they're 513 | 
they've 514 | things 515 | think 516 | third 517 | this 518 | thorough 519 | thoroughly 520 | those 521 | though 522 | three 523 | through 524 | throughout 525 | thru 526 | thus 527 | to 528 | together 529 | too 530 | took 531 | toward 532 | towards 533 | tried 534 | tries 535 | truly 536 | try 537 | trying 538 | twice 539 | two 540 | u 541 | un 542 | under 543 | unfortunately 544 | unless 545 | unlikely 546 | until 547 | unto 548 | up 549 | upon 550 | us 551 | use 552 | used 553 | useful 554 | uses 555 | using 556 | usually 557 | uucp 558 | v 559 | value 560 | various 561 | ve 562 | very 563 | via 564 | viz 565 | vs 566 | w 567 | want 568 | wants 569 | was 570 | wasn 571 | wasn't 572 | way 573 | we 574 | we'd 575 | we'll 576 | we're 577 | we've 578 | welcome 579 | well 580 | went 581 | were 582 | weren 583 | weren't 584 | what 585 | what's 586 | whatever 587 | when 588 | whence 589 | whenever 590 | where 591 | where's 592 | whereafter 593 | whereas 594 | whereby 595 | wherein 596 | whereupon 597 | wherever 598 | whether 599 | which 600 | while 601 | whither 602 | who 603 | who's 604 | whoever 605 | whole 606 | whom 607 | whose 608 | why 609 | will 610 | willing 611 | wish 612 | with 613 | within 614 | without 615 | won 616 | won't 617 | wonder 618 | working 619 | would 620 | wouldn 621 | wouldn't 622 | x 623 | y 624 | years 625 | yes 626 | yet 627 | you 628 | you'd 629 | you'll 630 | you're 631 | you've 632 | your 633 | yours 634 | yourself 635 | yourselves 636 | z 637 | zero 638 | -------------------------------------------------------------------------------- /nlp/french/README.md: -------------------------------------------------------------------------------- 1 | # French NLP pipeline 2 | ## Set up environment 3 | The French NLP pipeline uses a third party coreference resolution library named [coreferee](https://github.com/explosion/coreferee), which requires the use of Python 3.9. It is assumed that Python 3.9 exists on the system on which the French NLP code runs. 4 | 5 | Make sure that `gcc`, `build-essential` and `python3.9-devel` (on Red Hat/CentOS), or `python3.9-dev` (on ubuntu) are installed on the system. Also, install `python3.9-venv` for managing virtual environments, and ensure `wheel` is installed prior to installing the dependencies (as shown below) 6 | 7 | ```sh 8 | python3.9 - venv GRIM-FR 9 | ``` 10 | 11 | Activate the environment and install the dependencies: 12 | 13 | ``` 14 | source GRIM-FR/bin/activate 15 | python3.9 -m pip install -U pip wheel # Upgrade pip and install the wheel package first 16 | python3.9 -m pip install -r requirements.txt 17 | ``` 18 | 19 | ## Quote extractor 20 | Extract `quotes` from the database or from local files. Save the output locally, or update the database directly. 21 | 22 | ### Default mode 23 | By default, the quote extractor only works on articles that weren't processed earlier (i.e., new articles that are freshly scraped with `lastModifier = mediaCollectors`). 24 | 25 | ```sh 26 | python3.9 quote_extractor.py --db mediaTracker --readcol media --limit 0 27 | ``` 28 | `--limit 0` (which is the default setting) means no limitation, and the script runs on all documents in the database. 29 | 30 | ### Force update 31 | To force-update the results and overwrite existing data for all articles, use the `--force_update` argument. 
32 | ```sh 33 | python3.9 quote_extractor.py --db mediaTracker --readcol media --force_update --limit 10 34 | ``` 35 | `--limit 10` means that the script will process just 10 documents, which is useful during testing. 36 | 37 | ### Specify time period 38 | We can easily limit the quote extraction process to only articles from a specified time period. 39 | 40 | ```sh 41 | python3.9 quote_extractor.py --db mediaTracker --readcol media --force_update --begin_date 2021-12-01 --end_date 2021-12-31 42 | ``` 43 | 44 | For the full list of optional arguments, type the following: 45 | 46 | ```sh 47 | python3.9 quote_extractor.py --help 48 | ``` 49 | ## Quote highlighter 50 | Take an input text, a set of corresponding predicted `quotes` (usually output data from the quote extractor) and optionally target `quotes` to compare against and output HTML files highlighting the quotes and speakers in the text. 51 | 52 | Example commands: 53 | ``` 54 | python3.9 quote_highlighter.py --text-base=./input/ --prediction-base=./predictions/ --no-target --html-base=./html/ 55 | ``` 56 | Optional arguments: 57 | ``` 58 | -h, --help show this help message and exit 59 | --text-base TEXT_BASE 60 | Where the text which the quotes were extracted from is stored. 61 | --html-base HTML_BASE 62 | Where to store the output HTML. 63 | --target-base TARGET_BASE 64 | Where the (annotated) target quotes are stored. 65 | --prediction-base PREDICTION_BASE 66 | Where the predicted quotes are stored. 67 | --no-target, -n Don't highlight target quotes/speakers 68 | ``` 69 | 70 | --- 71 | ## Entity gender annotator 72 | 73 | Once the quotes have been extracted and written to the DB, we can then run the entity gender annotation script. This script utilizes the quotes (stored as a list) from each article, performs NER on them, and then merges the extracted named entities with the speakers of the quotes. In addition, we also perform quote merging to match the merged named entities to the speaker of a quote, wherever possible. 74 | 75 | ### Default mode 76 | Just like the quote extractor, the entity gender annotator by default only works on articles that weren't processed earlier (i.e.,articles that were just processed by quote extractor, with `lastModifier = quote_extractor`). 77 | 78 | ```sh 79 | python3.9 entity_gender_annotator.py --db mediaTracker --readcol media 80 | ``` 81 | 82 | ### Force update 83 | 84 | To force-update the results and overwrite existing data for all articles, use the `--force_update` argument. 85 | 86 | ```sh 87 | python3.9 entity_gender_annotator.py --db mediaTracker --readcol media --force_update 88 | ``` 89 | 90 | ### Specify write collection 91 | **It is strongly recommended** to use the `--writecol` argument when running the script on a large collection. This is so that even if the NLP operations take many days to run, the database statistics will not pick up partially completed results, and we can then run the `merge_collections.py` script to move the NLP results from the `newmedia` to the `media` collection. 92 | 93 | ```sh 94 | python3.9 entity_gender_annotator.py --force_update --db mediaTracker --readcol media --writecol newmedia 95 | ``` 96 | 97 | 98 | ### Specify time period 99 | We can easily limit the quote extraction process to only articles from a specified time period. 
100 | 101 | ```sh 102 | python3.9 entity_gender_annotator.py --db mediaTracker --readcol media --force_update --begin_date 2020-01-01 --end_date 2020-01-31 103 | ``` 104 | 105 | For further help options, type the following: 106 | 107 | ```sh 108 | python3.9 entity_gender_annotator.py --help 109 | ``` 110 | 111 | ## Note on multiprocessing 112 | As of spaCy 3.2.x and coreferee 1.3.1, multiprocessing is **not** supported (due to the inability of coreferee to share data between forked processes). As a result, we are unable to speed up the performance of the French entity gender annotator by dividing the computation across separate processes -- **this might change in a future version** when there are updates to the coreference algorithm within base spaCy. -------------------------------------------------------------------------------- /nlp/french/config.py: -------------------------------------------------------------------------------- 1 | host = ["mongo0", "mongo1", "mongo2"] 2 | # host = "localhost" 3 | prefix = "." if (host == "localhost") else "/path_to_code/GenderGapTracker/nlp/french" 4 | 5 | config = { 6 | "MONGO_ARGS": { 7 | "host": host, 8 | "port": 27017, 9 | "authSource": "admin", 10 | "readPreference": "primaryPreferred", 11 | "username": "username", 12 | "password": "password", 13 | }, 14 | "GENDER_RECOGNITION": { 15 | "GENDERIZE_ENABLED": False, 16 | "GENDERAPI_ENABLED": True, 17 | "GENDERAPI_TOKEN": "JSON_AUTH_TOKEN", 18 | "MANUAL_CACHE": "manual", 19 | "GENDERAPI_CACHE": "genderAPICleaned", 20 | "GENDERIZE_CACHE": "genderizeCleaned", 21 | "FIRSTNAME_CACHE": "firstNamesCleaned", 22 | }, 23 | "NLP": { 24 | "MAX_BODY_LENGTH": 20000, 25 | "QUOTE_VERBS": f"{prefix}/rules/quote_verb_list.txt", 26 | "AUTHOR_BLOCKLIST": f"{prefix}/rules/author_blocklist.txt", 27 | "NAME_PATTERNS": f"{prefix}/rules/name_patterns.jsonl", 28 | }, 29 | } 30 | -------------------------------------------------------------------------------- /nlp/french/data_statistics.py: -------------------------------------------------------------------------------- 1 | import os, json, re 2 | import argparse 3 | from ast import literal_eval 4 | 5 | import pandas as pd 6 | import Levenshtein as lev 7 | import spacy 8 | from spacy.language import Language 9 | from spacy.tokens import Doc, Span 10 | from coreferee.rules import RulesAnalyzerFactory 11 | from coreferee.data_model import Mention 12 | 13 | import utils 14 | 15 | 16 | def compute_statistics(text_dir, target_dir, output_file=None): 17 | files = utils.get_files_from_folder(text_dir) 18 | files_data = [] 19 | files_indexes = [] 20 | for i, doc_name in enumerate(files): 21 | # print(doc_name) 22 | text = utils.preprocess_text(files[doc_name]) 23 | json_file = target_dir + doc_name + ".json" 24 | if not os.path.exists(json_file): 25 | continue 26 | quote_objects = json.load(open(json_file, encoding="mac-roman")) 27 | file_data = get_file_stats(text, quote_objects) 28 | 29 | files_data.append(file_data) 30 | files_indexes.append(doc_name) 31 | # print(files_data) 32 | return process_results(files_data, files_indexes, output_file) 33 | 34 | 35 | def process_results(files_data, files_indexes, output_file): 36 | columns = [ 37 | "nouns", 38 | "proper nouns", 39 | "other nouns", 40 | "anaphora", 41 | "non-covered speakers", 42 | "speakerless quotes", 43 | "verbless quotes", 44 | "unknown speaker's gender", 45 | "referenceless quotes", 46 | "self-evident_references", 47 | "plural speaker", 48 | "total_quotes", 49 | ] 50 | df = pd.DataFrame(files_data, index=files_indexes, 
columns=columns) 51 | total = df.sum(numeric_only=True, axis=0) 52 | mean = df.mean(numeric_only=True, axis=0) 53 | median = df.median(numeric_only=True, axis=0) 54 | standard_deviation = df.std(numeric_only=True, axis=0) 55 | total_proportion = total / total["total_quotes"] 56 | 57 | df.loc["Mean"] = mean 58 | df.loc["Median"] = median 59 | df.loc["Standard_Deviation"] = standard_deviation 60 | df.loc["Total"] = total 61 | df.loc["Total_Proportion"] = total_proportion 62 | 63 | print(df) 64 | if output_file: 65 | df.to_csv(output_file, sep=";") 66 | return df 67 | 68 | 69 | def get_file_stats(text, quote_objects): 70 | doc = NLP(text) 71 | independent_nouns = ( 72 | substantives 73 | ) = ( 74 | proper_n 75 | ) = ( 76 | anaphora 77 | ) = ( 78 | uncovered_mention 79 | ) = ( 80 | speakerless 81 | ) = ( 82 | verbless 83 | ) = genderless = referenceless = evident_references = plural_speakers = quotes = 0 84 | 85 | for quote_object in quote_objects: 86 | speaker_index = quote_object["speaker_index"] 87 | speaker = quote_object["speaker"] 88 | reference = quote_object["reference"] 89 | speaker_gender = quote_object["speaker_gender"] 90 | verb = quote_object["verb"] 91 | if not verb: 92 | verbless += 1 93 | if speaker_gender == "unknown": 94 | genderless += 1 95 | if reference: 96 | pass 97 | else: 98 | referenceless += 1 99 | 100 | if speaker_index: 101 | start, end = literal_eval(speaker_index) 102 | speaker_span = doc.char_span(start, end, alignment_mode="expand") 103 | speaker_root = speaker_span.root 104 | is_mention = False 105 | if RULES_ANALYZER.is_independent_noun(speaker_root): 106 | is_mention = True 107 | independent_nouns += 1 108 | if speaker_root.pos_ == "PROPN": 109 | proper_n += 1 110 | else: 111 | substantives += 1 112 | elif RULES_ANALYZER.is_potential_anaphor(speaker_root): 113 | is_mention = True 114 | anaphora += 1 115 | else: 116 | infos_root = [ 117 | speaker_root, 118 | speaker_root.pos_, 119 | speaker_root.dep_, 120 | speaker_root.morph, 121 | ] 122 | print( 123 | "NOT COVERED :", 124 | speaker, 125 | (start, end, speaker_span.start, speaker_span.end), 126 | infos_root, 127 | ) 128 | uncovered_mention += 1 129 | if RULES_ANALYZER.is_independent_noun( 130 | speaker_root 131 | ) and RULES_ANALYZER.is_potential_anaphor(speaker_root): 132 | print( 133 | "DOUBLE", 134 | speaker, 135 | speaker_root, 136 | speaker_root.pos_, 137 | speaker_root.dep_, 138 | speaker_root.morph, 139 | sep="|", 140 | ) 141 | 142 | if reference and lev.distance(speaker.lower(), reference.lower()) <= 2: 143 | evident_references += 1 144 | 145 | masc, fem, sing, plur = RULES_ANALYZER.get_gender_number_info(speaker_root) 146 | siblings = RULES_ANALYZER.get_dependent_siblings(speaker_root) 147 | if is_mention and ( 148 | (plur and not sing) or (siblings and siblings[-1].idx <= end) 149 | ): 150 | # print("PLURAL :", speaker) 151 | plural_speakers += 1 152 | else: 153 | speakerless += 1 154 | quotes += 1 155 | data = ( 156 | independent_nouns, 157 | proper_n, 158 | substantives, 159 | anaphora, 160 | uncovered_mention, 161 | speakerless, 162 | verbless, 163 | genderless, 164 | referenceless, 165 | evident_references, 166 | plural_speakers, 167 | quotes, 168 | ) 169 | return data 170 | 171 | 172 | NLP = spacy.load("fr_core_news_lg") 173 | RULES_ANALYZER = RulesAnalyzerFactory.get_rules_analyzer(NLP) 174 | 175 | if __name__ == "__main__": 176 | parser = argparse.ArgumentParser(description="Computes statistics about the quotes and their speakers and write them to csv") 177 | parser.add_argument("--text_dir", 
type=str, help="Path to the texts directory") 178 | parser.add_argument("--target_dir", type=str, help="Path to the target directory") 179 | parser.add_argument("--output_file", type=str, default="", help="Path to the output csv file") 180 | args = parser.parse_args() 181 | TEXT_DIR = args.text_dir 182 | TARGET_DIR = args.target_dir 183 | OUTPUT_FILE = args.output_file 184 | compute_statistics(TEXT_DIR, TARGET_DIR, OUTPUT_FILE) 185 | -------------------------------------------------------------------------------- /nlp/french/evaluation/README.md: -------------------------------------------------------------------------------- 1 | # French GGT Evaluation 2 | This folder contains methodology and code for evaluating the results of the French pipeline. 3 | 4 | For consistent and reproducible results, make sure any evaluation run locally uses the **same Python environment that is running in production**. 5 | 6 | ## Download Data 7 | The raw text data containing news article text, as well as the human-annotated data, is made available upon request (please contact Maite Taboada at [mtaboada@sfu.ca](mailto:mtaboada@sfu.ca)). 8 | Obtain the directories named `humanAnnotations` and `rawtext` and place them in their respective paths as per the structure below. 9 | 10 | ```sh 11 | ├── . 12 | | ├── src 13 | | | ├── rawtexts 14 | | | ├── eval 15 | | | | └── humanAnnotations 16 | | | └── evaluate.py 17 | | | └── run_predictions.py 18 | ``` 19 | 20 | ## Set Up Environment 21 | This section assumes that the virtual environment for French NLP has already been set up. The French NLP pipeline uses a third party coreference resolution library named [coreferee](https://github.com/explosion/coreferee), which requires the use of Python 3.9. It is assumed that Python 3.9 exists on the system on which the French NLP code runs. 22 | 23 | Make sure that `gcc`, `build-essential` and `python3.9-devel` (on Red Hat/CentOS), or `python3.9-dev` (on ubuntu) are installed on the system. Also, install `python3.9-venv` for managing virtual environments, and ensure `wheel` is installed prior to installing the dependencies (as shown below) 24 | 25 | 26 | If not done already, install a virtual environment using the `requirements.txt` from the `nlp/french` directory in this repo. 27 | 28 | ```sh 29 | cd /path_to_code/GenderGapTracker/nlp/french 30 | python3.9 -m venv GRIM-FR 31 | source GRIM-FR/bin/activate 32 | python3.9 -m pip install -U pip wheel # Upgrade pip and install the wheel package first 33 | python3.9 -m pip install -r requirements.txt 34 | ``` 35 | 36 | This installs the correct versions of spaCy, its associated language model, as well as coreferee (for coreference resolution). 37 | -------------------------------------------------------------------------------- /nlp/french/merge_collections.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script merges the results from the newly created collection from the entity 3 | gender annotation script (when the user specifies the `writecol` argument) with 4 | the original collection. 5 | 6 | Only the fields specified in this file are merged with (i.e. overwrite) the original 7 | `media` collection - the remaining fields in the original collection are left untouched. 
8 | """ 9 | import argparse 10 | from multiprocessing import Pool, cpu_count 11 | from config import config 12 | import utils 13 | 14 | 15 | def update_field(existing_collection, new_collection, idx): 16 | """Overwrite existing collection's fields with new collection's fields (except IDs)""" 17 | new_id = idx['_id'] 18 | existing_id = idx['currentId'] 19 | doc = new_collection.find_one({'_id': new_id}) 20 | existing_collection.update_one( 21 | {'_id': existing_id}, 22 | {'$set': filter_dict(doc)} 23 | ) 24 | 25 | 26 | def filter_dict(dict_obj): 27 | """Return a dictionary that has the same keys/values as the original dictionary, 28 | except for a few select keys that are to be excluded. 29 | """ 30 | ignore_keys = ['_id', 'currentId'] 31 | new_dict = {key: dict_obj[key] for key in dict_obj if key not in ignore_keys} 32 | return new_dict 33 | 34 | 35 | def chunker(iterable, chunksize): 36 | """Yield a smaller chunk of a large iterable""" 37 | for i in range(0, len(iterable), chunksize): 38 | yield iterable[i:i + chunksize] 39 | 40 | 41 | def parse_chunks(chunk): 42 | """Pass through a chunk of document IDs and update fields""" 43 | db_client = utils.init_client(MONGO_ARGS) 44 | existing_collection = db_client[DB_NAME][EXISTING_COL] 45 | new_collection = db_client[DB_NAME][NEW_COL] 46 | for idx in chunk: 47 | update_field(existing_collection, new_collection, idx) 48 | 49 | 50 | def run_pool(poolsize, chunksize): 51 | """Concurrently run independent operations on multiple cores""" 52 | db_client = utils.init_client(MONGO_ARGS) 53 | # Get list of new and old IDs from new collection 54 | new_col = db_client[DB_NAME][NEW_COL] 55 | new_old_ids = list(new_col.find({}, {'_id': 1, 'currentId': 1})) 56 | print('Obtained ID list of length {}.'.format(len(new_old_ids))) 57 | # Process quotes using a pool of executors 58 | pool = Pool(processes=poolsize) 59 | pool.map(parse_chunks, chunker(new_old_ids, chunksize=chunksize)) 60 | pool.close() 61 | 62 | 63 | if __name__ == '__main__': 64 | parser = argparse.ArgumentParser() 65 | parser.add_argument('--db', type=str, default='mediaTracker', help="Database name") 66 | parser.add_argument('--oldcol', type=str, default='media', help="Existing collection name") 67 | parser.add_argument('--newcol', type=str, default='entitiesAnnotated', help="New collection name") 68 | parser.add_argument("--poolsize", type=int, default=cpu_count() + 1, help="Size of the concurrent process pool for the given task") 69 | parser.add_argument("--chunksize", type=int, default=100, help="Number of articles IDs per chunk being processed concurrently") 70 | args = vars(parser.parse_args()) 71 | 72 | # From config 73 | MONGO_ARGS = config['MONGO_ARGS'] 74 | # Parse arguments 75 | DB_NAME = args['db'] 76 | EXISTING_COL = args['oldcol'] 77 | NEW_COL = args['newcol'] 78 | poolsize = args['poolsize'] 79 | chunksize = args['chunksize'] 80 | 81 | run_pool(poolsize, chunksize) 82 | print("Finished merging collections!") 83 | -------------------------------------------------------------------------------- /nlp/french/requirements.txt: -------------------------------------------------------------------------------- 1 | requests>=2.28.1 2 | Levenshtein>=0.16.0 3 | pandas>=1.5.3,<1.6.0 4 | pymongo>=3.12.0,<4.0.0 5 | pydantic<2.0.0 6 | spacy==3.2.5 7 | fr-core-news-lg @ https://github.com/explosion/spacy-models/releases/download/fr_core_news_lg-3.2.0/fr_core_news_lg-3.2.0-py3-none-any.whl 8 | coreferee==1.3.1 9 | coreferee-model-fr @ 
https://github.com/msg-systems/coreferee/raw/master/models/coreferee_model_fr.zip 10 | statsmodels>=0.12.2 11 | -------------------------------------------------------------------------------- /nlp/french/rules/author_blocklist.txt: -------------------------------------------------------------------------------- 1 | Janvier 2 | Février 3 | Fevrier 4 | Mars 5 | Avril 6 | Mai 7 | Juin 8 | Juillet 9 | Août 10 | Aout 11 | Septembre 12 | Octobre 13 | Novembre 14 | Décembre 15 | Decembre 16 | Lundi 17 | Mardi 18 | Mercredi 19 | Jeudi 20 | Vendredi 21 | Samedi 22 | Dimanche 23 | Mise À Jour 24 | Mis À Jour 25 | Agence France-Presse 26 | Afp 27 | Associated Press 28 | La Presse Canadienne 29 | La Presse 30 | Et 31 | Est 32 | Que Je Vous Souhaite 33 | À 34 | Tva Nouvelles 35 | Journal De Montréal 36 | Journal De Montreal 37 | Le Droit 38 | Le Soleil 39 | Agence Qmi 40 | Le Quotidien 41 | À Ottawa 42 | Correspondante 43 | Professeur 44 | Linguiste 45 | Les 46 | Comme 47 | Publié 48 | Nombre 49 | Commentaires 50 | Capture D'Écran 51 | Ici.Radio-Canada.Ca 52 | Radio-Canada.Ca 53 | Radio 54 | Canada.ca 55 | Canada.Ca 56 | De Vie 57 | Où 58 | -------------------------------------------------------------------------------- /nlp/french/rules/name_patterns.jsonl: -------------------------------------------------------------------------------- 1 | {"label": "LOC", "pattern": "Niagara Falls"} 2 | {"label": "LOC", "pattern": "Rogers Place"} 3 | {"label": "LOC", "pattern": "Preah Sihanouk"} 4 | {"label": "LOC", "pattern": "Nova Scotia"} 5 | {"label": "LOC", "pattern": "Don Mills"} 6 | {"label": "LOC", "pattern": "Maple Ridge"} 7 | {"label": "LOC", "pattern": "Kneehill County"} 8 | {"label": "LOC", "pattern": "La Loche"} 9 | {"label": "LOC", "pattern": "Alberta Parks"} 10 | {"label": "LOC", "pattern": "Sioux Lookout"} 11 | {"label": "LOC", "pattern": "Rio de Janeiro"} 12 | {"label": "LOC", "pattern": "Quintana Roo"} 13 | {"label": "LOC", "pattern": "High Level"} 14 | {"label": "LOC", "pattern": "Red Deer"} 15 | {"label": "LOC", "pattern": [{"LOWER": "saint"}, {"LOWER": "john"}]} 16 | {"label": "LOC", "pattern": "Yves Paradis de la Cour du Québec"} 17 | {"label": "LOC", "pattern": "Québec"} 18 | {"label": "LOC", "pattern": "Ouje Bougoumou"} 19 | {"label": "ORG", "pattern": "OC Transpo"} 20 | {"label": "ORG", "pattern": "Rystad Energy"} 21 | {"label": "ORG", "pattern": "San Marcos"} 22 | {"label": "ORG", "pattern": "Yeni Safak"} 23 | {"label": "ORG", "pattern": "New Yorker"} 24 | {"label": "ORG", "pattern": "Maple Leafs"} 25 | {"label": "ORG", "pattern": "Canada Goose"} 26 | {"label": "ORG", "pattern": "Moose Jaw"} 27 | {"label": "ORG", "pattern": "Tim Hortons"} 28 | {"label": "ORG", "pattern": "Irving Oil"} 29 | {"label": "ORG", "pattern": "Kinder Morgan"} 30 | {"label": "ORG", "pattern": "Der Spiegel"} 31 | {"label": "ORG", "pattern": "Husky Energy"} 32 | {"label": "ORG", "pattern": "Nesbitt Burns"} 33 | {"label": "ORG", "pattern": "Royal LePage"} 34 | {"label": "ORG", "pattern": "Royal Lepage"} 35 | {"label": "ORG", "pattern": "Accueil Bonneau"} 36 | {"label": "ORG", "pattern": "Dickinson Wright"} 37 | {"label": "ORG", "pattern": "Taquan Air"} 38 | {"label": "ORG", "pattern": "Salmar Theatres"} 39 | {"label": "ORG", "pattern": "Walt Disney"} 40 | {"label": "ORG", "pattern": "McCarthy Tétrault"} 41 | {"label": "ORG", "pattern": "Helsingin Sanomat"} 42 | {"label": "ORG", "pattern": "Tk'emlúps te Secwépemc"} 43 | {"label": "ORG", "pattern": "Tk emlúps te Secwepemc"} 44 | {"label": "ORG", "pattern": "Tk emlups te 
Secwepemc First Nation"} 45 | {"label": "ORG", "pattern": "Tk'emlups te Secwepemc First Nation"} 46 | {"label": "ORG", "pattern": [{"LOWER": "la presse"}]} 47 | {"label": "ORG", "pattern": [{"LOWER": "la presse canadienne"}]} 48 | {"label": "ORG", "pattern": [{"LOWER": "ctvnews"}]} 49 | {"label": "ORG", "pattern": [{"LOWER": "ctvnews.ca"}]} 50 | {"label": "ORG", "pattern": [{"LOWER": "b"}, {"LOWER": "nai"}, {"LOWER": "brith"}]} 51 | {"label": "ORG", "pattern": [{"LOWER": "b'nai"}, {"LOWER": "brith"}]} 52 | {"label": "ORG", "pattern": "Cordé Électrique"} 53 | {"label": "ORG", "pattern": "Cogeco Connexion"} 54 | {"label": "ORG", "pattern": "Delpharm Industrie"} 55 | {"label": "ORG", "pattern": "Unither Bioélectronique"} 56 | {"label": "ORG", "pattern": "Systèmes Danfreight"} 57 | {"label": "ORG", "pattern": "Résilience Montréal"} 58 | {"label": "ORG", "pattern": "Altshuler Berzon"} 59 | {"label": "ORG", "pattern": "Odgers Berndtson"} 60 | {"label": "ORG", "pattern": "Puamun Meshkenu"} 61 | {"label": "ORG", "pattern": "Guylaine Desforges"} 62 | {"label": "ORG", "pattern": [{"LOWER": "affaire"}, {"TEXT": {"REGEX": "[A-Za-z]+"}}]} 63 | {"label": "ORG", "pattern": [{"LOWER": "vkousno"}, {"LOWER": "i"}, {"LOWER": "totchka"}]} 64 | {"label": "MISC", "pattern": [{"LOWER": "wakanda"}, {"LOWER": "forever"}]} 65 | {"label": "MISC", "pattern": [{"LOWER": "mi'kmaw"}]} 66 | {"label": "MISC", "pattern": "Bergdorf Goodman"} 67 | {"label": "MISC", "pattern": "Yuk Yuk"} 68 | {"label": "MISC", "pattern": [{"LOWER": "wet'suwet'en"}]} 69 | {"label": "MISC", "pattern": "Vuntut Gwitchin"} 70 | {"label": "MISC", "pattern": "Wahbung Abinoonjiiag"} 71 | {"label": "MISC", "pattern": "Manitoba Keewatinowi Okimakanak"} 72 | {"label": "MISC", "pattern": "Keewatinowi Okimakanak"} 73 | {"label": "MISC", "pattern": "Selon"} 74 | {"label": "MISC", "pattern": "Brent de la mer du Nord"} 75 | {"label": "MISC", "pattern": "Père Noël"} 76 | {"label": "PER", "pattern": "Caroline"} 77 | {"label": "PER", "pattern": "Virginie"} 78 | {"label": "PER", "pattern": "Georges Washington"} -------------------------------------------------------------------------------- /research_dashboard/admin/apps/topsources.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import dash 3 | from dash import dcc, html 4 | from dash.dependencies import Input, Output, State 5 | from pymongo import MongoClient 6 | from server import app 7 | from config import config 8 | 9 | MONGO_ARGS = config['MONGO_ARGS'] 10 | SOURCES_DB = config['DB']['SOURCES_DB'] 11 | SOURCES_COL = config['DB']['SOURCES_COL'] 12 | 13 | 14 | # ========== Functions ================ 15 | def get_doc_ids_from_db(): 16 | with MongoClient(**MONGO_ARGS) as connection: 17 | read_collection = connection[SOURCES_DB][SOURCES_COL] 18 | _ids = read_collection.find().distinct('_id') 19 | return sorted(_ids) 20 | 21 | 22 | def num2str_month(date_str): 23 | date_obj = datetime.datetime.strptime(date_str, "%Y%m") 24 | date_string = datetime.datetime.strftime(date_obj, "%B %Y") 25 | return date_string 26 | 27 | 28 | def format_dates(_ids): 29 | date_strings = [num2str_month(_id) for _id in _ids] 30 | return date_strings 31 | 32 | 33 | def get_top_n_words(topic_dict, n=5): 34 | """Return a list of top-n words for each topic. This list can 35 | then be used as an axis label if required. 
36 | """ 37 | top_words = [] 38 | for num, data in topic_dict.items(): 39 | sorted_words = {k: v for k, v in sorted(data['words'].items(), 40 | key=lambda x: x[1], 41 | reverse=True 42 | )} 43 | words = sorted_words.keys() 44 | top_n_words = list(words)[:n] 45 | top_words.append(', '.join(top_n_words)) 46 | return top_words 47 | 48 | 49 | def list_topic_words(topic_dict): 50 | """Return a full list of words for a particular topic""" 51 | sorted_words = {k: v for k, v in sorted(topic_dict['words'].items(), 52 | key=lambda x: x[1], 53 | reverse=True 54 | )} 55 | words = sorted_words.keys() 56 | top_n_words = list(words) 57 | top_words = ', '.join(top_n_words) 58 | return top_words 59 | 60 | 61 | # ========== App Layout ================ 62 | 63 | def layout(): 64 | """Dynamically serve a layout based on updated DB values (for dropdown menu)""" 65 | # Needs db connection! (Set up tunnel if testing app locally) 66 | _ids = get_doc_ids_from_db() 67 | dropdown_dates = {num2str_month(_id): _id for _id in _ids} 68 | 69 | children_list = [ 70 | html.Div([ 71 | html.Div([ 72 | html.H3('Write observations for monthly top sources by gender'), 73 | dcc.Markdown(''' 74 | This app allows a user to write observations and comments for a particular month's top quoted 75 | sources. The text that is written is then saved on the database, and displayed on the [top sources 76 | dashboard app](https://gendergaptracker.research.sfu.ca/apps/topsources). 77 | '''), 78 | ]), 79 | html.H4('Topic month'), 80 | html.P(''' 81 | Select the topic month from the dropdown to inspect/update the word distributions for 82 | that month. 83 | '''), 84 | html.Div( 85 | dcc.Loading( 86 | id='load-data-progress', 87 | children=[ 88 | dcc.Store(id='top-sources-stats'), 89 | ]) 90 | ), 91 | dcc.Dropdown( 92 | id='date-dropdown', 93 | options=[ 94 | {'label': date_str, 'value': date_num} 95 | for date_str, date_num in dropdown_dates.items() 96 | ], 97 | value=_ids[-1], 98 | style={'text-align': 'center'} 99 | ), 100 | html.Br(), 101 | html.Label([ 102 | html.A('Markdown syntax', href='https://www.markdownguide.org/basic-syntax/'), 103 | ]), 104 | html.P(''' 105 | The text box below accepts Markdown syntax for embedding URLs: [Highlighted text](https://example.com). 106 | Make sure to route external URLs with the 'http' or 'https' prefix as shown in the 107 | example. 
108 | '''), 109 | 110 | html.Div(id='create-text-input'), 111 | html.Div([html.Button(id='write-button', n_clicks=0, children='Save entries')], 112 | style={'display': 'flex', 'justifyContent': 'center'}), 113 | dcc.Loading( 114 | id='write-progress', 115 | children=[ 116 | html.P(id='push-comment-fields') 117 | ], type='default' 118 | ) 119 | ]) 120 | ] 121 | return children_list 122 | 123 | 124 | # ========== Callbacks ================ 125 | @app.callback(Output('top-sources-stats', 'data'), [Input('date-dropdown', 'value')]) 126 | def get_monthly_stats(value): 127 | with MongoClient(**MONGO_ARGS) as connection: 128 | read_collection = connection[SOURCES_DB][SOURCES_COL] 129 | stats = read_collection.find({'_id': value}) 130 | # Collect top sources stats 131 | stats = list(stats)[0] 132 | return stats 133 | 134 | 135 | @app.callback(Output('create-text-input', 'children'), [Input('top-sources-stats', 'data')]) 136 | def create_text_input(stats): 137 | comment = stats['comment'] 138 | # Return the text area with existing comment (if any) 139 | inp_box = html.Div( 140 | dcc.Textarea( 141 | id='text-input', 142 | placeholder="Enter your comments/observations for the selected month's top sources", 143 | value=comment, 144 | className='textarea', 145 | style={ 146 | 'width': '100%', 'height': 350, 'verticalAlign': 'top', 147 | 'fontFamily': 'Arial', 'fontColor': '#515151', 148 | } 149 | ), 150 | style={'display': 'flex', 'justifyContent': 'center'} 151 | ), 152 | return inp_box 153 | 154 | 155 | @app.callback(Output('push-comment-fields', 'data'), 156 | [Input('write-button', 'n_clicks'), 157 | Input('date-dropdown', 'value'), 158 | Input('top-sources-stats', 'data')], 159 | [State('text-input', 'value')]) 160 | def update_db(n_clicks, date_id, stats, comment): 161 | """Check if write-button is clicked, only then update DB""" 162 | ctx = dash.callback_context 163 | if "write-button" in ctx.triggered[0]["prop_id"]: 164 | with MongoClient(**MONGO_ARGS) as connection: 165 | collection = connection[SOURCES_DB][SOURCES_COL] 166 | # Overwrite existing topic names with new user-entered names 167 | stats['comment'] = comment 168 | # Write topics 169 | collection.find_one_and_update({'_id': date_id}, {'$set': stats}) 170 | return "Updated user comments/observations in the database.." 
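# --- Illustrative sanity check (a hedged sketch, not part of the original app) ----
# The callback above writes the edited comment back to the monthlySources collection.
# When this module is run directly (it is normally imported by the dashboard), the
# block below shows one way to confirm that a comment was persisted. The month id
# "202104" is hypothetical; ids follow the "%Y%m" format handled by num2str_month().
if __name__ == "__main__":
    with MongoClient(**MONGO_ARGS) as connection:
        collection = connection[SOURCES_DB][SOURCES_COL]
        doc = collection.find_one({"_id": "202104"}, {"comment": 1})
        print(doc["comment"] if doc and doc.get("comment") else "No comment saved for this month")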
171 | -------------------------------------------------------------------------------- /research_dashboard/admin/apps/unknownsources.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import pandas as pd 3 | from dash import dcc, html, dash_table 4 | from dash.dependencies import Input, Output 5 | from pymongo import MongoClient 6 | from server import app, logger 7 | from config import config 8 | 9 | MONGO_ARGS = config['MONGO_ARGS'] 10 | SOURCES_DB = config['DB']['SOURCES_DB'] 11 | SOURCES_COL = config['DB']['SOURCES_COL'] 12 | NUM_SOURCES_TO_SHOW = 20 13 | 14 | 15 | # ========== Functions ================ 16 | 17 | def get_doc_ids_from_db(): 18 | with MongoClient(**MONGO_ARGS) as connection: 19 | read_collection = connection[SOURCES_DB][SOURCES_COL] 20 | _ids = read_collection.find().distinct('_id') 21 | return sorted(_ids) 22 | 23 | 24 | def num2str_month(date_str): 25 | date_obj = datetime.datetime.strptime(date_str, "%Y%m") 26 | date_string = datetime.datetime.strftime(date_obj, "%B %Y") 27 | return date_string 28 | 29 | 30 | def format_dates(_ids): 31 | date_strings = [num2str_month(_id) for _id in _ids] 32 | return date_strings 33 | 34 | 35 | def get_unknown_sources(stats): 36 | """Convert JSON object of top sources to pandas DataFrame""" 37 | top_unknown = pd.DataFrame(stats['topUnknownSources']) 38 | top_unknown.columns = ['unknown_count', 'unknown_names'] 39 | top_unknown['unknown_count'] = top_unknown['unknown_count'].astype('int') 40 | df = (top_unknown.sort_values(by='unknown_count', ascending=False) 41 | .iloc[:NUM_SOURCES_TO_SHOW, :] 42 | .reset_index(drop=True)) 43 | output = df.to_dict(orient='records') 44 | return output 45 | 46 | 47 | # ========== App Layout ================ 48 | 49 | def layout(): 50 | """Dynamically serve a layout based on updated DB values (for dropdown menu)""" 51 | # Needs db connection! (Set up tunnel if testing app locally) 52 | _ids = get_doc_ids_from_db() 53 | dropdown_dates = {num2str_month(_id): _id for _id in _ids} 54 | 55 | children_list = [ 56 | html.Div([ 57 | html.Div([ 58 | html.H3('View unknown sources'), 59 | dcc.Markdown(''' 60 | This app allows a user to inspect the top unknown sources extracted for a 61 | particular month. The reason we obtain unknown sources is twofold—sometimes, 62 | spaCy incorrectly tags an organization or geopolitical entity (i.e., location) as 63 | a person, leading to the gender service erring on the side of caution and not 64 | assigning a gender. In other cases, a person's name is ambiguous, or is non-standard 65 | (i.e., non-western or non-anglicized), so the gender services we use are unaware of 66 | these names' genders. 67 | 68 | Inspect the list of unknown sources for a given month by choosing a 69 | month from the dropdown menu. 
70 | '''), 71 | ]), 72 | dcc.Dropdown( 73 | id='date-dropdown', 74 | options=[ 75 | {'label': date_str, 'value': date_num} 76 | for date_str, date_num in dropdown_dates.items() 77 | ], 78 | value=_ids[-1], 79 | style={'text-align': 'center'} 80 | ), 81 | html.Div(dcc.Store(id='top-sources-stats-2')), 82 | html.Br(), 83 | html.Div( 84 | dash_table.DataTable( 85 | id='unknown-sources-table', 86 | columns=[ 87 | {'name': 'Count', 'id': 'unknown_count'}, 88 | {'name': 'Unknown sources', 'id': 'unknown_names'}, 89 | ], 90 | style_table={'overflowX': 'auto'}, 91 | style_cell={ 92 | 'backgroundColor': 'rgba(102, 204, 204, 0.05)', 93 | 'textAlign': 'left', 94 | 'font_family': 'Arial', 95 | }, 96 | style_data={'height': 'auto', 'lineHeight': '30px'}, 97 | style_cell_conditional=[ 98 | { 99 | 'if': {'column_id': 'unknown_count'}, 100 | 'minWidth': '100px', 101 | 'width': '100px', 102 | 'maxWidth': '100px', 103 | }, 104 | ], 105 | style_header={ 106 | 'backgroundColor': 'rgb(255, 255, 255)', 107 | 'text-align': 'left', 108 | }, 109 | style_as_list_view=True, 110 | ) 111 | ), 112 | dcc.Markdown(''' 113 | #### 1. Fix spaCy NER rules 114 | To address incorrect spaCy tags, we add a rule to the below file: 115 | [`WomenInMedia/NLP/main/rules/name_patterns.jsonl`](https://github.com/maitetaboada/WomenInMedia/blob/master/NLP/main/rules/name_patterns.jsonl) 116 | 117 | The below tags are defined for now (others can be added as required): 118 | * `GPE`: Countries, cities, states, famous landmarks 119 | * `ORG`: Companies, agencies, institutions, etc. 120 | * `FAC`: Buildings, airports, highways, bridges, etc. 121 | * `NORP`: Nationalities or religious or political groups. 122 | * `EVENT`: Named hurricanes, battles, wars, sports events, etc. 123 | 124 | For a full list of tags, see the [spaCy documentation](https://spacy.io/api/annotation#named-entities). 125 | 126 | #### 2. Update manual gender cache 127 | Alteratively, for names that are of person (but are ambiguous), we can update the 128 | manual gender cache (`genderCache/manual`). 
This is done by populating a CSV file 129 | with the correct gender for each person's name and running the manual cache update script: 130 | [`WomenInMedia/NLP/experiments/genderCache/manual_cache`](https://github.com/maitetaboada/WomenInMedia/tree/master/NLP/experiments/genderCache/manual_cache) 131 | ''') 132 | ]) 133 | ] 134 | return children_list 135 | 136 | 137 | # ========== Callbacks ================ 138 | 139 | @app.callback(Output('top-sources-stats-2', 'data'), [Input('date-dropdown', 'value')]) 140 | def get_monthly_stats(value): 141 | with MongoClient(**MONGO_ARGS) as connection: 142 | read_collection = connection[SOURCES_DB][SOURCES_COL] 143 | stats = read_collection.find({'_id': value}) 144 | # Collect top sources stats 145 | stats = list(stats)[0] 146 | return stats 147 | 148 | 149 | @app.callback(Output('unknown-sources-table', 'data'), [Input('top-sources-stats-2', 'data')]) 150 | def get_unknown_sources_data(stats): 151 | try: 152 | output = get_unknown_sources(stats) 153 | logger.info(f'Obtained unknown sources of length {len(output)}') 154 | except Exception as e: 155 | logger.error("Unknown sources app error:", e) 156 | output = [] 157 | return output 158 | 159 | 160 | -------------------------------------------------------------------------------- /research_dashboard/admin/auth.py: -------------------------------------------------------------------------------- 1 | credentials = { 2 | 'admin': 'admin_password' 3 | } 4 | -------------------------------------------------------------------------------- /research_dashboard/admin/config.py: -------------------------------------------------------------------------------- 1 | config = { 2 | 'MONGO_ARGS': { 3 | 'host': ['mongo0', 'mongo1', 'mongo2'], 4 | # 'host': 'localhost', 5 | 'port': 27017, 6 | 'username': 'username', 7 | 'password': 'password', 8 | 'authSource': 'admin', 9 | 'readPreference': 'primaryPreferred' 10 | }, 11 | 'DB': { 12 | 'READ_DB': 'topicModel', 13 | 'READ_COL': 'topicResults', 14 | 'SOURCES_DB': 'mediaTracker', 15 | 'SOURCES_COL': 'monthlySources', 16 | 'GENDER_DB': 'genderCache', 17 | 'MANUAL_NAME_COL': 'manual', 18 | 'FIRST_NAME_COL': 'firstNamesCleaned', 19 | } 20 | } -------------------------------------------------------------------------------- /research_dashboard/admin/run.py: -------------------------------------------------------------------------------- 1 | from dash import dcc, html 2 | from dash.dependencies import Input, Output 3 | from server import app, server 4 | from apps import topiclabels, topsources, unknownsources, updatecache 5 | 6 | 7 | box_style = { 8 | 'padding': '10px 10px 5px 5px', 9 | 'marginLeft': 'auto', 'marginRight': 'auto', 10 | } 11 | 12 | # Define the main app's layout 13 | app.layout = html.Div([ 14 | dcc.Location(id='url', refresh=False), 15 | html.Div(id='page-content') 16 | ]) 17 | 18 | # Layout for text on home page 19 | home_page = [ 20 | html.Div(children=[ 21 | html.H2("Write to production database"), 22 | dcc.Markdown(""" 23 | This is an admin dashboard that allows write access to our production MongoDB database 24 | containing data from the Gender Gap Tracker. Any GUI-based services that allow a user to 25 | write to the database can be included as a separate application through this dashboard 26 | structure. Extend the available functionality by adding new apps to button menu shown. 
27 | 28 | Contact: Maite Taboada, [mtaboada@sfu.ca](mailto:mtaboada@sfu.ca) 29 | """ 30 | ), 31 | html.P(['© 2021 ', html.A('Discourse Processing Lab.', href='https://www.sfu.ca/discourse-lab')], 32 | style={'font-size': '0.8em', 'color': '#a0a0a0'} 33 | ) 34 | ]) 35 | ] 36 | 37 | 38 | def get_page_divs(page_layout): 39 | page = html.Div(children=[ 40 | html.Div( 41 | children=[html.Table( 42 | html.Tr( 43 | [html.Td(html.Img(src="/static/SFULogo.png", style={'padding': '10px 10px 5px 5px', 'height': '50px', 'width': '165px'}))] + 44 | [html.Td(html.Img(src="/static/discourse-lab-logo.jpeg", style={'padding': '10px 10px 5px 5px', 'height': '100px', 'width': '165px'}))] + 45 | [html.Td(html.H2("Measuring gender bias in media"))] 46 | ) 47 | )], className='mainheader'), 48 | html.Br(), 49 | html.Div( 50 | children=[ 51 | html.Div([ 52 | dcc.Link('Home', href='/'), 53 | dcc.Link('Topic Model Labelling', href='/apps/topiclabels'), 54 | dcc.Link('Top sources: Comments', href='/apps/topsources'), 55 | dcc.Link('Unknown gender sources', href='/apps/unknownsources'), 56 | dcc.Link('Update gender cache', href='/apps/updatecache'), 57 | ], className='menu') 58 | ]), 59 | html.Div(children=page_layout, className='main', style={'text-align': 'justify'}), 60 | ], className='container') 61 | return page 62 | 63 | 64 | @app.callback(Output('page-content', 'children'), 65 | [Input('url', 'pathname')]) 66 | def display_page(pathname): 67 | if pathname == '/apps/topiclabels': 68 | return get_page_divs(topiclabels.layout()) 69 | elif pathname == '/apps/topsources': 70 | return get_page_divs(topsources.layout()) 71 | elif pathname == '/apps/unknownsources': 72 | return get_page_divs(unknownsources.layout()) 73 | elif pathname == '/apps/updatecache': 74 | return get_page_divs(updatecache.layout()) 75 | else: 76 | return get_page_divs(home_page) 77 | 78 | 79 | if __name__ == '__main__': 80 | # app.run_server(host='0.0.0.0', port=8050, dev_tools_ui=False, threaded=True, debug=True) 81 | app.run_server(host='0.0.0.0', port=8050, debug=True) -------------------------------------------------------------------------------- /research_dashboard/admin/server.py: -------------------------------------------------------------------------------- 1 | import flask 2 | import os 3 | import sys 4 | import dash 5 | # Logging 6 | import logging 7 | from logging.handlers import RotatingFileHandler 8 | # auth.py simply contains a dictionary {'username': 'password'} that is used 9 | # for basic HTTP authentication 10 | import dash_auth 11 | from auth import credentials 12 | 13 | server = flask.Flask(__name__) 14 | server.secret_key = os.urandom(24) 15 | 16 | app = dash.Dash(__name__, server=server, suppress_callback_exceptions=True) 17 | app.css.config.serve_locally = True 18 | app.scripts.config.serve_locally = True 19 | app.title = "Write data to research dashboard - GGT" 20 | # authentication 21 | authorize = dash_auth.BasicAuth(app, credentials) 22 | 23 | 24 | def create_app_logger(filename): 25 | """Logger format and timed handling""" 26 | logger = logging.getLogger(filename) 27 | logger.setLevel(logging.DEBUG) 28 | formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') 29 | os.makedirs("logs", exist_ok=True) 30 | rotateHandler = RotatingFileHandler('logs/' + "g-tracker-admin-api.log", 31 | mode='a', maxBytes=1000, backupCount=3) 32 | rotateHandler.setFormatter(formatter) 33 | stream = logging.StreamHandler(sys.stdout) 34 | stream.setFormatter(formatter) 35 | 36 | logger.addHandler(rotateHandler) 37 | 
logger.addHandler(stream) 38 | return logger 39 | 40 | 41 | logger = create_app_logger('adminDashLogger') -------------------------------------------------------------------------------- /research_dashboard/admin/static/SFULogo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sfu-discourse-lab/GenderGapTracker/5501de31e8598e18ace47982220e91961ca5460a/research_dashboard/admin/static/SFULogo.png -------------------------------------------------------------------------------- /research_dashboard/admin/static/discourse-lab-logo.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sfu-discourse-lab/GenderGapTracker/5501de31e8598e18ace47982220e91961ca5460a/research_dashboard/admin/static/discourse-lab-logo.jpeg -------------------------------------------------------------------------------- /research_dashboard/aliases.txt: -------------------------------------------------------------------------------- 1 | Primary name, alias1, alias 2, ... 2 | Rahaf Mohammed al Qunun, Rahaf Mohammed Alqunun, Rahaf Mohammed 3 | Queen Elizabeth II, Queen Elizabeth 4 | Sarah Huckabee Sanders, Sarah Sanders 5 | Michelle Rempel Garner, Michelle Rempel 6 | Francois Philippe Champagne, Francois Philippe 7 | Bill de Blasio, Mayor Bill de Blasio 8 | Volodymyr Zelensky, Volodymyr Zelenskiy, Volodymyr Zelenskyy 9 | Svetlana Tikhanovskaya, Sviatlana Tsikhanouskaya 10 | -------------------------------------------------------------------------------- /research_dashboard/apps/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sfu-discourse-lab/GenderGapTracker/5501de31e8598e18ace47982220e91961ca5460a/research_dashboard/apps/__init__.py -------------------------------------------------------------------------------- /research_dashboard/assets/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sfu-discourse-lab/GenderGapTracker/5501de31e8598e18ace47982220e91961ca5460a/research_dashboard/assets/favicon.ico -------------------------------------------------------------------------------- /research_dashboard/config.py: -------------------------------------------------------------------------------- 1 | host = ["mongo0", "mongo1", "mongo2"] 2 | # host = "localhost" 3 | 4 | config = { 5 | "MONGO_ARGS": { 6 | "host": host, 7 | "port": 27017, 8 | "username": "username", 9 | "password": "password", 10 | "authSource": "admin", 11 | "readPreference": "primaryPreferred", 12 | }, 13 | "DB": { 14 | "READ_DB": "topicModel", 15 | "READ_COL": "topicResults", 16 | "SOURCES_DB": "mediaTracker", 17 | "SOURCES_COL": "monthlySources", 18 | "SOURCES_TIME_SERIES_COL": "monthlySourcesTimeSeries", 19 | }, 20 | "GENDER_RECOGNITION": { 21 | "GENDERIZE_ENABLED": False, 22 | "GENDERAPI_ENABLED": True, 23 | "GENDERAPI_TOKEN": "JSON_AUTH_TOKEN", 24 | "MANUAL_CACHE": "manual", 25 | "GENDERAPI_CACHE": "genderAPICleaned", 26 | "GENDERIZE_CACHE": "genderizeCleaned", 27 | "FIRSTNAME_CACHE": "firstNamesCleaned", 28 | }, 29 | "NLP": { 30 | "MAX_BODY_LENGTH": 20000, 31 | "AUTHOR_BLOCKLIST": "../nlp/english/rules/author_blocklist.txt", 32 | "NAME_PATTERNS": "../nlp/english/rules/name_patterns.jsonl", 33 | "QUOTE_VERBS": "../nlp/english/rules/quote_verb_list.txt", 34 | }, 35 | "ENGLISH_OUTLETS": [ 36 | "CBC News", 37 | "CTV News", 38 | "Global News", 39 | "Huffington Post", 40 | 
"National Post", 41 | "The Globe And Mail", 42 | "The Star", 43 | ], 44 | "FRENCH_OUTLETS": [ 45 | "Journal De Montreal", 46 | "La Presse", 47 | "Le Devoir", 48 | "Le Droit", 49 | "Radio Canada", 50 | "TVA News", 51 | ], 52 | } 53 | -------------------------------------------------------------------------------- /research_dashboard/run.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from dash import dcc, html 3 | from dash.dependencies import Input, Output 4 | 5 | from server import app, server 6 | from apps import ( 7 | textanalyzer, 8 | topicmodel, 9 | topsources, 10 | topsourcetrends, 11 | articlecounts, 12 | dailywomenenglish, 13 | ) 14 | 15 | box_style = { 16 | 'padding': '10px 10px 5px 5px', 17 | 'marginLeft': 'auto', 'marginRight': 'auto', 18 | } 19 | 20 | # Define the main app's layout 21 | app.layout = html.Div([ 22 | dcc.Location(id='url', refresh=False), 23 | html.Div(id='page-content') 24 | ]) 25 | 26 | # Layout for text on home page 27 | home_page = [ 28 | html.Div(children=[ 29 | html.Br(), 30 | dcc.Markdown( 31 | """ 32 | This research dashboard showcases results from our study on gender bias in 33 | the media. We present the [Gender Gap Tracker 34 | (GGT)](https://gendergaptracker.informedopinions.org/), an automated system 35 | that measures men and women’s voices on seven major Canadian news outlets in 36 | real time. We analyze the rich information in news articles using Natural 37 | Language Processing (NLP) and quantify the discrepancy in proportions of men 38 | and women quoted. Our larger goals through this project are to enhance 39 | awareness of women’s portrayal in public discourse through hard evidence, 40 | and to encourage news organizations to provide a more diverse set of voices 41 | in their reporting. 42 | 43 | The Gender Gap Tracker is a collaboration between [Informed 44 | Opinions](https://informedopinions.org/), a non-profit dedicated to 45 | amplifying women’s voices in media and Simon Fraser University, through the 46 | [Discourse Processing Lab] (https://www.sfu.ca/discourse-lab.html) and the 47 | [Big Data Initiative](https://www.sfu.ca/big-data/big-data-sfu). 48 | 49 | See our peer-reviewed publications for more detailed technical information 50 | on our methodology: 51 | 52 | 1. Asr FT, Mazraeh M, Lopes A, Gautam V, Gonzales J, Rao P, Taboada M. 53 | (2021) The Gender Gap Tracker: Using Natural Language Processing to 54 | measure gender bias in media. *PLoS ONE 16(1):e0245533*. 55 | https://doi.org/10.1371/journal.pone.0245533 56 | 2. Rao P, Taboada M. (2021), Gender bias in the news: A scalable topic 57 | modelling and visualization framework. *Frontiers in Artificial 58 | Intelligence, 4(82)*. https://doi.org/10.3389/frai.2021.664737 59 | 60 | All of our code for scraping, NLP, topic modelling and data visualization is 61 | publicly available on GitHub so that others can benefit from the 62 | methodology: 63 | https://github.com/sfu-discourse-lab/GenderGapTracker 64 | 65 | For more information about the research methodology and for questions 66 | regarding collaboration, please contact Maite Taboada at 67 | [mtaboada@sfu.ca](mailto:mtaboada@sfu.ca). 
68 | """ 69 | ), 70 | html.P( 71 | [ 72 | f'© {datetime.today().year} ', 73 | html.A('Discourse Processing Lab', 74 | href='https://www.sfu.ca/discourse-lab'), 75 | ], style={'font-size': '0.8em', 'color': '#a0a0a0'} 76 | ) 77 | ]) 78 | ] 79 | 80 | 81 | def get_page_divs(page_layout, enable_footer=True): 82 | page = html.Div(children=[ 83 | html.Div( 84 | children=[html.Table( 85 | html.Tr( 86 | [html.Td(html.Img(src="/static/SFULogo.png", style={'padding': '10px 10px 5px 5px', 'height': '50px', 'width': '165px'}))] + 87 | [html.Td(html.Img(src="/static/discourse-lab-logo.jpeg", style={'padding': '10px 10px 5px 5px', 'height': '100px', 'width': '165px'}))] + 88 | [html.Td(html.H3("Measuring gender bias in media"))] 89 | ) 90 | )], className='mainheader'), 91 | html.Br(), 92 | html.Div( 93 | children=[ 94 | html.Div([ 95 | dcc.Link('Home', href='/'), 96 | dcc.Link('Text analyzer', href='/apps/textanalyzer'), 97 | dcc.Link('Topic models', href='/apps/topicmodel'), 98 | dcc.Link('Top women and men quoted', href='/apps/topsources'), 99 | dcc.Link('Monthly trends: People quoted', href='/apps/topsourcetrends'), 100 | dcc.Link('Daily % women quoted', href='/apps/dailywomenenglish'), 101 | dcc.Link('Weekly article counts', href='/apps/articlecounts'), 102 | ], className='menu') 103 | ]), 104 | html.Div(children=page_layout, className='main'), 105 | html.Div(children=case_footer(enable_footer)) 106 | ], className='container') 107 | return page 108 | 109 | 110 | def case_footer(enable_footer): 111 | if enable_footer: 112 | footer = html.Div( 113 | children=[html.Table( 114 | html.Tr( 115 | [html.Td(html.Img(src="/static/SFULogo.png", style={'height': '30px', 'width': '120px'}))] + 116 | [html.Td(html.Img(src="/static/discourse-lab-logo.jpeg", style={'height': '60px', 'width': '100px'}))] + 117 | [html.Td(html.Div(html.P([f"© {datetime.today().year} Discourse Processing Lab."])))] 118 | ) 119 | ) 120 | ], className='mainfooter'), 121 | else: 122 | footer = html.Div([]) 123 | return footer 124 | 125 | 126 | @app.callback(Output('page-content', 'children'), 127 | [Input('url', 'pathname')]) 128 | def display_page(pathname): 129 | if pathname == '/apps/textanalyzer': 130 | return get_page_divs(textanalyzer.layout()) 131 | elif pathname == '/apps/topicmodel': 132 | return get_page_divs(topicmodel.layout()) 133 | elif pathname == '/apps/topsources': 134 | return get_page_divs(topsources.layout()) 135 | elif pathname == '/apps/topsourcetrends': 136 | return get_page_divs(topsourcetrends.layout()) 137 | elif pathname == '/apps/dailywomenenglish': 138 | return get_page_divs(dailywomenenglish.layout()) 139 | elif pathname == '/apps/articlecounts': 140 | return get_page_divs(articlecounts.layout()) 141 | else: 142 | return get_page_divs(home_page, enable_footer=False) 143 | 144 | 145 | if __name__ == '__main__': 146 | app.run_server(host='0.0.0.0', port=8050, debug=True) 147 | -------------------------------------------------------------------------------- /research_dashboard/server.py: -------------------------------------------------------------------------------- 1 | import flask 2 | import os 3 | import dash 4 | import dash_bootstrap_components as dbc 5 | # For language model and loggers 6 | import sys 7 | import spacy 8 | import neuralcoref 9 | from spacy.pipeline import EntityRuler 10 | import logging 11 | from logging.handlers import RotatingFileHandler 12 | 13 | server = flask.Flask(__name__) 14 | server.secret_key = os.urandom(24) 15 | 16 | app = dash.Dash( 17 | __name__, 18 | server=server, 19 | 
suppress_callback_exceptions=True, 20 | external_stylesheets=[dbc.themes.BOOTSTRAP], 21 | meta_tags=[ 22 | { 23 | 'name': 'Measuring gender bias in media - SFU', 24 | 'content': 'A dashboard to analyze gender discrepancies in mainstream Canadian news media.' 25 | }, 26 | { 27 | 'property': 'og:image', 28 | 'content': 'https://www.sfu.ca/content/sfu/discourse-lab/jcr:content/main_content/image_0.img.2000.high.jpg/1499291765186.jpeg', 29 | } 30 | ], 31 | ) 32 | app.title = "Measuring gender bias in media - SFU" 33 | # Serve JS and CSS locally 34 | app.css.config.serve_locally = True 35 | app.scripts.config.serve_locally = True 36 | 37 | 38 | def create_app_logger(filename): 39 | """Logger format and timed handling""" 40 | logger = logging.getLogger(filename) 41 | logger.setLevel(logging.DEBUG) 42 | formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') 43 | os.makedirs("logs", exist_ok=True) 44 | rotateHandler = RotatingFileHandler('logs/' + "g-tracker-research-api.log", 45 | mode='a', maxBytes=1_000_000, backupCount=3) 46 | rotateHandler.setFormatter(formatter) 47 | stream = logging.StreamHandler(sys.stdout) 48 | stream.setFormatter(formatter) 49 | 50 | logger.addHandler(rotateHandler) 51 | logger.addHandler(stream) 52 | return logger 53 | 54 | 55 | def load_spacy_lang(lang='en_core_web_sm'): 56 | """Return a specific spaCy language model for the NLP module""" 57 | logger.info(f"Loading spaCy language model: '{lang}'") 58 | nlp = spacy.load(lang) 59 | logger.info("Done...") 60 | # Add neuralcoref pipe 61 | coref = neuralcoref.NeuralCoref(nlp.vocab, max_dist=200) 62 | nlp.add_pipe(coref, name='neuralcoref') 63 | return nlp 64 | 65 | 66 | logger = create_app_logger('userInputDashLogger') 67 | # Load spaCy Model 68 | print('Loading spaCy language model...') 69 | spacy_lang = spacy.load('en_core_web_sm') 70 | # Add custom named entity rules for non-standard person names that spaCy doesn't automatically identify 71 | ruler = EntityRuler(spacy_lang, overwrite_ents=True).from_disk('../nlp/english/rules/name_patterns.jsonl') 72 | spacy_lang.add_pipe(ruler) 73 | # Add neuralcoref pipe 74 | coref = neuralcoref.NeuralCoref(spacy_lang.vocab, max_dist=200) 75 | spacy_lang.add_pipe(coref, name='neuralcoref') 76 | print('Finished loading.') 77 | 78 | -------------------------------------------------------------------------------- /research_dashboard/static/GGT_topic_model_technical_report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sfu-discourse-lab/GenderGapTracker/5501de31e8598e18ace47982220e91961ca5460a/research_dashboard/static/GGT_topic_model_technical_report.pdf -------------------------------------------------------------------------------- /research_dashboard/static/SFULogo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sfu-discourse-lab/GenderGapTracker/5501de31e8598e18ace47982220e91961ca5460a/research_dashboard/static/SFULogo.png -------------------------------------------------------------------------------- /research_dashboard/static/discourse-lab-logo.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sfu-discourse-lab/GenderGapTracker/5501de31e8598e18ace47982220e91961ca5460a/research_dashboard/static/discourse-lab-logo.jpeg -------------------------------------------------------------------------------- 
/research_dashboard/static/sfu_discourse_thumbnail.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sfu-discourse-lab/GenderGapTracker/5501de31e8598e18ace47982220e91961ca5460a/research_dashboard/static/sfu_discourse_thumbnail.png -------------------------------------------------------------------------------- /research_dashboard/static/topic-pipeline-flowchart-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sfu-discourse-lab/GenderGapTracker/5501de31e8598e18ace47982220e91961ca5460a/research_dashboard/static/topic-pipeline-flowchart-1.png -------------------------------------------------------------------------------- /research_dashboard/static/topic-pipeline-flowchart-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sfu-discourse-lab/GenderGapTracker/5501de31e8598e18ace47982220e91961ca5460a/research_dashboard/static/topic-pipeline-flowchart-2.png -------------------------------------------------------------------------------- /scraper/README.md: -------------------------------------------------------------------------------- 1 | # Scraping 2 | This section contains the code we used for scraping news article content from various Canadian outlets. Note that we store all our data on a MongoDB database, so the scraper code shown in this repo can be modified accordingly if using any other database downstream. The code in this directory was tested on Python 3.6, but should be valid for higher versions. 3 | 4 | ## Required installations for scraping and data storage 5 | * MongoDB: Installation instructions [here](https://docs.mongodb.com/manual/tutorial/install-mongodb-on-ubuntu/). 6 | * Install Python 3.6+ and follow the below instructions to prepare the Python environment. Make sure that `gcc`, `build-essential` and `python3-devel` (on Red Hat/CentOS), or `python3-dev` (on ubuntu) are installed on the system. Also, install `python3-venv` for managing virtual environments. 7 | * Newspaper3k: We use our own [custom fork of the newspaper library](https://github.com/aleaugustoplus/newspaper) to help in the process of collecting data from news websites 8 | * Install the customized newspaper library into a Python virtual environment using the command `pip install -r requirements.txt` on the requirements file provided in this directory, [which is obtained from the source repo](https://github.com/aleaugustoplus/newspaper/blob/master/requirements.txt). 9 | 10 | 11 | ## News Sources 12 | We scrape news articles from the following Canadian news organizations' websites. The articles in our database date back to October 2018. 13 | 14 | #### English 15 | 1. CBC News 16 | 2. CTV News 17 | 3. Global News 18 | 4. HuffPost Canada* 19 | 5. National Post 20 | 6. The Globe And Mail 21 | 7. The Star 22 | 23 | > * HuffPost Canada stopped publishing articles in March 2021. As a result, our database only contains articles from this outlet until February 2021. 24 | 25 | #### French 26 | 1. Journal De Montreal 27 | 2. La Presse 28 | 3. Le Devoir 29 | 4. Le Droit 30 | 5. Radio Canada 31 | 6. TVA News 32 | 33 | Each outlet's news content is retrieved from their RSS feeds by running the required media collectors. Some examples of usage are shown below. 
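Internally, each collector follows roughly the same pattern: parse the outlet's RSS feed, download and parse each linked article with the customized `newspaper` fork, and upsert the result into MongoDB. The snippet below is an illustrative simplification, not the actual `mediaCollectors.py` implementation; the feed URL is a placeholder and the document field names are only indicative. It relies on the settings defined in `config.py` (described further below).

```python
# Simplified sketch of a single collector run (illustrative only)
import feedparser
from newspaper import Article
from pymongo import MongoClient

import config


def collect(outlet_name, feed_url):
    client = MongoClient(config.MONGODB_HOST, config.MONGODB_PORT, **config.MONGO_ARGS)
    collection = client[config.DBS_NAME][config.COLLECTION_NAME]
    feed = feedparser.parse(feed_url)
    for entry in feed.entries:
        article = Article(entry.link)
        article.download()
        article.parse()
        doc = {
            "url": entry.link,
            "outlet": outlet_name,
            "title": article.title,
            "authors": article.authors,
            "publishedAt": article.publish_date,
            "body": article.text,
        }
        # Upsert on the URL so that re-running the collector does not create duplicates
        collection.update_one({"url": entry.link}, {"$set": doc}, upsert=True)


if __name__ == "__main__":
    collect("CBC News", "https://example.com/rss/topstories")  # placeholder feed URL
```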
34 | 35 | ### Example of usage 36 | 37 | Run the `mediaCollectors.py` script with positional arguments pointing to the (case-sensitive) news outlet name as follows. 38 | 39 | ```sh 40 | python3 WomenInMedia/scraper/mediaCollectors.py "Huffington Post" 41 | python3 WomenInMedia/scraper/mediaCollectors.py "Journal De Montreal" 42 | ``` 43 | 44 | 45 | ### `config.py` parameters 46 | Set up the config settings accordingly to set up the database connection and write scraped articles. 47 | 48 | ```python 49 | # Production config 50 | MONGODB_HOST = ["mongo0", "mongo1", "mongo2"] 51 | MONGODB_PORT = 27017 52 | MONGO_ARGS = { 53 | "readPreference": "primary", 54 | "username": USERNAME, 55 | "password": PASSWORD, 56 | } 57 | DBS_NAME = 'mediaTracker' 58 | COLLECTION_NAME = 'media' 59 | COLLECTION_INVALID_NAME = 'mediaInvalid' 60 | LOGS_DIR = "logs/" 61 | EMAIL_SERVER = 'xxxx@smtp.gmail.com' 62 | EMAIL = "youremail@gmail.com" 63 | EMAIL_ACCOUNT = "" 64 | EMAIL_PASSWORD = "" 65 | EMAIL_DESTINATION = "" 66 | ``` 67 | -------------------------------------------------------------------------------- /scraper/config.py: -------------------------------------------------------------------------------- 1 | # Production config 2 | MONGODB_HOST = ["mongo0", "mongo1", "mongo2"] 3 | MONGODB_PORT = 27017 4 | MONGO_ARGS = { 5 | "readPreference": "primary", 6 | "username": "USERNAME", 7 | "password": "PASSWORD", 8 | } 9 | DBS_NAME = 'mediaTracker' 10 | COLLECTION_NAME = 'media' 11 | COLLECTION_INVALID_NAME = 'mediaInvalid' 12 | LOGS_DIR = "logs/" 13 | EMAIL_SERVER = 'xxxx@smtp.gmail.com' 14 | EMAIL = "youremail@gmail.com" 15 | EMAIL_ACCOUNT = "" 16 | EMAIL_PASSWORD = "" 17 | EMAIL_DESTINATION = "" 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /scraper/requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4>=4.4.1 2 | Pillow>=3.3.0 3 | PyYAML>=3.11 4 | cssselect>=0.9.2 5 | lxml>=3.6.0 6 | nltk>=3.2.1 7 | requests>=2.10.0 8 | feedparser>=5.2.1 9 | tldextract>=2.0.1 10 | feedfinder2>=0.0.4 11 | jieba3k>=0.35.1 12 | python-dateutil>=2.5.3 13 | tinysegmenter==0.3 # TODO(codelucas): Investigate making this >=0.3 -------------------------------------------------------------------------------- /scraper/util.py: -------------------------------------------------------------------------------- 1 | # This module aggregates different functions used for scrapping news RSS feeds websites 2 | # Date created: 2018/07/19 3 | import re 4 | import os 5 | import heapq 6 | import smtplib 7 | import logging as log 8 | from datetime import datetime 9 | from logging.handlers import TimedRotatingFileHandler, BufferingHandler 10 | 11 | 12 | def str_or_empty_str(tag): 13 | return "" if tag is None or tag.string is None else tag.string 14 | 15 | 16 | def text_or_empty_str(tag): 17 | return "" if tag is None or tag.text is None else tag.text 18 | 19 | 20 | def clean_text(text): 21 | # Regex to remove non printable chars 22 | return re.sub(r"[\x00-\x1F]+", " ", text).rstrip().strip().lower() 23 | 24 | 25 | def enable_debug_http(): 26 | try: 27 | import http.client as http_client 28 | except ImportError: 29 | # Python 2 30 | import httplib as http_client 31 | http_client.HTTPConnection.debuglevel = 1 32 | 33 | # DEBUG 34 | log.basicConfig() 35 | log.getLogger().setLevel(log.DEBUG) 36 | requests_log = log.getLogger("requests.packages.urllib3") 37 | requests_log.setLevel(log.DEBUG) 38 | requests_log.propagate = True 39 | 40 | 41 | 
def conv_str2date(strDate): 42 | 43 | strDate = ( 44 | strDate.replace("GMT", "") 45 | .replace("-0400", "") 46 | .replace("EDT", "") 47 | .replace("EST", "") 48 | .replace("+0000", "") 49 | .replace("-0300", "") 50 | .replace("-0700", "") 51 | .replace("-0600", "") 52 | .replace("-0500", "") 53 | .replace("-0001 ", "") 54 | .replace(".000", "") 55 | .strip() 56 | ) 57 | try: 58 | try: 59 | convDate = datetime.strptime(strDate, "%a, %d %b %Y %H:%M:%S") 60 | except ValueError: 61 | try: 62 | convDate = datetime.strptime(strDate, "%Y-%m-%d %H:%M:%S") 63 | except ValueError: 64 | convDate = datetime.strptime(strDate, "%d %b %Y %H:%M:%S") 65 | 66 | # log.info("Converted: %s", convDate) 67 | except Exception as ex: 68 | log.exception("Exception: %s", ex) 69 | convDate = datetime.utcnow() 70 | 71 | return convDate 72 | 73 | 74 | # Partially Extracted from: https://gist.github.com/anonymous/1379446 75 | class BufferingSMTPHandler(BufferingHandler): 76 | def __init__(self, mailhost, fromaddr, toaddrs, subject, capacity=1024 * 10, credentials=None): 77 | 78 | BufferingHandler.__init__(self, capacity) 79 | self.mailhost = mailhost 80 | self.mailport = None 81 | self.fromaddr = fromaddr 82 | self.toaddrs = toaddrs 83 | self.subject = subject 84 | self.credentials = credentials 85 | 86 | def flush(self): 87 | if len(self.buffer) > 0: 88 | try: 89 | smtp = smtplib.SMTP_SSL(self.mailhost, 465) 90 | smtp.ehlo() 91 | smtp.login(self.credentials[0], self.credentials[1]) 92 | body = "" 93 | for record in self.buffer: 94 | s = self.format(record) 95 | body += s + "\n" 96 | 97 | msg = "From: %s\nSubject: %s\n%s" % (self.fromaddr, self.subject, body) 98 | 99 | smtp.sendmail(self.fromaddr, self.toaddrs, msg.encode("utf-8")) 100 | smtp.quit() 101 | except: 102 | self.handleError(None) # no particular record 103 | self.buffer = [] 104 | 105 | def close(self): 106 | self.flush() 107 | 108 | 109 | def get_filename(filename): 110 | # Get logs directory 111 | log_directory = os.path.split(filename)[0] 112 | 113 | # Get file extension (also it's a suffix's value (i.e. 
".20181231")) without dot 114 | date = os.path.splitext(filename)[0] 115 | # date = os.path.splitext(tmp)[1][1:] 116 | 117 | # Create new file name 118 | filename = os.path.join(log_directory, date) 119 | 120 | # I don't want to add index if only one log file will exists for date 121 | if not os.path.exists("{}.log".format(filename)): 122 | return "{}.log".format(filename) 123 | 124 | # Create new file name with index 125 | index = 0 126 | f = "{}.{}.log".format(filename, index) 127 | while os.path.exists(f): 128 | index += 1 129 | f = "{}.{}.log".format(filename, index) 130 | return f 131 | 132 | 133 | class CustomTimedRotatingFileHandler(TimedRotatingFileHandler): 134 | def __init__( 135 | self, 136 | filename, 137 | when="S", 138 | interval=1, 139 | backupCount=20, 140 | encoding=None, 141 | delay=False, 142 | utc=False, 143 | atTime=None, 144 | ): 145 | TimedRotatingFileHandler.__init__( 146 | self, filename, when, interval, backupCount, encoding, delay, utc, atTime 147 | ) 148 | self.namer = get_filename 149 | 150 | def doRollover(self): 151 | 152 | TimedRotatingFileHandler.doRollover(self) 153 | 154 | if os.stat(self.baseFilename).st_size <= 0: 155 | os.remove(self.baseFilename) 156 | 157 | 158 | class PrioritySet(object): 159 | def __init__(self): 160 | self.heap = [] 161 | 162 | def add(self, d): 163 | heapq.heappush(self.heap, (d.priority, d)) 164 | 165 | def get(self): 166 | pri, d = heapq.heappop(self.heap) 167 | return d 168 | 169 | def __len__(self): 170 | return len(self.heap) 171 | -------------------------------------------------------------------------------- /statistics/config.py: -------------------------------------------------------------------------------- 1 | config = { 2 | 'MONGO_ARGS': { 3 | 'host': ['mongo0', 'mongo1', 'mongo2'], 4 | 'port': 27017, 5 | 'username': 'username', 6 | 'password': 'password', 7 | 'authSource': 'admin', 8 | 'readPreference': 'nearest', 9 | } 10 | } -------------------------------------------------------------------------------- /statistics/daily_pipeline/README.md: -------------------------------------------------------------------------------- 1 | # Daily aggregate statistics 2 | 3 | 4 | ## Daily article counts per outlet 5 | To keep track of whether our news article scrapers are performing, an additional app is added to [our research dashboard](https://gendergaptracker.research.sfu.ca/). We plot daily counts of articles for all news outlets in English and French over a given time period. To do this, we run a daily aggregator script that counts the number of sources and articles for each outlet each day, and write this to the `mediaDaily` collection on the DB. Following this, the charts on the dashboard query the data from the last 180 days, so that we can see if there is an abrupt decline in daily article counts per outlet over a sustained period -- this could be an indication that a particular scraper is out of date and that we need to more closely inspect its source code. 6 | 7 | #### Run the daily article/source aggregator script 8 | This script aggregates the number of articles and sources per gender, per outlet, and writes them to the `mediaDaily` collection in the database. By default, this runs over all articles published within the last 180 days (6 months). Alternately, a custom date range over which the daily counts need to be performed can be specified as follows. 
9 | 10 | ```sh 11 | cd daily_pipeline 12 | python3 media_daily.py --begin_date 2021-10-01 --end_date 2021-10-31 13 | ``` 14 | -------------------------------------------------------------------------------- /statistics/daily_pipeline/config.py: -------------------------------------------------------------------------------- 1 | config = { 2 | 'MONGO_ARGS': { 3 | 'host': ['mongo0', 'mongo1', 'mongo2'], 4 | 'port': 27017, 5 | 'username': 'username', 6 | 'password': 'password', 7 | 'authSource': 'admin', 8 | 'readPreference': 'nearest', 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /statistics/daily_pipeline/daily_article_counts.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script outputs daily counts of articles per outlet to a separate collection on the 3 | `mediaTracker database`. Following this, the daily counts can be plotted on a chart to 4 | track whether the scrapers are performing as intended. 5 | """ 6 | import argparse 7 | from datetime import timedelta, datetime 8 | from pymongo import MongoClient 9 | from config import config 10 | 11 | 12 | def get_connection(): 13 | _db_client = MongoClient(**MONGO_ARGS) 14 | return _db_client 15 | 16 | 17 | def format_date(date_str): 18 | dateFormat = '%Y-%m-%d' 19 | return datetime.strptime(date_str, dateFormat) 20 | 21 | 22 | def get_past_date_as_str(days_ago=1): 23 | today = datetime.today().date() - timedelta(days=days_ago) 24 | return today.strftime("%Y-%m-%d") 25 | 26 | 27 | def daily_article_counts(start_date, end_date): 28 | """ 29 | Returns the daily counts for articles published by each outlet between two specified dates 30 | """ 31 | query = [ 32 | { 33 | "$match": { 34 | "body": {"$ne": ""}, 35 | "outlet": {"$in": args["outlets"]}, 36 | "publishedAt": { 37 | "$gte": start_date, 38 | "$lt": end_date, 39 | }, 40 | } 41 | }, 42 | { 43 | "$project": { 44 | "publishedAt": { 45 | "$dateToString": {"format": "%Y-%m-%d", "date": "$publishedAt"} 46 | }, 47 | "outlet": 1.0, 48 | } 49 | }, 50 | { 51 | "$group": { 52 | "_id": {"publishedAt": "$publishedAt", "outlet": "$outlet"}, 53 | "totalArticles": {"$sum": 1.0}, 54 | } 55 | }, 56 | # Final projection: Extract the date (from string) and the outlet name, along with article counts 57 | { 58 | "$project": { 59 | "_id": 0.0, 60 | "publishedAt": { 61 | "$dateFromString": { 62 | "dateString": "$_id.publishedAt", 63 | "format": "%Y-%m-%d", 64 | } 65 | }, 66 | "outlet": "$_id.outlet", 67 | "totalArticles": 1.0, 68 | } 69 | }, 70 | ] 71 | return query 72 | 73 | 74 | def update_db(collection, payload): 75 | """ 76 | Insert aggregated stats of daily per-outlet article counts to the specified 77 | collection in the DB 78 | """ 79 | try: 80 | # Find and upsert stats based on the date string value and outlet name 81 | # To avoid duplicates, we match on BOTH the name of the outlet and the date string 82 | for item in payload: 83 | collection.update_one( 84 | { 85 | "$and": [ 86 | {"outlet": item["outlet"]}, 87 | {"publishedAt": item["publishedAt"]}, 88 | ] 89 | }, 90 | {"$set": {"totalArticles": item["totalArticles"]}}, 91 | upsert=True, 92 | ) 93 | except Exception as e: 94 | print(f"Error: {e}") 95 | 96 | 97 | def main(): 98 | """Run query and write the daily per-outlet article counts to the database.""" 99 | daily_counts = read_collection.aggregate(daily_article_counts(start_date, end_date)) 100 | # Write daily article counts per outlet to DB for the given date range 101 | 
update_db(write_collection, daily_counts) 102 | 103 | 104 | if __name__ == "__main__": 105 | parser = argparse.ArgumentParser() 106 | parser.add_argument("--db", type=str, default="mediaTracker", help="Database name") 107 | parser.add_argument("--readcol", type=str, default="media", help="Read collection name") 108 | parser.add_argument("--writecol", type=str, default="articleCountsDaily", help="Write collection name") 109 | parser.add_argument("--begin_date", type=str, default=get_past_date_as_str(days_ago=90), help="Start date in the format YYYY-MM-DD") 110 | parser.add_argument("--end_date", type=str, default=get_past_date_as_str(days_ago=1), help="End date in the format YYYY-MM-DD") 111 | parser.add_argument("--outlets", type=str, help="Comma-separated list of news outlets to consider in query scope") 112 | args = vars(parser.parse_args()) 113 | 114 | start_date = format_date(args["begin_date"]) 115 | end_date = format_date(args["end_date"]) + timedelta(days=1) 116 | 117 | # Import config settings 118 | MONGO_ARGS = config["MONGO_ARGS"] 119 | 120 | if not args["outlets"]: 121 | # Consider all English and French outlets by default 122 | args["outlets"] = [ 123 | "National Post", 124 | "The Globe And Mail", 125 | "The Star", 126 | "Huffington Post", 127 | "Global News", 128 | "CTV News", 129 | "CBC News", 130 | "Journal De Montreal", 131 | "La Presse", 132 | "Le Devoir", 133 | "Le Droit", 134 | "Radio Canada", 135 | "TVA News", 136 | ] 137 | else: 138 | # Format outlets as a list of strings 139 | args["outlets"] = args["outlets"].split(",") 140 | 141 | # Connect to database 142 | _client = get_connection() 143 | read_collection = _client[args["db"]][args["readcol"]] 144 | write_collection = _client[args["db"]][args["writecol"]] 145 | 146 | main() -------------------------------------------------------------------------------- /statistics/daily_pipeline/media_daily.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script is designed to be a replacement for the tools.py script that calculates 3 | daily article/source counts per outlet. The aim is to speed up the computation (the 4 | earlier version used vanilla Python) using native mongo objects and queries. 5 | 6 | By default, this script is run for all articles published within the last 3 months. Even 7 | though this is redundant, we feel this is necessary, because in some cases, the scrapers 8 | can populate the DB with new articles from a past date. This is why it makes sense to 9 | check up to 3 months back on a daily basis. 
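For reference, a typical invocation over a custom date range looks like the following (the dates
and outlet names here are purely illustrative):

    python3 media_daily.py --begin_date 2021-10-01 --end_date 2021-12-31 --outlets "CBC News,CTV News"

When run without arguments, the script covers roughly the last three months of articles for the
default list of English-language outlets and upserts the per-day counts into the `mediaDaily`
collection.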
10 | """ 11 | import argparse 12 | from datetime import timedelta, datetime 13 | from pymongo import MongoClient 14 | from config import config 15 | 16 | 17 | def get_connection(): 18 | _db_client = MongoClient(**MONGO_ARGS) 19 | return _db_client 20 | 21 | 22 | def format_date(date_str): 23 | dateFormat = '%Y-%m-%d' 24 | return datetime.strptime(date_str, dateFormat) 25 | 26 | 27 | def get_past_date_as_str(days_ago=1): 28 | today = datetime.today().date() - timedelta(days=days_ago) 29 | return today.strftime("%Y-%m-%d") 30 | 31 | 32 | def daily_article_counts(start_date, end_date): 33 | """ 34 | Returns the daily counts for articles and sources by gender, as published by each 35 | outlet between two specified dates 36 | """ 37 | query = [ 38 | { 39 | "$match": { 40 | "body": {"$ne": ""}, 41 | "outlet": {"$in": args["outlets"]}, 42 | "publishedAt": { 43 | "$gte": start_date, 44 | "$lt": end_date, 45 | }, 46 | } 47 | }, 48 | { 49 | "$project": { 50 | "publishedAt": { 51 | "$dateToString": {"format": "%Y-%m-%d", "date": "$publishedAt"} 52 | }, 53 | "outlet": 1.0, 54 | "sourcesFemaleCount": 1.0, 55 | "sourcesMaleCount": 1.0, 56 | "sourcesUnknownCount": 1.0, 57 | } 58 | }, 59 | { 60 | "$group": { 61 | "_id": { 62 | "publishedAt": "$publishedAt", 63 | "outlet": "$outlet", 64 | }, 65 | "totalArticles": {"$sum": 1.0}, 66 | "totalFemales": {"$sum": "$sourcesFemaleCount"}, 67 | "totalMales": {"$sum": "$sourcesMaleCount"}, 68 | "totalUnknowns": {"$sum": "$sourcesUnknownCount"}, 69 | } 70 | }, 71 | # Final projection: Extract the date (from string) and the outlet name, along with article counts 72 | { 73 | "$project": { 74 | "_id": 0.0, 75 | "publishedAt": { 76 | "$dateFromString": { 77 | "dateString": "$_id.publishedAt", 78 | "format": "%Y-%m-%d", 79 | } 80 | }, 81 | "outlet": "$_id.outlet", 82 | "totalArticles": 1.0, 83 | "totalFemales": 1.0, 84 | "totalMales": 1.0, 85 | "totalUnknowns": 1.0, 86 | } 87 | }, 88 | ] 89 | return query 90 | 91 | 92 | def update_db(collection, payload): 93 | """ 94 | Insert aggregated stats of daily per-outlet article and source counts to the 95 | specified collection in the DB 96 | """ 97 | try: 98 | # Find and upsert stats based on the date string value and outlet name 99 | # To avoid duplicates, we match on BOTH the name of the outlet and the date string 100 | for item in payload: 101 | collection.update_one( 102 | { 103 | "$and": [ 104 | {"outlet": item["outlet"]}, 105 | {"publishedAt": item["publishedAt"]}, 106 | ] 107 | }, 108 | { 109 | "$set": { 110 | "totalArticles": item["totalArticles"], 111 | "totalFemales": item["totalFemales"], 112 | "totalMales": item["totalMales"], 113 | "totalUnknowns": item["totalUnknowns"], 114 | } 115 | }, 116 | upsert=True, 117 | ) 118 | except Exception as e: 119 | print(f"Error: {e}") 120 | 121 | 122 | def main(): 123 | """Run query and write the daily per-outlet article counts to the database.""" 124 | daily_counts = read_collection.aggregate(daily_article_counts(start_date, end_date)) 125 | # Write daily article counts per outlet to DB for the given date range 126 | update_db(write_collection, daily_counts) 127 | 128 | 129 | if __name__ == "__main__": 130 | parser = argparse.ArgumentParser() 131 | parser.add_argument("--db", type=str, default="mediaTracker", help="Database name") 132 | parser.add_argument("--readcol", type=str, default="media", help="Read collection name") 133 | parser.add_argument("--writecol", type=str, default="mediaDaily", help="Write collection name") 134 | parser.add_argument("--begin_date", type=str, 
default=get_past_date_as_str(days_ago=90), help="Start date in the string format YYYY-MM-DD") 135 | parser.add_argument("--end_date", type=str, default=get_past_date_as_str(days_ago=3), help="End date in the string format YYYY-MM-DD") 136 | parser.add_argument("--outlets", type=str, help="Comma-separated list of news outlets to consider in query scope") 137 | args = vars(parser.parse_args()) 138 | 139 | start_date = format_date(args["begin_date"]) 140 | end_date = format_date(args["end_date"]) + timedelta(days=1) 141 | 142 | # Import config settings 143 | MONGO_ARGS = config["MONGO_ARGS"] 144 | 145 | if not args["outlets"]: 146 | # English outlets 147 | args["outlets"] = [ 148 | "National Post", 149 | "The Globe And Mail", 150 | "The Star", 151 | "Huffington Post", 152 | "Global News", 153 | "CTV News", 154 | "CBC News", 155 | ] 156 | else: 157 | # Format outlets as a list of strings 158 | args["outlets"] = args["outlets"].split(",") 159 | 160 | # Connect to database 161 | _client = get_connection() 162 | read_collection = _client[args["db"]][args["readcol"]] 163 | write_collection = _client[args["db"]][args["writecol"]] 164 | 165 | main() -------------------------------------------------------------------------------- /statistics/monthly_pipeline/README.md: -------------------------------------------------------------------------------- 1 | # Monthly aggregate statistics 2 | 3 | For [our research dashboard](https://gendergaptracker.research.sfu.ca/), we aggregate our results on a monthly basis. This is primarily for us to study trends in our topic models each month, as well as to analyze the top quoted men and women over time. 4 | 5 | Calculate the top 50 quoted men and women for a particular month by specifying the month and year as follows: 6 | 7 | ```sh 8 | cd monthly_aggregate 9 | # Calculate top 50 male & female sources for all outlets for November and December 2020 10 | python3 monthly_top_sources.py --year 2020 --month 11 11 | python3 monthly_top_sources.py --year 2020 --month 12 12 | ``` 13 | 14 | Similarly, we can calculate the top 50 quoted men and women each month to study the top quoted people's quote counts as a time series. We limit the calculation to just the top 50 for querying-efficiency reasons (otherwise the time series lookup can become inefficient). Each month's calculation is run one at a time, sequentially, as follows. 
15 | 16 | ```sh 17 | cd monthly_aggregate 18 | # Calculate the quote counts for each of the top 50 male & female sources for all outlets for April, May and June 2020 19 | python3 monthly_top_sources_timeseries.py --year 2020 --month 4 20 | python3 monthly_top_sources_timeseries.py --year 2020 --month 5 21 | python3 monthly_top_sources_timeseries.py --year 2020 --month 6 22 | ``` 23 | -------------------------------------------------------------------------------- /statistics/monthly_pipeline/config.py: -------------------------------------------------------------------------------- 1 | config = { 2 | 'MONGO_ARGS': { 3 | 'host': ['mongo0', 'mongo1', 'mongo2'], 4 | 'port': 27017, 5 | 'username': 'username', 6 | 'password': 'password', 7 | 'authSource': 'admin', 8 | 'readPreference': 'nearest', 9 | } 10 | } -------------------------------------------------------------------------------- /statistics/requirements.txt: -------------------------------------------------------------------------------- 1 | requests>=2.27.1 2 | pymongo>=3.10.0,<4.0.0 3 | pandas>=1.1.5 4 | 5 | -------------------------------------------------------------------------------- /statistics/run.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import time 4 | import os 5 | import sys 6 | import pandas as pd 7 | from datetime import datetime 8 | from logging.handlers import TimedRotatingFileHandler 9 | from pymongo import MongoClient 10 | # config 11 | from config import config 12 | # User-created queries 13 | import queries 14 | 15 | 16 | def get_connection(): 17 | connection = MongoClient(**MONGO_ARGS) 18 | return connection 19 | 20 | 21 | def format_date(date_str): 22 | dateFormat = '%Y-%m-%d' 23 | return datetime.strptime(date_str, dateFormat) 24 | 25 | 26 | def create_app_logger(filename): 27 | """Logger format and timed handling""" 28 | logger = logging.getLogger(filename) 29 | logger.setLevel(logging.DEBUG) 30 | formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') 31 | rotateHandler = TimedRotatingFileHandler(os.path.join("logs", "g-statistics.log"), 32 | when="midnight") 33 | rotateHandler.setFormatter(formatter) 34 | stream = logging.StreamHandler(sys.stdout) 35 | stream.setFormatter(formatter) 36 | 37 | logger.addHandler(rotateHandler) 38 | logger.addHandler(stream) 39 | return logger 40 | 41 | 42 | def run_aggregation_queries(): 43 | """Collect aggregation query methods from queries.py and run them.""" 44 | query_list = [] 45 | for method_name in args.keys(): 46 | requested = args[method_name] 47 | if requested and isinstance(requested, bool): 48 | # Only those args supplied as boolean flags will run as queries 49 | # getattr(foo, 'bar') equals foo.bar 50 | query_list.append(getattr(queries, method_name)) 51 | 52 | # Run multiple aggregation queries between specified start/end dates 53 | for query in query_list: 54 | logger.info(f"Query: '{query.__name__}', date range: ({start_date}, {end_date})") 55 | start_time = time.time() 56 | result = collection.aggregate(query(args)) 57 | 58 | # Export CSV 59 | filename = f"{query.__name__}_{start_date}_to_{end_date}.csv" 60 | df = pd.DataFrame.from_dict(result) 61 | df.to_csv(filename, index=False) 62 | 63 | logger.info(f"{query.__name__} query completed in {time.time() - start_time:.3f} seconds.") 64 | 65 | 66 | if __name__ == "__main__": 67 | parser = argparse.ArgumentParser() 68 | parser.add_argument('--db', type=str, default='mediaTracker', help="Database name") 69 | 
parser.add_argument('--col', type=str, default='media', help="Read collection name") 70 | parser.add_argument("--begin_date", type=str, default="2020-04-29", help="Start date in the format YYYY-MM-DD") 71 | parser.add_argument("--end_date", type=str, default="2020-04-30", help="End date in the format YYYY-MM-DD") 72 | parser.add_argument("--outlets", type=str, help="Comma-separated list of news outlets to consider in query scope") 73 | parser.add_argument("--limit", type=int, default=100, help="Number of results to limit to") 74 | parser.add_argument("--sort", type=str, default='desc', help="Sort results in ascending or descending order") 75 | # Query name args (specified as booleans) 76 | parser.add_argument("--db_stats", action='store_true', help="Run query to calculate overall gender stats (sources, people, authors)") 77 | parser.add_argument("--outlet_stats", action='store_true', help="Run query to calculate gender stats (sources, people, authors) per outlet") 78 | parser.add_argument("--top_sources_female", action='store_true', help="Run query to calculate top N female sources") 79 | parser.add_argument("--top_sources_male", action='store_true', help="Run query to calculate top N male sources") 80 | parser.add_argument("--top_sources_unknown", action='store_true', help="Run query to calculate top N unknown sources") 81 | parser.add_argument("--top_sources_all", action='store_true', help="Run query to calculate top N overall sources (male or female)") 82 | parser.add_argument("--female_author_sources", action='store_true', help="Run query to cross-tabulate female author sources vs. source gender counts") 83 | parser.add_argument("--male_author_sources", action='store_true', help="Run query to cross-tabulate male author sources vs. source gender counts") 84 | parser.add_argument("--mixed_author_sources", action='store_true', help="Run query to cross-tabulate both gender (male & female) author sources vs. source gender counts") 85 | parser.add_argument("--unknown_author_sources", action='store_true', help="Run query to cross-tabulate unknown author sources vs. source gender counts") 86 | parser.add_argument("--daily_article_counts", action='store_true', help="Run query to get a tally of daily article counts") 87 | args = vars(parser.parse_args()) 88 | 89 | # Import config settings 90 | MONGO_ARGS = config['MONGO_ARGS'] 91 | 92 | if not args['outlets']: 93 | # Consider all seven English-language outlets by default 94 | args['outlets'] = [ 95 | 'National Post', 'The Globe And Mail', 'The Star', 96 | 'Huffington Post', 'Global News', 'CTV News', 'CBC News' 97 | ] 98 | else: 99 | # Format outlets as a list of strings 100 | args['outlets'] = args['outlets'].split(",") 101 | 102 | # Convert sort value to float for pymongo (1.0 is ascending, -1.0 is descending) 103 | args['sort'] = 1.0 if args['sort'] == 'asc' else -1.0 104 | 105 | # Store dates as strings for file naming 106 | start_date = args['begin_date'] 107 | end_date = args['end_date'] 108 | # Format dates as datetime objects for pymongo 109 | args['begin_date'] = format_date(args['begin_date']) 110 | args['end_date'] = format_date(args['end_date']) 111 | 112 | # Create logs 113 | os.makedirs("logs", exist_ok=True) 114 | logger = create_app_logger('statisticsLog') 115 | 116 | # Connect to database 117 | connection = get_connection() 118 | collection = connection[args['db']][args['col']] 119 | 120 | run_aggregation_queries() 121 | 122 | --------------------------------------------------------------------------------
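For reference, each boolean flag passed to `statistics/run.py` selects the query function of the same name from `queries.py` (not shown here), runs it over the chosen date range and outlets, and exports the result to a CSV named `<query>_<begin_date>_to_<end_date>.csv`. An illustrative invocation, with arbitrary dates and limit:

```sh
cd statistics
python3 run.py --begin_date 2021-01-01 --end_date 2021-01-31 --top_sources_female --top_sources_male --limit 50
# writes top_sources_female_2021-01-01_to_2021-01-31.csv and top_sources_male_2021-01-01_to_2021-01-31.csv
```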