├── .gitignore ├── LICENSE.TXT ├── README.md ├── job_pipeline ├── __init__.py ├── __main__.py ├── lib │ ├── __init__.py │ ├── cc.py │ ├── extractlib.py │ ├── io.py │ ├── nlp.py │ ├── normalise.py │ ├── rdftool.py │ └── salary.py ├── postprocess.py └── sources │ ├── __init__.py │ ├── abstract_datasource.py │ ├── careers_vic.py │ ├── cgcrecruitment.py │ ├── commoncrawl_datasource.py │ ├── csiro.py │ ├── davidsonwp.py │ ├── engineeringjobs.py │ ├── ethicaljobs.py │ ├── gumtree.py │ ├── iworkfornsw.py │ ├── jsonld.py │ ├── kaggle_datascienceau_201910.py │ ├── kaggle_datasource.py │ ├── kaggle_promptcloud_gumtree.py │ ├── kaggle_promptcloud_latest.py │ ├── kaggle_promptcloud_listings.py │ ├── launchrecruitment.py │ ├── microdata.py │ ├── probono.py │ └── seek.py ├── notebooks ├── Analysing Salary Extracted From CommonCrawl Job Data.ipynb ├── Converting HTML to Text.ipynb ├── Extracting Australian Job Ads from Web Data Commons with SPARQL.ipynb ├── Extracting Role Title Words.ipynb ├── Extracting Role Titles and Analysing with Salary.ipynb ├── JobPosting RDF - 2019 Web Data Commons Schema Analysis.ipynb └── JobPosting SPARQL - 2019 Web Data Commons Analysis.ipynb ├── requirements-dev.txt ├── requirements.txt ├── scripts ├── format.sh ├── lint.sh ├── run └── test.sh ├── tests ├── __init__.py └── test_salary.py └── typestubs ├── bs4 ├── __init__.pyi └── element.pyi ├── datasketch ├── __init__.pyi ├── hashfunc.pyi ├── lean_minhash.pyi ├── lsh ├── lsh.pyi └── minhash.pyi ├── demjson.pyi ├── extruct └── __init__.pyi ├── kaggle ├── __init__.pyi └── api │ ├── __init__.pyi │ └── kaggle_api_extended.pyi ├── mistletoe └── __init__.pyi ├── rdflib ├── __init__.pyi ├── graph.pyi ├── namespace.pyi └── term.pyi ├── tqdm.pyi ├── warcio ├── __init__.pyi ├── archiveiterator.pyi ├── bufferedreaders.pyi ├── limitreader.pyi ├── recordloader.pyi └── warcwriter.pyi └── xxhash └── __init__.pyi /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/.gitignore -------------------------------------------------------------------------------- /LICENSE.TXT: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/LICENSE.TXT -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/README.md -------------------------------------------------------------------------------- /job_pipeline/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /job_pipeline/__main__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/job_pipeline/__main__.py -------------------------------------------------------------------------------- /job_pipeline/lib/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /job_pipeline/lib/cc.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/job_pipeline/lib/cc.py -------------------------------------------------------------------------------- /job_pipeline/lib/extractlib.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/job_pipeline/lib/extractlib.py -------------------------------------------------------------------------------- /job_pipeline/lib/io.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/job_pipeline/lib/io.py -------------------------------------------------------------------------------- /job_pipeline/lib/nlp.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/job_pipeline/lib/nlp.py -------------------------------------------------------------------------------- /job_pipeline/lib/normalise.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/job_pipeline/lib/normalise.py -------------------------------------------------------------------------------- /job_pipeline/lib/rdftool.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/job_pipeline/lib/rdftool.py -------------------------------------------------------------------------------- /job_pipeline/lib/salary.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/job_pipeline/lib/salary.py -------------------------------------------------------------------------------- /job_pipeline/postprocess.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/job_pipeline/postprocess.py -------------------------------------------------------------------------------- /job_pipeline/sources/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /job_pipeline/sources/abstract_datasource.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/job_pipeline/sources/abstract_datasource.py -------------------------------------------------------------------------------- /job_pipeline/sources/careers_vic.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/job_pipeline/sources/careers_vic.py -------------------------------------------------------------------------------- /job_pipeline/sources/cgcrecruitment.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/job_pipeline/sources/cgcrecruitment.py -------------------------------------------------------------------------------- /job_pipeline/sources/commoncrawl_datasource.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/job_pipeline/sources/commoncrawl_datasource.py -------------------------------------------------------------------------------- /job_pipeline/sources/csiro.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/job_pipeline/sources/csiro.py -------------------------------------------------------------------------------- /job_pipeline/sources/davidsonwp.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/job_pipeline/sources/davidsonwp.py -------------------------------------------------------------------------------- /job_pipeline/sources/engineeringjobs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/job_pipeline/sources/engineeringjobs.py -------------------------------------------------------------------------------- /job_pipeline/sources/ethicaljobs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/job_pipeline/sources/ethicaljobs.py -------------------------------------------------------------------------------- /job_pipeline/sources/gumtree.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/job_pipeline/sources/gumtree.py -------------------------------------------------------------------------------- /job_pipeline/sources/iworkfornsw.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/job_pipeline/sources/iworkfornsw.py -------------------------------------------------------------------------------- /job_pipeline/sources/jsonld.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/job_pipeline/sources/jsonld.py -------------------------------------------------------------------------------- /job_pipeline/sources/kaggle_datascienceau_201910.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/job_pipeline/sources/kaggle_datascienceau_201910.py -------------------------------------------------------------------------------- /job_pipeline/sources/kaggle_datasource.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/job_pipeline/sources/kaggle_datasource.py -------------------------------------------------------------------------------- /job_pipeline/sources/kaggle_promptcloud_gumtree.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/job_pipeline/sources/kaggle_promptcloud_gumtree.py -------------------------------------------------------------------------------- /job_pipeline/sources/kaggle_promptcloud_latest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/job_pipeline/sources/kaggle_promptcloud_latest.py -------------------------------------------------------------------------------- /job_pipeline/sources/kaggle_promptcloud_listings.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/job_pipeline/sources/kaggle_promptcloud_listings.py -------------------------------------------------------------------------------- /job_pipeline/sources/launchrecruitment.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/job_pipeline/sources/launchrecruitment.py -------------------------------------------------------------------------------- /job_pipeline/sources/microdata.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/job_pipeline/sources/microdata.py -------------------------------------------------------------------------------- /job_pipeline/sources/probono.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/job_pipeline/sources/probono.py -------------------------------------------------------------------------------- /job_pipeline/sources/seek.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/job_pipeline/sources/seek.py -------------------------------------------------------------------------------- /notebooks/Analysing Salary Extracted From CommonCrawl Job Data.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/notebooks/Analysing Salary Extracted From CommonCrawl Job Data.ipynb -------------------------------------------------------------------------------- /notebooks/Converting HTML to Text.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/notebooks/Converting HTML to Text.ipynb -------------------------------------------------------------------------------- /notebooks/Extracting Australian Job Ads from Web Data Commons with SPARQL.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/notebooks/Extracting Australian Job Ads from Web Data Commons with SPARQL.ipynb -------------------------------------------------------------------------------- /notebooks/Extracting Role Title Words.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/notebooks/Extracting Role Title Words.ipynb -------------------------------------------------------------------------------- /notebooks/Extracting Role Titles and Analysing with Salary.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/notebooks/Extracting Role Titles and Analysing with Salary.ipynb -------------------------------------------------------------------------------- /notebooks/JobPosting RDF - 2019 Web Data Commons Schema Analysis.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/notebooks/JobPosting RDF - 2019 Web Data Commons Schema Analysis.ipynb -------------------------------------------------------------------------------- /notebooks/JobPosting SPARQL - 2019 Web Data Commons Analysis.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/notebooks/JobPosting SPARQL - 2019 Web Data Commons Analysis.ipynb -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/requirements-dev.txt -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/requirements.txt -------------------------------------------------------------------------------- /scripts/format.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/scripts/format.sh -------------------------------------------------------------------------------- /scripts/lint.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/scripts/lint.sh -------------------------------------------------------------------------------- /scripts/run: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/scripts/run -------------------------------------------------------------------------------- /scripts/test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | pytest -s job_pipeline/tests/ 3 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_salary.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/tests/test_salary.py -------------------------------------------------------------------------------- /typestubs/bs4/__init__.pyi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/typestubs/bs4/__init__.pyi -------------------------------------------------------------------------------- /typestubs/bs4/element.pyi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/typestubs/bs4/element.pyi -------------------------------------------------------------------------------- /typestubs/datasketch/__init__.pyi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/typestubs/datasketch/__init__.pyi -------------------------------------------------------------------------------- /typestubs/datasketch/hashfunc.pyi: -------------------------------------------------------------------------------- 1 | def sha1_hash32(data: bytes) -> int: ... 2 | -------------------------------------------------------------------------------- /typestubs/datasketch/lean_minhash.pyi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/typestubs/datasketch/lean_minhash.pyi -------------------------------------------------------------------------------- /typestubs/datasketch/lsh: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /typestubs/datasketch/lsh.pyi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/typestubs/datasketch/lsh.pyi -------------------------------------------------------------------------------- /typestubs/datasketch/minhash.pyi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/typestubs/datasketch/minhash.pyi -------------------------------------------------------------------------------- /typestubs/demjson.pyi: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | def decode(text: str) -> Any: ... 4 | -------------------------------------------------------------------------------- /typestubs/extruct/__init__.pyi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/typestubs/extruct/__init__.pyi -------------------------------------------------------------------------------- /typestubs/kaggle/__init__.pyi: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /typestubs/kaggle/api/__init__.pyi: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /typestubs/kaggle/api/kaggle_api_extended.pyi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/typestubs/kaggle/api/kaggle_api_extended.pyi -------------------------------------------------------------------------------- /typestubs/mistletoe/__init__.pyi: -------------------------------------------------------------------------------- 1 | def markdown(iterable: str, renderer) -> str: ... 2 | -------------------------------------------------------------------------------- /typestubs/rdflib/__init__.pyi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/typestubs/rdflib/__init__.pyi -------------------------------------------------------------------------------- /typestubs/rdflib/graph.pyi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/typestubs/rdflib/graph.pyi -------------------------------------------------------------------------------- /typestubs/rdflib/namespace.pyi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/typestubs/rdflib/namespace.pyi -------------------------------------------------------------------------------- /typestubs/rdflib/term.pyi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/typestubs/rdflib/term.pyi -------------------------------------------------------------------------------- /typestubs/tqdm.pyi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/typestubs/tqdm.pyi -------------------------------------------------------------------------------- /typestubs/warcio/__init__.pyi: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /typestubs/warcio/archiveiterator.pyi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/typestubs/warcio/archiveiterator.pyi -------------------------------------------------------------------------------- /typestubs/warcio/bufferedreaders.pyi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/typestubs/warcio/bufferedreaders.pyi -------------------------------------------------------------------------------- /typestubs/warcio/limitreader.pyi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/typestubs/warcio/limitreader.pyi -------------------------------------------------------------------------------- /typestubs/warcio/recordloader.pyi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/typestubs/warcio/recordloader.pyi -------------------------------------------------------------------------------- /typestubs/warcio/warcwriter.pyi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/typestubs/warcio/warcwriter.pyi -------------------------------------------------------------------------------- /typestubs/xxhash/__init__.pyi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdwardJRoss/job-advert-analysis/HEAD/typestubs/xxhash/__init__.pyi --------------------------------------------------------------------------------