├── .gitignore ├── scripts ├── analysis │ ├── __init__.py │ ├── constant_n_files.py │ ├── constant_n_incorrect_prop.py │ ├── constant_fail_percentage.py │ ├── constant_prop_potential_dialect.py │ ├── constant_n_dialect.py │ ├── constant_improve_sniffer.py │ ├── potential_dialects.py │ ├── constant_failure_messy.py │ ├── constant_improve_sniffer_messy.py │ ├── table_accuracy.py │ ├── constant_failure.py │ ├── constant_accuracy_overall.py │ ├── constant_known_type.py │ ├── figure_box_plot.py │ ├── table_std_messy.py │ ├── figure_bar_plot.py │ ├── table_parse_result.py │ ├── core.py │ ├── figure_fail.py │ ├── latex.py │ ├── show_failures.py │ ├── figure_violins.py │ └── make_summary.py ├── detection │ ├── __init__.py │ ├── lib │ │ ├── __init__.py │ │ └── types │ │ │ ├── __init__.py │ │ │ ├── README.md │ │ │ └── rudi_types.py │ ├── our_score_type_only.py │ ├── our_score_pattern_only.py │ ├── our_score_full.py │ ├── our_score_full_no_tie.py │ ├── sniffer.py │ ├── core.py │ ├── suitability.py │ ├── _ties.py │ ├── hypo.R │ ├── our_score_base.py │ └── human.py ├── preprocessing │ ├── __init__.py │ ├── merge.py │ ├── filter_non_normal.py │ └── extract_normals.py ├── run_human.py ├── analysis_summarise.py ├── analysis_explore_failures.py ├── analysis_potential_dialects.py ├── merge_human_normal.py ├── run_extract_normal.py ├── run_normal_detection.py ├── common │ ├── utils.py │ ├── escape.py │ ├── encoding.py │ ├── load.py │ ├── dialect.py │ ├── detector_result.py │ └── parser.py ├── README.md ├── run_detector.py ├── analysis_results.py ├── run_hypoparsr.sh ├── analysis_constants.py └── data_download.py ├── data └── .gitignore ├── results ├── test │ ├── analysis │ │ ├── constants │ │ │ ├── NumDialectTotal.tex │ │ │ ├── NumDialect_github.tex │ │ │ ├── NumDialect_ukdata.tex │ │ │ ├── NumFiles_github.tex │ │ │ ├── NumFiles_ukdata.tex │ │ │ ├── PropKnownType.tex │ │ │ ├── AccuracyOverallOurs.tex │ │ │ ├── FactorPotentialDialects.tex │ │ │ ├── PropFailHypoTimeout.tex │ │ │ ├── FailureRateOursMessyAll.tex │ │ │ ├── FailureRateSnifferMessyAll.tex │ │ │ ├── ImprovementOverSniffer.tex │ │ │ ├── PropFailHypoNoResults.tex │ │ │ ├── PropFailOurFull_github.tex │ │ │ ├── PropFailOurFull_ukdata.tex │ │ │ ├── PropFailSnifferNoResults.tex │ │ │ ├── PropFailSnifferTimeout.tex │ │ │ ├── ImprovementOverSnifferMessy.tex │ │ │ └── ImprovementOverSnifferMessyCeil.tex │ │ ├── figures │ │ │ └── violin_combined.pdf │ │ └── tables │ │ │ ├── parse_result_github.tex │ │ │ ├── parse_result_ukdata.tex │ │ │ ├── standard_and_messy_github.tex │ │ │ ├── standard_and_messy_ukdata.tex │ │ │ ├── accuracy_all_github.tex │ │ │ ├── accuracy_all_ukdata.tex │ │ │ ├── accuracy_human_github.tex │ │ │ ├── accuracy_human_ukdata.tex │ │ │ ├── accuracy_normal_github.tex │ │ │ └── accuracy_normal_ukdata.tex │ └── README.md └── dev │ └── README.md ├── Rpackages.txt ├── requirements.txt ├── .gitmodules ├── .travis.yml ├── utils └── install_R_packages.sh ├── LICENSE ├── design └── result.md ├── Dockerfile └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | *__pycache__/ 2 | -------------------------------------------------------------------------------- /scripts/analysis/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scripts/detection/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /scripts/detection/lib/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scripts/preprocessing/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /scripts/detection/lib/types/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /results/test/analysis/constants/NumDialectTotal.tex: -------------------------------------------------------------------------------- 1 | 34% -------------------------------------------------------------------------------- /results/test/analysis/constants/NumDialect_github.tex: -------------------------------------------------------------------------------- 1 | 33% -------------------------------------------------------------------------------- /results/test/analysis/constants/NumDialect_ukdata.tex: -------------------------------------------------------------------------------- 1 | 8% -------------------------------------------------------------------------------- /results/test/analysis/constants/NumFiles_github.tex: -------------------------------------------------------------------------------- 1 | 4386% -------------------------------------------------------------------------------- /results/test/analysis/constants/NumFiles_ukdata.tex: -------------------------------------------------------------------------------- 1 | 4969% -------------------------------------------------------------------------------- /results/test/analysis/constants/PropKnownType.tex: -------------------------------------------------------------------------------- 1 | 91.6\%% -------------------------------------------------------------------------------- /results/test/analysis/constants/AccuracyOverallOurs.tex: -------------------------------------------------------------------------------- 1 | 97\%% -------------------------------------------------------------------------------- /results/test/analysis/constants/FactorPotentialDialects.tex: -------------------------------------------------------------------------------- 1 | 0.2% -------------------------------------------------------------------------------- /results/test/analysis/constants/PropFailHypoTimeout.tex: -------------------------------------------------------------------------------- 1 | 38.1\%% -------------------------------------------------------------------------------- /results/test/analysis/constants/FailureRateOursMessyAll.tex: -------------------------------------------------------------------------------- 1 | 14\%% -------------------------------------------------------------------------------- /results/test/analysis/constants/FailureRateSnifferMessyAll.tex: -------------------------------------------------------------------------------- 1 | 36\%% -------------------------------------------------------------------------------- /results/test/analysis/constants/ImprovementOverSniffer.tex: 
-------------------------------------------------------------------------------- 1 | 8.6\%% -------------------------------------------------------------------------------- /results/test/analysis/constants/PropFailHypoNoResults.tex: -------------------------------------------------------------------------------- 1 | 61.4\%% -------------------------------------------------------------------------------- /results/test/analysis/constants/PropFailOurFull_github.tex: -------------------------------------------------------------------------------- 1 | 0.30\%% -------------------------------------------------------------------------------- /results/test/analysis/constants/PropFailOurFull_ukdata.tex: -------------------------------------------------------------------------------- 1 | 0.00\%% -------------------------------------------------------------------------------- /results/test/analysis/constants/PropFailSnifferNoResults.tex: -------------------------------------------------------------------------------- 1 | 75.8\%% -------------------------------------------------------------------------------- /results/test/analysis/constants/PropFailSnifferTimeout.tex: -------------------------------------------------------------------------------- 1 | 24.2\%% -------------------------------------------------------------------------------- /results/test/analysis/constants/ImprovementOverSnifferMessy.tex: -------------------------------------------------------------------------------- 1 | 21.4\%% -------------------------------------------------------------------------------- /results/test/analysis/constants/ImprovementOverSnifferMessyCeil.tex: -------------------------------------------------------------------------------- 1 | 22\%% -------------------------------------------------------------------------------- /Rpackages.txt: -------------------------------------------------------------------------------- 1 | devtools 2 | rjson 3 | data.tree 4 | RecordLinkage 5 | readr 6 | tibble 7 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | chardet 2 | libtmux 3 | matplotlib 4 | numpy 5 | pandas 6 | regex 7 | requests 8 | scipy 9 | sklearn 10 | tabulate 11 | tqdm 12 | dominate 13 | -------------------------------------------------------------------------------- /results/test/analysis/figures/violin_combined.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/CSV_Wrangling/HEAD/results/test/analysis/figures/violin_combined.pdf -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "scripts/detection/lib/hypoparsr"] 2 | path = scripts/detection/lib/hypoparsr 3 | url = https://github.com/GjjvdBurg/hypoparsr 4 | branch = turing 5 | -------------------------------------------------------------------------------- /results/dev/README.md: -------------------------------------------------------------------------------- 1 | # Results 2 | 3 | These are the dialect annotations for the files that were used during 4 | development of the dialect detection algorithm. See 5 | ``out_reference_.json`` for the ground-truth annotations for all 6 | files. 
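(Aside: for concreteness, a minimal sketch of what one ground-truth annotation record could look like, assuming the JSON Lines layout documented in ``design/result.md`` below; the exact serialization is defined by ``DetectorResult`` in ``scripts/common/detector_result.py``, and the filename hash, hostname, and runtime here are illustrative placeholders, not values from the repository.)

    import json

    # Hypothetical reference record; field names follow design/result.md,
    # the concrete values are made up for illustration only.
    record = {
        "filename": "/path/to/data/0123456789abcdef.csv",  # placeholder md5-style name
        "detector": "reference",
        "status": "ok",
        "reason": None,
        "hostname": "example-host",  # placeholder
        "runtime": 0.42,             # placeholder, in seconds
        "dialect": {
            "delimiter": ",",
            "quotechar": "\"",
            "escapechar": "",        # empty string means no escape character
        },
    }
    print(json.dumps(record))  # one JSON object per line (JSON Lines)
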
7 | -------------------------------------------------------------------------------- /scripts/run_human.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Wrapper for human annotation. 6 | 7 | Author: Gertjan van den Burg 8 | Copyright (c) 2018 - The Alan Turing Institute 9 | License: See the LICENSE file. 10 | 11 | """ 12 | 13 | from detection import human 14 | 15 | if __name__ == '__main__': 16 | human.main() 17 | -------------------------------------------------------------------------------- /scripts/analysis_summarise.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Wrapper around ``make_summary`` script. 6 | 7 | Author: Gertjan van den Burg 8 | Copyright (c) 2018 - The Alan Turing Institute 9 | License: See the LICENSE file. 10 | 11 | """ 12 | 13 | from analysis import make_summary 14 | 15 | if __name__ == '__main__': 16 | make_summary.main() 17 | -------------------------------------------------------------------------------- /scripts/analysis_explore_failures.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Wrapper around ``show_failures``. 6 | 7 | Author: Gertjan van den Burg 8 | Copyright (c) 2018 - The Alan Turing Institute 9 | License: See the LICENSE file. 10 | 11 | """ 12 | 13 | from analysis import show_failures 14 | 15 | if __name__ == '__main__': 16 | show_failures.main() 17 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: generic 2 | 3 | services: 4 | - docker 5 | 6 | before_install: 7 | - docker build -t alan-turing-institute/csvwrangling . 8 | 9 | script: 10 | - mkdir -p /home/travis/build/alan-turing-institute/results 11 | - docker run -v /home/travis/build/alan-turing-institute/results:/CSV_Wrangling/test alan-turing-institute/csvwrangling /bin/bash -c "make output && git diff" 12 | -------------------------------------------------------------------------------- /results/test/analysis/tables/parse_result_github.tex: -------------------------------------------------------------------------------- 1 | \begin{tabular}{lrrr|rrrr} 2 | & HypoParsr & Sniffer & Suitability & Pattern & Type & No Tie & Full\\ 3 | \hline 4 | No Result & 10.12 & 4.90 & 22.96 & 1.69 & 1.30 & 4.24 & \textbf{0.30}\\ 5 | Incorrect & 9.28 & 9.64 & 38.85 & 7.32 & 15.09 & \textbf{5.15} & 5.95\\ 6 | Correct & 80.60 & 85.45 & 38.19 & 90.99 & 83.61 & 90.61 & \textbf{93.75}\\ 7 | \hline 8 | \end{tabular} -------------------------------------------------------------------------------- /scripts/analysis_potential_dialects.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Wrapper for the potential dialects analysis. 6 | 7 | Author: Gertjan van den Burg 8 | Copyright (c) 2019 - The Alan Turing Institute 9 | License: See the LICENSE file. 
10 | 11 | """ 12 | 13 | from analysis import potential_dialects 14 | 15 | if __name__ == '__main__': 16 | potential_dialects.main() 17 | -------------------------------------------------------------------------------- /results/test/analysis/tables/parse_result_ukdata.tex: -------------------------------------------------------------------------------- 1 | \begin{tabular}{lrrr|rrrr} 2 | & HypoParsr & Sniffer & Suitability & Pattern & Type & No Tie & Full\\ 3 | \hline 4 | No Result & 1.85 & 1.21 & 16.72 & \textbf{0.00} & 0.04 & 0.56 & \textbf{0.00}\\ 5 | Incorrect & 7.71 & 7.95 & 57.96 & 0.60 & 12.78 & \textbf{0.32} & \textbf{0.32}\\ 6 | Correct & 90.44 & 90.84 & 25.32 & 99.40 & 87.18 & 99.11 & \textbf{99.68}\\ 7 | \hline 8 | \end{tabular} -------------------------------------------------------------------------------- /results/test/analysis/tables/standard_and_messy_github.tex: -------------------------------------------------------------------------------- 1 | \begin{tabular}{lrrr|rrrr} 2 | & HypoParsr & Sniffer & Suitability & Pattern & Type & No Tie & Full\\ 3 | \hline 4 | Standard (3502) & 85.75 & 90.89 & 44.12 & 93.15 & 86.26 & 93.46 & \textbf{95.80}\\ 5 | Messy (884) & 60.18 & 63.91 & 14.71 & 82.47 & 73.08 & 79.30 & \textbf{85.63}\\ 6 | Total (4386) & 80.60 & 85.45 & 38.19 & 90.99 & 83.61 & 90.61 & \textbf{93.75}\\ 7 | \hline 8 | \end{tabular} -------------------------------------------------------------------------------- /results/test/analysis/tables/standard_and_messy_ukdata.tex: -------------------------------------------------------------------------------- 1 | \begin{tabular}{lrrr|rrrr} 2 | & HypoParsr & Sniffer & Suitability & Pattern & Type & No Tie & Full\\ 3 | \hline 4 | Standard (4938) & 90.46 & 90.91 & 25.05 & 99.43 & 87.30 & 99.15 & \textbf{99.72}\\ 5 | Messy (31) & 87.10 & 80.65 & 67.74 & \textbf{93.55} & 67.74 & \textbf{93.55} & \textbf{93.55}\\ 6 | Total (4969) & 90.44 & 90.84 & 25.32 & 99.40 & 87.18 & 99.11 & \textbf{99.68}\\ 7 | \hline 8 | \end{tabular} -------------------------------------------------------------------------------- /results/test/README.md: -------------------------------------------------------------------------------- 1 | # Results 2 | 3 | The results will be placed here. The structure is as follows: 4 | 5 | 1. The **preprocessing** directory stores output from automatic ground truth 6 | detection (a.k.a. *normal forms*). 7 | 8 | 2. The **detection** directory stores the output of the detectors, as well as 9 | the ground truth (``out_reference``). 10 | 11 | 3. The **analysis** directory stores the analysis output in figures, tables, 12 | and constants. 13 | -------------------------------------------------------------------------------- /utils/install_R_packages.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Install R packages from a file 4 | # 5 | # Author: G.J.J. van den Burg 6 | # Date: 2019-05-16 7 | # 8 | if [ $# -ne 1 ] 9 | then 10 | echo "Usage: $0 packages.txt" 11 | exit 1 12 | fi 13 | 14 | if [ ! -s "$1" ] 15 | then 16 | echo "Provided package file $1 has no packages. 
Skipping" 17 | exit 0 18 | fi 19 | 20 | while read -r pkg 21 | do 22 | Rscript -e "install.packages('${pkg}', repos=c('https://cloud.r-project.org'))" 23 | done < "$1" 24 | -------------------------------------------------------------------------------- /results/test/analysis/tables/accuracy_all_github.tex: -------------------------------------------------------------------------------- 1 | \begin{tabular}{lrrr|rrrr} 2 | Property & HypoParsr & Sniffer & Suitability & Pattern & Type & No Tie & Full\\ 3 | \hline 4 | Delimiter & 87.48 & 86.82 & 65.41 & 92.61 & 88.33 & 91.38 & \textbf{94.92}\\ 5 | Quotechar & 82.90 & 92.36 & 44.60 & 95.23 & 90.10 & 93.80 & \textbf{97.36}\\ 6 | Escapechar & 87.96 & 94.37 & 74.85 & 97.95 & 96.26 & 95.44 & \textbf{99.25}\\ 7 | Overall & 80.60 & 85.45 & 38.19 & 90.99 & 83.61 & 90.61 & \textbf{93.75}\\ 8 | \hline 9 | \end{tabular} -------------------------------------------------------------------------------- /results/test/analysis/tables/accuracy_all_ukdata.tex: -------------------------------------------------------------------------------- 1 | \begin{tabular}{lrrr|rrrr} 2 | Property & HypoParsr & Sniffer & Suitability & Pattern & Type & No Tie & Full\\ 3 | \hline 4 | Delimiter & 97.97 & 91.89 & 80.20 & 99.70 & 93.80 & 99.26 & \textbf{99.82}\\ 5 | Quotechar & 90.56 & 92.21 & 26.34 & 99.46 & 89.56 & 99.13 & \textbf{99.70}\\ 6 | Escapechar & 98.05 & 98.79 & 82.61 & \textbf{100.00} & 97.67 & 99.42 & 99.98\\ 7 | Overall & 90.44 & 90.84 & 25.32 & 99.40 & 87.18 & 99.11 & \textbf{99.68}\\ 8 | \hline 9 | \end{tabular} -------------------------------------------------------------------------------- /results/test/analysis/tables/accuracy_human_github.tex: -------------------------------------------------------------------------------- 1 | \begin{tabular}{lrrr|rrrr} 2 | Property & HypoParsr & Sniffer & Suitability & Pattern & Type & No Tie & Full\\ 3 | \hline 4 | Delimiter & 83.35 & 81.78 & 59.78 & 89.64 & 84.47 & 87.95 & \textbf{93.04}\\ 5 | Quotechar & 76.24 & 89.19 & 39.51 & 93.27 & 85.71 & 90.95 & \textbf{96.07}\\ 6 | Escapechar & 84.14 & 92.33 & 74.07 & 97.08 & 94.84 & 93.27 & \textbf{98.77}\\ 7 | Overall & 72.54 & 79.61 & 28.99 & 87.28 & 76.88 & 86.76 & \textbf{91.21}\\ 8 | \hline 9 | \end{tabular} -------------------------------------------------------------------------------- /results/test/analysis/tables/accuracy_human_ukdata.tex: -------------------------------------------------------------------------------- 1 | \begin{tabular}{lrrr|rrrr} 2 | Property & HypoParsr & Sniffer & Suitability & Pattern & Type & No Tie & Full\\ 3 | \hline 4 | Delimiter & 97.31 & 92.17 & 81.91 & 99.59 & 93.16 & 98.95 & \textbf{99.74}\\ 5 | Quotechar & 87.46 & 92.93 & 24.61 & 99.30 & 87.49 & 98.77 & \textbf{99.56}\\ 6 | Escapechar & 97.43 & 99.65 & 85.21 & \textbf{100.00} & 96.76 & 99.18 & 99.97\\ 7 | Overall & 87.28 & 91.14 & 23.12 & 99.21 & 84.71 & 98.74 & \textbf{99.53}\\ 8 | \hline 9 | \end{tabular} -------------------------------------------------------------------------------- /results/test/analysis/tables/accuracy_normal_github.tex: -------------------------------------------------------------------------------- 1 | \begin{tabular}{lrrr|rrrr} 2 | Property & HypoParsr & Sniffer & Suitability & Pattern & Type & No Tie & Full\\ 3 | \hline 4 | Delimiter & 93.93 & 94.69 & 74.20 & 97.26 & 94.34 & 96.73 & \textbf{97.84}\\ 5 | Quotechar & 93.29 & 97.31 & 52.54 & 98.31 & 96.96 & 98.25 & \textbf{99.36}\\ 6 | Escapechar & 93.93 & 97.55 & 76.07 & 99.30 & 98.48 & 98.83 & \textbf{100.00}\\ 7 | 
Overall & 93.17 & 94.57 & 52.54 & 96.79 & 94.10 & 96.61 & \textbf{97.72}\\ 8 | \hline 9 | \end{tabular} -------------------------------------------------------------------------------- /results/test/analysis/tables/accuracy_normal_ukdata.tex: -------------------------------------------------------------------------------- 1 | \begin{tabular}{lrrr|rrrr} 2 | Property & HypoParsr & Sniffer & Suitability & Pattern & Type & No Tie & Full\\ 3 | \hline 4 | Delimiter & 99.42 & 91.28 & 76.42 & 99.94 & 95.22 & 99.94 & \textbf{100.00}\\ 5 | Quotechar & 97.42 & 90.63 & 30.17 & 99.81 & 94.12 & 99.94 & \textbf{100.00}\\ 6 | Escapechar & 99.42 & 96.90 & 76.87 & \textbf{100.00} & 99.68 & 99.94 & \textbf{100.00}\\ 7 | Overall & 97.42 & 90.18 & 30.17 & 99.81 & 92.64 & 99.94 & \textbf{100.00}\\ 8 | \hline 9 | \end{tabular} -------------------------------------------------------------------------------- /scripts/merge_human_normal.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Wrapper around merge. 6 | 7 | Author: Gertjan van den Burg 8 | Copyright (c) 2018 - The Alan Turing Institute 9 | License: See the LICENSE file. 10 | 11 | """ 12 | 13 | import sys 14 | 15 | from preprocessing import merge 16 | 17 | if __name__ == "__main__": 18 | if len(sys.argv) == 1: 19 | print("Usage: %s output_file input_file ..." % sys.argv[0]) 20 | raise SystemExit 21 | merge.main(sys.argv[1], sys.argv[2:]) 22 | -------------------------------------------------------------------------------- /scripts/run_extract_normal.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Wrapper for normal form extraction 6 | 7 | Author: Gertjan van den Burg 8 | Copyright (c) 2018 - The Alan Turing Institute 9 | License: See the LICENSE file. 10 | 11 | """ 12 | 13 | import sys 14 | 15 | from preprocessing import extract_normals 16 | 17 | if __name__ == "__main__": 18 | if not len(sys.argv) == 3: 19 | print("Usage: %s normals.json output_file" % sys.argv[0]) 20 | raise SystemExit 21 | extract_normals.main(sys.argv[1], sys.argv[2]) 22 | -------------------------------------------------------------------------------- /scripts/run_normal_detection.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Wrapper for normal form detection. 6 | 7 | Author: Gertjan van den Burg 8 | Copyright (c) 2018 - The Alan Turing Institute 9 | License: See the LICENSE file. 10 | 11 | """ 12 | 13 | import sys 14 | 15 | from preprocessing import filter_non_normal 16 | 17 | if __name__ == '__main__': 18 | if not len(sys.argv) == 4: 19 | print("Usage: %s input_dir normal_file non_normal_file" % sys.argv[0]) 20 | raise SystemExit 21 | filter_non_normal.main(sys.argv[1], sys.argv[2], sys.argv[3]) 22 | -------------------------------------------------------------------------------- /scripts/common/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Shared utility functions. 6 | 7 | Author: Gertjan van den Burg 8 | Copyright (c) 2018 - The Alan Turing Institute 9 | License: See the LICENSE file. 10 | 11 | """ 12 | 13 | import math 14 | 15 | 16 | def pairwise(iterable): 17 | "s - > (s0, s1), (s1, s2), (s2, s3), ..." 
18 | a = iter(iterable) 19 | b = iter(iterable) 20 | next(b, None) 21 | return zip(a, b) 22 | 23 | 24 | def softmax(iterable): 25 | maxx = max(iterable) 26 | offset = [x - maxx for x in iterable] 27 | denom = sum(map(math.exp, offset)) 28 | return [math.exp(o) / denom for o in offset] 29 | -------------------------------------------------------------------------------- /scripts/common/escape.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Common functions for dealing with escape characters 5 | 6 | Author: Gertjan van den Burg 7 | Copyright (c) 2018 - The Alan Turing Institute 8 | License: See the LICENSE file. 9 | Date: 2018-11-06 10 | """ 11 | 12 | import codecs 13 | import unicodedata 14 | 15 | 16 | def is_potential_escapechar(char, encoding): 17 | as_unicode = codecs.decode(bytes(char, encoding), encoding=encoding) 18 | ctr = unicodedata.category(as_unicode) 19 | block = ["!", "?", '"', "'", ".", ",", ";", ":", "%", "*", "&", "#"] 20 | if ctr == "Po": 21 | if as_unicode in block: 22 | return False 23 | return True 24 | return False 25 | -------------------------------------------------------------------------------- /scripts/common/encoding.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Common functions for encoding detection 5 | 6 | Author: Gertjan van den Burg 7 | Copyright (c) 2018 - The Alan Turing Institute 8 | License: See the LICENSE file. 9 | Date: 2018-11-06 10 | """ 11 | 12 | import chardet 13 | 14 | def get_encoding(filename): 15 | detector = chardet.UniversalDetector() 16 | final_chunk = False 17 | blk_size = 65536 18 | with open(filename, "rb") as fid: 19 | while (not final_chunk) and (not detector.done): 20 | chunk = fid.read(blk_size) 21 | if len(chunk) < blk_size: 22 | final_chunk = True 23 | detector.feed(chunk) 24 | detector.close() 25 | encoding = detector.result.get("encoding", None) 26 | return encoding 27 | -------------------------------------------------------------------------------- /scripts/common/load.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Common functions for loading files 5 | 6 | Author: Gertjan van den Burg 7 | Copyright (c) 2018 - The Alan Turing Institute 8 | License: See the LICENSE file. 9 | Date: 2018-11-06 10 | """ 11 | 12 | from .encoding import get_encoding 13 | 14 | 15 | def load_file(filename, encoding="unknown"): 16 | if encoding == "unknown": 17 | encoding = get_encoding(filename) 18 | with open(filename, "r", newline="", encoding=encoding) as fid: 19 | try: 20 | return fid.read() 21 | except UnicodeDecodeError: 22 | print( 23 | "UnicodeDecodeError occurred for file: %s. " 24 | "This means the encoding was determined incorrectly " 25 | "or the file is corrupt." % filename 26 | ) 27 | return None 28 | -------------------------------------------------------------------------------- /scripts/detection/lib/types/README.md: -------------------------------------------------------------------------------- 1 | # Rudimentary Type Detection 2 | 3 | This directory contains the rudimentary type detection engine used for CSV 4 | dialect detection. It is a regular-expression based method that allows 5 | detection of: 6 | 7 | - Empty cells 8 | - URLs and email 9 | - Numbers, including scientific notation, comma/period as radix point, 10 | comma/period as thousands separator. 
11 | - Percentages 12 | - Currencies 13 | - Time in HH:MM:SS, HH:MM, and H:MM notation 14 | - Dates in forty different formats, including Chinese. Based on [this 15 | Wikipedia article](https://en.wikipedia.org/wiki/Date_format_by_country). 16 | - Combined date and time (i.e. ISO 8601 and variations) 17 | - N/A and n/a 18 | 19 | This covers about 80% - 90% of cells in our collection of CSV files. 20 | 21 | Copyright (c) 2018 The Alan Turing Institute 22 | 23 | ## Author 24 | 25 | Gerrit J.J. van den Burg, gvandenburg@turing.ac.uk 26 | -------------------------------------------------------------------------------- /scripts/README.md: -------------------------------------------------------------------------------- 1 | # Script Directory 2 | 3 | The scripts are organized as follows: 4 | 5 | 1. **analysis** contains the code necessary to generate the figures, tables, 6 | and constants. 7 | 8 | 2. **common** contains shared code for analysis, detection, and preprocessing. 9 | Among other things it contains definitions for the detector result and 10 | dialect objects, the parser we use, and utilities for encoding detection 11 | and file loading. 12 | 13 | 3. **detection** contains the code for each of the detectors. Every detector 14 | has a separate file. Those implemented in Python have a common commandline 15 | interface defined in ``core.py``. Code for HypoParsr and type detection are 16 | in the **lib** subdir. 17 | 18 | 4. **preprocessing** contains the code for automatic dialect detection using 19 | so-called ''normal forms'' 20 | 21 | 22 | The files in this folder are top-level wrapper scripts that are actually 23 | needed to run everything. 24 | -------------------------------------------------------------------------------- /scripts/analysis/constant_n_files.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Get the number of files in a given summary file. 6 | 7 | Author: Gertjan van den Burg 8 | Copyright (c) 2018 - The Alan Turing Institute 9 | License: See the LICENSE file. 
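(Aside: the rudimentary type detection described in the type-detection README above is regular-expression based. The following is only a minimal sketch of that idea, using a few simplified, assumed patterns; it is not the ``rudi_types.py`` implementation, which covers many more formats.)

    import re

    # Simplified illustrative patterns -- far less complete than rudi_types.py.
    PATTERNS = {
        "empty": re.compile(r"^$"),
        "number": re.compile(r"^[+-]?\d+([.,]\d+)?([eE][+-]?\d+)?$"),
        "time_hhmm": re.compile(r"^\d{1,2}:\d{2}(:\d{2})?$"),
        "na": re.compile(r"^[Nn]/[Aa]$"),
    }

    def detect_type(cell):
        """Return the first matching type name for a cell, or None if unknown."""
        for name, pattern in PATTERNS.items():
            if pattern.fullmatch(cell.strip()):
                return name
        return None

    print(detect_type("3.14"))   # number
    print(detect_type("12:30"))  # time_hhmm
    print(detect_type("n/a"))    # na
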
10 | 11 | """ 12 | 13 | import argparse 14 | import json 15 | 16 | 17 | def parse_args(): 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument( 20 | "-s", 21 | dest="summary", 22 | help="Summary file with the results", 23 | required=True, 24 | ) 25 | 26 | parser.add_argument( 27 | "-o", dest="output", help="Output tex file to write to", required=True 28 | ) 29 | return parser.parse_args() 30 | 31 | 32 | def main(): 33 | args = parse_args() 34 | with open(args.summary, "r") as fid: 35 | data = json.load(fid) 36 | 37 | n_files = data["n_files_all"] 38 | with open(args.output, "w") as fid: 39 | fid.write("%i%%" % n_files) 40 | 41 | 42 | if __name__ == "__main__": 43 | main() 44 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2018 The Alan Turing Institute 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | -------------------------------------------------------------------------------- /design/result.md: -------------------------------------------------------------------------------- 1 | # Design document for storing results 2 | 3 | A result is stored in [JSON Lines format](http://jsonlines.org/) with the 4 | following fields: 5 | 6 | - filename: filename of the CSV file, typically ``/path/to/data/[md5hash].csv`` 7 | 8 | - status: parsing status, either ``null``, ``ok``, ``fail``, or ``skip``. 9 | 10 | - reason: failure or skip reason, either ``null`` or: 11 | 12 | + ``unknown``, 13 | + ``multiple_answers``, 14 | + ``no_results``, 15 | + ``timeout``, 16 | + ``unreadable`` 17 | + ``non_existent`` 18 | 19 | - detector: name of the detector 20 | 21 | - hostname: hostname of the pc that ran the detection 22 | 23 | - runtime: time it took to run the detection 24 | 25 | - dialect. See below. 26 | 27 | A dialect is a separate key/value map using the fields: 28 | 29 | - delimiter: single character string, empty string for single-column files, 30 | ``null`` for undefined. 31 | 32 | - quotechar: single character string, empty string for unquoted files, 33 | ``null`` for undefined. 
34 | 35 | - escapechar: single character string, empty string for no escape char, 36 | ``null`` for undefined 37 | -------------------------------------------------------------------------------- /scripts/run_detector.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Wrapper for detector executables. 6 | 7 | Author: Gertjan van den Burg 8 | Copyright (c) 2018 - The Alan Turing Institute 9 | License: See the LICENSE file. 10 | 11 | """ 12 | 13 | import sys 14 | 15 | from detection import ( 16 | our_score_full, 17 | our_score_full_no_tie, 18 | our_score_pattern_only, 19 | our_score_type_only, 20 | sniffer, 21 | suitability, 22 | ) 23 | 24 | 25 | def main(): 26 | detector = sys.argv.pop(1) 27 | if detector == "our_score_full": 28 | our_score_full.main() 29 | elif detector == "our_score_full_no_tie": 30 | our_score_full_no_tie.main() 31 | elif detector == "our_score_type_only": 32 | our_score_type_only.main() 33 | elif detector == "our_score_pattern_only": 34 | our_score_pattern_only.main() 35 | elif detector == "sniffer": 36 | sniffer.main() 37 | elif detector == "suitability": 38 | suitability.main() 39 | else: 40 | raise ValueError("Unknown detector: %s" % detector) 41 | 42 | 43 | if __name__ == "__main__": 44 | main() 45 | -------------------------------------------------------------------------------- /scripts/analysis_results.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Wrapper around result generation. 6 | 7 | See the individual scripts for more usage info. 8 | 9 | Author: Gertjan van den Burg 10 | Copyright (c) 2018 - The Alan Turing Institute 11 | License: See the LICENSE file. 12 | 13 | """ 14 | 15 | import sys 16 | 17 | from analysis import ( 18 | figure_fail, 19 | figure_bar_plot, 20 | figure_box_plot, 21 | figure_violins, 22 | table_accuracy, 23 | table_std_messy, 24 | table_parse_result 25 | ) 26 | 27 | 28 | def main(): 29 | result_type = sys.argv.pop(1) 30 | if result_type == "fail_figure": 31 | figure_fail.main() 32 | elif result_type == "accuracy_bar": 33 | figure_bar_plot.main() 34 | elif result_type == "boxplot": 35 | figure_box_plot.main() 36 | elif result_type == "violins": 37 | figure_violins.main() 38 | elif result_type == "tables": 39 | table_accuracy.main() 40 | elif result_type == "std_messy": 41 | table_std_messy.main() 42 | elif result_type == "parse_result": 43 | table_parse_result.main() 44 | else: 45 | raise ValueError("Unknown result type: %s" % result_type) 46 | 47 | 48 | if __name__ == "__main__": 49 | main() 50 | -------------------------------------------------------------------------------- /scripts/analysis/constant_n_incorrect_prop.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Proportion of files incorrect. 6 | 7 | Author: Gertjan van den Burg 8 | Copyright (c) 2018 - The Alan Turing Institute 9 | License: See the LICENSE file. 
10 | 11 | """ 12 | 13 | import argparse 14 | import json 15 | 16 | def parse_args(): 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument( 19 | "-d", dest="detector", help="Detector name", required=True 20 | ) 21 | parser.add_argument( 22 | "-s", 23 | dest="summary", 24 | help="Summary file with the results", 25 | required=True, 26 | ) 27 | 28 | parser.add_argument( 29 | "-o", dest="output", help="Output tex file to write to", required=True 30 | ) 31 | return parser.parse_args() 32 | 33 | 34 | def main(): 35 | args = parse_args() 36 | with open(args.summary, "r") as fid: 37 | data = json.load(fid) 38 | 39 | fails = data["failures"] 40 | if not args.detector in fails: 41 | raise KeyError( 42 | "Detector name %s doesn't exist in failure dict" % args.detector 43 | ) 44 | perc = fails[args.detector] * 100.0 45 | 46 | with open(args.output, "w") as fid: 47 | fid.write("%.2f\\%%%%" % perc) 48 | 49 | 50 | if __name__ == "__main__": 51 | main() 52 | -------------------------------------------------------------------------------- /scripts/analysis/constant_fail_percentage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Overall failure rate of a method for a single corpus. 6 | 7 | Author: Gertjan van den Burg 8 | Copyright (c) 2018 - The Alan Turing Institute 9 | License: See the LICENSE file. 10 | 11 | """ 12 | 13 | import argparse 14 | import json 15 | 16 | def parse_args(): 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument( 19 | "-d", dest="detector", help="Detector name", required=True 20 | ) 21 | parser.add_argument( 22 | "-s", 23 | dest="summary", 24 | help="Summary file with the results", 25 | required=True, 26 | ) 27 | 28 | parser.add_argument( 29 | "-o", dest="output", help="Output tex file to write to", required=True 30 | ) 31 | return parser.parse_args() 32 | 33 | 34 | def main(): 35 | args = parse_args() 36 | with open(args.summary, "r") as fid: 37 | data = json.load(fid) 38 | 39 | fails = data["failures"] 40 | if not args.detector in fails: 41 | raise KeyError( 42 | "Detector name %s doesn't exist in failure dict" % args.detector 43 | ) 44 | perc = fails[args.detector] * 100.0 45 | 46 | with open(args.output, "w") as fid: 47 | fid.write("%.2f\\%%%%" % perc) 48 | 49 | 50 | if __name__ == "__main__": 51 | main() 52 | -------------------------------------------------------------------------------- /scripts/preprocessing/merge.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | This takes a series of detector output files and merges them into a single file 6 | with the detector name "reference". 7 | 8 | Author: Gertjan van den Burg 9 | Copyright (c) 2018 - The Alan Turing Institute 10 | License: See the LICENSE file. 
11 | 12 | """ 13 | 14 | from common.detector_result import DetectorResult 15 | 16 | 17 | def main(output_file, input_files): 18 | combined = {} 19 | for filename in input_files: 20 | with open(filename, "r") as fid: 21 | for line in fid: 22 | dr = DetectorResult.from_json(line.strip()) 23 | if dr.filename in combined: 24 | if dr.dialect == combined[dr.filename].dialect: 25 | # allow it if the dialect is the same 26 | continue 27 | else: 28 | raise KeyError( 29 | "Duplicate result for file: %s" % dr.filename 30 | ) 31 | combined[dr.filename] = dr 32 | 33 | with open(output_file, "w") as fid: 34 | for filename in sorted(combined.keys()): 35 | dr = combined[filename] 36 | dr.original_detector = dr.detector 37 | dr.detector = "reference" 38 | fid.write(dr.to_json() + "\n") 39 | -------------------------------------------------------------------------------- /scripts/analysis/constant_prop_potential_dialect.py: -------------------------------------------------------------------------------- 1 | 2 | #!/usr/bin/env python 3 | # -*- coding: utf-8 -*- 4 | 5 | """ 6 | Compute the factor that relates the size of the alphabet to the size of the set 7 | of potential dialects. 8 | 9 | To be exact, we want F in the equation: |Dialects| = F * |UniqueChars| 10 | 11 | This is averaged over both datasets in the test set. 12 | 13 | Author: Gertjan van den Burg 14 | Date: 2019-04-10 15 | 16 | """ 17 | 18 | import argparse 19 | import json 20 | 21 | 22 | def parse_args(): 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument( 25 | "-i", 26 | dest="input", 27 | help="Overview files with the results from the ``potential_dialects.py`` script.", 28 | required=True, 29 | nargs="+", 30 | ) 31 | parser.add_argument( 32 | "-o", dest="output", help="Output tex file to write to", required=True 33 | ) 34 | return parser.parse_args() 35 | 36 | 37 | def main(): 38 | args = parse_args() 39 | fracs = [] 40 | for filename in args.input: 41 | with open(filename, "r") as fid: 42 | for line in fid: 43 | data = json.loads(line.strip()) 44 | fracs.append(data["n_dialect"] / data["n_alpha"]) 45 | 46 | result = sum(fracs) / len(fracs) 47 | with open(args.output, "w") as fid: 48 | fid.write("%.1f%%" % result) 49 | 50 | 51 | if __name__ == "__main__": 52 | main() 53 | -------------------------------------------------------------------------------- /scripts/preprocessing/filter_non_normal.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Split CSV files between those where ground truth can be determined 6 | automatically (normal forms) and those that need human annotation (non-normal). 7 | 8 | Author: Gertjan van den Burg 9 | Copyright (c) 2018 - The Alan Turing Institute 10 | License: See the LICENSE file. 
11 | 12 | """ 13 | 14 | import os 15 | import json 16 | 17 | from tabulate import tabulate 18 | 19 | from .normal_forms import detect_form 20 | 21 | 22 | def main(input_dir, normal_file, non_normal_file): 23 | files = [os.path.join(input_dir, x) for x in os.listdir(input_dir)] 24 | files.sort() 25 | 26 | normal_fid = open(normal_file, "w") 27 | nonnormal_fid = open(non_normal_file, "w") 28 | 29 | counts = {} 30 | 31 | for f in files: 32 | print("[normal_form] Analyzing file: %s" % f) 33 | form_id, params = detect_form(f, record_result=False, verbose=False) 34 | 35 | if not form_id in counts: 36 | counts[form_id] = 0 37 | counts[form_id] += 1 38 | 39 | if form_id is None: 40 | nonnormal_fid.write(f + "\n") 41 | else: 42 | data = {"filename": f, "form_id": form_id, "params": params} 43 | normal_fid.write(json.dumps(data) + "\n") 44 | 45 | normal_fid.close() 46 | nonnormal_fid.close() 47 | 48 | table = [ 49 | {"form": "None" if k is None else k, "count": v} 50 | for k, v in counts.items() 51 | ] 52 | 53 | print(tabulate(table, headers="keys")) 54 | -------------------------------------------------------------------------------- /scripts/detection/our_score_type_only.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Get the best parameter set by using only the type score of our data consistency 6 | measure. 7 | 8 | Author: Gertjan van den Burg 9 | Copyright (c) 2018 - The Alan Turing Institute 10 | License: See the LICENSE file. 11 | """ 12 | 13 | from .core import run 14 | from .our_score_base import determine_dqr, get_cells, is_clean 15 | from .our_score_full import EPS_TYP 16 | 17 | 18 | DETECTOR = "our_score_type_only" 19 | 20 | 21 | def get_scores(data, dialects, verbose=False): 22 | scores = {} 23 | for dialect in sorted(dialects): 24 | cells = get_cells(data, dialect) 25 | n_clean = sum((is_clean(cell) for cell in cells)) 26 | n_cells = len(cells) 27 | 28 | if n_cells == 0: 29 | type_score = EPS_TYP 30 | else: 31 | type_score = max(EPS_TYP, n_clean / n_cells) 32 | score = type_score 33 | 34 | scores[dialect] = score 35 | 36 | if verbose: 37 | print( 38 | "%15r:\ttype = %.6f\tfinal = %s" 39 | % ( 40 | dialect, 41 | type_score, 42 | "0" if scores[dialect] == 0 else "%.6f" % scores[dialect], 43 | ) 44 | ) 45 | 46 | return scores 47 | 48 | 49 | def wrap_determine_dqr(filename, verbose=False): 50 | return determine_dqr(filename, get_scores, verbose=verbose) 51 | 52 | 53 | def main(): 54 | run(determine_dqr=wrap_determine_dqr, detector=DETECTOR) 55 | -------------------------------------------------------------------------------- /scripts/analysis/constant_n_dialect.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Count the number of dialects 6 | 7 | Author: Gertjan van den Burg 8 | Copyright (c) 2018 - The Alan Turing Institute 9 | License: See the LICENSE file. 
10 | 11 | """ 12 | 13 | import argparse 14 | 15 | from common.detector_result import Status 16 | 17 | from .core import load_detector_results 18 | 19 | 20 | def count_dialect(result_dicts): 21 | dialects = set() 22 | for reference in result_dicts: 23 | for fname in reference: 24 | res = reference[fname] 25 | if not res.status == Status.OK: 26 | continue 27 | dialects.add(res.dialect) 28 | return len(dialects) 29 | 30 | 31 | def parse_args(): 32 | parser = argparse.ArgumentParser() 33 | parser.add_argument( 34 | "-o", dest="output", help="Output tex file to write to", required=True 35 | ) 36 | parser.add_argument( 37 | "-r", 38 | dest="reference", 39 | help="Reference file for a specific corpus", 40 | required=True, 41 | nargs="+", 42 | ) 43 | 44 | return parser.parse_args() 45 | 46 | 47 | def main(): 48 | args = parse_args() 49 | result_dicts = [] 50 | for reference in args.reference: 51 | _, reference_results = load_detector_results(reference) 52 | result_dicts.append(reference_results) 53 | n_dialect = count_dialect(result_dicts) 54 | 55 | with open(args.output, "w") as fid: 56 | fid.write("%i%%" % n_dialect) 57 | 58 | 59 | if __name__ == "__main__": 60 | main() 61 | -------------------------------------------------------------------------------- /scripts/run_hypoparsr.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Bash wrapper around HypoParsr so we can kill the thing when it takes too 4 | # long. 5 | # 6 | # This is necessary because R's withTimeout can't kill C code so it's kinda 7 | # useless. 8 | # 9 | # Author: G.J.J. van den Burg 10 | # Date: 2018-09-28T09:21:05+01:00 11 | # Copyright (c) 2018 - The Alan Turing Institute 12 | # License: See the LICENSE file. 13 | # 14 | # 15 | 16 | TIMEOUT=600 # ten minutes 17 | 18 | ALL_FILE="$1" 19 | OUTPUT_FILE="$2" 20 | 21 | THIS_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )" 22 | HYPO_R="${THIS_DIR}/detection/hypo.R" 23 | 24 | if [ ! -f ${HYPO_R} ] 25 | then 26 | echo "Couldn't find hypo.R at ${HYPO_R}. Not starting." 27 | exit 1 28 | fi 29 | 30 | if [ ! -f ${OUTPUT_FILE} ] 31 | then 32 | touch ${OUTPUT_FILE} 33 | fi 34 | 35 | # catch return code 124 36 | 37 | for filename in `cat ${ALL_FILE}` 38 | do 39 | # check if it is already processed 40 | if grep -q ${filename} ${OUTPUT_FILE} 41 | then 42 | continue 43 | fi 44 | 45 | echo "[hypoparsr] Analyzing file: ${filename}" 46 | 47 | # process it with timeout 48 | res=$(timeout ${TIMEOUT} Rscript ${HYPO_R} ${filename} 2>/dev/null) 49 | 50 | # timeout retcode is 124 if timeout occurred. 51 | if [ "$?" -eq "124" ] 52 | then 53 | # timeout occurred 54 | res="{\"status\": \"FAIL\", \"status_msg\": \"TIMEOUT\", \"filename\": \"${filename}\", \"detector\": \"hypoparsr\", \"runtime\": ${TIMEOUT}, \"hostname\": \"$(hostname)\"}" 55 | fi 56 | 57 | # Strip the simpleError from the output if necessary 58 | res=$(echo "${res}" | grep -v simpleError | grep -v read_delim) 59 | 60 | echo "${res}" >> ${OUTPUT_FILE} 61 | done 62 | -------------------------------------------------------------------------------- /scripts/detection/our_score_pattern_only.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | 6 | Get the best parameter set by using only the pattern score of our data 7 | consistency measure. 8 | 9 | Author: Gertjan van den Burg 10 | Copyright (c) 2018 - The Alan Turing Institute 11 | License: See the LICENSE file. 
12 | """ 13 | 14 | from collections import Counter 15 | 16 | from .core import run 17 | from .our_score_base import determine_dqr, make_abstraction 18 | from .our_score_full import EPS_PAT 19 | 20 | 21 | DETECTOR = "our_score_pattern_only" 22 | 23 | 24 | def get_scores(data, dialects, verbose=False): 25 | scores = {} 26 | for dialect in sorted(dialects): 27 | A = make_abstraction(data, dialect) 28 | row_patterns = Counter(A.split("R")) 29 | pattern_score = 0 30 | for pat_p, n_p in row_patterns.items(): 31 | Lk = len(pat_p.split("D")) 32 | pattern_score += n_p * (max(EPS_PAT, Lk - 1) / Lk) 33 | pattern_score /= len(row_patterns) 34 | 35 | score = pattern_score 36 | scores[dialect] = score 37 | 38 | if verbose: 39 | print( 40 | "%15r:\tpattern = %.6f\tfinal = %s" 41 | % ( 42 | dialect, 43 | pattern_score, 44 | "0" if scores[dialect] == 0 else "%.6f" % scores[dialect], 45 | ) 46 | ) 47 | 48 | return scores 49 | 50 | 51 | def wrap_determine_dqr(filename, verbose=False): 52 | return determine_dqr(filename, get_scores, verbose=verbose) 53 | 54 | 55 | def main(): 56 | run(determine_dqr=wrap_determine_dqr, detector=DETECTOR) 57 | -------------------------------------------------------------------------------- /scripts/analysis/constant_improve_sniffer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Accuracy improvement of a method over sniffer. 6 | 7 | Author: Gertjan van den Burg 8 | Copyright (c) 2018 - The Alan Turing Institute 9 | License: See the LICENSE file. 10 | 11 | """ 12 | 13 | import argparse 14 | 15 | from .constant_accuracy_overall import load_and_merge, compute_accuracy_overall 16 | 17 | 18 | def parse_args(): 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument( 21 | "-r", 22 | dest="reference", 23 | help="Reference file(s) with ground truth", 24 | required=True, 25 | nargs="+", 26 | ) 27 | parser.add_argument( 28 | "-d", 29 | dest="detector", 30 | help="Detector result(s)", 31 | required=True, 32 | nargs="+", 33 | ) 34 | parser.add_argument( 35 | "-s", dest="sniffer", help="Sniffer result(s)", required=True, nargs="+" 36 | ) 37 | parser.add_argument( 38 | "-o", dest="output", help="Output tex file to write to", required=True 39 | ) 40 | return parser.parse_args() 41 | 42 | 43 | def main(): 44 | args = parse_args() 45 | reference_results = load_and_merge(args.reference) 46 | detector_results = load_and_merge(args.detector) 47 | sniffer_results = load_and_merge(args.sniffer) 48 | acc_det = compute_accuracy_overall(reference_results, detector_results) 49 | acc_snf = compute_accuracy_overall(reference_results, sniffer_results) 50 | diff = acc_det - acc_snf 51 | with open(args.output, "w") as fid: 52 | fid.write("%.1f\\%%%%" % diff) 53 | 54 | 55 | if __name__ == "__main__": 56 | main() 57 | -------------------------------------------------------------------------------- /scripts/analysis/potential_dialects.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Quick script to check the distribution of the number of dialects that we 6 | consider. 
7 | 8 | Author: Gertjan van den Burg 9 | Date: 2019-04-09 10 | 11 | """ 12 | 13 | import argparse 14 | import json 15 | 16 | from tqdm import tqdm 17 | 18 | from common.encoding import get_encoding 19 | from common.load import load_file 20 | from detection.our_score_base import get_potential_dialects 21 | 22 | 23 | def get_stats(filename): 24 | encoding = get_encoding(filename) 25 | data = load_file(filename, encoding=encoding) 26 | if data is None: 27 | return None 28 | n_alpha = len(set(data)) 29 | n_dialect = len(get_potential_dialects(data, encoding)) 30 | return dict(filename=filename, n_alpha=n_alpha, n_dialect=n_dialect) 31 | 32 | 33 | def parse_args(): 34 | parser = argparse.ArgumentParser() 35 | parser.add_argument( 36 | "-i", "--input", help="File with filenames to consider", required=True 37 | ) 38 | parser.add_argument( 39 | "-o", 40 | "--output", 41 | help="Output file to write the numbers to", 42 | required=True, 43 | ) 44 | return parser.parse_args() 45 | 46 | 47 | def main(): 48 | args = parse_args() 49 | 50 | with open(args.output, "w") as oid: 51 | with open(args.input, "r") as fid: 52 | total = sum((1 for _ in fid)) 53 | fid.seek(0) 54 | for line in tqdm(fid, total=total): 55 | filename = line.strip() 56 | s = get_stats(filename) 57 | if s is None: 58 | continue 59 | line = json.dumps(s) 60 | oid.write(line + "\n") 61 | 62 | 63 | if __name__ == "__main__": 64 | main() 65 | -------------------------------------------------------------------------------- /scripts/analysis/constant_failure_messy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Failure on messy files averaged over both corpora. 6 | 7 | Author: Gertjan van den Burg 8 | Copyright (c) 2019 - The Alan Turing Institute 9 | License: See the LICENSE file. 
10 | Date: 2019-04-15 11 | 12 | """ 13 | 14 | import argparse 15 | import json 16 | 17 | 18 | def parse_args(): 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument( 21 | "-d", dest="detector", help="Detector name", required=True 22 | ) 23 | parser.add_argument( 24 | "-s", 25 | dest="summary", 26 | help="Summary file(s) with the results", 27 | required=True, 28 | nargs="+", 29 | ) 30 | 31 | parser.add_argument( 32 | "-o", dest="output", help="Output tex file to write to", required=True 33 | ) 34 | return parser.parse_args() 35 | 36 | 37 | def main(): 38 | args = parse_args() 39 | 40 | n_messy_total = 0 41 | n_messy_correct_total = 0 42 | for summary_file in args.summary: 43 | with open(summary_file, "r") as fid: 44 | data = json.load(fid) 45 | if not args.detector in data["messy_accuracy_all"]: 46 | raise KeyError( 47 | "Detector name %s doesn't exist in messy_accuracy_all dict" 48 | % args.detector 49 | ) 50 | 51 | n_messy = data["n_files_messy"] 52 | acc_messy = data["messy_accuracy_all"][args.detector] 53 | n_messy_correct = acc_messy * n_messy 54 | 55 | n_messy_total += n_messy 56 | n_messy_correct_total += n_messy_correct 57 | 58 | perc = (n_messy_total - n_messy_correct_total) / n_messy_total * 100.0 59 | 60 | with open(args.output, "w") as fid: 61 | fid.write("%.0f\\%%%%" % perc) 62 | 63 | 64 | if __name__ == "__main__": 65 | main() 66 | -------------------------------------------------------------------------------- /scripts/analysis/constant_improve_sniffer_messy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Accuracy improvement of a method over sniffer for messy files. 6 | 7 | Author: Gertjan van den Burg 8 | Copyright (c) 2018 - The Alan Turing Institute 9 | License: See the LICENSE file. 
10 | 11 | """ 12 | 13 | import argparse 14 | import json 15 | import math 16 | 17 | 18 | def parse_args(): 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument( 21 | "--round-up", help="Whether or not to round up", action="store_true" 22 | ) 23 | parser.add_argument( 24 | "-s", dest="summary", help="Summary file(s)", required=True, nargs="+" 25 | ) 26 | parser.add_argument( 27 | "-o", dest="output", help="Output tex file to write to", required=True 28 | ) 29 | return parser.parse_args() 30 | 31 | 32 | def main(): 33 | args = parse_args() 34 | 35 | n_messy_tot = 0 36 | n_messy_correct_ours = 0 37 | n_messy_correct_snif = 0 38 | 39 | for summary_file in args.summary: 40 | with open(summary_file, "r") as fid: 41 | data = json.load(fid) 42 | 43 | n_messy = data["n_files_messy"] 44 | acc_messy_ours = data["messy_accuracy_all"]["our_score_full"] 45 | acc_messy_snif = data["messy_accuracy_all"]["sniffer"] 46 | 47 | n_messy_tot += n_messy 48 | n_messy_correct_ours += acc_messy_ours * n_messy 49 | n_messy_correct_snif += acc_messy_snif * n_messy 50 | 51 | acc_ours = n_messy_correct_ours / n_messy_tot 52 | acc_snif = n_messy_correct_snif / n_messy_tot 53 | 54 | improv = (acc_ours - acc_snif) * 100 55 | 56 | with open(args.output, "w") as fid: 57 | if args.round_up: 58 | fid.write("%.0f\\%%%%" % math.ceil(improv)) 59 | else: 60 | fid.write("%.1f\\%%%%" % improv) 61 | 62 | 63 | if __name__ == "__main__": 64 | main() 65 | -------------------------------------------------------------------------------- /scripts/analysis/table_accuracy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Convert summary data to a latex table 6 | 7 | Author: Gertjan van den Burg 8 | 9 | """ 10 | 11 | import json 12 | import argparse 13 | 14 | from .core import ( 15 | ORDERED_DETECTORS, 16 | TABLE_SPEC, 17 | clean_detector_name, 18 | check_detectors, 19 | ) 20 | from .latex import build_latex_table 21 | 22 | 23 | def create_table(results, output_file): 24 | table = [] 25 | for prop in results: 26 | row = [prop.capitalize()] 27 | check_detectors(results[prop].keys()) 28 | for key in ORDERED_DETECTORS: 29 | row.append(results[prop][key] * 100.0) 30 | table.append(row) 31 | 32 | headers = ["Property"] + list(map(clean_detector_name, ORDERED_DETECTORS)) 33 | 34 | with open(output_file, "w") as fid: 35 | fid.write( 36 | build_latex_table( 37 | table, headers, floatfmt=".2f", table_spec=TABLE_SPEC 38 | ) 39 | ) 40 | 41 | 42 | def parse_args(): 43 | parser = argparse.ArgumentParser() 44 | parser.add_argument( 45 | "type", 46 | choices=["all", "human", "normal"], 47 | help="Subset of data to generate plot for", 48 | default="all", 49 | ) 50 | parser.add_argument( 51 | "-o", dest="output", help="Output tex file to write to", required=True 52 | ) 53 | parser.add_argument( 54 | "-s", 55 | dest="summary", 56 | help="Summary file with the results", 57 | required=True, 58 | ) 59 | 60 | return parser.parse_args() 61 | 62 | 63 | def main(): 64 | args = parse_args() 65 | with open(args.summary, "r") as fid: 66 | data = json.load(fid) 67 | 68 | key = "detection_accuracy_" + args.type 69 | if not key in data: 70 | raise ValueError("Can't find key %s in file %s" % (key, args.summary)) 71 | 72 | create_table(data[key], args.output) 73 | -------------------------------------------------------------------------------- /scripts/analysis_constants.py: -------------------------------------------------------------------------------- 1 | 
#!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Wrapper for constants generation. 6 | 7 | The constants are generated with separate Python scripts available in the 8 | ``analysis`` directory. This file provides a wrapper. See the scripts of each 9 | of the different constants for more info. 10 | 11 | Author: Gertjan van den Burg 12 | Copyright (c) 2018 - The Alan Turing Institute 13 | License: See the LICENSE file. 14 | 15 | """ 16 | 17 | import sys 18 | 19 | from analysis import ( 20 | constant_n_dialect, 21 | constant_n_files, 22 | constant_n_incorrect_prop, 23 | constant_accuracy_overall, 24 | constant_improve_sniffer, 25 | constant_improve_sniffer_messy, 26 | constant_fail_percentage, 27 | constant_failure, 28 | constant_failure_messy, 29 | constant_prop_potential_dialect, 30 | constant_known_type, 31 | ) 32 | 33 | 34 | def main(): 35 | const_name = sys.argv.pop(1) 36 | if const_name == "n_dialect": 37 | constant_n_dialect.main() 38 | elif const_name == "n_files": 39 | constant_n_files.main() 40 | elif const_name == "accuracy_overall": 41 | constant_accuracy_overall.main() 42 | elif const_name == "improve_sniffer": 43 | constant_improve_sniffer.main() 44 | elif const_name == "improve_sniffer_messy": 45 | constant_improve_sniffer_messy.main() 46 | elif const_name == "failure": 47 | constant_failure.main() 48 | elif const_name == "fail_percentage": 49 | constant_fail_percentage.main() 50 | elif const_name == "num_incorrect_prop": 51 | constant_n_incorrect_prop.main() 52 | elif const_name == "prop_potential_dialect": 53 | constant_prop_potential_dialect.main() 54 | elif const_name == "fail_percentage_messy": 55 | constant_failure_messy.main() 56 | elif const_name == "known_type": 57 | constant_known_type.main() 58 | else: 59 | raise ValueError("Unknown constant: %s" % const_name) 60 | 61 | 62 | if __name__ == "__main__": 63 | main() 64 | -------------------------------------------------------------------------------- /scripts/preprocessing/extract_normals.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | This script extracts the detected normal forms into an output file that can be 6 | used for the comparison. 7 | 8 | Author: Gertjan van den Burg 9 | Copyright (c) 2018 - The Alan Turing Institute 10 | License: See the LICENSE file. 
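Each line of the input file is expected to be a JSON record, roughly of the following form (values are illustrative):

    {"filename": "data/example.csv", "form_id": "form_3",
     "params": {"delim": ",", "quotechar": "\"", "escapechar": ""}}

A ``form_id`` equal to ``"FAIL"`` marks a file that could not be read; such files are written out with a FAIL status instead of a dialect.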
11 | 12 | """ 13 | 14 | import json 15 | 16 | from tqdm import tqdm 17 | 18 | from common.dialect import Dialect 19 | from common.detector_result import DetectorResult, Status, StatusMsg 20 | 21 | 22 | def load_normals(filename): 23 | data = [] 24 | with open(filename, "r") as fid: 25 | for line in fid: 26 | data.append(json.loads(line.strip())) 27 | return data 28 | 29 | 30 | def main(normal_file, output_file): 31 | normals = load_normals(normal_file) 32 | 33 | results = {} 34 | for entry in tqdm(normals): 35 | filename = entry["filename"] 36 | form_id = entry["form_id"] 37 | params = entry["params"] 38 | 39 | if form_id == "FAIL": 40 | # unreadable file 41 | dr = DetectorResult( 42 | detector="normal", 43 | filename=filename, 44 | status=Status.FAIL, 45 | status_msg=StatusMsg.UNREADABLE, 46 | ) 47 | else: 48 | dialect = Dialect( 49 | delimiter=params["delim"], 50 | quotechar=params["quotechar"], 51 | escapechar=params["escapechar"], 52 | ) 53 | 54 | dr = DetectorResult( 55 | detector="normal", 56 | dialect=dialect, 57 | filename=filename, 58 | status=Status.OK, 59 | ) 60 | 61 | if filename in results: 62 | raise KeyError("Filename %s already exists, duplicate!" % filename) 63 | 64 | results[filename] = dr 65 | 66 | with open(output_file, "w") as fid: 67 | for filename in sorted(results.keys()): 68 | fid.write(results[filename].to_json() + "\n") 69 | 70 | print("All done.") 71 | -------------------------------------------------------------------------------- /scripts/analysis/constant_failure.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Percentage of failure cases that were because of no_results or timeout. 6 | 7 | Author: Gertjan van den Burg 8 | Copyright (c) 2018 - The Alan Turing Institute 9 | License: See the LICENSE file. 
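This script is normally run through the ``analysis_constants.py`` wrapper (constant name ``failure``). An illustrative direct invocation from the ``scripts`` directory (file names made up):

    python -m analysis.constant_failure -d sniffer_github.json sniffer_ukdata.json -r timeout -o out.tex

The output is a single percentage written as LaTeX, e.g. ``12.5\%`` followed by a trailing ``%`` that suppresses spurious whitespace when the value is ``\input`` into the paper.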
10 | 11 | """ 12 | 13 | import argparse 14 | 15 | from common.detector_result import StatusMsg, Status 16 | 17 | from .constant_accuracy_overall import load_and_merge 18 | 19 | 20 | def parse_args(): 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument( 23 | "-d", 24 | dest="detector", 25 | help="Detector result(s)", 26 | required=True, 27 | nargs="+", 28 | ) 29 | parser.add_argument( 30 | "-r", 31 | dest="reason", 32 | help="Reason for failure", 33 | choices=["no_results", "timeout"], 34 | required=True, 35 | ) 36 | parser.add_argument( 37 | "-o", dest="output", help="Output tex file to write to", required=True 38 | ) 39 | return parser.parse_args() 40 | 41 | 42 | def main(): 43 | args = parse_args() 44 | detector_results = load_and_merge(args.detector) 45 | n_failure = sum( 46 | (1 for x in detector_results.values() if x.status == Status.FAIL) 47 | ) 48 | if args.reason == "no_results": 49 | n_with_reason = sum( 50 | ( 51 | 1 52 | for x in detector_results.values() 53 | if x.status == Status.FAIL 54 | and x.status_msg == StatusMsg.NO_RESULTS 55 | ) 56 | ) 57 | elif args.reason == "timeout": 58 | n_with_reason = sum( 59 | ( 60 | 1 61 | for x in detector_results.values() 62 | if x.status == Status.FAIL and x.status_msg == StatusMsg.TIMEOUT 63 | ) 64 | ) 65 | else: 66 | raise ValueError 67 | 68 | prop = n_with_reason / n_failure * 100 69 | with open(args.output, "w") as fid: 70 | fid.write("%.1f\\%%%%" % prop) 71 | 72 | 73 | if __name__ == "__main__": 74 | main() 75 | -------------------------------------------------------------------------------- /scripts/analysis/constant_accuracy_overall.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Overall accuracy of a method averaged over multiple corpora. 6 | 7 | Author: Gertjan van den Burg 8 | Copyright (c) 2018 - The Alan Turing Institute 9 | License: See the LICENSE file. 
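The accuracy is the fraction of reference files with an OK ground truth for which the detected dialect matches the reference exactly; reference and detector results from multiple corpora are merged before the accuracy is computed. An illustrative invocation (file names made up):

    python -m analysis.constant_accuracy_overall -r ref_github.json ref_ukdata.json -d our_full_github.json our_full_ukdata.json -o out.tex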
10 | 11 | """ 12 | 13 | import argparse 14 | import sys 15 | 16 | from common.detector_result import Status 17 | 18 | from .core import load_detector_results 19 | 20 | def parse_args(): 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument( 23 | "-r", 24 | dest="reference", 25 | help="Reference file(s) with ground truth", 26 | required=True, 27 | nargs="+", 28 | ) 29 | parser.add_argument( 30 | "-d", 31 | dest="detector", 32 | help="Detector result(s)", 33 | required=True, 34 | nargs="+", 35 | ) 36 | parser.add_argument( 37 | "-o", dest="output", help="Output tex file to write to", required=True 38 | ) 39 | return parser.parse_args() 40 | 41 | 42 | def load_and_merge(filenames): 43 | results = {} 44 | for res_file in filenames: 45 | _, res = load_detector_results(res_file) 46 | for fname in res: 47 | if fname in results: 48 | print( 49 | "Error: duplicate result for file %s" % fname, 50 | file=sys.stderr, 51 | ) 52 | raise SystemExit 53 | results[fname] = res[fname] 54 | return results 55 | 56 | 57 | def compute_accuracy_overall(ref_results, det_results): 58 | total = 0 59 | correct = 0 60 | for fname in ref_results: 61 | ref = ref_results[fname] 62 | if not ref.status == Status.OK: 63 | continue 64 | total += 1 65 | det = det_results[fname] 66 | if not det.status == Status.OK: 67 | continue 68 | correct += ref.dialect == det.dialect 69 | return correct / total * 100 70 | 71 | 72 | def main(): 73 | args = parse_args() 74 | reference_results = load_and_merge(args.reference) 75 | detector_results = load_and_merge(args.detector) 76 | acc = compute_accuracy_overall(reference_results, detector_results) 77 | with open(args.output, "w") as fid: 78 | fid.write("%.0f\\%%%%" % acc) 79 | 80 | 81 | if __name__ == "__main__": 82 | main() 83 | -------------------------------------------------------------------------------- /scripts/analysis/constant_known_type.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Compute the percentage of cells with a known type. 6 | 7 | Author: Gertjan van den Burg 8 | Copyright (c) 2019 - The Alan Turing Institute 9 | License: See the LICENSE file. 
10 | Date: 2019-04-15 11 | 12 | """ 13 | 14 | import argparse 15 | import multiprocessing 16 | 17 | from tqdm import tqdm 18 | 19 | from common.encoding import get_encoding 20 | from common.load import load_file 21 | from detection.our_score_base import is_clean, get_cells 22 | from common.detector_result import Status 23 | 24 | from .core import load_detector_results 25 | 26 | 27 | def parse_args(): 28 | parser = argparse.ArgumentParser() 29 | parser.add_argument( 30 | "-n", 31 | "--n-jobs", 32 | help="Number of parallel jobs", 33 | default=6, type=int, 34 | dest="n_jobs", 35 | ) 36 | parser.add_argument( 37 | "-r", 38 | "--reference", 39 | help="Reference file(s) with ground truth", 40 | nargs="+", 41 | required=True, 42 | ) 43 | parser.add_argument("-o", "--output", help="Output file") 44 | return parser.parse_args() 45 | 46 | 47 | def _worker(res_ref): 48 | filename = res_ref.filename 49 | encoding = get_encoding(filename) 50 | data = load_file(filename, encoding=encoding) 51 | if data is None: 52 | return (0, 0)  # unreadable file: contributes no cells 53 | 54 | cells = get_cells(data, res_ref.dialect) 55 | n_clean = sum((is_clean(cell) for cell in cells)) 56 | n_cells = len(cells) 57 | return (n_clean, n_cells) 58 | 59 | 60 | def main(): 61 | args = parse_args() 62 | 63 | reference_results = {} 64 | for reference in args.reference: 65 | _, ref_results = load_detector_results(reference) 66 | reference_results.update(ref_results) 67 | 68 | n_cells = 0 69 | n_clean = 0 70 | 71 | only_ok = { 72 | k: v for k, v in reference_results.items() if v.status == Status.OK 73 | } 74 | 75 | with multiprocessing.Pool(args.n_jobs) as pool: 76 | with tqdm(total=len(only_ok)) as pbar: 77 | for n_clean_x, n_cells_x in pool.imap_unordered( 78 | _worker, only_ok.values() 79 | ): 80 | n_clean += n_clean_x 81 | n_cells += n_cells_x 82 | pbar.update() 83 | 84 | perc = n_clean / n_cells * 100 85 | with open(args.output, "w") as fid: 86 | fid.write("%.1f\\%%%%" % perc) 87 | 88 | 89 | if __name__ == "__main__": 90 | main() 91 | -------------------------------------------------------------------------------- /scripts/common/dialect.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Definitions for a Dialect object. 6 | 7 | Author: Gertjan van den Burg 8 | Copyright (c) 2018 - The Alan Turing Institute 9 | License: See the LICENSE file. 
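A Dialect is a small value object; a minimal illustration:

    d = Dialect(delimiter=",", quotechar='"', escapechar="")
    d.to_dict()
    # {'delimiter': ',', 'quotechar': '"', 'escapechar': ''}
    Dialect.from_dict(d.to_dict()) == d
    # True

Components that are absent are represented by the empty string rather than by None (``validate`` rejects None).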
10 | 11 | """ 12 | 13 | 14 | import sys 15 | 16 | from functools import total_ordering 17 | 18 | ATTRIBUTES = ["delimiter", "quotechar", "escapechar"] 19 | 20 | 21 | @total_ordering 22 | class Dialect(object): 23 | def __init__(self, delimiter, quotechar, escapechar): 24 | self.delimiter = delimiter 25 | self.quotechar = quotechar 26 | self.escapechar = escapechar 27 | 28 | def validate(self): 29 | if self.delimiter is None or len(self.delimiter) > 1: 30 | raise ValueError( 31 | "Delimiter should be zero or one characters, got: %r" 32 | % self.delimiter 33 | ) 34 | if self.quotechar is None or len(self.quotechar) > 1: 35 | raise ValueError( 36 | "Quotechar should be zero or one characters, got: %r" 37 | % self.quotechar 38 | ) 39 | if self.escapechar is None or len(self.escapechar) > 1: 40 | raise ValueError( 41 | "Escapechar should be zero or one characters, got: %r" 42 | % self.escapechar 43 | ) 44 | if self.quotechar in ["Q", "A"]: 45 | print( 46 | "Warning: quotechar is 'Q' or 'A', probably a mistake.", 47 | file=sys.stderr, 48 | ) 49 | 50 | @classmethod 51 | def from_dict(cls, d): 52 | d = cls(d["delimiter"], d["quotechar"], d["escapechar"]) 53 | return d 54 | 55 | def to_dict(self): 56 | self.validate() 57 | d = dict(delimiter=self.delimiter, quotechar=self.quotechar, 58 | escapechar=self.escapechar) 59 | return d 60 | 61 | def __repr__(self): 62 | return "(%r, %r, %r)" % ( 63 | self.delimiter, 64 | self.quotechar, 65 | self.escapechar, 66 | ) 67 | 68 | def __key(self): 69 | return (self.delimiter, self.quotechar, self.escapechar) 70 | 71 | def __hash__(self): 72 | return hash(self.__key()) 73 | 74 | def __eq__(self, other): 75 | if not isinstance(other, Dialect): 76 | return False 77 | return self.__key() == other.__key() 78 | 79 | def __lt__(self, other): 80 | if not isinstance(other, Dialect): 81 | return -1 82 | return self.__key() < other.__key() 83 | -------------------------------------------------------------------------------- /scripts/detection/our_score_full.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | 6 | Get the best parameter set by using the data consistency measure. 7 | 8 | Author: Gertjan van den Burg 9 | Copyright (c) 2018 - The Alan Turing Institute 10 | License: See the LICENSE file. 11 | """ 12 | 13 | from collections import Counter 14 | 15 | from .core import run 16 | from .our_score_base import determine_dqr, get_cells, is_clean, make_abstraction 17 | 18 | DETECTOR = "our_score_full" 19 | 20 | # The value of EPS_PAT is tricky, because if we choose it too high it may give 21 | # too many false single-column files. This value seems to work quite well. 22 | EPS_PAT = 1e-3 23 | EPS_TYP = 1e-10 24 | 25 | 26 | def get_scores(data, dialects, verbose=False): 27 | scores = {} 28 | max_score = -float("inf") 29 | for dialect in sorted(dialects): 30 | A = make_abstraction(data, dialect) 31 | row_patterns = Counter(A.split("R")) 32 | pattern_score = 0 33 | for pat_p, n_p in row_patterns.items(): 34 | Lk = len(pat_p.split("D")) 35 | pattern_score += n_p * (max(EPS_PAT, Lk - 1) / Lk) 36 | pattern_score /= len(row_patterns) 37 | 38 | if pattern_score == 0: 39 | # if pattern score is zero, the outcome will be zero, so we 40 | # don't have to check types. 
41 | type_score = float("nan") 42 | score = 0 43 | elif pattern_score < max_score: 44 | # since the type score is in [0, 1], if the pattern score 45 | # is smaller than the current best score, it can't possibly 46 | # be improved by types, so we don't have to bother. 47 | type_score = float("nan") 48 | score = 0 49 | else: 50 | cells = get_cells(data, dialect) 51 | n_clean = sum((is_clean(cell) for cell in cells)) 52 | n_cells = len(cells) 53 | 54 | if n_cells == 0: 55 | type_score = EPS_TYP 56 | else: 57 | type_score = max(EPS_TYP, n_clean / n_cells) 58 | score = type_score * pattern_score 59 | 60 | scores[dialect] = score 61 | max_score = max(max_score, score) 62 | 63 | if verbose: 64 | print( 65 | "%15r:\ttype = %.6f\tpattern = %.6f\tfinal = %s" 66 | % ( 67 | dialect, 68 | type_score, 69 | pattern_score, 70 | "0" if scores[dialect] == 0 else "%.6f" % scores[dialect], 71 | ) 72 | ) 73 | 74 | return scores 75 | 76 | 77 | def wrap_determine_dqr(filename, verbose=False): 78 | return determine_dqr(filename, get_scores, verbose=verbose) 79 | 80 | 81 | def main(): 82 | run(determine_dqr=wrap_determine_dqr, detector=DETECTOR) 83 | -------------------------------------------------------------------------------- /scripts/analysis/figure_box_plot.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | 6 | Author: Gertjan van den Burg 7 | 8 | """ 9 | 10 | import argparse 11 | import json 12 | import numpy as np 13 | import os 14 | 15 | from .core import ORDERED_DETECTORS, check_detectors, clean_detector_name 16 | from .latex import build_latex_doc 17 | 18 | 19 | def create_box_and_whisker_plot(runtimes, output_file): 20 | check_detectors(runtimes.keys()) 21 | abbrev = [clean_detector_name(d) for d in ORDERED_DETECTORS] 22 | 23 | xtick = ",".join([str(i + 1) for i in range(len(abbrev))]) 24 | xticklabels = ",".join(abbrev) 25 | 26 | tex = ( 27 | "\\documentclass[preview=true]{standalone}\n" 28 | "\\pdfinfoomitdate=1\n" 29 | "\\pdftrailerid{}\n" 30 | "\\pdfsuppressptexinfo=1\n" 31 | "\\usepackage{tikz}\n" 32 | "\\usepackage{pgfplots}\n" 33 | "\\pgfplotsset{compat=1.16}\n" 34 | "\\usepgfplotslibrary{statistics}\n" 35 | "\\begin{document}\n" 36 | "\\begin{tikzpicture}\n" 37 | "\\begin{semilogyaxis}[\n" 38 | "boxplot/draw direction=y,\n" 39 | "xtick={%s},\n" 40 | "xticklabels={%s},\n" 41 | "ylabel={Runtime (s)},\n" 42 | "width=500pt,\n" 43 | "height=200pt\n" 44 | "]\n" % (xtick, xticklabels) 45 | ) 46 | 47 | for detector in ORDERED_DETECTORS: 48 | rt = runtimes[detector] 49 | q1, median, q3 = np.percentile(rt, [25, 50, 75]) 50 | upper_whisker = max(rt) 51 | lower_whisker = min(rt) 52 | boxplot_tex = ( 53 | "\\addplot+[\n" 54 | "\tdraw=black,\n" 55 | "\tsolid,\n" 56 | "\tboxplot prepared={\n" 57 | "\t\tmedian=%f,\n" 58 | "\t\tlower quartile=%f,\n" 59 | "\t\tupper quartile=%f,\n" 60 | "\t\tupper whisker=%f,\n" 61 | "\t\tlower whisker=%f\n" 62 | "},\n" 63 | "] coordinates {};\n" 64 | % (median, q1, q3, upper_whisker, lower_whisker) 65 | ) 66 | tex += boxplot_tex 67 | 68 | tex += "\\end{semilogyaxis}\n" "\\end{tikzpicture}\n" "\\end{document}" 69 | 70 | tex_file = os.path.splitext(output_file)[0] + ".tex" 71 | with open(tex_file, "w") as fid: 72 | fid.write(tex) 73 | 74 | build_latex_doc(tex, output_name=output_file) 75 | 76 | 77 | def parse_args(): 78 | parser = argparse.ArgumentParser() 79 | parser.add_argument( 80 | "-o", dest="output", help="Output pdf file to write to", required=True 81 | ) 82 | 
parser.add_argument( 83 | "-s", 84 | dest="summary", 85 | help="Summary file with the input data", 86 | required=True, 87 | ) 88 | return parser.parse_args() 89 | 90 | 91 | def main(): 92 | args = parse_args() 93 | with open(args.summary, "r") as fid: 94 | summary = json.load(fid) 95 | 96 | create_box_and_whisker_plot(summary["runtimes"], args.output) 97 | -------------------------------------------------------------------------------- /scripts/detection/our_score_full_no_tie.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | 6 | Get the best parameter set by using our data consistency measure. 7 | 8 | This variation does not break ties. 9 | 10 | Author: Gertjan van den Burg 11 | Copyright (c) 2018 - The Alan Turing Institute 12 | License: See the LICENSE file. 13 | """ 14 | 15 | from collections import Counter 16 | 17 | from .core import run 18 | from .our_score_base import ( 19 | determine_dqr, 20 | get_cells, 21 | is_clean, 22 | make_abstraction, 23 | ) 24 | 25 | 26 | DETECTOR = "our_score_full_no_tie" 27 | 28 | # The value of EPS_PAT is tricky, because if we choose it too high it may give 29 | # too many false single-column files. This value seems to work quite well. 30 | EPS_PAT = 1e-3 31 | EPS_TYP = 1e-10 32 | 33 | 34 | def get_scores(data, dialects, verbose=False): 35 | scores = {} 36 | max_score = -float("inf") 37 | for dialect in sorted(dialects): 38 | A = make_abstraction(data, dialect) 39 | row_patterns = Counter(A.split("R")) 40 | pattern_score = 0 41 | for pat_p, n_p in row_patterns.items(): 42 | Lk = len(pat_p.split("D")) 43 | pattern_score += n_p * (max(EPS_PAT, Lk - 1) / Lk) 44 | pattern_score /= len(row_patterns) 45 | 46 | if pattern_score == 0: 47 | # if pattern score is zero, the outcome will be zero, so we 48 | # don't have to check types. 49 | type_score = float("nan") 50 | score = 0 51 | elif pattern_score < max_score: 52 | # since the type score is in [0, 1], if the pattern score 53 | # is smaller than the current best score, it can't possibly 54 | # be improved by types, so we don't have to bother. 55 | type_score = float("nan") 56 | score = 0 57 | else: 58 | cells = get_cells(data, dialect) 59 | n_clean = sum((is_clean(cell) for cell in cells)) 60 | n_cells = len(cells) 61 | 62 | if n_cells == 0: 63 | type_score = EPS_TYP 64 | else: 65 | type_score = max(EPS_TYP, n_clean / n_cells) 66 | score = type_score * pattern_score 67 | 68 | scores[dialect] = score 69 | max_score = max(max_score, score) 70 | 71 | if verbose: 72 | print( 73 | "%15r:\ttype = %.6f\tpattern = %.6f\tfinal = %s" 74 | % ( 75 | dialect, 76 | type_score, 77 | pattern_score, 78 | "0" if scores[dialect] == 0 else "%.6f" % scores[dialect], 79 | ) 80 | ) 81 | 82 | return scores 83 | 84 | 85 | def wrap_determine_dqr(filename, verbose=False): 86 | return determine_dqr( 87 | filename, get_scores, verbose=verbose, do_break_ties=False 88 | ) 89 | 90 | 91 | def main(): 92 | run(determine_dqr=wrap_determine_dqr, detector=DETECTOR) 93 | -------------------------------------------------------------------------------- /scripts/analysis/table_std_messy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Generate a table with accuracies showing standard/non-standard split. 
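The table has one row each for standard files, messy files, and the total (with file counts) and one column per detector, as sketched below. The numbers are read from a summary JSON file; an illustrative invocation (file names made up):

    python -m analysis.table_std_messy -s summary_github.json -o std_messy.tex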
6 | 7 | | Sniff | Suit | Hypo | Our (full) | 8 | -------------------------------------------- 9 | Std (N) | | | | | 10 | NStd (N)| | | | | 11 | Totl (N)| | | | | 12 | 13 | 14 | Author: Gertjan van den Burg 15 | Date: 2018-11-18 16 | 17 | """ 18 | 19 | import argparse 20 | import json 21 | 22 | from .core import ( 23 | ORDERED_DETECTORS, 24 | TABLE_SPEC, 25 | clean_detector_name, 26 | check_detectors, 27 | ) 28 | from .latex import build_latex_table 29 | 30 | 31 | def create_table(results, output_file): 32 | n_standard = results["n_files_standard"] 33 | n_messy = results["n_files_messy"] 34 | n_total = results["n_files_all"] 35 | assert n_total == n_standard + n_messy 36 | 37 | row_std = ["Standard (%i)" % n_standard] 38 | row_mes = ["Messy (%i)" % n_messy] 39 | row_tot = ["Total (%i)" % n_total] 40 | 41 | check_detectors(results["standard_accuracy_all"].keys()) 42 | check_detectors(results["messy_accuracy_all"].keys()) 43 | check_detectors(results["detection_accuracy_all"]["overall"].keys()) 44 | 45 | for key in ORDERED_DETECTORS: 46 | row_std.append(results["standard_accuracy_all"][key] * 100.0) 47 | row_mes.append(results["messy_accuracy_all"][key] * 100.0) 48 | row_tot.append( 49 | results["detection_accuracy_all"]["overall"][key] * 100.0 50 | ) 51 | 52 | headers = [""] + list(map(clean_detector_name, ORDERED_DETECTORS)) 53 | 54 | table = [row_std, row_mes, row_tot] 55 | with open(output_file, "w") as fid: 56 | fid.write( 57 | build_latex_table( 58 | table, headers, floatfmt=".2f", table_spec=TABLE_SPEC 59 | ) 60 | ) 61 | 62 | 63 | def parse_args(): 64 | parser = argparse.ArgumentParser("Create standard/non-standard table") 65 | parser.add_argument( 66 | "-o", dest="output", help="Output tex file to write to", required=True 67 | ) 68 | parser.add_argument( 69 | "-s", 70 | dest="summary", 71 | help="Summary file with the results", 72 | required=True, 73 | ) 74 | 75 | return parser.parse_args() 76 | 77 | 78 | def main(): 79 | args = parse_args() 80 | with open(args.summary, "r") as fid: 81 | data = json.load(fid) 82 | 83 | needed_keys = [ 84 | "n_files_standard", 85 | "n_files_messy", 86 | "standard_accuracy_all", 87 | "messy_accuracy_all", 88 | ] 89 | for key in needed_keys: 90 | if not key in data: 91 | raise ValueError( 92 | "Required key '%s' not present in summary file." 
% key 93 | ) 94 | 95 | create_table(data, args.output) 96 | 97 | 98 | if __name__ == "__main__": 99 | main() 100 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # High version so we get updated version of texlive 2 | FROM ubuntu:20.04 3 | 4 | # Install base packages 5 | RUN apt-get update && \ 6 | DEBIAN_FRONTEND=noninteractive apt-get install -y tzdata && \ 7 | apt-get remove -y python && \ 8 | apt-get install -y --no-install-recommends \ 9 | git \ 10 | build-essential \ 11 | r-base \ 12 | latexmk \ 13 | texlive-latex-extra \ 14 | texlive-pictures 15 | 16 | # Install R package dependencies that are available on Ubuntu 17 | RUN apt-get install -y --no-install-recommends \ 18 | r-cran-ape r-cran-assertthat r-cran-backports \ 19 | r-cran-base64enc r-cran-bit r-cran-bit64 \ 20 | r-cran-bitops r-cran-blob r-cran-brew \ 21 | r-cran-class r-cran-cli r-cran-codetools \ 22 | r-cran-coin r-cran-colorspace r-cran-crayon \ 23 | r-cran-curl r-cran-data.table r-cran-dbi \ 24 | r-cran-desc r-cran-devtools r-cran-digest \ 25 | r-cran-doparallel r-cran-downloader r-cran-dplyr \ 26 | r-cran-e1071 r-cran-evaluate r-cran-evd \ 27 | r-cran-fastmatch r-cran-foreach r-cran-formula \ 28 | r-cran-ggplot2 r-cran-git2r r-cran-glue \ 29 | r-cran-gridbase r-cran-gridextra r-cran-gtable \ 30 | r-cran-highr r-cran-hms r-cran-htmltools \ 31 | r-cran-htmlwidgets r-cran-httpuv r-cran-httr \ 32 | r-cran-igraph r-cran-ipred r-cran-iterators \ 33 | r-cran-jsonlite r-cran-kernsmooth r-cran-knitr \ 34 | r-cran-labeling r-cran-lattice r-cran-lava \ 35 | r-cran-lazyeval r-cran-magrittr r-cran-markdown \ 36 | r-cran-mass r-cran-matrix r-cran-matrixstats \ 37 | r-cran-memoise r-cran-mgcv r-cran-mime \ 38 | r-cran-mockery r-cran-modeltools r-cran-multcomp \ 39 | r-cran-munsell r-cran-mvtnorm r-cran-nlme \ 40 | r-cran-nnet r-cran-numderiv r-cran-openssl \ 41 | r-cran-pillar r-cran-pkgconfig r-cran-plogr \ 42 | r-cran-plyr r-cran-praise r-cran-prettyunits \ 43 | r-cran-prodlim r-cran-purrr r-cran-r6 \ 44 | r-cran-rcolorbrewer r-cran-rcpp r-cran-rcurl \ 45 | r-cran-readr r-cran-rematch r-cran-reshape2 \ 46 | r-cran-rjson r-cran-rlang r-cran-rpart \ 47 | r-cran-rprojroot r-cran-rsqlite r-cran-rstudioapi \ 48 | r-cran-runit r-cran-sandwich r-cran-scales \ 49 | r-cran-shiny r-cran-sourcetools r-cran-stringi \ 50 | r-cran-stringr r-cran-strucchange r-cran-survival \ 51 | r-cran-testthat r-cran-th.data r-cran-tibble \ 52 | r-cran-tidyr r-cran-tidyselect r-cran-utf8 \ 53 | r-cran-uuid r-cran-viridis r-cran-viridislite \ 54 | r-cran-whisker r-cran-withr r-cran-xml \ 55 | r-cran-xml2 r-cran-xtable r-cran-yaml \ 56 | r-cran-zoo 57 | 58 | # Deal with the Python2/3 situation 59 | RUN apt-get install -y --no-install-recommends \ 60 | python3 \ 61 | python3-dev \ 62 | python3-pip && \ 63 | pip3 install --no-cache-dir --upgrade pip setuptools && \ 64 | echo "alias python='python3'" >> /root/.bash_aliases && \ 65 | echo "alias pip='pip3'" >> /root/.bash_aliases && \ 66 | cd /usr/local/bin && ln -s /usr/bin/python3 python 67 | 68 | # Clone the repo 69 | RUN git clone https://github.com/alan-turing-institute/CSV_Wrangling 70 | 71 | # Install dependencies 72 | RUN pip install -r CSV_Wrangling/requirements.txt 73 | RUN ./CSV_Wrangling/utils/install_R_packages.sh CSV_Wrangling/Rpackages.txt 74 | 75 | WORKDIR CSV_Wrangling 76 | -------------------------------------------------------------------------------- 
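A typical way to use the image defined above (the image tag is illustrative; the repository is cloned into the image at build time, so the experiments can be run directly inside the container):

    $ docker build -t csv-wrangling .
    $ docker run -it csv-wrangling make output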
/scripts/analysis/figure_bar_plot.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Convert summary data to a bar plot. 6 | 7 | Author: Gertjan van den Burg 8 | 9 | """ 10 | 11 | import json 12 | import os 13 | import argparse 14 | 15 | from .core import ( 16 | ORDERED_DETECTORS, 17 | ORDERED_PROP, 18 | check_detectors, 19 | clean_detector_name, 20 | ) 21 | from .latex import build_latex_doc 22 | 23 | 24 | def create_prop_graph(results, output_file): 25 | for prop in results: 26 | check_detectors(results[prop].keys()) 27 | 28 | abbrev = [clean_detector_name(d) for d in ORDERED_DETECTORS] 29 | tex = ( 30 | "\\documentclass[preview=true]{standalone}\n" 31 | "\\pdfinfoomitdate=1\n" 32 | "\\pdftrailerid{}\n" 33 | "\\pdfsuppressptexinfo=1\n" 34 | "\\usepackage{tikz}\n" 35 | "\\usepackage{pgfplots}\n" 36 | "\\pgfplotsset{compat=1.16}\n" 37 | "\\begin{document}\n" 38 | "\\begin{tikzpicture}\n" 39 | "\\begin{axis}[\n" 40 | "\tybar,\n" 41 | "\twidth={400},\n" 42 | "\theight={200},\n" 43 | "\tymin=0,\n" 44 | "\tlegend style={at={(0.5,-0.15)}, anchor=north, legend columns=-1},\n" 45 | "\tylabel={Accuracy (\\%%)},\n" 46 | "\tsymbolic x coords={%s},\n" 47 | "\txtick=data,\n" 48 | "\tnodes near coords,\n" 49 | "\tnodes near coords align={vertical},\n" 50 | "\tevery node near coord/.append style={font=\\tiny},\n" 51 | "\t]\n" % ",".join(abbrev) 52 | ) 53 | for prop in ORDERED_PROP: 54 | line = "\\addplot coordinates {" 55 | for detector in ORDERED_DETECTORS: 56 | line += "(%s,%.16f) " % ( 57 | clean_detector_name(detector), 58 | results[prop][detector], 59 | ) 60 | line += "};\n" 61 | 62 | tex += line 63 | 64 | tex += "\\legend{%s}\n" % ",".join(ORDERED_PROP) 65 | tex += "\\end{axis}\n" "\\end{tikzpicture}\n" "\\end{document}" 66 | 67 | tex_file = os.path.splitext(output_file)[0] + ".tex" 68 | with open(tex_file, "w") as fid: 69 | fid.write(tex) 70 | 71 | build_latex_doc(tex, output_name=output_file) 72 | 73 | 74 | def parse_args(): 75 | parser = argparse.ArgumentParser() 76 | parser.add_argument( 77 | "type", 78 | choices=["all", "human", "normal"], 79 | help="Subset of data to generate plot for", 80 | default="all", 81 | ) 82 | parser.add_argument( 83 | "-o", dest="output", help="Output pdf file to write to", required=True 84 | ) 85 | parser.add_argument( 86 | "-s", 87 | dest="summary", 88 | help="Summary file with the results", 89 | required=True, 90 | ) 91 | 92 | return parser.parse_args() 93 | 94 | 95 | def main(): 96 | args = parse_args() 97 | 98 | with open(args.summary, "r") as fid: 99 | data = json.load(fid) 100 | 101 | key = "detection_accuracy_" + args.type 102 | if not key in data: 103 | raise ValueError("Can't find key %s in file %s" % (key, args.summary)) 104 | 105 | create_prop_graph(data[key], args.output) 106 | -------------------------------------------------------------------------------- /scripts/analysis/table_parse_result.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Generate a table with percentages along a "no result"/"incorrect 6 | result"/"correct result" split. 
7 | 8 | | Sniff | Suit | Hypo | Our (full) | 9 | ---------------------------------------------- 10 | No Result | | | | | 11 | Incorrect | | | | | 12 | Correct | | | | | 13 | 14 | 15 | Author: Gertjan van den Burg 16 | Date: 2019-04-02 17 | 18 | """ 19 | 20 | import argparse 21 | import json 22 | 23 | from .core import ( 24 | ORDERED_DETECTORS, 25 | TABLE_SPEC, 26 | clean_detector_name, 27 | check_detectors, 28 | ) 29 | from .latex import build_latex_table 30 | 31 | 32 | def create_table(results, output_file): 33 | row_no_result = ["No Result"] 34 | row_incorrect = ["Incorrect"] 35 | row_correct = ["Correct"] 36 | 37 | check_detectors(results["no_result_all"].keys()) 38 | check_detectors(results["incorrect_result_all"].keys()) 39 | check_detectors(results["correct_result_all"].keys()) 40 | 41 | for key in ORDERED_DETECTORS: 42 | row_no_result.append(results["no_result_all"][key] * 100.0) 43 | row_incorrect.append(results["incorrect_result_all"][key] * 100.0) 44 | row_correct.append(results["correct_result_all"][key] * 100.0) 45 | 46 | # check that the values add up to 100% (minus precision errors) 47 | diff = abs( 48 | sum((r[-1] for r in [row_no_result, row_incorrect, row_correct])) 49 | - 100.0 50 | ) 51 | if not diff < 1e-13: 52 | raise AssertionError("Difference is larger than eps: %r" % diff) 53 | 54 | headers = [""] + list(map(clean_detector_name, ORDERED_DETECTORS)) 55 | 56 | table = [row_no_result, row_incorrect, row_correct] 57 | with open(output_file, "w") as fid: 58 | fid.write( 59 | build_latex_table( 60 | table, 61 | headers, 62 | floatfmt=".2f", 63 | bests=[min, min, max], 64 | table_spec=TABLE_SPEC, 65 | ) 66 | ) 67 | 68 | 69 | def parse_args(): 70 | parser = argparse.ArgumentParser("Create parsing result table") 71 | parser.add_argument( 72 | "-o", dest="output", help="Output tex file to write to", required=True 73 | ) 74 | parser.add_argument( 75 | "-s", 76 | dest="summary", 77 | help="Summary file with the results", 78 | required=True, 79 | ) 80 | 81 | return parser.parse_args() 82 | 83 | 84 | def main(): 85 | args = parse_args() 86 | with open(args.summary, "r") as fid: 87 | data = json.load(fid) 88 | 89 | needed_keys = [ 90 | "no_result_all", 91 | "incorrect_result_all", 92 | "correct_result_all", 93 | ] 94 | for key in needed_keys: 95 | if not key in data: 96 | raise ValueError( 97 | "Required key '%s' not present in summary file." % key 98 | ) 99 | 100 | create_table(data, args.output) 101 | 102 | 103 | if __name__ == "__main__": 104 | main() 105 | -------------------------------------------------------------------------------- /scripts/analysis/core.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Common definitions for the analysis scripts 6 | 7 | Author: Gertjan van den Burg 8 | Copyright (c) 2018 - The Alan Turing Institute 9 | License: See the LICENSE file. 
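Detector results are stored as JSON lines, one record per file, roughly of the form (values are illustrative):

    {"detector": "sniffer", "filename": "/abs/path/file.csv", "hostname": "somehost",
     "runtime": 0.12, "status": "OK",
     "dialect": {"delimiter": ",", "quotechar": "", "escapechar": ""}}

Records with a non-OK status omit the ``dialect`` field and may carry a ``status_msg`` instead; see ``common.detector_result`` for the full definition.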
10 | 11 | """ 12 | 13 | import json 14 | import os 15 | 16 | from common.detector_result import DetectorResult 17 | 18 | DETECTOR_NAMES = { 19 | "hypoparsr": "HypoParsr", 20 | "sniffer": "Sniffer", 21 | "suitability": "Suitability", 22 | "our_score_pattern_only": "Pattern", 23 | "our_score_type_only": "Type", 24 | "our_score_full_no_tie": "No Tie", 25 | "our_score_full": "Full", 26 | } 27 | 28 | ORDERED_DETECTORS = [ 29 | "hypoparsr", 30 | "sniffer", 31 | "suitability", 32 | "our_score_pattern_only", 33 | "our_score_type_only", 34 | "our_score_full_no_tie", 35 | "our_score_full", 36 | ] 37 | TABLE_SPEC = "lrrr|rrrr" 38 | 39 | ORDERED_PROP = ["delimiter", "quotechar", "escapechar", "overall"] 40 | 41 | CORPUS_NAMES = {"github": "GitHub", "ukdata": "UKdata"} 42 | 43 | 44 | def check_detectors(names): 45 | if not set(ORDERED_DETECTORS) == set(names): 46 | print( 47 | "Detector set doesn't match!\nExpected: %r\nReceived: %r\n" 48 | % (sorted(set(ORDERED_DETECTORS)), sorted(set(names))) 49 | ) 50 | raise SystemExit(1) 51 | 52 | 53 | def clean_detector_name(detector): 54 | abbr = DETECTOR_NAMES.get(detector, detector) 55 | return abbr.replace("_", "\\_") 56 | 57 | 58 | def load_detector_results(result_file): 59 | """ 60 | Load the results from a given detector result file. Verify each record in 61 | the process. 62 | """ 63 | detector_names = set() 64 | results = {} 65 | with open(result_file, "r") as fid: 66 | for idx, line in enumerate(fid.readlines()): 67 | try: 68 | record = DetectorResult.from_json(line.strip()) 69 | except json.JSONDecodeError: 70 | print( 71 | "\nError parsing the following record in file (line %i): " 72 | "%s\n---\n%s" % (idx + 1, result_file, line.strip()) 73 | ) 74 | raise SystemExit(1) 75 | 76 | detector_names.add(record.detector) 77 | 78 | fname = record.filename 79 | if not os.path.isabs(fname): 80 | fname = os.path.abspath(fname) 81 | record.filename = fname 82 | if fname in results: 83 | raise ValueError( 84 | "Duplicate result for file %s in detector file %s" 85 | % (record.filename, result_file) 86 | ) 87 | 88 | record.validate() 89 | results[fname] = record 90 | 91 | if len(detector_names) > 1: 92 | raise ValueError( 93 | "More than one detector name in file: %s" % result_file 94 | ) 95 | detector = detector_names.pop() 96 | return detector, results 97 | 98 | 99 | def is_standard_dialect(dialect): 100 | if ( 101 | dialect.delimiter == "," 102 | and dialect.quotechar in ["", '"'] 103 | and dialect.escapechar == "" 104 | ): 105 | return True 106 | return False 107 | -------------------------------------------------------------------------------- /scripts/analysis/figure_fail.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Convert summary data to a bar plot. 
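More specifically, this script plots the failure rate of every detector for each corpus as a grouped bar chart, reading one summary JSON per corpus (each summary carries a ``corpus`` name and a ``failures`` mapping). An illustrative invocation (file names made up):

    python -m analysis.figure_fail -o failure.pdf -s summary_github.json summary_ukdata.json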
6 | 7 | Author: Gertjan van den Burg 8 | 9 | """ 10 | 11 | import json 12 | import argparse 13 | import os 14 | 15 | from .core import ( 16 | CORPUS_NAMES, 17 | DETECTOR_NAMES, 18 | ORDERED_DETECTORS, 19 | check_detectors, 20 | ) 21 | from .latex import build_latex_doc 22 | 23 | BAR_PATTERNS = [ 24 | "north east lines", 25 | "none", 26 | "north west lines", 27 | "horizontal lines", 28 | "vertical lines", 29 | "grid", 30 | "crosshatch", 31 | ] 32 | 33 | 34 | def clean_name(detector): 35 | abbr = DETECTOR_NAMES.get(detector, detector) 36 | return abbr.replace("_", "\\_") 37 | 38 | 39 | def create_fail_graph(results, output_file): 40 | fail_data = {corpus: results[corpus]["failures"] for corpus in results} 41 | for corpus in fail_data: 42 | check_detectors(fail_data[corpus].keys()) 43 | 44 | abbrev = [clean_name(d) for d in ORDERED_DETECTORS] 45 | tex = ( 46 | "\\documentclass[preview=true]{standalone}\n" 47 | "\\pdfinfoomitdate=1\n" 48 | "\\pdftrailerid{}\n" 49 | "\\pdfsuppressptexinfo=1\n" 50 | "\\usepackage{tikz}\n" 51 | "\\usepackage{pgfplots}\n" 52 | "\\usetikzlibrary{patterns}\n" 53 | "\\pgfplotsset{compat=1.16}\n" 54 | "\\begin{document}\n" 55 | "\\begin{tikzpicture}\n" 56 | "\\begin{axis}[\n" 57 | "\tybar,\n" 58 | "\twidth={600},\n" 59 | "\theight={200},\n" 60 | "\tymin=0,\n" 61 | "\tlegend pos={north east},\n" 62 | "\tylabel={Failure (\\%%)},\n" 63 | "\tsymbolic x coords={%s},\n" 64 | "\txtick=data,\n" 65 | "\tnodes near coords,\n" 66 | "\tevery node near coord/.append style={font=\\tiny, /pgf/number format/fixed},\n" 67 | "\tnodes near coords align={vertical},\n" 68 | "\t]\n" % ",".join(abbrev) 69 | ) 70 | 71 | corpora = sorted(fail_data.keys()) 72 | 73 | for pattern, corpus in zip(BAR_PATTERNS, corpora): 74 | line = "\\addplot[postaction={pattern=%s}] coordinates {" % pattern 75 | for detector in ORDERED_DETECTORS: 76 | line += "(%s,%.16f) " % ( 77 | clean_name(detector), 78 | fail_data[corpus][detector] * 100.0, 79 | ) 80 | line += "};\n" 81 | tex += line 82 | 83 | tex += "\\legend{%s}\n" % ", ".join([CORPUS_NAMES.get(c) for c in corpora]) 84 | 85 | tex += "\\end{axis}\n" "\\end{tikzpicture}\n" "\\end{document}" 86 | 87 | tex_file = os.path.splitext(output_file)[0] + ".tex" 88 | with open(tex_file, "w") as fid: 89 | fid.write(tex) 90 | 91 | build_latex_doc(tex, output_name=output_file) 92 | 93 | 94 | def parse_args(): 95 | parser = argparse.ArgumentParser() 96 | parser.add_argument( 97 | "-o", dest="output", help="Output pdf file to write to", required=True 98 | ) 99 | parser.add_argument( 100 | "-s", 101 | dest="summaries", 102 | help="Summary file with the results", 103 | required=True, 104 | nargs="+", 105 | ) 106 | 107 | return parser.parse_args() 108 | 109 | 110 | def main(): 111 | args = parse_args() 112 | 113 | all_data = {} 114 | for summary_file in args.summaries: 115 | with open(summary_file, "r") as fid: 116 | data = json.load(fid) 117 | all_data[data["corpus"]] = data 118 | 119 | create_fail_graph(all_data, args.output) 120 | 121 | -------------------------------------------------------------------------------- /scripts/data_download.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Downloader for the experimental data. 6 | 7 | Author: Gertjan van den Burg 8 | Copyright (c) 2018 - The Alan Turing Institute 9 | License: See the LICENSE file. 
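The input is a JSON-lines file in which every record lists candidate URLs for one file together with its expected MD5 checksum, roughly (values are illustrative):

    {"md5": "0123456789abcdef0123456789abcdef", "urls": ["https://example.com/data.csv"]}

Files are saved as ``<md5>.csv`` in the output directory; files that are already present, or whose checksum no longer matches, are skipped.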
10 | Date: 2018-11-26 11 | 12 | """ 13 | 14 | import argparse 15 | import hashlib 16 | import json 17 | import os 18 | import random 19 | import requests 20 | import shutil 21 | import sys 22 | import tempfile 23 | import time 24 | 25 | 26 | def md5sum(filename): 27 | blocksize = 65536 28 | hasher = hashlib.md5() 29 | with open(filename, "rb") as fid: 30 | buf = fid.read(blocksize) 31 | while len(buf) > 0: 32 | hasher.update(buf) 33 | buf = fid.read(blocksize) 34 | return hasher.hexdigest() 35 | 36 | 37 | def download_url(urls, md5old, output_dir): 38 | response = None 39 | for url in urls: 40 | # TODO: Catch error when URL no longer exists 41 | try: 42 | response = requests.get(url) 43 | break 44 | except requests.exceptions.ConnectionError: 45 | print( 46 | "Connection error occurred trying to get url: %s" % url, 47 | file=sys.stderr, 48 | ) 49 | continue 50 | except requests.exceptions.ChunkedEncodingError: 51 | print("Connection error occurred trying to get url: %s" % url, 52 | file=sys.stderr) 53 | continue 54 | if response is None or response.status_code != 200: 55 | return None 56 | 57 | tmpfd, tmpfname = tempfile.mkstemp() 58 | tmpfid = os.fdopen(tmpfd, "wb") 59 | tmpfid.write(response.content) 60 | tmpfid.close() 61 | 62 | md5new = md5sum(tmpfname) 63 | if not md5new == md5old: 64 | print( 65 | "Checksum mismatch for URL '%s'. Skipping this file." % url, 66 | file=sys.stderr, 67 | ) 68 | os.unlink(tmpfname) 69 | return None 70 | target = os.path.join(output_dir, md5new + ".csv") 71 | shutil.move(tmpfname, target) 72 | return target 73 | 74 | 75 | def parse_args(): 76 | parser = argparse.ArgumentParser("Data Downloader") 77 | parser.add_argument( 78 | "-i", 79 | "--input", 80 | help="JSONlines file with urls and hashes", 81 | required=True, 82 | ) 83 | parser.add_argument( 84 | "-o", "--output", help="output directory", required=True 85 | ) 86 | return parser.parse_args() 87 | 88 | 89 | def main(): 90 | args = parse_args() 91 | 92 | # load the input file 93 | url_and_hash = [] 94 | with open(args.input, "r") as fid: 95 | for line in fid: 96 | obj = json.loads(line.strip()) 97 | url_and_hash.append(obj) 98 | 99 | # Remove files that already exist 100 | have_obj = [] 101 | have_files = os.listdir(args.output) 102 | for f in have_files: 103 | h = os.path.splitext(f)[0] 104 | obj = next((x for x in url_and_hash if x["md5"] == h), None) 105 | if obj is None: 106 | # ignore files not in our list 107 | continue 108 | have_obj.append(obj) 109 | for obj in have_obj: 110 | url_and_hash.remove(obj) 111 | 112 | # start the download 113 | for obj in url_and_hash: 114 | target = download_url(obj["urls"], obj["md5"], args.output) 115 | if target is None: 116 | continue 117 | print("Downloaded file '%s'" % target) 118 | time.sleep(random.random()) 119 | 120 | 121 | if __name__ == "__main__": 122 | main() 123 | -------------------------------------------------------------------------------- /scripts/detection/sniffer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | This detector uses the Python CSV Sniffer to detect the dialect. 6 | 7 | A timeout is needed on the Sniffer because the regular expression for detecting 8 | double quotes can run into catastrophic backtracking if a CSV file has many 9 | empty lines at the end that only contain delimiters (i.e. ",,,,,,,,,," lines). 
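The timeout is enforced by running the sniffer in a separate process and terminating it after ``TIMEOUT`` seconds (see ``run_with_timeout`` below); files that hit the limit are recorded as failures with a TIMEOUT status message rather than aborting the whole run.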
10 | 11 | Author: Gertjan van den Burg 12 | Copyright (c) 2018 - The Alan Turing Institute 13 | License: See the LICENSE file. 14 | 15 | """ 16 | 17 | import csv 18 | 19 | from multiprocessing import Process, Manager 20 | 21 | from .core import run 22 | 23 | from common.encoding import get_encoding 24 | from common.load import load_file 25 | from common.detector_result import DetectorResult, Dialect, Status, StatusMsg 26 | 27 | DETECTOR = "sniffer" 28 | TIMEOUT = 120 29 | 30 | 31 | def worker(args, return_dict, **kwargs): 32 | res = determine_dqr(*args, **kwargs) 33 | return_dict["output"] = res 34 | 35 | 36 | def run_with_timeout(args, kwargs, limit): 37 | # See: https://stackoverflow.com/a/26664130/1154005 38 | # and: https://stackoverflow.com/a/10415215/1154005 39 | 40 | manager = Manager() 41 | return_dict = manager.dict() 42 | 43 | p = Process(target=worker, args=(args, return_dict), kwargs=kwargs) 44 | p.start() 45 | p.join(limit) 46 | if p.is_alive(): 47 | p.terminate() 48 | return None 49 | if "output" in return_dict: 50 | return return_dict["output"] 51 | return None 52 | 53 | 54 | def sniff(sample, delimiters=None): 55 | """ 56 | This function mimics the Sniffer.sniff() method from the Python CSV 57 | function, with one exception: it doesn't change the detected quotechar to 58 | default to '"'. We do this because we want to know the detected quote 59 | character. 60 | 61 | """ 62 | sniffer = csv.Sniffer() 63 | 64 | quotechar, doublequote, delimiter, skipinitialspace = sniffer._guess_quote_and_delimiter( 65 | sample, delimiters 66 | ) 67 | 68 | if not delimiter: 69 | delimiter, skipinitialspace = sniffer._guess_delimiter( 70 | sample, delimiters 71 | ) 72 | if not delimiter: 73 | raise csv.Error("Could not determine delimiter") 74 | 75 | class dialect(csv.Dialect): 76 | _name = "sniffed" 77 | lineterminator = "\r\n" # unused 78 | quoting = csv.QUOTE_MINIMAL 79 | 80 | dialect.doublequote = doublequote 81 | dialect.delimiter = delimiter 82 | dialect.quotechar = quotechar # See above 83 | dialect.skipinitialspace = skipinitialspace 84 | dialect.escapechar = '' if dialect.escapechar is None else dialect.escapechar 85 | 86 | return dialect 87 | 88 | 89 | def determine_dqr(filename, verbose=False): 90 | """ Run the python CSV Sniffer """ 91 | encoding = get_encoding(filename) 92 | data = load_file(filename, encoding=encoding) 93 | if data is None: 94 | return DetectorResult( 95 | status=Status.SKIP, status_msg=StatusMsg.UNREADABLE 96 | ) 97 | 98 | try: 99 | dialect = sniff(data) 100 | except csv.Error: 101 | return DetectorResult( 102 | status=Status.FAIL, status_msg=StatusMsg.NO_RESULTS 103 | ) 104 | 105 | config = { 106 | "delimiter": dialect.delimiter, 107 | "quotechar": dialect.quotechar, 108 | "escapechar": dialect.escapechar, 109 | } 110 | res = DetectorResult(dialect=Dialect.from_dict(config), status=Status.OK) 111 | 112 | return res 113 | 114 | 115 | def wrap_determine_dqr(filename, verbose=False): 116 | res = run_with_timeout((filename,), {"verbose": verbose}, TIMEOUT) 117 | if res is None: 118 | return DetectorResult(status=Status.FAIL, status_msg=StatusMsg.TIMEOUT) 119 | return res 120 | 121 | 122 | def main(): 123 | run(determine_dqr=wrap_determine_dqr, detector=DETECTOR) 124 | -------------------------------------------------------------------------------- /scripts/common/detector_result.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Definitions for a DetectorResult 
object. 6 | 7 | Author: Gertjan van den Burg 8 | Copyright (c) 2018 - The Alan Turing Institute 9 | License: See the LICENSE file. 10 | 11 | """ 12 | 13 | 14 | import enum 15 | import json 16 | import socket 17 | import sys 18 | 19 | from .dialect import Dialect 20 | 21 | 22 | class Status(enum.Enum): 23 | UNKNOWN = 0 24 | OK = 1 25 | FAIL = 2 26 | SKIP = 3 27 | 28 | 29 | class StatusMsg(enum.Enum): 30 | UNKNOWN = 0 31 | MULTIPLE_ANSWERS = 1 32 | NO_RESULTS = 2 33 | TIMEOUT = 3 34 | UNREADABLE = 4 35 | NON_EXISTENT = 5 36 | NO_DIALECTS = 6 37 | HUMAN_SKIP = 7 38 | AMBIGUOUS_QUOTECHAR = 8 39 | 40 | 41 | class DetectorResult(object): 42 | def __init__( 43 | self, 44 | detector=None, 45 | dialect=None, 46 | filename=None, 47 | hostname=None, 48 | runtime=None, 49 | status=None, 50 | status_msg=None, 51 | original_detector=None, 52 | note=None 53 | ): 54 | self.detector = detector 55 | self.dialect = dialect 56 | self.filename = filename 57 | self.hostname = hostname or socket.gethostname() 58 | self.runtime = runtime 59 | self.status = status 60 | self.status_msg = status_msg 61 | self.original_detector = original_detector or detector 62 | self.note = note 63 | 64 | def validate(self): 65 | assert isinstance(self.status, Status) 66 | if not self.status_msg is None: 67 | assert isinstance(self.status_msg, StatusMsg) 68 | assert not self.detector is None 69 | assert not self.hostname is None 70 | assert not self.filename is None 71 | if self.status == Status.OK: 72 | assert not self.dialect is None 73 | assert isinstance(self.dialect, Dialect) 74 | try: 75 | self.dialect.validate() 76 | except ValueError: 77 | print("Dialect validation error for: %r" % self) 78 | raise 79 | else: 80 | assert self.dialect is None 81 | 82 | def to_json(self): 83 | self.validate() 84 | output = { 85 | "detector": self.detector, 86 | "filename": self.filename, 87 | "hostname": self.hostname, 88 | "runtime": self.runtime, 89 | "status": self.status.name, 90 | } 91 | if not self.dialect is None: 92 | output["dialect"] = self.dialect.to_dict() 93 | if not self.status_msg is None: 94 | output["status_msg"] = self.status_msg.name 95 | if not self.note is None: 96 | output['note'] = self.note 97 | if not self.detector == self.original_detector: 98 | output["original_detector"] = self.original_detector 99 | as_json = json.dumps(output) 100 | return as_json 101 | 102 | @classmethod 103 | def from_json(cls, line): 104 | """ load from a json line """ 105 | d = json.loads(line) 106 | try: 107 | d["dialect"] = ( 108 | Dialect.from_dict(d["dialect"]) if "dialect" in d else None 109 | ) 110 | except: 111 | print("Error occurred parsing dialect from line: %s" % line, 112 | file=sys.stderr) 113 | raise 114 | d["status"] = Status[d["status"]] 115 | d["status_msg"] = ( 116 | StatusMsg[d["status_msg"]] if "status_msg" in d else None 117 | ) 118 | dr = cls(**d) 119 | dr.validate() 120 | return dr 121 | 122 | def __repr__(self): 123 | s = ( 124 | "DetectorResult(detector=%r, dialect=%r, runtime=%r, status=%r, status_msg=%r)" 125 | % ( 126 | self.detector, 127 | self.dialect, 128 | self.runtime, 129 | self.status.value, 130 | self.status_msg, 131 | ) 132 | ) 133 | return s 134 | -------------------------------------------------------------------------------- /scripts/analysis/latex.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Code for compiling latex from Python. 
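It provides ``build_latex_doc``, which compiles a standalone document to PDF with latexmk in a temporary directory, and ``build_latex_table``, which generates ``tabular`` code with optional best-value highlighting.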
6 | 7 | Based on: https://github.com/GjjvdBurg/labella.py 8 | 9 | Author: Gertjan van den Burg 10 | Copyright (c) 2018 - The Alan Turing Institute 11 | License: See the LICENSE file. 12 | 13 | """ 14 | 15 | import os 16 | import shutil 17 | import subprocess 18 | import tabulate 19 | import tempfile 20 | 21 | 22 | def compile_latex(fname, tmpdirname, silent=True): 23 | compiler = "latexmk" 24 | compiler_args = [ 25 | "--pdf", 26 | "--outdir=" + tmpdirname, 27 | "--interaction=nonstopmode", 28 | fname, 29 | ] 30 | command = [compiler] + compiler_args 31 | try: 32 | output = subprocess.check_output(command, stderr=subprocess.STDOUT) 33 | except (OSError, IOError) as e: 34 | raise (e) 35 | except subprocess.CalledProcessError as e: 36 | print(e.output.decode()) 37 | raise (e) 38 | else: 39 | if not silent: 40 | print(output.decode()) 41 | 42 | 43 | def build_latex_doc(tex, output_name=None, silent=True): 44 | with tempfile.TemporaryDirectory() as tmpdirname: 45 | basename = "labella_text" 46 | fname = os.path.join(tmpdirname, basename + ".tex") 47 | with open(fname, "w") as fid: 48 | fid.write(tex) 49 | 50 | compile_latex(fname, tmpdirname, silent=silent) 51 | 52 | pdfname = os.path.join(tmpdirname, basename + ".pdf") 53 | if output_name: 54 | shutil.copy2(pdfname, output_name) 55 | 56 | 57 | def build_latex_table( 58 | table, headers, floatfmt="g", missingval="", bests="default", 59 | table_spec=None 60 | ): 61 | """Construct the LaTeX code for a table 62 | 63 | This function creates the LaTeX code for a data table while taking number 64 | formatting, headers, missing values, and "best value formatting" into 65 | account. 66 | 67 | The numbers in the table are formatted following the provided float format 68 | and the missing value indicator using the ``_format`` function from the 69 | ``tabulate`` package. To indicate a missing value the data row should mark 70 | this value as ``None``. 71 | 72 | The ``bests`` parameter is used to decide how to highlight the best values 73 | in each row. It can be either ``'default'``, ``None``, a list of length 1 74 | where the element is either ``min`` or ``max``, or a list of length ``K`` 75 | with similar elements where ``K`` is the length of the data table. If it is 76 | ``'default'`` then ``max`` will be considered best for each row. If a list 77 | of length 1 is supplied then the provided function will be used for each 78 | row. If ``None``, no highlighting will be done. 79 | 80 | The ``table_spec`` parameter allows the user to specify the table 81 | specification. This value is not checked. If it is None, the first column 82 | will get 'l' spec and the remaining columns will get the 'r' spec. 
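For illustration (values made up), a call like

        build_latex_table(
            [["Std", 95.2, 97.1]],
            ["", "Sniffer", "Full"],
            floatfmt=".2f",
            table_spec="lrr",
        )

    returns a ``tabular`` environment with a single data row in which the best
    value (97.10, since ``max`` is the default) is wrapped in ``\textbf{}``.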
83 | 84 | """ 85 | if bests == "default": 86 | bests = [max] 87 | elif bests is None: 88 | bests = [] 89 | 90 | if len(bests) > 1: 91 | assert len(bests) == len(table) 92 | assert all((x in [min, max] for x in bests)) 93 | 94 | if len(bests) == 0: 95 | best_funcs = [None for x in range(len(table))] 96 | elif len(bests) == 1: 97 | best_funcs = [bests[0] for x in range(len(table))] 98 | else: 99 | best_funcs = bests[:] 100 | 101 | list_of_lists, headers = table, headers 102 | cols = list(zip(*list_of_lists)) 103 | coltypes = list(map(tabulate._column_type, cols)) 104 | cols = [ 105 | [tabulate._format(v, ct, floatfmt, missingval) for v in c] 106 | for c, ct in zip(cols, coltypes) 107 | ] 108 | n_cols = len(cols) 109 | 110 | data_rows = table 111 | text_rows = list(zip(*cols)) 112 | 113 | text = [] 114 | if table_spec is None: 115 | text.append("\\begin{tabular}{l%s}" % ("r" * n_cols)) 116 | else: 117 | text.append("\\begin{tabular}{%s}" % table_spec) 118 | text.append(" & ".join(headers) + "\\\\") 119 | text.append("\\hline") 120 | for data_row, text_row, best_func in zip(data_rows, text_rows, best_funcs): 121 | text_row = list(text_row) 122 | if not best_func is None: 123 | best_val = best_func([x for x in data_row if isinstance(x, float)]) 124 | best_idx = [i for i, v in enumerate(data_row) if v == best_val] 125 | for idx in best_idx: 126 | text_row[idx] = "\\textbf{" + text_row[idx] + "}" 127 | text.append(" & ".join(text_row) + "\\\\") 128 | text.append("\\hline") 129 | text.append("\\end{tabular}") 130 | 131 | return "\n".join(text) 132 | -------------------------------------------------------------------------------- /scripts/detection/core.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Common functions for the Python code of the experiment. 6 | 7 | Author: Gertjan van den Burg 8 | Copyright (c) 2018 - The Alan Turing Institute 9 | License: See the LICENSE file. 10 | """ 11 | 12 | import os 13 | import json 14 | import time 15 | import argparse 16 | import codecs 17 | import unicodedata 18 | 19 | from tqdm import tqdm 20 | 21 | from common.detector_result import DetectorResult, Status, StatusMsg 22 | 23 | 24 | def can_be_delim_unicode(char, encoding=None): 25 | as_unicode = codecs.decode(bytes(char, encoding), encoding=encoding) 26 | ctr = unicodedata.category(as_unicode) 27 | if ctr in ["Lu", "Ll", "Lt", "Lm", "Lo"]: 28 | return False 29 | elif ctr in ["Nd", "Nl", "No"]: # number 30 | return False 31 | elif ctr in ["Po", "Pd", "Pc"]: # punctuation 32 | return True 33 | elif ctr in ["Ps", "Pe"]: # open and close brackets (maybe include?) 34 | return False 35 | elif ctr == "Zs": # space 36 | return True 37 | elif ctr == "Sm": # math symbols 38 | return True 39 | elif ctr == "Cc": # other control (i.e. tab etc.) 40 | if as_unicode == "\t": 41 | return True 42 | return False 43 | elif ctr == "Co": # private use (maybe used for NA?) 44 | # NOTE: This is tricky, we may slow our algorithm down a lot by 45 | # including all these code points as potential delimiters, but we 46 | # may also find the delimiter here. 47 | # Let's see if we _ever_ find a file that uses a private use 48 | # codepoint as a delimiter. 
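# (So private-use characters are rejected for now. For reference, common
# delimiters such as ',' or ';' fall under Po and '\t' under Cc, and are
# therefore accepted by the branches above, while letters and digits are
# rejected.)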
49 | return False 50 | return True 51 | 52 | 53 | 54 | def get_potential_quotechars(data): 55 | quotechars = set([""]) 56 | if "'" in data: 57 | quotechars.add("'") 58 | if '"' in data: 59 | quotechars.add('"') 60 | if "~" in data: 61 | quotechars.add("~") 62 | return quotechars 63 | 64 | 65 | def dump_result(output_file, res): 66 | with open(output_file, "a") as fid: 67 | fid.write(res.to_json() + "\n") 68 | 69 | 70 | def load_previous(output_file): 71 | previous = set() 72 | if not os.path.exists(output_file): 73 | return previous 74 | with open(output_file, "r") as fid: 75 | for line in fid.readlines(): 76 | record = json.loads(line.strip()) 77 | previous.add(record["filename"]) 78 | return previous 79 | 80 | 81 | def main( 82 | path_file, 83 | output_file, 84 | determine_dqr=None, 85 | detector=None, 86 | verbose=False, 87 | progress=False, 88 | ): 89 | with open(path_file, "r") as fid: 90 | files = [l.strip() for l in fid.readlines()] 91 | files.sort() 92 | 93 | previous = load_previous(output_file) 94 | 95 | for filename in tqdm(files, disable=not progress, desc=detector): 96 | if filename in previous: 97 | continue 98 | 99 | if not os.path.exists(filename): 100 | res = DetectorResult( 101 | detector=detector, 102 | dialect=None, 103 | filename=filename, 104 | runtime=None, 105 | status=Status.FAIL, 106 | status_msg=StatusMsg.NON_EXISTENT, 107 | ) 108 | dump_result(output_file, res) 109 | continue 110 | 111 | if not progress: 112 | print("[%s] Analyzing file: %s" % (detector, filename)) 113 | 114 | start_time = time.time() 115 | try: 116 | res = determine_dqr(filename, verbose=verbose) 117 | except KeyboardInterrupt: 118 | raise 119 | except: 120 | print("Uncaught exception occured parsing file: %s" % filename) 121 | raise 122 | 123 | res.runtime = time.time() - start_time 124 | res.filename = filename 125 | res.detector = detector 126 | dump_result(output_file, res) 127 | 128 | 129 | def parse_args(): 130 | parser = argparse.ArgumentParser() 131 | parser.add_argument("-v", "--verbose", dest="verbose", action="store_true") 132 | parser.add_argument( 133 | "-p", "--progress", dest="progress", action="store_true" 134 | ) 135 | parser.add_argument( 136 | "input_file", 137 | help="Input file can be a file of paths to CSV file, or the path of a single CSV file. 
If the former, output_file must be set", 138 | ) 139 | parser.add_argument( 140 | "output_file", 141 | help="Output file (JSON) to write the results to", 142 | default=None, 143 | nargs="?", 144 | ) 145 | return parser.parse_args() 146 | 147 | 148 | def run(determine_dqr, detector): 149 | args = parse_args() 150 | if args.output_file is None: 151 | print(determine_dqr(args.input_file, verbose=args.verbose)) 152 | else: 153 | main( 154 | args.input_file, 155 | args.output_file, 156 | determine_dqr=determine_dqr, 157 | detector=detector, 158 | verbose=args.verbose, 159 | progress=args.progress, 160 | ) 161 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CSV Wrangling 2 | 3 | [![Build Status](https://travis-ci.org/alan-turing-institute/CSV_Wrangling.svg?branch=master)](https://travis-ci.org/alan-turing-institute/CSV_Wrangling) 4 | [![DOI](https://zenodo.org/badge/158363564.svg)](https://zenodo.org/badge/latestdoi/158363564) 5 | 6 | This is the repository for reproducing the experiments in the paper: 7 | 8 | [**Wrangling Messy CSV files by Detecting Row and Type 9 | Patterns**](https://rdcu.be/bLVur) 10 | [(PDF)](https://gertjanvandenburg.com/papers/VandenBurg_Nazabal_Sutton_-_Wrangling_Messy_CSV_Files_by_Detecting_Row_and_Type_Patterns_2019.pdf) 11 | 12 | by [G.J.J. van den Burg](https://gertjanvandenburg.com), [A. 13 | Nazabal](https://scholar.google.co.uk/citations?user=IanHvT4AAAAJ&hl=en&oi=ao) 14 | and [C. Sutton](https://homepages.inf.ed.ac.uk/csutton/). 15 | 16 | For an implementation of the method developed in the paper, see the 17 | [CleverCSV](https://github.com/alan-turing-institute/CleverCSV) repository. 18 | 19 | If you use this paper or this code in your own work, please ***cite the 20 | paper*** using for instance the following BibTeX citation: 21 | 22 | ```bibtex 23 | @article{van2019wrangling, 24 | title = {Wrangling Messy {CSV} Files by Detecting Row and Type Patterns}, 25 | author = {{van den Burg}, G. J. J. and Naz{\'a}bal, A. and Sutton, C.}, 26 | journal = {Data Mining and Knowledge Discovery}, 27 | year = {2019}, 28 | volume = {33}, 29 | number = {6}, 30 | pages = {1799--1820}, 31 | issn = {1573-756X}, 32 | doi = {10.1007/s10618-019-00646-y}, 33 | } 34 | ``` 35 | 36 | ## Introduction 37 | 38 | Our experiments are made reproducible through the use of [GNU 39 | Make](https://www.gnu.org/software/make/). You can either set up your local 40 | environment with the necessary dependencies as described under 41 | [Requirements](#requirements), or use the Dockerfile included in the 42 | repository. 43 | 44 | There are two ways to reproduce our results. The first only reproduces the 45 | figures, tables, and constants in the paper from the raw detection results, 46 | while the second runs the detection methods as well. 47 | 48 | 1. You can reproduce the figures, tables, and constants from the raw 49 | experimental results included in this repository. This will not re-run all 50 | the experiments but will regenerate the output used in the paper. The 51 | command for this is: 52 | 53 | ```bash 54 | $ make output 55 | ``` 56 | 57 | 2. You can fully reproduce our experiments by downloading the data and 58 | rerunning the detection methods on all the files. This might take a while 59 | depending on the speed of your machine and the number of cores available. 60 | Total wall-clock computation time for a single core is estimated at 11 61 | days. 
The following commands will do all of this. 62 | 63 | ```bash 64 | $ make clean # remove existing output files, except human annotated 65 | $ make data # download the data 66 | $ make results # run all the detectors and generate the result files 67 | ``` 68 | 69 | If you'd like to use multiple cores, you can replace the last command with: 70 | 71 | ```bash 72 | $ make -j X results 73 | ``` 74 | 75 | where ``X`` is the desired number of cores. 76 | 77 | 78 | ## Data 79 | 80 | There are two datasets that are used in the experiments. Because we don't own 81 | the rights to all these files, we can't package these files and make them 82 | available in a single download. We can however provide URLs to the files and 83 | add a download script, which is what we do here. The data can be downloaded 84 | with: 85 | 86 | ```bash 87 | $ make data 88 | ``` 89 | 90 | If you wish to change the download location of the data, please edit the 91 | ``DATA_DIR`` variable in the Makefile. 92 | 93 | **Note:** We are aware that some of the files may change or become 94 | unavailable in the future. This is an unfortunate side-effect of using 95 | publicly available data in this way. The data downloader skips files that 96 | are unavailable or that have changed. Note that this may affect the exact 97 | reproducibility of the results. 98 | 99 | The above downloads the "test" set that was used for the evaluation in the 100 | paper. For the "working set" that was used to develop our algorithm, run 101 | ``make dev-data``. 102 | 103 | If the above datasets are insufficient, the complete original data sets are 104 | available on request for research purposes. Contact ``gertjanvandenburg at 105 | gmail dot com``. 106 | 107 | ## Requirements 108 | 109 | Below are the requirements for reproducing the experiments if you're not using 110 | Docker. Note that at the moment only Linux-based systems are supported. macOS 111 | will probably work, but hasn't been tested. 112 | 113 | - Python 3.x with the packages in the ``requirements.txt`` file. These can be 114 | installed with: ``pip install --user -r requirements.txt``. 115 | 116 | - R with the external packages installed through: 117 | ``install.packages(c('devtools', 'rjson', 'data.tree', 'RecordLinkage', 118 | 'readr', 'tibble'))``. 119 | 120 | - A working [LaTeX](https://www.latex-project.org/) installation is needed for 121 | creating the figures (at least ``texlive-latex-extra`` and 122 | ``texlive-pictures``), as well as a working 123 | [LaTeXMK](https://mg.readthedocs.io/latexmk.html) installation. 124 | 125 | 126 | ## Instructions 127 | 128 | To clone this repository and all its submodules do: 129 | 130 | ```bash 131 | $ git clone --recurse-submodules https://github.com/alan-turing-institute/CSV_Wrangling 132 | ``` 133 | 134 | Then install the requirements as listed above and run the ``make`` command of 135 | your choice. 136 | 137 | ## License 138 | 139 | With the exception of the submodule in ``scripts/detection/lib/hypoparsr`` 140 | this code is licensed under the [MIT 141 | license](https://en.wikipedia.org/wiki/MIT_License). See the LICENSE file for 142 | more details. 143 | -------------------------------------------------------------------------------- /scripts/analysis/show_failures.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Print the failure cases to the terminal. 
6 | 7 | Author: Gertjan van den Burg 8 | Copyright (c) 2018 - The Alan Turing Institute 9 | License: See the LICENSE file. 10 | 11 | """ 12 | 13 | import argparse 14 | import itertools 15 | import numpy as np 16 | 17 | import matplotlib.pyplot as plt 18 | from sklearn.metrics import confusion_matrix 19 | 20 | from tabulate import tabulate 21 | 22 | from common.dialect import ATTRIBUTES 23 | from common.detector_result import Status 24 | 25 | from .core import load_detector_results, is_standard_dialect 26 | 27 | 28 | def parse_args(): 29 | parser = argparse.ArgumentParser( 30 | description="Show failure cases for given detector results" 31 | ) 32 | parser.add_argument( 33 | "-r", 34 | dest="reference_file", 35 | help="reference output file with ground truth", 36 | required=True, 37 | ) 38 | parser.add_argument( 39 | "-d", dest="detector_file", help="detector output file", required=True 40 | ) 41 | parser.add_argument( 42 | "-p", 43 | dest="attr_name", 44 | choices=ATTRIBUTES + ["overall"], 45 | help="Attribute to show failure for. If omitted, shows the files for which the detector failed.", 46 | required=False, 47 | default=None, 48 | ) 49 | parser.add_argument( 50 | "-c", 51 | dest="confusion", 52 | help="Plot and print the confusion matrix", 53 | action="store_true", 54 | ) 55 | parser.add_argument( 56 | "-m", 57 | dest="only_messy", 58 | help="Show only failures for messy files", 59 | action="store_true", 60 | ) 61 | 62 | return parser.parse_args() 63 | 64 | 65 | def show_complete_failure( 66 | ref_results, detector, det_results, only_messy=False 67 | ): 68 | print("Detector: %s. Failure cases." % detector) 69 | count = 0 70 | total = 0 71 | for fname in ref_results: 72 | res_ref = ref_results[fname] 73 | if not res_ref.status == Status.OK: 74 | continue 75 | if only_messy and is_standard_dialect(res_ref.dialect): 76 | continue 77 | total += 1 78 | if det_results[fname].status == Status.SKIP: 79 | continue 80 | if det_results[fname].status == Status.FAIL: 81 | print(fname) 82 | count += 1 83 | print( 84 | "Total: %i out of %i (%.2f%%)" % (count, total, (count / total * 100)) 85 | ) 86 | 87 | 88 | def plot_confusion(cm, clean_classes): 89 | plt.imshow(cm, interpolation="nearest", cmap=plt.cm.Blues) 90 | plt.colorbar() 91 | tick_marks = np.arange(len(clean_classes)) 92 | 93 | plt.xticks(tick_marks, clean_classes, rotation=45) 94 | plt.yticks(tick_marks, clean_classes) 95 | 96 | fmt = "d" 97 | thresh = cm.max() / 2 98 | for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): 99 | plt.text( 100 | j, 101 | i, 102 | format(cm[i, j], fmt), 103 | horizontalalignment="center", 104 | color="white" if cm[i, j] > thresh else "black", 105 | ) 106 | plt.ylabel("True") 107 | plt.xlabel("Predicted") 108 | plt.tight_layout() 109 | 110 | 111 | def show_property_failure( 112 | ref_results, detector, det_results, attr_name, show_confusion=False, 113 | only_messy=False 114 | ): 115 | print("Detector: %s. Property: %s." 
% (detector, attr_name)) 116 | count = 0 117 | total = 0 118 | y_true = [] 119 | y_pred = [] 120 | for fname in ref_results: 121 | res_ref = ref_results[fname] 122 | if not res_ref.status == Status.OK: 123 | continue 124 | if only_messy and is_standard_dialect(res_ref.dialect): 125 | continue 126 | total += 1 127 | if not det_results[fname].status == Status.OK: 128 | continue 129 | if attr_name == "overall": 130 | prop_ref = ref_results[fname].dialect 131 | prop_det = det_results[fname].dialect 132 | y_true.append(repr(prop_ref)) 133 | y_pred.append(repr(prop_det)) 134 | else: 135 | prop_ref = getattr(ref_results[fname].dialect, attr_name) 136 | prop_det = getattr(det_results[fname].dialect, attr_name) 137 | y_true.append(prop_ref) 138 | y_pred.append(prop_det) 139 | if not prop_ref == prop_det: 140 | print("%s ref=%r %s=%r" % (fname, prop_ref, detector, prop_det)) 141 | count += 1 142 | print( 143 | "Total: %i out of %i (%.2f%%)" % (count, total, (count / total * 100)) 144 | ) 145 | if show_confusion: 146 | classes = [] 147 | for c in y_true + y_pred: 148 | if not c in classes: 149 | classes.append(c) 150 | cm = confusion_matrix(y_true, y_pred, labels=classes) 151 | trans = { 152 | "\t": "Tab", 153 | "": "Empty", 154 | " ": "Space", 155 | "。": "CDot", 156 | ":": "CCol", 157 | } 158 | clean = [trans.get(x, x) for x in classes] 159 | print(tabulate(cm, headers=clean, showindex=clean)) 160 | plot_confusion(cm, clean) 161 | plt.show() 162 | 163 | 164 | def main(): 165 | args = parse_args() 166 | detector, det_results = load_detector_results(args.detector_file) 167 | _, ref_results = load_detector_results(args.reference_file) 168 | if args.attr_name is None: 169 | show_complete_failure( 170 | ref_results, detector, det_results, only_messy=args.only_messy 171 | ) 172 | else: 173 | show_property_failure( 174 | ref_results, 175 | detector, 176 | det_results, 177 | args.attr_name, 178 | show_confusion=args.confusion, 179 | only_messy=args.only_messy, 180 | ) 181 | -------------------------------------------------------------------------------- /scripts/detection/suitability.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Uses the suitability metric from the Proactive Wrangler paper to decide on the 6 | dialect. 7 | 8 | Author: Gertjan van den Burg 9 | Copyright (c) 2018 - The Alan Turing Institute 10 | License: See the LICENSE file. 
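The overall idea of this detector, as implemented in determine_dqr further below, is to score every candidate dialect with the suitability metric and keep the dialect(s) with the lowest score. A minimal sketch of that selection step (illustration only, with a hypothetical score function; it is not the experiment code itself):

```python
# Sketch: keep the candidate dialect(s) whose parse looks most "suitable"
# (lowest suitability score); ties are handled separately by break_ties.
def pick_dialects(data, dialects, score):
    # assumes at least one candidate dialect
    scored = [(score(data, d), d) for d in dialects]
    best = min(s for s, _ in scored)
    return [d for s, d in scored if s == best]
```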
11 | 12 | """ 13 | 14 | 15 | from common.dialect import Dialect 16 | from common.encoding import get_encoding 17 | from common.escape import is_potential_escapechar 18 | from common.load import load_file 19 | from common.parser import parse_file 20 | from common.detector_result import DetectorResult, Status, StatusMsg 21 | from common.utils import pairwise 22 | 23 | from .core import run, get_potential_quotechars 24 | from .lib.types.rudi_types import eval_types 25 | from ._ties import break_ties 26 | 27 | DETECTOR = "suitability" 28 | WRANGLER_DELIMS = [",", ":", "|", "\t"] 29 | 30 | 31 | def extract_cells(data, dialect): 32 | cells = [] 33 | rows = parse_file(data, dialect) 34 | for row in rows: 35 | cells.extend(row) 36 | return cells 37 | 38 | 39 | def get_columns(cells): 40 | cols = {} 41 | for row in cells: 42 | for i, c in enumerate(row): 43 | if not i in cols: 44 | cols[i] = [] 45 | cols[i].append(c) 46 | return cols 47 | 48 | 49 | def count_empties(cells, dialect): 50 | count = 0 51 | for row in cells: 52 | for cell in row: 53 | if cell == "": 54 | count += 1 55 | if not dialect.quotechar is None: 56 | if cell == (dialect.quotechar + dialect.quotechar): 57 | count += 1 58 | return count 59 | 60 | 61 | def count_delimiters(cells): 62 | """Count the cells that contain a delimiter 63 | 64 | It is not entirely trivial whether or not the "continue" statement should 65 | be there, as we could also count each occurrence of a delimiter in a cell 66 | separately. However, since the second term of (1) in Guo et al. (2011) is 67 | normalized by |R|*|C|, it seems natural to include 68 | it. 69 | """ 70 | count = 0 71 | for row in cells: 72 | for cell in row: 73 | for d in WRANGLER_DELIMS: 74 | if d in cell: 75 | count += 1 76 | continue # see note 77 | return count 78 | 79 | 80 | def column_homogeneity(column): 81 | """ 82 | As the Proactive Wrangler (PW) paper doesn't give sufficient details on all 83 | the types they implement, we use our own type inference engine (from 84 | rudi_types) to guess the type. Note that "unicode_alphanum" is a generic 85 | string type, as is None. Empty cells are treated separately and are not 86 | considered a type in the PW paper. 
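As a quick illustration of the homogeneity term computed in the function body below (a minimal sketch, not part of the repository's scripts): a column whose non-empty cells are typed as three numbers and one string has homogeneity (3/4)^2 + (1/4)^2 = 0.625, so type-uniform columns score close to 1 and mixed columns score lower.

```python
# Illustrative sketch only: homogeneity as the sum of squared type proportions
# in a column, mirroring column_homogeneity (empty cells ignored here).
def toy_homogeneity(cell_types):
    n = len(cell_types)
    counts = {}
    for t in cell_types:
        counts[t] = counts.get(t, 0) + 1
    return sum((c / n) ** 2 for c in counts.values())

print(toy_homogeneity(["number", "number", "number", "string"]))  # 0.625
```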
87 | 88 | """ 89 | type_counts = {} 90 | for cell in column: 91 | detected_type = eval_types(cell) 92 | if detected_type is None: 93 | detected_type = "string" 94 | if detected_type == "unicode_alphanum": 95 | detected_type = "string" 96 | if detected_type == "empty": 97 | continue 98 | if not detected_type in type_counts: 99 | type_counts[detected_type] = 0 100 | type_counts[detected_type] += 1 101 | 102 | R = len(column) 103 | 104 | homogeneity = 0 105 | for t in type_counts: 106 | homogeneity += pow(type_counts[t] / R, 2.0) 107 | 108 | return homogeneity 109 | 110 | 111 | def compute_suitability(data, dialect): 112 | cells = extract_cells(data, dialect) 113 | columns = get_columns(cells) 114 | 115 | R = len(cells) 116 | C = len(columns) 117 | 118 | E = count_empties(cells, dialect) 119 | D = count_delimiters(cells) 120 | 121 | homo = sum((column_homogeneity(columns[cidx]) for cidx in columns)) 122 | if R * C == 0: 123 | suitability = 0 124 | else: 125 | suitability = (1 - homo / C) + (E + D) / (R * C) 126 | 127 | return suitability 128 | 129 | 130 | def get_dialects(data, encoding): 131 | delims = WRANGLER_DELIMS 132 | quotechars = get_potential_quotechars(data) 133 | escapechars = {} 134 | 135 | for delim in delims: 136 | delim_escapes = set() 137 | for u, v in pairwise(data): 138 | if v == delim and is_potential_escapechar(u, encoding): 139 | delim_escapes.add(u) 140 | for quotechar in quotechars: 141 | escapes = set(delim_escapes) 142 | for u, v in pairwise(data): 143 | if v == quotechar and is_potential_escapechar(u, encoding): 144 | escapes.add(u) 145 | escapes.add("") 146 | escapechars[(delim, quotechar)] = escapes 147 | 148 | dialects = [] 149 | for delim in delims: 150 | for quotechar in quotechars: 151 | for escapechar in escapechars[(delim, quotechar)]: 152 | d = Dialect(delim, quotechar, escapechar) 153 | dialects.append(d) 154 | return dialects 155 | 156 | 157 | def determine_dqr(filename, verbose=False): 158 | encoding = get_encoding(filename) 159 | data = load_file(filename, encoding=encoding) 160 | if data is None: 161 | return DetectorResult( 162 | status=Status.SKIP, status_msg=StatusMsg.UNREADABLE 163 | ) 164 | 165 | dialects = get_dialects(data, encoding) 166 | scores = [] 167 | 168 | for dialect in sorted(dialects): 169 | S = compute_suitability(data, dialect) 170 | if verbose: 171 | print("%15r\tsuitability = %.6f" % (dialect, S)) 172 | scores.append((S, dialect)) 173 | 174 | min_suit = min((x[0] for x in scores)) 175 | min_dialects = [x[1] for x in scores if x[0] == min_suit] 176 | 177 | if len(min_dialects) > 1: 178 | res = break_ties(data, min_dialects) 179 | else: 180 | res = min_dialects[0] 181 | 182 | if res is None: 183 | return DetectorResult( 184 | status=Status.FAIL, status_msg=StatusMsg.MULTIPLE_ANSWERS 185 | ) 186 | 187 | res = DetectorResult(dialect=res, status=Status.OK) 188 | 189 | return res 190 | 191 | 192 | def main(): 193 | run(determine_dqr=determine_dqr, detector=DETECTOR) 194 | -------------------------------------------------------------------------------- /scripts/detection/_ties.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Code for breaking ties in the heuristic solutions. 6 | 7 | Author: Gertjan van den Burg 8 | Copyright (c) 2018 - The Alan Turing Institute 9 | License: See the LICENSE file. 
10 | Date: 2018-10-30 11 | 12 | """ 13 | 14 | from common.parser import parse_file 15 | from common.utils import pairwise 16 | 17 | 18 | def break_ties_two(data, A, B): 19 | """ 20 | Break ties between dialects A and B. 21 | 22 | """ 23 | if A.delimiter == B.delimiter and A.escapechar == B.escapechar: 24 | if A.quotechar == "" or B.quotechar == "": 25 | d_no = A if A.quotechar == "" else B 26 | d_yes = B if d_no == A else A 27 | 28 | X = parse_file(data, dialect=d_no) 29 | Y = parse_file(data, dialect=d_yes) 30 | 31 | if X == Y: 32 | # quotechar has no effect 33 | return d_no 34 | else: 35 | # quotechar has an effect 36 | return d_yes 37 | elif A.quotechar == B.quotechar and A.escapechar == B.escapechar: 38 | if sorted([A.delimiter, B.delimiter]) == sorted([",", " "]): 39 | # Artifact due to type detection (comma as radix point) 40 | if A.delimiter == ",": 41 | return A 42 | else: 43 | return B 44 | elif A.delimiter == "-" or B.delimiter == "-": 45 | # Artifact due to type detection (dash as minus sign) 46 | if A.delimiter == "-": 47 | return B 48 | else: 49 | return A 50 | elif A.delimiter == B.delimiter and A.quotechar == B.quotechar: 51 | Dnone, Descape = (A, B) if A.escapechar == "" else (B, A) 52 | 53 | X = parse_file(data, Dnone) 54 | Y = parse_file(data, Descape) 55 | 56 | # double-check the shape. Usually if the shape differs the pattern score 57 | # should have caught it, but if by a freakish occurrence it hasn't, then 58 | # we can't break this tie (for now) 59 | if len(X) != len(Y): 60 | return None 61 | for x, y in zip(X, Y): 62 | if len(x) != len(y): 63 | return None 64 | 65 | cells_escaped = [] 66 | cells_unescaped = [] 67 | for x, y in zip(X, Y): 68 | for u, v in zip(x, y): 69 | if u != v: 70 | cells_unescaped.append(u) 71 | cells_escaped.append(v) 72 | 73 | # We will break the ties in the following ways: 74 | # 75 | # If the escapechar precedes the quotechar an even number of times 76 | # within each offending cell, then we think it is a functional escape 77 | # and the escaped version is the correct dialect. Note that if an odd 78 | # number of escaped quotechars would occur, then the shape of the file 79 | # will be different if it is ignored. Only if it occurs an even number 80 | # of times within the cell can we get the same shape. 81 | for u in cells_unescaped: 82 | count = 0 83 | for a, b in pairwise(u): 84 | if a != Descape.escapechar: 85 | continue 86 | if a == Descape.escapechar and b == Descape.quotechar: 87 | count += 1 88 | if count > 0 and count % 2 == 0: 89 | return Descape 90 | else: 91 | return Dnone 92 | return None 93 | 94 | 95 | def break_ties_three(data, A, B, C): 96 | # NOTE: We have only observed one tie for each case during development, so 97 | # this may need to be improved in the future. 98 | equal_delim = A.delimiter == B.delimiter == C.delimiter 99 | equal_escape = A.escapechar == B.escapechar == C.escapechar 100 | 101 | if equal_delim and equal_escape: 102 | # difference is *only* in quotechar 103 | dialects = [A, B, C] 104 | 105 | # TODO: shouldn't hardcode single/double quotes here. 106 | # try with type-only on: 107 | # github/test_set/files/6367b9c5338b9a035a221cfffd928e92.csv 108 | d_none = next((d for d in dialects if d.quotechar == ""), None) 109 | d_single = next((d for d in dialects if d.quotechar == "'"), None) 110 | d_double = next((d for d in dialects if d.quotechar == '"'), None) 111 | 112 | # Added to fix above todo note, doesn't affect test results. 
113 | if any((d is None for d in [d_none, d_single, d_double])): 114 | return None 115 | 116 | r_none = parse_file(data, d_none) 117 | r_single = parse_file(data, d_single) 118 | r_double = parse_file(data, d_double) 119 | 120 | if len(r_none) != len(r_single) or len(r_none) != len(r_double): 121 | return None 122 | 123 | if r_none == r_single: 124 | return break_ties_two(data, d_none, d_double) 125 | elif r_none == r_double: 126 | return break_ties_two(data, d_none, d_single) 127 | elif equal_delim: 128 | # difference is in quotechar *and* escapechar 129 | 130 | # NOTE: The reasoning here is as follows. If we are in this situation, 131 | # then there is both a potential escapechar and there are quotechars, 132 | # but the pattern score is the same and the type score can't make a 133 | # difference because no cells become clean if we interpret the 134 | # quote/escape correctly. This implies that the quote and escape do 135 | # have a function. Thus, we find the dialects that have a quote and 136 | # defer to break_ties_two. 137 | 138 | dialects = [A, B, C] 139 | with_quote = [d for d in dialects if d.quotechar != ""] 140 | 141 | if len(with_quote) != 2: 142 | return None 143 | 144 | return break_ties_two(data, with_quote[0], with_quote[1]) 145 | 146 | return None 147 | 148 | 149 | def break_ties_four(data, dialects): 150 | # NOTE: We have only observed one case during development where this 151 | # function was needed. It may need to be revisited in the future if other 152 | # examples are found. 153 | 154 | equal_delim = len(set([d.delimiter for d in dialects])) == 1 155 | if not equal_delim: 156 | return None 157 | 158 | # First, identify dialects that result in the same parsing result. 159 | equal_dialects = [] 160 | for a, b in pairwise(dialects): 161 | X = parse_file(data, a) 162 | Y = parse_file(data, b) 163 | if X == Y: 164 | equal_dialects.append((a, b)) 165 | 166 | # Try to break the ties in these pairs 167 | new_dialects = set() 168 | visited = set() 169 | for A, B in equal_dialects: 170 | ans = break_ties_two(data, A, B) 171 | if not ans is None: 172 | new_dialects.add(ans) 173 | visited.add(A) 174 | visited.add(B) 175 | for d in dialects: 176 | if not d in visited: 177 | new_dialects.add(d) 178 | 179 | dialects = list(new_dialects) 180 | 181 | # Defer to other functions if the number of dialects was reduced 182 | if len(dialects) == 2: 183 | return break_ties_two(data, *dialects) 184 | elif len(dialects) == 3: 185 | return break_ties_three(data, *dialects) 186 | 187 | return None 188 | 189 | 190 | def break_ties(data, dialects): 191 | if len(dialects) == 2: 192 | return break_ties_two(data, dialects[0], dialects[1]) 193 | elif len(dialects) == 3: 194 | return break_ties_three(data, dialects[0], dialects[1], dialects[2]) 195 | elif len(dialects) == 4: 196 | return break_ties_four(data, dialects) 197 | return None 198 | -------------------------------------------------------------------------------- /scripts/common/parser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Our CSV parser. 6 | 7 | Author: Gertjan van den Burg 8 | Copyright (c) 2018 - The Alan Turing Institute 9 | License: See the LICENSE file. 10 | Date: 2018-10-22 11 | 12 | """ 13 | 14 | 15 | def parse_file( 16 | S, dialect=None, delimiter=None, quotechar=None, escapechar=None 17 | ): 18 | """ 19 | Parse a CSV file given as a string by ``S`` into a list of lists. 
20 | 21 | This function automatically takes double quotes into account, uses 22 | universal newlines, and can deal with quotes that start *inside* a cell. 23 | Quotes are only stripped from cells if they occur at the start and the end 24 | of the cell. 25 | 26 | Tests 27 | ----- 28 | 29 | Testing splitting on delimiter with or without quotes 30 | 31 | >>> parse_file('A,B,C,D,E', delimiter=',', quotechar='"') 32 | [['A', 'B', 'C', 'D', 'E']] 33 | >>> parse_file('A,B,C,D,E', delimiter=',', quotechar='') 34 | [['A', 'B', 'C', 'D', 'E']] 35 | >>> parse_file('A,B,C,D,E') 36 | [['A,B,C,D,E']] 37 | >>> parse_file('A,"B",C,D,E', delimiter=',', quotechar='"') 38 | [['A', 'B', 'C', 'D', 'E']] 39 | >>> parse_file('A,"B,C",D,E', delimiter=',', quotechar='"') 40 | [['A', 'B,C', 'D', 'E']] 41 | >>> parse_file('A,"B,C",D,E', delimiter=',', quotechar='') 42 | [['A', '"B', 'C"', 'D', 'E']] 43 | >>> parse_file('"A","B","C",,,,', delimiter=',', quotechar='') 44 | [['"A"', '"B"', '"C"', '', '', '', '']] 45 | 46 | Testing splitting on rows only: 47 | 48 | >>> parse_file('A"B"C\\rA"B""C""D"', quotechar='') 49 | [['A"B"C'], ['A"B""C""D"']] 50 | >>> parse_file('A"B"C\\nA"B""C""D"', quotechar='') 51 | [['A"B"C'], ['A"B""C""D"']] 52 | >>> parse_file('A"B"C\\r\\nA"B""C""D"', quotechar='') 53 | [['A"B"C'], ['A"B""C""D"']] 54 | >>> parse_file('A"B\\r\\nB"C\\r\\nD"E"F\\r\\nG', quotechar='"') 55 | [['A"B\\r\\nB"C'], ['D"E"F'], ['G']] 56 | >>> parse_file('A"B\\nB"C\\nD"E"F\\nG', quotechar='"') 57 | [['A"B\\nB"C'], ['D"E"F'], ['G']] 58 | >>> parse_file('A"B\\nB\\rB"C\\nD"E"F\\nG', quotechar='"') 59 | [['A"B\\nB\\rB"C'], ['D"E"F'], ['G']] 60 | 61 | Tests from Python's builtin CSV module: 62 | 63 | >>> parse_file('') 64 | [] 65 | >>> parse_file('a,b\\r', delimiter=',') 66 | [['a', 'b']] 67 | >>> parse_file('a,b\\n', delimiter=',') 68 | [['a', 'b']] 69 | >>> parse_file('a,b\\r\\n', delimiter=',') 70 | [['a', 'b']] 71 | >>> parse_file('a,"', delimiter=',', quotechar='"') 72 | [['a', '']] 73 | >>> parse_file('"a', delimiter=',', quotechar='"') 74 | [['a']] 75 | >>> parse_file('a,|b,c', delimiter=',', quotechar='"', escapechar='|') # differs from Python (1) 76 | [['a', '|b', 'c']] 77 | >>> parse_file('a,b|,c', delimiter=',', quotechar='"', escapechar='|') 78 | [['a', 'b,c']] 79 | >>> parse_file('a,"b,|c"', delimiter=',', quotechar='"', escapechar='|') # differs from Python (1) 80 | [['a', 'b,|c']] 81 | >>> parse_file('a,"b,c|""', delimiter=',', quotechar='"', escapechar='|') 82 | [['a', 'b,c"']] 83 | >>> parse_file('a,"b,c"|', delimiter=',', quotechar='"', escapechar='|') # differs from Python (2) 84 | [['a', 'b,c']] 85 | >>> parse_file('1,",3,",5', delimiter=',', quotechar='"') 86 | [['1', ',3,', '5']] 87 | >>> parse_file('1,",3,",5', delimiter=',', quotechar='') 88 | [['1', '"', '3', '"', '5']] 89 | >>> parse_file(',3,"5",7.3, 9', delimiter=',', quotechar='"') 90 | [['', '3', '5', '7.3', ' 9']] 91 | >>> parse_file('"a\\nb", 7', delimiter=',', quotechar='"') 92 | [['a\\nb', ' 7']] 93 | 94 | Double quotes: 95 | 96 | >>> parse_file('a,"a""b""c"', delimiter=',', quotechar='"') 97 | [['a', 'a"b"c']] 98 | 99 | Mix double and escapechar: 100 | 101 | >>> parse_file('a,"bc""d"",|"f|""', delimiter=',', quotechar='"', escapechar='|') 102 | [['a', 'bc"d","f"']] 103 | 104 | Other tests: 105 | 106 | >>> parse_file('a,b "c" d,e', delimiter=',', quotechar='') 107 | [['a', 'b "c" d', 'e']] 108 | >>> parse_file('a,b "c" d,e', delimiter=',', quotechar='"') 109 | [['a', 'b "c" d', 'e']] 110 | >>> parse_file('a,\\rb,c', delimiter=',') 111 | 
[['a', ''], ['b', 'c']] 112 | >>> parse_file('a,b\\r\\n\\r\\nc,d\\r\\n', delimiter=',') 113 | [['a', 'b'], ['c', 'd']] 114 | >>> parse_file('\\r\\na,b\\rc,d\\n\\re,f\\r\\n', delimiter=',') 115 | [['a', 'b'], ['c', 'd'], ['e', 'f']] 116 | 117 | Further escape char tests: 118 | 119 | >>> parse_file('a,b,c||d', delimiter=',', quotechar='', escapechar='|') 120 | [['a', 'b', 'c|d']] 121 | >>> parse_file('a,b,c||d,e|,d', delimiter=',', quotechar='', escapechar='|') 122 | [['a', 'b', 'c|d', 'e,d']] 123 | 124 | Quote mismatch until EOF: 125 | 126 | >>> parse_file('a,b,c"d,e\\n', delimiter=',', quotechar='"') 127 | [['a', 'b', 'c"d,e\\n']] 128 | >>> parse_file('a,b,c"d,e\\n', delimiter=',', quotechar='') 129 | [['a', 'b', 'c"d', 'e']] 130 | >>> parse_file('a,b,"c,d', delimiter=',', quotechar='"') 131 | [['a', 'b', 'c,d']] 132 | >>> parse_file('a,b,"c,d\\n', delimiter=',', quotechar='"') 133 | [['a', 'b', 'c,d\\n']] 134 | 135 | Single column: 136 | 137 | >>> parse_file('a\\rb\\rc\\n') 138 | [['a'], ['b'], ['c']] 139 | 140 | These tests illustrate a difference with the Python parser, which in this 141 | case would return ``[['a', 'abc', 'd']]``. 142 | 143 | >>> parse_file('a,"ab"c,d', delimiter=',', quotechar='') 144 | [['a', '"ab"c', 'd']] 145 | >>> parse_file('a,"ab"c,d', delimiter=',', quotechar='"') 146 | [['a', '"ab"c', 'd']] 147 | 148 | 149 | Notes 150 | ----- 151 | 152 | (1) We only interpret the escape character if it precedes the provided 153 | delimiter, quotechar, or itself. Otherwise, the escape character does not 154 | serve any purpose, and should not be dropped automatically. 155 | 156 | (2) For some reason the Python test suite places this escape character 157 | *inside* the preceding quoted block. This seems counterintuitive and 158 | incorrect and thus this behavior has not been duplicated. 
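To make note (1) above concrete, here is a minimal comparison with Python's built-in ``csv`` module (for illustration only; it is not one of the doctests above):

```python
# The standard library always drops the escape character, even when it escapes
# nothing; parse_file above keeps it in that case and returns [['a', '|b', 'c']].
import csv
import io

rows = list(
    csv.reader(io.StringIO("a,|b,c"), delimiter=",", quotechar='"', escapechar="|")
)
print(rows)  # [['a', 'b', 'c']]
```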
159 | 160 | """ 161 | if not dialect is None: 162 | delimiter = dialect.delimiter if delimiter is None else delimiter 163 | quotechar = dialect.quotechar if quotechar is None else quotechar 164 | escapechar = dialect.escapechar if escapechar is None else escapechar 165 | 166 | quote_cond = lambda c, q: q and c.startswith(q) and c.endswith(q) 167 | 168 | in_quotes = False 169 | in_escape = False 170 | rows = [] 171 | i = 0 172 | row = [] 173 | field = "" 174 | end_row = False 175 | end_field = False 176 | s = None 177 | while i < len(S): 178 | s = S[i] 179 | if s == quotechar: 180 | if in_escape: 181 | in_escape = False 182 | elif not in_quotes: 183 | in_quotes = True 184 | else: 185 | if i + 1 < len(S) and S[i + 1] == quotechar: 186 | i += 1 187 | else: 188 | in_quotes = False 189 | field += s 190 | elif s in ["\r", "\n"]: 191 | if in_quotes: 192 | field += s 193 | elif field == "" and row == []: 194 | pass 195 | else: 196 | end_row = True 197 | end_field = True 198 | elif s == delimiter: 199 | if in_escape: 200 | in_escape = False 201 | field += s 202 | elif in_quotes: 203 | field += s 204 | else: 205 | end_field = True 206 | elif s == escapechar: 207 | if in_escape: 208 | field += s 209 | in_escape = False 210 | else: 211 | in_escape = True 212 | else: 213 | if in_escape: 214 | field += escapechar 215 | in_escape = False 216 | field += s 217 | 218 | if end_field: 219 | if quote_cond(field, quotechar): 220 | field = field[1:-1] 221 | row.append(field) 222 | field = "" 223 | end_field = False 224 | 225 | if end_row: 226 | rows.append(row) 227 | row = [] 228 | end_row = False 229 | 230 | i += 1 231 | 232 | if quote_cond(field, quotechar): 233 | field = field[1:-1] 234 | elif in_quotes: 235 | if field.startswith(quotechar): 236 | field = field[1:] 237 | s = "" 238 | if not s in ["\r", "\n", None]: 239 | row.append(field) 240 | rows.append(row) 241 | 242 | return rows 243 | -------------------------------------------------------------------------------- /scripts/detection/lib/types/rudi_types.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | 5 | Rudimentary types, used as a first pass to detect cell types given a potential 6 | delimiter. 7 | 8 | Potentially add (small reward?): 9 | 10 | - Latitude and longitude 11 | - Alternative date(time) formats: 12 | x 2009-01-02T00:00 13 | x 18/10/2014 14 | x 04/07/11 15 | - 26-Feb 16 | - 10/12/2015 HH:MM 17 | - 10-Jul-12 18 | - Dec-13 19 | - File sizes and bandwidth speed 20 | - Unix Paths 21 | x Currency (\p{Sc} + float) 22 | 23 | 24 | Notes: 25 | 26 | - Testing dates with Maya or Pendulum might work, but I got some false 27 | positives such as "T2P" being interpreted as a time. 28 | 29 | - Maybe check out Moment.js? Many datetime formats for many locales. This 30 | might be overkill for a "rudimentary type guess" though. 31 | 32 | x We can make this faster by compiling the regexes. 33 | 34 | Should we consider a type hierarchy? 
Some urls (www.xxx.yyy) are also strings 35 | 36 | Author: Gertjan van den Burg 37 | 38 | """ 39 | 40 | import regex 41 | import sys 42 | 43 | 44 | STRIP_WHITESPACE = True 45 | TO_CHECK = [] 46 | CHECK_ALL = False 47 | 48 | # Used this site: https://unicode-search.net/unicode-namesearch.pl 49 | SPECIALS_ALLOWED = [ 50 | # Periods 51 | "\u002e", 52 | "\u06d4", 53 | "\u3002", 54 | "\ufe52", 55 | "\uff0e", 56 | "\uff61", 57 | # Parentheses 58 | "\u0028", 59 | "\u0029", 60 | "\u27ee", 61 | "\u27ef", 62 | "\uff08", 63 | "\uff09", 64 | # Question marks 65 | "\u003F", 66 | "\u00BF", 67 | "\u037E", 68 | "\u055E", 69 | "\u061F", 70 | "\u1367", 71 | "\u1945", 72 | "\u2047", 73 | "\u2048", 74 | "\u2049", 75 | "\u2CFA", 76 | "\u2CFB", 77 | "\u2E2E", 78 | "\uA60F", 79 | "\uA6F7", 80 | "\uFE16", 81 | "\uFE56", 82 | "\uFF1F", 83 | chr(69955), # chakma question mark 84 | chr(125279), # adlam initial question mark 85 | # Exclamation marks 86 | "\u0021", 87 | "\u00A1", 88 | "\u01C3", 89 | "\u055C", 90 | "\u07F9", 91 | "\u109F", 92 | "\u1944", 93 | "\u203C", 94 | "\u2048", 95 | "\u2049", 96 | "\uAA77", 97 | "\uFE15", 98 | "\uFE57", 99 | "\uFF01", 100 | chr(125278), # adlam initial exclamation mark 101 | ] 102 | 103 | PATTERNS = { 104 | "number_1": regex.compile( 105 | "(?=[+-\.\d])[+-]?(?:0|[1-9]\d*)?(((?P\.)?(?(dot)(?P\d*(\d+[eE][+-]?\d+)?)|(?P([eE][+-]?\d+)?)))|((?P,)?(?(comma)(?P\d+(\d+[eE][+-]?\d+)?)|(?P([eE][+-]?\d+)?))))" 106 | ), 107 | "number_2": regex.compile("[+-]?(?:[1-9]|[1-9]\d{0,2})(?:\,\d{3})+\.\d*"), 108 | "number_3": regex.compile("[+-]?(?:[1-9]|[1-9]\d{0,2})(?:\.\d{3})+\,\d*"), 109 | "url": regex.compile( 110 | "(?:(?:[A-Za-z]{3,9}:(?:\/\/)?)(?:[-;:&=\+\$,\w]+@)?[A-Za-z0-9.-]+|(?:www.|[-;:&=\+\$,\w]+@)[A-Za-z0-9.-]+)(?:(?:\/[\+~%\/.\w\-_]*)?\??(?:[-\+=&;%@.\w_]*)#?(?:[\w]*))?" 
111 | ), 112 | "email": regex.compile( 113 | r"(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)" 114 | ), 115 | "unicode_alphanum": regex.compile( 116 | "(\p{N}+\p{L}+[\p{N}\p{L}\ " 117 | + regex.escape("".join(SPECIALS_ALLOWED)) 118 | + "]*|\p{L}+[\p{N}\p{L}\ " 119 | + regex.escape("".join(SPECIALS_ALLOWED)) 120 | + "]+)" 121 | ), 122 | "time_hhmmss": regex.compile( 123 | "(0[0-9]|1[0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])" 124 | ), 125 | "time_hhmm": regex.compile("(0[0-9]|1[0-9]|2[0-3]):([0-5][0-9])"), 126 | "time_HHMM": regex.compile("(0[0-9]|1[0-9]|2[0-3])([0-5][0-9])"), 127 | "time_HH": regex.compile("(0[0-9]|1[0-9]|2[0-3])([0-5][0-9])"), 128 | "time_hmm": regex.compile("([0-9]|1[0-9]|2[0-3]):([0-5][0-9])"), 129 | "currency": regex.compile("\p{Sc}\s?(.*)"), 130 | "unix_path": regex.compile( 131 | "[\/~]{1,2}(?:[a-zA-Z0-9\.]+(?:[\/]{1,2}))+(?:[a-zA-Z0-9\.]+)" 132 | ), 133 | } 134 | 135 | 136 | def load_date_patterns(): 137 | year2 = "(?:\d{2})" 138 | year4 = "(?:[12]\d{3})" 139 | 140 | month_leading = "(?:0[1-9]|1[0-2])" 141 | month_sparse = "(?:[1-9]|1[0-2])" 142 | 143 | day_leading = "(?:0[1-9]|[12]\d|3[01])" 144 | day_sparse = "(?:[1-9]|[12]\d|3[01])" 145 | 146 | sep = "[-/\.\ ]" 147 | 148 | counter = 0 149 | for year in [year2, year4]: 150 | for month in [month_leading, month_sparse]: 151 | for day in [day_leading, day_sparse]: 152 | fmt = {"year": year, "month": month, "day": day, "sep": sep} 153 | 154 | pat_1 = "{year}{sep}{month}{sep}{day}".format(**fmt) 155 | pat_2 = "{day}{sep}{month}{sep}{year}".format(**fmt) 156 | pat_3 = "{month}{sep}{day}{sep}{year}".format(**fmt) 157 | 158 | pat_cn = "{year}年{month}月{day}日".format(**fmt) 159 | pat_ko = "{year}년{month}월{day}일".format(**fmt) 160 | 161 | for pattern in [pat_1, pat_2, pat_3, pat_cn, pat_ko]: 162 | PATTERNS["date_%i" % counter] = regex.compile(pattern) 163 | counter += 1 164 | 165 | # These should be allowed as dates, but are also numbers. 166 | for year in [year2, year4]: 167 | fmt = { 168 | "year": year, 169 | "month": month_leading, 170 | "day": day_leading, 171 | "sep": "", 172 | } 173 | pat_1 = "{year}{sep}{month}{sep}{day}".format(**fmt) 174 | pat_2 = "{day}{sep}{month}{sep}{year}".format(**fmt) 175 | pat_3 = "{month}{sep}{day}{sep}{year}".format(**fmt) 176 | 177 | for pattern in [pat_1, pat_2, pat_3, pat_cn]: 178 | PATTERNS["date_%i" % counter] = regex.compile(pattern) 179 | counter += 1 180 | 181 | 182 | # TODO Ugly to do this here, but this is research code... 183 | 184 | load_date_patterns() 185 | 186 | 187 | def test_with_regex(cell, patname): 188 | # Test if cell *fully* matches reg (e.g. entire cell is number, maybe allow 189 | # stripping of leading/trailing spaces) 190 | if STRIP_WHITESPACE: 191 | cell = cell.strip() 192 | pat = PATTERNS.get(patname, None) 193 | match = pat.fullmatch(cell) 194 | return match is not None 195 | 196 | 197 | def test_number(cell): 198 | # NOTE: This is more general than trying to coerce to float(), because it 199 | # allows use of the comma as radix point. 
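# For example, "1,5" (comma as radix point) is accepted as a number by the
# patterns above, whereas float("1,5") would raise a ValueError.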
200 | if cell == "": 201 | return False 202 | if test_with_regex(cell, "number_1"): 203 | return True 204 | if test_with_regex(cell, "number_2"): 205 | return True 206 | if test_with_regex(cell, "number_3"): 207 | return True 208 | return False 209 | 210 | 211 | def test_url_or_email(cell): 212 | return test_with_regex(cell, "url") or test_with_regex(cell, "email") 213 | 214 | 215 | def test_unicode_alphanum(cell): 216 | # TODO: I'm not sure if it's desirable to allow alphanumeric cells, because 217 | # it's not clear if they include "junk" cells due to incorrect delimiter 218 | # (think: space). Maybe it's better to have only character cells? 219 | # NOTE: This function assumes that number and url are already excluded. 220 | 221 | return test_with_regex(cell, "unicode_alphanum") 222 | 223 | 224 | def test_date(cell): 225 | if test_number(cell): 226 | return False 227 | 228 | for patname in PATTERNS: 229 | if patname.startswith("date_"): 230 | if test_with_regex(cell, patname): 231 | return True 232 | return False 233 | 234 | 235 | def test_time(cell): 236 | # HH:MM:SS, HH:MM, or H:MM 237 | return ( 238 | test_with_regex(cell, "time_hmm") 239 | or test_with_regex(cell, "time_hhmm") 240 | or test_with_regex(cell, "time_hhmmss") 241 | ) 242 | 243 | 244 | def test_empty(cell): 245 | if STRIP_WHITESPACE: 246 | cell = cell.strip() 247 | return cell == "" 248 | 249 | 250 | def test_percentage(cell): 251 | cell = cell.strip() 252 | return cell.endswith("%") and test_number(cell.rstrip("%")) 253 | 254 | 255 | def test_currency(cell): 256 | if STRIP_WHITESPACE: 257 | cell = cell.strip() 258 | pat = PATTERNS.get("currency", None) 259 | m = pat.fullmatch(cell) 260 | if m is None: 261 | return False 262 | grp = m.group(1) 263 | if not test_number(grp): 264 | return False 265 | return True 266 | 267 | 268 | def test_datetime(cell): 269 | # Takes care of cells with '[date] [time]' and '[date]T[time]' (iso) 270 | if " " in cell: 271 | parts = cell.split(" ") 272 | if len(parts) > 2: 273 | return False 274 | return test_date(parts[0]) and test_time(parts[1]) 275 | elif "T" in cell: 276 | parts = cell.split("T") 277 | if len(parts) > 2: 278 | return False 279 | isdate = test_date(parts[0]) 280 | if not isdate: 281 | return False 282 | # [date]T[time] 283 | if test_time(parts[1]): 284 | return True 285 | # [date]T[time][+-][time] 286 | if "+" in parts[1]: 287 | subparts = parts[1].split("+") 288 | istime1 = test_time(subparts[0]) 289 | istime2 = test_time(subparts[1]) 290 | if not istime1: 291 | return False 292 | if istime2: 293 | return True 294 | if test_with_regex(subparts[1], "time_HHMM"): 295 | return True 296 | if test_with_regex(subparts[1], "time_HH"): 297 | return True 298 | elif "-" in parts[1]: 299 | subparts = parts[1].split("-") 300 | istime1 = test_time(subparts[0]) 301 | istime2 = test_time(subparts[1]) 302 | if not istime1: 303 | return False 304 | if istime2: 305 | return True 306 | if test_with_regex(subparts[1], "time_HHMM"): 307 | return True 308 | if test_with_regex(subparts[1], "time_HH"): 309 | return True 310 | return False 311 | 312 | 313 | def test_nan(cell): 314 | if STRIP_WHITESPACE: 315 | cell = cell.strip() 316 | # other forms (na and nan) are caught by unicode_alphanum 317 | if cell.lower() == "n/a": 318 | return True 319 | return False 320 | 321 | 322 | def eval_types(cell, break_away=True): 323 | type_tests = [ 324 | ("empty", test_empty), 325 | ("url_or_email", test_url_or_email), 326 | ("number", test_number), 327 | ("time", test_time), 328 | ("percentage", 
test_percentage), 329 | ("currency", test_currency), 330 | ("unicode_alphanum", test_unicode_alphanum), 331 | ("nan", test_nan), 332 | ("date", test_date), 333 | ("datetime", test_datetime), 334 | ] 335 | 336 | detected = [] 337 | for name, func in type_tests: 338 | if func(cell): 339 | detected.append(name) 340 | if break_away: 341 | break 342 | 343 | if len(detected) > 1: 344 | print( 345 | "Type tests aren't mutually exclusive!\nCell: %r\nTypes: %r" 346 | % (cell, detected), 347 | file=sys.stderr, 348 | ) 349 | raise ValueError 350 | if len(detected) == 0: 351 | return None 352 | return detected[0] 353 | -------------------------------------------------------------------------------- /scripts/detection/hypo.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # 3 | # Author: G.J.J. van den Burg 4 | # Copyright (c) 2018 - The Alan Turing Institute 5 | # License: See the LICENSE file. 6 | # 7 | 8 | library(devtools) 9 | library(rjson) 10 | 11 | # load our local version of hypoparsr 12 | match <- grep("--file=", commandArgs(trailingOnly=F)) 13 | this.path <- normalizePath(sub("--file=", "", commandArgs(trailingOnly=F)[match])) 14 | this.dir <- dirname(this.path) 15 | hypoparsr.dir <- paste(this.dir, "/lib/hypoparsr", sep="") 16 | load_all(hypoparsr.dir, export_all=F) 17 | 18 | printf <- function(...) invisible(cat(sprintf(...))); 19 | fprintf <- function(file, ...) invisible(cat(sprintf(...), file=file)) 20 | 21 | #' Replacement for R's ridiculous strsplit that drops empties. 22 | my.strsplit <- function(string, delim) { 23 | out <- strsplit(string, delim) 24 | if (substr(string, nchar(string), nchar(string)) == delim) 25 | out <- c(out, "") 26 | return(out) 27 | } 28 | 29 | real.quotechar <- function(filename, best, delim, rowsep, quote.method) 30 | { 31 | # Since HypoParsr doesn't reliably return the quote character, we here try 32 | # to reverse engineer what they do to figure out what the quote character 33 | # is that they actually use. 
34 | 35 | encoding <- strsplit(names(best$confidence[6]), '\n')[[1]][3] 36 | text <- readr::read_file(filename, locale=readr::locale(encoding=encoding)) 37 | text <- iconv(text) 38 | 39 | if (rowsep == "E") 40 | regex.rowsep <- "\r\n" 41 | else if (rowsep == "N") 42 | regex.rowsep <- "(?>> masked_by_quotechar('A"B&C"A', '"', '', '&') 39 | True 40 | >>> masked_by_quotechar('A"B&C"A&A', '"', '', '&') 41 | False 42 | >>> masked_by_quotechar('A|"B&C"A', '"', '|', '&') 43 | False 44 | >>> masked_by_quotechar('A"B"C', '"', '', '') 45 | False 46 | """ 47 | if test_char == "": 48 | return False 49 | escape_next = False 50 | in_quotes = False 51 | i = 0 52 | while i < len(S): 53 | s = S[i] 54 | if s == quotechar: 55 | if escape_next: 56 | i += 1 57 | continue 58 | if not in_quotes: 59 | in_quotes = True 60 | else: 61 | if i + 1 < len(S) and S[i + 1] == quotechar: 62 | i += 1 63 | else: 64 | in_quotes = False 65 | elif s == test_char and not in_quotes: 66 | return False 67 | elif s == escapechar: 68 | escape_next = True 69 | i += 1 70 | return True 71 | 72 | 73 | def get_potential_delimiters(data, encoding): 74 | delims = set() 75 | c = Counter(data) 76 | for delim, _ in c.most_common(): 77 | if ( 78 | can_be_delim_unicode(delim, encoding=encoding) 79 | and not delim in BLOCKED_DELIMS 80 | ): 81 | delims.add(delim) 82 | delims.add("") 83 | return delims 84 | 85 | 86 | def get_cells(data, dialect): 87 | rows = parse_file(data, dialect=dialect) 88 | all_cells = [] 89 | for row in rows: 90 | all_cells.extend(row) 91 | return all_cells 92 | 93 | 94 | def make_base_abstraction(S, dialect): 95 | stack = "" 96 | escape_next = False 97 | for s in S: 98 | if s in ["\r", "\n"]: 99 | if not stack.endswith("R"): 100 | stack += "R" 101 | elif s == dialect.delimiter: 102 | if escape_next: 103 | stack += "C" 104 | escape_next = False 105 | else: 106 | stack += "D" 107 | elif s == dialect.quotechar: 108 | if escape_next: 109 | stack += "C" 110 | escape_next = False 111 | else: 112 | stack += "Q" 113 | elif s == dialect.escapechar: 114 | if escape_next: 115 | if not stack.endswith("C"): 116 | stack += "C" 117 | escape_next = False 118 | else: 119 | escape_next = True 120 | else: 121 | if escape_next: 122 | escape_next = False 123 | if not stack.endswith("C"): 124 | stack += "C" 125 | 126 | return stack 127 | 128 | 129 | def merge_with_quotechar(S, dialect): 130 | in_quotes = False 131 | i = 0 132 | quote_pairs = [] 133 | while i < len(S): 134 | s = S[i] 135 | if not s == "Q": 136 | i += 1 137 | continue 138 | 139 | if not in_quotes: 140 | in_quotes = True 141 | begin_quotes = i 142 | else: 143 | if i + 1 < len(S) and S[i + 1] == "Q": 144 | i += 1 145 | else: 146 | end_quotes = i 147 | quote_pairs.append((begin_quotes, end_quotes)) 148 | in_quotes = False 149 | i += 1 150 | 151 | # replace quoted blocks by C 152 | Sl = list(S) 153 | for begin, end in quote_pairs: 154 | for i in range(begin, end + 1): 155 | Sl[i] = "C" 156 | S = "".join(Sl) 157 | 158 | return S 159 | 160 | 161 | def strip_trailing(abstract): 162 | while abstract.endswith("R"): 163 | abstract = abstract[:-1] 164 | return abstract 165 | 166 | 167 | def fill_empties(abstract): 168 | while "DD" in abstract: 169 | abstract = abstract.replace("DD", "DCD") 170 | 171 | while "DR" in abstract: 172 | abstract = abstract.replace("DR", "DCR") 173 | 174 | while "RD" in abstract: 175 | abstract = abstract.replace("RD", "RCD") 176 | 177 | while "CC" in abstract: 178 | abstract = abstract.replace("CC", "C") 179 | 180 | if abstract.startswith("D"): 181 | abstract = 
"C" + abstract 182 | 183 | if abstract.endswith("D"): 184 | abstract += "C" 185 | 186 | return abstract 187 | 188 | 189 | def filter_urls(data): 190 | pat = "(?:(?:[A-Za-z]{3,9}:(?:\/\/)?)(?:[-;:&=\+\$,\w]+@)?[A-Za-z0-9.-]+|(?:www.|[-;:&=\+\$,\w]+@)[A-Za-z0-9.-]+)(?:(?:\/[\+~%\/.\w\-_]*)?\??(?:[-\+=&;%@.\w_]*)#?(?:[\w]*))?" 191 | url_idxs = [] 192 | for match in re.finditer(pat, data): 193 | url_idxs.append(match.span()) 194 | Sl = list(data) 195 | for begin, end in url_idxs: 196 | for i in range(begin, end): 197 | Sl[i] = "U" 198 | return "".join(Sl) 199 | 200 | 201 | def make_abstraction(data, dialect): 202 | """ 203 | Make the abstract representation of a CSV file. 204 | 205 | Tests 206 | ----- 207 | 208 | >>> make_abstraction('A,B,C', Dialect(delimiter=',', quotechar='', escapechar='')) 209 | 'CDCDC' 210 | >>> make_abstraction('A,\\rA,A,A\\r', Dialect(delimiter=',', quotechar='', escapechar='')) 211 | 'CDCRCDCDC' 212 | >>> make_abstraction('a,a,\\n,a,a\\ra,a,a\\r\\n', Dialect(delimiter=',', quotechar='', escapechar='')) 213 | 'CDCDCRCDCDCRCDCDC' 214 | >>> make_abstraction('a,"bc""d""e""f""a",\\r\\n', Dialect(delimiter=',', quotechar='"', escapechar='')) 215 | 'CDCDC' 216 | >>> make_abstraction('a,"bc""d"",|"f|""', Dialect(delimiter=',', quotechar='"', escapechar='|')) 217 | 'CDC' 218 | >>> make_abstraction(',,,', Dialect(delimiter=',', quotechar='', escapechar='')) 219 | 'CDCDCDC' 220 | >>> make_abstraction(',"",,', Dialect(delimiter=',', quotechar='"', escapechar='')) 221 | 'CDCDCDC' 222 | >>> make_abstraction(',"",,\\r\\n', Dialect(delimiter=',', quotechar='"', escapechar='')) 223 | 'CDCDCDC' 224 | 225 | Escape char: 226 | 227 | >>> make_abstraction('A,B|,C', Dialect(delimiter=',', quotechar='', escapechar='|')) 228 | 'CDC' 229 | >>> make_abstraction('A,"B,C|"D"', Dialect(delimiter=',', quotechar='"', escapechar='|')) 230 | 'CDC' 231 | >>> make_abstraction('a,|b,c', Dialect(delimiter=',', quotechar='', escapechar='|')) 232 | 'CDCDC' 233 | >>> make_abstraction('a,b|,c', Dialect(delimiter=',', quotechar='', escapechar='|')) 234 | 'CDC' 235 | >>> make_abstraction('a,"b,c|""', Dialect(delimiter=',', quotechar='"', escapechar='|')) 236 | 'CDC' 237 | >>> make_abstraction('a,b||c', Dialect(delimiter=',', quotechar='', escapechar='|')) 238 | 'CDC' 239 | >>> make_abstraction('a,"b|"c||d|"e"', Dialect(delimiter=',', quotechar='"', escapechar='|')) 240 | 'CDC' 241 | >>> make_abstraction('a,"b|"c||d","e"', Dialect(delimiter=',', quotechar='"', escapechar='|')) 242 | 'CDCDC' 243 | 244 | """ 245 | 246 | A = make_base_abstraction(data, dialect) 247 | A = merge_with_quotechar(A, dialect) 248 | A = fill_empties(A) 249 | A = strip_trailing(A) 250 | 251 | return A 252 | 253 | 254 | def is_clean(cell): 255 | return not (eval_types(cell) is None) 256 | 257 | 258 | def get_potential_dialects(data, encoding): 259 | """ 260 | We consider as escape characters those characters for which 261 | is_potential_escapechar() is True and that occur at least once before a 262 | quote character or delimiter in the dialect. 263 | 264 | One may wonder if self-escaping is an issue here (i.e. "\\\\", two times 265 | backslash). It is not. In a file where a single backslash is desired and 266 | escaping with a backslash is used, then it only makes sense to do this in a 267 | file where the backslash is already used as an escape character (in which 268 | case we include it). If it is never used as escape for the delimiter or 269 | quotechar, then it is not necessary to self-escape. 
270 | """ 271 | delims = get_potential_delimiters(data, encoding) 272 | quotechars = get_potential_quotechars(data) 273 | escapechars = {} 274 | 275 | for delim, quotechar in itertools.product(delims, quotechars): 276 | escapechars[(delim, quotechar)] = set([""]) 277 | 278 | for u, v in pairwise(data): 279 | if not is_potential_escapechar(u, encoding): 280 | continue 281 | for delim, quotechar in itertools.product(delims, quotechars): 282 | if v == delim or v == quotechar: 283 | escapechars[(delim, quotechar)].add(u) 284 | 285 | dialects = [] 286 | for delim in delims: 287 | for quotechar in quotechars: 288 | for escapechar in escapechars[(delim, quotechar)]: 289 | if masked_by_quotechar(data, quotechar, escapechar, delim): 290 | continue 291 | d = Dialect(delim, quotechar, escapechar) 292 | dialects.append(d) 293 | return dialects 294 | 295 | 296 | def determine_dqr(filename, score_func, verbose=False, do_break_ties=True): 297 | encoding = get_encoding(filename) 298 | data = load_file(filename, encoding=encoding) 299 | if data is None: 300 | return DetectorResult( 301 | status=Status.SKIP, status_msg=StatusMsg.UNREADABLE 302 | ) 303 | 304 | # fix-up to replace urls by a character, this removes many potential 305 | # delimiters that only occur in urls and cause noise. 306 | dialects = get_potential_dialects(filter_urls(data), encoding) 307 | if not dialects: 308 | return DetectorResult( 309 | status=Status.FAIL, status_msg=StatusMsg.NO_DIALECTS 310 | ) 311 | 312 | if verbose: 313 | print( 314 | "Length of data: %i\n" 315 | "Considering %i dialects\n" % (len(data), len(dialects)) 316 | ) 317 | 318 | scores = score_func(data, dialects, verbose=verbose) 319 | 320 | score_sort = sorted( 321 | [(scores[dialect], dialect) for dialect in scores], 322 | key=lambda x: x[0], 323 | reverse=True, 324 | ) 325 | 326 | max_prob = score_sort[0][0] 327 | dialects_with_score = [x[1] for x in score_sort if x[0] == max_prob] 328 | 329 | if len(dialects_with_score) > 1: 330 | if do_break_ties: 331 | res = break_ties(data, dialects_with_score) 332 | else: 333 | res = None 334 | else: 335 | res = dialects_with_score[0] 336 | 337 | if res is None: 338 | if verbose: 339 | print("More than 1 parameter set!") 340 | for d in dialects_with_score: 341 | print(d) 342 | return DetectorResult( 343 | status=Status.FAIL, status_msg=StatusMsg.MULTIPLE_ANSWERS 344 | ) 345 | 346 | res = DetectorResult(dialect=res, status=Status.OK) 347 | 348 | return res 349 | -------------------------------------------------------------------------------- /scripts/analysis/figure_violins.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Creating violin plots in PGFplots (two-sided version) 6 | 7 | Based on: 8 | https://matplotlib.org/_modules/matplotlib/axes/_axes.html#Axes.violinplot 9 | https://github.com/statsmodels/statsmodels/blob/master/statsmodels/graphics/boxplots.py 10 | 11 | Author: Gertjan van den Burg 12 | 13 | """ 14 | 15 | import argparse 16 | import json 17 | import numpy as np 18 | import os 19 | import math 20 | 21 | from scipy.stats import gaussian_kde 22 | 23 | from .core import ( 24 | CORPUS_NAMES, 25 | ORDERED_DETECTORS, 26 | check_detectors, 27 | clean_detector_name, 28 | ) 29 | from .latex import build_latex_doc 30 | 31 | # Color 32 | # COLOR_LEFT = "B40204" 33 | # COLOR_RIGHT = "00AABA" 34 | # COLOR_MINMAX = "FF0000" 35 | 36 | # Grayscale 37 | COLOR_LEFT = "101010" 38 | COLOR_RIGHT = "878787" 39 | COLOR_MINMAX = 
"000000" 40 | 41 | USE_LOG = True 42 | 43 | 44 | def transform(x): 45 | if USE_LOG: 46 | return math.log(x, 10) 47 | return x 48 | 49 | 50 | def untransform(x): 51 | if USE_LOG: 52 | return pow(10, x) 53 | return x 54 | 55 | 56 | def _interpolate(coords, values, x): 57 | """ Return the estimated value of x by interpolating the nearest neighbors 58 | in coords. It is assumed coords is sorted and is of the same length as 59 | values. 60 | """ 61 | if x in coords: 62 | return values[coords == x] 63 | below_idx, above_idx = None, None 64 | for idx, c in enumerate(coords): 65 | if c < x: 66 | below_idx = idx 67 | if c > x: 68 | above_idx = idx 69 | break 70 | avg_val = (values[below_idx] + values[above_idx]) / 2 71 | return avg_val 72 | 73 | 74 | def _single_violin_data(pos, pos_data, width, side, plot_opts): 75 | # Based almost entirely on_single_violin from statsmodels 76 | bw_factor = plot_opts.get("bw_factor", None) 77 | 78 | def _violin_range(pos_data, plot_opts): 79 | """Return array with correct range, with which violins can be plotted.""" 80 | cutoff = plot_opts.get("cutoff", False) 81 | cutoff_type = plot_opts.get("cutoff_type", "std") 82 | cutoff_val = plot_opts.get("cutoff_val", 1.5) 83 | 84 | s = 0.0 85 | if not cutoff: 86 | if cutoff_type == "std": 87 | s = cutoff_val * np.std(pos_data) 88 | else: 89 | s = cutoff_val 90 | 91 | x_lower = kde.dataset.min() - s 92 | x_upper = kde.dataset.max() + s 93 | return np.linspace(x_lower, x_upper, 501) 94 | 95 | pos_data = np.asarray(pos_data) 96 | kde = gaussian_kde(pos_data, bw_method=bw_factor) 97 | 98 | xvals = _violin_range(pos_data, plot_opts) 99 | violin = kde.evaluate(xvals) 100 | 101 | # NOTE: we removed normalization by violin.max() 102 | violin = width * violin 103 | 104 | if side == "both": 105 | envelope_l, envelope_r = (-violin + pos, violin + pos) 106 | elif side == "right": 107 | envelope_l, envelope_r = (np.zeros_like(violin) + pos, violin + pos) 108 | elif side == "left": 109 | envelope_l, envelope_r = (-violin + pos, np.zeros_like(violin) + pos) 110 | else: 111 | msg = "`side` parameter should be one of {'left', 'right', 'both'}." 
112 | raise ValueError(msg) 113 | 114 | return xvals, envelope_l, envelope_r 115 | 116 | 117 | def get_median_coords(coords, left, right, median): 118 | data = {} 119 | data["xleft"] = _interpolate(coords, left, median) 120 | data["xright"] = _interpolate(coords, right, median) 121 | data["yleft"] = median 122 | data["yright"] = median 123 | return data 124 | 125 | 126 | def get_extrema_coords(pos, pos_data, width, side): 127 | # min 128 | xleft = pos 129 | xleft -= width if side in ["left", "both"] else 0 130 | xright = pos 131 | xright += width if side in ["right", "both"] else 0 132 | yleft = yright = np.min(pos_data) 133 | min_coords = { 134 | "xleft": xleft, 135 | "xright": xright, 136 | "yleft": yleft, 137 | "yright": yright, 138 | } 139 | # max 140 | yleft = yright = np.max(pos_data) 141 | max_coords = { 142 | "xleft": xleft, 143 | "xright": xright, 144 | "yleft": yleft, 145 | "yright": yright, 146 | } 147 | return min_coords, max_coords 148 | 149 | 150 | def generate_violin_data( 151 | summary_data, side="both", showmedian=True, showextrema=True, plot_opts={} 152 | ): 153 | 154 | check_detectors(summary_data["runtimes"].keys()) 155 | 156 | dataset = list( 157 | map( 158 | np.asarray, 159 | [ 160 | list(map(transform, summary_data["runtimes"][key])) 161 | for key in ORDERED_DETECTORS 162 | ], 163 | ) 164 | ) 165 | 166 | positions = np.arange(len(dataset)) + 1 167 | pos_span = np.max(positions) - np.min(positions) 168 | width = np.min( 169 | [0.15 * np.max([pos_span, 1.]), plot_opts.get("violin_width", 0.8) / 2.] 170 | ) 171 | 172 | violin_data = [] 173 | for pos_data, pos, name in zip(dataset, positions, ORDERED_DETECTORS): 174 | xvals, envelope_l, envelope_r = _single_violin_data( 175 | pos, pos_data, width, side, plot_opts 176 | ) 177 | 178 | # return back to actual data 179 | xvals = np.array([untransform(x) for x in xvals]) 180 | pos_data = np.array([untransform(x) for x in pos_data]) 181 | 182 | data = { 183 | "name": name, 184 | "side": side, 185 | "xvals": xvals, 186 | "envelope_l": envelope_l, 187 | "envelope_r": envelope_r, 188 | } 189 | 190 | if showmedian: 191 | data["median"] = get_median_coords( 192 | xvals, envelope_l, envelope_r, np.median(pos_data) 193 | ) 194 | if showextrema: 195 | data["min"], data["max"] = get_extrema_coords( 196 | pos, pos_data, width / 3, side 197 | ) 198 | 199 | violin_data.append(data) 200 | 201 | return violin_data 202 | 203 | 204 | def generate_tex_for_line(xleft=0, yleft=0, xright=0, yright=0, linestyle=""): 205 | tex = "" 206 | tex += "\\addplot[%s] coordinates {%%\n" % linestyle 207 | tex += "(%.16f, %.16f)\n" % (xleft, yleft) 208 | tex += "(%.16f, %.16f)\n" % (xright, yright) 209 | tex += "};\n" 210 | return tex 211 | 212 | 213 | def generate_tex_for_violin( 214 | violin, edgecolor=None, edgethick=None, fillcolor=None, alpha=0.5 215 | ): 216 | name = violin["name"] + violin["side"] 217 | 218 | edgecolor = "none" if edgecolor is None else edgecolor 219 | edgethick = "" if edgethick is None else ", " + edgethick 220 | fillcolor = "fill=none" if fillcolor is None else fillcolor 221 | left_name, right_name = name + "Left", name + "Right" 222 | 223 | tex = "\\addplot [draw=%s %s, name path=%s] coordinates {%%\n" % ( 224 | edgecolor, 225 | edgethick, 226 | left_name, 227 | ) 228 | for xx, yy in zip(violin["envelope_l"], violin["xvals"]): 229 | tex += "(%.16f, %.16f)\n" % (xx, yy) 230 | tex += "};\n" 231 | tex += "\\addplot [draw=%s %s, name path=%s] coordinates {%%\n" % ( 232 | edgecolor, 233 | edgethick, 234 | right_name, 235 | ) 236 | for 
xx, yy in zip(violin["envelope_r"], violin["xvals"]): 237 | tex += "(%.16f, %.16f)\n" % (xx, yy) 238 | tex += "};\n" 239 | tex += "\\addplot [%s, opacity=%f] fill between [of=%s and %s];\n" % ( 240 | fillcolor, 241 | alpha, 242 | left_name, 243 | right_name, 244 | ) 245 | 246 | if "median" in violin: 247 | # linestyle = "dashed, dash pattern=on 2pt off 2pt" 248 | violin["median"]["linestyle"] = "densely dotted, thick, black" 249 | 250 | tex += generate_tex_for_line(**violin["median"]) 251 | if "min" in violin: 252 | violin["min"]["linestyle"] = "solid, ColorMinMax" 253 | tex += generate_tex_for_line(**violin["min"]) 254 | if "max" in violin: 255 | violin["max"]["linestyle"] = "solid, ColorMinMax" 256 | tex += generate_tex_for_line(**violin["max"]) 257 | 258 | return tex 259 | 260 | 261 | def generate_latex(violindata, legend_data, opacity=0.5): 262 | abbrev = [clean_detector_name(d) for d in ORDERED_DETECTORS] 263 | xtick = ",".join([str(i + 1) for i in range(len(abbrev))]) 264 | xticklabels = ",".join(abbrev) 265 | 266 | yrange = [pow(10, x) for x in [-6, -4, -2, 0, 2, 4]] 267 | ytick = ",".join([str(i) for i in yrange]) 268 | 269 | legend_entries = ", ".join( 270 | [CORPUS_NAMES.get(c) for c in legend_data["corpora"]] 271 | ) 272 | 273 | tex = ( 274 | "\\documentclass[preview=true]{standalone}\n" 275 | "\\pdfinfoomitdate=1\n" 276 | "\\pdftrailerid{}\n" 277 | "\\pdfsuppressptexinfo=1\n" 278 | "\\usepackage{tikz}\n" 279 | "\\usepackage{pgfplots}\n" 280 | "\\pgfplotsset{compat=1.16}\n" 281 | "\\usepgfplotslibrary{fillbetween}\n" 282 | "\\definecolor{ColorLeft}{HTML}{%s}\n" 283 | "\\definecolor{ColorRight}{HTML}{%s}\n" 284 | "\\definecolor{ColorMinMax}{HTML}{%s}\n" 285 | "\\begin{document}\n" 286 | "\\begin{tikzpicture}\n" 287 | "\\begin{semilogyaxis}[\n" 288 | "xtick={%s},\n" 289 | "xticklabels={%s},\n" 290 | "ytick={%s},\n" 291 | "ylabel={Runtime (s)},\n" 292 | "width=600pt,\n" 293 | "height=200pt,\n" 294 | "ymajorgrids,\n" 295 | "grid style={opacity=0.1},\n" 296 | "legend entries={%s},\n" 297 | "legend pos={south west},\n" 298 | "]\n" 299 | % ( 300 | COLOR_LEFT, 301 | COLOR_RIGHT, 302 | COLOR_MINMAX, 303 | xtick, 304 | xticklabels, 305 | ytick, 306 | legend_entries, 307 | ) 308 | ) 309 | 310 | tex += ( 311 | "\\addlegendimage{only marks, mark=square*, ColorLeft, opacity=%g}\n" 312 | % (opacity) 313 | ) 314 | tex += ( 315 | "\\addlegendimage{only marks, mark=square*, ColorRight, opacity=%g}\n" 316 | % (opacity) 317 | ) 318 | 319 | for corpus in violindata: 320 | for violin in violindata[corpus]: 321 | fillcolor = ( 322 | "ColorLeft" if violin["side"] == "left" else "ColorRight" 323 | ) 324 | tex += generate_tex_for_violin( 325 | violin, edgecolor="black", fillcolor=fillcolor, alpha=opacity 326 | ) 327 | 328 | tex += "\\end{semilogyaxis}\n" "\\end{tikzpicture}\n" "\\end{document}" 329 | return tex 330 | 331 | 332 | def create_twosided_violin(corpus_data, output_file): 333 | corpora = sorted(corpus_data.keys()) 334 | sides = ["left", "right"] 335 | assert len(corpora) == 2 336 | legend_data = {"corpora": corpora, "colors": [COLOR_LEFT, COLOR_RIGHT]} 337 | 338 | violindata = {} 339 | for corpus, side in zip(corpora, sides): 340 | violindata[corpus] = generate_violin_data( 341 | corpus_data[corpus], side=side, showmedian=True 342 | ) 343 | 344 | tex = generate_latex(violindata, legend_data) 345 | tex_file = os.path.splitext(output_file)[0] + ".tex" 346 | with open(tex_file, "w") as fid: 347 | fid.write(tex) 348 | build_latex_doc(tex, output_name=output_file) 349 | 350 | 351 | def parse_args(): 
352 | parser = argparse.ArgumentParser() 353 | parser.add_argument( 354 | "-o", dest="output", help="Output pdf file to write to", required=True 355 | ) 356 | parser.add_argument( 357 | "-s", 358 | dest="summaries", 359 | help="Summary file(s) with the results", 360 | required=True, 361 | nargs="+", 362 | ) 363 | return parser.parse_args() 364 | 365 | 366 | def main(): 367 | args = parse_args() 368 | all_data = {} 369 | for summary_file in args.summaries: 370 | with open(summary_file, "r") as fid: 371 | data = json.load(fid) 372 | all_data[data["corpus"]] = data 373 | 374 | create_twosided_violin(all_data, args.output) 375 | -------------------------------------------------------------------------------- /scripts/detection/human.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | This script should be opened within tmux and no other tmux sessions should be 6 | running. 7 | 8 | Author: Gertjan van den Burg 9 | Copyright (c) 2018 - The Alan Turing Institute 10 | License: See the LICENSE file. 11 | 12 | """ 13 | 14 | import json 15 | import libtmux 16 | import os 17 | import sys 18 | import time 19 | 20 | from common.encoding import get_encoding 21 | from common.escape import is_potential_escapechar 22 | from common.load import load_file 23 | from common.detector_result import DetectorResult, Dialect, Status, StatusMsg 24 | from common.utils import pairwise 25 | 26 | 27 | def has_quotechar(data): 28 | chars = set(data) 29 | if '"' in chars or "'" in chars or "~" in chars or "`" in chars: 30 | return True 31 | return False 32 | 33 | 34 | def get_quotechar_options(data): 35 | options = set() 36 | if '"' in data: 37 | options.add("q") 38 | if "'" in data: 39 | options.add("a") 40 | if "`" in data: 41 | options.add("b") 42 | if "~" in data: 43 | options.add("t") 44 | options.add("n") 45 | return options 46 | 47 | 48 | def get_escapechar_options(data, encoding, delim, quotechar): 49 | escapes = set() 50 | for u, v in pairwise(data): 51 | if not is_potential_escapechar(u, encoding): 52 | continue 53 | if v in [delim, quotechar] and not u in [delim, quotechar]: 54 | escapes.add(u) 55 | return escapes 56 | 57 | 58 | def ask_dqe( 59 | filename, 60 | data, 61 | encoding, 62 | ask_delim, 63 | ask_quotechar, 64 | ask_escapechar, 65 | old_res, 66 | less_pane, 67 | ): 68 | if not old_res is None: 69 | res = { 70 | "delimiter": old_res.get("delimiter", None), 71 | "quotechar": old_res.get("quotechar", None), 72 | "escapechar": old_res.get("escapechar", None), 73 | } 74 | else: 75 | res = {"delimiter": None, "quotechar": None, "escapechar": None} 76 | 77 | opened_vim = False 78 | opened_less = False 79 | 80 | note = None 81 | 82 | if ask_delim: 83 | less_pane.send_keys("less -f %s" % filename) 84 | opened_less = True 85 | prompt = "What is the delimiter? 
" 86 | while True: 87 | ans = input(prompt) 88 | if ans == "quit": 89 | less_pane.send_keys("q") 90 | opened_less = False 91 | less_pane.send_keys("exit") 92 | raise SystemExit 93 | if ans in ["vi", "vim"]: 94 | less_pane.send_keys("q") 95 | opened_less = False 96 | less_pane.send_keys("vim %s" % filename) 97 | opened_vim = True 98 | continue 99 | if ans in ["hltab", "hlt"]: 100 | less_pane.send_keys("/\\t") 101 | continue 102 | if ans in ["hlspace", "hls"]: 103 | less_pane.send_keys("/\\ ") 104 | continue 105 | if ans == "skip": 106 | if opened_less: 107 | less_pane.send_keys("q") 108 | elif opened_vim: 109 | less_pane.send_keys(":q") 110 | less_pane.clear() 111 | return None, note 112 | if ans == "note": 113 | note = input("Enter note: ").strip() 114 | continue 115 | if ans == "none": 116 | res["delimiter"] = None 117 | elif ans == "\\t": 118 | res["delimiter"] = "\t" 119 | elif len(ans.strip()) > 1: 120 | print("Only length 0 or 1 delimiters are allowed") 121 | continue 122 | else: 123 | res["delimiter"] = ans.rstrip("\n") 124 | break 125 | 126 | print("Delimiter: %r" % res["delimiter"]) 127 | 128 | if opened_vim: 129 | less_pane.send_keys(":q") 130 | opened_vim = False 131 | time.sleep(1) 132 | less_pane.send_keys("less -f %s" % filename) 133 | opened_less = True 134 | 135 | if ask_quotechar: 136 | if not opened_less: 137 | less_pane.send_keys("less -f %s" % filename) 138 | opened_less = True 139 | 140 | options = get_quotechar_options(data) 141 | if "q" in options: 142 | less_pane.send_keys('/"') 143 | less_pane.send_keys("gg", enter=False, suppress_history=False) 144 | less_pane.send_keys("n", enter=False, suppress_history=False) 145 | elif "a" in options: 146 | less_pane.send_keys("/'") 147 | less_pane.send_keys("gg", enter=False, suppress_history=False) 148 | less_pane.send_keys("n", enter=False, suppress_history=False) 149 | opt_str = "/".join(sorted(options)) 150 | prompt = "What is the quotation mark? 
[%s] " % opt_str 151 | while True: 152 | if list(options) == ["n"]: 153 | res["quotechar"] = None 154 | break 155 | ans = input(prompt) 156 | ans = ans.rstrip("\n") 157 | 158 | if ans == "quit": 159 | less_pane.send_keys("q") 160 | opened_less = False 161 | less_pane.send_keys("exit") 162 | raise SystemExit 163 | if ans in ["vi", "vim"]: 164 | less_pane.send_keys("q") 165 | opened_less = False 166 | less_pane.send_keys("vim %s" % filename) 167 | opened_vim = True 168 | continue 169 | if ans == "skip": 170 | if opened_less: 171 | less_pane.send_keys("q") 172 | elif opened_vim: 173 | less_pane.send_keys(":q") 174 | less_pane.clear() 175 | return None, note 176 | if ans == "note": 177 | note = input("Enter note: ").strip() 178 | continue 179 | if not ans.strip().lower() in options: 180 | print("Please try again.") 181 | continue 182 | if ans == "n": 183 | res["quotechar"] = None 184 | else: 185 | if not ans.upper() in ["Q", "A", "B", "T"]: 186 | raise ValueError("Unknown option: %s" % ans) 187 | res["quotechar"] = {"Q": '"', "A": "'", "B": "`", "T": 188 | "~"}[ans.upper()] 189 | break 190 | 191 | print("Quotechar: %r" % res["quotechar"]) 192 | 193 | if opened_vim: 194 | less_pane.send_keys(":q") 195 | opened_vim = False 196 | time.sleep(1) 197 | less_pane.send_keys("less -f %s" % filename) 198 | opened_less = True 199 | 200 | options = get_escapechar_options( 201 | data, encoding, res["delimiter"], res["quotechar"] 202 | ) 203 | if ask_escapechar: 204 | if not options: 205 | print("No escapechar options.") 206 | res["escapechar"] = "" 207 | else: 208 | if not opened_less: 209 | less_pane.send_keys("less -f %s" % filename) 210 | opened_less = True 211 | if "n" in options: 212 | raise ValueError("'n' shouldn't be an option in escapechars!") 213 | if len(options) == 1: 214 | if '\\' in options: 215 | less_pane.send_keys("/\\\\") 216 | less_pane.send_keys("gg", enter=False, suppress_history=False) 217 | less_pane.send_keys("n", enter=False, suppress_history=False) 218 | options.add("n") 219 | opt_str = "/".join(sorted(options)) 220 | prompt = "What is the escape character? 
[%s] " % opt_str 221 | while True: 222 | ans = input(prompt) 223 | ans = ans.strip("\n") 224 | if ans == "quit": 225 | less_pane.send_keys("q") 226 | opened_less = False 227 | less_pane.send_keys("exit") 228 | raise SystemExit 229 | if ans == "skip": 230 | if opened_less: 231 | less_pane.send_keys("q") 232 | less_pane.clear() 233 | return None, note 234 | if ans == "note": 235 | note = input("Enter note: ").strip() 236 | continue 237 | if not ans.strip() in options: 238 | print("Please try again") 239 | continue 240 | if ans == "n": 241 | res["escapechar"] = "" 242 | else: 243 | res["escapechar"] = ans 244 | break 245 | 246 | print("Escapechar: %r" % res["escapechar"]) 247 | 248 | if opened_less: 249 | less_pane.send_keys("q") 250 | less_pane.clear() 251 | return res, note 252 | 253 | 254 | def annotate_file(filename, less_pane, previous): 255 | print("") 256 | encoding = get_encoding(filename) 257 | data = load_file(filename, encoding=encoding) 258 | 259 | if previous: 260 | ask_delim = not "delimiter" in previous 261 | ask_quotechar = not "quotechar" in previous and has_quotechar(data) 262 | ask_escapechar = not "escapechar" in previous 263 | else: 264 | ask_delim = True 265 | ask_quotechar = has_quotechar(data) 266 | ask_escapechar = True 267 | 268 | print("Annotating file: %s" % filename) 269 | res, note = ask_dqe( 270 | filename, 271 | data, 272 | encoding, 273 | ask_delim, 274 | ask_quotechar, 275 | ask_escapechar, 276 | previous, 277 | less_pane, 278 | ) 279 | 280 | out = DetectorResult( 281 | detector="human", filename=filename, runtime=None, status=Status.OK 282 | ) 283 | if note: 284 | out.note = note 285 | 286 | if res is None: 287 | less_pane.send_keys("q") 288 | less_pane.clear() 289 | out.status = Status.SKIP 290 | out.status_msg = StatusMsg.HUMAN_SKIP 291 | return out 292 | 293 | if res["delimiter"] is None: 294 | res["delimiter"] = "" 295 | if res["quotechar"] is None: 296 | res["quotechar"] = "" 297 | 298 | out.dialect = Dialect.from_dict(res) 299 | 300 | return out 301 | 302 | 303 | def dump_result(output_file, res): 304 | with open(output_file, "a") as fid: 305 | fid.write(res.to_json() + "\n") 306 | 307 | 308 | def load_previous(output_file): 309 | previous = {} 310 | if not os.path.exists(output_file): 311 | return previous 312 | with open(output_file, "r") as fid: 313 | for line in fid.readlines(): 314 | record = json.loads(line.strip()) 315 | previous[record["filename"]] = record 316 | return previous 317 | 318 | 319 | def init_tmux(): 320 | tmux_server = libtmux.Server() 321 | tmux_sess = tmux_server.list_sessions()[-1] 322 | tmux_win = tmux_sess.attached_window 323 | less_pane = tmux_win.split_window(attach=False) 324 | 325 | return less_pane 326 | 327 | 328 | def batch_process(path_file, output_file): 329 | with open(path_file, "r") as fid: 330 | files = [l.strip() for l in fid.readlines()] 331 | files.sort() 332 | 333 | previous = load_previous(output_file) 334 | 335 | done = [x for x in files if x in previous and "dialect" in previous[x]] 336 | skipped = [ 337 | x for x in files if x in previous and previous[x]["status"] == "SKIP" 338 | ] 339 | todo = [x for x in files if not (x in done or x in skipped)] 340 | 341 | if not todo: 342 | print("All done.") 343 | return 344 | 345 | print("Number of files remaining: %i" % len(todo)) 346 | 347 | less_pane = init_tmux() 348 | 349 | count = 0 350 | start_time = time.time() 351 | for filename in todo: 352 | old_res = previous.get(filename, None) 353 | 354 | if not os.path.exists(filename): 355 | print("File not found: 
%s" % filename) 356 | res = DetectorResult( 357 | status=Status.SKIP, status_msg=StatusMsg.NON_EXISTENT 358 | ) 359 | continue 360 | 361 | res = annotate_file(filename, less_pane, old_res) 362 | res.filename = filename 363 | dump_result(output_file, res) 364 | count += 1 365 | 366 | if count % 10 == 0: 367 | print( 368 | "\nProgress: %i done out of %i. " 369 | "This session: %i (%.2f seconds per file)" 370 | % ( 371 | count, 372 | len(todo), 373 | count, 374 | ((time.time() - start_time) / count), 375 | ) 376 | ) 377 | 378 | print("All done.") 379 | 380 | 381 | def main(): 382 | if len(sys.argv) == 2: 383 | print(annotate_file(sys.argv[1], init_tmux())) 384 | elif len(sys.argv) == 3: 385 | batch_process(sys.argv[1], sys.argv[2]) 386 | else: 387 | print("Usage: %s path_file output_file" % (sys.argv[0])) 388 | raise SystemExit 389 | 390 | 391 | if __name__ == "__main__": 392 | main() 393 | -------------------------------------------------------------------------------- /scripts/analysis/make_summary.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Make summaries from the detector result files. 6 | 7 | Author: Gertjan van den Burg 8 | Copyright (c) 2018 - The Alan Turing Institute 9 | License: See the LICENSE file. 10 | 11 | """ 12 | 13 | import argparse 14 | import json 15 | 16 | from common.dialect import ATTRIBUTES 17 | from common.detector_result import Status 18 | 19 | from .core import load_detector_results, is_standard_dialect 20 | 21 | 22 | def prop_equal(res1, res2, attr_name): 23 | return getattr(res1.dialect, attr_name) == getattr(res2.dialect, attr_name) 24 | 25 | 26 | def compute_attribute_accuracy( 27 | reference, detector, attr_name, detector_name, original_detector=None 28 | ): 29 | n_equal, n_total = 0, 0 30 | od = original_detector 31 | 32 | for fname in reference: 33 | res_ref = reference[fname] 34 | if not fname in detector: 35 | print( 36 | "Warning: no result for %s in for detector %s" 37 | % (fname, detector_name) 38 | ) 39 | continue 40 | res_det = detector[fname] 41 | if od is not None and res_ref.original_detector != od: 42 | continue 43 | if not res_ref.status == Status.OK: 44 | continue 45 | n_total += 1 46 | if res_det.status == Status.OK: 47 | n_equal += prop_equal(res_ref, res_det, attr_name) 48 | 49 | return n_equal / n_total 50 | 51 | 52 | def compute_overall_accuracy( 53 | reference, detector, detector_name, original_detector=None 54 | ): 55 | n_equal, n_total = 0, 0 56 | od = original_detector 57 | for fname in reference: 58 | res_ref = reference[fname] 59 | if not fname in detector: 60 | print( 61 | "Warning: no result for %s in for detector %s" 62 | % (fname, detector_name) 63 | ) 64 | continue 65 | res_det = detector[fname] 66 | if od is not None and res_ref.original_detector != od: 67 | continue 68 | if not res_ref.status == Status.OK: 69 | continue 70 | n_total += 1 71 | if res_det.status == Status.OK: 72 | n_equal += res_ref.dialect == res_det.dialect 73 | return n_equal / n_total 74 | 75 | 76 | def compute_standard_accuracy(reference, detector, standard=True): 77 | total_standard, total_messy = 0, 0 78 | correct_standard, correct_messy = 0, 0 79 | for fname in reference: 80 | res_ref = reference[fname] 81 | if not res_ref.status == Status.OK: 82 | continue 83 | if not fname in detector: 84 | print("Warning: no result for file: %s" % fname) 85 | continue 86 | res_det = detector[fname] 87 | 88 | is_std = is_standard_dialect(res_ref.dialect) 89 | if 
is_std: 90 | total_standard += 1 91 | else: 92 | total_messy += 1 93 | 94 | if not res_det.status == Status.OK: 95 | continue 96 | 97 | is_correct = res_det.dialect == res_ref.dialect 98 | if is_std: 99 | correct_standard += 1 if is_correct else 0 100 | else: 101 | correct_messy += 1 if is_correct else 0 102 | if standard: 103 | return correct_standard / total_standard 104 | return correct_messy / total_messy 105 | 106 | 107 | def compute_fail_percentage(reference, detector, detector_name): 108 | n_fail, n_total = 0, 0 109 | for fname in reference: 110 | if reference[fname].status == Status.OK: 111 | n_total += 1 112 | else: 113 | continue 114 | if not fname in detector: 115 | print( 116 | "Warning: no result for %s in for detector %s" 117 | % (fname, detector_name) 118 | ) 119 | continue 120 | if detector[fname].status == Status.FAIL: 121 | n_fail += 1 122 | return n_fail / n_total 123 | 124 | 125 | def compute_nic_split_accuracy(reference, detector, mode=None): 126 | files_total = 0 127 | files_with_mode = 0 128 | for fname in reference: 129 | res_ref = reference[fname] 130 | if not res_ref.status == Status.OK: 131 | continue 132 | if not fname in detector: 133 | print("Warning: no result for file: %s" % fname) 134 | continue 135 | res_det = detector[fname] 136 | 137 | files_total += 1 138 | 139 | if mode == "no_results": 140 | if not res_det.status == Status.OK: 141 | files_with_mode += 1 142 | elif mode == "incorrect_results": 143 | if ( 144 | res_det.status == Status.OK 145 | and res_det.dialect != res_ref.dialect 146 | ): 147 | files_with_mode += 1 148 | elif mode == "correct_results": 149 | if ( 150 | res_det.status == Status.OK 151 | and res_det.dialect == res_ref.dialect 152 | ): 153 | files_with_mode += 1 154 | else: 155 | raise ValueError("Unknown mode: %r" % mode) 156 | return files_with_mode / files_total 157 | 158 | def collect_computation_times(reference, detector, detector_name): 159 | runtimes = [] 160 | for fname in sorted(reference.keys()): 161 | if not reference[fname].status == Status.OK: 162 | continue 163 | if not fname in detector: 164 | print( 165 | "Warning: no result for %s in for detector %s" 166 | % (fname, detector_name) 167 | ) 168 | continue 169 | # Note that we don't check whether the detector returned with status 170 | # OK, because we want to include failures and timeouts in the runtime 171 | # plots as well. 
172 | rt = detector[fname].runtime 173 | if rt is None: 174 | raise ValueError( 175 | "Runtime is None for result: %r" % detector[fname] 176 | ) 177 | runtimes.append(detector[fname].runtime) 178 | 179 | return runtimes 180 | 181 | 182 | def count_reference_ok(reference, original_detector=None): 183 | n_ok = 0 184 | od = original_detector 185 | for fname in reference: 186 | if od is not None and reference[fname].original_detector != od: 187 | continue 188 | if reference[fname].status == Status.OK: 189 | n_ok += 1 190 | return n_ok 191 | 192 | 193 | def count_standard(reference_results, standard=True): 194 | count = 0 195 | for fname in reference_results: 196 | ref = reference_results[fname] 197 | if not ref.status == Status.OK: 198 | continue 199 | 200 | is_std = is_standard_dialect(ref.dialect) 201 | if standard: 202 | if is_std: 203 | count += 1 204 | else: 205 | if not is_std: 206 | count += 1 207 | return count 208 | 209 | 210 | def summarize_accuracy( 211 | reference_results, detector_results_all, original_detector=None 212 | ): 213 | accuracy = {} 214 | for attr_name in ATTRIBUTES: 215 | accuracy[attr_name] = {} 216 | for detector in detector_results_all: 217 | detector_results = detector_results_all[detector] 218 | accuracy[attr_name][detector] = compute_attribute_accuracy( 219 | reference_results, 220 | detector_results, 221 | attr_name, 222 | detector, 223 | original_detector=original_detector, 224 | ) 225 | 226 | assert "overall" not in accuracy.keys() 227 | accuracy["overall"] = {} 228 | for detector in detector_results_all: 229 | detector_results = detector_results_all[detector] 230 | accuracy["overall"][detector] = compute_overall_accuracy( 231 | reference_results, 232 | detector_results, 233 | detector, 234 | original_detector=original_detector, 235 | ) 236 | return accuracy 237 | 238 | 239 | def summarize_standard_accuracy( 240 | reference_results, detector_results_all, standard=True 241 | ): 242 | 243 | accuracy = {} 244 | for detector in detector_results_all: 245 | detector_results = detector_results_all[detector] 246 | accuracy[detector] = compute_standard_accuracy( 247 | reference_results, detector_results, standard=standard 248 | ) 249 | return accuracy 250 | 251 | 252 | def summarize_nic_split_accuracy( 253 | reference_results, detector_results_all, mode=None 254 | ): 255 | allowed_modes = ["no_results", "incorrect_results", "correct_results"] 256 | if mode is None or not mode in allowed_modes: 257 | raise ValueError("mode must be one of: %r" % allowed_modes) 258 | 259 | accuracies = {} 260 | for detector in detector_results_all: 261 | detector_results = detector_results_all[detector] 262 | accuracies[detector] = compute_nic_split_accuracy( 263 | reference_results, detector_results, mode=mode 264 | ) 265 | return accuracies 266 | 267 | 268 | def create_summary(reference_results, detector_results_all): 269 | summary = {} 270 | summary["n_files_all"] = count_reference_ok( 271 | reference_results, original_detector=None 272 | ) 273 | summary["n_files_human"] = count_reference_ok( 274 | reference_results, original_detector="human" 275 | ) 276 | summary["n_files_normal"] = count_reference_ok( 277 | reference_results, original_detector="normal" 278 | ) 279 | summary["n_files_standard"] = count_standard( 280 | reference_results, standard=True 281 | ) 282 | summary["n_files_messy"] = count_standard( 283 | reference_results, standard=False 284 | ) 285 | 286 | # Compute accuracy 287 | summary["detection_accuracy_all"] = summarize_accuracy( 288 | reference_results, 
detector_results_all, original_detector=None 289 | ) 290 | summary["detection_accuracy_human"] = summarize_accuracy( 291 | reference_results, detector_results_all, original_detector="human" 292 | ) 293 | summary["detection_accuracy_normal"] = summarize_accuracy( 294 | reference_results, detector_results_all, original_detector="normal" 295 | ) 296 | 297 | # Compute standard/non-standard split 298 | summary["standard_accuracy_all"] = summarize_standard_accuracy( 299 | reference_results, detector_results_all, standard=True 300 | ) 301 | summary["messy_accuracy_all"] = summarize_standard_accuracy( 302 | reference_results, detector_results_all, standard=False 303 | ) 304 | 305 | # Compute No result/Incorrect results/Correct result split 306 | summary["no_result_all"] = summarize_nic_split_accuracy( 307 | reference_results, detector_results_all, mode="no_results" 308 | ) 309 | summary["incorrect_result_all"] = summarize_nic_split_accuracy( 310 | reference_results, detector_results_all, mode="incorrect_results" 311 | ) 312 | summary["correct_result_all"] = summarize_nic_split_accuracy( 313 | reference_results, detector_results_all, mode="correct_results" 314 | ) 315 | 316 | # Compute failure rates 317 | failures = {} 318 | for detector in detector_results_all: 319 | detector_results = detector_results_all[detector] 320 | failures[detector] = compute_fail_percentage( 321 | reference_results, detector_results, detector 322 | ) 323 | summary["failures"] = failures 324 | 325 | # Collect runtimes 326 | runtimes = {} 327 | for detector in detector_results_all: 328 | detector_results = detector_results_all[detector] 329 | runtimes[detector] = collect_computation_times( 330 | reference_results, detector_results, detector 331 | ) 332 | summary["runtimes"] = runtimes 333 | 334 | return summary 335 | 336 | 337 | def parse_args(): 338 | parser = argparse.ArgumentParser(description="Compare detector results") 339 | parser.add_argument( 340 | "-c", 341 | dest="corpus", 342 | help="Name of the corpus we're looking at", 343 | required=True, 344 | ) 345 | parser.add_argument( 346 | "-s", 347 | dest="summary_file", 348 | help="output file for the summary statistics", 349 | required=True, 350 | ) 351 | parser.add_argument( 352 | "-r", 353 | dest="reference_file", 354 | help="reference output file with ground truth", 355 | required=True, 356 | ) 357 | parser.add_argument( 358 | "-o", 359 | dest="output_file", 360 | nargs="+", 361 | help="output_file(s) from different detectors", 362 | required=True, 363 | ) 364 | return parser.parse_args() 365 | 366 | 367 | def main(): 368 | args = parse_args() 369 | 370 | _, ref_results = load_detector_results(args.reference_file) 371 | detector_results = {} 372 | for fname in args.output_file: 373 | name, results = load_detector_results(fname) 374 | detector_results[name] = results 375 | 376 | summary_data = create_summary(ref_results, detector_results) 377 | summary_data["corpus"] = args.corpus 378 | with open(args.summary_file, "w") as fid: 379 | fid.write(json.dumps(summary_data, indent=2)) 380 | --------------------------------------------------------------------------------
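Usage sketch (not part of the repository): the snippet below is a minimal, hypothetical illustration of how the summary and figure code above could be driven programmatically instead of through the command-line entry points. It assumes the `scripts/` directory is on `PYTHONPATH` so that the `analysis` package is importable, the result file names are placeholders, the detector result files together must cover every detector expected by `ORDERED_DETECTORS` in `figure_violins.py`, and rendering the PDF requires a working LaTeX toolchain. The flow simply mirrors the `main()` functions of `make_summary.py` and `figure_violins.py`.

# Hypothetical usage sketch -- not part of the repository.
from analysis.core import load_detector_results
from analysis.make_summary import create_summary
from analysis.figure_violins import create_twosided_violin

all_data = {}
for corpus, ref_file, det_files in [
    ("github", "reference_github.json", ["detector_a_github.json"]),   # placeholder paths
    ("ukdata", "reference_ukdata.json", ["detector_a_ukdata.json"]),   # placeholder paths
]:
    # Load the ground-truth dialects and each detector's output.
    _, reference = load_detector_results(ref_file)
    detector_results = {}
    for path in det_files:
        name, results = load_detector_results(path)
        detector_results[name] = results

    # Build the per-corpus summary (accuracies, failure rates, runtimes).
    summary = create_summary(reference, detector_results)
    summary["corpus"] = corpus
    all_data[corpus] = summary

# Two corpora are required: one is drawn on the left half of each violin,
# the other on the right. Writes violin_combined.tex and violin_combined.pdf.
create_twosided_violin(all_data, "violin_combined.pdf")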