├── .gitignore ├── scripts ├── analysis │ ├── __init__.py │ ├── constant_n_files.py │ ├── constant_n_incorrect_prop.py │ ├── constant_fail_percentage.py │ ├── constant_prop_potential_dialect.py │ ├── constant_n_dialect.py │ ├── constant_improve_sniffer.py │ ├── potential_dialects.py │ ├── constant_failure_messy.py │ ├── constant_improve_sniffer_messy.py │ ├── table_accuracy.py │ ├── constant_failure.py │ ├── constant_accuracy_overall.py │ ├── constant_known_type.py │ ├── figure_box_plot.py │ ├── table_std_messy.py │ ├── figure_bar_plot.py │ ├── table_parse_result.py │ ├── core.py │ ├── figure_fail.py │ ├── latex.py │ ├── show_failures.py │ ├── figure_violins.py │ └── make_summary.py ├── detection │ ├── __init__.py │ ├── lib │ │ ├── __init__.py │ │ └── types │ │ │ ├── __init__.py │ │ │ ├── README.md │ │ │ └── rudi_types.py │ ├── our_score_type_only.py │ ├── our_score_pattern_only.py │ ├── our_score_full.py │ ├── our_score_full_no_tie.py │ ├── sniffer.py │ ├── core.py │ ├── suitability.py │ ├── _ties.py │ ├── hypo.R │ ├── our_score_base.py │ └── human.py ├── preprocessing │ ├── __init__.py │ ├── merge.py │ ├── filter_non_normal.py │ └── extract_normals.py ├── run_human.py ├── analysis_summarise.py ├── analysis_explore_failures.py ├── analysis_potential_dialects.py ├── merge_human_normal.py ├── run_extract_normal.py ├── run_normal_detection.py ├── common │ ├── utils.py │ ├── escape.py │ ├── encoding.py │ ├── load.py │ ├── dialect.py │ ├── detector_result.py │ └── parser.py ├── README.md ├── run_detector.py ├── analysis_results.py ├── run_hypoparsr.sh ├── analysis_constants.py └── data_download.py ├── data └── .gitignore ├── results ├── test │ ├── analysis │ │ ├── constants │ │ │ ├── NumDialectTotal.tex │ │ │ ├── NumDialect_github.tex │ │ │ ├── NumDialect_ukdata.tex │ │ │ ├── NumFiles_github.tex │ │ │ ├── NumFiles_ukdata.tex │ │ │ ├── PropKnownType.tex │ │ │ ├── AccuracyOverallOurs.tex │ │ │ ├── FactorPotentialDialects.tex │ │ │ ├── PropFailHypoTimeout.tex │ │ │ ├── FailureRateOursMessyAll.tex │ │ │ ├── FailureRateSnifferMessyAll.tex │ │ │ ├── ImprovementOverSniffer.tex │ │ │ ├── PropFailHypoNoResults.tex │ │ │ ├── PropFailOurFull_github.tex │ │ │ ├── PropFailOurFull_ukdata.tex │ │ │ ├── PropFailSnifferNoResults.tex │ │ │ ├── PropFailSnifferTimeout.tex │ │ │ ├── ImprovementOverSnifferMessy.tex │ │ │ └── ImprovementOverSnifferMessyCeil.tex │ │ ├── figures │ │ │ └── violin_combined.pdf │ │ └── tables │ │ │ ├── parse_result_github.tex │ │ │ ├── parse_result_ukdata.tex │ │ │ ├── standard_and_messy_github.tex │ │ │ ├── standard_and_messy_ukdata.tex │ │ │ ├── accuracy_all_github.tex │ │ │ ├── accuracy_all_ukdata.tex │ │ │ ├── accuracy_human_github.tex │ │ │ ├── accuracy_human_ukdata.tex │ │ │ ├── accuracy_normal_github.tex │ │ │ └── accuracy_normal_ukdata.tex │ └── README.md └── dev │ └── README.md ├── Rpackages.txt ├── requirements.txt ├── .gitmodules ├── .travis.yml ├── utils └── install_R_packages.sh ├── LICENSE ├── design └── result.md ├── Dockerfile └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | *__pycache__/ 2 | -------------------------------------------------------------------------------- /scripts/analysis/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scripts/detection/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /scripts/detection/lib/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scripts/preprocessing/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /scripts/detection/lib/types/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /results/test/analysis/constants/NumDialectTotal.tex: -------------------------------------------------------------------------------- 1 | 34% -------------------------------------------------------------------------------- /results/test/analysis/constants/NumDialect_github.tex: -------------------------------------------------------------------------------- 1 | 33% -------------------------------------------------------------------------------- /results/test/analysis/constants/NumDialect_ukdata.tex: -------------------------------------------------------------------------------- 1 | 8% -------------------------------------------------------------------------------- /results/test/analysis/constants/NumFiles_github.tex: -------------------------------------------------------------------------------- 1 | 4386% -------------------------------------------------------------------------------- /results/test/analysis/constants/NumFiles_ukdata.tex: -------------------------------------------------------------------------------- 1 | 4969% -------------------------------------------------------------------------------- /results/test/analysis/constants/PropKnownType.tex: -------------------------------------------------------------------------------- 1 | 91.6\%% -------------------------------------------------------------------------------- /results/test/analysis/constants/AccuracyOverallOurs.tex: -------------------------------------------------------------------------------- 1 | 97\%% -------------------------------------------------------------------------------- /results/test/analysis/constants/FactorPotentialDialects.tex: -------------------------------------------------------------------------------- 1 | 0.2% -------------------------------------------------------------------------------- /results/test/analysis/constants/PropFailHypoTimeout.tex: -------------------------------------------------------------------------------- 1 | 38.1\%% -------------------------------------------------------------------------------- /results/test/analysis/constants/FailureRateOursMessyAll.tex: -------------------------------------------------------------------------------- 1 | 14\%% -------------------------------------------------------------------------------- /results/test/analysis/constants/FailureRateSnifferMessyAll.tex: -------------------------------------------------------------------------------- 1 | 36\%% -------------------------------------------------------------------------------- /results/test/analysis/constants/ImprovementOverSniffer.tex: 
-------------------------------------------------------------------------------- 1 | 8.6\%% -------------------------------------------------------------------------------- /results/test/analysis/constants/PropFailHypoNoResults.tex: -------------------------------------------------------------------------------- 1 | 61.4\%% -------------------------------------------------------------------------------- /results/test/analysis/constants/PropFailOurFull_github.tex: -------------------------------------------------------------------------------- 1 | 0.30\%% -------------------------------------------------------------------------------- /results/test/analysis/constants/PropFailOurFull_ukdata.tex: -------------------------------------------------------------------------------- 1 | 0.00\%% -------------------------------------------------------------------------------- /results/test/analysis/constants/PropFailSnifferNoResults.tex: -------------------------------------------------------------------------------- 1 | 75.8\%% -------------------------------------------------------------------------------- /results/test/analysis/constants/PropFailSnifferTimeout.tex: -------------------------------------------------------------------------------- 1 | 24.2\%% -------------------------------------------------------------------------------- /results/test/analysis/constants/ImprovementOverSnifferMessy.tex: -------------------------------------------------------------------------------- 1 | 21.4\%% -------------------------------------------------------------------------------- /results/test/analysis/constants/ImprovementOverSnifferMessyCeil.tex: -------------------------------------------------------------------------------- 1 | 22\%% -------------------------------------------------------------------------------- /Rpackages.txt: -------------------------------------------------------------------------------- 1 | devtools 2 | rjson 3 | data.tree 4 | RecordLinkage 5 | readr 6 | tibble 7 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | chardet 2 | libtmux 3 | matplotlib 4 | numpy 5 | pandas 6 | regex 7 | requests 8 | scipy 9 | sklearn 10 | tabulate 11 | tqdm 12 | dominate 13 | -------------------------------------------------------------------------------- /results/test/analysis/figures/violin_combined.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/CSV_Wrangling/HEAD/results/test/analysis/figures/violin_combined.pdf -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "scripts/detection/lib/hypoparsr"] 2 | path = scripts/detection/lib/hypoparsr 3 | url = https://github.com/GjjvdBurg/hypoparsr 4 | branch = turing 5 | -------------------------------------------------------------------------------- /results/dev/README.md: -------------------------------------------------------------------------------- 1 | # Results 2 | 3 | These are the dialect annotations for the files that were used during 4 | development of the dialect detection algorithm. See 5 | ``out_reference_.json`` for the ground-truth annotations for all 6 | files. 
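(Aside: for concreteness, a minimal sketch of what one ground-truth annotation record could look like, assuming the JSON Lines layout documented in ``design/result.md`` below; the exact serialization is defined by ``DetectorResult`` in ``scripts/common/detector_result.py``, and the filename hash, hostname, and runtime here are illustrative placeholders, not values from the repository.)

    import json

    # Hypothetical reference record; field names follow design/result.md,
    # the concrete values are made up for illustration only.
    record = {
        "filename": "/path/to/data/0123456789abcdef.csv",  # placeholder md5-style name
        "detector": "reference",
        "status": "ok",
        "reason": None,
        "hostname": "example-host",  # placeholder
        "runtime": 0.42,             # placeholder, in seconds
        "dialect": {
            "delimiter": ",",
            "quotechar": "\"",
            "escapechar": "",        # empty string means no escape character
        },
    }
    print(json.dumps(record))  # one JSON object per line (JSON Lines)
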
7 | -------------------------------------------------------------------------------- /scripts/run_human.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Wrapper for human annotation. 6 | 7 | Author: Gertjan van den Burg 8 | Copyright (c) 2018 - The Alan Turing Institute 9 | License: See the LICENSE file. 10 | 11 | """ 12 | 13 | from detection import human 14 | 15 | if __name__ == '__main__': 16 | human.main() 17 | -------------------------------------------------------------------------------- /scripts/analysis_summarise.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Wrapper around ``make_summary`` script. 6 | 7 | Author: Gertjan van den Burg 8 | Copyright (c) 2018 - The Alan Turing Institute 9 | License: See the LICENSE file. 10 | 11 | """ 12 | 13 | from analysis import make_summary 14 | 15 | if __name__ == '__main__': 16 | make_summary.main() 17 | -------------------------------------------------------------------------------- /scripts/analysis_explore_failures.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Wrapper around ``show_failures``. 6 | 7 | Author: Gertjan van den Burg 8 | Copyright (c) 2018 - The Alan Turing Institute 9 | License: See the LICENSE file. 10 | 11 | """ 12 | 13 | from analysis import show_failures 14 | 15 | if __name__ == '__main__': 16 | show_failures.main() 17 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: generic 2 | 3 | services: 4 | - docker 5 | 6 | before_install: 7 | - docker build -t alan-turing-institute/csvwrangling . 8 | 9 | script: 10 | - mkdir -p /home/travis/build/alan-turing-institute/results 11 | - docker run -v /home/travis/build/alan-turing-institute/results:/CSV_Wrangling/test alan-turing-institute/csvwrangling /bin/bash -c "make output && git diff" 12 | -------------------------------------------------------------------------------- /results/test/analysis/tables/parse_result_github.tex: -------------------------------------------------------------------------------- 1 | \begin{tabular}{lrrr|rrrr} 2 | & HypoParsr & Sniffer & Suitability & Pattern & Type & No Tie & Full\\ 3 | \hline 4 | No Result & 10.12 & 4.90 & 22.96 & 1.69 & 1.30 & 4.24 & \textbf{0.30}\\ 5 | Incorrect & 9.28 & 9.64 & 38.85 & 7.32 & 15.09 & \textbf{5.15} & 5.95\\ 6 | Correct & 80.60 & 85.45 & 38.19 & 90.99 & 83.61 & 90.61 & \textbf{93.75}\\ 7 | \hline 8 | \end{tabular} -------------------------------------------------------------------------------- /scripts/analysis_potential_dialects.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Wrapper for the potential dialects analysis. 6 | 7 | Author: Gertjan van den Burg 8 | Copyright (c) 2019 - The Alan Turing Institute 9 | License: See the LICENSE file. 
10 | 11 | """ 12 | 13 | from analysis import potential_dialects 14 | 15 | if __name__ == '__main__': 16 | potential_dialects.main() 17 | -------------------------------------------------------------------------------- /results/test/analysis/tables/parse_result_ukdata.tex: -------------------------------------------------------------------------------- 1 | \begin{tabular}{lrrr|rrrr} 2 | & HypoParsr & Sniffer & Suitability & Pattern & Type & No Tie & Full\\ 3 | \hline 4 | No Result & 1.85 & 1.21 & 16.72 & \textbf{0.00} & 0.04 & 0.56 & \textbf{0.00}\\ 5 | Incorrect & 7.71 & 7.95 & 57.96 & 0.60 & 12.78 & \textbf{0.32} & \textbf{0.32}\\ 6 | Correct & 90.44 & 90.84 & 25.32 & 99.40 & 87.18 & 99.11 & \textbf{99.68}\\ 7 | \hline 8 | \end{tabular} -------------------------------------------------------------------------------- /results/test/analysis/tables/standard_and_messy_github.tex: -------------------------------------------------------------------------------- 1 | \begin{tabular}{lrrr|rrrr} 2 | & HypoParsr & Sniffer & Suitability & Pattern & Type & No Tie & Full\\ 3 | \hline 4 | Standard (3502) & 85.75 & 90.89 & 44.12 & 93.15 & 86.26 & 93.46 & \textbf{95.80}\\ 5 | Messy (884) & 60.18 & 63.91 & 14.71 & 82.47 & 73.08 & 79.30 & \textbf{85.63}\\ 6 | Total (4386) & 80.60 & 85.45 & 38.19 & 90.99 & 83.61 & 90.61 & \textbf{93.75}\\ 7 | \hline 8 | \end{tabular} -------------------------------------------------------------------------------- /results/test/analysis/tables/standard_and_messy_ukdata.tex: -------------------------------------------------------------------------------- 1 | \begin{tabular}{lrrr|rrrr} 2 | & HypoParsr & Sniffer & Suitability & Pattern & Type & No Tie & Full\\ 3 | \hline 4 | Standard (4938) & 90.46 & 90.91 & 25.05 & 99.43 & 87.30 & 99.15 & \textbf{99.72}\\ 5 | Messy (31) & 87.10 & 80.65 & 67.74 & \textbf{93.55} & 67.74 & \textbf{93.55} & \textbf{93.55}\\ 6 | Total (4969) & 90.44 & 90.84 & 25.32 & 99.40 & 87.18 & 99.11 & \textbf{99.68}\\ 7 | \hline 8 | \end{tabular} -------------------------------------------------------------------------------- /results/test/README.md: -------------------------------------------------------------------------------- 1 | # Results 2 | 3 | The results will be placed here. The structure is as follows: 4 | 5 | 1. The **preprocessing** directory stores output from automatic ground truth 6 | detection (a.k.a. *normal forms*). 7 | 8 | 2. The **detection** directory stores the output of the detectors, as well as 9 | the ground truth (``out_reference``). 10 | 11 | 3. The **analysis** directory stores the analysis output in figures, tables, 12 | and constants. 13 | -------------------------------------------------------------------------------- /utils/install_R_packages.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Install R packages from a file 4 | # 5 | # Author: G.J.J. van den Burg 6 | # Date: 2019-05-16 7 | # 8 | if [ $# -ne 1 ] 9 | then 10 | echo "Usage: $0 packages.txt" 11 | exit 1 12 | fi 13 | 14 | if [ ! -s "$1" ] 15 | then 16 | echo "Provided package file $1 has no packages. 
Skipping" 17 | exit 0 18 | fi 19 | 20 | while read -r pkg 21 | do 22 | Rscript -e "install.packages('${pkg}', repos=c('https://cloud.r-project.org'))" 23 | done < "$1" 24 | -------------------------------------------------------------------------------- /results/test/analysis/tables/accuracy_all_github.tex: -------------------------------------------------------------------------------- 1 | \begin{tabular}{lrrr|rrrr} 2 | Property & HypoParsr & Sniffer & Suitability & Pattern & Type & No Tie & Full\\ 3 | \hline 4 | Delimiter & 87.48 & 86.82 & 65.41 & 92.61 & 88.33 & 91.38 & \textbf{94.92}\\ 5 | Quotechar & 82.90 & 92.36 & 44.60 & 95.23 & 90.10 & 93.80 & \textbf{97.36}\\ 6 | Escapechar & 87.96 & 94.37 & 74.85 & 97.95 & 96.26 & 95.44 & \textbf{99.25}\\ 7 | Overall & 80.60 & 85.45 & 38.19 & 90.99 & 83.61 & 90.61 & \textbf{93.75}\\ 8 | \hline 9 | \end{tabular} -------------------------------------------------------------------------------- /results/test/analysis/tables/accuracy_all_ukdata.tex: -------------------------------------------------------------------------------- 1 | \begin{tabular}{lrrr|rrrr} 2 | Property & HypoParsr & Sniffer & Suitability & Pattern & Type & No Tie & Full\\ 3 | \hline 4 | Delimiter & 97.97 & 91.89 & 80.20 & 99.70 & 93.80 & 99.26 & \textbf{99.82}\\ 5 | Quotechar & 90.56 & 92.21 & 26.34 & 99.46 & 89.56 & 99.13 & \textbf{99.70}\\ 6 | Escapechar & 98.05 & 98.79 & 82.61 & \textbf{100.00} & 97.67 & 99.42 & 99.98\\ 7 | Overall & 90.44 & 90.84 & 25.32 & 99.40 & 87.18 & 99.11 & \textbf{99.68}\\ 8 | \hline 9 | \end{tabular} -------------------------------------------------------------------------------- /results/test/analysis/tables/accuracy_human_github.tex: -------------------------------------------------------------------------------- 1 | \begin{tabular}{lrrr|rrrr} 2 | Property & HypoParsr & Sniffer & Suitability & Pattern & Type & No Tie & Full\\ 3 | \hline 4 | Delimiter & 83.35 & 81.78 & 59.78 & 89.64 & 84.47 & 87.95 & \textbf{93.04}\\ 5 | Quotechar & 76.24 & 89.19 & 39.51 & 93.27 & 85.71 & 90.95 & \textbf{96.07}\\ 6 | Escapechar & 84.14 & 92.33 & 74.07 & 97.08 & 94.84 & 93.27 & \textbf{98.77}\\ 7 | Overall & 72.54 & 79.61 & 28.99 & 87.28 & 76.88 & 86.76 & \textbf{91.21}\\ 8 | \hline 9 | \end{tabular} -------------------------------------------------------------------------------- /results/test/analysis/tables/accuracy_human_ukdata.tex: -------------------------------------------------------------------------------- 1 | \begin{tabular}{lrrr|rrrr} 2 | Property & HypoParsr & Sniffer & Suitability & Pattern & Type & No Tie & Full\\ 3 | \hline 4 | Delimiter & 97.31 & 92.17 & 81.91 & 99.59 & 93.16 & 98.95 & \textbf{99.74}\\ 5 | Quotechar & 87.46 & 92.93 & 24.61 & 99.30 & 87.49 & 98.77 & \textbf{99.56}\\ 6 | Escapechar & 97.43 & 99.65 & 85.21 & \textbf{100.00} & 96.76 & 99.18 & 99.97\\ 7 | Overall & 87.28 & 91.14 & 23.12 & 99.21 & 84.71 & 98.74 & \textbf{99.53}\\ 8 | \hline 9 | \end{tabular} -------------------------------------------------------------------------------- /results/test/analysis/tables/accuracy_normal_github.tex: -------------------------------------------------------------------------------- 1 | \begin{tabular}{lrrr|rrrr} 2 | Property & HypoParsr & Sniffer & Suitability & Pattern & Type & No Tie & Full\\ 3 | \hline 4 | Delimiter & 93.93 & 94.69 & 74.20 & 97.26 & 94.34 & 96.73 & \textbf{97.84}\\ 5 | Quotechar & 93.29 & 97.31 & 52.54 & 98.31 & 96.96 & 98.25 & \textbf{99.36}\\ 6 | Escapechar & 93.93 & 97.55 & 76.07 & 99.30 & 98.48 & 98.83 & \textbf{100.00}\\ 7 | 
Overall & 93.17 & 94.57 & 52.54 & 96.79 & 94.10 & 96.61 & \textbf{97.72}\\ 8 | \hline 9 | \end{tabular} -------------------------------------------------------------------------------- /results/test/analysis/tables/accuracy_normal_ukdata.tex: -------------------------------------------------------------------------------- 1 | \begin{tabular}{lrrr|rrrr} 2 | Property & HypoParsr & Sniffer & Suitability & Pattern & Type & No Tie & Full\\ 3 | \hline 4 | Delimiter & 99.42 & 91.28 & 76.42 & 99.94 & 95.22 & 99.94 & \textbf{100.00}\\ 5 | Quotechar & 97.42 & 90.63 & 30.17 & 99.81 & 94.12 & 99.94 & \textbf{100.00}\\ 6 | Escapechar & 99.42 & 96.90 & 76.87 & \textbf{100.00} & 99.68 & 99.94 & \textbf{100.00}\\ 7 | Overall & 97.42 & 90.18 & 30.17 & 99.81 & 92.64 & 99.94 & \textbf{100.00}\\ 8 | \hline 9 | \end{tabular} -------------------------------------------------------------------------------- /scripts/merge_human_normal.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Wrapper around merge. 6 | 7 | Author: Gertjan van den Burg 8 | Copyright (c) 2018 - The Alan Turing Institute 9 | License: See the LICENSE file. 10 | 11 | """ 12 | 13 | import sys 14 | 15 | from preprocessing import merge 16 | 17 | if __name__ == "__main__": 18 | if len(sys.argv) == 1: 19 | print("Usage: %s output_file input_file ..." % sys.argv[0]) 20 | raise SystemExit 21 | merge.main(sys.argv[1], sys.argv[2:]) 22 | -------------------------------------------------------------------------------- /scripts/run_extract_normal.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Wrapper for normal form extraction 6 | 7 | Author: Gertjan van den Burg 8 | Copyright (c) 2018 - The Alan Turing Institute 9 | License: See the LICENSE file. 10 | 11 | """ 12 | 13 | import sys 14 | 15 | from preprocessing import extract_normals 16 | 17 | if __name__ == "__main__": 18 | if not len(sys.argv) == 3: 19 | print("Usage: %s normals.json output_file" % sys.argv[0]) 20 | raise SystemExit 21 | extract_normals.main(sys.argv[1], sys.argv[2]) 22 | -------------------------------------------------------------------------------- /scripts/run_normal_detection.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Wrapper for normal form detection. 6 | 7 | Author: Gertjan van den Burg 8 | Copyright (c) 2018 - The Alan Turing Institute 9 | License: See the LICENSE file. 10 | 11 | """ 12 | 13 | import sys 14 | 15 | from preprocessing import filter_non_normal 16 | 17 | if __name__ == '__main__': 18 | if not len(sys.argv) == 4: 19 | print("Usage: %s input_dir normal_file non_normal_file" % sys.argv[0]) 20 | raise SystemExit 21 | filter_non_normal.main(sys.argv[1], sys.argv[2], sys.argv[3]) 22 | -------------------------------------------------------------------------------- /scripts/common/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Shared utility functions. 6 | 7 | Author: Gertjan van den Burg 8 | Copyright (c) 2018 - The Alan Turing Institute 9 | License: See the LICENSE file. 10 | 11 | """ 12 | 13 | import math 14 | 15 | 16 | def pairwise(iterable): 17 | "s - > (s0, s1), (s1, s2), (s2, s3), ..." 
18 | a = iter(iterable) 19 | b = iter(iterable) 20 | next(b, None) 21 | return zip(a, b) 22 | 23 | 24 | def softmax(iterable): 25 | maxx = max(iterable) 26 | offset = [x - maxx for x in iterable] 27 | denom = sum(map(math.exp, offset)) 28 | return [math.exp(o) / denom for o in offset] 29 | -------------------------------------------------------------------------------- /scripts/common/escape.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Common functions for dealing with escape characters 5 | 6 | Author: Gertjan van den Burg 7 | Copyright (c) 2018 - The Alan Turing Institute 8 | License: See the LICENSE file. 9 | Date: 2018-11-06 10 | """ 11 | 12 | import codecs 13 | import unicodedata 14 | 15 | 16 | def is_potential_escapechar(char, encoding): 17 | as_unicode = codecs.decode(bytes(char, encoding), encoding=encoding) 18 | ctr = unicodedata.category(as_unicode) 19 | block = ["!", "?", '"', "'", ".", ",", ";", ":", "%", "*", "&", "#"] 20 | if ctr == "Po": 21 | if as_unicode in block: 22 | return False 23 | return True 24 | return False 25 | -------------------------------------------------------------------------------- /scripts/common/encoding.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Common functions for encoding detection 5 | 6 | Author: Gertjan van den Burg 7 | Copyright (c) 2018 - The Alan Turing Institute 8 | License: See the LICENSE file. 9 | Date: 2018-11-06 10 | """ 11 | 12 | import chardet 13 | 14 | def get_encoding(filename): 15 | detector = chardet.UniversalDetector() 16 | final_chunk = False 17 | blk_size = 65536 18 | with open(filename, "rb") as fid: 19 | while (not final_chunk) and (not detector.done): 20 | chunk = fid.read(blk_size) 21 | if len(chunk) < blk_size: 22 | final_chunk = True 23 | detector.feed(chunk) 24 | detector.close() 25 | encoding = detector.result.get("encoding", None) 26 | return encoding 27 | -------------------------------------------------------------------------------- /scripts/common/load.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Common functions for loading files 5 | 6 | Author: Gertjan van den Burg 7 | Copyright (c) 2018 - The Alan Turing Institute 8 | License: See the LICENSE file. 9 | Date: 2018-11-06 10 | """ 11 | 12 | from .encoding import get_encoding 13 | 14 | 15 | def load_file(filename, encoding="unknown"): 16 | if encoding == "unknown": 17 | encoding = get_encoding(filename) 18 | with open(filename, "r", newline="", encoding=encoding) as fid: 19 | try: 20 | return fid.read() 21 | except UnicodeDecodeError: 22 | print( 23 | "UnicodeDecodeError occurred for file: %s. " 24 | "This means the encoding was determined incorrectly " 25 | "or the file is corrupt." % filename 26 | ) 27 | return None 28 | -------------------------------------------------------------------------------- /scripts/detection/lib/types/README.md: -------------------------------------------------------------------------------- 1 | # Rudimentary Type Detection 2 | 3 | This directory contains the rudimentary type detection engine used for CSV 4 | dialect detection. It is a regular-expression based method that allows 5 | detection of: 6 | 7 | - Empty cells 8 | - URLs and email 9 | - Numbers, including scientific notation, comma/period as radix point, 10 | comma/period as thousands separator. 
11 | - Percentages 12 | - Currencies 13 | - Time in HH:MM:SS, HH:MM, and H:MM notation 14 | - Dates in forty different formats, including Chinese. Based on [this 15 | Wikipedia article](https://en.wikipedia.org/wiki/Date_format_by_country). 16 | - Combined date and time (i.e. ISO 8601 and variations) 17 | - N/A and n/a 18 | 19 | This covers about 80% - 90% of cells in our collection of CSV files. 20 | 21 | Copyright (c) 2018 The Alan Turing Institute 22 | 23 | ## Author 24 | 25 | Gerrit J.J. van den Burg, gvandenburg@turing.ac.uk 26 | -------------------------------------------------------------------------------- /scripts/README.md: -------------------------------------------------------------------------------- 1 | # Script Directory 2 | 3 | The scripts are organized as follows: 4 | 5 | 1. **analysis** contains the code necessary to generate the figures, tables, 6 | and constants. 7 | 8 | 2. **common** contains shared code for analysis, detection, and preprocessing. 9 | Among other things it contains definitions for the detector result and 10 | dialect objects, the parser we use, and utilities for encoding detection 11 | and file loading. 12 | 13 | 3. **detection** contains the code for each of the detectors. Every detector 14 | has a separate file. Those implemented in Python have a common commandline 15 | interface defined in ``core.py``. Code for HypoParsr and type detection are 16 | in the **lib** subdir. 17 | 18 | 4. **preprocessing** contains the code for automatic dialect detection using 19 | so-called ''normal forms'' 20 | 21 | 22 | The files in this folder are top-level wrapper scripts that are actually 23 | needed to run everything. 24 | -------------------------------------------------------------------------------- /scripts/analysis/constant_n_files.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Get the number of files in a given summary file. 6 | 7 | Author: Gertjan van den Burg 8 | Copyright (c) 2018 - The Alan Turing Institute 9 | License: See the LICENSE file. 
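(Aside: the rudimentary type detection described in the type-detection README above is regular-expression based. The following is only a minimal sketch of that idea, using a few simplified, assumed patterns; it is not the ``rudi_types.py`` implementation, which covers many more formats.)

    import re

    # Simplified illustrative patterns -- far less complete than rudi_types.py.
    PATTERNS = {
        "empty": re.compile(r"^$"),
        "number": re.compile(r"^[+-]?\d+([.,]\d+)?([eE][+-]?\d+)?$"),
        "time_hhmm": re.compile(r"^\d{1,2}:\d{2}(:\d{2})?$"),
        "na": re.compile(r"^[Nn]/[Aa]$"),
    }

    def detect_type(cell):
        """Return the first matching type name for a cell, or None if unknown."""
        for name, pattern in PATTERNS.items():
            if pattern.fullmatch(cell.strip()):
                return name
        return None

    print(detect_type("3.14"))   # number
    print(detect_type("12:30"))  # time_hhmm
    print(detect_type("n/a"))    # na
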
10 | 11 | """ 12 | 13 | import argparse 14 | import json 15 | 16 | 17 | def parse_args(): 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument( 20 | "-s", 21 | dest="summary", 22 | help="Summary file with the results", 23 | required=True, 24 | ) 25 | 26 | parser.add_argument( 27 | "-o", dest="output", help="Output tex file to write to", required=True 28 | ) 29 | return parser.parse_args() 30 | 31 | 32 | def main(): 33 | args = parse_args() 34 | with open(args.summary, "r") as fid: 35 | data = json.load(fid) 36 | 37 | n_files = data["n_files_all"] 38 | with open(args.output, "w") as fid: 39 | fid.write("%i%%" % n_files) 40 | 41 | 42 | if __name__ == "__main__": 43 | main() 44 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2018 The Alan Turing Institute 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | -------------------------------------------------------------------------------- /design/result.md: -------------------------------------------------------------------------------- 1 | # Design document for storing results 2 | 3 | A result is stored in [JSON Lines format](http://jsonlines.org/) with the 4 | following fields: 5 | 6 | - filename: filename of the CSV file, typically ``/path/to/data/[md5hash].csv`` 7 | 8 | - status: parsing status, either ``null``, ``ok``, ``fail``, or ``skip``. 9 | 10 | - reason: failure or skip reason, either ``null`` or: 11 | 12 | + ``unknown``, 13 | + ``multiple_answers``, 14 | + ``no_results``, 15 | + ``timeout``, 16 | + ``unreadable`` 17 | + ``non_existent`` 18 | 19 | - detector: name of the detector 20 | 21 | - hostname: hostname of the pc that ran the detection 22 | 23 | - runtime: time it took to run the detection 24 | 25 | - dialect. See below. 26 | 27 | A dialect is a separate key/value map using the fields: 28 | 29 | - delimiter: single character string, empty string for single-column files, 30 | ``null`` for undefined. 31 | 32 | - quotechar: single character string, empty string for unquoted files, 33 | ``null`` for undefined. 
34 | 35 | - escapechar: single character string, empty string for no escape char, 36 | ``null`` for undefined 37 | -------------------------------------------------------------------------------- /scripts/run_detector.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Wrapper for detector executables. 6 | 7 | Author: Gertjan van den Burg 8 | Copyright (c) 2018 - The Alan Turing Institute 9 | License: See the LICENSE file. 10 | 11 | """ 12 | 13 | import sys 14 | 15 | from detection import ( 16 | our_score_full, 17 | our_score_full_no_tie, 18 | our_score_pattern_only, 19 | our_score_type_only, 20 | sniffer, 21 | suitability, 22 | ) 23 | 24 | 25 | def main(): 26 | detector = sys.argv.pop(1) 27 | if detector == "our_score_full": 28 | our_score_full.main() 29 | elif detector == "our_score_full_no_tie": 30 | our_score_full_no_tie.main() 31 | elif detector == "our_score_type_only": 32 | our_score_type_only.main() 33 | elif detector == "our_score_pattern_only": 34 | our_score_pattern_only.main() 35 | elif detector == "sniffer": 36 | sniffer.main() 37 | elif detector == "suitability": 38 | suitability.main() 39 | else: 40 | raise ValueError("Unknown detector: %s" % detector) 41 | 42 | 43 | if __name__ == "__main__": 44 | main() 45 | -------------------------------------------------------------------------------- /scripts/analysis_results.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Wrapper around result generation. 6 | 7 | See the individual scripts for more usage info. 8 | 9 | Author: Gertjan van den Burg 10 | Copyright (c) 2018 - The Alan Turing Institute 11 | License: See the LICENSE file. 12 | 13 | """ 14 | 15 | import sys 16 | 17 | from analysis import ( 18 | figure_fail, 19 | figure_bar_plot, 20 | figure_box_plot, 21 | figure_violins, 22 | table_accuracy, 23 | table_std_messy, 24 | table_parse_result 25 | ) 26 | 27 | 28 | def main(): 29 | result_type = sys.argv.pop(1) 30 | if result_type == "fail_figure": 31 | figure_fail.main() 32 | elif result_type == "accuracy_bar": 33 | figure_bar_plot.main() 34 | elif result_type == "boxplot": 35 | figure_box_plot.main() 36 | elif result_type == "violins": 37 | figure_violins.main() 38 | elif result_type == "tables": 39 | table_accuracy.main() 40 | elif result_type == "std_messy": 41 | table_std_messy.main() 42 | elif result_type == "parse_result": 43 | table_parse_result.main() 44 | else: 45 | raise ValueError("Unknown result type: %s" % result_type) 46 | 47 | 48 | if __name__ == "__main__": 49 | main() 50 | -------------------------------------------------------------------------------- /scripts/analysis/constant_n_incorrect_prop.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Proportion of files incorrect. 6 | 7 | Author: Gertjan van den Burg 8 | Copyright (c) 2018 - The Alan Turing Institute 9 | License: See the LICENSE file. 
10 | 11 | """ 12 | 13 | import argparse 14 | import json 15 | 16 | def parse_args(): 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument( 19 | "-d", dest="detector", help="Detector name", required=True 20 | ) 21 | parser.add_argument( 22 | "-s", 23 | dest="summary", 24 | help="Summary file with the results", 25 | required=True, 26 | ) 27 | 28 | parser.add_argument( 29 | "-o", dest="output", help="Output tex file to write to", required=True 30 | ) 31 | return parser.parse_args() 32 | 33 | 34 | def main(): 35 | args = parse_args() 36 | with open(args.summary, "r") as fid: 37 | data = json.load(fid) 38 | 39 | fails = data["failures"] 40 | if not args.detector in fails: 41 | raise KeyError( 42 | "Detector name %s doesn't exist in failure dict" % args.detector 43 | ) 44 | perc = fails[args.detector] * 100.0 45 | 46 | with open(args.output, "w") as fid: 47 | fid.write("%.2f\\%%%%" % perc) 48 | 49 | 50 | if __name__ == "__main__": 51 | main() 52 | -------------------------------------------------------------------------------- /scripts/analysis/constant_fail_percentage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Overall failure rate of a method for a single corpus. 6 | 7 | Author: Gertjan van den Burg 8 | Copyright (c) 2018 - The Alan Turing Institute 9 | License: See the LICENSE file. 10 | 11 | """ 12 | 13 | import argparse 14 | import json 15 | 16 | def parse_args(): 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument( 19 | "-d", dest="detector", help="Detector name", required=True 20 | ) 21 | parser.add_argument( 22 | "-s", 23 | dest="summary", 24 | help="Summary file with the results", 25 | required=True, 26 | ) 27 | 28 | parser.add_argument( 29 | "-o", dest="output", help="Output tex file to write to", required=True 30 | ) 31 | return parser.parse_args() 32 | 33 | 34 | def main(): 35 | args = parse_args() 36 | with open(args.summary, "r") as fid: 37 | data = json.load(fid) 38 | 39 | fails = data["failures"] 40 | if not args.detector in fails: 41 | raise KeyError( 42 | "Detector name %s doesn't exist in failure dict" % args.detector 43 | ) 44 | perc = fails[args.detector] * 100.0 45 | 46 | with open(args.output, "w") as fid: 47 | fid.write("%.2f\\%%%%" % perc) 48 | 49 | 50 | if __name__ == "__main__": 51 | main() 52 | -------------------------------------------------------------------------------- /scripts/preprocessing/merge.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | This takes a series of detector output files and merges them into a single file 6 | with the detector name "reference". 7 | 8 | Author: Gertjan van den Burg 9 | Copyright (c) 2018 - The Alan Turing Institute 10 | License: See the LICENSE file. 
11 | 12 | """ 13 | 14 | from common.detector_result import DetectorResult 15 | 16 | 17 | def main(output_file, input_files): 18 | combined = {} 19 | for filename in input_files: 20 | with open(filename, "r") as fid: 21 | for line in fid: 22 | dr = DetectorResult.from_json(line.strip()) 23 | if dr.filename in combined: 24 | if dr.dialect == combined[dr.filename].dialect: 25 | # allow it if the dialect is the same 26 | continue 27 | else: 28 | raise KeyError( 29 | "Duplicate result for file: %s" % dr.filename 30 | ) 31 | combined[dr.filename] = dr 32 | 33 | with open(output_file, "w") as fid: 34 | for filename in sorted(combined.keys()): 35 | dr = combined[filename] 36 | dr.original_detector = dr.detector 37 | dr.detector = "reference" 38 | fid.write(dr.to_json() + "\n") 39 | -------------------------------------------------------------------------------- /scripts/analysis/constant_prop_potential_dialect.py: -------------------------------------------------------------------------------- 1 | 2 | #!/usr/bin/env python 3 | # -*- coding: utf-8 -*- 4 | 5 | """ 6 | Compute the factor that relates the size of the alphabet to the size of the set 7 | of potential dialects. 8 | 9 | To be exact, we want F in the equation: |Dialects| = F * |UniqueChars| 10 | 11 | This is averaged over both datasets in the test set. 12 | 13 | Author: Gertjan van den Burg 14 | Date: 2019-04-10 15 | 16 | """ 17 | 18 | import argparse 19 | import json 20 | 21 | 22 | def parse_args(): 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument( 25 | "-i", 26 | dest="input", 27 | help="Overview files with the results from the ``potential_dialects.py`` script.", 28 | required=True, 29 | nargs="+", 30 | ) 31 | parser.add_argument( 32 | "-o", dest="output", help="Output tex file to write to", required=True 33 | ) 34 | return parser.parse_args() 35 | 36 | 37 | def main(): 38 | args = parse_args() 39 | fracs = [] 40 | for filename in args.input: 41 | with open(filename, "r") as fid: 42 | for line in fid: 43 | data = json.loads(line.strip()) 44 | fracs.append(data["n_dialect"] / data["n_alpha"]) 45 | 46 | result = sum(fracs) / len(fracs) 47 | with open(args.output, "w") as fid: 48 | fid.write("%.1f%%" % result) 49 | 50 | 51 | if __name__ == "__main__": 52 | main() 53 | -------------------------------------------------------------------------------- /scripts/preprocessing/filter_non_normal.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Split CSV files between those where ground truth can be determined 6 | automatically (normal forms) and those that need human annotation (non-normal). 7 | 8 | Author: Gertjan van den Burg 9 | Copyright (c) 2018 - The Alan Turing Institute 10 | License: See the LICENSE file. 
11 | 12 | """ 13 | 14 | import os 15 | import json 16 | 17 | from tabulate import tabulate 18 | 19 | from .normal_forms import detect_form 20 | 21 | 22 | def main(input_dir, normal_file, non_normal_file): 23 | files = [os.path.join(input_dir, x) for x in os.listdir(input_dir)] 24 | files.sort() 25 | 26 | normal_fid = open(normal_file, "w") 27 | nonnormal_fid = open(non_normal_file, "w") 28 | 29 | counts = {} 30 | 31 | for f in files: 32 | print("[normal_form] Analyzing file: %s" % f) 33 | form_id, params = detect_form(f, record_result=False, verbose=False) 34 | 35 | if not form_id in counts: 36 | counts[form_id] = 0 37 | counts[form_id] += 1 38 | 39 | if form_id is None: 40 | nonnormal_fid.write(f + "\n") 41 | else: 42 | data = {"filename": f, "form_id": form_id, "params": params} 43 | normal_fid.write(json.dumps(data) + "\n") 44 | 45 | normal_fid.close() 46 | nonnormal_fid.close() 47 | 48 | table = [ 49 | {"form": "None" if k is None else k, "count": v} 50 | for k, v in counts.items() 51 | ] 52 | 53 | print(tabulate(table, headers="keys")) 54 | -------------------------------------------------------------------------------- /scripts/detection/our_score_type_only.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Get the best parameter set by using only the type score of our data consistency 6 | measure. 7 | 8 | Author: Gertjan van den Burg 9 | Copyright (c) 2018 - The Alan Turing Institute 10 | License: See the LICENSE file. 11 | """ 12 | 13 | from .core import run 14 | from .our_score_base import determine_dqr, get_cells, is_clean 15 | from .our_score_full import EPS_TYP 16 | 17 | 18 | DETECTOR = "our_score_type_only" 19 | 20 | 21 | def get_scores(data, dialects, verbose=False): 22 | scores = {} 23 | for dialect in sorted(dialects): 24 | cells = get_cells(data, dialect) 25 | n_clean = sum((is_clean(cell) for cell in cells)) 26 | n_cells = len(cells) 27 | 28 | if n_cells == 0: 29 | type_score = EPS_TYP 30 | else: 31 | type_score = max(EPS_TYP, n_clean / n_cells) 32 | score = type_score 33 | 34 | scores[dialect] = score 35 | 36 | if verbose: 37 | print( 38 | "%15r:\ttype = %.6f\tfinal = %s" 39 | % ( 40 | dialect, 41 | type_score, 42 | "0" if scores[dialect] == 0 else "%.6f" % scores[dialect], 43 | ) 44 | ) 45 | 46 | return scores 47 | 48 | 49 | def wrap_determine_dqr(filename, verbose=False): 50 | return determine_dqr(filename, get_scores, verbose=verbose) 51 | 52 | 53 | def main(): 54 | run(determine_dqr=wrap_determine_dqr, detector=DETECTOR) 55 | -------------------------------------------------------------------------------- /scripts/analysis/constant_n_dialect.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Count the number of dialects 6 | 7 | Author: Gertjan van den Burg 8 | Copyright (c) 2018 - The Alan Turing Institute 9 | License: See the LICENSE file. 
10 | 11 | """ 12 | 13 | import argparse 14 | 15 | from common.detector_result import Status 16 | 17 | from .core import load_detector_results 18 | 19 | 20 | def count_dialect(result_dicts): 21 | dialects = set() 22 | for reference in result_dicts: 23 | for fname in reference: 24 | res = reference[fname] 25 | if not res.status == Status.OK: 26 | continue 27 | dialects.add(res.dialect) 28 | return len(dialects) 29 | 30 | 31 | def parse_args(): 32 | parser = argparse.ArgumentParser() 33 | parser.add_argument( 34 | "-o", dest="output", help="Output tex file to write to", required=True 35 | ) 36 | parser.add_argument( 37 | "-r", 38 | dest="reference", 39 | help="Reference file for a specific corpus", 40 | required=True, 41 | nargs="+", 42 | ) 43 | 44 | return parser.parse_args() 45 | 46 | 47 | def main(): 48 | args = parse_args() 49 | result_dicts = [] 50 | for reference in args.reference: 51 | _, reference_results = load_detector_results(reference) 52 | result_dicts.append(reference_results) 53 | n_dialect = count_dialect(result_dicts) 54 | 55 | with open(args.output, "w") as fid: 56 | fid.write("%i%%" % n_dialect) 57 | 58 | 59 | if __name__ == "__main__": 60 | main() 61 | -------------------------------------------------------------------------------- /scripts/run_hypoparsr.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Bash wrapper around HypoParsr so we can kill the thing when it takes too 4 | # long. 5 | # 6 | # This is necessary because R's withTimeout can't kill C code so it's kinda 7 | # useless. 8 | # 9 | # Author: G.J.J. van den Burg 10 | # Date: 2018-09-28T09:21:05+01:00 11 | # Copyright (c) 2018 - The Alan Turing Institute 12 | # License: See the LICENSE file. 13 | # 14 | # 15 | 16 | TIMEOUT=600 # ten minutes 17 | 18 | ALL_FILE="$1" 19 | OUTPUT_FILE="$2" 20 | 21 | THIS_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )" 22 | HYPO_R="${THIS_DIR}/detection/hypo.R" 23 | 24 | if [ ! -f ${HYPO_R} ] 25 | then 26 | echo "Couldn't find hypo.R at ${HYPO_R}. Not starting." 27 | exit 1 28 | fi 29 | 30 | if [ ! -f ${OUTPUT_FILE} ] 31 | then 32 | touch ${OUTPUT_FILE} 33 | fi 34 | 35 | # catch return code 124 36 | 37 | for filename in `cat ${ALL_FILE}` 38 | do 39 | # check if it is already processed 40 | if grep -q ${filename} ${OUTPUT_FILE} 41 | then 42 | continue 43 | fi 44 | 45 | echo "[hypoparsr] Analyzing file: ${filename}" 46 | 47 | # process it with timeout 48 | res=$(timeout ${TIMEOUT} Rscript ${HYPO_R} ${filename} 2>/dev/null) 49 | 50 | # timeout retcode is 124 if timeout occurred. 51 | if [ "$?" -eq "124" ] 52 | then 53 | # timeout occurred 54 | res="{\"status\": \"FAIL\", \"status_msg\": \"TIMEOUT\", \"filename\": \"${filename}\", \"detector\": \"hypoparsr\", \"runtime\": ${TIMEOUT}, \"hostname\": \"$(hostname)\"}" 55 | fi 56 | 57 | # Strip the simpleError from the output if necessary 58 | res=$(echo "${res}" | grep -v simpleError | grep -v read_delim) 59 | 60 | echo "${res}" >> ${OUTPUT_FILE} 61 | done 62 | -------------------------------------------------------------------------------- /scripts/detection/our_score_pattern_only.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | 6 | Get the best parameter set by using only the pattern score of our data 7 | consistency measure. 8 | 9 | Author: Gertjan van den Burg 10 | Copyright (c) 2018 - The Alan Turing Institute 11 | License: See the LICENSE file. 
12 | """ 13 | 14 | from collections import Counter 15 | 16 | from .core import run 17 | from .our_score_base import determine_dqr, make_abstraction 18 | from .our_score_full import EPS_PAT 19 | 20 | 21 | DETECTOR = "our_score_pattern_only" 22 | 23 | 24 | def get_scores(data, dialects, verbose=False): 25 | scores = {} 26 | for dialect in sorted(dialects): 27 | A = make_abstraction(data, dialect) 28 | row_patterns = Counter(A.split("R")) 29 | pattern_score = 0 30 | for pat_p, n_p in row_patterns.items(): 31 | Lk = len(pat_p.split("D")) 32 | pattern_score += n_p * (max(EPS_PAT, Lk - 1) / Lk) 33 | pattern_score /= len(row_patterns) 34 | 35 | score = pattern_score 36 | scores[dialect] = score 37 | 38 | if verbose: 39 | print( 40 | "%15r:\tpattern = %.6f\tfinal = %s" 41 | % ( 42 | dialect, 43 | pattern_score, 44 | "0" if scores[dialect] == 0 else "%.6f" % scores[dialect], 45 | ) 46 | ) 47 | 48 | return scores 49 | 50 | 51 | def wrap_determine_dqr(filename, verbose=False): 52 | return determine_dqr(filename, get_scores, verbose=verbose) 53 | 54 | 55 | def main(): 56 | run(determine_dqr=wrap_determine_dqr, detector=DETECTOR) 57 | -------------------------------------------------------------------------------- /scripts/analysis/constant_improve_sniffer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Accuracy improvement of a method over sniffer. 6 | 7 | Author: Gertjan van den Burg 8 | Copyright (c) 2018 - The Alan Turing Institute 9 | License: See the LICENSE file. 10 | 11 | """ 12 | 13 | import argparse 14 | 15 | from .constant_accuracy_overall import load_and_merge, compute_accuracy_overall 16 | 17 | 18 | def parse_args(): 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument( 21 | "-r", 22 | dest="reference", 23 | help="Reference file(s) with ground truth", 24 | required=True, 25 | nargs="+", 26 | ) 27 | parser.add_argument( 28 | "-d", 29 | dest="detector", 30 | help="Detector result(s)", 31 | required=True, 32 | nargs="+", 33 | ) 34 | parser.add_argument( 35 | "-s", dest="sniffer", help="Sniffer result(s)", required=True, nargs="+" 36 | ) 37 | parser.add_argument( 38 | "-o", dest="output", help="Output tex file to write to", required=True 39 | ) 40 | return parser.parse_args() 41 | 42 | 43 | def main(): 44 | args = parse_args() 45 | reference_results = load_and_merge(args.reference) 46 | detector_results = load_and_merge(args.detector) 47 | sniffer_results = load_and_merge(args.sniffer) 48 | acc_det = compute_accuracy_overall(reference_results, detector_results) 49 | acc_snf = compute_accuracy_overall(reference_results, sniffer_results) 50 | diff = acc_det - acc_snf 51 | with open(args.output, "w") as fid: 52 | fid.write("%.1f\\%%%%" % diff) 53 | 54 | 55 | if __name__ == "__main__": 56 | main() 57 | -------------------------------------------------------------------------------- /scripts/analysis/potential_dialects.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Quick script to check the distribution of the number of dialects that we 6 | consider. 
7 | 8 | Author: Gertjan van den Burg 9 | Date: 2019-04-09 10 | 11 | """ 12 | 13 | import argparse 14 | import json 15 | 16 | from tqdm import tqdm 17 | 18 | from common.encoding import get_encoding 19 | from common.load import load_file 20 | from detection.our_score_base import get_potential_dialects 21 | 22 | 23 | def get_stats(filename): 24 | encoding = get_encoding(filename) 25 | data = load_file(filename, encoding=encoding) 26 | if data is None: 27 | return None 28 | n_alpha = len(set(data)) 29 | n_dialect = len(get_potential_dialects(data, encoding)) 30 | return dict(filename=filename, n_alpha=n_alpha, n_dialect=n_dialect) 31 | 32 | 33 | def parse_args(): 34 | parser = argparse.ArgumentParser() 35 | parser.add_argument( 36 | "-i", "--input", help="File with filenames to consider", required=True 37 | ) 38 | parser.add_argument( 39 | "-o", 40 | "--output", 41 | help="Output file to write the numbers to", 42 | required=True, 43 | ) 44 | return parser.parse_args() 45 | 46 | 47 | def main(): 48 | args = parse_args() 49 | 50 | with open(args.output, "w") as oid: 51 | with open(args.input, "r") as fid: 52 | total = sum((1 for _ in fid)) 53 | fid.seek(0) 54 | for line in tqdm(fid, total=total): 55 | filename = line.strip() 56 | s = get_stats(filename) 57 | if s is None: 58 | continue 59 | line = json.dumps(s) 60 | oid.write(line + "\n") 61 | 62 | 63 | if __name__ == "__main__": 64 | main() 65 | -------------------------------------------------------------------------------- /scripts/analysis/constant_failure_messy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Failure on messy files averaged over both corpora. 6 | 7 | Author: Gertjan van den Burg 8 | Copyright (c) 2019 - The Alan Turing Institute 9 | License: See the LICENSE file. 
10 | Date: 2019-04-15 11 | 12 | """ 13 | 14 | import argparse 15 | import json 16 | 17 | 18 | def parse_args(): 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument( 21 | "-d", dest="detector", help="Detector name", required=True 22 | ) 23 | parser.add_argument( 24 | "-s", 25 | dest="summary", 26 | help="Summary file(s) with the results", 27 | required=True, 28 | nargs="+", 29 | ) 30 | 31 | parser.add_argument( 32 | "-o", dest="output", help="Output tex file to write to", required=True 33 | ) 34 | return parser.parse_args() 35 | 36 | 37 | def main(): 38 | args = parse_args() 39 | 40 | n_messy_total = 0 41 | n_messy_correct_total = 0 42 | for summary_file in args.summary: 43 | with open(summary_file, "r") as fid: 44 | data = json.load(fid) 45 | if not args.detector in data["messy_accuracy_all"]: 46 | raise KeyError( 47 | "Detector name %s doesn't exist in messy_accuracy_all dict" 48 | % args.detector 49 | ) 50 | 51 | n_messy = data["n_files_messy"] 52 | acc_messy = data["messy_accuracy_all"][args.detector] 53 | n_messy_correct = acc_messy * n_messy 54 | 55 | n_messy_total += n_messy 56 | n_messy_correct_total += n_messy_correct 57 | 58 | perc = (n_messy_total - n_messy_correct_total) / n_messy_total * 100.0 59 | 60 | with open(args.output, "w") as fid: 61 | fid.write("%.0f\\%%%%" % perc) 62 | 63 | 64 | if __name__ == "__main__": 65 | main() 66 | -------------------------------------------------------------------------------- /scripts/analysis/constant_improve_sniffer_messy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Accuracy improvement of a method over sniffer for messy files. 6 | 7 | Author: Gertjan van den Burg 8 | Copyright (c) 2018 - The Alan Turing Institute 9 | License: See the LICENSE file. 
10 | 11 | """ 12 | 13 | import argparse 14 | import json 15 | import math 16 | 17 | 18 | def parse_args(): 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument( 21 | "--round-up", help="Whether or not to round up", action="store_true" 22 | ) 23 | parser.add_argument( 24 | "-s", dest="summary", help="Summary file(s)", required=True, nargs="+" 25 | ) 26 | parser.add_argument( 27 | "-o", dest="output", help="Output tex file to write to", required=True 28 | ) 29 | return parser.parse_args() 30 | 31 | 32 | def main(): 33 | args = parse_args() 34 | 35 | n_messy_tot = 0 36 | n_messy_correct_ours = 0 37 | n_messy_correct_snif = 0 38 | 39 | for summary_file in args.summary: 40 | with open(summary_file, "r") as fid: 41 | data = json.load(fid) 42 | 43 | n_messy = data["n_files_messy"] 44 | acc_messy_ours = data["messy_accuracy_all"]["our_score_full"] 45 | acc_messy_snif = data["messy_accuracy_all"]["sniffer"] 46 | 47 | n_messy_tot += n_messy 48 | n_messy_correct_ours += acc_messy_ours * n_messy 49 | n_messy_correct_snif += acc_messy_snif * n_messy 50 | 51 | acc_ours = n_messy_correct_ours / n_messy_tot 52 | acc_snif = n_messy_correct_snif / n_messy_tot 53 | 54 | improv = (acc_ours - acc_snif) * 100 55 | 56 | with open(args.output, "w") as fid: 57 | if args.round_up: 58 | fid.write("%.0f\\%%%%" % math.ceil(improv)) 59 | else: 60 | fid.write("%.1f\\%%%%" % improv) 61 | 62 | 63 | if __name__ == "__main__": 64 | main() 65 | -------------------------------------------------------------------------------- /scripts/analysis/table_accuracy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Convert summary data to a latex table 6 | 7 | Author: Gertjan van den Burg 8 | 9 | """ 10 | 11 | import json 12 | import argparse 13 | 14 | from .core import ( 15 | ORDERED_DETECTORS, 16 | TABLE_SPEC, 17 | clean_detector_name, 18 | check_detectors, 19 | ) 20 | from .latex import build_latex_table 21 | 22 | 23 | def create_table(results, output_file): 24 | table = [] 25 | for prop in results: 26 | row = [prop.capitalize()] 27 | check_detectors(results[prop].keys()) 28 | for key in ORDERED_DETECTORS: 29 | row.append(results[prop][key] * 100.0) 30 | table.append(row) 31 | 32 | headers = ["Property"] + list(map(clean_detector_name, ORDERED_DETECTORS)) 33 | 34 | with open(output_file, "w") as fid: 35 | fid.write( 36 | build_latex_table( 37 | table, headers, floatfmt=".2f", table_spec=TABLE_SPEC 38 | ) 39 | ) 40 | 41 | 42 | def parse_args(): 43 | parser = argparse.ArgumentParser() 44 | parser.add_argument( 45 | "type", 46 | choices=["all", "human", "normal"], 47 | help="Subset of data to generate plot for", 48 | default="all", 49 | ) 50 | parser.add_argument( 51 | "-o", dest="output", help="Output tex file to write to", required=True 52 | ) 53 | parser.add_argument( 54 | "-s", 55 | dest="summary", 56 | help="Summary file with the results", 57 | required=True, 58 | ) 59 | 60 | return parser.parse_args() 61 | 62 | 63 | def main(): 64 | args = parse_args() 65 | with open(args.summary, "r") as fid: 66 | data = json.load(fid) 67 | 68 | key = "detection_accuracy_" + args.type 69 | if not key in data: 70 | raise ValueError("Can't find key %s in file %s" % (key, args.summary)) 71 | 72 | create_table(data[key], args.output) 73 | -------------------------------------------------------------------------------- /scripts/analysis_constants.py: -------------------------------------------------------------------------------- 1 | 
#!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Wrapper for constants generation. 6 | 7 | The constants are generated with separate Python scripts available in the 8 | ``analysis`` directory. This file provides a wrapper. See the scripts of each 9 | of the different constants for more info. 10 | 11 | Author: Gertjan van den Burg 12 | Copyright (c) 2018 - The Alan Turing Institute 13 | License: See the LICENSE file. 14 | 15 | """ 16 | 17 | import sys 18 | 19 | from analysis import ( 20 | constant_n_dialect, 21 | constant_n_files, 22 | constant_n_incorrect_prop, 23 | constant_accuracy_overall, 24 | constant_improve_sniffer, 25 | constant_improve_sniffer_messy, 26 | constant_fail_percentage, 27 | constant_failure, 28 | constant_failure_messy, 29 | constant_prop_potential_dialect, 30 | constant_known_type, 31 | ) 32 | 33 | 34 | def main(): 35 | const_name = sys.argv.pop(1) 36 | if const_name == "n_dialect": 37 | constant_n_dialect.main() 38 | elif const_name == "n_files": 39 | constant_n_files.main() 40 | elif const_name == "accuracy_overall": 41 | constant_accuracy_overall.main() 42 | elif const_name == "improve_sniffer": 43 | constant_improve_sniffer.main() 44 | elif const_name == "improve_sniffer_messy": 45 | constant_improve_sniffer_messy.main() 46 | elif const_name == "failure": 47 | constant_failure.main() 48 | elif const_name == "fail_percentage": 49 | constant_fail_percentage.main() 50 | elif const_name == "num_incorrect_prop": 51 | constant_n_incorrect_prop.main() 52 | elif const_name == "prop_potential_dialect": 53 | constant_prop_potential_dialect.main() 54 | elif const_name == "fail_percentage_messy": 55 | constant_failure_messy.main() 56 | elif const_name == "known_type": 57 | constant_known_type.main() 58 | else: 59 | raise ValueError("Unknown constant: %s" % const_name) 60 | 61 | 62 | if __name__ == "__main__": 63 | main() 64 | -------------------------------------------------------------------------------- /scripts/preprocessing/extract_normals.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | This script extracts the detected normal forms into an output file that can be 6 | used for the comparison. 7 | 8 | Author: Gertjan van den Burg 9 | Copyright (c) 2018 - The Alan Turing Institute 10 | License: See the LICENSE file. 
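Each line of the input file is expected to be a JSON record, roughly of the following form (values are illustrative):

    {"filename": "data/example.csv", "form_id": "form_3",
     "params": {"delim": ",", "quotechar": "\"", "escapechar": ""}}

A ``form_id`` equal to ``"FAIL"`` marks a file that could not be read; such files are written out with a FAIL status instead of a dialect.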
11 | 12 | """ 13 | 14 | import json 15 | 16 | from tqdm import tqdm 17 | 18 | from common.dialect import Dialect 19 | from common.detector_result import DetectorResult, Status, StatusMsg 20 | 21 | 22 | def load_normals(filename): 23 | data = [] 24 | with open(filename, "r") as fid: 25 | for line in fid: 26 | data.append(json.loads(line.strip())) 27 | return data 28 | 29 | 30 | def main(normal_file, output_file): 31 | normals = load_normals(normal_file) 32 | 33 | results = {} 34 | for entry in tqdm(normals): 35 | filename = entry["filename"] 36 | form_id = entry["form_id"] 37 | params = entry["params"] 38 | 39 | if form_id == "FAIL": 40 | # unreadable file 41 | dr = DetectorResult( 42 | detector="normal", 43 | filename=filename, 44 | status=Status.FAIL, 45 | status_msg=StatusMsg.UNREADABLE, 46 | ) 47 | else: 48 | dialect = Dialect( 49 | delimiter=params["delim"], 50 | quotechar=params["quotechar"], 51 | escapechar=params["escapechar"], 52 | ) 53 | 54 | dr = DetectorResult( 55 | detector="normal", 56 | dialect=dialect, 57 | filename=filename, 58 | status=Status.OK, 59 | ) 60 | 61 | if filename in results: 62 | raise KeyError("Filename %s already exists, duplicate!" % filename) 63 | 64 | results[filename] = dr 65 | 66 | with open(output_file, "w") as fid: 67 | for filename in sorted(results.keys()): 68 | fid.write(results[filename].to_json() + "\n") 69 | 70 | print("All done.") 71 | -------------------------------------------------------------------------------- /scripts/analysis/constant_failure.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Percentage of failure cases that were because of no_results or timeout. 6 | 7 | Author: Gertjan van den Burg 8 | Copyright (c) 2018 - The Alan Turing Institute 9 | License: See the LICENSE file. 
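This script is normally run through the ``analysis_constants.py`` wrapper (constant name ``failure``). An illustrative direct invocation from the ``scripts`` directory (file names made up):

    python -m analysis.constant_failure -d sniffer_github.json sniffer_ukdata.json -r timeout -o out.tex

The output is a single percentage written as LaTeX, e.g. ``12.5\%`` followed by a trailing ``%`` that suppresses spurious whitespace when the value is ``\input`` into the paper.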
10 | 11 | """ 12 | 13 | import argparse 14 | 15 | from common.detector_result import StatusMsg, Status 16 | 17 | from .constant_accuracy_overall import load_and_merge 18 | 19 | 20 | def parse_args(): 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument( 23 | "-d", 24 | dest="detector", 25 | help="Detector result(s)", 26 | required=True, 27 | nargs="+", 28 | ) 29 | parser.add_argument( 30 | "-r", 31 | dest="reason", 32 | help="Reason for failure", 33 | choices=["no_results", "timeout"], 34 | required=True, 35 | ) 36 | parser.add_argument( 37 | "-o", dest="output", help="Output tex file to write to", required=True 38 | ) 39 | return parser.parse_args() 40 | 41 | 42 | def main(): 43 | args = parse_args() 44 | detector_results = load_and_merge(args.detector) 45 | n_failure = sum( 46 | (1 for x in detector_results.values() if x.status == Status.FAIL) 47 | ) 48 | if args.reason == "no_results": 49 | n_with_reason = sum( 50 | ( 51 | 1 52 | for x in detector_results.values() 53 | if x.status == Status.FAIL 54 | and x.status_msg == StatusMsg.NO_RESULTS 55 | ) 56 | ) 57 | elif args.reason == "timeout": 58 | n_with_reason = sum( 59 | ( 60 | 1 61 | for x in detector_results.values() 62 | if x.status == Status.FAIL and x.status_msg == StatusMsg.TIMEOUT 63 | ) 64 | ) 65 | else: 66 | raise ValueError 67 | 68 | prop = n_with_reason / n_failure * 100 69 | with open(args.output, "w") as fid: 70 | fid.write("%.1f\\%%%%" % prop) 71 | 72 | 73 | if __name__ == "__main__": 74 | main() 75 | -------------------------------------------------------------------------------- /scripts/analysis/constant_accuracy_overall.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Overall accuracy of a method averaged over multiple corpora. 6 | 7 | Author: Gertjan van den Burg 8 | Copyright (c) 2018 - The Alan Turing Institute 9 | License: See the LICENSE file. 
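The accuracy is the fraction of reference files with an OK ground truth for which the detected dialect matches the reference exactly; reference and detector results from multiple corpora are merged before the accuracy is computed. An illustrative invocation (file names made up):

    python -m analysis.constant_accuracy_overall -r ref_github.json ref_ukdata.json -d our_full_github.json our_full_ukdata.json -o out.tex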
10 | 11 | """ 12 | 13 | import argparse 14 | import sys 15 | 16 | from common.detector_result import Status 17 | 18 | from .core import load_detector_results 19 | 20 | def parse_args(): 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument( 23 | "-r", 24 | dest="reference", 25 | help="Reference file(s) with ground truth", 26 | required=True, 27 | nargs="+", 28 | ) 29 | parser.add_argument( 30 | "-d", 31 | dest="detector", 32 | help="Detector result(s)", 33 | required=True, 34 | nargs="+", 35 | ) 36 | parser.add_argument( 37 | "-o", dest="output", help="Output tex file to write to", required=True 38 | ) 39 | return parser.parse_args() 40 | 41 | 42 | def load_and_merge(filenames): 43 | results = {} 44 | for res_file in filenames: 45 | _, res = load_detector_results(res_file) 46 | for fname in res: 47 | if fname in results: 48 | print( 49 | "Error: duplicate result for file %s" % fname, 50 | file=sys.stderr, 51 | ) 52 | raise SystemExit 53 | results[fname] = res[fname] 54 | return results 55 | 56 | 57 | def compute_accuracy_overall(ref_results, det_results): 58 | total = 0 59 | correct = 0 60 | for fname in ref_results: 61 | ref = ref_results[fname] 62 | if not ref.status == Status.OK: 63 | continue 64 | total += 1 65 | det = det_results[fname] 66 | if not det.status == Status.OK: 67 | continue 68 | correct += ref.dialect == det.dialect 69 | return correct / total * 100 70 | 71 | 72 | def main(): 73 | args = parse_args() 74 | reference_results = load_and_merge(args.reference) 75 | detector_results = load_and_merge(args.detector) 76 | acc = compute_accuracy_overall(reference_results, detector_results) 77 | with open(args.output, "w") as fid: 78 | fid.write("%.0f\\%%%%" % acc) 79 | 80 | 81 | if __name__ == "__main__": 82 | main() 83 | -------------------------------------------------------------------------------- /scripts/analysis/constant_known_type.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Compute the percentage of cells with a known type. 6 | 7 | Author: Gertjan van den Burg 8 | Copyright (c) 2019 - The Alan Turing Institute 9 | License: See the LICENSE file. 
10 | Date: 2019-04-15 11 | 12 | """ 13 | 14 | import argparse 15 | import multiprocessing 16 | 17 | from tqdm import tqdm 18 | 19 | from common.encoding import get_encoding 20 | from common.load import load_file 21 | from detection.our_score_base import is_clean, get_cells 22 | from common.detector_result import Status 23 | 24 | from .core import load_detector_results 25 | 26 | 27 | def parse_args(): 28 | parser = argparse.ArgumentParser() 29 | parser.add_argument( 30 | "-n", 31 | "--n-jobs", 32 | help="Number of parallel jobs", 33 | default=6, type=int, 34 | dest="n_jobs", 35 | ) 36 | parser.add_argument( 37 | "-r", 38 | "--reference", 39 | help="Reference file(s) with ground truth", 40 | nargs="+", 41 | required=True, 42 | ) 43 | parser.add_argument("-o", "--output", help="Output file") 44 | return parser.parse_args() 45 | 46 | 47 | def _worker(res_ref): 48 | filename = res_ref.filename 49 | encoding = get_encoding(filename) 50 | data = load_file(filename, encoding=encoding) 51 | if data is None: 52 | return (0, 0)  # unreadable file: contributes no cells 53 | 54 | cells = get_cells(data, res_ref.dialect) 55 | n_clean = sum((is_clean(cell) for cell in cells)) 56 | n_cells = len(cells) 57 | return (n_clean, n_cells) 58 | 59 | 60 | def main(): 61 | args = parse_args() 62 | 63 | reference_results = {} 64 | for reference in args.reference: 65 | _, ref_results = load_detector_results(reference) 66 | reference_results.update(ref_results) 67 | 68 | n_cells = 0 69 | n_clean = 0 70 | 71 | only_ok = { 72 | k: v for k, v in reference_results.items() if v.status == Status.OK 73 | } 74 | 75 | with multiprocessing.Pool(args.n_jobs) as pool: 76 | with tqdm(total=len(only_ok)) as pbar: 77 | for n_clean_x, n_cells_x in pool.imap_unordered( 78 | _worker, only_ok.values() 79 | ): 80 | n_clean += n_clean_x 81 | n_cells += n_cells_x 82 | pbar.update() 83 | 84 | perc = n_clean / n_cells * 100 85 | with open(args.output, "w") as fid: 86 | fid.write("%.1f\\%%%%" % perc) 87 | 88 | 89 | if __name__ == "__main__": 90 | main() 91 | -------------------------------------------------------------------------------- /scripts/common/dialect.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Definitions for a Dialect object. 6 | 7 | Author: Gertjan van den Burg 8 | Copyright (c) 2018 - The Alan Turing Institute 9 | License: See the LICENSE file. 
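A Dialect is a small value object; a minimal illustration:

    d = Dialect(delimiter=",", quotechar='"', escapechar="")
    d.to_dict()
    # {'delimiter': ',', 'quotechar': '"', 'escapechar': ''}
    Dialect.from_dict(d.to_dict()) == d
    # True

Components that are absent are represented by the empty string rather than by None (``validate`` rejects None).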
10 | 11 | """ 12 | 13 | 14 | import sys 15 | 16 | from functools import total_ordering 17 | 18 | ATTRIBUTES = ["delimiter", "quotechar", "escapechar"] 19 | 20 | 21 | @total_ordering 22 | class Dialect(object): 23 | def __init__(self, delimiter, quotechar, escapechar): 24 | self.delimiter = delimiter 25 | self.quotechar = quotechar 26 | self.escapechar = escapechar 27 | 28 | def validate(self): 29 | if self.delimiter is None or len(self.delimiter) > 1: 30 | raise ValueError( 31 | "Delimiter should be zero or one characters, got: %r" 32 | % self.delimiter 33 | ) 34 | if self.quotechar is None or len(self.quotechar) > 1: 35 | raise ValueError( 36 | "Quotechar should be zero or one characters, got: %r" 37 | % self.quotechar 38 | ) 39 | if self.escapechar is None or len(self.escapechar) > 1: 40 | raise ValueError( 41 | "Escapechar should be zero or one characters, got: %r" 42 | % self.escapechar 43 | ) 44 | if self.quotechar in ["Q", "A"]: 45 | print( 46 | "Warning: quotechar is 'Q' or 'A', probably a mistake.", 47 | file=sys.stderr, 48 | ) 49 | 50 | @classmethod 51 | def from_dict(cls, d): 52 | d = cls(d["delimiter"], d["quotechar"], d["escapechar"]) 53 | return d 54 | 55 | def to_dict(self): 56 | self.validate() 57 | d = dict(delimiter=self.delimiter, quotechar=self.quotechar, 58 | escapechar=self.escapechar) 59 | return d 60 | 61 | def __repr__(self): 62 | return "(%r, %r, %r)" % ( 63 | self.delimiter, 64 | self.quotechar, 65 | self.escapechar, 66 | ) 67 | 68 | def __key(self): 69 | return (self.delimiter, self.quotechar, self.escapechar) 70 | 71 | def __hash__(self): 72 | return hash(self.__key()) 73 | 74 | def __eq__(self, other): 75 | if not isinstance(other, Dialect): 76 | return False 77 | return self.__key() == other.__key() 78 | 79 | def __lt__(self, other): 80 | if not isinstance(other, Dialect): 81 | return -1 82 | return self.__key() < other.__key() 83 | -------------------------------------------------------------------------------- /scripts/detection/our_score_full.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | 6 | Get the best parameter set by using the data consistency measure. 7 | 8 | Author: Gertjan van den Burg 9 | Copyright (c) 2018 - The Alan Turing Institute 10 | License: See the LICENSE file. 11 | """ 12 | 13 | from collections import Counter 14 | 15 | from .core import run 16 | from .our_score_base import determine_dqr, get_cells, is_clean, make_abstraction 17 | 18 | DETECTOR = "our_score_full" 19 | 20 | # The value of EPS_PAT is tricky, because if we choose it too high it may give 21 | # too many false single-column files. This value seems to work quite well. 22 | EPS_PAT = 1e-3 23 | EPS_TYP = 1e-10 24 | 25 | 26 | def get_scores(data, dialects, verbose=False): 27 | scores = {} 28 | max_score = -float("inf") 29 | for dialect in sorted(dialects): 30 | A = make_abstraction(data, dialect) 31 | row_patterns = Counter(A.split("R")) 32 | pattern_score = 0 33 | for pat_p, n_p in row_patterns.items(): 34 | Lk = len(pat_p.split("D")) 35 | pattern_score += n_p * (max(EPS_PAT, Lk - 1) / Lk) 36 | pattern_score /= len(row_patterns) 37 | 38 | if pattern_score == 0: 39 | # if pattern score is zero, the outcome will be zero, so we 40 | # don't have to check types. 
41 | type_score = float("nan") 42 | score = 0 43 | elif pattern_score < max_score: 44 | # since the type score is in [0, 1], if the pattern score 45 | # is smaller than the current best score, it can't possibly 46 | # be improved by types, so we don't have to bother. 47 | type_score = float("nan") 48 | score = 0 49 | else: 50 | cells = get_cells(data, dialect) 51 | n_clean = sum((is_clean(cell) for cell in cells)) 52 | n_cells = len(cells) 53 | 54 | if n_cells == 0: 55 | type_score = EPS_TYP 56 | else: 57 | type_score = max(EPS_TYP, n_clean / n_cells) 58 | score = type_score * pattern_score 59 | 60 | scores[dialect] = score 61 | max_score = max(max_score, score) 62 | 63 | if verbose: 64 | print( 65 | "%15r:\ttype = %.6f\tpattern = %.6f\tfinal = %s" 66 | % ( 67 | dialect, 68 | type_score, 69 | pattern_score, 70 | "0" if scores[dialect] == 0 else "%.6f" % scores[dialect], 71 | ) 72 | ) 73 | 74 | return scores 75 | 76 | 77 | def wrap_determine_dqr(filename, verbose=False): 78 | return determine_dqr(filename, get_scores, verbose=verbose) 79 | 80 | 81 | def main(): 82 | run(determine_dqr=wrap_determine_dqr, detector=DETECTOR) 83 | -------------------------------------------------------------------------------- /scripts/analysis/figure_box_plot.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | 6 | Author: Gertjan van den Burg 7 | 8 | """ 9 | 10 | import argparse 11 | import json 12 | import numpy as np 13 | import os 14 | 15 | from .core import ORDERED_DETECTORS, check_detectors, clean_detector_name 16 | from .latex import build_latex_doc 17 | 18 | 19 | def create_box_and_whisker_plot(runtimes, output_file): 20 | check_detectors(runtimes.keys()) 21 | abbrev = [clean_detector_name(d) for d in ORDERED_DETECTORS] 22 | 23 | xtick = ",".join([str(i + 1) for i in range(len(abbrev))]) 24 | xticklabels = ",".join(abbrev) 25 | 26 | tex = ( 27 | "\\documentclass[preview=true]{standalone}\n" 28 | "\\pdfinfoomitdate=1\n" 29 | "\\pdftrailerid{}\n" 30 | "\\pdfsuppressptexinfo=1\n" 31 | "\\usepackage{tikz}\n" 32 | "\\usepackage{pgfplots}\n" 33 | "\\pgfplotsset{compat=1.16}\n" 34 | "\\usepgfplotslibrary{statistics}\n" 35 | "\\begin{document}\n" 36 | "\\begin{tikzpicture}\n" 37 | "\\begin{semilogyaxis}[\n" 38 | "boxplot/draw direction=y,\n" 39 | "xtick={%s},\n" 40 | "xticklabels={%s},\n" 41 | "ylabel={Runtime (s)},\n" 42 | "width=500pt,\n" 43 | "height=200pt\n" 44 | "]\n" % (xtick, xticklabels) 45 | ) 46 | 47 | for detector in ORDERED_DETECTORS: 48 | rt = runtimes[detector] 49 | q1, median, q3 = np.percentile(rt, [25, 50, 75]) 50 | upper_whisker = max(rt) 51 | lower_whisker = min(rt) 52 | boxplot_tex = ( 53 | "\\addplot+[\n" 54 | "\tdraw=black,\n" 55 | "\tsolid,\n" 56 | "\tboxplot prepared={\n" 57 | "\t\tmedian=%f,\n" 58 | "\t\tlower quartile=%f,\n" 59 | "\t\tupper quartile=%f,\n" 60 | "\t\tupper whisker=%f,\n" 61 | "\t\tlower whisker=%f\n" 62 | "},\n" 63 | "] coordinates {};\n" 64 | % (median, q1, q3, upper_whisker, lower_whisker) 65 | ) 66 | tex += boxplot_tex 67 | 68 | tex += "\\end{semilogyaxis}\n" "\\end{tikzpicture}\n" "\\end{document}" 69 | 70 | tex_file = os.path.splitext(output_file)[0] + ".tex" 71 | with open(tex_file, "w") as fid: 72 | fid.write(tex) 73 | 74 | build_latex_doc(tex, output_name=output_file) 75 | 76 | 77 | def parse_args(): 78 | parser = argparse.ArgumentParser() 79 | parser.add_argument( 80 | "-o", dest="output", help="Output pdf file to write to", required=True 81 | ) 82 | 
parser.add_argument( 83 | "-s", 84 | dest="summary", 85 | help="Summary file with the input data", 86 | required=True, 87 | ) 88 | return parser.parse_args() 89 | 90 | 91 | def main(): 92 | args = parse_args() 93 | with open(args.summary, "r") as fid: 94 | summary = json.load(fid) 95 | 96 | create_box_and_whisker_plot(summary["runtimes"], args.output) 97 | -------------------------------------------------------------------------------- /scripts/detection/our_score_full_no_tie.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | 6 | Get the best parameter set by using our data consistency measure. 7 | 8 | This variation does not break ties. 9 | 10 | Author: Gertjan van den Burg 11 | Copyright (c) 2018 - The Alan Turing Institute 12 | License: See the LICENSE file. 13 | """ 14 | 15 | from collections import Counter 16 | 17 | from .core import run 18 | from .our_score_base import ( 19 | determine_dqr, 20 | get_cells, 21 | is_clean, 22 | make_abstraction, 23 | ) 24 | 25 | 26 | DETECTOR = "our_score_full_no_tie" 27 | 28 | # The value of EPS_PAT is tricky, because if we choose it too high it may give 29 | # too many false single-column files. This value seems to work quite well. 30 | EPS_PAT = 1e-3 31 | EPS_TYP = 1e-10 32 | 33 | 34 | def get_scores(data, dialects, verbose=False): 35 | scores = {} 36 | max_score = -float("inf") 37 | for dialect in sorted(dialects): 38 | A = make_abstraction(data, dialect) 39 | row_patterns = Counter(A.split("R")) 40 | pattern_score = 0 41 | for pat_p, n_p in row_patterns.items(): 42 | Lk = len(pat_p.split("D")) 43 | pattern_score += n_p * (max(EPS_PAT, Lk - 1) / Lk) 44 | pattern_score /= len(row_patterns) 45 | 46 | if pattern_score == 0: 47 | # if pattern score is zero, the outcome will be zero, so we 48 | # don't have to check types. 49 | type_score = float("nan") 50 | score = 0 51 | elif pattern_score < max_score: 52 | # since the type score is in [0, 1], if the pattern score 53 | # is smaller than the current best score, it can't possibly 54 | # be improved by types, so we don't have to bother. 55 | type_score = float("nan") 56 | score = 0 57 | else: 58 | cells = get_cells(data, dialect) 59 | n_clean = sum((is_clean(cell) for cell in cells)) 60 | n_cells = len(cells) 61 | 62 | if n_cells == 0: 63 | type_score = EPS_TYP 64 | else: 65 | type_score = max(EPS_TYP, n_clean / n_cells) 66 | score = type_score * pattern_score 67 | 68 | scores[dialect] = score 69 | max_score = max(max_score, score) 70 | 71 | if verbose: 72 | print( 73 | "%15r:\ttype = %.6f\tpattern = %.6f\tfinal = %s" 74 | % ( 75 | dialect, 76 | type_score, 77 | pattern_score, 78 | "0" if scores[dialect] == 0 else "%.6f" % scores[dialect], 79 | ) 80 | ) 81 | 82 | return scores 83 | 84 | 85 | def wrap_determine_dqr(filename, verbose=False): 86 | return determine_dqr( 87 | filename, get_scores, verbose=verbose, do_break_ties=False 88 | ) 89 | 90 | 91 | def main(): 92 | run(determine_dqr=wrap_determine_dqr, detector=DETECTOR) 93 | -------------------------------------------------------------------------------- /scripts/analysis/table_std_messy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Generate a table with accuracies showing standard/non-standard split. 
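The table has one row each for standard files, messy files, and the total (with file counts) and one column per detector, as sketched below. The numbers are read from a summary JSON file; an illustrative invocation (file names made up):

    python -m analysis.table_std_messy -s summary_github.json -o std_messy.tex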
6 | 7 | | Sniff | Suit | Hypo | Our (full) | 8 | -------------------------------------------- 9 | Std (N) | | | | | 10 | NStd (N)| | | | | 11 | Totl (N)| | | | | 12 | 13 | 14 | Author: Gertjan van den Burg 15 | Date: 2018-11-18 16 | 17 | """ 18 | 19 | import argparse 20 | import json 21 | 22 | from .core import ( 23 | ORDERED_DETECTORS, 24 | TABLE_SPEC, 25 | clean_detector_name, 26 | check_detectors, 27 | ) 28 | from .latex import build_latex_table 29 | 30 | 31 | def create_table(results, output_file): 32 | n_standard = results["n_files_standard"] 33 | n_messy = results["n_files_messy"] 34 | n_total = results["n_files_all"] 35 | assert n_total == n_standard + n_messy 36 | 37 | row_std = ["Standard (%i)" % n_standard] 38 | row_mes = ["Messy (%i)" % n_messy] 39 | row_tot = ["Total (%i)" % n_total] 40 | 41 | check_detectors(results["standard_accuracy_all"].keys()) 42 | check_detectors(results["messy_accuracy_all"].keys()) 43 | check_detectors(results["detection_accuracy_all"]["overall"].keys()) 44 | 45 | for key in ORDERED_DETECTORS: 46 | row_std.append(results["standard_accuracy_all"][key] * 100.0) 47 | row_mes.append(results["messy_accuracy_all"][key] * 100.0) 48 | row_tot.append( 49 | results["detection_accuracy_all"]["overall"][key] * 100.0 50 | ) 51 | 52 | headers = [""] + list(map(clean_detector_name, ORDERED_DETECTORS)) 53 | 54 | table = [row_std, row_mes, row_tot] 55 | with open(output_file, "w") as fid: 56 | fid.write( 57 | build_latex_table( 58 | table, headers, floatfmt=".2f", table_spec=TABLE_SPEC 59 | ) 60 | ) 61 | 62 | 63 | def parse_args(): 64 | parser = argparse.ArgumentParser("Create standard/non-standard table") 65 | parser.add_argument( 66 | "-o", dest="output", help="Output tex file to write to", required=True 67 | ) 68 | parser.add_argument( 69 | "-s", 70 | dest="summary", 71 | help="Summary file with the results", 72 | required=True, 73 | ) 74 | 75 | return parser.parse_args() 76 | 77 | 78 | def main(): 79 | args = parse_args() 80 | with open(args.summary, "r") as fid: 81 | data = json.load(fid) 82 | 83 | needed_keys = [ 84 | "n_files_standard", 85 | "n_files_messy", 86 | "standard_accuracy_all", 87 | "messy_accuracy_all", 88 | ] 89 | for key in needed_keys: 90 | if not key in data: 91 | raise ValueError( 92 | "Required key '%s' not present in summary file." 
% key 93 | ) 94 | 95 | create_table(data, args.output) 96 | 97 | 98 | if __name__ == "__main__": 99 | main() 100 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # High version so we get updated version of texlive 2 | FROM ubuntu:20.04 3 | 4 | # Install base packages 5 | RUN apt-get update && \ 6 | DEBIAN_FRONTEND=noninteractive apt-get install -y tzdata && \ 7 | apt-get remove -y python && \ 8 | apt-get install -y --no-install-recommends \ 9 | git \ 10 | build-essential \ 11 | r-base \ 12 | latexmk \ 13 | texlive-latex-extra \ 14 | texlive-pictures 15 | 16 | # Install R package dependencies that are available on Ubuntu 17 | RUN apt-get install -y --no-install-recommends \ 18 | r-cran-ape r-cran-assertthat r-cran-backports \ 19 | r-cran-base64enc r-cran-bit r-cran-bit64 \ 20 | r-cran-bitops r-cran-blob r-cran-brew \ 21 | r-cran-class r-cran-cli r-cran-codetools \ 22 | r-cran-coin r-cran-colorspace r-cran-crayon \ 23 | r-cran-curl r-cran-data.table r-cran-dbi \ 24 | r-cran-desc r-cran-devtools r-cran-digest \ 25 | r-cran-doparallel r-cran-downloader r-cran-dplyr \ 26 | r-cran-e1071 r-cran-evaluate r-cran-evd \ 27 | r-cran-fastmatch r-cran-foreach r-cran-formula \ 28 | r-cran-ggplot2 r-cran-git2r r-cran-glue \ 29 | r-cran-gridbase r-cran-gridextra r-cran-gtable \ 30 | r-cran-highr r-cran-hms r-cran-htmltools \ 31 | r-cran-htmlwidgets r-cran-httpuv r-cran-httr \ 32 | r-cran-igraph r-cran-ipred r-cran-iterators \ 33 | r-cran-jsonlite r-cran-kernsmooth r-cran-knitr \ 34 | r-cran-labeling r-cran-lattice r-cran-lava \ 35 | r-cran-lazyeval r-cran-magrittr r-cran-markdown \ 36 | r-cran-mass r-cran-matrix r-cran-matrixstats \ 37 | r-cran-memoise r-cran-mgcv r-cran-mime \ 38 | r-cran-mockery r-cran-modeltools r-cran-multcomp \ 39 | r-cran-munsell r-cran-mvtnorm r-cran-nlme \ 40 | r-cran-nnet r-cran-numderiv r-cran-openssl \ 41 | r-cran-pillar r-cran-pkgconfig r-cran-plogr \ 42 | r-cran-plyr r-cran-praise r-cran-prettyunits \ 43 | r-cran-prodlim r-cran-purrr r-cran-r6 \ 44 | r-cran-rcolorbrewer r-cran-rcpp r-cran-rcurl \ 45 | r-cran-readr r-cran-rematch r-cran-reshape2 \ 46 | r-cran-rjson r-cran-rlang r-cran-rpart \ 47 | r-cran-rprojroot r-cran-rsqlite r-cran-rstudioapi \ 48 | r-cran-runit r-cran-sandwich r-cran-scales \ 49 | r-cran-shiny r-cran-sourcetools r-cran-stringi \ 50 | r-cran-stringr r-cran-strucchange r-cran-survival \ 51 | r-cran-testthat r-cran-th.data r-cran-tibble \ 52 | r-cran-tidyr r-cran-tidyselect r-cran-utf8 \ 53 | r-cran-uuid r-cran-viridis r-cran-viridislite \ 54 | r-cran-whisker r-cran-withr r-cran-xml \ 55 | r-cran-xml2 r-cran-xtable r-cran-yaml \ 56 | r-cran-zoo 57 | 58 | # Deal with the Python2/3 situation 59 | RUN apt-get install -y --no-install-recommends \ 60 | python3 \ 61 | python3-dev \ 62 | python3-pip && \ 63 | pip3 install --no-cache-dir --upgrade pip setuptools && \ 64 | echo "alias python='python3'" >> /root/.bash_aliases && \ 65 | echo "alias pip='pip3'" >> /root/.bash_aliases && \ 66 | cd /usr/local/bin && ln -s /usr/bin/python3 python 67 | 68 | # Clone the repo 69 | RUN git clone https://github.com/alan-turing-institute/CSV_Wrangling 70 | 71 | # Install dependencies 72 | RUN pip install -r CSV_Wrangling/requirements.txt 73 | RUN ./CSV_Wrangling/utils/install_R_packages.sh CSV_Wrangling/Rpackages.txt 74 | 75 | WORKDIR CSV_Wrangling 76 | -------------------------------------------------------------------------------- 
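A typical way to use the image defined above (the image tag is illustrative; the repository is cloned into the image at build time, so the experiments can be run directly inside the container):

    $ docker build -t csv-wrangling .
    $ docker run -it csv-wrangling make output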
/scripts/analysis/figure_bar_plot.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Convert summary data to a bar plot. 6 | 7 | Author: Gertjan van den Burg 8 | 9 | """ 10 | 11 | import json 12 | import os 13 | import argparse 14 | 15 | from .core import ( 16 | ORDERED_DETECTORS, 17 | ORDERED_PROP, 18 | check_detectors, 19 | clean_detector_name, 20 | ) 21 | from .latex import build_latex_doc 22 | 23 | 24 | def create_prop_graph(results, output_file): 25 | for prop in results: 26 | check_detectors(results[prop].keys()) 27 | 28 | abbrev = [clean_detector_name(d) for d in ORDERED_DETECTORS] 29 | tex = ( 30 | "\\documentclass[preview=true]{standalone}\n" 31 | "\\pdfinfoomitdate=1\n" 32 | "\\pdftrailerid{}\n" 33 | "\\pdfsuppressptexinfo=1\n" 34 | "\\usepackage{tikz}\n" 35 | "\\usepackage{pgfplots}\n" 36 | "\\pgfplotsset{compat=1.16}\n" 37 | "\\begin{document}\n" 38 | "\\begin{tikzpicture}\n" 39 | "\\begin{axis}[\n" 40 | "\tybar,\n" 41 | "\twidth={400},\n" 42 | "\theight={200},\n" 43 | "\tymin=0,\n" 44 | "\tlegend style={at={(0.5,-0.15)}, anchor=north, legend columns=-1},\n" 45 | "\tylabel={Accuracy (\\%%)},\n" 46 | "\tsymbolic x coords={%s},\n" 47 | "\txtick=data,\n" 48 | "\tnodes near coords,\n" 49 | "\tnodes near coords align={vertical},\n" 50 | "\tevery node near coord/.append style={font=\\tiny},\n" 51 | "\t]\n" % ",".join(abbrev) 52 | ) 53 | for prop in ORDERED_PROP: 54 | line = "\\addplot coordinates {" 55 | for detector in ORDERED_DETECTORS: 56 | line += "(%s,%.16f) " % ( 57 | clean_detector_name(detector), 58 | results[prop][detector], 59 | ) 60 | line += "};\n" 61 | 62 | tex += line 63 | 64 | tex += "\\legend{%s}\n" % ",".join(ORDERED_PROP) 65 | tex += "\\end{axis}\n" "\\end{tikzpicture}\n" "\\end{document}" 66 | 67 | tex_file = os.path.splitext(output_file)[0] + ".tex" 68 | with open(tex_file, "w") as fid: 69 | fid.write(tex) 70 | 71 | build_latex_doc(tex, output_name=output_file) 72 | 73 | 74 | def parse_args(): 75 | parser = argparse.ArgumentParser() 76 | parser.add_argument( 77 | "type", 78 | choices=["all", "human", "normal"], 79 | help="Subset of data to generate plot for", 80 | default="all", 81 | ) 82 | parser.add_argument( 83 | "-o", dest="output", help="Output pdf file to write to", required=True 84 | ) 85 | parser.add_argument( 86 | "-s", 87 | dest="summary", 88 | help="Summary file with the results", 89 | required=True, 90 | ) 91 | 92 | return parser.parse_args() 93 | 94 | 95 | def main(): 96 | args = parse_args() 97 | 98 | with open(args.summary, "r") as fid: 99 | data = json.load(fid) 100 | 101 | key = "detection_accuracy_" + args.type 102 | if not key in data: 103 | raise ValueError("Can't find key %s in file %s" % (key, args.summary)) 104 | 105 | create_prop_graph(data[key], args.output) 106 | -------------------------------------------------------------------------------- /scripts/analysis/table_parse_result.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Generate a table with percentages along a "no result"/"incorrect 6 | result"/"correct result" split. 
7 | 8 | | Sniff | Suit | Hypo | Our (full) | 9 | ---------------------------------------------- 10 | No Result | | | | | 11 | Incorrect | | | | | 12 | Correct | | | | | 13 | 14 | 15 | Author: Gertjan van den Burg 16 | Date: 2019-04-02 17 | 18 | """ 19 | 20 | import argparse 21 | import json 22 | 23 | from .core import ( 24 | ORDERED_DETECTORS, 25 | TABLE_SPEC, 26 | clean_detector_name, 27 | check_detectors, 28 | ) 29 | from .latex import build_latex_table 30 | 31 | 32 | def create_table(results, output_file): 33 | row_no_result = ["No Result"] 34 | row_incorrect = ["Incorrect"] 35 | row_correct = ["Correct"] 36 | 37 | check_detectors(results["no_result_all"].keys()) 38 | check_detectors(results["incorrect_result_all"].keys()) 39 | check_detectors(results["correct_result_all"].keys()) 40 | 41 | for key in ORDERED_DETECTORS: 42 | row_no_result.append(results["no_result_all"][key] * 100.0) 43 | row_incorrect.append(results["incorrect_result_all"][key] * 100.0) 44 | row_correct.append(results["correct_result_all"][key] * 100.0) 45 | 46 | # check that the values add up to 100% (minus precision errors) 47 | diff = abs( 48 | sum((r[-1] for r in [row_no_result, row_incorrect, row_correct])) 49 | - 100.0 50 | ) 51 | if not diff < 1e-13: 52 | raise AssertionError("Difference is larger than eps: %r" % diff) 53 | 54 | headers = [""] + list(map(clean_detector_name, ORDERED_DETECTORS)) 55 | 56 | table = [row_no_result, row_incorrect, row_correct] 57 | with open(output_file, "w") as fid: 58 | fid.write( 59 | build_latex_table( 60 | table, 61 | headers, 62 | floatfmt=".2f", 63 | bests=[min, min, max], 64 | table_spec=TABLE_SPEC, 65 | ) 66 | ) 67 | 68 | 69 | def parse_args(): 70 | parser = argparse.ArgumentParser("Create parsing result table") 71 | parser.add_argument( 72 | "-o", dest="output", help="Output tex file to write to", required=True 73 | ) 74 | parser.add_argument( 75 | "-s", 76 | dest="summary", 77 | help="Summary file with the results", 78 | required=True, 79 | ) 80 | 81 | return parser.parse_args() 82 | 83 | 84 | def main(): 85 | args = parse_args() 86 | with open(args.summary, "r") as fid: 87 | data = json.load(fid) 88 | 89 | needed_keys = [ 90 | "no_result_all", 91 | "incorrect_result_all", 92 | "correct_result_all", 93 | ] 94 | for key in needed_keys: 95 | if not key in data: 96 | raise ValueError( 97 | "Required key '%s' not present in summary file." % key 98 | ) 99 | 100 | create_table(data, args.output) 101 | 102 | 103 | if __name__ == "__main__": 104 | main() 105 | -------------------------------------------------------------------------------- /scripts/analysis/core.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Common definitions for the analysis scripts 6 | 7 | Author: Gertjan van den Burg 8 | Copyright (c) 2018 - The Alan Turing Institute 9 | License: See the LICENSE file. 
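Detector results are stored as JSON lines, one record per file, roughly of the form (values are illustrative):

    {"detector": "sniffer", "filename": "/abs/path/file.csv", "hostname": "somehost",
     "runtime": 0.12, "status": "OK",
     "dialect": {"delimiter": ",", "quotechar": "", "escapechar": ""}}

Records with a non-OK status omit the ``dialect`` field and may carry a ``status_msg`` instead; see ``common.detector_result`` for the full definition.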
10 | 11 | """ 12 | 13 | import json 14 | import os 15 | 16 | from common.detector_result import DetectorResult 17 | 18 | DETECTOR_NAMES = { 19 | "hypoparsr": "HypoParsr", 20 | "sniffer": "Sniffer", 21 | "suitability": "Suitability", 22 | "our_score_pattern_only": "Pattern", 23 | "our_score_type_only": "Type", 24 | "our_score_full_no_tie": "No Tie", 25 | "our_score_full": "Full", 26 | } 27 | 28 | ORDERED_DETECTORS = [ 29 | "hypoparsr", 30 | "sniffer", 31 | "suitability", 32 | "our_score_pattern_only", 33 | "our_score_type_only", 34 | "our_score_full_no_tie", 35 | "our_score_full", 36 | ] 37 | TABLE_SPEC = "lrrr|rrrr" 38 | 39 | ORDERED_PROP = ["delimiter", "quotechar", "escapechar", "overall"] 40 | 41 | CORPUS_NAMES = {"github": "GitHub", "ukdata": "UKdata"} 42 | 43 | 44 | def check_detectors(names): 45 | if not set(ORDERED_DETECTORS) == set(names): 46 | print( 47 | "Detector set doesn't match!\nExpected: %r\nReceived: %r\n" 48 | % (sorted(set(ORDERED_DETECTORS)), sorted(set(names))) 49 | ) 50 | raise SystemExit(1) 51 | 52 | 53 | def clean_detector_name(detector): 54 | abbr = DETECTOR_NAMES.get(detector, detector) 55 | return abbr.replace("_", "\\_") 56 | 57 | 58 | def load_detector_results(result_file): 59 | """ 60 | Load the results from a given detector result file. Verify each record in 61 | the process. 62 | """ 63 | detector_names = set() 64 | results = {} 65 | with open(result_file, "r") as fid: 66 | for idx, line in enumerate(fid.readlines()): 67 | try: 68 | record = DetectorResult.from_json(line.strip()) 69 | except json.JSONDecodeError: 70 | print( 71 | "\nError parsing the following record in file (line %i): " 72 | "%s\n---\n%s" % (idx + 1, result_file, line.strip()) 73 | ) 74 | raise SystemExit(1) 75 | 76 | detector_names.add(record.detector) 77 | 78 | fname = record.filename 79 | if not os.path.isabs(fname): 80 | fname = os.path.abspath(fname) 81 | record.filename = fname 82 | if fname in results: 83 | raise ValueError( 84 | "Duplicate result for file %s in detector file %s" 85 | % (record.filename, result_file) 86 | ) 87 | 88 | record.validate() 89 | results[fname] = record 90 | 91 | if len(detector_names) > 1: 92 | raise ValueError( 93 | "More than one detector name in file: %s" % result_file 94 | ) 95 | detector = detector_names.pop() 96 | return detector, results 97 | 98 | 99 | def is_standard_dialect(dialect): 100 | if ( 101 | dialect.delimiter == "," 102 | and dialect.quotechar in ["", '"'] 103 | and dialect.escapechar == "" 104 | ): 105 | return True 106 | return False 107 | -------------------------------------------------------------------------------- /scripts/analysis/figure_fail.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Convert summary data to a bar plot. 
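More specifically, this script plots the failure rate of every detector for each corpus as a grouped bar chart, reading one summary JSON per corpus (each summary carries a ``corpus`` name and a ``failures`` mapping). An illustrative invocation (file names made up):

    python -m analysis.figure_fail -o failure.pdf -s summary_github.json summary_ukdata.json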
6 | 7 | Author: Gertjan van den Burg 8 | 9 | """ 10 | 11 | import json 12 | import argparse 13 | import os 14 | 15 | from .core import ( 16 | CORPUS_NAMES, 17 | DETECTOR_NAMES, 18 | ORDERED_DETECTORS, 19 | check_detectors, 20 | ) 21 | from .latex import build_latex_doc 22 | 23 | BAR_PATTERNS = [ 24 | "north east lines", 25 | "none", 26 | "north west lines", 27 | "horizontal lines", 28 | "vertical lines", 29 | "grid", 30 | "crosshatch", 31 | ] 32 | 33 | 34 | def clean_name(detector): 35 | abbr = DETECTOR_NAMES.get(detector, detector) 36 | return abbr.replace("_", "\\_") 37 | 38 | 39 | def create_fail_graph(results, output_file): 40 | fail_data = {corpus: results[corpus]["failures"] for corpus in results} 41 | for corpus in fail_data: 42 | check_detectors(fail_data[corpus].keys()) 43 | 44 | abbrev = [clean_name(d) for d in ORDERED_DETECTORS] 45 | tex = ( 46 | "\\documentclass[preview=true]{standalone}\n" 47 | "\\pdfinfoomitdate=1\n" 48 | "\\pdftrailerid{}\n" 49 | "\\pdfsuppressptexinfo=1\n" 50 | "\\usepackage{tikz}\n" 51 | "\\usepackage{pgfplots}\n" 52 | "\\usetikzlibrary{patterns}\n" 53 | "\\pgfplotsset{compat=1.16}\n" 54 | "\\begin{document}\n" 55 | "\\begin{tikzpicture}\n" 56 | "\\begin{axis}[\n" 57 | "\tybar,\n" 58 | "\twidth={600},\n" 59 | "\theight={200},\n" 60 | "\tymin=0,\n" 61 | "\tlegend pos={north east},\n" 62 | "\tylabel={Failure (\\%%)},\n" 63 | "\tsymbolic x coords={%s},\n" 64 | "\txtick=data,\n" 65 | "\tnodes near coords,\n" 66 | "\tevery node near coord/.append style={font=\\tiny, /pgf/number format/fixed},\n" 67 | "\tnodes near coords align={vertical},\n" 68 | "\t]\n" % ",".join(abbrev) 69 | ) 70 | 71 | corpora = sorted(fail_data.keys()) 72 | 73 | for pattern, corpus in zip(BAR_PATTERNS, corpora): 74 | line = "\\addplot[postaction={pattern=%s}] coordinates {" % pattern 75 | for detector in ORDERED_DETECTORS: 76 | line += "(%s,%.16f) " % ( 77 | clean_name(detector), 78 | fail_data[corpus][detector] * 100.0, 79 | ) 80 | line += "};\n" 81 | tex += line 82 | 83 | tex += "\\legend{%s}\n" % ", ".join([CORPUS_NAMES.get(c) for c in corpora]) 84 | 85 | tex += "\\end{axis}\n" "\\end{tikzpicture}\n" "\\end{document}" 86 | 87 | tex_file = os.path.splitext(output_file)[0] + ".tex" 88 | with open(tex_file, "w") as fid: 89 | fid.write(tex) 90 | 91 | build_latex_doc(tex, output_name=output_file) 92 | 93 | 94 | def parse_args(): 95 | parser = argparse.ArgumentParser() 96 | parser.add_argument( 97 | "-o", dest="output", help="Output pdf file to write to", required=True 98 | ) 99 | parser.add_argument( 100 | "-s", 101 | dest="summaries", 102 | help="Summary file with the results", 103 | required=True, 104 | nargs="+", 105 | ) 106 | 107 | return parser.parse_args() 108 | 109 | 110 | def main(): 111 | args = parse_args() 112 | 113 | all_data = {} 114 | for summary_file in args.summaries: 115 | with open(summary_file, "r") as fid: 116 | data = json.load(fid) 117 | all_data[data["corpus"]] = data 118 | 119 | create_fail_graph(all_data, args.output) 120 | 121 | -------------------------------------------------------------------------------- /scripts/data_download.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Downloader for the experimental data. 6 | 7 | Author: Gertjan van den Burg 8 | Copyright (c) 2018 - The Alan Turing Institute 9 | License: See the LICENSE file. 
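The input is a JSON-lines file in which every record lists candidate URLs for one file together with its expected MD5 checksum, roughly (values are illustrative):

    {"md5": "0123456789abcdef0123456789abcdef", "urls": ["https://example.com/data.csv"]}

Files are saved as ``<md5>.csv`` in the output directory; files that are already present, or whose checksum no longer matches, are skipped.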
10 | Date: 2018-11-26 11 | 12 | """ 13 | 14 | import argparse 15 | import hashlib 16 | import json 17 | import os 18 | import random 19 | import requests 20 | import shutil 21 | import sys 22 | import tempfile 23 | import time 24 | 25 | 26 | def md5sum(filename): 27 | blocksize = 65536 28 | hasher = hashlib.md5() 29 | with open(filename, "rb") as fid: 30 | buf = fid.read(blocksize) 31 | while len(buf) > 0: 32 | hasher.update(buf) 33 | buf = fid.read(blocksize) 34 | return hasher.hexdigest() 35 | 36 | 37 | def download_url(urls, md5old, output_dir): 38 | response = None 39 | for url in urls: 40 | # TODO: Catch error when URL no longer exists 41 | try: 42 | response = requests.get(url) 43 | break 44 | except requests.exceptions.ConnectionError: 45 | print( 46 | "Connection error occurred trying to get url: %s" % url, 47 | file=sys.stderr, 48 | ) 49 | continue 50 | except requests.exceptions.ChunkedEncodingError: 51 | print("Connection error occurred trying to get url: %s" % url, 52 | file=sys.stderr) 53 | continue 54 | if response is None or response.status_code != 200: 55 | return None 56 | 57 | tmpfd, tmpfname = tempfile.mkstemp() 58 | tmpfid = os.fdopen(tmpfd, "wb") 59 | tmpfid.write(response.content) 60 | tmpfid.close() 61 | 62 | md5new = md5sum(tmpfname) 63 | if not md5new == md5old: 64 | print( 65 | "Checksum mismatch for URL '%s'. Skipping this file." % url, 66 | file=sys.stderr, 67 | ) 68 | os.unlink(tmpfname) 69 | return None 70 | target = os.path.join(output_dir, md5new + ".csv") 71 | shutil.move(tmpfname, target) 72 | return target 73 | 74 | 75 | def parse_args(): 76 | parser = argparse.ArgumentParser("Data Downloader") 77 | parser.add_argument( 78 | "-i", 79 | "--input", 80 | help="JSONlines file with urls and hashes", 81 | required=True, 82 | ) 83 | parser.add_argument( 84 | "-o", "--output", help="output directory", required=True 85 | ) 86 | return parser.parse_args() 87 | 88 | 89 | def main(): 90 | args = parse_args() 91 | 92 | # load the input file 93 | url_and_hash = [] 94 | with open(args.input, "r") as fid: 95 | for line in fid: 96 | obj = json.loads(line.strip()) 97 | url_and_hash.append(obj) 98 | 99 | # Remove files that already exist 100 | have_obj = [] 101 | have_files = os.listdir(args.output) 102 | for f in have_files: 103 | h = os.path.splitext(f)[0] 104 | obj = next((x for x in url_and_hash if x["md5"] == h), None) 105 | if obj is None: 106 | # ignore files not in our list 107 | continue 108 | have_obj.append(obj) 109 | for obj in have_obj: 110 | url_and_hash.remove(obj) 111 | 112 | # start the download 113 | for obj in url_and_hash: 114 | target = download_url(obj["urls"], obj["md5"], args.output) 115 | if target is None: 116 | continue 117 | print("Downloaded file '%s'" % target) 118 | time.sleep(random.random()) 119 | 120 | 121 | if __name__ == "__main__": 122 | main() 123 | -------------------------------------------------------------------------------- /scripts/detection/sniffer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | This detector uses the Python CSV Sniffer to detect the dialect. 6 | 7 | A timeout is needed on the Sniffer because the regular expression for detecting 8 | double quotes can run into catastrophic backtracking if a CSV file has many 9 | empty lines at the end that only contain delimiters (i.e. ",,,,,,,,,," lines). 
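The timeout is enforced by running the sniffer in a separate process and terminating it after ``TIMEOUT`` seconds (see ``run_with_timeout`` below); files that hit the limit are recorded as failures with a TIMEOUT status message rather than aborting the whole run.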
10 | 11 | Author: Gertjan van den Burg 12 | Copyright (c) 2018 - The Alan Turing Institute 13 | License: See the LICENSE file. 14 | 15 | """ 16 | 17 | import csv 18 | 19 | from multiprocessing import Process, Manager 20 | 21 | from .core import run 22 | 23 | from common.encoding import get_encoding 24 | from common.load import load_file 25 | from common.detector_result import DetectorResult, Dialect, Status, StatusMsg 26 | 27 | DETECTOR = "sniffer" 28 | TIMEOUT = 120 29 | 30 | 31 | def worker(args, return_dict, **kwargs): 32 | res = determine_dqr(*args, **kwargs) 33 | return_dict["output"] = res 34 | 35 | 36 | def run_with_timeout(args, kwargs, limit): 37 | # See: https://stackoverflow.com/a/26664130/1154005 38 | # and: https://stackoverflow.com/a/10415215/1154005 39 | 40 | manager = Manager() 41 | return_dict = manager.dict() 42 | 43 | p = Process(target=worker, args=(args, return_dict), kwargs=kwargs) 44 | p.start() 45 | p.join(limit) 46 | if p.is_alive(): 47 | p.terminate() 48 | return None 49 | if "output" in return_dict: 50 | return return_dict["output"] 51 | return None 52 | 53 | 54 | def sniff(sample, delimiters=None): 55 | """ 56 | This function mimics the Sniffer.sniff() method from the Python CSV 57 | function, with one exception: it doesn't change the detected quotechar to 58 | default to '"'. We do this because we want to know the detected quote 59 | character. 60 | 61 | """ 62 | sniffer = csv.Sniffer() 63 | 64 | quotechar, doublequote, delimiter, skipinitialspace = sniffer._guess_quote_and_delimiter( 65 | sample, delimiters 66 | ) 67 | 68 | if not delimiter: 69 | delimiter, skipinitialspace = sniffer._guess_delimiter( 70 | sample, delimiters 71 | ) 72 | if not delimiter: 73 | raise csv.Error("Could not determine delimiter") 74 | 75 | class dialect(csv.Dialect): 76 | _name = "sniffed" 77 | lineterminator = "\r\n" # unused 78 | quoting = csv.QUOTE_MINIMAL 79 | 80 | dialect.doublequote = doublequote 81 | dialect.delimiter = delimiter 82 | dialect.quotechar = quotechar # See above 83 | dialect.skipinitialspace = skipinitialspace 84 | dialect.escapechar = '' if dialect.escapechar is None else dialect.escapechar 85 | 86 | return dialect 87 | 88 | 89 | def determine_dqr(filename, verbose=False): 90 | """ Run the python CSV Sniffer """ 91 | encoding = get_encoding(filename) 92 | data = load_file(filename, encoding=encoding) 93 | if data is None: 94 | return DetectorResult( 95 | status=Status.SKIP, status_msg=StatusMsg.UNREADABLE 96 | ) 97 | 98 | try: 99 | dialect = sniff(data) 100 | except csv.Error: 101 | return DetectorResult( 102 | status=Status.FAIL, status_msg=StatusMsg.NO_RESULTS 103 | ) 104 | 105 | config = { 106 | "delimiter": dialect.delimiter, 107 | "quotechar": dialect.quotechar, 108 | "escapechar": dialect.escapechar, 109 | } 110 | res = DetectorResult(dialect=Dialect.from_dict(config), status=Status.OK) 111 | 112 | return res 113 | 114 | 115 | def wrap_determine_dqr(filename, verbose=False): 116 | res = run_with_timeout((filename,), {"verbose": verbose}, TIMEOUT) 117 | if res is None: 118 | return DetectorResult(status=Status.FAIL, status_msg=StatusMsg.TIMEOUT) 119 | return res 120 | 121 | 122 | def main(): 123 | run(determine_dqr=wrap_determine_dqr, detector=DETECTOR) 124 | -------------------------------------------------------------------------------- /scripts/common/detector_result.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Definitions for a DetectorResult 
object. 6 | 7 | Author: Gertjan van den Burg 8 | Copyright (c) 2018 - The Alan Turing Institute 9 | License: See the LICENSE file. 10 | 11 | """ 12 | 13 | 14 | import enum 15 | import json 16 | import socket 17 | import sys 18 | 19 | from .dialect import Dialect 20 | 21 | 22 | class Status(enum.Enum): 23 | UNKNOWN = 0 24 | OK = 1 25 | FAIL = 2 26 | SKIP = 3 27 | 28 | 29 | class StatusMsg(enum.Enum): 30 | UNKNOWN = 0 31 | MULTIPLE_ANSWERS = 1 32 | NO_RESULTS = 2 33 | TIMEOUT = 3 34 | UNREADABLE = 4 35 | NON_EXISTENT = 5 36 | NO_DIALECTS = 6 37 | HUMAN_SKIP = 7 38 | AMBIGUOUS_QUOTECHAR = 8 39 | 40 | 41 | class DetectorResult(object): 42 | def __init__( 43 | self, 44 | detector=None, 45 | dialect=None, 46 | filename=None, 47 | hostname=None, 48 | runtime=None, 49 | status=None, 50 | status_msg=None, 51 | original_detector=None, 52 | note=None 53 | ): 54 | self.detector = detector 55 | self.dialect = dialect 56 | self.filename = filename 57 | self.hostname = hostname or socket.gethostname() 58 | self.runtime = runtime 59 | self.status = status 60 | self.status_msg = status_msg 61 | self.original_detector = original_detector or detector 62 | self.note = note 63 | 64 | def validate(self): 65 | assert isinstance(self.status, Status) 66 | if not self.status_msg is None: 67 | assert isinstance(self.status_msg, StatusMsg) 68 | assert not self.detector is None 69 | assert not self.hostname is None 70 | assert not self.filename is None 71 | if self.status == Status.OK: 72 | assert not self.dialect is None 73 | assert isinstance(self.dialect, Dialect) 74 | try: 75 | self.dialect.validate() 76 | except ValueError: 77 | print("Dialect validation error for: %r" % self) 78 | raise 79 | else: 80 | assert self.dialect is None 81 | 82 | def to_json(self): 83 | self.validate() 84 | output = { 85 | "detector": self.detector, 86 | "filename": self.filename, 87 | "hostname": self.hostname, 88 | "runtime": self.runtime, 89 | "status": self.status.name, 90 | } 91 | if not self.dialect is None: 92 | output["dialect"] = self.dialect.to_dict() 93 | if not self.status_msg is None: 94 | output["status_msg"] = self.status_msg.name 95 | if not self.note is None: 96 | output['note'] = self.note 97 | if not self.detector == self.original_detector: 98 | output["original_detector"] = self.original_detector 99 | as_json = json.dumps(output) 100 | return as_json 101 | 102 | @classmethod 103 | def from_json(cls, line): 104 | """ load from a json line """ 105 | d = json.loads(line) 106 | try: 107 | d["dialect"] = ( 108 | Dialect.from_dict(d["dialect"]) if "dialect" in d else None 109 | ) 110 | except: 111 | print("Error occurred parsing dialect from line: %s" % line, 112 | file=sys.stderr) 113 | raise 114 | d["status"] = Status[d["status"]] 115 | d["status_msg"] = ( 116 | StatusMsg[d["status_msg"]] if "status_msg" in d else None 117 | ) 118 | dr = cls(**d) 119 | dr.validate() 120 | return dr 121 | 122 | def __repr__(self): 123 | s = ( 124 | "DetectorResult(detector=%r, dialect=%r, runtime=%r, status=%r, status_msg=%r)" 125 | % ( 126 | self.detector, 127 | self.dialect, 128 | self.runtime, 129 | self.status.value, 130 | self.status_msg, 131 | ) 132 | ) 133 | return s 134 | -------------------------------------------------------------------------------- /scripts/analysis/latex.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Code for compiling latex from Python. 
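It provides ``build_latex_doc``, which compiles a standalone document to PDF with latexmk in a temporary directory, and ``build_latex_table``, which generates ``tabular`` code with optional best-value highlighting.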
6 | 7 | Based on: https://github.com/GjjvdBurg/labella.py 8 | 9 | Author: Gertjan van den Burg 10 | Copyright (c) 2018 - The Alan Turing Institute 11 | License: See the LICENSE file. 12 | 13 | """ 14 | 15 | import os 16 | import shutil 17 | import subprocess 18 | import tabulate 19 | import tempfile 20 | 21 | 22 | def compile_latex(fname, tmpdirname, silent=True): 23 | compiler = "latexmk" 24 | compiler_args = [ 25 | "--pdf", 26 | "--outdir=" + tmpdirname, 27 | "--interaction=nonstopmode", 28 | fname, 29 | ] 30 | command = [compiler] + compiler_args 31 | try: 32 | output = subprocess.check_output(command, stderr=subprocess.STDOUT) 33 | except (OSError, IOError) as e: 34 | raise (e) 35 | except subprocess.CalledProcessError as e: 36 | print(e.output.decode()) 37 | raise (e) 38 | else: 39 | if not silent: 40 | print(output.decode()) 41 | 42 | 43 | def build_latex_doc(tex, output_name=None, silent=True): 44 | with tempfile.TemporaryDirectory() as tmpdirname: 45 | basename = "labella_text" 46 | fname = os.path.join(tmpdirname, basename + ".tex") 47 | with open(fname, "w") as fid: 48 | fid.write(tex) 49 | 50 | compile_latex(fname, tmpdirname, silent=silent) 51 | 52 | pdfname = os.path.join(tmpdirname, basename + ".pdf") 53 | if output_name: 54 | shutil.copy2(pdfname, output_name) 55 | 56 | 57 | def build_latex_table( 58 | table, headers, floatfmt="g", missingval="", bests="default", 59 | table_spec=None 60 | ): 61 | """Construct the LaTeX code for a table 62 | 63 | This function creates the LaTeX code for a data table while taking number 64 | formatting, headers, missing values, and "best value formatting" into 65 | account. 66 | 67 | The numbers in the table are formatted following the provided float format 68 | and the missing value indicator using the ``_format`` function from the 69 | ``tabulate`` package. To indicate a missing value the data row should mark 70 | this value as ``None``. 71 | 72 | The ``bests`` parameter is used to decide how to highlight the best values 73 | in each row. It can be either ``'default'``, ``None``, a list of length 1 74 | where the element is either ``min`` or ``max``, or a list of length ``K`` 75 | with similar elements where ``K`` is the length of the data table. If it is 76 | ``'default'`` then ``max`` will be considered best for each row. If a list 77 | of length 1 is supplied then the provided function will be used for each 78 | row. If ``None``, no highlighting will be done. 79 | 80 | The ``table_spec`` parameter allows the user to specify the table 81 | specification. This value is not checked. If it is None, the first column 82 | will get 'l' spec and the remaining columns will get the 'r' spec. 
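For illustration (values made up), a call like

        build_latex_table(
            [["Std", 95.2, 97.1]],
            ["", "Sniffer", "Full"],
            floatfmt=".2f",
            table_spec="lrr",
        )

    returns a ``tabular`` environment with a single data row in which the best
    value (97.10, since ``max`` is the default) is wrapped in ``\textbf{}``.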
83 | 84 | """ 85 | if bests == "default": 86 | bests = [max] 87 | elif bests is None: 88 | bests = [] 89 | 90 | if len(bests) > 1: 91 | assert len(bests) == len(table) 92 | assert all((x in [min, max] for x in bests)) 93 | 94 | if len(bests) == 0: 95 | best_funcs = [None for x in range(len(table))] 96 | elif len(bests) == 1: 97 | best_funcs = [bests[0] for x in range(len(table))] 98 | else: 99 | best_funcs = bests[:] 100 | 101 | list_of_lists, headers = table, headers 102 | cols = list(zip(*list_of_lists)) 103 | coltypes = list(map(tabulate._column_type, cols)) 104 | cols = [ 105 | [tabulate._format(v, ct, floatfmt, missingval) for v in c] 106 | for c, ct in zip(cols, coltypes) 107 | ] 108 | n_cols = len(cols) 109 | 110 | data_rows = table 111 | text_rows = list(zip(*cols)) 112 | 113 | text = [] 114 | if table_spec is None: 115 | text.append("\\begin{tabular}{l%s}" % ("r" * n_cols)) 116 | else: 117 | text.append("\\begin{tabular}{%s}" % table_spec) 118 | text.append(" & ".join(headers) + "\\\\") 119 | text.append("\\hline") 120 | for data_row, text_row, best_func in zip(data_rows, text_rows, best_funcs): 121 | text_row = list(text_row) 122 | if not best_func is None: 123 | best_val = best_func([x for x in data_row if isinstance(x, float)]) 124 | best_idx = [i for i, v in enumerate(data_row) if v == best_val] 125 | for idx in best_idx: 126 | text_row[idx] = "\\textbf{" + text_row[idx] + "}" 127 | text.append(" & ".join(text_row) + "\\\\") 128 | text.append("\\hline") 129 | text.append("\\end{tabular}") 130 | 131 | return "\n".join(text) 132 | -------------------------------------------------------------------------------- /scripts/detection/core.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Common functions for the Python code of the experiment. 6 | 7 | Author: Gertjan van den Burg 8 | Copyright (c) 2018 - The Alan Turing Institute 9 | License: See the LICENSE file. 10 | """ 11 | 12 | import os 13 | import json 14 | import time 15 | import argparse 16 | import codecs 17 | import unicodedata 18 | 19 | from tqdm import tqdm 20 | 21 | from common.detector_result import DetectorResult, Status, StatusMsg 22 | 23 | 24 | def can_be_delim_unicode(char, encoding=None): 25 | as_unicode = codecs.decode(bytes(char, encoding), encoding=encoding) 26 | ctr = unicodedata.category(as_unicode) 27 | if ctr in ["Lu", "Ll", "Lt", "Lm", "Lo"]: 28 | return False 29 | elif ctr in ["Nd", "Nl", "No"]: # number 30 | return False 31 | elif ctr in ["Po", "Pd", "Pc"]: # punctuation 32 | return True 33 | elif ctr in ["Ps", "Pe"]: # open and close brackets (maybe include?) 34 | return False 35 | elif ctr == "Zs": # space 36 | return True 37 | elif ctr == "Sm": # math symbols 38 | return True 39 | elif ctr == "Cc": # other control (i.e. tab etc.) 40 | if as_unicode == "\t": 41 | return True 42 | return False 43 | elif ctr == "Co": # private use (maybe used for NA?) 44 | # NOTE: This is tricky, we may slow our algorithm down a lot by 45 | # including all these code points as potential delimiters, but we 46 | # may also find the delimiter here. 47 | # Let's see if we _ever_ find a file that uses a private use 48 | # codepoint as a delimiter. 
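# (So private-use characters are rejected for now. For reference, common
# delimiters such as ',' or ';' fall under Po and '\t' under Cc, and are
# therefore accepted by the branches above, while letters and digits are
# rejected.)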
49 | return False 50 | return True 51 | 52 | 53 | 54 | def get_potential_quotechars(data): 55 | quotechars = set([""]) 56 | if "'" in data: 57 | quotechars.add("'") 58 | if '"' in data: 59 | quotechars.add('"') 60 | if "~" in data: 61 | quotechars.add("~") 62 | return quotechars 63 | 64 | 65 | def dump_result(output_file, res): 66 | with open(output_file, "a") as fid: 67 | fid.write(res.to_json() + "\n") 68 | 69 | 70 | def load_previous(output_file): 71 | previous = set() 72 | if not os.path.exists(output_file): 73 | return previous 74 | with open(output_file, "r") as fid: 75 | for line in fid.readlines(): 76 | record = json.loads(line.strip()) 77 | previous.add(record["filename"]) 78 | return previous 79 | 80 | 81 | def main( 82 | path_file, 83 | output_file, 84 | determine_dqr=None, 85 | detector=None, 86 | verbose=False, 87 | progress=False, 88 | ): 89 | with open(path_file, "r") as fid: 90 | files = [l.strip() for l in fid.readlines()] 91 | files.sort() 92 | 93 | previous = load_previous(output_file) 94 | 95 | for filename in tqdm(files, disable=not progress, desc=detector): 96 | if filename in previous: 97 | continue 98 | 99 | if not os.path.exists(filename): 100 | res = DetectorResult( 101 | detector=detector, 102 | dialect=None, 103 | filename=filename, 104 | runtime=None, 105 | status=Status.FAIL, 106 | status_msg=StatusMsg.NON_EXISTENT, 107 | ) 108 | dump_result(output_file, res) 109 | continue 110 | 111 | if not progress: 112 | print("[%s] Analyzing file: %s" % (detector, filename)) 113 | 114 | start_time = time.time() 115 | try: 116 | res = determine_dqr(filename, verbose=verbose) 117 | except KeyboardInterrupt: 118 | raise 119 | except: 120 | print("Uncaught exception occured parsing file: %s" % filename) 121 | raise 122 | 123 | res.runtime = time.time() - start_time 124 | res.filename = filename 125 | res.detector = detector 126 | dump_result(output_file, res) 127 | 128 | 129 | def parse_args(): 130 | parser = argparse.ArgumentParser() 131 | parser.add_argument("-v", "--verbose", dest="verbose", action="store_true") 132 | parser.add_argument( 133 | "-p", "--progress", dest="progress", action="store_true" 134 | ) 135 | parser.add_argument( 136 | "input_file", 137 | help="Input file can be a file of paths to CSV file, or the path of a single CSV file. 
If the former, output_file must be set", 138 | ) 139 | parser.add_argument( 140 | "output_file", 141 | help="Output file (JSON) to write the results to", 142 | default=None, 143 | nargs="?", 144 | ) 145 | return parser.parse_args() 146 | 147 | 148 | def run(determine_dqr, detector): 149 | args = parse_args() 150 | if args.output_file is None: 151 | print(determine_dqr(args.input_file, verbose=args.verbose)) 152 | else: 153 | main( 154 | args.input_file, 155 | args.output_file, 156 | determine_dqr=determine_dqr, 157 | detector=detector, 158 | verbose=args.verbose, 159 | progress=args.progress, 160 | ) 161 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CSV Wrangling 2 | 3 | [![Build Status](https://travis-ci.org/alan-turing-institute/CSV_Wrangling.svg?branch=master)](https://travis-ci.org/alan-turing-institute/CSV_Wrangling) 4 | [![DOI](https://zenodo.org/badge/158363564.svg)](https://zenodo.org/badge/latestdoi/158363564) 5 | 6 | This is the repository for reproducing the experiments in the paper: 7 | 8 | [**Wrangling Messy CSV files by Detecting Row and Type 9 | Patterns**](https://rdcu.be/bLVur) 10 | [(PDF)](https://gertjanvandenburg.com/papers/VandenBurg_Nazabal_Sutton_-_Wrangling_Messy_CSV_Files_by_Detecting_Row_and_Type_Patterns_2019.pdf) 11 | 12 | by [G.J.J. van den Burg](https://gertjanvandenburg.com), [A. 13 | Nazabal](https://scholar.google.co.uk/citations?user=IanHvT4AAAAJ&hl=en&oi=ao) 14 | and [C. Sutton](https://homepages.inf.ed.ac.uk/csutton/). 15 | 16 | For an implementation of the method developed in the paper, see the 17 | [CleverCSV](https://github.com/alan-turing-institute/CleverCSV) repository. 18 | 19 | If you use this paper or this code in your own work, please ***cite the 20 | paper*** using for instance the following BibTeX citation: 21 | 22 | ```bibtex 23 | @article{van2019wrangling, 24 | title = {Wrangling Messy {CSV} Files by Detecting Row and Type Patterns}, 25 | author = {{van den Burg}, G. J. J. and Naz{\'a}bal, A. and Sutton, C.}, 26 | journal = {Data Mining and Knowledge Discovery}, 27 | year = {2019}, 28 | volume = {33}, 29 | number = {6}, 30 | pages = {1799--1820}, 31 | issn = {1573-756X}, 32 | doi = {10.1007/s10618-019-00646-y}, 33 | } 34 | ``` 35 | 36 | ## Introduction 37 | 38 | Our experiments are made reproducible through the use of [GNU 39 | Make](https://www.gnu.org/software/make/). You can either set up your local 40 | environment with the necessary dependencies as described under 41 | [Requirements](#requirements), or use the Dockerfile included in the 42 | repository. 43 | 44 | There are two ways to reproduce our results. The first only reproduces the 45 | figures, tables, and constants in the paper from the raw detection results, 46 | while the second runs the detection methods as well. 47 | 48 | 1. You can reproduce the figures, tables, and constants from the raw 49 | experimental results included in this repository. This will not re-run all 50 | the experiments but will regenerate the output used in the paper. The 51 | command for this is: 52 | 53 | ```bash 54 | $ make output 55 | ``` 56 | 57 | 2. You can fully reproduce our experiments by downloading the data and 58 | rerunning the detection methods on all the files. This might take a while 59 | depending on the speed of your machine and the number of cores available. 60 | Total wall-clock computation time for a single core is estimated at 11 61 | days. 
The following commands will do all of this. 62 | 63 | ```bash 64 | $ make clean # remove existing output files, except human annotated 65 | $ make data # download the data 66 | $ make results # run all the detectors and generate the result files 67 | ``` 68 | 69 | If you'd like to use multiple cores, you can replace the last command with: 70 | 71 | ```bash 72 | $ make -j X results 73 | ``` 74 | 75 | where ``X`` is the desired number of cores. 76 | 77 | 78 | ## Data 79 | 80 | There are two datasets that are used in the experiments. Because we don't own 81 | the rights to all these files, we can't package these files and make them 82 | available in a single download. We can however provide URLs to the files and 83 | add a download script, which is what we do here. The data can be downloaded 84 | with: 85 | 86 | ```bash 87 | $ make data 88 | ``` 89 | 90 | If you wish to change the download location of the data, please edit the 91 | ``DATA_DIR`` variable in the Makefile. 92 | 93 | **Note:** We are aware that some of the files may change or become 94 | unavailable in the future. This is an unfortunate side-effect of using 95 | publicly available data in this way. The data downloader skips files that 96 | are unavailable or that have changed. Note that this may affect the exact 97 | reproducibility of the results. 98 | 99 | The above downloads the "test" set that was used for the evaluation in the 100 | paper. For the "working set" that was used to develop our algorithm, run 101 | ``make dev-data``. 102 | 103 | If the above datasets are insufficient, the complete original data sets are 104 | available on request for research purposes. Contact ``gertjanvandenburg at 105 | gmail dot com``. 106 | 107 | ## Requirements 108 | 109 | Below are the requirements for reproducing the experiments if you're not using 110 | Docker. Note that at the moment only Linux-based systems are supported. macOS 111 | will probably work, but hasn't been tested. 112 | 113 | - Python 3.x with the packages in the ``requirements.txt`` file. These can be 114 | installed with: ``pip install --user -r requirements.txt``. 115 | 116 | - R with the external packages installed through: 117 | ``install.packages(c('devtools', 'rjson', 'data.tree', 'RecordLinkage', 118 | 'readr', 'tibble'))``. 119 | 120 | - A working [LaTeX](https://www.latex-project.org/) installation is needed for 121 | creating the figures (at least ``texlive-latex-extra`` and 122 | ``texlive-pictures``), as well as a working 123 | [LaTeXMK](https://mg.readthedocs.io/latexmk.html) installation. 124 | 125 | 126 | ## Instructions 127 | 128 | To clone this repository and all its submodules do: 129 | 130 | ```bash 131 | $ git clone --recurse-submodules https://github.com/alan-turing-institute/CSV_Wrangling 132 | ``` 133 | 134 | Then install the requirements as listed above and run the ``make`` command of 135 | your choice. 136 | 137 | ## License 138 | 139 | With the exception of the submodule in ``scripts/detection/lib/hypoparsr`` 140 | this code is licensed under the [MIT 141 | license](https://en.wikipedia.org/wiki/MIT_License). See the LICENSE file for 142 | more details. 143 | -------------------------------------------------------------------------------- /scripts/analysis/show_failures.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Print the failure cases to the terminal. 
6 | 7 | Author: Gertjan van den Burg 8 | Copyright (c) 2018 - The Alan Turing Institute 9 | License: See the LICENSE file. 10 | 11 | """ 12 | 13 | import argparse 14 | import itertools 15 | import numpy as np 16 | 17 | import matplotlib.pyplot as plt 18 | from sklearn.metrics import confusion_matrix 19 | 20 | from tabulate import tabulate 21 | 22 | from common.dialect import ATTRIBUTES 23 | from common.detector_result import Status 24 | 25 | from .core import load_detector_results, is_standard_dialect 26 | 27 | 28 | def parse_args(): 29 | parser = argparse.ArgumentParser( 30 | description="Show failure cases for given detector results" 31 | ) 32 | parser.add_argument( 33 | "-r", 34 | dest="reference_file", 35 | help="reference output file with ground truth", 36 | required=True, 37 | ) 38 | parser.add_argument( 39 | "-d", dest="detector_file", help="detector output file", required=True 40 | ) 41 | parser.add_argument( 42 | "-p", 43 | dest="attr_name", 44 | choices=ATTRIBUTES + ["overall"], 45 | help="Attribute to show failure for. If omitted, shows the files for which the detector failed.", 46 | required=False, 47 | default=None, 48 | ) 49 | parser.add_argument( 50 | "-c", 51 | dest="confusion", 52 | help="Plot and print the confusion matrix", 53 | action="store_true", 54 | ) 55 | parser.add_argument( 56 | "-m", 57 | dest="only_messy", 58 | help="Show only failures for messy files", 59 | action="store_true", 60 | ) 61 | 62 | return parser.parse_args() 63 | 64 | 65 | def show_complete_failure( 66 | ref_results, detector, det_results, only_messy=False 67 | ): 68 | print("Detector: %s. Failure cases." % detector) 69 | count = 0 70 | total = 0 71 | for fname in ref_results: 72 | res_ref = ref_results[fname] 73 | if not res_ref.status == Status.OK: 74 | continue 75 | if only_messy and is_standard_dialect(res_ref.dialect): 76 | continue 77 | total += 1 78 | if det_results[fname].status == Status.SKIP: 79 | continue 80 | if det_results[fname].status == Status.FAIL: 81 | print(fname) 82 | count += 1 83 | print( 84 | "Total: %i out of %i (%.2f%%)" % (count, total, (count / total * 100)) 85 | ) 86 | 87 | 88 | def plot_confusion(cm, clean_classes): 89 | plt.imshow(cm, interpolation="nearest", cmap=plt.cm.Blues) 90 | plt.colorbar() 91 | tick_marks = np.arange(len(clean_classes)) 92 | 93 | plt.xticks(tick_marks, clean_classes, rotation=45) 94 | plt.yticks(tick_marks, clean_classes) 95 | 96 | fmt = "d" 97 | thresh = cm.max() / 2 98 | for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): 99 | plt.text( 100 | j, 101 | i, 102 | format(cm[i, j], fmt), 103 | horizontalalignment="center", 104 | color="white" if cm[i, j] > thresh else "black", 105 | ) 106 | plt.ylabel("True") 107 | plt.xlabel("Predicted") 108 | plt.tight_layout() 109 | 110 | 111 | def show_property_failure( 112 | ref_results, detector, det_results, attr_name, show_confusion=False, 113 | only_messy=False 114 | ): 115 | print("Detector: %s. Property: %s." 
% (detector, attr_name)) 116 | count = 0 117 | total = 0 118 | y_true = [] 119 | y_pred = [] 120 | for fname in ref_results: 121 | res_ref = ref_results[fname] 122 | if not res_ref.status == Status.OK: 123 | continue 124 | if only_messy and is_standard_dialect(res_ref.dialect): 125 | continue 126 | total += 1 127 | if not det_results[fname].status == Status.OK: 128 | continue 129 | if attr_name == "overall": 130 | prop_ref = ref_results[fname].dialect 131 | prop_det = det_results[fname].dialect 132 | y_true.append(repr(prop_ref)) 133 | y_pred.append(repr(prop_det)) 134 | else: 135 | prop_ref = getattr(ref_results[fname].dialect, attr_name) 136 | prop_det = getattr(det_results[fname].dialect, attr_name) 137 | y_true.append(prop_ref) 138 | y_pred.append(prop_det) 139 | if not prop_ref == prop_det: 140 | print("%s ref=%r %s=%r" % (fname, prop_ref, detector, prop_det)) 141 | count += 1 142 | print( 143 | "Total: %i out of %i (%.2f%%)" % (count, total, (count / total * 100)) 144 | ) 145 | if show_confusion: 146 | classes = [] 147 | for c in y_true + y_pred: 148 | if not c in classes: 149 | classes.append(c) 150 | cm = confusion_matrix(y_true, y_pred, labels=classes) 151 | trans = { 152 | "\t": "Tab", 153 | "": "Empty", 154 | " ": "Space", 155 | "。": "CDot", 156 | ":": "CCol", 157 | } 158 | clean = [trans.get(x, x) for x in classes] 159 | print(tabulate(cm, headers=clean, showindex=clean)) 160 | plot_confusion(cm, clean) 161 | plt.show() 162 | 163 | 164 | def main(): 165 | args = parse_args() 166 | detector, det_results = load_detector_results(args.detector_file) 167 | _, ref_results = load_detector_results(args.reference_file) 168 | if args.attr_name is None: 169 | show_complete_failure( 170 | ref_results, detector, det_results, only_messy=args.only_messy 171 | ) 172 | else: 173 | show_property_failure( 174 | ref_results, 175 | detector, 176 | det_results, 177 | args.attr_name, 178 | show_confusion=args.confusion, 179 | only_messy=args.only_messy, 180 | ) 181 | -------------------------------------------------------------------------------- /scripts/detection/suitability.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Uses the suitability metric from the Proactive Wrangler paper to decide on the 6 | dialect. 7 | 8 | Author: Gertjan van den Burg 9 | Copyright (c) 2018 - The Alan Turing Institute 10 | License: See the LICENSE file. 
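The overall idea of this detector, as implemented in determine_dqr further below, is to score every candidate dialect with the suitability metric and keep the dialect(s) with the lowest score. A minimal sketch of that selection step (illustration only, with a hypothetical score function; it is not the experiment code itself):

```python
# Sketch: keep the candidate dialect(s) whose parse looks most "suitable"
# (lowest suitability score); ties are handled separately by break_ties.
def pick_dialects(data, dialects, score):
    # assumes at least one candidate dialect
    scored = [(score(data, d), d) for d in dialects]
    best = min(s for s, _ in scored)
    return [d for s, d in scored if s == best]
```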
11 | 12 | """ 13 | 14 | 15 | from common.dialect import Dialect 16 | from common.encoding import get_encoding 17 | from common.escape import is_potential_escapechar 18 | from common.load import load_file 19 | from common.parser import parse_file 20 | from common.detector_result import DetectorResult, Status, StatusMsg 21 | from common.utils import pairwise 22 | 23 | from .core import run, get_potential_quotechars 24 | from .lib.types.rudi_types import eval_types 25 | from ._ties import break_ties 26 | 27 | DETECTOR = "suitability" 28 | WRANGLER_DELIMS = [",", ":", "|", "\t"] 29 | 30 | 31 | def extract_cells(data, dialect): 32 | cells = [] 33 | rows = parse_file(data, dialect) 34 | for row in rows: 35 | cells.extend(row) 36 | return cells 37 | 38 | 39 | def get_columns(cells): 40 | cols = {} 41 | for row in cells: 42 | for i, c in enumerate(row): 43 | if not i in cols: 44 | cols[i] = [] 45 | cols[i].append(c) 46 | return cols 47 | 48 | 49 | def count_empties(cells, dialect): 50 | count = 0 51 | for row in cells: 52 | for cell in row: 53 | if cell == "": 54 | count += 1 55 | if not dialect.quotechar is None: 56 | if cell == (dialect.quotechar + dialect.quotechar): 57 | count += 1 58 | return count 59 | 60 | 61 | def count_delimiters(cells): 62 | """Count the cells that contain a delimiter 63 | 64 | It is not entirely trivial whether or not the "continue" statement should 65 | be there, as we could also count each occurrence of a delimiter in a cell 66 | separately. However, since the second term of (1) in Guo et al. (2011) is 67 | normalized by |R|*|C|, it seems natural to include 68 | it. 69 | """ 70 | count = 0 71 | for row in cells: 72 | for cell in row: 73 | for d in WRANGLER_DELIMS: 74 | if d in cell: 75 | count += 1 76 | continue # see note 77 | return count 78 | 79 | 80 | def column_homogeneity(column): 81 | """ 82 | As the Proactive Wrangler (PW) paper doesn't give sufficient details on all 83 | the types they implement, we use our own type inference engine (from 84 | rudi_types) to guess the type. Note that "unicode_alphanum" is a generic 85 | string type, as is None. Empty cells are treated separately and are not 86 | considered a type in the PW paper. 
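As a quick illustration of the homogeneity term computed in the function body below (a minimal sketch, not part of the repository's scripts): a column whose non-empty cells are typed as three numbers and one string has homogeneity (3/4)^2 + (1/4)^2 = 0.625, so type-uniform columns score close to 1 and mixed columns score lower.

```python
# Illustrative sketch only: homogeneity as the sum of squared type proportions
# in a column, mirroring column_homogeneity (empty cells ignored here).
def toy_homogeneity(cell_types):
    n = len(cell_types)
    counts = {}
    for t in cell_types:
        counts[t] = counts.get(t, 0) + 1
    return sum((c / n) ** 2 for c in counts.values())

print(toy_homogeneity(["number", "number", "number", "string"]))  # 0.625
```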
87 | 88 | """ 89 | type_counts = {} 90 | for cell in column: 91 | detected_type = eval_types(cell) 92 | if detected_type is None: 93 | detected_type = "string" 94 | if detected_type == "unicode_alphanum": 95 | detected_type = "string" 96 | if detected_type == "empty": 97 | continue 98 | if not detected_type in type_counts: 99 | type_counts[detected_type] = 0 100 | type_counts[detected_type] += 1 101 | 102 | R = len(column) 103 | 104 | homogeneity = 0 105 | for t in type_counts: 106 | homogeneity += pow(type_counts[t] / R, 2.0) 107 | 108 | return homogeneity 109 | 110 | 111 | def compute_suitability(data, dialect): 112 | cells = extract_cells(data, dialect) 113 | columns = get_columns(cells) 114 | 115 | R = len(cells) 116 | C = len(columns) 117 | 118 | E = count_empties(cells, dialect) 119 | D = count_delimiters(cells) 120 | 121 | homo = sum((column_homogeneity(columns[cidx]) for cidx in columns)) 122 | if R * C == 0: 123 | suitability = 0 124 | else: 125 | suitability = (1 - homo / C) + (E + D) / (R * C) 126 | 127 | return suitability 128 | 129 | 130 | def get_dialects(data, encoding): 131 | delims = WRANGLER_DELIMS 132 | quotechars = get_potential_quotechars(data) 133 | escapechars = {} 134 | 135 | for delim in delims: 136 | delim_escapes = set() 137 | for u, v in pairwise(data): 138 | if v == delim and is_potential_escapechar(u, encoding): 139 | delim_escapes.add(u) 140 | for quotechar in quotechars: 141 | escapes = set(delim_escapes) 142 | for u, v in pairwise(data): 143 | if v == quotechar and is_potential_escapechar(u, encoding): 144 | escapes.add(u) 145 | escapes.add("") 146 | escapechars[(delim, quotechar)] = escapes 147 | 148 | dialects = [] 149 | for delim in delims: 150 | for quotechar in quotechars: 151 | for escapechar in escapechars[(delim, quotechar)]: 152 | d = Dialect(delim, quotechar, escapechar) 153 | dialects.append(d) 154 | return dialects 155 | 156 | 157 | def determine_dqr(filename, verbose=False): 158 | encoding = get_encoding(filename) 159 | data = load_file(filename, encoding=encoding) 160 | if data is None: 161 | return DetectorResult( 162 | status=Status.SKIP, status_msg=StatusMsg.UNREADABLE 163 | ) 164 | 165 | dialects = get_dialects(data, encoding) 166 | scores = [] 167 | 168 | for dialect in sorted(dialects): 169 | S = compute_suitability(data, dialect) 170 | if verbose: 171 | print("%15r\tsuitability = %.6f" % (dialect, S)) 172 | scores.append((S, dialect)) 173 | 174 | min_suit = min((x[0] for x in scores)) 175 | min_dialects = [x[1] for x in scores if x[0] == min_suit] 176 | 177 | if len(min_dialects) > 1: 178 | res = break_ties(data, min_dialects) 179 | else: 180 | res = min_dialects[0] 181 | 182 | if res is None: 183 | return DetectorResult( 184 | status=Status.FAIL, status_msg=StatusMsg.MULTIPLE_ANSWERS 185 | ) 186 | 187 | res = DetectorResult(dialect=res, status=Status.OK) 188 | 189 | return res 190 | 191 | 192 | def main(): 193 | run(determine_dqr=determine_dqr, detector=DETECTOR) 194 | -------------------------------------------------------------------------------- /scripts/detection/_ties.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Code for breaking ties in the heuristic solutions. 6 | 7 | Author: Gertjan van den Burg 8 | Copyright (c) 2018 - The Alan Turing Institute 9 | License: See the LICENSE file. 
10 | Date: 2018-10-30 11 | 12 | """ 13 | 14 | from common.parser import parse_file 15 | from common.utils import pairwise 16 | 17 | 18 | def break_ties_two(data, A, B): 19 | """ 20 | Break ties between dialects A and B. 21 | 22 | """ 23 | if A.delimiter == B.delimiter and A.escapechar == B.escapechar: 24 | if A.quotechar == "" or B.quotechar == "": 25 | d_no = A if A.quotechar == "" else B 26 | d_yes = B if d_no == A else A 27 | 28 | X = parse_file(data, dialect=d_no) 29 | Y = parse_file(data, dialect=d_yes) 30 | 31 | if X == Y: 32 | # quotechar has no effect 33 | return d_no 34 | else: 35 | # quotechar has an effect 36 | return d_yes 37 | elif A.quotechar == B.quotechar and A.escapechar == B.escapechar: 38 | if sorted([A.delimiter, B.delimiter]) == sorted([",", " "]): 39 | # Artifact due to type detection (comma as radix point) 40 | if A.delimiter == ",": 41 | return A 42 | else: 43 | return B 44 | elif A.delimiter == "-" or B.delimiter == "-": 45 | # Artifact due to type detection (dash as minus sign) 46 | if A.delimiter == "-": 47 | return B 48 | else: 49 | return A 50 | elif A.delimiter == B.delimiter and A.quotechar == B.quotechar: 51 | Dnone, Descape = (A, B) if A.escapechar == "" else (B, A) 52 | 53 | X = parse_file(data, Dnone) 54 | Y = parse_file(data, Descape) 55 | 56 | # double-check the shape. Usually if the shape differs the pattern score 57 | # should have caught it, but if by a freakish occurrence it hasn't, then 58 | # we can't break this tie (for now) 59 | if len(X) != len(Y): 60 | return None 61 | for x, y in zip(X, Y): 62 | if len(x) != len(y): 63 | return None 64 | 65 | cells_escaped = [] 66 | cells_unescaped = [] 67 | for x, y in zip(X, Y): 68 | for u, v in zip(x, y): 69 | if u != v: 70 | cells_unescaped.append(u) 71 | cells_escaped.append(v) 72 | 73 | # We will break the ties in the following ways: 74 | # 75 | # If the escapechar precedes the quotechar an even number of times 76 | # within each offending cell, then we think it is a functional escape 77 | # and the escaped version is the correct dialect. Note that if an odd 78 | # number of escaped quotechars would occur, then the shape of the file 79 | # will be different if it is ignored. Only if it occurs an even number 80 | # of times within the cell can we get the same shape. 81 | for u in cells_unescaped: 82 | count = 0 83 | for a, b in pairwise(u): 84 | if a != Descape.escapechar: 85 | continue 86 | if a == Descape.escapechar and b == Descape.quotechar: 87 | count += 1 88 | if count > 0 and count % 2 == 0: 89 | return Descape 90 | else: 91 | return Dnone 92 | return None 93 | 94 | 95 | def break_ties_three(data, A, B, C): 96 | # NOTE: We have only observed one tie for each case during development, so 97 | # this may need to be improved in the future. 98 | equal_delim = A.delimiter == B.delimiter == C.delimiter 99 | equal_escape = A.escapechar == B.escapechar == C.escapechar 100 | 101 | if equal_delim and equal_escape: 102 | # difference is *only* in quotechar 103 | dialects = [A, B, C] 104 | 105 | # TODO: shouldn't hardcode single/double quotes here. 106 | # try with type-only on: 107 | # github/test_set/files/6367b9c5338b9a035a221cfffd928e92.csv 108 | d_none = next((d for d in dialects if d.quotechar == ""), None) 109 | d_single = next((d for d in dialects if d.quotechar == "'"), None) 110 | d_double = next((d for d in dialects if d.quotechar == '"'), None) 111 | 112 | # Added to fix above todo note, doesn't affect test results. 
113 | if any((d is None for d in [d_none, d_single, d_double])): 114 | return None 115 | 116 | r_none = parse_file(data, d_none) 117 | r_single = parse_file(data, d_single) 118 | r_double = parse_file(data, d_double) 119 | 120 | if len(r_none) != len(r_single) or len(r_none) != len(r_double): 121 | return None 122 | 123 | if r_none == r_single: 124 | return break_ties_two(data, d_none, d_double) 125 | elif r_none == r_double: 126 | return break_ties_two(data, d_none, d_single) 127 | elif equal_delim: 128 | # difference is in quotechar *and* escapechar 129 | 130 | # NOTE: The reasoning here is as follows. If we are in this situation, 131 | # then there is both a potential escapechar and there are quotechars, 132 | # but the pattern score is the same and the type score can't make a 133 | # difference because no cells become clean if we interpret the 134 | # quote/escape correctly. This implies that the quote and escape do 135 | # have a function. Thus, we find the dialects that have a quote and 136 | # defer to break_ties_two. 137 | 138 | dialects = [A, B, C] 139 | with_quote = [d for d in dialects if d.quotechar != ""] 140 | 141 | if len(with_quote) != 2: 142 | return None 143 | 144 | return break_ties_two(data, with_quote[0], with_quote[1]) 145 | 146 | return None 147 | 148 | 149 | def break_ties_four(data, dialects): 150 | # NOTE: We have only observed one case during development where this 151 | # function was needed. It may need to be revisited in the future if other 152 | # examples are found. 153 | 154 | equal_delim = len(set([d.delimiter for d in dialects])) == 1 155 | if not equal_delim: 156 | return None 157 | 158 | # First, identify dialects that result in the same parsing result. 159 | equal_dialects = [] 160 | for a, b in pairwise(dialects): 161 | X = parse_file(data, a) 162 | Y = parse_file(data, b) 163 | if X == Y: 164 | equal_dialects.append((a, b)) 165 | 166 | # Try to break the ties in these pairs 167 | new_dialects = set() 168 | visited = set() 169 | for A, B in equal_dialects: 170 | ans = break_ties_two(data, A, B) 171 | if not ans is None: 172 | new_dialects.add(ans) 173 | visited.add(A) 174 | visited.add(B) 175 | for d in dialects: 176 | if not d in visited: 177 | new_dialects.add(d) 178 | 179 | dialects = list(new_dialects) 180 | 181 | # Defer to other functions if the number of dialects was reduced 182 | if len(dialects) == 2: 183 | return break_ties_two(data, *dialects) 184 | elif len(dialects) == 3: 185 | return break_ties_three(data, *dialects) 186 | 187 | return None 188 | 189 | 190 | def break_ties(data, dialects): 191 | if len(dialects) == 2: 192 | return break_ties_two(data, dialects[0], dialects[1]) 193 | elif len(dialects) == 3: 194 | return break_ties_three(data, dialects[0], dialects[1], dialects[2]) 195 | elif len(dialects) == 4: 196 | return break_ties_four(data, dialects) 197 | return None 198 | -------------------------------------------------------------------------------- /scripts/common/parser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Our CSV parser. 6 | 7 | Author: Gertjan van den Burg 8 | Copyright (c) 2018 - The Alan Turing Institute 9 | License: See the LICENSE file. 10 | Date: 2018-10-22 11 | 12 | """ 13 | 14 | 15 | def parse_file( 16 | S, dialect=None, delimiter=None, quotechar=None, escapechar=None 17 | ): 18 | """ 19 | Parse a CSV file given as a string by ``S`` into a list of lists. 
20 | 21 | This function automatically takes double quotes into account, uses 22 | universal newlines, and can deal with quotes that start *inside* a cell. 23 | Quotes are only stripped from cells if they occur at the start and the end 24 | of the cell. 25 | 26 | Tests 27 | ----- 28 | 29 | Testing splitting on delimiter with or without quotes 30 | 31 | >>> parse_file('A,B,C,D,E', delimiter=',', quotechar='"') 32 | [['A', 'B', 'C', 'D', 'E']] 33 | >>> parse_file('A,B,C,D,E', delimiter=',', quotechar='') 34 | [['A', 'B', 'C', 'D', 'E']] 35 | >>> parse_file('A,B,C,D,E') 36 | [['A,B,C,D,E']] 37 | >>> parse_file('A,"B",C,D,E', delimiter=',', quotechar='"') 38 | [['A', 'B', 'C', 'D', 'E']] 39 | >>> parse_file('A,"B,C",D,E', delimiter=',', quotechar='"') 40 | [['A', 'B,C', 'D', 'E']] 41 | >>> parse_file('A,"B,C",D,E', delimiter=',', quotechar='') 42 | [['A', '"B', 'C"', 'D', 'E']] 43 | >>> parse_file('"A","B","C",,,,', delimiter=',', quotechar='') 44 | [['"A"', '"B"', '"C"', '', '', '', '']] 45 | 46 | Testing splitting on rows only: 47 | 48 | >>> parse_file('A"B"C\\rA"B""C""D"', quotechar='') 49 | [['A"B"C'], ['A"B""C""D"']] 50 | >>> parse_file('A"B"C\\nA"B""C""D"', quotechar='') 51 | [['A"B"C'], ['A"B""C""D"']] 52 | >>> parse_file('A"B"C\\r\\nA"B""C""D"', quotechar='') 53 | [['A"B"C'], ['A"B""C""D"']] 54 | >>> parse_file('A"B\\r\\nB"C\\r\\nD"E"F\\r\\nG', quotechar='"') 55 | [['A"B\\r\\nB"C'], ['D"E"F'], ['G']] 56 | >>> parse_file('A"B\\nB"C\\nD"E"F\\nG', quotechar='"') 57 | [['A"B\\nB"C'], ['D"E"F'], ['G']] 58 | >>> parse_file('A"B\\nB\\rB"C\\nD"E"F\\nG', quotechar='"') 59 | [['A"B\\nB\\rB"C'], ['D"E"F'], ['G']] 60 | 61 | Tests from Python's builtin CSV module: 62 | 63 | >>> parse_file('') 64 | [] 65 | >>> parse_file('a,b\\r', delimiter=',') 66 | [['a', 'b']] 67 | >>> parse_file('a,b\\n', delimiter=',') 68 | [['a', 'b']] 69 | >>> parse_file('a,b\\r\\n', delimiter=',') 70 | [['a', 'b']] 71 | >>> parse_file('a,"', delimiter=',', quotechar='"') 72 | [['a', '']] 73 | >>> parse_file('"a', delimiter=',', quotechar='"') 74 | [['a']] 75 | >>> parse_file('a,|b,c', delimiter=',', quotechar='"', escapechar='|') # differs from Python (1) 76 | [['a', '|b', 'c']] 77 | >>> parse_file('a,b|,c', delimiter=',', quotechar='"', escapechar='|') 78 | [['a', 'b,c']] 79 | >>> parse_file('a,"b,|c"', delimiter=',', quotechar='"', escapechar='|') # differs from Python (1) 80 | [['a', 'b,|c']] 81 | >>> parse_file('a,"b,c|""', delimiter=',', quotechar='"', escapechar='|') 82 | [['a', 'b,c"']] 83 | >>> parse_file('a,"b,c"|', delimiter=',', quotechar='"', escapechar='|') # differs from Python (2) 84 | [['a', 'b,c']] 85 | >>> parse_file('1,",3,",5', delimiter=',', quotechar='"') 86 | [['1', ',3,', '5']] 87 | >>> parse_file('1,",3,",5', delimiter=',', quotechar='') 88 | [['1', '"', '3', '"', '5']] 89 | >>> parse_file(',3,"5",7.3, 9', delimiter=',', quotechar='"') 90 | [['', '3', '5', '7.3', ' 9']] 91 | >>> parse_file('"a\\nb", 7', delimiter=',', quotechar='"') 92 | [['a\\nb', ' 7']] 93 | 94 | Double quotes: 95 | 96 | >>> parse_file('a,"a""b""c"', delimiter=',', quotechar='"') 97 | [['a', 'a"b"c']] 98 | 99 | Mix double and escapechar: 100 | 101 | >>> parse_file('a,"bc""d"",|"f|""', delimiter=',', quotechar='"', escapechar='|') 102 | [['a', 'bc"d","f"']] 103 | 104 | Other tests: 105 | 106 | >>> parse_file('a,b "c" d,e', delimiter=',', quotechar='') 107 | [['a', 'b "c" d', 'e']] 108 | >>> parse_file('a,b "c" d,e', delimiter=',', quotechar='"') 109 | [['a', 'b "c" d', 'e']] 110 | >>> parse_file('a,\\rb,c', delimiter=',') 111 | 
[['a', ''], ['b', 'c']] 112 | >>> parse_file('a,b\\r\\n\\r\\nc,d\\r\\n', delimiter=',') 113 | [['a', 'b'], ['c', 'd']] 114 | >>> parse_file('\\r\\na,b\\rc,d\\n\\re,f\\r\\n', delimiter=',') 115 | [['a', 'b'], ['c', 'd'], ['e', 'f']] 116 | 117 | Further escape char tests: 118 | 119 | >>> parse_file('a,b,c||d', delimiter=',', quotechar='', escapechar='|') 120 | [['a', 'b', 'c|d']] 121 | >>> parse_file('a,b,c||d,e|,d', delimiter=',', quotechar='', escapechar='|') 122 | [['a', 'b', 'c|d', 'e,d']] 123 | 124 | Quote mismatch until EOF: 125 | 126 | >>> parse_file('a,b,c"d,e\\n', delimiter=',', quotechar='"') 127 | [['a', 'b', 'c"d,e\\n']] 128 | >>> parse_file('a,b,c"d,e\\n', delimiter=',', quotechar='') 129 | [['a', 'b', 'c"d', 'e']] 130 | >>> parse_file('a,b,"c,d', delimiter=',', quotechar='"') 131 | [['a', 'b', 'c,d']] 132 | >>> parse_file('a,b,"c,d\\n', delimiter=',', quotechar='"') 133 | [['a', 'b', 'c,d\\n']] 134 | 135 | Single column: 136 | 137 | >>> parse_file('a\\rb\\rc\\n') 138 | [['a'], ['b'], ['c']] 139 | 140 | These tests illustrate a difference with the Python parser, which in this 141 | case would return ``[['a', 'abc', 'd']]``. 142 | 143 | >>> parse_file('a,"ab"c,d', delimiter=',', quotechar='') 144 | [['a', '"ab"c', 'd']] 145 | >>> parse_file('a,"ab"c,d', delimiter=',', quotechar='"') 146 | [['a', '"ab"c', 'd']] 147 | 148 | 149 | Notes 150 | ----- 151 | 152 | (1) We only interpret the escape character if it precedes the provided 153 | delimiter, quotechar, or itself. Otherwise, the escape character does not 154 | serve any purpose, and should not be dropped automatically. 155 | 156 | (2) For some reason the Python test suite places this escape character 157 | *inside* the preceding quoted block. This seems counterintuitive and 158 | incorrect and thus this behavior has not been duplicated. 
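To make note (1) above concrete, here is a minimal comparison with Python's built-in ``csv`` module (for illustration only; it is not one of the doctests above):

```python
# The standard library always drops the escape character, even when it escapes
# nothing; parse_file above keeps it in that case and returns [['a', '|b', 'c']].
import csv
import io

rows = list(
    csv.reader(io.StringIO("a,|b,c"), delimiter=",", quotechar='"', escapechar="|")
)
print(rows)  # [['a', 'b', 'c']]
```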
159 | 160 | """ 161 | if not dialect is None: 162 | delimiter = dialect.delimiter if delimiter is None else delimiter 163 | quotechar = dialect.quotechar if quotechar is None else quotechar 164 | escapechar = dialect.escapechar if escapechar is None else escapechar 165 | 166 | quote_cond = lambda c, q: q and c.startswith(q) and c.endswith(q) 167 | 168 | in_quotes = False 169 | in_escape = False 170 | rows = [] 171 | i = 0 172 | row = [] 173 | field = "" 174 | end_row = False 175 | end_field = False 176 | s = None 177 | while i < len(S): 178 | s = S[i] 179 | if s == quotechar: 180 | if in_escape: 181 | in_escape = False 182 | elif not in_quotes: 183 | in_quotes = True 184 | else: 185 | if i + 1 < len(S) and S[i + 1] == quotechar: 186 | i += 1 187 | else: 188 | in_quotes = False 189 | field += s 190 | elif s in ["\r", "\n"]: 191 | if in_quotes: 192 | field += s 193 | elif field == "" and row == []: 194 | pass 195 | else: 196 | end_row = True 197 | end_field = True 198 | elif s == delimiter: 199 | if in_escape: 200 | in_escape = False 201 | field += s 202 | elif in_quotes: 203 | field += s 204 | else: 205 | end_field = True 206 | elif s == escapechar: 207 | if in_escape: 208 | field += s 209 | in_escape = False 210 | else: 211 | in_escape = True 212 | else: 213 | if in_escape: 214 | field += escapechar 215 | in_escape = False 216 | field += s 217 | 218 | if end_field: 219 | if quote_cond(field, quotechar): 220 | field = field[1:-1] 221 | row.append(field) 222 | field = "" 223 | end_field = False 224 | 225 | if end_row: 226 | rows.append(row) 227 | row = [] 228 | end_row = False 229 | 230 | i += 1 231 | 232 | if quote_cond(field, quotechar): 233 | field = field[1:-1] 234 | elif in_quotes: 235 | if field.startswith(quotechar): 236 | field = field[1:] 237 | s = "" 238 | if not s in ["\r", "\n", None]: 239 | row.append(field) 240 | rows.append(row) 241 | 242 | return rows 243 | -------------------------------------------------------------------------------- /scripts/detection/lib/types/rudi_types.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | 5 | Rudimentary types, used as a first pass to detect cell types given a potential 6 | delimiter. 7 | 8 | Potentially add (small reward?): 9 | 10 | - Latitude and longitude 11 | - Alternative date(time) formats: 12 | x 2009-01-02T00:00 13 | x 18/10/2014 14 | x 04/07/11 15 | - 26-Feb 16 | - 10/12/2015 HH:MM 17 | - 10-Jul-12 18 | - Dec-13 19 | - File sizes and bandwidth speed 20 | - Unix Paths 21 | x Currency (\p{Sc} + float) 22 | 23 | 24 | Notes: 25 | 26 | - Testing dates with Maya or Pendulum might work, but I got some false 27 | positives such as "T2P" being interpreted as a time. 28 | 29 | - Maybe check out Moment.js? Many datetime formats for many locales. This 30 | might be overkill for a "rudimentary type guess" though. 31 | 32 | x We can make this faster by compiling the regexes. 33 | 34 | Should we consider a type hierarchy? 
Some urls (www.xxx.yyy) are also strings 35 | 36 | Author: Gertjan van den Burg 37 | 38 | """ 39 | 40 | import regex 41 | import sys 42 | 43 | 44 | STRIP_WHITESPACE = True 45 | TO_CHECK = [] 46 | CHECK_ALL = False 47 | 48 | # Used this site: https://unicode-search.net/unicode-namesearch.pl 49 | SPECIALS_ALLOWED = [ 50 | # Periods 51 | "\u002e", 52 | "\u06d4", 53 | "\u3002", 54 | "\ufe52", 55 | "\uff0e", 56 | "\uff61", 57 | # Parentheses 58 | "\u0028", 59 | "\u0029", 60 | "\u27ee", 61 | "\u27ef", 62 | "\uff08", 63 | "\uff09", 64 | # Question marks 65 | "\u003F", 66 | "\u00BF", 67 | "\u037E", 68 | "\u055E", 69 | "\u061F", 70 | "\u1367", 71 | "\u1945", 72 | "\u2047", 73 | "\u2048", 74 | "\u2049", 75 | "\u2CFA", 76 | "\u2CFB", 77 | "\u2E2E", 78 | "\uA60F", 79 | "\uA6F7", 80 | "\uFE16", 81 | "\uFE56", 82 | "\uFF1F", 83 | chr(69955), # chakma question mark 84 | chr(125279), # adlam initial question mark 85 | # Exclamation marks 86 | "\u0021", 87 | "\u00A1", 88 | "\u01C3", 89 | "\u055C", 90 | "\u07F9", 91 | "\u109F", 92 | "\u1944", 93 | "\u203C", 94 | "\u2048", 95 | "\u2049", 96 | "\uAA77", 97 | "\uFE15", 98 | "\uFE57", 99 | "\uFF01", 100 | chr(125278), # adlam initial exclamation mark 101 | ] 102 | 103 | PATTERNS = { 104 | "number_1": regex.compile( 105 | "(?=[+-\.\d])[+-]?(?:0|[1-9]\d*)?(((?P\.)?(?(dot)(?P\d*(\d+[eE][+-]?\d+)?)|(?P([eE][+-]?\d+)?)))|((?P,)?(?(comma)(?P\d+(\d+[eE][+-]?\d+)?)|(?P([eE][+-]?\d+)?))))" 106 | ), 107 | "number_2": regex.compile("[+-]?(?:[1-9]|[1-9]\d{0,2})(?:\,\d{3})+\.\d*"), 108 | "number_3": regex.compile("[+-]?(?:[1-9]|[1-9]\d{0,2})(?:\.\d{3})+\,\d*"), 109 | "url": regex.compile( 110 | "(?:(?:[A-Za-z]{3,9}:(?:\/\/)?)(?:[-;:&=\+\$,\w]+@)?[A-Za-z0-9.-]+|(?:www.|[-;:&=\+\$,\w]+@)[A-Za-z0-9.-]+)(?:(?:\/[\+~%\/.\w\-_]*)?\??(?:[-\+=&;%@.\w_]*)#?(?:[\w]*))?" 
111 | ), 112 | "email": regex.compile( 113 | r"(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)" 114 | ), 115 | "unicode_alphanum": regex.compile( 116 | "(\p{N}+\p{L}+[\p{N}\p{L}\ " 117 | + regex.escape("".join(SPECIALS_ALLOWED)) 118 | + "]*|\p{L}+[\p{N}\p{L}\ " 119 | + regex.escape("".join(SPECIALS_ALLOWED)) 120 | + "]+)" 121 | ), 122 | "time_hhmmss": regex.compile( 123 | "(0[0-9]|1[0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])" 124 | ), 125 | "time_hhmm": regex.compile("(0[0-9]|1[0-9]|2[0-3]):([0-5][0-9])"), 126 | "time_HHMM": regex.compile("(0[0-9]|1[0-9]|2[0-3])([0-5][0-9])"), 127 | "time_HH": regex.compile("(0[0-9]|1[0-9]|2[0-3])([0-5][0-9])"), 128 | "time_hmm": regex.compile("([0-9]|1[0-9]|2[0-3]):([0-5][0-9])"), 129 | "currency": regex.compile("\p{Sc}\s?(.*)"), 130 | "unix_path": regex.compile( 131 | "[\/~]{1,2}(?:[a-zA-Z0-9\.]+(?:[\/]{1,2}))+(?:[a-zA-Z0-9\.]+)" 132 | ), 133 | } 134 | 135 | 136 | def load_date_patterns(): 137 | year2 = "(?:\d{2})" 138 | year4 = "(?:[12]\d{3})" 139 | 140 | month_leading = "(?:0[1-9]|1[0-2])" 141 | month_sparse = "(?:[1-9]|1[0-2])" 142 | 143 | day_leading = "(?:0[1-9]|[12]\d|3[01])" 144 | day_sparse = "(?:[1-9]|[12]\d|3[01])" 145 | 146 | sep = "[-/\.\ ]" 147 | 148 | counter = 0 149 | for year in [year2, year4]: 150 | for month in [month_leading, month_sparse]: 151 | for day in [day_leading, day_sparse]: 152 | fmt = {"year": year, "month": month, "day": day, "sep": sep} 153 | 154 | pat_1 = "{year}{sep}{month}{sep}{day}".format(**fmt) 155 | pat_2 = "{day}{sep}{month}{sep}{year}".format(**fmt) 156 | pat_3 = "{month}{sep}{day}{sep}{year}".format(**fmt) 157 | 158 | pat_cn = "{year}年{month}月{day}日".format(**fmt) 159 | pat_ko = "{year}년{month}월{day}일".format(**fmt) 160 | 161 | for pattern in [pat_1, pat_2, pat_3, pat_cn, pat_ko]: 162 | PATTERNS["date_%i" % counter] = regex.compile(pattern) 163 | counter += 1 164 | 165 | # These should be allowed as dates, but are also numbers. 166 | for year in [year2, year4]: 167 | fmt = { 168 | "year": year, 169 | "month": month_leading, 170 | "day": day_leading, 171 | "sep": "", 172 | } 173 | pat_1 = "{year}{sep}{month}{sep}{day}".format(**fmt) 174 | pat_2 = "{day}{sep}{month}{sep}{year}".format(**fmt) 175 | pat_3 = "{month}{sep}{day}{sep}{year}".format(**fmt) 176 | 177 | for pattern in [pat_1, pat_2, pat_3, pat_cn]: 178 | PATTERNS["date_%i" % counter] = regex.compile(pattern) 179 | counter += 1 180 | 181 | 182 | # TODO Ugly to do this here, but this is research code... 183 | 184 | load_date_patterns() 185 | 186 | 187 | def test_with_regex(cell, patname): 188 | # Test if cell *fully* matches reg (e.g. entire cell is number, maybe allow 189 | # stripping of leading/trailing spaces) 190 | if STRIP_WHITESPACE: 191 | cell = cell.strip() 192 | pat = PATTERNS.get(patname, None) 193 | match = pat.fullmatch(cell) 194 | return match is not None 195 | 196 | 197 | def test_number(cell): 198 | # NOTE: This is more general than trying to coerce to float(), because it 199 | # allows use of the comma as radix point. 
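# For example, "1,5" (comma as radix point) is accepted as a number by the
# patterns above, whereas float("1,5") would raise a ValueError.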
200 | if cell == "": 201 | return False 202 | if test_with_regex(cell, "number_1"): 203 | return True 204 | if test_with_regex(cell, "number_2"): 205 | return True 206 | if test_with_regex(cell, "number_3"): 207 | return True 208 | return False 209 | 210 | 211 | def test_url_or_email(cell): 212 | return test_with_regex(cell, "url") or test_with_regex(cell, "email") 213 | 214 | 215 | def test_unicode_alphanum(cell): 216 | # TODO: I'm not sure if it's desirable to allow alphanumeric cells, because 217 | # it's not clear if they include "junk" cells due to incorrect delimiter 218 | # (think: space). Maybe it's better to have only character cells? 219 | # NOTE: This function assumes that number and url are already excluded. 220 | 221 | return test_with_regex(cell, "unicode_alphanum") 222 | 223 | 224 | def test_date(cell): 225 | if test_number(cell): 226 | return False 227 | 228 | for patname in PATTERNS: 229 | if patname.startswith("date_"): 230 | if test_with_regex(cell, patname): 231 | return True 232 | return False 233 | 234 | 235 | def test_time(cell): 236 | # HH:MM:SS, HH:MM, or H:MM 237 | return ( 238 | test_with_regex(cell, "time_hmm") 239 | or test_with_regex(cell, "time_hhmm") 240 | or test_with_regex(cell, "time_hhmmss") 241 | ) 242 | 243 | 244 | def test_empty(cell): 245 | if STRIP_WHITESPACE: 246 | cell = cell.strip() 247 | return cell == "" 248 | 249 | 250 | def test_percentage(cell): 251 | cell = cell.strip() 252 | return cell.endswith("%") and test_number(cell.rstrip("%")) 253 | 254 | 255 | def test_currency(cell): 256 | if STRIP_WHITESPACE: 257 | cell = cell.strip() 258 | pat = PATTERNS.get("currency", None) 259 | m = pat.fullmatch(cell) 260 | if m is None: 261 | return False 262 | grp = m.group(1) 263 | if not test_number(grp): 264 | return False 265 | return True 266 | 267 | 268 | def test_datetime(cell): 269 | # Takes care of cells with '[date] [time]' and '[date]T[time]' (iso) 270 | if " " in cell: 271 | parts = cell.split(" ") 272 | if len(parts) > 2: 273 | return False 274 | return test_date(parts[0]) and test_time(parts[1]) 275 | elif "T" in cell: 276 | parts = cell.split("T") 277 | if len(parts) > 2: 278 | return False 279 | isdate = test_date(parts[0]) 280 | if not isdate: 281 | return False 282 | # [date]T[time] 283 | if test_time(parts[1]): 284 | return True 285 | # [date]T[time][+-][time] 286 | if "+" in parts[1]: 287 | subparts = parts[1].split("+") 288 | istime1 = test_time(subparts[0]) 289 | istime2 = test_time(subparts[1]) 290 | if not istime1: 291 | return False 292 | if istime2: 293 | return True 294 | if test_with_regex(subparts[1], "time_HHMM"): 295 | return True 296 | if test_with_regex(subparts[1], "time_HH"): 297 | return True 298 | elif "-" in parts[1]: 299 | subparts = parts[1].split("-") 300 | istime1 = test_time(subparts[0]) 301 | istime2 = test_time(subparts[1]) 302 | if not istime1: 303 | return False 304 | if istime2: 305 | return True 306 | if test_with_regex(subparts[1], "time_HHMM"): 307 | return True 308 | if test_with_regex(subparts[1], "time_HH"): 309 | return True 310 | return False 311 | 312 | 313 | def test_nan(cell): 314 | if STRIP_WHITESPACE: 315 | cell = cell.strip() 316 | # other forms (na and nan) are caught by unicode_alphanum 317 | if cell.lower() == "n/a": 318 | return True 319 | return False 320 | 321 | 322 | def eval_types(cell, break_away=True): 323 | type_tests = [ 324 | ("empty", test_empty), 325 | ("url_or_email", test_url_or_email), 326 | ("number", test_number), 327 | ("time", test_time), 328 | ("percentage", 
test_percentage), 329 | ("currency", test_currency), 330 | ("unicode_alphanum", test_unicode_alphanum), 331 | ("nan", test_nan), 332 | ("date", test_date), 333 | ("datetime", test_datetime), 334 | ] 335 | 336 | detected = [] 337 | for name, func in type_tests: 338 | if func(cell): 339 | detected.append(name) 340 | if break_away: 341 | break 342 | 343 | if len(detected) > 1: 344 | print( 345 | "Type tests aren't mutually exclusive!\nCell: %r\nTypes: %r" 346 | % (cell, detected), 347 | file=sys.stderr, 348 | ) 349 | raise ValueError 350 | if len(detected) == 0: 351 | return None 352 | return detected[0] 353 | -------------------------------------------------------------------------------- /scripts/detection/hypo.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # 3 | # Author: G.J.J. van den Burg 4 | # Copyright (c) 2018 - The Alan Turing Institute 5 | # License: See the LICENSE file. 6 | # 7 | 8 | library(devtools) 9 | library(rjson) 10 | 11 | # load our local version of hypoparsr 12 | match <- grep("--file=", commandArgs(trailingOnly=F)) 13 | this.path <- normalizePath(sub("--file=", "", commandArgs(trailingOnly=F)[match])) 14 | this.dir <- dirname(this.path) 15 | hypoparsr.dir <- paste(this.dir, "/lib/hypoparsr", sep="") 16 | load_all(hypoparsr.dir, export_all=F) 17 | 18 | printf <- function(...) invisible(cat(sprintf(...))); 19 | fprintf <- function(file, ...) invisible(cat(sprintf(...), file=file)) 20 | 21 | #' Replacement for R's ridiculous strsplit that drops empties. 22 | my.strsplit <- function(string, delim) { 23 | out <- strsplit(string, delim) 24 | if (substr(string, nchar(string), nchar(string)) == delim) 25 | out <- c(out, "") 26 | return(out) 27 | } 28 | 29 | real.quotechar <- function(filename, best, delim, rowsep, quote.method) 30 | { 31 | # Since HypoParsr doesn't reliably return the quote character, we here try 32 | # to reverse engineer what they do to figure out what the quote character 33 | # is that they actually use. 
34 | 35 | encoding <- strsplit(names(best$confidence[6]), '\n')[[1]][3] 36 | text <- readr::read_file(filename, locale=readr::locale(encoding=encoding)) 37 | text <- iconv(text) 38 | 39 | if (rowsep == "E") 40 | regex.rowsep <- "\r\n" 41 | else if (rowsep == "N") 42 | regex.rowsep <- "(?>> masked_by_quotechar('A"B&C"A', '"', '', '&') 39 | True 40 | >>> masked_by_quotechar('A"B&C"A&A', '"', '', '&') 41 | False 42 | >>> masked_by_quotechar('A|"B&C"A', '"', '|', '&') 43 | False 44 | >>> masked_by_quotechar('A"B"C', '"', '', '') 45 | False 46 | """ 47 | if test_char == "": 48 | return False 49 | escape_next = False 50 | in_quotes = False 51 | i = 0 52 | while i < len(S): 53 | s = S[i] 54 | if s == quotechar: 55 | if escape_next: 56 | i += 1 57 | continue 58 | if not in_quotes: 59 | in_quotes = True 60 | else: 61 | if i + 1 < len(S) and S[i + 1] == quotechar: 62 | i += 1 63 | else: 64 | in_quotes = False 65 | elif s == test_char and not in_quotes: 66 | return False 67 | elif s == escapechar: 68 | escape_next = True 69 | i += 1 70 | return True 71 | 72 | 73 | def get_potential_delimiters(data, encoding): 74 | delims = set() 75 | c = Counter(data) 76 | for delim, _ in c.most_common(): 77 | if ( 78 | can_be_delim_unicode(delim, encoding=encoding) 79 | and not delim in BLOCKED_DELIMS 80 | ): 81 | delims.add(delim) 82 | delims.add("") 83 | return delims 84 | 85 | 86 | def get_cells(data, dialect): 87 | rows = parse_file(data, dialect=dialect) 88 | all_cells = [] 89 | for row in rows: 90 | all_cells.extend(row) 91 | return all_cells 92 | 93 | 94 | def make_base_abstraction(S, dialect): 95 | stack = "" 96 | escape_next = False 97 | for s in S: 98 | if s in ["\r", "\n"]: 99 | if not stack.endswith("R"): 100 | stack += "R" 101 | elif s == dialect.delimiter: 102 | if escape_next: 103 | stack += "C" 104 | escape_next = False 105 | else: 106 | stack += "D" 107 | elif s == dialect.quotechar: 108 | if escape_next: 109 | stack += "C" 110 | escape_next = False 111 | else: 112 | stack += "Q" 113 | elif s == dialect.escapechar: 114 | if escape_next: 115 | if not stack.endswith("C"): 116 | stack += "C" 117 | escape_next = False 118 | else: 119 | escape_next = True 120 | else: 121 | if escape_next: 122 | escape_next = False 123 | if not stack.endswith("C"): 124 | stack += "C" 125 | 126 | return stack 127 | 128 | 129 | def merge_with_quotechar(S, dialect): 130 | in_quotes = False 131 | i = 0 132 | quote_pairs = [] 133 | while i < len(S): 134 | s = S[i] 135 | if not s == "Q": 136 | i += 1 137 | continue 138 | 139 | if not in_quotes: 140 | in_quotes = True 141 | begin_quotes = i 142 | else: 143 | if i + 1 < len(S) and S[i + 1] == "Q": 144 | i += 1 145 | else: 146 | end_quotes = i 147 | quote_pairs.append((begin_quotes, end_quotes)) 148 | in_quotes = False 149 | i += 1 150 | 151 | # replace quoted blocks by C 152 | Sl = list(S) 153 | for begin, end in quote_pairs: 154 | for i in range(begin, end + 1): 155 | Sl[i] = "C" 156 | S = "".join(Sl) 157 | 158 | return S 159 | 160 | 161 | def strip_trailing(abstract): 162 | while abstract.endswith("R"): 163 | abstract = abstract[:-1] 164 | return abstract 165 | 166 | 167 | def fill_empties(abstract): 168 | while "DD" in abstract: 169 | abstract = abstract.replace("DD", "DCD") 170 | 171 | while "DR" in abstract: 172 | abstract = abstract.replace("DR", "DCR") 173 | 174 | while "RD" in abstract: 175 | abstract = abstract.replace("RD", "RCD") 176 | 177 | while "CC" in abstract: 178 | abstract = abstract.replace("CC", "C") 179 | 180 | if abstract.startswith("D"): 181 | abstract = 
"C" + abstract 182 | 183 | if abstract.endswith("D"): 184 | abstract += "C" 185 | 186 | return abstract 187 | 188 | 189 | def filter_urls(data): 190 | pat = "(?:(?:[A-Za-z]{3,9}:(?:\/\/)?)(?:[-;:&=\+\$,\w]+@)?[A-Za-z0-9.-]+|(?:www.|[-;:&=\+\$,\w]+@)[A-Za-z0-9.-]+)(?:(?:\/[\+~%\/.\w\-_]*)?\??(?:[-\+=&;%@.\w_]*)#?(?:[\w]*))?" 191 | url_idxs = [] 192 | for match in re.finditer(pat, data): 193 | url_idxs.append(match.span()) 194 | Sl = list(data) 195 | for begin, end in url_idxs: 196 | for i in range(begin, end): 197 | Sl[i] = "U" 198 | return "".join(Sl) 199 | 200 | 201 | def make_abstraction(data, dialect): 202 | """ 203 | Make the abstract representation of a CSV file. 204 | 205 | Tests 206 | ----- 207 | 208 | >>> make_abstraction('A,B,C', Dialect(delimiter=',', quotechar='', escapechar='')) 209 | 'CDCDC' 210 | >>> make_abstraction('A,\\rA,A,A\\r', Dialect(delimiter=',', quotechar='', escapechar='')) 211 | 'CDCRCDCDC' 212 | >>> make_abstraction('a,a,\\n,a,a\\ra,a,a\\r\\n', Dialect(delimiter=',', quotechar='', escapechar='')) 213 | 'CDCDCRCDCDCRCDCDC' 214 | >>> make_abstraction('a,"bc""d""e""f""a",\\r\\n', Dialect(delimiter=',', quotechar='"', escapechar='')) 215 | 'CDCDC' 216 | >>> make_abstraction('a,"bc""d"",|"f|""', Dialect(delimiter=',', quotechar='"', escapechar='|')) 217 | 'CDC' 218 | >>> make_abstraction(',,,', Dialect(delimiter=',', quotechar='', escapechar='')) 219 | 'CDCDCDC' 220 | >>> make_abstraction(',"",,', Dialect(delimiter=',', quotechar='"', escapechar='')) 221 | 'CDCDCDC' 222 | >>> make_abstraction(',"",,\\r\\n', Dialect(delimiter=',', quotechar='"', escapechar='')) 223 | 'CDCDCDC' 224 | 225 | Escape char: 226 | 227 | >>> make_abstraction('A,B|,C', Dialect(delimiter=',', quotechar='', escapechar='|')) 228 | 'CDC' 229 | >>> make_abstraction('A,"B,C|"D"', Dialect(delimiter=',', quotechar='"', escapechar='|')) 230 | 'CDC' 231 | >>> make_abstraction('a,|b,c', Dialect(delimiter=',', quotechar='', escapechar='|')) 232 | 'CDCDC' 233 | >>> make_abstraction('a,b|,c', Dialect(delimiter=',', quotechar='', escapechar='|')) 234 | 'CDC' 235 | >>> make_abstraction('a,"b,c|""', Dialect(delimiter=',', quotechar='"', escapechar='|')) 236 | 'CDC' 237 | >>> make_abstraction('a,b||c', Dialect(delimiter=',', quotechar='', escapechar='|')) 238 | 'CDC' 239 | >>> make_abstraction('a,"b|"c||d|"e"', Dialect(delimiter=',', quotechar='"', escapechar='|')) 240 | 'CDC' 241 | >>> make_abstraction('a,"b|"c||d","e"', Dialect(delimiter=',', quotechar='"', escapechar='|')) 242 | 'CDCDC' 243 | 244 | """ 245 | 246 | A = make_base_abstraction(data, dialect) 247 | A = merge_with_quotechar(A, dialect) 248 | A = fill_empties(A) 249 | A = strip_trailing(A) 250 | 251 | return A 252 | 253 | 254 | def is_clean(cell): 255 | return not (eval_types(cell) is None) 256 | 257 | 258 | def get_potential_dialects(data, encoding): 259 | """ 260 | We consider as escape characters those characters for which 261 | is_potential_escapechar() is True and that occur at least once before a 262 | quote character or delimiter in the dialect. 263 | 264 | One may wonder if self-escaping is an issue here (i.e. "\\\\", two times 265 | backslash). It is not. In a file where a single backslash is desired and 266 | escaping with a backslash is used, then it only makes sense to do this in a 267 | file where the backslash is already used as an escape character (in which 268 | case we include it). If it is never used as escape for the delimiter or 269 | quotechar, then it is not necessary to self-escape. 
270 | """ 271 | delims = get_potential_delimiters(data, encoding) 272 | quotechars = get_potential_quotechars(data) 273 | escapechars = {} 274 | 275 | for delim, quotechar in itertools.product(delims, quotechars): 276 | escapechars[(delim, quotechar)] = set([""]) 277 | 278 | for u, v in pairwise(data): 279 | if not is_potential_escapechar(u, encoding): 280 | continue 281 | for delim, quotechar in itertools.product(delims, quotechars): 282 | if v == delim or v == quotechar: 283 | escapechars[(delim, quotechar)].add(u) 284 | 285 | dialects = [] 286 | for delim in delims: 287 | for quotechar in quotechars: 288 | for escapechar in escapechars[(delim, quotechar)]: 289 | if masked_by_quotechar(data, quotechar, escapechar, delim): 290 | continue 291 | d = Dialect(delim, quotechar, escapechar) 292 | dialects.append(d) 293 | return dialects 294 | 295 | 296 | def determine_dqr(filename, score_func, verbose=False, do_break_ties=True): 297 | encoding = get_encoding(filename) 298 | data = load_file(filename, encoding=encoding) 299 | if data is None: 300 | return DetectorResult( 301 | status=Status.SKIP, status_msg=StatusMsg.UNREADABLE 302 | ) 303 | 304 | # fix-up to replace urls by a character, this removes many potential 305 | # delimiters that only occur in urls and cause noise. 306 | dialects = get_potential_dialects(filter_urls(data), encoding) 307 | if not dialects: 308 | return DetectorResult( 309 | status=Status.FAIL, status_msg=StatusMsg.NO_DIALECTS 310 | ) 311 | 312 | if verbose: 313 | print( 314 | "Length of data: %i\n" 315 | "Considering %i dialects\n" % (len(data), len(dialects)) 316 | ) 317 | 318 | scores = score_func(data, dialects, verbose=verbose) 319 | 320 | score_sort = sorted( 321 | [(scores[dialect], dialect) for dialect in scores], 322 | key=lambda x: x[0], 323 | reverse=True, 324 | ) 325 | 326 | max_prob = score_sort[0][0] 327 | dialects_with_score = [x[1] for x in score_sort if x[0] == max_prob] 328 | 329 | if len(dialects_with_score) > 1: 330 | if do_break_ties: 331 | res = break_ties(data, dialects_with_score) 332 | else: 333 | res = None 334 | else: 335 | res = dialects_with_score[0] 336 | 337 | if res is None: 338 | if verbose: 339 | print("More than 1 parameter set!") 340 | for d in dialects_with_score: 341 | print(d) 342 | return DetectorResult( 343 | status=Status.FAIL, status_msg=StatusMsg.MULTIPLE_ANSWERS 344 | ) 345 | 346 | res = DetectorResult(dialect=res, status=Status.OK) 347 | 348 | return res 349 | -------------------------------------------------------------------------------- /scripts/analysis/figure_violins.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Creating violin plots in PGFplots (two-sided version) 6 | 7 | Based on: 8 | https://matplotlib.org/_modules/matplotlib/axes/_axes.html#Axes.violinplot 9 | https://github.com/statsmodels/statsmodels/blob/master/statsmodels/graphics/boxplots.py 10 | 11 | Author: Gertjan van den Burg 12 | 13 | """ 14 | 15 | import argparse 16 | import json 17 | import numpy as np 18 | import os 19 | import math 20 | 21 | from scipy.stats import gaussian_kde 22 | 23 | from .core import ( 24 | CORPUS_NAMES, 25 | ORDERED_DETECTORS, 26 | check_detectors, 27 | clean_detector_name, 28 | ) 29 | from .latex import build_latex_doc 30 | 31 | # Color 32 | # COLOR_LEFT = "B40204" 33 | # COLOR_RIGHT = "00AABA" 34 | # COLOR_MINMAX = "FF0000" 35 | 36 | # Grayscale 37 | COLOR_LEFT = "101010" 38 | COLOR_RIGHT = "878787" 39 | COLOR_MINMAX = 
"000000" 40 | 41 | USE_LOG = True 42 | 43 | 44 | def transform(x): 45 | if USE_LOG: 46 | return math.log(x, 10) 47 | return x 48 | 49 | 50 | def untransform(x): 51 | if USE_LOG: 52 | return pow(10, x) 53 | return x 54 | 55 | 56 | def _interpolate(coords, values, x): 57 | """ Return the estimated value of x by interpolating the nearest neighbors 58 | in coords. It is assumed coords is sorted and is of the same length as 59 | values. 60 | """ 61 | if x in coords: 62 | return values[coords == x] 63 | below_idx, above_idx = None, None 64 | for idx, c in enumerate(coords): 65 | if c < x: 66 | below_idx = idx 67 | if c > x: 68 | above_idx = idx 69 | break 70 | avg_val = (values[below_idx] + values[above_idx]) / 2 71 | return avg_val 72 | 73 | 74 | def _single_violin_data(pos, pos_data, width, side, plot_opts): 75 | # Based almost entirely on_single_violin from statsmodels 76 | bw_factor = plot_opts.get("bw_factor", None) 77 | 78 | def _violin_range(pos_data, plot_opts): 79 | """Return array with correct range, with which violins can be plotted.""" 80 | cutoff = plot_opts.get("cutoff", False) 81 | cutoff_type = plot_opts.get("cutoff_type", "std") 82 | cutoff_val = plot_opts.get("cutoff_val", 1.5) 83 | 84 | s = 0.0 85 | if not cutoff: 86 | if cutoff_type == "std": 87 | s = cutoff_val * np.std(pos_data) 88 | else: 89 | s = cutoff_val 90 | 91 | x_lower = kde.dataset.min() - s 92 | x_upper = kde.dataset.max() + s 93 | return np.linspace(x_lower, x_upper, 501) 94 | 95 | pos_data = np.asarray(pos_data) 96 | kde = gaussian_kde(pos_data, bw_method=bw_factor) 97 | 98 | xvals = _violin_range(pos_data, plot_opts) 99 | violin = kde.evaluate(xvals) 100 | 101 | # NOTE: we removed normalization by violin.max() 102 | violin = width * violin 103 | 104 | if side == "both": 105 | envelope_l, envelope_r = (-violin + pos, violin + pos) 106 | elif side == "right": 107 | envelope_l, envelope_r = (np.zeros_like(violin) + pos, violin + pos) 108 | elif side == "left": 109 | envelope_l, envelope_r = (-violin + pos, np.zeros_like(violin) + pos) 110 | else: 111 | msg = "`side` parameter should be one of {'left', 'right', 'both'}." 
112 | raise ValueError(msg) 113 | 114 | return xvals, envelope_l, envelope_r 115 | 116 | 117 | def get_median_coords(coords, left, right, median): 118 | data = {} 119 | data["xleft"] = _interpolate(coords, left, median) 120 | data["xright"] = _interpolate(coords, right, median) 121 | data["yleft"] = median 122 | data["yright"] = median 123 | return data 124 | 125 | 126 | def get_extrema_coords(pos, pos_data, width, side): 127 | # min 128 | xleft = pos 129 | xleft -= width if side in ["left", "both"] else 0 130 | xright = pos 131 | xright += width if side in ["right", "both"] else 0 132 | yleft = yright = np.min(pos_data) 133 | min_coords = { 134 | "xleft": xleft, 135 | "xright": xright, 136 | "yleft": yleft, 137 | "yright": yright, 138 | } 139 | # max 140 | yleft = yright = np.max(pos_data) 141 | max_coords = { 142 | "xleft": xleft, 143 | "xright": xright, 144 | "yleft": yleft, 145 | "yright": yright, 146 | } 147 | return min_coords, max_coords 148 | 149 | 150 | def generate_violin_data( 151 | summary_data, side="both", showmedian=True, showextrema=True, plot_opts={} 152 | ): 153 | 154 | check_detectors(summary_data["runtimes"].keys()) 155 | 156 | dataset = list( 157 | map( 158 | np.asarray, 159 | [ 160 | list(map(transform, summary_data["runtimes"][key])) 161 | for key in ORDERED_DETECTORS 162 | ], 163 | ) 164 | ) 165 | 166 | positions = np.arange(len(dataset)) + 1 167 | pos_span = np.max(positions) - np.min(positions) 168 | width = np.min( 169 | [0.15 * np.max([pos_span, 1.]), plot_opts.get("violin_width", 0.8) / 2.] 170 | ) 171 | 172 | violin_data = [] 173 | for pos_data, pos, name in zip(dataset, positions, ORDERED_DETECTORS): 174 | xvals, envelope_l, envelope_r = _single_violin_data( 175 | pos, pos_data, width, side, plot_opts 176 | ) 177 | 178 | # return back to actual data 179 | xvals = np.array([untransform(x) for x in xvals]) 180 | pos_data = np.array([untransform(x) for x in pos_data]) 181 | 182 | data = { 183 | "name": name, 184 | "side": side, 185 | "xvals": xvals, 186 | "envelope_l": envelope_l, 187 | "envelope_r": envelope_r, 188 | } 189 | 190 | if showmedian: 191 | data["median"] = get_median_coords( 192 | xvals, envelope_l, envelope_r, np.median(pos_data) 193 | ) 194 | if showextrema: 195 | data["min"], data["max"] = get_extrema_coords( 196 | pos, pos_data, width / 3, side 197 | ) 198 | 199 | violin_data.append(data) 200 | 201 | return violin_data 202 | 203 | 204 | def generate_tex_for_line(xleft=0, yleft=0, xright=0, yright=0, linestyle=""): 205 | tex = "" 206 | tex += "\\addplot[%s] coordinates {%%\n" % linestyle 207 | tex += "(%.16f, %.16f)\n" % (xleft, yleft) 208 | tex += "(%.16f, %.16f)\n" % (xright, yright) 209 | tex += "};\n" 210 | return tex 211 | 212 | 213 | def generate_tex_for_violin( 214 | violin, edgecolor=None, edgethick=None, fillcolor=None, alpha=0.5 215 | ): 216 | name = violin["name"] + violin["side"] 217 | 218 | edgecolor = "none" if edgecolor is None else edgecolor 219 | edgethick = "" if edgethick is None else ", " + edgethick 220 | fillcolor = "fill=none" if fillcolor is None else fillcolor 221 | left_name, right_name = name + "Left", name + "Right" 222 | 223 | tex = "\\addplot [draw=%s %s, name path=%s] coordinates {%%\n" % ( 224 | edgecolor, 225 | edgethick, 226 | left_name, 227 | ) 228 | for xx, yy in zip(violin["envelope_l"], violin["xvals"]): 229 | tex += "(%.16f, %.16f)\n" % (xx, yy) 230 | tex += "};\n" 231 | tex += "\\addplot [draw=%s %s, name path=%s] coordinates {%%\n" % ( 232 | edgecolor, 233 | edgethick, 234 | right_name, 235 | ) 236 | for 
xx, yy in zip(violin["envelope_r"], violin["xvals"]): 237 | tex += "(%.16f, %.16f)\n" % (xx, yy) 238 | tex += "};\n" 239 | tex += "\\addplot [%s, opacity=%f] fill between [of=%s and %s];\n" % ( 240 | fillcolor, 241 | alpha, 242 | left_name, 243 | right_name, 244 | ) 245 | 246 | if "median" in violin: 247 | # linestyle = "dashed, dash pattern=on 2pt off 2pt" 248 | violin["median"]["linestyle"] = "densely dotted, thick, black" 249 | 250 | tex += generate_tex_for_line(**violin["median"]) 251 | if "min" in violin: 252 | violin["min"]["linestyle"] = "solid, ColorMinMax" 253 | tex += generate_tex_for_line(**violin["min"]) 254 | if "max" in violin: 255 | violin["max"]["linestyle"] = "solid, ColorMinMax" 256 | tex += generate_tex_for_line(**violin["max"]) 257 | 258 | return tex 259 | 260 | 261 | def generate_latex(violindata, legend_data, opacity=0.5): 262 | abbrev = [clean_detector_name(d) for d in ORDERED_DETECTORS] 263 | xtick = ",".join([str(i + 1) for i in range(len(abbrev))]) 264 | xticklabels = ",".join(abbrev) 265 | 266 | yrange = [pow(10, x) for x in [-6, -4, -2, 0, 2, 4]] 267 | ytick = ",".join([str(i) for i in yrange]) 268 | 269 | legend_entries = ", ".join( 270 | [CORPUS_NAMES.get(c) for c in legend_data["corpora"]] 271 | ) 272 | 273 | tex = ( 274 | "\\documentclass[preview=true]{standalone}\n" 275 | "\\pdfinfoomitdate=1\n" 276 | "\\pdftrailerid{}\n" 277 | "\\pdfsuppressptexinfo=1\n" 278 | "\\usepackage{tikz}\n" 279 | "\\usepackage{pgfplots}\n" 280 | "\\pgfplotsset{compat=1.16}\n" 281 | "\\usepgfplotslibrary{fillbetween}\n" 282 | "\\definecolor{ColorLeft}{HTML}{%s}\n" 283 | "\\definecolor{ColorRight}{HTML}{%s}\n" 284 | "\\definecolor{ColorMinMax}{HTML}{%s}\n" 285 | "\\begin{document}\n" 286 | "\\begin{tikzpicture}\n" 287 | "\\begin{semilogyaxis}[\n" 288 | "xtick={%s},\n" 289 | "xticklabels={%s},\n" 290 | "ytick={%s},\n" 291 | "ylabel={Runtime (s)},\n" 292 | "width=600pt,\n" 293 | "height=200pt,\n" 294 | "ymajorgrids,\n" 295 | "grid style={opacity=0.1},\n" 296 | "legend entries={%s},\n" 297 | "legend pos={south west},\n" 298 | "]\n" 299 | % ( 300 | COLOR_LEFT, 301 | COLOR_RIGHT, 302 | COLOR_MINMAX, 303 | xtick, 304 | xticklabels, 305 | ytick, 306 | legend_entries, 307 | ) 308 | ) 309 | 310 | tex += ( 311 | "\\addlegendimage{only marks, mark=square*, ColorLeft, opacity=%g}\n" 312 | % (opacity) 313 | ) 314 | tex += ( 315 | "\\addlegendimage{only marks, mark=square*, ColorRight, opacity=%g}\n" 316 | % (opacity) 317 | ) 318 | 319 | for corpus in violindata: 320 | for violin in violindata[corpus]: 321 | fillcolor = ( 322 | "ColorLeft" if violin["side"] == "left" else "ColorRight" 323 | ) 324 | tex += generate_tex_for_violin( 325 | violin, edgecolor="black", fillcolor=fillcolor, alpha=opacity 326 | ) 327 | 328 | tex += "\\end{semilogyaxis}\n" "\\end{tikzpicture}\n" "\\end{document}" 329 | return tex 330 | 331 | 332 | def create_twosided_violin(corpus_data, output_file): 333 | corpora = sorted(corpus_data.keys()) 334 | sides = ["left", "right"] 335 | assert len(corpora) == 2 336 | legend_data = {"corpora": corpora, "colors": [COLOR_LEFT, COLOR_RIGHT]} 337 | 338 | violindata = {} 339 | for corpus, side in zip(corpora, sides): 340 | violindata[corpus] = generate_violin_data( 341 | corpus_data[corpus], side=side, showmedian=True 342 | ) 343 | 344 | tex = generate_latex(violindata, legend_data) 345 | tex_file = os.path.splitext(output_file)[0] + ".tex" 346 | with open(tex_file, "w") as fid: 347 | fid.write(tex) 348 | build_latex_doc(tex, output_name=output_file) 349 | 350 | 351 | def parse_args(): 
352 | parser = argparse.ArgumentParser() 353 | parser.add_argument( 354 | "-o", dest="output", help="Output pdf file to write to", required=True 355 | ) 356 | parser.add_argument( 357 | "-s", 358 | dest="summaries", 359 | help="Summary file(s) with the results", 360 | required=True, 361 | nargs="+", 362 | ) 363 | return parser.parse_args() 364 | 365 | 366 | def main(): 367 | args = parse_args() 368 | all_data = {} 369 | for summary_file in args.summaries: 370 | with open(summary_file, "r") as fid: 371 | data = json.load(fid) 372 | all_data[data["corpus"]] = data 373 | 374 | create_twosided_violin(all_data, args.output) 375 | -------------------------------------------------------------------------------- /scripts/detection/human.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | This script should be opened within tmux and no other tmux sessions should be 6 | running. 7 | 8 | Author: Gertjan van den Burg 9 | Copyright (c) 2018 - The Alan Turing Institute 10 | License: See the LICENSE file. 11 | 12 | """ 13 | 14 | import json 15 | import libtmux 16 | import os 17 | import sys 18 | import time 19 | 20 | from common.encoding import get_encoding 21 | from common.escape import is_potential_escapechar 22 | from common.load import load_file 23 | from common.detector_result import DetectorResult, Dialect, Status, StatusMsg 24 | from common.utils import pairwise 25 | 26 | 27 | def has_quotechar(data): 28 | chars = set(data) 29 | if '"' in chars or "'" in chars or "~" in chars or "`" in chars: 30 | return True 31 | return False 32 | 33 | 34 | def get_quotechar_options(data): 35 | options = set() 36 | if '"' in data: 37 | options.add("q") 38 | if "'" in data: 39 | options.add("a") 40 | if "`" in data: 41 | options.add("b") 42 | if "~" in data: 43 | options.add("t") 44 | options.add("n") 45 | return options 46 | 47 | 48 | def get_escapechar_options(data, encoding, delim, quotechar): 49 | escapes = set() 50 | for u, v in pairwise(data): 51 | if not is_potential_escapechar(u, encoding): 52 | continue 53 | if v in [delim, quotechar] and not u in [delim, quotechar]: 54 | escapes.add(u) 55 | return escapes 56 | 57 | 58 | def ask_dqe( 59 | filename, 60 | data, 61 | encoding, 62 | ask_delim, 63 | ask_quotechar, 64 | ask_escapechar, 65 | old_res, 66 | less_pane, 67 | ): 68 | if not old_res is None: 69 | res = { 70 | "delimiter": old_res.get("delimiter", None), 71 | "quotechar": old_res.get("quotechar", None), 72 | "escapechar": old_res.get("escapechar", None), 73 | } 74 | else: 75 | res = {"delimiter": None, "quotechar": None, "escapechar": None} 76 | 77 | opened_vim = False 78 | opened_less = False 79 | 80 | note = None 81 | 82 | if ask_delim: 83 | less_pane.send_keys("less -f %s" % filename) 84 | opened_less = True 85 | prompt = "What is the delimiter? 
" 86 | while True: 87 | ans = input(prompt) 88 | if ans == "quit": 89 | less_pane.send_keys("q") 90 | opened_less = False 91 | less_pane.send_keys("exit") 92 | raise SystemExit 93 | if ans in ["vi", "vim"]: 94 | less_pane.send_keys("q") 95 | opened_less = False 96 | less_pane.send_keys("vim %s" % filename) 97 | opened_vim = True 98 | continue 99 | if ans in ["hltab", "hlt"]: 100 | less_pane.send_keys("/\\t") 101 | continue 102 | if ans in ["hlspace", "hls"]: 103 | less_pane.send_keys("/\\ ") 104 | continue 105 | if ans == "skip": 106 | if opened_less: 107 | less_pane.send_keys("q") 108 | elif opened_vim: 109 | less_pane.send_keys(":q") 110 | less_pane.clear() 111 | return None, note 112 | if ans == "note": 113 | note = input("Enter note: ").strip() 114 | continue 115 | if ans == "none": 116 | res["delimiter"] = None 117 | elif ans == "\\t": 118 | res["delimiter"] = "\t" 119 | elif len(ans.strip()) > 1: 120 | print("Only length 0 or 1 delimiters are allowed") 121 | continue 122 | else: 123 | res["delimiter"] = ans.rstrip("\n") 124 | break 125 | 126 | print("Delimiter: %r" % res["delimiter"]) 127 | 128 | if opened_vim: 129 | less_pane.send_keys(":q") 130 | opened_vim = False 131 | time.sleep(1) 132 | less_pane.send_keys("less -f %s" % filename) 133 | opened_less = True 134 | 135 | if ask_quotechar: 136 | if not opened_less: 137 | less_pane.send_keys("less -f %s" % filename) 138 | opened_less = True 139 | 140 | options = get_quotechar_options(data) 141 | if "q" in options: 142 | less_pane.send_keys('/"') 143 | less_pane.send_keys("gg", enter=False, suppress_history=False) 144 | less_pane.send_keys("n", enter=False, suppress_history=False) 145 | elif "a" in options: 146 | less_pane.send_keys("/'") 147 | less_pane.send_keys("gg", enter=False, suppress_history=False) 148 | less_pane.send_keys("n", enter=False, suppress_history=False) 149 | opt_str = "/".join(sorted(options)) 150 | prompt = "What is the quotation mark? 
[%s] " % opt_str 151 | while True: 152 | if list(options) == ["n"]: 153 | res["quotechar"] = None 154 | break 155 | ans = input(prompt) 156 | ans = ans.rstrip("\n") 157 | 158 | if ans == "quit": 159 | less_pane.send_keys("q") 160 | opened_less = False 161 | less_pane.send_keys("exit") 162 | raise SystemExit 163 | if ans in ["vi", "vim"]: 164 | less_pane.send_keys("q") 165 | opened_less = False 166 | less_pane.send_keys("vim %s" % filename) 167 | opened_vim = True 168 | continue 169 | if ans == "skip": 170 | if opened_less: 171 | less_pane.send_keys("q") 172 | elif opened_vim: 173 | less_pane.send_keys(":q") 174 | less_pane.clear() 175 | return None, note 176 | if ans == "note": 177 | note = input("Enter note: ").strip() 178 | continue 179 | if not ans.strip().lower() in options: 180 | print("Please try again.") 181 | continue 182 | if ans == "n": 183 | res["quotechar"] = None 184 | else: 185 | if not ans.upper() in ["Q", "A", "B", "T"]: 186 | raise ValueError("Unknown option: %s" % ans) 187 | res["quotechar"] = {"Q": '"', "A": "'", "B": "`", "T": 188 | "~"}[ans.upper()] 189 | break 190 | 191 | print("Quotechar: %r" % res["quotechar"]) 192 | 193 | if opened_vim: 194 | less_pane.send_keys(":q") 195 | opened_vim = False 196 | time.sleep(1) 197 | less_pane.send_keys("less -f %s" % filename) 198 | opened_less = True 199 | 200 | options = get_escapechar_options( 201 | data, encoding, res["delimiter"], res["quotechar"] 202 | ) 203 | if ask_escapechar: 204 | if not options: 205 | print("No escapechar options.") 206 | res["escapechar"] = "" 207 | else: 208 | if not opened_less: 209 | less_pane.send_keys("less -f %s" % filename) 210 | opened_less = True 211 | if "n" in options: 212 | raise ValueError("'n' shouldn't be an option in escapechars!") 213 | if len(options) == 1: 214 | if '\\' in options: 215 | less_pane.send_keys("/\\\\") 216 | less_pane.send_keys("gg", enter=False, suppress_history=False) 217 | less_pane.send_keys("n", enter=False, suppress_history=False) 218 | options.add("n") 219 | opt_str = "/".join(sorted(options)) 220 | prompt = "What is the escape character? 
[%s] " % opt_str 221 | while True: 222 | ans = input(prompt) 223 | ans = ans.strip("\n") 224 | if ans == "quit": 225 | less_pane.send_keys("q") 226 | opened_less = False 227 | less_pane.send_keys("exit") 228 | raise SystemExit 229 | if ans == "skip": 230 | if opened_less: 231 | less_pane.send_keys("q") 232 | less_pane.clear() 233 | return None, note 234 | if ans == "note": 235 | note = input("Enter note: ").strip() 236 | continue 237 | if not ans.strip() in options: 238 | print("Please try again") 239 | continue 240 | if ans == "n": 241 | res["escapechar"] = "" 242 | else: 243 | res["escapechar"] = ans 244 | break 245 | 246 | print("Escapechar: %r" % res["escapechar"]) 247 | 248 | if opened_less: 249 | less_pane.send_keys("q") 250 | less_pane.clear() 251 | return res, note 252 | 253 | 254 | def annotate_file(filename, less_pane, previous): 255 | print("") 256 | encoding = get_encoding(filename) 257 | data = load_file(filename, encoding=encoding) 258 | 259 | if previous: 260 | ask_delim = not "delimiter" in previous 261 | ask_quotechar = not "quotechar" in previous and has_quotechar(data) 262 | ask_escapechar = not "escapechar" in previous 263 | else: 264 | ask_delim = True 265 | ask_quotechar = has_quotechar(data) 266 | ask_escapechar = True 267 | 268 | print("Annotating file: %s" % filename) 269 | res, note = ask_dqe( 270 | filename, 271 | data, 272 | encoding, 273 | ask_delim, 274 | ask_quotechar, 275 | ask_escapechar, 276 | previous, 277 | less_pane, 278 | ) 279 | 280 | out = DetectorResult( 281 | detector="human", filename=filename, runtime=None, status=Status.OK 282 | ) 283 | if note: 284 | out.note = note 285 | 286 | if res is None: 287 | less_pane.send_keys("q") 288 | less_pane.clear() 289 | out.status = Status.SKIP 290 | out.status_msg = StatusMsg.HUMAN_SKIP 291 | return out 292 | 293 | if res["delimiter"] is None: 294 | res["delimiter"] = "" 295 | if res["quotechar"] is None: 296 | res["quotechar"] = "" 297 | 298 | out.dialect = Dialect.from_dict(res) 299 | 300 | return out 301 | 302 | 303 | def dump_result(output_file, res): 304 | with open(output_file, "a") as fid: 305 | fid.write(res.to_json() + "\n") 306 | 307 | 308 | def load_previous(output_file): 309 | previous = {} 310 | if not os.path.exists(output_file): 311 | return previous 312 | with open(output_file, "r") as fid: 313 | for line in fid.readlines(): 314 | record = json.loads(line.strip()) 315 | previous[record["filename"]] = record 316 | return previous 317 | 318 | 319 | def init_tmux(): 320 | tmux_server = libtmux.Server() 321 | tmux_sess = tmux_server.list_sessions()[-1] 322 | tmux_win = tmux_sess.attached_window 323 | less_pane = tmux_win.split_window(attach=False) 324 | 325 | return less_pane 326 | 327 | 328 | def batch_process(path_file, output_file): 329 | with open(path_file, "r") as fid: 330 | files = [l.strip() for l in fid.readlines()] 331 | files.sort() 332 | 333 | previous = load_previous(output_file) 334 | 335 | done = [x for x in files if x in previous and "dialect" in previous[x]] 336 | skipped = [ 337 | x for x in files if x in previous and previous[x]["status"] == "SKIP" 338 | ] 339 | todo = [x for x in files if not (x in done or x in skipped)] 340 | 341 | if not todo: 342 | print("All done.") 343 | return 344 | 345 | print("Number of files remaining: %i" % len(todo)) 346 | 347 | less_pane = init_tmux() 348 | 349 | count = 0 350 | start_time = time.time() 351 | for filename in todo: 352 | old_res = previous.get(filename, None) 353 | 354 | if not os.path.exists(filename): 355 | print("File not found: 
%s" % filename) 356 | res = DetectorResult( 357 | status=Status.SKIP, status_msg=StatusMsg.NON_EXISTENT 358 | ) 359 | continue 360 | 361 | res = annotate_file(filename, less_pane, old_res) 362 | res.filename = filename 363 | dump_result(output_file, res) 364 | count += 1 365 | 366 | if count % 10 == 0: 367 | print( 368 | "\nProgress: %i done out of %i. " 369 | "This session: %i (%.2f seconds per file)" 370 | % ( 371 | count, 372 | len(todo), 373 | count, 374 | ((time.time() - start_time) / count), 375 | ) 376 | ) 377 | 378 | print("All done.") 379 | 380 | 381 | def main(): 382 | if len(sys.argv) == 2: 383 | print(annotate_file(sys.argv[1], init_tmux())) 384 | elif len(sys.argv) == 3: 385 | batch_process(sys.argv[1], sys.argv[2]) 386 | else: 387 | print("Usage: %s path_file output_file" % (sys.argv[0])) 388 | raise SystemExit 389 | 390 | 391 | if __name__ == "__main__": 392 | main() 393 | -------------------------------------------------------------------------------- /scripts/analysis/make_summary.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Make summaries from the detector result files. 6 | 7 | Author: Gertjan van den Burg 8 | Copyright (c) 2018 - The Alan Turing Institute 9 | License: See the LICENSE file. 10 | 11 | """ 12 | 13 | import argparse 14 | import json 15 | 16 | from common.dialect import ATTRIBUTES 17 | from common.detector_result import Status 18 | 19 | from .core import load_detector_results, is_standard_dialect 20 | 21 | 22 | def prop_equal(res1, res2, attr_name): 23 | return getattr(res1.dialect, attr_name) == getattr(res2.dialect, attr_name) 24 | 25 | 26 | def compute_attribute_accuracy( 27 | reference, detector, attr_name, detector_name, original_detector=None 28 | ): 29 | n_equal, n_total = 0, 0 30 | od = original_detector 31 | 32 | for fname in reference: 33 | res_ref = reference[fname] 34 | if not fname in detector: 35 | print( 36 | "Warning: no result for %s in for detector %s" 37 | % (fname, detector_name) 38 | ) 39 | continue 40 | res_det = detector[fname] 41 | if od is not None and res_ref.original_detector != od: 42 | continue 43 | if not res_ref.status == Status.OK: 44 | continue 45 | n_total += 1 46 | if res_det.status == Status.OK: 47 | n_equal += prop_equal(res_ref, res_det, attr_name) 48 | 49 | return n_equal / n_total 50 | 51 | 52 | def compute_overall_accuracy( 53 | reference, detector, detector_name, original_detector=None 54 | ): 55 | n_equal, n_total = 0, 0 56 | od = original_detector 57 | for fname in reference: 58 | res_ref = reference[fname] 59 | if not fname in detector: 60 | print( 61 | "Warning: no result for %s in for detector %s" 62 | % (fname, detector_name) 63 | ) 64 | continue 65 | res_det = detector[fname] 66 | if od is not None and res_ref.original_detector != od: 67 | continue 68 | if not res_ref.status == Status.OK: 69 | continue 70 | n_total += 1 71 | if res_det.status == Status.OK: 72 | n_equal += res_ref.dialect == res_det.dialect 73 | return n_equal / n_total 74 | 75 | 76 | def compute_standard_accuracy(reference, detector, standard=True): 77 | total_standard, total_messy = 0, 0 78 | correct_standard, correct_messy = 0, 0 79 | for fname in reference: 80 | res_ref = reference[fname] 81 | if not res_ref.status == Status.OK: 82 | continue 83 | if not fname in detector: 84 | print("Warning: no result for file: %s" % fname) 85 | continue 86 | res_det = detector[fname] 87 | 88 | is_std = is_standard_dialect(res_ref.dialect) 89 | if 
is_std: 90 | total_standard += 1 91 | else: 92 | total_messy += 1 93 | 94 | if not res_det.status == Status.OK: 95 | continue 96 | 97 | is_correct = res_det.dialect == res_ref.dialect 98 | if is_std: 99 | correct_standard += 1 if is_correct else 0 100 | else: 101 | correct_messy += 1 if is_correct else 0 102 | if standard: 103 | return correct_standard / total_standard 104 | return correct_messy / total_messy 105 | 106 | 107 | def compute_fail_percentage(reference, detector, detector_name): 108 | n_fail, n_total = 0, 0 109 | for fname in reference: 110 | if reference[fname].status == Status.OK: 111 | n_total += 1 112 | else: 113 | continue 114 | if not fname in detector: 115 | print( 116 | "Warning: no result for %s in for detector %s" 117 | % (fname, detector_name) 118 | ) 119 | continue 120 | if detector[fname].status == Status.FAIL: 121 | n_fail += 1 122 | return n_fail / n_total 123 | 124 | 125 | def compute_nic_split_accuracy(reference, detector, mode=None): 126 | files_total = 0 127 | files_with_mode = 0 128 | for fname in reference: 129 | res_ref = reference[fname] 130 | if not res_ref.status == Status.OK: 131 | continue 132 | if not fname in detector: 133 | print("Warning: no result for file: %s" % fname) 134 | continue 135 | res_det = detector[fname] 136 | 137 | files_total += 1 138 | 139 | if mode == "no_results": 140 | if not res_det.status == Status.OK: 141 | files_with_mode += 1 142 | elif mode == "incorrect_results": 143 | if ( 144 | res_det.status == Status.OK 145 | and res_det.dialect != res_ref.dialect 146 | ): 147 | files_with_mode += 1 148 | elif mode == "correct_results": 149 | if ( 150 | res_det.status == Status.OK 151 | and res_det.dialect == res_ref.dialect 152 | ): 153 | files_with_mode += 1 154 | else: 155 | raise ValueError("Unknown mode: %r" % mode) 156 | return files_with_mode / files_total 157 | 158 | def collect_computation_times(reference, detector, detector_name): 159 | runtimes = [] 160 | for fname in sorted(reference.keys()): 161 | if not reference[fname].status == Status.OK: 162 | continue 163 | if not fname in detector: 164 | print( 165 | "Warning: no result for %s in for detector %s" 166 | % (fname, detector_name) 167 | ) 168 | continue 169 | # Note that we don't check whether the detector returned with status 170 | # OK, because we want to include failures and timeouts in the runtime 171 | # plots as well. 
172 | rt = detector[fname].runtime 173 | if rt is None: 174 | raise ValueError( 175 | "Runtime is None for result: %r" % detector[fname] 176 | ) 177 | runtimes.append(detector[fname].runtime) 178 | 179 | return runtimes 180 | 181 | 182 | def count_reference_ok(reference, original_detector=None): 183 | n_ok = 0 184 | od = original_detector 185 | for fname in reference: 186 | if od is not None and reference[fname].original_detector != od: 187 | continue 188 | if reference[fname].status == Status.OK: 189 | n_ok += 1 190 | return n_ok 191 | 192 | 193 | def count_standard(reference_results, standard=True): 194 | count = 0 195 | for fname in reference_results: 196 | ref = reference_results[fname] 197 | if not ref.status == Status.OK: 198 | continue 199 | 200 | is_std = is_standard_dialect(ref.dialect) 201 | if standard: 202 | if is_std: 203 | count += 1 204 | else: 205 | if not is_std: 206 | count += 1 207 | return count 208 | 209 | 210 | def summarize_accuracy( 211 | reference_results, detector_results_all, original_detector=None 212 | ): 213 | accuracy = {} 214 | for attr_name in ATTRIBUTES: 215 | accuracy[attr_name] = {} 216 | for detector in detector_results_all: 217 | detector_results = detector_results_all[detector] 218 | accuracy[attr_name][detector] = compute_attribute_accuracy( 219 | reference_results, 220 | detector_results, 221 | attr_name, 222 | detector, 223 | original_detector=original_detector, 224 | ) 225 | 226 | assert "overall" not in accuracy.keys() 227 | accuracy["overall"] = {} 228 | for detector in detector_results_all: 229 | detector_results = detector_results_all[detector] 230 | accuracy["overall"][detector] = compute_overall_accuracy( 231 | reference_results, 232 | detector_results, 233 | detector, 234 | original_detector=original_detector, 235 | ) 236 | return accuracy 237 | 238 | 239 | def summarize_standard_accuracy( 240 | reference_results, detector_results_all, standard=True 241 | ): 242 | 243 | accuracy = {} 244 | for detector in detector_results_all: 245 | detector_results = detector_results_all[detector] 246 | accuracy[detector] = compute_standard_accuracy( 247 | reference_results, detector_results, standard=standard 248 | ) 249 | return accuracy 250 | 251 | 252 | def summarize_nic_split_accuracy( 253 | reference_results, detector_results_all, mode=None 254 | ): 255 | allowed_modes = ["no_results", "incorrect_results", "correct_results"] 256 | if mode is None or not mode in allowed_modes: 257 | raise ValueError("mode must be one of: %r" % allowed_modes) 258 | 259 | accuracies = {} 260 | for detector in detector_results_all: 261 | detector_results = detector_results_all[detector] 262 | accuracies[detector] = compute_nic_split_accuracy( 263 | reference_results, detector_results, mode=mode 264 | ) 265 | return accuracies 266 | 267 | 268 | def create_summary(reference_results, detector_results_all): 269 | summary = {} 270 | summary["n_files_all"] = count_reference_ok( 271 | reference_results, original_detector=None 272 | ) 273 | summary["n_files_human"] = count_reference_ok( 274 | reference_results, original_detector="human" 275 | ) 276 | summary["n_files_normal"] = count_reference_ok( 277 | reference_results, original_detector="normal" 278 | ) 279 | summary["n_files_standard"] = count_standard( 280 | reference_results, standard=True 281 | ) 282 | summary["n_files_messy"] = count_standard( 283 | reference_results, standard=False 284 | ) 285 | 286 | # Compute accuracy 287 | summary["detection_accuracy_all"] = summarize_accuracy( 288 | reference_results, 
detector_results_all, original_detector=None 289 | ) 290 | summary["detection_accuracy_human"] = summarize_accuracy( 291 | reference_results, detector_results_all, original_detector="human" 292 | ) 293 | summary["detection_accuracy_normal"] = summarize_accuracy( 294 | reference_results, detector_results_all, original_detector="normal" 295 | ) 296 | 297 | # Compute standard/non-standard split 298 | summary["standard_accuracy_all"] = summarize_standard_accuracy( 299 | reference_results, detector_results_all, standard=True 300 | ) 301 | summary["messy_accuracy_all"] = summarize_standard_accuracy( 302 | reference_results, detector_results_all, standard=False 303 | ) 304 | 305 | # Compute No result/Incorrect results/Correct result split 306 | summary["no_result_all"] = summarize_nic_split_accuracy( 307 | reference_results, detector_results_all, mode="no_results" 308 | ) 309 | summary["incorrect_result_all"] = summarize_nic_split_accuracy( 310 | reference_results, detector_results_all, mode="incorrect_results" 311 | ) 312 | summary["correct_result_all"] = summarize_nic_split_accuracy( 313 | reference_results, detector_results_all, mode="correct_results" 314 | ) 315 | 316 | # Compute failure rates 317 | failures = {} 318 | for detector in detector_results_all: 319 | detector_results = detector_results_all[detector] 320 | failures[detector] = compute_fail_percentage( 321 | reference_results, detector_results, detector 322 | ) 323 | summary["failures"] = failures 324 | 325 | # Collect runtimes 326 | runtimes = {} 327 | for detector in detector_results_all: 328 | detector_results = detector_results_all[detector] 329 | runtimes[detector] = collect_computation_times( 330 | reference_results, detector_results, detector 331 | ) 332 | summary["runtimes"] = runtimes 333 | 334 | return summary 335 | 336 | 337 | def parse_args(): 338 | parser = argparse.ArgumentParser(description="Compare detector results") 339 | parser.add_argument( 340 | "-c", 341 | dest="corpus", 342 | help="Name of the corpus we're looking at", 343 | required=True, 344 | ) 345 | parser.add_argument( 346 | "-s", 347 | dest="summary_file", 348 | help="output file for the summary statistics", 349 | required=True, 350 | ) 351 | parser.add_argument( 352 | "-r", 353 | dest="reference_file", 354 | help="reference output file with ground truth", 355 | required=True, 356 | ) 357 | parser.add_argument( 358 | "-o", 359 | dest="output_file", 360 | nargs="+", 361 | help="output_file(s) from different detectors", 362 | required=True, 363 | ) 364 | return parser.parse_args() 365 | 366 | 367 | def main(): 368 | args = parse_args() 369 | 370 | _, ref_results = load_detector_results(args.reference_file) 371 | detector_results = {} 372 | for fname in args.output_file: 373 | name, results = load_detector_results(fname) 374 | detector_results[name] = results 375 | 376 | summary_data = create_summary(ref_results, detector_results) 377 | summary_data["corpus"] = args.corpus 378 | with open(args.summary_file, "w") as fid: 379 | fid.write(json.dumps(summary_data, indent=2)) 380 | --------------------------------------------------------------------------------
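Usage sketch (not part of the repository): the snippet below is a minimal, hypothetical illustration of how the summary and figure code above could be driven programmatically instead of through the command-line entry points. It assumes the `scripts/` directory is on `PYTHONPATH` so that the `analysis` package is importable, the result file names are placeholders, the detector result files together must cover every detector expected by `ORDERED_DETECTORS` in `figure_violins.py`, and rendering the PDF requires a working LaTeX toolchain. The flow simply mirrors the `main()` functions of `make_summary.py` and `figure_violins.py`.

# Hypothetical usage sketch -- not part of the repository.
from analysis.core import load_detector_results
from analysis.make_summary import create_summary
from analysis.figure_violins import create_twosided_violin

all_data = {}
for corpus, ref_file, det_files in [
    ("github", "reference_github.json", ["detector_a_github.json"]),   # placeholder paths
    ("ukdata", "reference_ukdata.json", ["detector_a_ukdata.json"]),   # placeholder paths
]:
    # Load the ground-truth dialects and each detector's output.
    _, reference = load_detector_results(ref_file)
    detector_results = {}
    for path in det_files:
        name, results = load_detector_results(path)
        detector_results[name] = results

    # Build the per-corpus summary (accuracies, failure rates, runtimes).
    summary = create_summary(reference, detector_results)
    summary["corpus"] = corpus
    all_data[corpus] = summary

# Two corpora are required: one is drawn on the left half of each violin,
# the other on the right. Writes violin_combined.tex and violin_combined.pdf.
create_twosided_violin(all_data, "violin_combined.pdf")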