├── .Rbuildignore ├── .circleci └── config.yml ├── .github ├── .gitignore └── workflows │ └── Actions.yaml ├── .gitignore ├── .travis.yml ├── DESCRIPTION ├── LICENSE ├── LICENSE.md ├── NAMESPACE ├── NEWS.md ├── R ├── 000.R ├── ai_attach_direction.R ├── ai_crude_data_block_joins.R ├── ai_data_gid_join.R ├── ai_get_data_attr_map.R ├── ai_get_data_attr_map_details.R ├── ai_get_data_attr_map_main.R ├── ai_relative_data_join_attr.R ├── ai_relative_data_split_attr.R ├── analyze_cells.R ├── as_cell_df.R ├── as_cell_df_internal.R ├── attach_trace_info.R ├── cell_trace_plot.R ├── class-cell_df.R ├── class-cell_df_analysis.R ├── class-intermediate.R ├── class-read_cell_part.R ├── cli_bindings.R ├── collate_columns.R ├── collate_columns_etc.R ├── common_data_patterns.R ├── compact_gid_maps.R ├── compose_cells.R ├── detect_LibreOffice.R ├── detect_and_read.R ├── extend_data_block.R ├── file_etc.R ├── fj.R ├── generics-cell_df.R ├── generics-cell_df_analysis.R ├── generics-read_cell_part.R ├── get_all_col_representative.R ├── get_cells_from_admap.R ├── get_definiteness_checks.R ├── get_direction.R ├── get_direction_df.R ├── get_direction_metric.R ├── get_group_id.R ├── get_links_df.R ├── get_unpivotr_direction_names.R ├── global_vars.R ├── is_attachable.R ├── is_available.R ├── is_conforms_to_rcdf.R ├── is_xlsx_ok.R ├── make_read_cell_part.R ├── mask_data.R ├── merge_admaps.R ├── multi_names.R ├── possible_to_support.R ├── read_cells.R ├── read_cells_stages.R ├── read_excel_whole.R ├── read_excel_whole_readxl.R ├── read_pdf_from_tabulizer.R ├── read_xls_from_xlsx.R ├── reduce_2dfs.R ├── refine_cols.R ├── safe_dependency_check.R ├── shiny_etc.R ├── shiny_main.R ├── shiny_parts_server.R ├── shiny_parts_server_components.R ├── shiny_parts_ui.R ├── similarity_score.R ├── stitch_direction.R ├── tidycells-package.R ├── utils-deps.R ├── utils-pipe.R ├── validate_cells.R ├── validate_read_cell_part_object.R ├── value_attribute_classifiers.R ├── value_attribute_classify.R └── visual_functions.R ├── README.Rmd ├── README.md ├── _pkgdown.yml ├── appveyor.yml ├── codecov.yml ├── cran-comments.md ├── dev-notes.md ├── docs ├── 404.html ├── LICENSE-text.html ├── LICENSE.html ├── articles │ ├── ext │ │ ├── compose_cells_cli1.png │ │ ├── logo.png │ │ ├── marks.png │ │ ├── read_cells.svg │ │ ├── read_cells_out.png │ │ ├── v12.png │ │ ├── v34.png │ │ └── v56.png │ ├── index.html │ ├── tidycells-intro.html │ └── tidycells-intro_files │ │ └── figure-html │ │ ├── unnamed-chunk-15-1.png │ │ ├── unnamed-chunk-21-1.png │ │ ├── unnamed-chunk-22-1.png │ │ ├── unnamed-chunk-25-1.png │ │ ├── unnamed-chunk-30-1.png │ │ └── unnamed-chunk-30-2.png ├── authors.html ├── dev-notes.html ├── docsearch.css ├── docsearch.js ├── index.html ├── link.svg ├── logo.png ├── news │ └── index.html ├── pkgdown.css ├── pkgdown.js ├── pkgdown.yml └── reference │ ├── analyse_cells.html │ ├── analyze_cells-1.png │ ├── analyze_cells-2.png │ ├── analyze_cells.html │ ├── as_cell_df.html │ ├── basic_classifier.html │ ├── cell_analysis-class.html │ ├── cell_composition_traceback.html │ ├── cell_df-class.html │ ├── collate_columns.html │ ├── compose_cells.html │ ├── figures │ └── read_cells.svg │ ├── get_direction.html │ ├── get_direction_df.html │ ├── get_direction_metric.html │ ├── get_group_id.html │ ├── get_unpivotr_direction_names.html │ ├── index.html │ ├── numeric_values_classifier.html │ ├── pipe.html │ ├── read_cell_part-class.html │ ├── read_cells.html │ ├── read_cells_internal.html │ ├── sample_based_classifier.html │ ├── tidycells-package.html │ ├── validate_cells.html │ ├── value_attribute_classify-1.png │ ├── value_attribute_classify-2.png │ ├── value_attribute_classify-3.png │ ├── value_attribute_classify.html │ └── visual_functions.html ├── inst ├── extdata │ ├── RBI_HBS_Table_No_166.xlsx │ ├── marks.xlsx │ ├── marks_cells.rds │ ├── messy │ │ ├── csv.docx │ │ ├── docx.html │ │ ├── html.csv │ │ ├── pdf.docx │ │ ├── xls.pdf │ │ └── xlsx.csv │ └── untidy.xlsx └── rstudio │ └── addins.dcf ├── logo.png ├── man ├── analyse_cells.Rd ├── analyze_cells.Rd ├── as_cell_df.Rd ├── basic_classifier.Rd ├── cell_analysis-class.Rd ├── cell_composition_traceback.Rd ├── cell_df-class.Rd ├── collate_columns.Rd ├── compose_cells.Rd ├── figures │ └── read_cells.svg ├── get_direction.Rd ├── get_direction_df.Rd ├── get_direction_metric.Rd ├── get_group_id.Rd ├── get_unpivotr_direction_names.Rd ├── numeric_values_classifier.Rd ├── pipe.Rd ├── read_cell_part-class.Rd ├── read_cells.Rd ├── read_cells_internal.Rd ├── sample_based_classifier.Rd ├── tidycells-package.Rd ├── validate_cells.Rd ├── value_attribute_classify.Rd └── visual_functions.Rd ├── pkgdown └── favicon │ ├── apple-touch-icon-120x120.png │ ├── apple-touch-icon-60x60.png │ ├── apple-touch-icon-76x76.png │ ├── apple-touch-icon.png │ ├── favicon-16x16.png │ ├── favicon-32x32.png │ └── favicon.ico ├── tests ├── testthat.R └── testthat │ ├── test-VA_classifier.R │ ├── test-analyze_cells.R │ ├── test-as_cell_df.R │ ├── test-cells_to_df.R │ ├── test-collate_columns.R │ ├── test-etc.R │ ├── test-optional_package.R │ ├── test-print_cell_analysis.R │ ├── test-print_cell_df.R │ ├── test-read_cells.R │ ├── test-read_cells_real.R │ ├── test-use_cases.R │ ├── testdata │ ├── WSS.rds │ ├── badcells.rds │ ├── doc.doc │ ├── enron_from_unpivotr_processed.rds │ └── test.xls │ ├── testlib │ └── shiny_test.R │ └── testusecases │ └── ex1.rds ├── tidycells.Rproj └── vignettes ├── .gitignore ├── ext ├── compose_cells_cli1.png ├── logo.png ├── marks.png ├── read_cells.svg ├── read_cells_out.png ├── v12.png ├── v34.png └── v56.png └── tidycells-intro.Rmd /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^LICENSE\.md$ 4 | ^\.travis\.yml$ 5 | ^appveyor\.yml$ 6 | ^codecov\.yml$ 7 | ^README\.Rmd$ 8 | ^doc$ 9 | ^Meta$ 10 | ^cran-comments\.md$ 11 | ^dev-notes\.md$ 12 | ^logo\.png$ 13 | ^_pkgdown\.yml$ 14 | ^docs$ 15 | ^pkgdown$ 16 | ^\.circleci$ 17 | ^\.circleci/config\.yml$ 18 | ^\.github$ 19 | -------------------------------------------------------------------------------- /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | jobs: 3 | build: 4 | docker: 5 | - image: rocker/verse:latest 6 | environment: 7 | R_LIBS: ~/R/Library 8 | steps: 9 | - restore_cache: 10 | keys: 11 | - r-pkg-cache-{{ arch }}-{{ .Branch }} 12 | - r-pkg-cache-{{ arch }}- 13 | - checkout 14 | - run: 15 | name: Install package dependencies 16 | command: | 17 | mkdir -p ~/R/Library 18 | Rscript -e 'install.packages("remotes")' 19 | Rscript -e 'remotes::install_deps(dependencies = TRUE)' 20 | - run: 21 | name: Session information and installed package versions 22 | command: | 23 | Rscript -e 'sessionInfo()' 24 | Rscript -e 'installed.packages()[, c("Package", "Version")]' 25 | Rscript -e 'rmarkdown::pandoc_version()' 26 | - run: 27 | name: Build package 28 | command: R CMD build . 29 | - run: 30 | name: Check package 31 | command: R CMD check --as-cran --no-manual *tar.gz 32 | - store_artifacts: 33 | path: tidycells.Rcheck/ 34 | - save_cache: 35 | key: r-pkg-cache-{{ arch }}-{{ .Branch }} 36 | paths: 37 | - "~/R/Library" 38 | -------------------------------------------------------------------------------- /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /.github/workflows/Actions.yaml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: 4 | - master 5 | pull_request: 6 | branches: 7 | - master 8 | 9 | name: Actions 10 | 11 | jobs: 12 | Actions: 13 | runs-on: ${{ matrix.config.os }} 14 | 15 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 16 | 17 | strategy: 18 | fail-fast: false 19 | matrix: 20 | config: 21 | - {os: windows-latest, r: 'release'} 22 | - {os: windows-latest, r: 'devel'} 23 | - {os: windows-latest, r: 'oldrel'} 24 | - {os: macOS-latest, r: 'release'} 25 | - {os: macOS-latest, r: 'devel'} 26 | - {os: macOS-latest, r: 'oldrel'} 27 | - {os: ubuntu-16.04, r: 'release', rspm: "https://packagemanager.rstudio.com/cran/__linux__/xenial/latest"} 28 | 29 | env: 30 | R_REMOTES_NO_ERRORS_FROM_WARNINGS: true 31 | RSPM: ${{ matrix.config.rspm }} 32 | 33 | steps: 34 | - uses: actions/checkout@v2 35 | 36 | - uses: r-lib/actions/setup-r@master 37 | with: 38 | r-version: ${{ matrix.config.r }} 39 | 40 | - uses: r-lib/actions/setup-pandoc@master 41 | 42 | - name: Query dependencies 43 | run: | 44 | install.packages('remotes') 45 | saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2) 46 | shell: Rscript {0} 47 | 48 | - name: Cache R packages 49 | if: runner.os != 'Windows' 50 | uses: actions/cache@v1 51 | with: 52 | path: ${{ env.R_LIBS_USER }} 53 | key: ${{ runner.os }}-r-${{ matrix.config.r }}-1-${{ hashFiles('.github/depends.Rds') }} 54 | restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-1- 55 | 56 | - name: Install system dependencies 57 | if: runner.os == 'Linux' 58 | env: 59 | RHUB_PLATFORM: linux-x86_64-ubuntu-gcc 60 | run: | 61 | Rscript -e "remotes::install_github('r-hub/sysreqs')" 62 | sysreqs=$(Rscript -e "cat(sysreqs::sysreq_commands('DESCRIPTION'))") 63 | sudo -s eval "$sysreqs" 64 | 65 | - name: Install dependencies 66 | run: | 67 | try(system("sudo R CMD javareconf")) 68 | remotes::install_cran("rJava", type = ifelse(tolower(Sys.info()[["sysname"]])=="linux","source","both")) 69 | remotes::install_deps(dependencies = TRUE) 70 | remotes::install_cran("rcmdcheck") 71 | shell: Rscript {0} 72 | 73 | - name: Check 74 | env: 75 | _R_CHECK_CRAN_INCOMING_REMOTE_: false 76 | run: rcmdcheck::rcmdcheck(args = c("--no-manual", "--as-cran"), error_on = "warning", check_dir = "check") 77 | shell: Rscript {0} 78 | 79 | - name: Upload check results 80 | if: failure() 81 | uses: actions/upload-artifact@master 82 | with: 83 | name: ${{ runner.os }}-r${{ matrix.config.r }}-results 84 | path: check 85 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | inst/doc 6 | doc 7 | Meta 8 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | 2 | # libreoffice for doc 3 | before_install: 4 | - if [[ "${TRAVIS_OS_NAME}" = "linux" ]]; then sudo apt-add-repository -y ppa:libreoffice/ppa; fi 5 | - if [[ "${TRAVIS_OS_NAME}" = "linux" ]]; then sudo apt-get install libreoffice; fi 6 | # temp fix for ps in osx 7 | - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then R -e "install.packages(c('ps'), repos = 'http://cran.rstudio.com/')" ; fi 8 | 9 | 10 | language: R 11 | 12 | r: 13 | - oldrel 14 | - release 15 | - devel 16 | 17 | cache: packages 18 | 19 | warnings_are_errors: true 20 | 21 | os: 22 | - linux 23 | - osx 24 | 25 | matrix: 26 | exclude: 27 | - 28 | os: osx 29 | r: devel 30 | 31 | after_success: 32 | - if [[ "${TRAVIS_OS_NAME}" = "linux" && "${TRAVIS_R_VERSION_STRING}" = "release" ]]; then Rscript -e 'covr::codecov(type = "all")'; fi 33 | - if [[ "${TRAVIS_OS_NAME}" = "linux" && "${TRAVIS_R_VERSION_STRING}" = "release" ]]; then Rscript -e 'covr::coveralls(type = "all", line_exclusions = c("R/shiny_etc.R","R/shiny_main.R","R/shiny_parts_server.R","R/shiny_parts_server_components.R","R/shiny_parts_ui.R","R/visual_functions.R"))'; fi 34 | - if [[ "${TRAVIS_OS_NAME}" = "linux" ]]; then libreoffice --version; fi 35 | - if [[ "${TRAVIS_OS_NAME}" = "linux" && "${TRAVIS_R_VERSION_STRING}" = "release" ]]; then Rscript -e 'if(!requireNamespace("RCurl", quietly = TRUE)) install.packages("RCurl")'; fi 36 | - if [[ "${TRAVIS_OS_NAME}" = "linux" && "${TRAVIS_R_VERSION_STRING}" = "release" ]]; then Rscript -e 'b <- list.files(pattern = "tidycells_[0-9.]+.tar.gz", full.names = T);if(length(b)>0){b <- normalizePath(max(b));cat("Got", b, "\n");lk <- paste0("ftp://win-builder.r-project.org/", c("R-devel", "R-release", "R-oldrelease"), "/", basename(b));fup <- function(u) {i <- 0;e <- NULL;repeat({i<-i + 1;e<- try(RCurl::ftpUpload(what = b, to = u), silent = TRUE);if(inherits(e, "try-error")){Sys.sleep(1);if(u!=lk[1]){e <- "failed"}}else{cat("Uploaded for:", u, "\n");break();};if(i>50) break();});e};lapply(lk, fup)}'; fi 37 | 38 | before_deploy: 39 | # so no overwrite of linux build 40 | - if [[ "${TRAVIS_OS_NAME}" = "osx" ]]; then rm -f *.tar.gz; fi 41 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Type: Package 2 | Package: tidycells 3 | Title: Read Tabular Data from Diverse Sources and Easily Make 4 | Them Tidy 5 | Version: 0.2.2.999 6 | Authors@R: 7 | person(given = "Indranil", 8 | family = "Gayen", 9 | role = c("aut", "cre"), 10 | email = "nil.gayen@gmail.com", 11 | comment = c(ORCID = "0000-0003-0197-1944")) 12 | Description: Provides utilities to read, cells from complex 13 | tabular data and heuristic detection based 'structural assignment' of 14 | those cells to a columnar or tidy format. Read functionality has the 15 | ability to read structured, partially structured or unstructured 16 | tabular data from various types of documents. The 'structural 17 | assignment' functionality has both supervised and unsupervised way of 18 | assigning cells data to columnar/tidy format. Multiple disconnected 19 | blocks of tables in a single sheet are also handled appropriately. 20 | These tools are suitable for unattended conversation of messy tables 21 | into a consumable format(usable for further analysis and data 22 | wrangling). 23 | License: MIT + file LICENSE 24 | URL: https://r-rudra.github.io/tidycells/, 25 | https://github.com/r-rudra/tidycells 26 | BugReports: https://github.com/r-rudra/tidycells/issues 27 | Depends: 28 | R (>= 3.2.0) 29 | Imports: 30 | dplyr (>= 0.8.1), 31 | ggplot2, 32 | graphics, 33 | magrittr, 34 | methods, 35 | purrr (>= 0.3.2), 36 | rlang, 37 | stats, 38 | stringr (>= 1.4.0), 39 | tibble, 40 | tidyr, 41 | unpivotr (>= 0.5.1), 42 | utils 43 | Suggests: 44 | cli, 45 | covr, 46 | docxtractr, 47 | DT, 48 | knitr, 49 | miniUI, 50 | plotly, 51 | readr, 52 | readxl, 53 | rmarkdown, 54 | rstudioapi, 55 | shiny, 56 | shinytest, 57 | stringdist, 58 | tabulizer, 59 | testthat (>= 2.1.0), 60 | tidyxl, 61 | xlsx, 62 | XML 63 | VignetteBuilder: 64 | knitr 65 | Encoding: UTF-8 66 | LazyData: true 67 | Roxygen: list(markdown = TRUE) 68 | RoxygenNote: 7.1.0 69 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2019 2 | COPYRIGHT HOLDER: Indranil Gayen 3 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | Copyright (c) 2019 Indranil Gayen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(as.data.frame,cell_df) 4 | S3method(as.matrix,cell_df) 5 | S3method(as_cell_df,cells) 6 | S3method(as_cell_df,data.frame) 7 | S3method(as_cell_df,matrix) 8 | S3method(basic_classifier,cell_df) 9 | S3method(basic_classifier,default) 10 | S3method(numeric_values_classifier,cell_df) 11 | S3method(numeric_values_classifier,default) 12 | S3method(plot,cell_analysis) 13 | S3method(plot,cell_df) 14 | S3method(print,cell_analysis) 15 | S3method(print,cell_df) 16 | S3method(print,read_cell_part) 17 | S3method(read_cells,"NULL") 18 | S3method(read_cells,character) 19 | S3method(read_cells,default) 20 | S3method(read_cells,read_cell_part) 21 | S3method(sample_based_classifier,cell_df) 22 | S3method(sample_based_classifier,default) 23 | S3method(summary,cell_df) 24 | export("%>%") 25 | export(analyse_cells) 26 | export(analyze_cells) 27 | export(as_cell_df) 28 | export(basic_classifier) 29 | export(cell_composition_traceback) 30 | export(collate_columns) 31 | export(compose_cells) 32 | export(numeric_values_classifier) 33 | export(read_cells) 34 | export(sample_based_classifier) 35 | export(validate_cells) 36 | export(value_attribute_classify) 37 | export(visual_crop) 38 | export(visual_data_block_inspection) 39 | export(visual_orientation_modification) 40 | export(visual_traceback) 41 | export(visual_va_classify) 42 | exportClasses(cell_analysis) 43 | exportClasses(cell_df) 44 | exportClasses(read_cell_part) 45 | importFrom(dplyr,anti_join) 46 | importFrom(dplyr,arrange) 47 | importFrom(dplyr,bind_rows) 48 | importFrom(dplyr,case_when) 49 | importFrom(dplyr,count) 50 | importFrom(dplyr,desc) 51 | importFrom(dplyr,distinct) 52 | importFrom(dplyr,filter) 53 | importFrom(dplyr,full_join) 54 | importFrom(dplyr,group_by) 55 | importFrom(dplyr,group_split) 56 | importFrom(dplyr,if_else) 57 | importFrom(dplyr,inner_join) 58 | importFrom(dplyr,left_join) 59 | importFrom(dplyr,mutate) 60 | importFrom(dplyr,mutate_all) 61 | importFrom(dplyr,n) 62 | importFrom(dplyr,n_distinct) 63 | importFrom(dplyr,pull) 64 | importFrom(dplyr,recode) 65 | importFrom(dplyr,rename) 66 | importFrom(dplyr,right_join) 67 | importFrom(dplyr,select) 68 | importFrom(dplyr,summarise) 69 | importFrom(dplyr,ungroup) 70 | importFrom(magrittr,"%>%") 71 | importFrom(methods,setOldClass) 72 | importFrom(purrr,imap_dfr) 73 | importFrom(purrr,map) 74 | importFrom(purrr,map_chr) 75 | importFrom(purrr,map_dbl) 76 | importFrom(purrr,map_df) 77 | importFrom(purrr,map_int) 78 | importFrom(purrr,map_lgl) 79 | importFrom(purrr,reduce) 80 | importFrom(rlang,abort) 81 | importFrom(rlang,warn) 82 | importFrom(tibble,as_tibble) 83 | importFrom(tibble,tibble) 84 | importFrom(unpivotr,enhead) 85 | importFrom(utils,hasName) 86 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # tidycells (development version) 2 | 3 | 4 | # *News* 5 | 6 | # tidycells 0.2.2 _(2020-01-07)_ 7 | 8 | ## Other changes 9 | 10 | * Internal changes to unit tests which will fix [#11](https://github.com/r-rudra/tidycells/issues/11) 11 | 12 | # tidycells 0.2.1 _(2019-08-27)_ 13 | 14 | ## New features 15 | 16 | * Enhancement in the heuristic-based algorithm 17 | 18 | ## Other changes 19 | 20 | * Now if `read_cells` fails in the intermediate stage, it will give the output of last successful stage 21 | 22 | # tidycells 0.2.0 _(2019-08-20)_ 23 | 24 | ## First CRAN Release 25 | 26 | * Initial CRAN Release 27 | 28 | # tidycells 0.1.9 _(2019-08-12)_ 29 | 30 | ## New Features` 31 | 32 | * Added `collate_columns` to collate attribute-columns having similar content. 33 | 34 | # tidycells 0.1.0 _(2019-07-25)_ 35 | 36 | ## Initial Public Release 37 | 38 | * Initial Release to GitHub 39 | * Prior to this it was a private package 40 | 41 | -------------------------------------------------------------------------------- /R/000.R: -------------------------------------------------------------------------------- 1 | 2 | # pkg share global vars 3 | tidycells_pkg_env <- new.env() 4 | # used in --> is_available.R 5 | assign("na_pkgs", NULL, envir = tidycells_pkg_env) 6 | 7 | 8 | # safe_dependency_check 9 | # in later version create a proper framework 10 | # used in --> is_available.R and safe_dependency_check.R 11 | -------------------------------------------------------------------------------- /R/ai_attach_direction.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | ai_attach_direction <- function(d_att_dat_map_raw) { 4 | 5 | # asp: attr split 6 | d_att_dat_map_raw_asp <- d_att_dat_map_raw %>% 7 | # kept for tracking 8 | mutate(direction_basic = direction) %>% 9 | mutate(attr_gid_split = if_else(direction_group == "NS" & attr_group == "major", row_a, 10 | if_else(direction_group == "WE" & attr_group == "major", col_a, 0L) 11 | )) 12 | 13 | d_att_dat_map_raw_asp <- d_att_dat_map_raw_asp %>% 14 | mutate(attr_gid_split = if_else(direction_group == "corner", col_a, attr_gid_split)) 15 | 16 | d_att_dat_map_raw_asp %>% 17 | group_by(data_gid, attr_gid, direction, attr_gid_split) %>% 18 | group_split() %>% 19 | map_df(~ .x %>% mutate(direction = get_direction(.x))) 20 | } 21 | -------------------------------------------------------------------------------- /R/ai_crude_data_block_joins.R: -------------------------------------------------------------------------------- 1 | ai_crude_data_block_joins <- function(basic_admap, d_dat) { 2 | main_dir_attr <- basic_admap %>% 3 | filter(direction_group != "corner") %>% 4 | group_by(data_gid) %>% 5 | summarise(aid = attr_gid %>% 6 | paste0("_", direction) %>% 7 | unique() %>% 8 | sort() %>% 9 | paste0(collapse = "+")) 10 | 11 | crude_map <- main_dir_attr %>% 12 | group_by(aid) %>% 13 | mutate(new_gid = min(data_gid), n = n()) %>% 14 | ungroup() %>% 15 | filter(n > 1) %>% 16 | select(gid = data_gid, new_gid) 17 | 18 | done <- FALSE 19 | if (nrow(crude_map) > 0) { 20 | done <- TRUE 21 | d_dat <- get_group_id_join_gids(d_dat, gid_map = crude_map) 22 | } 23 | list(d_dat = d_dat, done = done) 24 | } 25 | -------------------------------------------------------------------------------- /R/ai_data_gid_join.R: -------------------------------------------------------------------------------- 1 | ai_data_gid_join <- function(d_dat, data_attr_map, full_data) { 2 | repeat({ 3 | if (length(unique(d_dat$group_id_map$gid)) < 2) break() 4 | 5 | data_gid_comb <- d_dat$group_id_map$gid %>% 6 | unique() %>% 7 | utils::combn(2) %>% 8 | as.data.frame(stringsAsFactors = FALSE) 9 | 10 | data_gid_comb_chk <- data_gid_comb %>% 11 | map_lgl(~ is_attachable( 12 | gid1 = .x[1], gid2 = .x[2], 13 | group_info = d_dat, whole_data = full_data, 14 | data_attr_map = data_attr_map 15 | )) 16 | if (any(data_gid_comb_chk)) { 17 | data_gid_joins <- data_gid_comb[data_gid_comb_chk] 18 | 19 | data_gid_join_map <- get_links_df(data_gid_joins) 20 | d_dat <- get_group_id_join_gids(d_dat, gid_map = data_gid_join_map) 21 | } else { 22 | break() 23 | } 24 | }) 25 | d_dat 26 | } 27 | -------------------------------------------------------------------------------- /R/ai_get_data_attr_map.R: -------------------------------------------------------------------------------- 1 | 2 | ai_get_data_attr_map <- function(dat_boundary, 3 | att_gid_map, 4 | attr_to_near_data = FALSE, leave_inside = FALSE) { 5 | 6 | # check relative location of each attr_gid (gid) wrt each data_gid 7 | if (!leave_inside) { 8 | d_att_map <- dat_boundary %>% 9 | split(.$gid) %>% 10 | map_df(~ get_direction_df(.x, datt = att_gid_map)) 11 | } else { 12 | d_att_map <- dat_boundary %>% 13 | split(.$gid) %>% 14 | map_df(~ get_direction_df(.x, datt = att_gid_map, allow_inside = TRUE)) 15 | } 16 | 17 | 18 | # for each attr_gid (gid), data_gid, direction, direction_group :- get minimum distance 19 | d_gid_att_map <- d_att_map %>% 20 | group_by(gid, data_gid, direction, direction_group) %>% 21 | summarise(md = min(c(dist, Inf))) %>% 22 | ungroup() 23 | 24 | # attach nearest attr_gid to each data gid 25 | d_gid_att_map_min_d <- d_gid_att_map %>% 26 | group_by(data_gid, direction_group) %>% 27 | mutate(m_dist = min(c(md, Inf))) %>% 28 | ungroup() %>% 29 | filter(md == m_dist) %>% 30 | select(-md) %>% 31 | rename(attr_gid = gid, dist = m_dist) 32 | 33 | # by default each data gid will choose nearest attr_gid (multiple allowed) 34 | if (attr_to_near_data) { 35 | # this does same for attr gid too 36 | # after each data gid choose nearest attr then each attr will choose nearest data gid 37 | 38 | d_gid_att_map_min_d <- d_gid_att_map_min_d %>% 39 | group_by(attr_gid) %>% 40 | mutate(md = min(c(dist, Inf))) %>% 41 | ungroup() %>% 42 | filter(md == dist) %>% 43 | select(-md) 44 | } 45 | 46 | 47 | lo <- list(map = d_gid_att_map_min_d, all_map = d_gid_att_map, raw = d_att_map) 48 | if (!leave_inside) { 49 | lo 50 | } else { 51 | lo %>% map(~ .x %>% filter(direction != "INSIDE")) 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /R/ai_get_data_attr_map_details.R: -------------------------------------------------------------------------------- 1 | 2 | # dimention analysis and raw maps 3 | ai_get_data_attr_map_details <- function(basic_map, d_dat, d_att, major_direction_relax = TRUE) { 4 | dimension_analysis <- list() 5 | 6 | dimension_analysis$data_gid_dim <- d_dat$group_id_map %>% 7 | group_by(gid) %>% 8 | summarise( 9 | r_dim_data = n_distinct(row), 10 | c_dim_data = n_distinct(col) 11 | ) 12 | 13 | 14 | d_att_dat_map <- basic_map 15 | 16 | d_att_dat_map_raw <- d_att_dat_map %>% 17 | # join with data_gid to attach all data-cells 18 | inner_join(d_dat$group_id_map %>% 19 | select(row_d = row, col_d = col, data_gid = gid), 20 | by = "data_gid" 21 | ) %>% 22 | # join with attr_gid to attach all attr-cells 23 | inner_join(d_att$group_id_map %>% 24 | select(row_a = row, col_a = col, attr_gid = gid), 25 | by = "attr_gid" 26 | ) 27 | 28 | # attach dimension 29 | dimension_analysis$attr_data_dim <- d_att_dat_map_raw %>% 30 | group_by(attr_gid, data_gid) %>% 31 | summarise( 32 | r_dim = row_d %>% intersect(row_a) %>% length(), 33 | c_dim = col_d %>% intersect(col_a) %>% length(), 34 | direction_group = direction_group[1] 35 | ) %>% 36 | ungroup() %>% 37 | inner_join(dimension_analysis$data_gid_dim, by = c("data_gid" = "gid")) %>% 38 | mutate(rel_dim = if_else(direction_group == "NS", c_dim / c_dim_data, r_dim / r_dim_data)) %>% 39 | mutate(rel_dim = if_else(direction_group == "corner", 0, rel_dim)) %>% 40 | mutate(full_dim = (rel_dim >= 1)) 41 | 42 | # in case only non full dim major (NS or WE) attr present 43 | if (major_direction_relax) { 44 | dimension_analysis$attr_data_dim <- dimension_analysis$attr_data_dim %>% 45 | group_by(data_gid, direction_group) %>% 46 | mutate( 47 | is_full_dim_present = any(full_dim), 48 | this_attr_max_rel = (rel_dim == max(rel_dim)) 49 | ) %>% 50 | ungroup() %>% 51 | rename(full_dim_orig = full_dim) %>% 52 | mutate(full_dim = if_else(direction_group == "corner", 53 | full_dim_orig, 54 | if_else(is_full_dim_present, 55 | full_dim_orig, 56 | this_attr_max_rel 57 | ) 58 | )) 59 | } 60 | 61 | # fix major minor 62 | 63 | d_att_dat_map <- dimension_analysis$attr_data_dim %>% 64 | distinct(attr_gid, data_gid, full_dim) %>% 65 | right_join(d_att_dat_map, by = c("attr_gid", "data_gid")) %>% 66 | mutate(attr_group = if_else(full_dim, "major", "minor")) %>% 67 | select(-full_dim) 68 | 69 | 70 | 71 | d_att_dat_map_raw <- d_att_dat_map_raw %>% 72 | inner_join(d_att_dat_map %>% select(attr_gid, data_gid, direction, attr_group), 73 | by = c("attr_gid", "direction", "data_gid") 74 | ) 75 | 76 | list(raw_map = d_att_dat_map_raw, map = d_att_dat_map, dimension_analysis = dimension_analysis) 77 | } 78 | -------------------------------------------------------------------------------- /R/ai_get_data_attr_map_main.R: -------------------------------------------------------------------------------- 1 | 2 | ai_get_data_attr_map_main <- function(d_dat, d_att, crude_join = TRUE) { 3 | 4 | # start with simple attr data map 5 | admap0 <- ai_get_data_attr_map( 6 | dat_boundary = d_dat$group_id_boundary, 7 | att_gid_map = d_att$group_id_map 8 | ) 9 | 10 | 11 | if (crude_join) { 12 | # crude joins 13 | # absolutely sure joins 14 | crude_djoins <- ai_crude_data_block_joins(basic_admap = admap0$map, d_dat = d_dat) 15 | if (crude_djoins$done) { 16 | d_dat <- crude_djoins$d_dat 17 | admap0 <- ai_get_data_attr_map( 18 | dat_boundary = d_dat$group_id_boundary, 19 | att_gid_map = d_att$group_id_map 20 | ) 21 | } 22 | } 23 | 24 | 25 | # split attr gid relative to data_gid 26 | rel_chk <- ai_relative_data_split_attr(basic_map = admap0, d_att = d_att, d_dat = d_dat) 27 | if (rel_chk$done) { 28 | d_att <- rel_chk$d_att %>% map(unique) 29 | admap0 <- rel_chk$admap 30 | } 31 | 32 | 33 | admap1_major_minor <- admap0$all_map %>% 34 | rename(attr_gid = gid, dist = md) %>% 35 | filter(direction_group != "corner") %>% 36 | ai_get_data_attr_map_details(d_dat, d_att) 37 | 38 | admap1_major_minor_compact <- admap1_major_minor$map %>% 39 | filter(attr_group == "major") %>% 40 | rename(md = dist) %>% 41 | group_by(data_gid, direction_group) %>% 42 | mutate(m_dist = min(md)) %>% 43 | ungroup() %>% 44 | filter(md == m_dist) %>% 45 | select(-md) %>% 46 | rename(dist = m_dist) 47 | 48 | admap0_pass0 <- admap1_major_minor_compact %>% 49 | filter(direction_group != "corner") %>% 50 | unique() %>% 51 | select(-attr_group) 52 | 53 | # dimension analysis done here (major minor classification) 54 | admap1 <- admap0_pass0 %>% 55 | ai_get_data_attr_map_details(d_dat, d_att) 56 | 57 | list(admap = admap1, d_dat = d_dat, d_att = d_att) 58 | } 59 | -------------------------------------------------------------------------------- /R/ai_relative_data_join_attr.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | ai_relative_data_join_attr <- function(admap_main, d_att) { 4 | chk <- admap_main$raw_map %>% 5 | distinct(attr_gid, data_gid, direction, attr_group) %>% 6 | group_by(data_gid, direction, attr_group) %>% 7 | mutate(n_att = n_distinct(attr_gid)) %>% 8 | ungroup() %>% 9 | filter(n_att > 1) 10 | 11 | done <- FALSE 12 | 13 | if (nrow(chk) > 0) { 14 | # relative join required 15 | done <- TRUE 16 | 17 | rel_gids <- chk %>% 18 | select(-n_att) %>% 19 | inner_join(admap_main$raw_map, by = c("attr_gid", "data_gid", "direction", "attr_group")) 20 | 21 | d_att_dat_map_raw_rest <- admap_main$raw_map %>% 22 | anti_join(chk, by = c("attr_gid", "data_gid", "direction", "attr_group")) 23 | 24 | rel_gids_att <- rel_gids %>% 25 | distinct(attr_gid, data_gid, direction, attr_group, row = row_a, col = col_a) %>% 26 | group_by(data_gid, direction, attr_group) %>% 27 | mutate(new_attr_gid = paste(min(attr_gid), data_gid, direction, sep = "_")) %>% 28 | ungroup() 29 | 30 | rel_gids <- rel_gids %>% inner_join(rel_gids_att %>% distinct(attr_gid, new_attr_gid), by = "attr_gid") 31 | 32 | rel_gids <- rel_gids %>% 33 | group_by(new_attr_gid, data_gid) %>% 34 | mutate( 35 | # this is possibly not required anymore as attr_group is in grouping vars 36 | new_attr_group = if_else(any(attr_group == "major"), "major", "minor"), 37 | new_dist = min(dist) 38 | ) %>% 39 | ungroup() 40 | 41 | admap_main$raw_map <- rel_gids %>% 42 | select(-attr_group, -attr_gid, -dist) %>% 43 | rename(attr_gid = new_attr_gid, attr_group = new_attr_group, dist = new_dist) %>% 44 | bind_rows(d_att_dat_map_raw_rest) 45 | 46 | admap_main$map <- admap_main$raw_map %>% 47 | distinct(attr_gid, data_gid, direction, direction_group, dist, attr_group) 48 | 49 | # dimesion analysis is not kept 50 | 51 | # update d_att 52 | d_att$group_id_map <- d_att$group_id_map %>% 53 | bind_rows(rel_gids_att %>% 54 | select(gid = new_attr_gid, row, col)) 55 | d_att$group_id_boundary <- get_group_id_boundary(d_att$group_id_map) 56 | 57 | chk_this <- chk %>% distinct(gid = attr_gid, data_gid) 58 | 59 | if (is.null(d_att$missed_blocks)) { 60 | d_att$missed_blocks <- chk_this 61 | } else { 62 | d_att$missed_blocks <- chk_this %>% 63 | bind_rows(d_att$missed_blocks) %>% 64 | unique() 65 | } 66 | } 67 | 68 | list(done = done, d_att = d_att, admap = admap_main) 69 | } 70 | -------------------------------------------------------------------------------- /R/ai_relative_data_split_attr.R: -------------------------------------------------------------------------------- 1 | 2 | ai_relative_data_split_attr <- function(basic_map, d_att, d_dat) { 3 | chk <- basic_map$map %>% 4 | group_by(data_gid, attr_gid) %>% 5 | summarise(n_dirs = n_distinct(direction)) %>% 6 | ungroup() %>% 7 | filter(n_dirs > 1) 8 | 9 | done <- FALSE 10 | 11 | if (nrow(chk) > 0) { 12 | # relative split required 13 | done <- TRUE 14 | 15 | rel_gids <- chk %>% 16 | select(-n_dirs) %>% 17 | rename(gid = attr_gid) %>% 18 | inner_join(basic_map$raw, by = c("data_gid", "gid")) 19 | rel_gids <- rel_gids %>% 20 | mutate(new_attr_gid = paste(gid, data_gid, direction, sep = "_")) 21 | 22 | d_att$group_id_map <- d_att$group_id_map %>% 23 | bind_rows(rel_gids %>% 24 | select(gid = new_attr_gid, row, col)) 25 | d_att$group_id_boundary <- get_group_id_boundary(d_att$group_id_map) 26 | 27 | # information kept for missing link detection 28 | chk_this <- chk %>% distinct(gid = attr_gid, data_gid) 29 | d_att$missed_blocks <- chk_this 30 | 31 | admap_new <- ai_get_data_attr_map( 32 | dat_boundary = d_dat$group_id_boundary, 33 | att_gid_map = d_att$group_id_map 34 | ) 35 | admap_new$raw <- admap_new$raw %>% 36 | anti_join(chk_this, by = c("gid", "data_gid")) 37 | admap_new$all_map <- admap_new$all_map %>% 38 | anti_join(chk_this, by = c("gid", "data_gid")) 39 | admap_new$map <- admap_new$map %>% 40 | anti_join(chk_this %>% rename(attr_gid = gid), by = c("attr_gid", "data_gid")) 41 | } else { 42 | admap_new <- basic_map 43 | } 44 | 45 | list(done = done, d_att = d_att, admap = admap_new) 46 | } 47 | -------------------------------------------------------------------------------- /R/as_cell_df.R: -------------------------------------------------------------------------------- 1 | 2 | #' Transform data into Cell-DF Structure 3 | #' 4 | #' @description Transform an R object (mostly matrix or data.frame) into a [`cell_df`][cell_df-class] 5 | #' for further processing in other `tidycells` functions. 6 | #' 7 | #' @param d the data (either a matrix with column name or a data.frame) 8 | #' @param take_row_names consider row names as separate cells 9 | #' (applicable only for data with no (row, col) information). Default is \code{FALSE}. 10 | #' @param take_col_names consider column names as separate cells 11 | #' (applicable only for data with no (row, col) information). Default is \code{FALSE}. 12 | #' 13 | #' @return An object of class [`cell_df`][cell_df-class]. 14 | #' 15 | #' **Note**: After this, you may like to do [`Value Attribute Classification`][value_attribute_classify()]. 16 | #' 17 | #' @export 18 | #' @rdname as_cell_df 19 | #' @seealso 20 | #' * [`validate_cells`][validate_cells()] which is used to validate `cell_df`. 21 | #' * [`as_cells`][unpivotr::as_cells()] from `unpivotr` package. 22 | #' 23 | #' @examples 24 | #' 25 | #' as_cell_df(iris) 26 | #' 27 | #' # consider column name as cell 28 | #' as_cell_df(iris, take_col_names = TRUE) 29 | #' 30 | #' # if the data is already in a similar format it will not further transform 31 | #' # which is not true for ---> unpivotr::as_cells 32 | #' # check ---> unpivotr::as_cells(iris) %>% unpivotr::as_cells() 33 | #' unpivotr::as_cells(iris) %>% as_cell_df() 34 | as_cell_df <- function(d, take_row_names = FALSE, take_col_names = FALSE) { 35 | UseMethod("as_cell_df") 36 | } 37 | 38 | #' @export 39 | as_cell_df.cells <- function(d, ...) { 40 | d 41 | } 42 | 43 | #' @export 44 | as_cell_df.data.frame <- function(d, ...) { 45 | d %>% 46 | attach_intermediate_class() %>% 47 | as_cell_df_internal(...) 48 | } 49 | 50 | #' @export 51 | as_cell_df.matrix <- function(d, ...) { 52 | d %>% 53 | as.data.frame() %>% 54 | attach_intermediate_class() %>% 55 | as_cell_df_internal(...) 56 | } 57 | -------------------------------------------------------------------------------- /R/attach_trace_info.R: -------------------------------------------------------------------------------- 1 | 2 | attach_trace_info <- function(ca, dc) { 3 | if (missing(dc)) { 4 | if (missing(ca)) { 5 | abort("either 'ca' or 'dc' required") 6 | } 7 | dc_t <- compose_cells_raw(ca, trace_it_back = TRUE, silent = TRUE) 8 | dc <- dc_t 9 | } else { 10 | if (!any(stringr::str_detect(colnames(dc), "cellAddress_"))) { 11 | if (!missing(ca)) { 12 | dc_t <- compose_cells_raw(ca, trace_it_back = TRUE, silent = TRUE) 13 | dc_t <- dc_t[c("row", "col", colnames(dc_t)[stringr::str_detect(colnames(dc_t), "cellAddress_")])] 14 | dc_t$chk_this <- "ok" 15 | dc <- dc %>% left_join(dc_t, by = c("row", "col")) 16 | if (any(is.na(dc$chk_this))) { 17 | abort("at least one row could not be mapped properly") 18 | } 19 | } else { 20 | abort("supplied composition does not contain trace information and 'ca' is not given") 21 | } 22 | } 23 | } 24 | 25 | dc 26 | } 27 | -------------------------------------------------------------------------------- /R/cell_trace_plot.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | #' Display static composition traceback plot 4 | #' 5 | #' @param ca A cell analysis 6 | #' @param dc (optional) a composed tidy data.frame 7 | #' @param trace_row the row number to trace. (Default 1) 8 | #' @keywords internal 9 | #' @export 10 | cell_composition_traceback <- function(ca, dc, trace_row = 1L) { 11 | cell_trace_plot(dc, trace_row, ca) 12 | } 13 | 14 | 15 | cell_trace_plot <- function(dc, trace_row, ca, prior_plot, prior_ca_plot) { 16 | dc <- attach_trace_info(ca, dc) 17 | 18 | if (!missing(trace_row)) { 19 | if (length(trace_row) > 0) { 20 | if (missing(prior_plot)) { 21 | if (missing(prior_ca_plot)) { 22 | prior_plot <- graphics::plot(ca$cell_df, no_plot = TRUE) 23 | } 24 | } 25 | 26 | d0 <- dc[trace_row, ] 27 | 28 | connected_cells <- d0[stringr::str_detect(colnames(d0), "cellAddress")] 29 | connected_cells <- connected_cells %>% 30 | map_lgl(~ !is.na(.x)) %>% 31 | connected_cells[.] 32 | connected_cells <- connected_cells %>% 33 | imap_dfr(~ .x %>% 34 | stringr::str_split(" :: ") %>% 35 | reduce(c) %>% 36 | stringr::str_split("_") %>% 37 | map(~ .x %>% 38 | as.integer() %>% 39 | t()) %>% 40 | reduce(rbind) %>% 41 | as.data.frame() %>% 42 | mutate(cn_id_raw = .y)) 43 | connected_cells <- connected_cells %>% 44 | mutate( 45 | cn_id = cn_id_raw %>% 46 | stringr::str_remove("cellAddress_"), 47 | cn = cn_id %>% 48 | stringr::str_extract("major|minor") 49 | ) 50 | colnames(connected_cells)[1:2] <- c("row", "col") 51 | 52 | connected_cells <- connected_cells %>% 53 | mutate(cn_id_ = cn_id) %>% 54 | tidyr::separate(cn_id_, into = c(rep(NA, 3), "d1", "d2")) %>% 55 | mutate(dty = (as.numeric(d1) + as.numeric(d2)) %% 2) %>% 56 | select(-d1, -d2) 57 | 58 | connected_cells <- connected_cells %>% mutate(r = d0$row, c = d0$col) 59 | 60 | 61 | if (missing(prior_ca_plot)) { 62 | prior_ca_plot <- graphics::plot(ca, 63 | prior_plot = prior_plot, 64 | gids = d0$data_block, 65 | zoom_selected_gids = TRUE, 66 | block_boundary = FALSE, 67 | direction_text = FALSE, 68 | no_plot = TRUE 69 | ) 70 | } 71 | this_plot <- prior_ca_plot 72 | this_plot <- this_plot + ggplot2::guides(fill = FALSE) 73 | 74 | this_plot <- this_plot + 75 | ggplot2::geom_tile( 76 | data = d0, 77 | ggplot2::aes(col, -row), 78 | color = "red", lwd = 1, inherit.aes = FALSE, 79 | alpha = 0, 80 | width = 1, height = 1, na.rm = TRUE 81 | ) 82 | 83 | if (length(unique(connected_cells$dty)) == 1) { 84 | this_plot <- this_plot + 85 | ggplot2::geom_curve( 86 | data = connected_cells, 87 | ggplot2::aes( 88 | x = c, y = -r, 89 | xend = col, yend = -row, 90 | color = cn 91 | ), 92 | inherit.aes = FALSE, curvature = -0.1, 93 | arrow = ggplot2::arrow(), lwd = 1, na.rm = TRUE 94 | ) 95 | } else { 96 | this_plot <- this_plot + 97 | ggplot2::geom_curve( 98 | data = connected_cells %>% filter(dty == 1), 99 | ggplot2::aes( 100 | x = c, y = -r, 101 | xend = col, yend = -row, 102 | color = cn 103 | ), 104 | inherit.aes = FALSE, curvature = -0.1, 105 | arrow = ggplot2::arrow(), lwd = 1, lty = "longdash", na.rm = TRUE 106 | ) + 107 | ggplot2::geom_curve( 108 | data = connected_cells %>% filter(dty != 1), 109 | ggplot2::aes( 110 | x = c, y = -r, 111 | xend = col, yend = -row, 112 | color = cn 113 | ), 114 | inherit.aes = FALSE, curvature = 0.1, 115 | arrow = ggplot2::arrow(), lwd = 1, na.rm = TRUE 116 | ) 117 | } 118 | 119 | 120 | this_plot <- this_plot + 121 | ggplot2::guides(color = FALSE) + 122 | ggplot2::scale_color_manual(values = c(major = "#B5525288", minor = "#4156A888")) 123 | 124 | 125 | return(this_plot) 126 | } 127 | } 128 | } 129 | -------------------------------------------------------------------------------- /R/class-cell_df.R: -------------------------------------------------------------------------------- 1 | 2 | cell_df_class <- c("cell_df", "cells", "rc_df", "tbl_df", "tbl", "data.frame") 3 | 4 | cell_df_class_internal <- c("cell_df", "cells", "rc_df", "tidyxl", "unpivotr", "readr", "unknown") 5 | 6 | #' @importFrom methods setOldClass 7 | #' @exportClass cell_df 8 | 9 | setOldClass(cell_df_class) 10 | 11 | #' `cell_df` class 12 | #' 13 | #' @description 14 | #' The `cell_df` class is a subclass of [`tbl_df`][tibble::tibble()] and [`data.frame`][base::data.frame()], 15 | #' created in order to store cell level information. 16 | #' 17 | #' @section Properties of `cell_df`: 18 | #' 19 | #' Objects of class `cell_df` have: 20 | #' * A `class` attribute of `c("cell_df", "cells", "rc_df", "tbl_df", "tbl", "data.frame")`. 21 | #' * Two column of `integer` type named `row` and `col`. All entries must be positive and not `NA`. 22 | #' This represents a cell address. 23 | #' * A column of `character` type named `data_type`. Which can contain only `numeric` or `character` as entries. 24 | #' This represents the data type of the cell (classified to only two categories). 25 | #' * A column of `character` type named `value`. This stores value of corresponding cells. 26 | #' * Apart from these columns it can contain other columns. 27 | #' However, (`row`, `col`) together should identify the row of the `cell_df` uniquely. 28 | #' 29 | #' @section Applicable methods on `cell_df`: 30 | #' * `print`: Prints identifier of the class and the content of the underlying tibble. 31 | #' * `summary`: Calculates basic stats like number of rows and columns, number of characters and numeric fields, density etc. 32 | #' * `plot`: Plots (using [`ggplot2`][ggplot2::ggplot()]) the cell information as ordinary table. 33 | #' * `as.matrix`: This will transform the data back into tabular form and create a character matrix. 34 | #' * `as.data.frame`: Similar to `as.matrix` this will also transform the data back into tabular form. 35 | #' 36 | #' @seealso 37 | #' 38 | #' The function [`validate_cells`][validate_cells()] which is used to validate `cell_df`. 39 | #' 40 | #' The `as.matrix` and `as.data.frame` method is similar to [`unpivotr::rectify`][unpivotr::rectify()]. 41 | #' 42 | #' Object of this class is also compatible to most of the functions from \code{\link[unpivotr:unpivotr-package]{unpivotr}} package. 43 | #' 44 | #' @name cell_df-class 45 | #' @keywords internal 46 | NULL 47 | 48 | 49 | new_cell_df <- function(dat) { 50 | val <- validate_cells(dat) 51 | 52 | if (!val) { 53 | abort(paste0(attr(val, "msg"), collapse = "\n")) 54 | } 55 | 56 | if (!is.integer(dat$row)) { 57 | dat <- dat %>% mutate(row = as.integer(row)) 58 | } 59 | 60 | if (!is.integer(dat$col)) { 61 | dat <- dat %>% mutate(col = as.integer(col)) 62 | } 63 | 64 | # if all above passes 65 | 66 | # unclass cell_df induces class 67 | dat <- unset_cell_df_class(dat) 68 | 69 | # re-attach cell_df class 70 | class(dat) <- class(dat) %>% 71 | c(cell_df_class, .) 72 | 73 | 74 | 75 | dat 76 | } 77 | 78 | unset_cell_df_class <- function(dat) { 79 | class(dat) <- class(dat) %>% 80 | setdiff(cell_df_class_internal) 81 | dat 82 | } 83 | 84 | is_cell_df <- function(d) { 85 | inherits(d, "cell_df") 86 | } 87 | -------------------------------------------------------------------------------- /R/class-cell_df_analysis.R: -------------------------------------------------------------------------------- 1 | 2 | #' `cell_analysis` class 3 | #' 4 | #' @description 5 | #' The `cell_analysis` class is based on list, 6 | #' created in order to store analysis of cell level information ([`cell_df`][cell_df-class]). 7 | #' 8 | #' @section Properties of `cell_analysis`: 9 | #' 10 | #' Objects of class `cell_analysis` have following named nodes: 11 | #' * `cells` : Contains information about `cell_group_type` in terms of (data, minor and major attributes).(a tibble) 12 | #' * `sections` : Contains boundaries of each data block. (a tibble) 13 | #' * `details` : a list containing further information 14 | #' * `cell_df` : The original [`cell_df`][cell_df-class] which is passed for processing 15 | #' 16 | #' @section Applicable methods on `cell_analysis`: 17 | #' * `print`: Prints identifier of the class and the number of blocks (and potential issues if any). 18 | #' * `plot`: Plots (using [`ggplot2`][ggplot2::ggplot()]) the data-block information. 19 | #' 20 | #' @name cell_analysis-class 21 | #' @keywords internal 22 | NULL 23 | 24 | #' @exportClass cell_analysis 25 | cell_df_analysis_class <- c("cell_analysis", "cell_df_analysis", "list") 26 | setOldClass(cell_df_analysis_class) 27 | -------------------------------------------------------------------------------- /R/class-intermediate.R: -------------------------------------------------------------------------------- 1 | 2 | #' @importFrom utils hasName 3 | 4 | 5 | detect_cell_df_pattern <- function(dat) { 6 | 7 | # this is built as per the description of the return values from supported packages 8 | # supported packages / functions : 9 | # 1) tidyxl::xlsx_cells 10 | # 2) unpivotr::as_cells 11 | # 3) readr::melt_csv (and family) 12 | 13 | chk <- tibble( 14 | type = c( 15 | "tidyxl", 16 | "unpivotr", 17 | "readr" 18 | ), 19 | 20 | col_names = list( 21 | c("sheet", "address", "row", "col", "is_blank", "data_type", "error", "logical", "numeric", "date", "character"), 22 | c("row", "col", "data_type"), 23 | c("row", "col", "data_type", "value") 24 | ), 25 | 26 | data_types = list( 27 | c("error", "logical", "numeric", "date", "character", "blank"), 28 | c("chr", "cplx", "cplx", "dbl", "fct", "int", "lgl", "list", "ord"), 29 | c("integer", "character", "date") 30 | ), 31 | 32 | optional_cols = list( 33 | c("sheet", "error", "logical", "numeric", "date", "character", "blank"), 34 | c("chr", "cplx", "cplx", "dbl", "fct", "int", "lgl", "list", "ord"), 35 | c() 36 | ) 37 | ) 38 | 39 | if (!hasName(dat, "data_type")) { 40 | # all of them has data_type 41 | return("unknown") 42 | } 43 | 44 | d_type <- chk %>% 45 | mutate( 46 | ccn = col_names %>% map_lgl(~ hasName(dat, .x) %>% all()), 47 | cdt = data_types %>% map_int(~ (.x %in% dat$data_type) %>% sum()), 48 | coc = optional_cols %>% map_int(~ hasName(dat, .x) %>% sum()) 49 | ) %>% 50 | filter(ccn) %>% 51 | filter(cdt == max(cdt), coc == max(coc)) %>% 52 | pull(type) 53 | 54 | if (length(d_type) == 0) { 55 | return("unknown") 56 | } 57 | 58 | if (length(d_type) > 1) { 59 | d_type <- d_type[1] 60 | } 61 | 62 | d_type 63 | } 64 | 65 | 66 | attach_intermediate_class <- function(dat) { 67 | class(dat) <- c(detect_cell_df_pattern(dat), class(dat)) 68 | 69 | dat 70 | } 71 | -------------------------------------------------------------------------------- /R/class-read_cell_part.R: -------------------------------------------------------------------------------- 1 | 2 | #' `read_cell_part` class 3 | #' 4 | #' @description 5 | #' The `read_cell_part` class is based on list, 6 | #' created in order to store (possible) intermediate [`read_cells`][read_cells()] output in detailed format. 7 | #' 8 | #' **Note:** This class is designed mainly for _internal use_. 9 | #' It is recommended to manipulate objects of this class only for cases where desired output is not coming. 10 | #' 11 | #' @section Properties of `read_cell_part`: 12 | #' 13 | #' Objects of class `read_cell_part` **may** have following named nodes (the nodes may change based on stage): 14 | #' * `file_name` : file name which was given to process 15 | #' * `stage` : stage at which it was last processed. 16 | #' * `info` : a list containing further information on type and content of the file (provided the file got read) 17 | #' * `is_empty` : whether the file contains no 'known tabular information' 18 | #' * `cell_list` : list of [`cell_df`][cell_df-class] 19 | #' (possibly after `Value Attribute Classification` done if stage is higher than `make_cells`) 20 | #' * `cell_analysis_list` : list of [`cell_analysis`][cell_analysis-class] 21 | #' * `final_composition` : final composition (a `tibble`) with only main columns 22 | #' (or all column if `compose_main_cols_only = FALSE`) 23 | #' * `final_composition_main` : only appear if `compose_main_cols_only = FALSE`. 24 | #' This holds final composition (a `tibble`) with only main columns 25 | #' 26 | #' @section Applicable methods on `read_cell_part`: 27 | #' * `print`: Prints identifier of the class and the stage at which it is last processed. 28 | #' 29 | #' @name read_cell_part-class 30 | #' @keywords internal 31 | NULL 32 | 33 | #' @exportClass read_cell_part 34 | read_cell_part_class <- c("read_cell_part", "list") 35 | setOldClass(read_cell_part_class) 36 | -------------------------------------------------------------------------------- /R/cli_bindings.R: -------------------------------------------------------------------------------- 1 | 2 | ok_cli <- function() { 3 | is_available("cli") 4 | } 5 | 6 | cli_bs <- function() { 7 | if (ok_cli()) { 8 | paste0(cli::symbol$menu, " ") 9 | } else { 10 | "* " 11 | } 12 | } 13 | cli_tick <- function() { 14 | if (ok_cli()) { 15 | cli::symbol$tick 16 | } else { 17 | "V" 18 | } 19 | } 20 | cli_cross <- function() { 21 | if (ok_cli()) { 22 | cli::symbol$cross 23 | } else { 24 | "X" 25 | } 26 | } 27 | 28 | 29 | cli_bb <- function(x) { 30 | if (ok_cli()) { 31 | cli::style_bold(cli::col_blue(x)) 32 | } else { 33 | x 34 | } 35 | } 36 | 37 | cli_b <- function(x) { 38 | if (ok_cli()) { 39 | cli::col_blue(x) 40 | } else { 41 | x 42 | } 43 | } 44 | 45 | cli_r <- function(x) { 46 | if (ok_cli()) { 47 | cli::col_red(x) 48 | } else { 49 | x 50 | } 51 | } 52 | 53 | cli_g <- function(x) { 54 | if (ok_cli()) { 55 | cli::col_green(x) 56 | } else { 57 | x 58 | } 59 | } 60 | 61 | cli_br <- function(x) { 62 | if (ok_cli()) { 63 | cli::style_bold(cli::col_red(x)) 64 | } else { 65 | x 66 | } 67 | } 68 | 69 | cli_box <- function(x, ...) { 70 | if (ok_cli()) { 71 | cli::cat_boxx(x, ...) 72 | } else { 73 | cat(x, sep = "\n") 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /R/collate_columns.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | #' Collate Columns Based on Content 5 | #' 6 | #' @description After [`compose_cells`][compose_cells()], this function rearranges and rename attribute-columns in order to 7 | #' make columns properly aligned, based on the content of the columns. 8 | #' 9 | #' @param composed_data output of [`compose_cells`][compose_cells()] (preferably not processed) 10 | #' @param combine_threshold a numerical threshold (between 0-1) for content-based collation of columns. (Default 1) 11 | #' @param rest_cols number of rest columns (beyond `combine_threshold` joins these many numbers of columns to keep) 12 | #' @param retain_other_cols whether to keep other intermediate (and possibly not so important) columns. (Default `FALSE`) 13 | #' @param retain_cell_address whether to keep columns like (`row`, `col`, `data_block`). 14 | #' This may be required for [`traceback`][cell_composition_traceback()] (Default `FALSE`) 15 | #' 16 | #' @return A column collated data.frame 17 | #' 18 | #' @details 19 | #' * **Dependency on _stringdist_**: If you have \code{\link[stringdist:stringdist-package]{stringdist}} installed, 20 | #' the approximate string matching will be enhanced. There may be variations in outcome if you have `stringdist` 21 | #' vs if you don't have it. 22 | #' * **Possibility of randomness**: If the attribute column is containing many distinct values, then a column representative sample will be drawn. 23 | #' Hence it is always recommended to [`set.seed`][base::set.seed()] if reproducibility is a matter of concern. 24 | #' 25 | #' @export 26 | #' 27 | collate_columns <- function(composed_data, 28 | combine_threshold = 1, 29 | rest_cols = Inf, 30 | retain_other_cols = FALSE, 31 | retain_cell_address = FALSE) { 32 | ok <- FALSE 33 | 34 | defcols_this <- defcols 35 | if (is.data.frame(composed_data)) { 36 | if (!utils::hasName(composed_data, "table_tag")) { 37 | defcols_this <- setdiff(defcols_this, "table_tag") 38 | } 39 | if (all(utils::hasName(composed_data, defcols_this))) { 40 | ok <- TRUE 41 | 42 | if (utils::hasName(composed_data, "table_tag")) { 43 | dcl <- composed_data %>% 44 | group_by(data_block, table_tag) 45 | } else { 46 | dcl <- composed_data %>% 47 | group_by(data_block) 48 | } 49 | 50 | dcl <- dcl %>% 51 | group_split() %>% 52 | map(~ { 53 | .d <- ungroup(.x) 54 | this_cols <- colnames(.d) 55 | nm_cols <- this_cols[stringr::str_detect(this_cols, "row|col|corner")] 56 | nm_cols <- setdiff(nm_cols, defcols_this) 57 | if (length(nm_cols) == 0) { 58 | nm_cols <- setdiff(this_cols, defcols_this) 59 | } 60 | .d <- .d[c(defcols_this, nm_cols)] 61 | na_c <- .d %>% map_lgl(~ is.na(.x) %>% all()) 62 | .d[!na_c] 63 | }) 64 | } 65 | } else { 66 | # data.frame is a list, first data.frame check is required 67 | if (is.list(composed_data)) { 68 | if (all(map_lgl(composed_data, is.data.frame))) { 69 | if (!any(map_lgl(composed_data, ~ utils::hasName(.x, "table_tag")))) { 70 | defcols_this <- setdiff(defcols_this, "table_tag") 71 | } 72 | if (all(map_lgl(composed_data, ~ all(utils::hasName(.x, defcols_this))))) { 73 | ok <- TRUE 74 | dcl <- composed_data 75 | } 76 | } 77 | } 78 | } 79 | 80 | 81 | 82 | if (!ok) { 83 | abort("The argument composed_data has to be output of compose_cells. Given composed_data has no known format.") 84 | } 85 | 86 | if (length(dcl) == 1) { 87 | out_d <- dcl[[1]] 88 | 89 | colnames(out_d) <- stringr::str_replace_all(colnames(out_d), "uncollated_", "old_uc_") 90 | colnames(out_d) <- stringr::str_replace_all(colnames(out_d), "collated_", "old_c_") 91 | 92 | restcols <- setdiff(colnames(out_d), defcols_this) 93 | if (length(restcols) > 0) { 94 | cn_map_0 <- tibble(cn = restcols) %>% 95 | mutate(is_major = stringr::str_detect(tolower(cn), "major")) %>% 96 | arrange(cn) %>% 97 | mutate(sn = seq_along(cn), sn_m = sn - is_major * (10^10)) %>% 98 | arrange(sn_m) %>% 99 | mutate(fsn = seq_along(cn), new_cn = paste0("collated_", fsn)) %>% 100 | select(cn, new_cn) 101 | 102 | for (i in seq_along(cn_map_0$cn)) { 103 | colnames(out_d)[which(colnames(out_d) == cn_map_0$cn[i])] <- cn_map_0$new_cn[i] 104 | } 105 | } 106 | } else { 107 | out_d <- dcl %>% reduce(reduce_2dfs, 108 | combine_th = combine_threshold, 109 | rest_cols = rest_cols, 110 | retain_other_cols = retain_other_cols 111 | ) 112 | } 113 | 114 | 115 | 116 | 117 | 118 | if (!retain_cell_address) { 119 | out_d <- out_d[setdiff(colnames(out_d), c("row", "col", "data_block"))] 120 | } 121 | 122 | out_d[sort(colnames(out_d))] 123 | } 124 | -------------------------------------------------------------------------------- /R/collate_columns_etc.R: -------------------------------------------------------------------------------- 1 | 2 | norm_this <- function(x) { 3 | m <- min(x, na.rm = TRUE) 4 | M <- max(x, na.rm = TRUE) 5 | if (m == M) { 6 | if (m > 0.5) { 7 | x <- rep(1, length(x)) 8 | } else { 9 | x <- rep(0, length(x)) 10 | } 11 | } else { 12 | x <- (x - m) / (M - m) 13 | } 14 | x 15 | } 16 | 17 | 18 | # possibly table tag if across table collation is required 19 | defcols <- c("row", "col", "value", "data_block", "table_tag") 20 | -------------------------------------------------------------------------------- /R/common_data_patterns.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | #@Dev 4 | # quick added may need to think through about how to add it properly. 5 | # common data patterns 6 | cdp_va_classify_value_rect <- function(d){ 7 | val_range <- d %>% 8 | filter(type == "value") %>% 9 | summarise(mr = min(row), mc = min(col), Mr = max(row), Mc =max(col)) 10 | d$type[ 11 | d$row>=val_range$mr & 12 | d$row<=val_range$Mr & 13 | d$col>=val_range$mc & 14 | d$col<=val_range$Mc 15 | ] <- "value" 16 | d 17 | } 18 | 19 | -------------------------------------------------------------------------------- /R/compact_gid_maps.R: -------------------------------------------------------------------------------- 1 | 2 | # diffrent split resulted in same attr_gid 3 | # these will be compated to single group 4 | compact_gid_maps <- function(gid_map, admap_main) { 5 | gid_map_raw <- gid_map$group_id_map 6 | 7 | foot_prints <- gid_map_raw %>% 8 | group_by(gid) %>% 9 | group_split() %>% 10 | map_df(~ .x %>% 11 | distinct(row, col, gid) %>% 12 | arrange(row, col) %>% 13 | summarise(gid = gid[1], fp = paste0(row, ",", col, collapse = ";"))) 14 | 15 | ngmap <- foot_prints %>% 16 | group_by(fp) %>% 17 | mutate(new_gid = min(gid)) %>% 18 | ungroup() %>% 19 | distinct(gid, new_gid) 20 | 21 | gid_map_raw_new <- gid_map_raw %>% left_join(ngmap, by = "gid") 22 | gid_map_raw_new <- gid_map_raw_new %>% 23 | mutate(new_gid = if_else(is.na(new_gid), gid, new_gid)) %>% 24 | select(-gid) %>% 25 | rename(gid = new_gid) %>% 26 | distinct() 27 | 28 | gid_map$group_id_map <- gid_map_raw_new 29 | gid_map$group_id_boundary <- get_group_id_boundary(gid_map_raw_new) 30 | 31 | admap_main_raw_map_new <- admap_main$raw_map %>% 32 | left_join(ngmap %>% rename(attr_gid = gid, new_attr_gid = new_gid), by = "attr_gid") 33 | admap_main_raw_map_new <- admap_main_raw_map_new %>% 34 | mutate(new_attr_gid = if_else(is.na(new_attr_gid), attr_gid, new_attr_gid)) %>% 35 | select(-attr_gid) %>% 36 | rename(attr_gid = new_attr_gid) %>% 37 | distinct() 38 | 39 | admap_main$raw_map <- admap_main_raw_map_new 40 | 41 | admap_main$map <- admap_main$raw_map %>% 42 | distinct(attr_gid, data_gid, direction, direction_group, dist, attr_group) 43 | 44 | # dimesion analysis is not kept 45 | 46 | list(gid_map = gid_map, admap = admap_main) 47 | } 48 | -------------------------------------------------------------------------------- /R/detect_LibreOffice.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | # sallow level detection of LibreOffice for support of doc 4 | # check this https://github.com/hrbrmstr/docxtractr/issues/23 5 | detect_LibreOffice <- function(type = 1) { 6 | out <- FALSE 7 | if (is_available("docxtractr")) { 8 | if (!identical(type, 2)) { 9 | get_lo <- function() { 10 | # an internal function of docxtractr 11 | # this will work even if they change `lo_find` 12 | # as then it will be operational by 2nd option below 13 | lo_find() 14 | } 15 | # this is to avoid docxtractr's internal lo_find 16 | environment(get_lo) <- environment(docxtractr::read_docx) 17 | test <- try(get_lo(), silent = TRUE) 18 | if (!inherits(test, "try-error")) { 19 | if (is.character(test)) { 20 | if (file.exists(test)) { 21 | out <- TRUE 22 | } 23 | } 24 | } 25 | } 26 | 27 | if (identical(type, 2) & !out) { 28 | # heuristic option (should work even when lo_find change) 29 | bad_doc <- "*?@:bad.doc" 30 | try(docxtractr::read_docx(bad_doc), silent = TRUE) 31 | test_op <- options("path_to_libreoffice")[[1]] 32 | if (!is.null(test_op)) { 33 | if (is.character(test_op)) { 34 | if (file.exists(test_op)) { 35 | out <- TRUE 36 | } 37 | } 38 | } 39 | } 40 | } 41 | out 42 | } 43 | -------------------------------------------------------------------------------- /R/extend_data_block.R: -------------------------------------------------------------------------------- 1 | 2 | extend_data_block <- function(dbry, att_map, d_att) { 3 | att_map %>% 4 | select(gid_att = attr_gid, gid = data_gid) %>% 5 | right_join(dbry, by = "gid") %>% 6 | left_join(d_att$group_id_boundary, by = c("gid_att" = "gid")) %>% 7 | mutate( 8 | r_min = pmin(r_min.x, r_min.y, na.rm = TRUE), 9 | c_min = pmin(c_min.x, c_min.y, na.rm = TRUE), 10 | r_max = pmax(r_max.x, r_max.y, na.rm = TRUE), 11 | c_max = pmax(c_max.x, c_max.y, na.rm = TRUE) 12 | ) %>% 13 | group_by(gid) %>% 14 | summarise( 15 | r_min = min(r_min, na.rm = TRUE), 16 | c_min = min(c_min, na.rm = TRUE), 17 | r_max = max(r_max, na.rm = TRUE), 18 | c_max = max(c_max, na.rm = TRUE) 19 | ) 20 | } 21 | -------------------------------------------------------------------------------- /R/file_etc.R: -------------------------------------------------------------------------------- 1 | this_file_ext <- function(fn) { 2 | x <- stringr::str_split(fn, "\\.") 3 | 4 | x %>% map_chr(~ { 5 | if (length(.x) == 1) { 6 | "" 7 | } else { 8 | rev(.x)[1] 9 | } 10 | }) 11 | } 12 | 13 | is_txt_file <- function(fn) { 14 | f <- file(fn, "rb", raw = TRUE) 15 | bytes <- readBin(f, "int", 1000, size = 1, signed = FALSE) 16 | close(f) 17 | return(max(bytes) <= 128) 18 | } 19 | 20 | 21 | 22 | common_file_error <- function(fn) { 23 | if (missing(fn)) { 24 | abort("No file name given") 25 | } 26 | if (length(fn) != 1) { 27 | abort("only one file name supported") 28 | } 29 | if (!file.exists(fn)) { 30 | abort("file does not exists (or possibly you do not have permission)") 31 | } 32 | } 33 | 34 | crude_format_from_signature <- function(fn) { 35 | common_file_error(fn) 36 | 37 | f_8 <- readBin(fn, n = 8, what = "raw") 38 | # as xls and doc both have same magic number "D0 CF 11 E0 A1 B1 1A E1" 39 | # many other have it though 40 | # ref : https://asecuritysite.com/forensics/magic 41 | xls_doc_magic <- as.raw(c(0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1, 0x1a, 0xe1)) 42 | # either xlsx or docx 43 | # as xlsx and docx both have same magic number "50 4B 03 04" 44 | xlsx_docx_magic <- as.raw(c(0x50, 0x4b, 0x03, 0x04)) 45 | pdf_magic <- as.raw(c(0x25, 0x50, 0x44, 0x46)) 46 | 47 | if (identical(f_8, xls_doc_magic)) { 48 | return("xls_doc") 49 | } 50 | if (identical(f_8[1:4], xlsx_docx_magic)) { 51 | return("xlsx_docx") 52 | } 53 | if (identical(f_8[1:4], pdf_magic)) { 54 | return("pdf") 55 | } 56 | return("unknown") 57 | } 58 | -------------------------------------------------------------------------------- /R/fj.R: -------------------------------------------------------------------------------- 1 | 2 | fj <- function(x, y, 3 | join_by = c("row", "col", "value", "data_block"), 4 | sallow_join = FALSE, sep = " :: ") { 5 | c1 <- colnames(x) 6 | c2 <- colnames(y) 7 | comm_cols <- c1 %>% 8 | intersect(c2) %>% 9 | setdiff(join_by) 10 | chk <- comm_cols %>% 11 | length() 12 | extra <- NULL 13 | if (chk > 0) { 14 | if (identical(sallow_join, TRUE)) { 15 | # try to merge the columns 16 | # check first content 17 | # need to update x, y so using for loop 18 | 19 | fmg_wc <- full_join(x, y, by = join_by, suffix = c(".fj1", ".fj2")) 20 | 21 | for (cn in comm_cols) { 22 | xc <- fmg_wc[[paste0(cn, ".fj1")]] %>% stringr::str_trim() 23 | yc <- fmg_wc[[paste0(cn, ".fj2")]] %>% stringr::str_trim() 24 | 25 | xc[is.na(xc)] <- "" 26 | yc[is.na(yc)] <- "" 27 | 28 | fmg_wc[[paste0(cn, ".fj1")]] <- xc 29 | fmg_wc[[paste0(cn, ".fj2")]] <- yc 30 | 31 | # back to NA again to remove them from comparison 32 | xc[nchar(xc) == 0] <- NA 33 | yc[nchar(yc) == 0] <- NA 34 | 35 | if (all(xc == yc, na.rm = TRUE)) { 36 | # both the columns are actually equal 37 | # removing from x, y 38 | fmg_wc[[cn]] <- fmg_wc[[paste0(cn, ".fj1")]] 39 | x[[cn]] <- NULL 40 | y[[cn]] <- NULL 41 | } else { 42 | # string join in x 43 | fmg_wc[[cn]] <- paste0(fmg_wc[[paste0(cn, ".fj1")]], sep, fmg_wc[[paste0(cn, ".fj2")]]) 44 | # removing from x, y 45 | x[[cn]] <- NULL 46 | y[[cn]] <- NULL 47 | } 48 | } 49 | 50 | extra <- unique(fmg_wc[c(join_by, comm_cols)]) 51 | } else { 52 | abort(paste( 53 | "unexpected error while joining.", 54 | "(Please contact developer)" 55 | )) 56 | } 57 | } 58 | f0 <- full_join(x, y, by = join_by) 59 | if (!is.null(extra)) { 60 | if (is.data.frame(extra)) { 61 | if (nrow(extra) > 0) { 62 | if (length(setdiff(intersect(colnames(extra), colnames(f0)), join_by)) == 0) { 63 | f0 <- full_join(f0, extra, by = join_by) 64 | } 65 | } 66 | } 67 | } 68 | f0 69 | } 70 | -------------------------------------------------------------------------------- /R/generics-read_cell_part.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | #' @export 4 | print.read_cell_part <- function(x, ...) { 5 | msg <- cli_bb("A partial read_cell") 6 | msg <- paste0( 7 | msg, 8 | "\n", 9 | cli_b("At stage "), x$stage 10 | ) 11 | cat(msg) 12 | } 13 | -------------------------------------------------------------------------------- /R/get_all_col_representative.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | get_col_representative <- function(x, cut_th = 500L, lower_it = TRUE, silent = TRUE) { 5 | x <- x[!is.na(x)] 6 | x <- unique(x) 7 | x <- as.character(x) 8 | if (lower_it) { 9 | x <- tolower(x) 10 | } 11 | x <- stringr::str_trim(x) 12 | x <- unique(x) 13 | dxt <- tibble(x, nc = nchar(x)) 14 | dxt <- dxt[dxt$nc > 0, ] 15 | x <- dxt$x 16 | if (length(x) > cut_th) { 17 | if (!silent) { 18 | message(paste0( 19 | "Representatives for column is selected based on a sample.", 20 | "\nYou may need to set.seed() to have a reproducible outcome" 21 | )) 22 | } 23 | dxt <- dxt %>% arrange(nc) 24 | fidx <- round(seq(from = 1.5, to = nrow(dxt) - 0.5, length.out = 10)) 25 | dxtr <- dxt[-fidx, ] 26 | x1 <- sample(dxtr$x, max(round(cut_th / 10), min(4, length(dxtr$x)))) 27 | x2 <- dxt$x[fidx] 28 | x <- unique(x1, x2) 29 | } 30 | sort(unique(x)) 31 | } 32 | 33 | get_all_col_representative <- function(d, except_cols = defcols, ...) { 34 | d <- d[setdiff(colnames(d), except_cols)] 35 | crs <- d %>% map(get_col_representative, ...) 36 | crsd <- tibble(nn = names(crs), tinf = crs %>% map_chr(~ paste0(sort(.x), collapse = " :: "))) 37 | crsd <- crsd %>% mutate(tndup = !duplicated(tinf)) 38 | crs <- crs[crsd$nn[crsd$tndup]] 39 | 40 | crs 41 | } 42 | -------------------------------------------------------------------------------- /R/get_cells_from_admap.R: -------------------------------------------------------------------------------- 1 | 2 | get_cells_from_admap <- function(admap, d_dat, d_att) { 3 | cells <- d_dat$group_id_map %>% 4 | select(gid, row, col) %>% 5 | mutate(cell_group_type = "data") 6 | 7 | cells <- admap$map %>% 8 | mutate(cell_group_type = paste0(attr_group, "_attr")) %>% 9 | select(attr_gid, data_gid, cell_group_type) %>% 10 | left_join(d_att$group_id_map, by = c("attr_gid" = "gid")) %>% 11 | select(gid = data_gid, row, col, cell_group_type) %>% 12 | bind_rows(cells) %>% 13 | distinct() 14 | 15 | cells 16 | } 17 | -------------------------------------------------------------------------------- /R/get_definiteness_checks.R: -------------------------------------------------------------------------------- 1 | 2 | get_definiteness_checks <- function(definiteness_details, silent = TRUE) { 3 | definiteness_checks <- list( 4 | # each gid (attr) should map to only one data_gid (this can happen) 5 | gid_att = nrow(definiteness_details$gid_att) == 0, 6 | # all data id should have at least one NS and WE attr_gid 7 | # (this also can happen then data_gid joining maybe required) 8 | gid_data = nrow(definiteness_details$gid_data) == 0, 9 | all_gid_att = length(definiteness_details$all_gid_att) == 0 10 | ) 11 | 12 | if (!definiteness_checks$gid_att) { 13 | attr(definiteness_checks$gid_att, "msg") <- 14 | "At least one attribute group-id (gid) is mapped to more than one data_gid (this can happen for overlapped blocks)" 15 | } 16 | 17 | if (!definiteness_checks$gid_data) { 18 | attr(definiteness_checks$gid_data, "msg") <- 19 | "At least one data group-id (gid) is lacking either NS or WE direction_group attirbute (this means 2D data block has only 1D attribute)" 20 | } 21 | 22 | if (!definiteness_checks$all_gid_att) { 23 | attr(definiteness_checks$all_gid_att, "msg") <- 24 | "At least one attribute group-id (gid) is not mapped with any block" 25 | } 26 | 27 | if (!all(unlist(definiteness_checks))) { 28 | if (!silent) { 29 | warn("Heuristic data detection may not be correct. Look at definiteness_checks.") 30 | } 31 | } 32 | 33 | definiteness_checks 34 | } 35 | 36 | 37 | get_definiteness_details <- function(admap_raw, all_attr_gids) { 38 | unmapped_attr_gids <- setdiff(all_attr_gids, admap_raw$attr_gid) 39 | 40 | this_admap <- admap_raw %>% distinct(attr_gid, data_gid, direction_group) 41 | 42 | definiteness_details <- list( 43 | # each gid (attr) should map to only one data_gid (this can happen) 44 | gid_att = this_admap %>% 45 | count(attr_gid) %>% filter(n > 1), 46 | # all data id should have at least one NS and WE attr_gid 47 | # (this also can happen then data_gid joining maybe required) 48 | gid_data = this_admap %>% 49 | group_by(data_gid) %>% 50 | summarise(ch = all(c("NS", "WE") %in% direction_group)) %>% 51 | filter(!ch), 52 | all_gid_att = unmapped_attr_gids 53 | ) 54 | 55 | definiteness_details 56 | } 57 | -------------------------------------------------------------------------------- /R/get_direction.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | #' get optimal direction (in terms of `unpivotr` directions) 4 | #' 5 | #' @param d_part parts of `data_attr_map_raw` 6 | #' @details Used internally 7 | #' @keywords internal 8 | #' @return a string value denoting direction 9 | #' 10 | 11 | get_direction <- function(d_part) { 12 | directions <- get_unpivotr_direction_names() 13 | 14 | if (d_part$direction[1] %in% names(directions)) { 15 | dirs <- directions[[d_part$direction[1]]] 16 | 17 | if (length(dirs) > 1) { 18 | d0 <- d_part %>% 19 | distinct(data_gid, row = row_d, col = col_d) 20 | 21 | d0s <- d0 %>% summarise(mnr = min(row), mxr = max(row), mnc = min(col), mxc = max(col)) 22 | 23 | d1 <- tibble(row = seq(from = d0s$mnr, to = d0s$mxr, by = 1), col = d0s$mnc) %>% 24 | bind_rows( 25 | tibble(col = seq(from = d0s$mnc, to = d0s$mxc, by = 1), row = d0s$mnr) 26 | ) %>% 27 | mutate(data_gid = d0$data_gid[1]) %>% 28 | unique() 29 | 30 | a1 <- d_part %>% 31 | distinct(attr_gid, row = row_a, col = col_a) 32 | 33 | dmd <- tibble() 34 | for (dir in dirs) { 35 | dm_now <- get_direction_metric(d1, a1, dir) 36 | dmd <- dmd %>% bind_rows(tibble(dm = dm_now, dir = dir)) 37 | if (dm_now == 1) break() 38 | } 39 | dmd$dir[which.max(dmd$dm)] 40 | } else { 41 | dirs[1] 42 | } 43 | } else { 44 | abort("direction name not known.\n(have you tampered a cell-analysis?)") 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /R/get_direction_df.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | #' Get Directional Orientation for Attributes corresponding to a given data 4 | #' group 5 | #' 6 | #' @param dp single data group boundary information 7 | #' @param datt attribute data (including attribute group id) 8 | #' @details Used internally by get_data_block_information function 9 | #' @keywords internal 10 | #' @return Directional Orientation for Attributes 11 | #' 12 | get_direction_df <- function(dp, datt, allow_inside = FALSE) { 13 | directions <- list() 14 | 15 | directions$N <- datt %>% 16 | filter( 17 | row < dp$r_min, 18 | col >= dp$c_min, 19 | col <= dp$c_max 20 | ) %>% 21 | mutate(dist = (dp$r_min - row)) %>% 22 | mutate( 23 | direction = "N", 24 | direction_group = "NS", 25 | data_gid = dp$gid 26 | ) 27 | 28 | directions$S <- datt %>% 29 | filter( 30 | row > dp$r_max, 31 | col >= dp$c_min, 32 | col <= dp$c_max 33 | ) %>% 34 | mutate(dist = (row - dp$r_max)) %>% 35 | mutate( 36 | direction = "S", 37 | direction_group = "NS", 38 | data_gid = dp$gid 39 | ) 40 | 41 | directions$W <- datt %>% 42 | filter( 43 | col < dp$c_min, 44 | row >= dp$r_min, 45 | row <= dp$r_max 46 | ) %>% 47 | mutate(dist = (dp$c_min - col)) %>% 48 | mutate( 49 | direction = "W", 50 | direction_group = "WE", 51 | data_gid = dp$gid 52 | ) 53 | 54 | directions$E <- datt %>% 55 | filter( 56 | col > dp$c_max, 57 | row >= dp$r_min, 58 | row <= dp$r_max 59 | ) %>% 60 | mutate(dist = (col - dp$c_max)) %>% 61 | mutate( 62 | direction = "E", 63 | direction_group = "WE", 64 | data_gid = dp$gid 65 | ) 66 | 67 | # corner directions 68 | 69 | directions$NW <- datt %>% 70 | filter( 71 | row < dp$r_min, 72 | col < dp$c_min 73 | ) %>% 74 | mutate(dist = sqrt((dp$r_min - row)^2 + (dp$c_min - col)^2)) %>% 75 | mutate( 76 | direction = "NW", 77 | direction_group = "corner", 78 | data_gid = dp$gid 79 | ) 80 | 81 | directions$NE <- datt %>% 82 | filter( 83 | row < dp$r_min, 84 | col > dp$c_max 85 | ) %>% 86 | mutate(dist = sqrt((dp$r_min - row)^2 + (dp$c_max - col)^2)) %>% 87 | mutate( 88 | direction = "NE", 89 | direction_group = "corner", 90 | data_gid = dp$gid 91 | ) 92 | 93 | directions$SE <- datt %>% 94 | filter( 95 | row > dp$r_max, 96 | col > dp$c_max 97 | ) %>% 98 | mutate(dist = sqrt((dp$r_max - row)^2 + (dp$c_max - col)^2)) %>% 99 | mutate( 100 | direction = "SE", 101 | direction_group = "corner", 102 | data_gid = dp$gid 103 | ) 104 | 105 | directions$SW <- datt %>% 106 | filter( 107 | row > dp$r_max, 108 | col < dp$c_min 109 | ) %>% 110 | mutate(dist = sqrt((dp$r_max - row)^2 + (dp$c_min - col)^2)) %>% 111 | mutate( 112 | direction = "SW", 113 | direction_group = "corner", 114 | data_gid = dp$gid 115 | ) 116 | 117 | if (allow_inside) { 118 | directions$INSIDE <- datt %>% 119 | filter( 120 | row >= dp$r_min, 121 | row <= dp$r_max, 122 | col >= dp$c_min, 123 | col <= dp$c_max 124 | ) %>% 125 | mutate(dist = 0) %>% 126 | mutate( 127 | direction = "INSIDE", 128 | direction_group = "inside", 129 | data_gid = dp$gid 130 | ) 131 | } 132 | 133 | 134 | direction_df <- directions %>% bind_rows() 135 | 136 | direction_df 137 | } 138 | -------------------------------------------------------------------------------- /R/get_direction_metric.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | #' get direction metric 4 | #' 5 | #' @param d1 part of d_part with data_gid 6 | #' @param a1 part of d_part with attr_gid 7 | #' @param direction direction name (compatible with `unpivotr`) 8 | #' should be one of [`get_unpivotr_direction_names`][get_unpivotr_direction_names()] 9 | #' 10 | #' @details Used internally by [`get_direction`][get_direction()] function 11 | #' @keywords internal 12 | #' @return a scaled fraction denoting coverage (1 means full coverage) for the supplied direction. 13 | #' 14 | get_direction_metric <- function(d1, a1, direction) { 15 | l1 <- try(get_direction_metric_part_raw(d1, a1, direction), silent = TRUE) 16 | 17 | if (inherits(l1, "try-error")) l1 <- 0 18 | if (length(l1) != 1) l1 <- 0 19 | if (is.na(l1)) l1 <- 0 20 | 21 | l1 / nrow(d1) 22 | } 23 | 24 | get_direction_metric_part_raw <- function(d1, a1, direction) { 25 | # suppressWarnings should be removed once unpivotr::enhead chages 26 | # this is happening as "All elements of `...` must be named." warning in tidyr 27 | # ref: https://github.com/tidyverse/tidyr/issues/714 28 | # ref: https://github.com/nacnudus/unpivotr/issues/26 29 | suppressWarnings({ 30 | d1 %>% 31 | enhead(a1, direction) %>% 32 | filter(!is.na(attr_gid)) %>% 33 | pull(attr_gid) %>% 34 | length() 35 | }) 36 | } 37 | -------------------------------------------------------------------------------- /R/get_group_id.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | #' Get Group ID for joined cells 5 | #' 6 | #' @param dat the (data or attribute) cells (in at least row-col format) 7 | #' 8 | #' @details Used internally by get_data_block_information function 9 | #' @keywords internal 10 | #' @return Group ID attached information in a list 11 | #' 12 | get_group_id <- function(dat) { 13 | digi_sep <- dat %>% 14 | summarise(rm = max(row), cm = max(col)) %>% 15 | map(~ log(.x, base = 10) %>% ceiling()) %>% 16 | unlist() %>% 17 | max() 18 | digi_sep <- digi_sep + 1 19 | 20 | # attach r_id, c_id 21 | dat %>% 22 | select(row, col) %>% 23 | arrange(row, col) %>% 24 | group_by(row) %>% 25 | mutate(cid = (c(1, diff(col)) != 1) %>% cumsum()) %>% 26 | ungroup() %>% 27 | mutate(cid = cid + row * 10^digi_sep) %>% 28 | arrange(col, row) %>% 29 | group_by(col) %>% 30 | mutate(rid = (c(1, diff(row)) != 1) %>% cumsum()) %>% 31 | ungroup() %>% 32 | mutate(rid = rid + col * 10^digi_sep) -> drc_id 33 | 34 | # attach g_id 35 | drc_id <- drc_id %>% mutate(gid = rid) 36 | 37 | n_gid <- drc_id %>% 38 | summarise(n_distinct(gid)) %>% 39 | pull(1) 40 | 41 | repeat ({ 42 | drc_id %>% 43 | group_by(cid) %>% 44 | mutate(gid = min(gid)) %>% 45 | group_by(rid) %>% 46 | mutate(gid = min(gid)) %>% 47 | ungroup() -> drc_id 48 | 49 | if (n_gid > (drc_id %>% summarise(n_distinct(gid)) %>% pull(1))) { 50 | n_gid <- drc_id %>% 51 | summarise(n_distinct(gid)) %>% 52 | pull(1) 53 | } else { 54 | break() 55 | } 56 | }) 57 | 58 | drc_id <- drc_id %>% mutate(gid = as.character(gid)) 59 | 60 | # boundary 61 | drc_boundary <- get_group_id_boundary(drc_id) 62 | 63 | list(group_id_map = drc_id, group_id_boundary = drc_boundary) 64 | } 65 | 66 | 67 | get_group_id_boundary <- function(drc_id) { 68 | drc_id %>% 69 | group_by(gid) %>% 70 | summarise(r_min = min(row), c_min = min(col), r_max = max(row), c_max = max(col)) 71 | } 72 | 73 | 74 | get_group_id_join_gids <- function(old_group_id_info, gid_map) { 75 | old_group_id_info$group_id_map <- old_group_id_info$group_id_map %>% 76 | left_join(gid_map, by = "gid") %>% 77 | mutate(new_gid = if_else(is.na(new_gid), gid, new_gid)) %>% 78 | select(-gid) %>% 79 | rename(gid = new_gid) 80 | old_group_id_info$group_id_boundary <- get_group_id_boundary(old_group_id_info$group_id_map) 81 | old_group_id_info 82 | } 83 | -------------------------------------------------------------------------------- /R/get_links_df.R: -------------------------------------------------------------------------------- 1 | 2 | get_links_df <- function(gid_joins) { 3 | gids <- unlist(gid_joins) %>% 4 | unique() %>% 5 | sort() 6 | ldf <- NULL 7 | repeat({ 8 | l0 <- linked_to(gids[1], gid_joins) 9 | ldf <- ldf %>% bind_rows(tibble(gid = l0, new_gid = min(l0))) 10 | gids <- setdiff(gids, ldf$gid) 11 | if (length(gids) == 0) break() 12 | }) 13 | ldf 14 | } 15 | 16 | linked_to_part <- function(x, gid_joins) { 17 | gid_joins %>% 18 | map(~ { 19 | if (x %in% .x) { 20 | .x 21 | } else { 22 | NULL 23 | } 24 | }) %>% 25 | unlist() %>% 26 | unique() %>% 27 | sort() 28 | } 29 | 30 | linked_to <- function(x, gid_joins) { 31 | l0 <- linked_to_part(x, gid_joins) 32 | repeat({ 33 | l1 <- l0 %>% 34 | map(~ linked_to_part(.x, gid_joins)) %>% 35 | unlist() %>% 36 | unique() %>% 37 | sort() 38 | if (!identical(l1, l0)) { 39 | l0 <- l1 40 | } else { 41 | break() 42 | } 43 | }) 44 | 45 | l1 46 | } 47 | -------------------------------------------------------------------------------- /R/get_unpivotr_direction_names.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | #' get unpivotr direction names 4 | #' 5 | #' @details Kept for compatibility. Used internally by get_direction function. 6 | #' @keywords internal 7 | #' @return directions as used in unpivotr package with directional grouping 8 | #' 9 | get_unpivotr_direction_names <- function() { 10 | 11 | # direction order 12 | # "N" "NNW" "NNE" "ABOVE" 13 | # "W" "WNW" "WSW" "LEFT" 14 | # "S" "SSW" "SSE" "BELOW" 15 | # "E" "ENE" "ESE" "RIGHT" 16 | 17 | list( 18 | N = c("N", "NNW", "NNE", "ABOVE"), 19 | W = c("W", "WNW", "WSW", "LEFT"), 20 | S = c("S", "SSW", "SSE", "BELOW"), 21 | E = c("E", "ENE", "ESE", "RIGHT"), 22 | NE = "NNE", 23 | NW = "NNW", 24 | SE = "SSE", 25 | SW = "SSW" 26 | ) 27 | } 28 | -------------------------------------------------------------------------------- /R/global_vars.R: -------------------------------------------------------------------------------- 1 | ## fix for R CMD check 2 | if (getRversion() >= "2.15.1") { 3 | gvars <- c( 4 | 5 | ######## 6 | # for dplyr oprations : non-standard evaluation (nse) 7 | 8 | ".", "ad", "ag", 9 | "aid", "attr_gid", "attr_gid_split", "attr_group", 10 | "c_dim", "c_dim_data", "c_max", "c_max.x", 11 | "c_max.y", "c_min", "c_min.x", "c_min.y", 12 | "ccn", "cdt", "cell_group_type", "ch", 13 | "cid", "cn", "cn_id", "cn_id_", 14 | "cname", "cname_new", "cname_ord", "coc", 15 | "col_a", "col_d", "col_names", "col_orig", 16 | "d", "d1", "d2", "data_block", 17 | "data_gid", "data_type", "data_types", "date_raw", 18 | "decision", "dir_n", "direction", "direction_basic", 19 | "direction_group", "dist", "dist_", "dty", 20 | "dummy_order", "file_type", "full_dim", "full_dim_orig", 21 | "g_id_a", "g_id_e", "g_id_v", "gid", 22 | "implemented", "is_blank", "is_blank_not_num_c", "is_blank_not_num_cb", 23 | "is_full_dim_present", "is_num", "m_dist", "md", 24 | "n_att", "n_dirs", "natural_gid", "nc", 25 | "new_attr_gid", "new_attr_group", "new_dist", "new_gid", 26 | "new_type", "new_type_c", "new_type_r", "not_num_c", 27 | "not_num_cb", "nt", "num_and_pm", "num_c", 28 | "num_c_len", "num_try", "optional_cols", "package", 29 | "pkg_installed", "present_num_c_b", "r", "r_dim", 30 | "r_dim_data", "r_max", "r_max.x", "r_max.y", 31 | "r_min", "r_min.x", "r_min.y", "raw_value", 32 | "rc", "rc_n", "rel_dim", "rid", 33 | "RN", "row_a", "row_d", "row_orig", 34 | "sheet", "support_possible", "this_attr_max_rel", "txt", 35 | "txt_orig", "txt_size_", "type", "val_type", 36 | "value", "value_chk", "id", "block", 37 | "cn_id_raw", "cr", "n1", "n2", 38 | "new_name", "old_name", "tinf", "fsn", 39 | "is_major", "new_cn", "sn", "sn_m", 40 | "table_tag", "fp", "file_type_raw", "original_file_type", 41 | 42 | ######## 43 | # for shiny 44 | 45 | "absolutePanel", "actionButton", "br", 46 | "brushedPoints", "brushOpts", "callModule", "checkboxInput", 47 | "clickOpts", "conditionalPanel", "dialogViewer", "div", 48 | "h3", "h5", "icon", "incProgress", 49 | "is.reactive", "isolate", "nearPoints", "NS", 50 | "observe", "observeEvent", "plotOutput", "radioButtons", 51 | "reactive", "reactiveVal", "removeNotification", "renderPlot", 52 | "runGadget", "selectizeInput", "shinyApp", "showNotification", 53 | "sliderInput", "span", "stopApp", "tags", 54 | "updateSelectizeInput", "wellPanel", "withProgress", 55 | 56 | ######## 57 | # for miniUI 58 | 59 | "gadgetTitleBar", "miniButtonBlock", "miniContentPanel", 60 | "miniPage", "miniTabPanel", "miniTabstripPanel", 61 | 62 | ######## 63 | # for rJava and docxtractr internal 64 | 65 | "lo_find", ".jnew" 66 | ) 67 | 68 | utils::globalVariables(gvars) 69 | rm(gvars) 70 | } 71 | -------------------------------------------------------------------------------- /R/is_attachable.R: -------------------------------------------------------------------------------- 1 | 2 | is_attachable <- function(gid1, gid2, group_info, whole_data, data_attr_map) { 3 | 4 | # should have similar major sides (of attributes) 5 | if (!identical( 6 | data_attr_map %>% filter(data_gid == gid1) %>% pull(direction) %>% sort(), 7 | data_attr_map %>% filter(data_gid == gid2) %>% pull(direction) %>% sort() 8 | )) { 9 | return(FALSE) 10 | } 11 | 12 | # if any intersecting cell present 13 | gc1 <- group_info$group_id_map %>% 14 | filter(gid == gid1) %>% 15 | select(row, col) 16 | gc2 <- group_info$group_id_map %>% 17 | filter(gid == gid2) %>% 18 | select(row, col) 19 | gcj <- gc1 %>% inner_join(gc2, by = c("row", "col")) 20 | if (nrow(gcj) > 0) { 21 | return(TRUE) 22 | } 23 | 24 | # should have no other entry within the enclosed combined boundary 25 | this_group_info <- group_info 26 | this_group_info$group_id_map <- this_group_info$group_id_map %>% filter(gid %in% c(gid1, gid2)) 27 | this_group_info <- get_group_id_join_gids( 28 | this_group_info, 29 | tibble(gid = c(gid1, gid2), new_gid = gid1) 30 | ) 31 | combined_boundary <- this_group_info$group_id_boundary 32 | this_region_data <- whole_data %>% 33 | filter( 34 | row <= combined_boundary$r_max, 35 | row >= combined_boundary$r_min, 36 | col <= combined_boundary$c_max, 37 | col >= combined_boundary$c_min 38 | ) 39 | this_region_data_rest <- this_region_data %>% 40 | anti_join(this_group_info$group_id_map, by = c("row", "col")) %>% 41 | filter(type %in% c("value", "attribute")) 42 | 43 | if (nrow(this_region_data_rest) > 0) { 44 | return(FALSE) 45 | } 46 | 47 | return(TRUE) 48 | } 49 | -------------------------------------------------------------------------------- /R/is_available.R: -------------------------------------------------------------------------------- 1 | 2 | # this is designed to test and simulate 3 | # non availability of certain packages 4 | is_available <- function(pkgname) { 5 | if (pkgname %in% tidycells_pkg_env$na_pkgs) { 6 | return(FALSE) 7 | } 8 | # kept for safe_dependency_check 9 | if (pkgname %in% names(tidycells_pkg_env$safe_dependency_check)) { 10 | chk <- tidycells_pkg_env$safe_dependency_check[[pkgname]]() 11 | if (isFALSE(chk)) { 12 | return(FALSE) 13 | } 14 | } 15 | rlang::is_installed(pkgname) 16 | } 17 | 18 | not_available <- function(pkgs, add = TRUE) { 19 | if (missing(pkgs)) { 20 | tidycells_pkg_env$na_pkgs <- NULL 21 | } else { 22 | if (length(pkgs) >= 1) { 23 | if (is.character(pkgs)) { 24 | if (add) { 25 | old <- tidycells_pkg_env$na_pkgs 26 | pkgs <- c(old, pkgs) 27 | } 28 | tidycells_pkg_env$na_pkgs <- pkgs 29 | } 30 | } 31 | } 32 | return(invisible(0)) 33 | } 34 | -------------------------------------------------------------------------------- /R/is_conforms_to_rcdf.R: -------------------------------------------------------------------------------- 1 | # row col df : rc_df 2 | 3 | is_conforms_to_rcdf <- function(d) { 4 | UseMethod("is_conforms_to_rcdf") 5 | } 6 | 7 | is_conforms_to_rcdf.data.frame <- function(d) { 8 | msg <- character(0) 9 | lvl <- 0 10 | 11 | decision <- FALSE 12 | 13 | if (hasName(d, "row") & 14 | hasName(d, "col")) { 15 | int_chk <- FALSE 16 | 17 | if (is.integer(d$row) & is.integer(d$col)) { 18 | int_chk <- TRUE 19 | } else { 20 | suppressMessages(suppressWarnings({ 21 | 22 | # delete potential columns with same name 23 | # this is for checking only 24 | d$row_orig <- NULL 25 | d$col_orig <- NULL 26 | d$chk <- NULL 27 | # (so that no duplicate name come in following section of code) 28 | 29 | d <- d %>% 30 | rename( 31 | row_orig = row, 32 | col_orig = col 33 | ) %>% 34 | mutate( 35 | row = as.integer(row_orig), 36 | col = as.integer(col_orig), 37 | # diff with orig 38 | row_d = as.numeric(row_orig) - row, 39 | col_d = as.numeric(col_orig) - col, 40 | # collect overall checks 41 | chk = (row_d == 0) & (col_d == 0) 42 | ) 43 | })) 44 | 45 | int_chk <- all(d$chk, na.rm = TRUE) 46 | 47 | if (length(int_chk) != 1) int_chk <- FALSE 48 | 49 | if (is.na(int_chk)) int_chk <- FALSE 50 | } 51 | 52 | 53 | if (int_chk) { 54 | rest_chk <- tibble(chks = c( 55 | all(d$row > 0), 56 | all(d$col > 0), 57 | !any(is.na(d$row)), 58 | !any(is.na(d$col)), 59 | (d %>% count(row, col) %>% filter(n > 1) %>% nrow()) == 0 60 | ), neg_msgs = c( 61 | "all rows are not positive", 62 | "all cols are not positive", 63 | "NA present in rows", 64 | "NA present in cols", 65 | "for each (row, col) pair only one cell should be present: which is not true" 66 | )) 67 | 68 | decision <- all(rest_chk$chks) 69 | msg <- rest_chk$neg_msgs[!rest_chk$chks] 70 | if (any(!rest_chk$chks)) { 71 | lvl <- max(which(!rest_chk$chks), na.rm = TRUE) + 2 72 | } 73 | } else { 74 | decision <- FALSE 75 | msg <- "row/col is not interger (or meaningful coercion not possible)" 76 | lvl <- 2 77 | } 78 | } else { 79 | decision <- FALSE 80 | msg <- "row/col column not present" 81 | lvl <- 1 + (hasName(d, "row") + hasName(d, "col")) / 2 82 | } 83 | 84 | if (length(msg)) { 85 | attr(decision, "msg") <- msg[!is.na(msg)] 86 | } 87 | 88 | if (lvl > 0) { 89 | attr(decision, "lvl") <- lvl 90 | } 91 | decision 92 | } 93 | 94 | is_conforms_to_rcdf.matrix <- function(d) { 95 | d %>% 96 | as.data.frame() %>% 97 | is_conforms_to_rcdf.data.frame() 98 | } 99 | 100 | 101 | as_rc_df <- function(d) { 102 | d0 <- d %>% select(row, col) 103 | if (!is_conforms_to_rcdf(d0)) { 104 | abort("Data does not conforms to rcdf format") 105 | } 106 | class(d0) <- setdiff(class(d0), c("cell_df", "cells")) %>% 107 | c("rc_df", .) 108 | d0 109 | } 110 | -------------------------------------------------------------------------------- /R/is_xlsx_ok.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | # function to detect if xlsx working 4 | is_xlsx_ok <- function(no_pkg_check = F) { 5 | if (!no_pkg_check) { 6 | # this is created for safe dependency checks 7 | if (!is_available("xlsx")) { 8 | return(FALSE) 9 | } 10 | } 11 | 12 | sample_xls <- system.file("extdata", "messy", "xls.pdf", package = "tidycells", mustWork = TRUE) 13 | 14 | suppressMessages( 15 | suppressWarnings( 16 | dxt <- try(xlsx::read.xlsx(sample_xls, sheetIndex = 1), silent = TRUE) 17 | ) 18 | ) 19 | 20 | if (is.data.frame(dxt)) { 21 | return(TRUE) 22 | } else { 23 | return(FALSE) 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /R/make_read_cell_part.R: -------------------------------------------------------------------------------- 1 | 2 | make_read_cell_part <- function(obj) { 3 | stage <- attr(obj, "read_cells_stage") 4 | if (is.null(stage)) { 5 | abort(paste0( 6 | "No 'read_cells_stage' attribute found!", 7 | "\nMake sure the 'obj' is outcome from read_cells with 'simplify = TRUE' and 'at_level<6'" 8 | )) 9 | } 10 | 11 | if (!(stage %in% read_cell_task_orders)) { 12 | abort("Unknown stage of the object!") 13 | } 14 | 15 | attr(obj, "read_cells_stage") <- NULL 16 | 17 | out_l <- list(stage = stage) 18 | if (stage == read_cell_task_orders[1]) { 19 | out_l$info <- list(content = obj) 20 | } 21 | 22 | if (stage == read_cell_task_orders[2]) { 23 | out_l$is_empty <- is.null(obj) 24 | out_l$cell_list <- obj 25 | } 26 | 27 | if (stage == read_cell_task_orders[3]) { 28 | out_l$is_empty <- is.null(obj) 29 | out_l$cell_list <- obj 30 | } 31 | 32 | if (stage == read_cell_task_orders[4]) { 33 | out_l$is_empty <- is.null(obj) 34 | out_l$cell_analysis_list <- obj 35 | } 36 | 37 | # kept for compatibilty maybe this is not required 38 | if (stage == read_cell_task_orders[5]) { 39 | out_l$is_empty <- is.null(obj) 40 | out_l$final_composition <- obj 41 | } 42 | 43 | class(out_l) <- read_cell_part_class 44 | 45 | out_l 46 | } 47 | -------------------------------------------------------------------------------- /R/mask_data.R: -------------------------------------------------------------------------------- 1 | # for sharing data 2 | mask_data <- function(d) { 3 | validate_cells(d) 4 | if (hasName(d, "type")) { 5 | d0 <- d %>% mutate(value = recode(type, attribute = "A", value = "0", empty = "")) 6 | } else { 7 | d0 <- d %>% mutate(value = recode(data_type, character = "A", numeric = "0")) 8 | } 9 | as_tibble(d0) 10 | } 11 | -------------------------------------------------------------------------------- /R/merge_admaps.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | merge_admaps <- function(x, y) { 4 | this <- list() 5 | this$raw_map <- x$raw_map %>% 6 | bind_rows(y$raw_map) %>% 7 | distinct() 8 | this$map <- x$map %>% 9 | bind_rows(y$map) %>% 10 | distinct() 11 | this$dimension_analysis <- list() 12 | this$dimension_analysis$data_gid_dim <- x$dimension_analysis$data_gid_dim %>% 13 | bind_rows(y$dimension_analysis$data_gid_dim) %>% 14 | distinct() 15 | this$dimension_analysis$attr_data_dim <- x$dimension_analysis$attr_data_dim %>% 16 | bind_rows(y$dimension_analysis$attr_data_dim) %>% 17 | distinct() 18 | this 19 | } 20 | -------------------------------------------------------------------------------- /R/multi_names.R: -------------------------------------------------------------------------------- 1 | 2 | ## This file keeps entries for multiple names of same functions 3 | ## 4 | ## analyse_cells <- analyze_cells 5 | 6 | 7 | #' Analyse Cells 8 | #' 9 | #' @description Note that this is an other name for `tidycells::analyze_cells`. 10 | #' After [`Value Attribute Classification`][value_attribute_classify()] done on a [`cell_df`][cell_df-class] next task to do is 11 | #' analyze it's contents for data block detection, attribute orientation identification etc. The function `analyze_cells` (and also `analyse_cells`) 12 | #' does the same for you. 13 | #' 14 | #' **Note**: 15 | #' \if{html}{If you are not sure about what package functions actually do or how they work together, 16 | #' please start with \href{../doc/tidycells-intro.html}{\code{vignette("tidycells-intro")}}.} 17 | #' \if{latex}{If you are not sure about what package functions actually do or how they work together, 18 | #' please start with \code{vignette("tidycells-intro")}.} 19 | #' 20 | #' @param d A [`cell_df`][cell_df-class] after [`Value Attribute Classification`][value_attribute_classify()] done 21 | #' @param silent logical scalar indicating whether to raise a warning if heuristic detection fails. (Default TRUE). 22 | #' 23 | #' @details it returns detailed analysis of the data structure including data block detection, attribute orientation detection etc. 24 | #' The argument `silent` is set to `TRUE` by default, as the warning will be given whenever the [`cell_analysis`][cell_analysis-class] is printed. 25 | #' 26 | #' After this step one may like to do : 27 | #' * [`compose_cells`][compose_cells()] 28 | #' 29 | #' If in an interactive session, following additional functions can be helpful for interactive visualizations: 30 | #' * [`visual_data_block_inspection`][visual_data_block_inspection()] 31 | #' * [`visual_orientation_modification`][visual_orientation_modification()] 32 | #' * [`visual_traceback`][visual_traceback()] 33 | #' 34 | #' @return Detailed analysis of the cell data structure. 35 | #' Which will be a [`cell_analysis`][cell_analysis-class] class object. 36 | #' 37 | #' @seealso [`compose_cells`][compose_cells()], [`collate_columns`][collate_columns()], [`analyze_cells`][analyze_cells()] 38 | #' @keywords internal 39 | #' @export 40 | analyse_cells <- function(d, silent = TRUE) { 41 | analyze_cells_raw(d = d, silent = silent) 42 | } 43 | -------------------------------------------------------------------------------- /R/read_excel_whole.R: -------------------------------------------------------------------------------- 1 | read_excel_whole <- function(fn) { 2 | if (is_available("xlsx")) { 3 | # preferred 4 | read_xls_from_xlsx(fn) 5 | } else { 6 | message("Using readxl to read xls. Manually check date and numeric cells. (for better result install xlsx package)") 7 | read_excel_whole_readxl(fn) 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /R/read_excel_whole_readxl.R: -------------------------------------------------------------------------------- 1 | # readxl 2 | # possible_date_range is somehow inspired by LibreOffice 3 | read_excel_whole_part_readxl <- function(fn, sheet = 1L, possible_date_range = c(as.Date("1930-01-01"), Sys.Date() + 3800)) { 4 | if (!is_available("readxl")) { 5 | abort("'readxl' package is required") 6 | } 7 | 8 | suppressWarnings({ 9 | d0 <- readxl::read_excel(fn, 10 | col_names = FALSE, col_types = "text", sheet = sheet, .name_repair = "minimal" 11 | ) 12 | d1 <- readxl::read_excel(fn, 13 | col_names = FALSE, col_types = "date", sheet = sheet, .name_repair = "minimal" 14 | ) 15 | }) 16 | 17 | possible_date_range <- as.character(possible_date_range) 18 | 19 | d0f <- as.matrix(d0) %>% as.character() 20 | d1f <- as.matrix(d1) %>% as.character() 21 | d1f <- if_else(d1f <= max(possible_date_range) & d1f >= min(possible_date_range), d1f, NA_character_) 22 | 23 | df <- if_else(is.na(d1f), d0f, d1f) 24 | 25 | d <- matrix(df, nrow = nrow(d0)) 26 | 27 | as.data.frame(d, stringsAsFactors = FALSE) %>% 28 | tibble::as_tibble() 29 | } 30 | 31 | read_excel_whole_readxl <- function(fn) { 32 | if (!is_available("readxl")) { 33 | abort("'readxl' package is required") 34 | } 35 | sheets <- readxl::excel_sheets(fn) 36 | lout <- sheets %>% map(~ read_excel_whole_part_readxl(fn, sheet = .x)) 37 | names(lout) <- sheets 38 | lout 39 | } 40 | -------------------------------------------------------------------------------- /R/read_pdf_from_tabulizer.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | read_pdf_from_tabulizer <- function(fn, no_pkg_check = F) { 4 | if (!no_pkg_check) { 5 | if (!is_available("tabulizer")) { 6 | abort("'tabulizer' package is required") 7 | } 8 | } 9 | 10 | 11 | # read in both ways [time consuming!] 12 | # as decide may not function as expected 13 | pl <- try(tabulizer::extract_tables(fn, method = "lattice"), silent = TRUE) 14 | ps <- try(tabulizer::extract_tables(fn, method = "stream"), silent = TRUE) 15 | 16 | # final 17 | pf <- NULL 18 | 19 | # if both are try-error return one of them 20 | if (inherits(pl, "try-error") & inherits(ps, "try-error")) { 21 | pf <- pl 22 | } else { 23 | if (!inherits(pl, "try-error") & is.list(pl) & inherits(ps, "try-error")) { 24 | pf <- pl 25 | } 26 | 27 | if (inherits(pl, "try-error") & !inherits(ps, "try-error") & is.list(ps)) { 28 | pf <- ps 29 | } 30 | 31 | if (!inherits(pl, "try-error") & !inherits(ps, "try-error")) { 32 | # if length unequal return the same with more length 33 | if (length(pl) > length(ps)) { 34 | pf <- pl 35 | } 36 | 37 | if (length(pl) < length(ps)) { 38 | pf <- ps 39 | } 40 | 41 | # if both of them have same length 42 | if (length(pl) == length(ps)) { 43 | # do degenrate study 44 | degenrate_l <- pl %>% 45 | map(dim) %>% 46 | map_lgl(~ any(.x < 2)) 47 | degenrate_s <- ps %>% 48 | map(dim) %>% 49 | map_lgl(~ any(.x < 2)) 50 | 51 | pf <- seq_along(pl) %>% 52 | map(~ { 53 | if (degenrate_l[.x]) { 54 | return(ps[[.x]]) 55 | } 56 | if (degenrate_s[.x]) { 57 | return(pl[[.x]]) 58 | } 59 | 60 | # prefer low area 61 | al <- dim(pl[[.x]]) %>% prod() 62 | as <- dim(ps[[.x]]) %>% prod() 63 | if (al <= as) { 64 | return(pl[[.x]]) 65 | } 66 | if (al > as) { 67 | return(ps[[.x]]) 68 | } 69 | }) 70 | } 71 | } 72 | } 73 | 74 | return(pf) 75 | } 76 | -------------------------------------------------------------------------------- /R/read_xls_from_xlsx.R: -------------------------------------------------------------------------------- 1 | 2 | read_xls_from_xlsx <- function(fn, no_pkg_check = F) { 3 | if (!no_pkg_check) { 4 | # this is created for safe dependency checks 5 | if (!is_available("xlsx")) { 6 | abort("'xlsx' package is required") 7 | } 8 | } 9 | 10 | 11 | # local functions 12 | get_date <- function(x) { 13 | if (identical(x, round(x))) { 14 | xo <- as.Date(x - 25569, origin = "1970-01-01") 15 | } else { 16 | xo <- as.POSIXct((x - 25569) * 86400, 17 | tz = "GMT", 18 | origin = "1970-01-01" 19 | ) 20 | } 21 | xo 22 | } 23 | 24 | for_a_sheet <- function(sheet) { 25 | rows <- xlsx::getRows(sheet) 26 | if (length(rows) == 0) { 27 | return(NULL) 28 | } # exit early 29 | 30 | cells <- xlsx::getCells(rows) 31 | res <- cells %>% map(xlsx::getCellValue) 32 | 33 | dat <- names(res) %>% 34 | stringr::str_split("\\.") %>% 35 | map(as.integer) %>% 36 | reduce(rbind) %>% 37 | as_tibble(.name_repair = "minimal") 38 | 39 | colnames(dat) <- c("row", "col") 40 | 41 | dat <- dat %>% mutate(raw_value = res) 42 | # credit goes directly to xlsx-R-Package creators 43 | dateUtil <- date_utils() 44 | 45 | 46 | dat <- dat %>% mutate(is_num = raw_value %>% map_lgl(is.numeric)) 47 | 48 | dat_n <- dat %>% 49 | filter(is_num) 50 | 51 | dat_c <- dat %>% 52 | filter(!is_num) %>% 53 | mutate(data_type = "chr") 54 | 55 | if (nrow(dat_c) > 0) { 56 | dat_c <- dat_c %>% 57 | mutate(chr = raw_value %>% map_chr(1)) 58 | } 59 | 60 | num_cells <- cells[dat$is_num] 61 | 62 | if (length(num_cells) > 0) { 63 | # possible dates 64 | is_date <- num_cells %>% map_lgl(dateUtil$isCellDateFormatted) 65 | 66 | dat_n <- dat_n %>% 67 | mutate(is_date = is_date) 68 | 69 | dat_n_nd <- dat_n %>% 70 | filter(!is_date) %>% 71 | mutate(data_type = "dbl") 72 | 73 | if (nrow(dat_n_nd) > 0) { 74 | dat_n_nd <- dat_n_nd %>% 75 | mutate(dbl = raw_value %>% map_dbl(1)) 76 | } 77 | 78 | 79 | dat_n_d <- dat_n %>% 80 | filter(is_date) 81 | 82 | if (nrow(dat_n_d) > 0) { 83 | dat_n_d <- dat_n_d %>% 84 | mutate(date_raw = raw_value %>% map(get_date)) 85 | 86 | dat_n_d <- dat_n_d %>% 87 | mutate(data_type = if_else( 88 | date_raw %>% map_lgl(~ inherits(.x, "POSIXct")), 89 | true = "dttm", 90 | false = "date" 91 | )) 92 | 93 | dat_n_d_dt <- dat_n_d %>% 94 | filter(data_type == "date") 95 | 96 | if (nrow(dat_n_d_dt) > 0) { 97 | dat_n_d_dt <- dat_n_d_dt %>% 98 | mutate(date = date_raw %>% map(1) %>% reduce(c)) 99 | } 100 | 101 | dat_n_d_dttm <- dat_n_d %>% 102 | filter(data_type == "dttm") 103 | 104 | if (nrow(dat_n_d_dttm) > 0) { 105 | dat_n_d_dttm <- dat_n_d_dttm %>% 106 | mutate(dttm = date_raw %>% map(1) %>% reduce(c)) 107 | } 108 | 109 | 110 | dat_n_d <- dat_n_d_dt %>% bind_rows(dat_n_d_dttm) 111 | } 112 | 113 | dat_n <- dat_n_d %>% bind_rows(dat_n_nd) 114 | } 115 | 116 | dat <- dat_c %>% bind_rows(dat_n) 117 | 118 | dat <- dat %>% arrange(row, col) 119 | dat[c("row", "col", "data_type", sort(unique(dat$data_type)))] 120 | } 121 | 122 | read_xls_for_tidycells <- function(filename) { 123 | wb <- xlsx::loadWorkbook(filename) 124 | sheets <- xlsx::getSheets(wb) 125 | 126 | sheets %>% map(for_a_sheet) 127 | } 128 | 129 | date_utils <- function() { 130 | # credit goes directly to xlsx-R-Package creators 131 | .jnew("org/apache/poi/ss/usermodel/DateUtil") 132 | } 133 | 134 | # put them in xlsx environment 135 | environment(date_utils) <- environment(xlsx::read.xlsx) 136 | 137 | suppressMessages( 138 | suppressWarnings( 139 | read_xls_for_tidycells(fn) 140 | ) 141 | ) 142 | } 143 | -------------------------------------------------------------------------------- /R/reduce_2dfs.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | get_connected_cols <- function(col_map_with_dist) { 4 | col_map_with_dist 5 | col_map <- tibble() 6 | repeat({ 7 | if (nrow(col_map_with_dist) == 0) break() 8 | col_map_this <- col_map_with_dist %>% filter(dist == min(dist)) 9 | if (nrow(col_map_this) == 0) break() 10 | col_map <- col_map %>% bind_rows(col_map_this) 11 | col_map_with_dist <- col_map_with_dist %>% filter(!(n1 %in% col_map$n1), !(n2 %in% col_map$n2)) 12 | }) 13 | col_map 14 | } 15 | 16 | reduce_2dfs <- function(dc1, dc2, combine_th = 0.6, rest_cols = Inf, retain_other_cols = FALSE) { 17 | colnames(dc1) <- stringr::str_replace_all(colnames(dc1), "uncollated_", "d1_old_uc_") 18 | colnames(dc1) <- stringr::str_replace_all(colnames(dc1), "collated_", "d1_old_c_") 19 | 20 | colnames(dc2) <- stringr::str_replace_all(colnames(dc2), "uncollated_", "d2_old_uc_") 21 | colnames(dc2) <- stringr::str_replace_all(colnames(dc2), "collated_", "d2_old_c_") 22 | 23 | 24 | cr1 <- get_all_col_representative(dc1) 25 | cr2 <- get_all_col_representative(dc2) 26 | 27 | # if either of cr1 or cr2 is empty simply rbind and return 28 | if (length(cr1) * length(cr2) == 0) { 29 | dcnew <- dc1 %>% bind_rows(dc2) 30 | return(dcnew) 31 | } 32 | 33 | all_maps <- expand.grid(n1 = names(cr1), n2 = names(cr2), stringsAsFactors = FALSE) 34 | m1 <- seq_len(nrow(all_maps)) %>% 35 | map(~ similarity_score(cr1[[all_maps$n1[.x]]], cr2[[all_maps$n2[.x]]]) %>% t()) %>% 36 | reduce(rbind) 37 | m2 <- as.data.frame(m1) %>% 38 | map(norm_this) %>% 39 | as.data.frame() 40 | # KFL : kept for later 41 | wts <- rep(1, ncol(m2)) 42 | m3 <- m2 %>% apply(MARGIN = 1, function(x) sum(x * wts) / sum(wts)) 43 | all_maps <- all_maps %>% mutate(dist = m3) 44 | 45 | all_maps_c <- get_connected_cols(all_maps) 46 | 47 | amap_ok <- all_maps_c %>% filter(dist <= combine_th) 48 | amap_not_ok <- all_maps_c %>% filter(dist > combine_th) 49 | 50 | if (nrow(amap_not_ok) > 0) { 51 | repeat({ 52 | induced_cols <- amap_not_ok %>% 53 | mutate(n1 = paste0("n1.", n1), n2 = paste0("n2.", n2)) %>% 54 | select(n1, n2) %>% 55 | unlist() %>% 56 | unique() 57 | 58 | if (length(induced_cols) > rest_cols) { 59 | amap_not_ok <- amap_not_ok %>% arrange(dist) 60 | if (rest_cols <= 0) { 61 | amap_not_ok <- amap_not_ok %>% filter(FALSE) 62 | } else { 63 | # pass a single row to amap_ok 64 | amap_ok <- amap_ok %>% bind_rows(amap_not_ok[1, ]) 65 | amap_not_ok <- amap_not_ok[-1, ] 66 | } 67 | } else { 68 | break() 69 | } 70 | }) 71 | } 72 | 73 | cmap <- tibble(new_name = NA_character_, old_name = NA_character_, block = NA_real_) %>% filter(FALSE) 74 | 75 | if (nrow(amap_ok) > 0) { 76 | amap_ok <- amap_ok %>% 77 | arrange(dist) %>% 78 | mutate(new_name = paste0("collated_", seq_along(dist))) 79 | 80 | cmap_this <- amap_ok %>% 81 | select(-dist) %>% 82 | tidyr::gather(cr, old_name, -new_name) 83 | 84 | cmap_this <- cmap_this %>% 85 | mutate(block = recode(cr, n1 = dc1$data_block[1], n2 = dc2$data_block[1])) %>% 86 | select(-cr) 87 | 88 | cmap <- cmap %>% bind_rows(cmap_this) 89 | } 90 | 91 | if (nrow(amap_not_ok) > 0) { 92 | cmap_this <- amap_not_ok %>% 93 | select(-dist) %>% 94 | tidyr::gather(cr, old_name) %>% 95 | mutate(new_name = paste0("uncollated_", seq_along(old_name))) 96 | 97 | cmap_this <- cmap_this %>% 98 | mutate(block = recode(cr, n1 = dc1$data_block[1], n2 = dc2$data_block[1])) %>% 99 | select(-cr) 100 | 101 | cmap <- cmap %>% bind_rows(cmap_this) 102 | } 103 | 104 | cmap1 <- cmap %>% filter(block == dc1$data_block[1]) 105 | cmap2 <- cmap %>% filter(block == dc2$data_block[1]) 106 | 107 | for (i in seq_len(nrow(cmap1))) { 108 | colnames(dc1)[which(colnames(dc1) == cmap1$old_name[[i]])] <- cmap1$new_name[[i]] 109 | } 110 | 111 | for (i in seq_len(nrow(cmap2))) { 112 | colnames(dc2)[which(colnames(dc2) == cmap2$old_name[[i]])] <- cmap2$new_name[[i]] 113 | } 114 | 115 | dcnew <- dc1 %>% bind_rows(dc2) 116 | 117 | if (!retain_other_cols) { 118 | nc_cols <- colnames(dcnew) %>% 119 | stringr::str_detect("collated") %>% 120 | colnames(dcnew)[.] 121 | dcnew <- dcnew[c(intersect(defcols, colnames(dcnew)), nc_cols)] 122 | } 123 | 124 | dcnew 125 | } 126 | -------------------------------------------------------------------------------- /R/refine_cols.R: -------------------------------------------------------------------------------- 1 | 2 | refine_cols <- function(dcomp_part, cn_df, sep = " :: ") { 3 | cn_df <- cn_df %>% filter(cname %in% colnames(dcomp_part)) 4 | out <- dcomp_part 5 | if (nrow(cn_df) > 0) { 6 | cn_df <- cn_df %>% 7 | group_by(ag) %>% 8 | mutate(cname_new = rc_n %>% as.factor() %>% as.integer() %>% paste0(ag, "_", .)) %>% 9 | ungroup() %>% 10 | arrange(cname_new, cname_ord) 11 | 12 | cn_l <- cn_df %>% 13 | select(cname, cname_new) %>% 14 | split(.$cname_new) 15 | 16 | out_p <- cn_l %>% 17 | map(~ { 18 | .d0 <- dcomp_part[.x$cname] 19 | .d0 <- .d0 %>% mutate_all(stringr::str_trim) 20 | dcomp_part[.x$cname_new[1]] <- .d0 %>% apply(1, paste, collapse = sep) 21 | dcomp_part[c("row", "col", .x$cname_new[1])] 22 | }) %>% 23 | reduce(fj, join_by = c("row", "col")) 24 | 25 | out <- out_p %>% fj(dcomp_part, join_by = c("row", "col")) 26 | } 27 | 28 | out 29 | } 30 | -------------------------------------------------------------------------------- /R/safe_dependency_check.R: -------------------------------------------------------------------------------- 1 | 2 | assign("safe_dependency_check", list( 3 | # this is for rJava deps 4 | xlsx = function() { 5 | chk <- F 6 | try( 7 | { 8 | if (is_xlsx_ok(no_pkg_check = T)) { 9 | sample_xls <- system.file("extdata", "messy", "xls.pdf", package = "tidycells", mustWork = TRUE) 10 | x <- read_xls_from_xlsx(sample_xls, no_pkg_check = T) 11 | if (is.data.frame(x[[1]])) { 12 | if (nrow(x[[1]]) > 0) { 13 | chk <- T 14 | } 15 | } 16 | } 17 | }, 18 | silent = T 19 | ) 20 | chk 21 | }, 22 | # for tabulizer 23 | # mainly for https://github.com/ropensci/tabulizer/issues/106 24 | tabulizer = function() { 25 | chk <- F 26 | try( 27 | { 28 | if (requireNamespace("tabulizer", quietly = T)) { 29 | sample_pdf <- system.file("extdata", "messy", "pdf.docx", package = "tidycells", mustWork = TRUE) 30 | x <- read_pdf_from_tabulizer(sample_pdf, no_pkg_check = T) 31 | if (is.matrix(x[[1]])) { 32 | if (nrow(x[[1]]) > 0) { 33 | chk <- T 34 | } 35 | } 36 | } 37 | }, 38 | silent = T 39 | ) 40 | chk 41 | } 42 | ), envir = tidycells_pkg_env) 43 | -------------------------------------------------------------------------------- /R/shiny_etc.R: -------------------------------------------------------------------------------- 1 | 2 | shiny_check <- function(force_load = FALSE) { 3 | if (!force_load) { 4 | if (!interactive()) { 5 | abort("need an interactive session for this functionality.") 6 | } 7 | } 8 | 9 | if (!is_available("shiny")) { 10 | abort("'shiny' package is required for this functionality.") 11 | } else { 12 | if (!is_available("miniUI")) { 13 | abort("'miniUI' package is required for this functionality.") 14 | } 15 | } 16 | 17 | es <- loadNamespace("shiny") 18 | try(attachNamespace(es), silent = TRUE) 19 | 20 | em <- loadNamespace("miniUI") 21 | try(attachNamespace(em), silent = TRUE) 22 | } 23 | 24 | shiny_unload <- function() { 25 | AutoUnloadShiny <- options("AutoUnloadShiny")[[1]] 26 | if (is.null(AutoUnloadShiny)) AutoUnloadShiny <- TRUE 27 | if (AutoUnloadShiny) { 28 | suppressWarnings(try(unloadNamespace("miniUI"), silent = TRUE)) 29 | suppressWarnings(try(unloadNamespace("shiny"), silent = TRUE)) 30 | } 31 | } 32 | 33 | global_objects <- function(cls = c("cell_df", "cell_analysis", "rc_df"), check_rc_df = FALSE) { 34 | cls <- match.arg(cls) 35 | 36 | objs <- ls(envir = globalenv()) 37 | 38 | cobjs <- objs %>% 39 | map_lgl(~ inherits(x = get(.x, envir = globalenv()), cls)) %>% 40 | objs[.] 41 | 42 | cobjs 43 | } 44 | 45 | command_prompt_ask <- function(title, message, default = NULL) { 46 | cat(paste0(cli_bb(message), "\n")) 47 | cm <- readline(prompt = paste0(title, ":")) 48 | if (nchar(cm) == 0) { 49 | default 50 | } else { 51 | cm 52 | } 53 | } 54 | 55 | rstudioapi_ask <- function(title, message, default = NULL, is_question = FALSE, ...) { 56 | ok <- FALSE 57 | if (is_available("rstudioapi")) { 58 | if (rstudioapi::hasFun("viewer")) { 59 | ok <- TRUE 60 | } 61 | } 62 | 63 | if (ok) { 64 | if (is_question) { 65 | rstudioapi::showQuestion(title = title, message = message, ...) 66 | } else { 67 | rstudioapi::showPrompt(title = title, message = message, default = default) 68 | } 69 | } else { 70 | command_prompt_ask(title = title, message = message, default = default) 71 | } 72 | } 73 | 74 | global_object_picker <- function(what = c("cell_df", "cell_analysis", "rc_df")) { 75 | what <- match.arg(what) 76 | objs <- global_objects(cls = what) 77 | if (length(objs) == 1) { 78 | return(invisible(get(objs, envir = globalenv()))) 79 | } 80 | if (length(objs) < 1) { 81 | abort(paste0("No ", what, " found in R global environment.")) 82 | } 83 | if (length(objs) > 1) { 84 | this_title <- paste0("Select a ", what, " to proceed") 85 | this_msg <- paste0( 86 | "Pick from one of these (type the variable name)\n", 87 | paste0(objs, collapse = ", ") 88 | ) 89 | this_msg_warned <- FALSE 90 | repeat({ 91 | sel <- rstudioapi_ask( 92 | title = this_title, 93 | message = this_msg 94 | ) 95 | if (length(sel) == 0) abort("User Cancelled") 96 | if (sel %in% objs) { 97 | break() 98 | } else { 99 | if (!this_msg_warned) { 100 | this_msg_warned <- TRUE 101 | this_msg <- paste0(this_msg, "\n(Wrong input try again)") 102 | } 103 | } 104 | }) 105 | objs <- sel 106 | return(invisible(get(objs, envir = globalenv()))) 107 | } 108 | 109 | abort("unknown error") 110 | } 111 | -------------------------------------------------------------------------------- /R/shiny_main.R: -------------------------------------------------------------------------------- 1 | common_finish <- function(ui, server, title, viewer_pane, test_this) { 2 | if (test_this) { 3 | es <- new.env() 4 | assign("ui", ui, envir = es) 5 | assign("server", server, envir = es) 6 | return(es) 7 | } 8 | 9 | if (viewer_pane) { 10 | runGadget(shinyApp(ui, server), 11 | stopOnCancel = FALSE 12 | ) 13 | } else { 14 | viewer <- dialogViewer(title, 15 | width = 900, 16 | height = 700 17 | ) 18 | runGadget(shinyApp(ui, server), 19 | viewer = viewer, 20 | stopOnCancel = FALSE 21 | ) 22 | } 23 | } 24 | 25 | shiny_app_va_classify <- function(d, viewer_pane = FALSE, test_this = FALSE) { 26 | 27 | 28 | ########## UI ########## 29 | ui <- miniPage( 30 | gadgetTitleBar("Value/Attribute Classifier"), 31 | miniTabstripPanel( 32 | ui_part_va_classify(), 33 | ui_part_crop(reset_msg = "Load initial data (With which you started this tab. To load original data reset from Classify Tab.)"), 34 | ui_part_plot_tune(selected_fill = "type"), 35 | ui_part_visualize(), 36 | id = "now_tab_main" 37 | ) 38 | ) 39 | 40 | ########## Server ########## 41 | server <- server_va_select(d) 42 | 43 | common_finish(ui, server, 44 | title = "TidyCells: Value/Attribute Classifier", 45 | viewer_pane = viewer_pane, test_this = test_this 46 | ) 47 | } 48 | 49 | shiny_app_crop <- function(d, viewer_pane = FALSE, test_this = FALSE) { 50 | 51 | 52 | ########## UI ########## 53 | ui <- miniPage( 54 | gadgetTitleBar("Data Crop"), 55 | miniTabstripPanel( 56 | ui_part_crop(), 57 | ui_part_plot_tune(), 58 | ui_part_visualize(), 59 | id = "now_tab_main" 60 | ) 61 | ) 62 | 63 | ########## Server ########## 64 | server <- server_crop(d) 65 | 66 | common_finish(ui, server, 67 | title = "TidyCells: Data Crop", 68 | viewer_pane = viewer_pane, test_this = test_this 69 | ) 70 | } 71 | 72 | shiny_app_data_block <- function(x, viewer_pane = FALSE, test_this = FALSE) { 73 | 74 | 75 | ########## UI ########## 76 | ui <- miniPage( 77 | gadgetTitleBar("Data Blocks Inspection"), 78 | miniTabstripPanel( 79 | ui_part_data_block(), 80 | ui_part_plot_tune(txt_alpha_max = 0.9, txt_alpha = 0.2, selected_fill = "type"), 81 | ui_part_visualize(), 82 | id = "now_tab_main" 83 | ) 84 | ) 85 | 86 | ########## Server ########## 87 | server <- server_data_block(x) 88 | 89 | common_finish(ui, server, 90 | title = "TidyCells: Data Blocks Inspection", 91 | viewer_pane = viewer_pane, test_this = test_this 92 | ) 93 | } 94 | 95 | shiny_app_orientation_modification <- function(x, viewer_pane = FALSE, test_this = FALSE) { 96 | 97 | 98 | ########## UI ########## 99 | ui <- miniPage( 100 | gadgetTitleBar("Data Blocks Orientation Modification"), 101 | miniTabstripPanel( 102 | ui_part_data_block(zoom_this = TRUE, plot_issues_option = FALSE), 103 | ui_part_orientation_modification(), 104 | ui_part_plot_tune(txt_alpha_max = 0.9, txt_alpha = 0.2, selected_fill = "type"), 105 | ui_part_visualize(), 106 | id = "now_tab_main" 107 | ) 108 | ) 109 | 110 | ########## Server ########## 111 | server <- server_orientation_modification(x) 112 | 113 | common_finish(ui, server, 114 | title = "TidyCells: Data Blocks Orientation Modification", 115 | viewer_pane = viewer_pane, test_this = test_this 116 | ) 117 | } 118 | 119 | shiny_app_traceback <- function(x, dcomp, viewer_pane = FALSE, test_this = FALSE) { 120 | if (!DT_present()) { 121 | abort(paste("The package 'DT' is required for this functionality", 122 | "If you want static plot, please use 'cell_traceback_plot'.", 123 | sep = "\n" 124 | )) 125 | } 126 | 127 | 128 | ########## UI ########## 129 | ui <- miniPage( 130 | gadgetTitleBar("Composition Traceback"), 131 | miniTabstripPanel( 132 | ui_part_traceback(), 133 | ui_part_data_block(zoom_this = TRUE, direction_text_this = FALSE, plot_issues_option = FALSE), 134 | ui_part_plot_tune(txt_alpha_max = 0.9, selected_fill = "type"), 135 | ui_part_visualize(), 136 | id = "now_tab_main" 137 | ) 138 | ) 139 | 140 | ########## Server ########## 141 | server <- server_traceback(x, dcomp) 142 | 143 | common_finish(ui, server, 144 | title = "TidyCells: Composition Traceback (Composition Viewer)", 145 | viewer_pane = viewer_pane, test_this = test_this 146 | ) 147 | } 148 | -------------------------------------------------------------------------------- /R/similarity_score.R: -------------------------------------------------------------------------------- 1 | 2 | ads_score <- function(ads) { 3 | ads[ads < 0] <- 0 4 | ads[ads > 10^10] <- 10^10 5 | (stats::median(ads, na.rm = TRUE) * 5 + mean(ads, na.rm = TRUE) * 4 + min(ads, na.rm = TRUE) * 1) / 10 6 | } 7 | 8 | similarity_score <- function(x, y) { 9 | comm <- intersect(x, y) 10 | bscore <- 1 - length(comm) / mean(length(x), length(y)) 11 | 12 | if (is.na(bscore)) bscore <- 0 13 | if (bscore > 10^10) bscore <- 10^10 14 | if (bscore < 0) bscore <- 0 15 | 16 | # Levenshtein distance 17 | ads <- utils::adist(x, y) 18 | 19 | ads_l <- list(ads) 20 | 21 | if (is_available("stringdist")) { 22 | suppressMessages(suppressWarnings({ 23 | mthds <- c("osa", "jaccard", "soundex", "jw") 24 | ads_sdl <- mthds %>% map(~ stringdist::stringdistmatrix(x, y, method = .x, p = 0.1, q = 3)) 25 | ads_l <- c(ads_l, ads_sdl) 26 | })) 27 | } 28 | 29 | ascore <- ads_l %>% map_dbl(ads_score) 30 | c(ascore, bscore) 31 | } 32 | -------------------------------------------------------------------------------- /R/stitch_direction.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | stitch_direction <- function(d_part, dat, attr_name, trace_it = FALSE) { 4 | d0 <- d_part %>% 5 | distinct(data_gid, row = row_d, col = col_d) 6 | 7 | a0 <- d_part %>% 8 | distinct(attr_gid, row = row_a, col = col_a) 9 | 10 | chkd <- d0$data_gid %>% 11 | unique() %>% 12 | length() 13 | chka <- a0$attr_gid %>% 14 | unique() %>% 15 | length() 16 | chkatsp <- d_part$attr_gid_split %>% 17 | unique() %>% 18 | length() 19 | 20 | chkdir <- d_part$direction %>% 21 | unique() %>% 22 | length() 23 | 24 | chkaty <- d_part$attr_group %>% 25 | unique() %>% 26 | length() 27 | 28 | if (chkd != 1 | chka != 1 | chkdir != 1 | chkatsp != 1 | chkaty != 1) { 29 | abort(paste("failed to stitch.", 30 | "(have you tampered a cell-analysis?)", 31 | "(make sure the cell-analysis is free from error or potential issues)", 32 | "If still the error persists, please contact the developer.", 33 | sep = "\n" 34 | )) 35 | } 36 | 37 | 38 | if (missing(attr_name)) { 39 | daty <- d_part$attr_group[1] 40 | 41 | dgn <- d_part$direction_group[1] %>% 42 | recode(WE = "row", NS = "col") 43 | 44 | dn <- d_part$direction_basic[1] %>% 45 | recode( 46 | N = "top", W = "left", S = "bottom", E = "right", 47 | NW = "topLeft", NE = "topRight", SW = "bottomLeft", SE = "bottomRight" 48 | ) 49 | 50 | datsp <- d_part$attr_gid_split_order[1] 51 | 52 | dd <- d_part$dist_order[1] 53 | 54 | 55 | attr_name <- paste(daty, dgn, dn, datsp, dd, sep = "_") 56 | } 57 | 58 | 59 | 60 | direction <- d_part$direction %>% unique() 61 | direction <- direction[1] 62 | 63 | d0 <- d0 %>% 64 | select(-data_gid) %>% 65 | left_join(dat %>% select(row, col, value), by = c("row", "col")) 66 | a0 <- a0 %>% 67 | select(-attr_gid) %>% 68 | left_join(dat %>% select(row, col, attr = value), by = c("row", "col")) 69 | 70 | if (trace_it) { 71 | a0 <- a0 %>% mutate(cadd = paste(row, col, sep = "_")) 72 | } 73 | 74 | # suppressWarnings should be removed once unpivotr::enhead chages 75 | # this is happening as "All elements of `...` must be named." warning in tidyr 76 | # ref: https://github.com/tidyverse/tidyr/issues/714 77 | # ref: https://github.com/nacnudus/unpivotr/issues/26 78 | # directions NNW (and similar) is strict; it does not allow multi block 79 | # possibly need separate enhead like function 80 | suppressWarnings({ 81 | d1 <- d0 %>% 82 | enhead(a0, direction) 83 | }) 84 | 85 | 86 | colnames(d1)[which(colnames(d1) == "attr")] <- attr_name 87 | 88 | if (trace_it) { 89 | colnames(d1)[which(colnames(d1) == "cadd")] <- paste0("cellAddress_", attr_name) 90 | } 91 | 92 | d1 <- d1 %>% mutate(data_block = d_part$natural_gid[1]) 93 | 94 | d1 95 | } 96 | -------------------------------------------------------------------------------- /R/tidycells-package.R: -------------------------------------------------------------------------------- 1 | #' `tidycells` package 2 | #' 3 | #' **Read Tabular Data from Diverse Sources and Easily Make Them Tidy** 4 | #' 5 | #' The package provides utilities to read, cells from complex tabular data 6 | #' and heuristic detection based structural assignment of those cells to a 7 | #' columnar or tidy format. Read functionality has the ability to read (in 8 | #' a unified manner) structured, partially structured or unstructured 9 | #' tabular data (usually spreadsheets for public data dissemination and 10 | #' aimed for common human understanding) from various types of documents. 11 | #' The tabular information is read as cells. The 'structure assignment' 12 | #' functionality has both supervised and unsupervised way of assigning 13 | #' cells data to columnar/tidy format. Multiple disconnected blocks of 14 | #' tables in a single sheet are also handled appropriately. These tools are 15 | #' suitable for unattended conversation of (maybe a pile of) messy tables 16 | #' (like government data) into a consumable format(usable for further 17 | #' analysis and data wrangling). 18 | #' 19 | #' To get started check out \href{../doc/tidycells-intro.html}{\code{vignette("tidycells-intro")}}. 20 | #' 21 | #' @name tidycells-package 22 | #' @keywords internal 23 | "_PACKAGE" 24 | -------------------------------------------------------------------------------- /R/utils-deps.R: -------------------------------------------------------------------------------- 1 | #### dplyr #### 2 | 3 | #' @importFrom dplyr bind_rows 4 | NULL 5 | 6 | #' @importFrom dplyr distinct 7 | NULL 8 | 9 | #' @importFrom dplyr ungroup 10 | NULL 11 | 12 | #' @importFrom dplyr group_split 13 | NULL 14 | 15 | #' @importFrom dplyr pull 16 | NULL 17 | 18 | #' @importFrom dplyr if_else 19 | NULL 20 | 21 | #' @importFrom dplyr rename 22 | NULL 23 | 24 | #' @importFrom dplyr arrange 25 | NULL 26 | 27 | #' @importFrom dplyr full_join 28 | NULL 29 | 30 | #' @importFrom dplyr inner_join 31 | NULL 32 | 33 | #' @importFrom dplyr right_join 34 | NULL 35 | 36 | #' @importFrom dplyr left_join 37 | NULL 38 | 39 | #' @importFrom dplyr anti_join 40 | NULL 41 | 42 | #' @importFrom dplyr select 43 | NULL 44 | 45 | #' @importFrom dplyr filter 46 | NULL 47 | 48 | #' @importFrom dplyr group_by 49 | NULL 50 | 51 | #' @importFrom dplyr mutate 52 | NULL 53 | 54 | #' @importFrom dplyr mutate_all 55 | NULL 56 | 57 | #' @importFrom dplyr summarise 58 | NULL 59 | 60 | #' @importFrom dplyr n 61 | NULL 62 | 63 | #' @importFrom dplyr desc 64 | NULL 65 | 66 | #' @importFrom dplyr count 67 | NULL 68 | 69 | #' @importFrom dplyr case_when 70 | NULL 71 | 72 | #' @importFrom dplyr recode 73 | NULL 74 | 75 | #' @importFrom dplyr n_distinct 76 | NULL 77 | 78 | 79 | #### tibble #### 80 | 81 | #' @importFrom tibble tibble 82 | NULL 83 | 84 | #' @importFrom tibble as_tibble 85 | NULL 86 | 87 | 88 | #### purrr #### 89 | 90 | #' @importFrom purrr map 91 | NULL 92 | 93 | #' @importFrom purrr map_df 94 | NULL 95 | 96 | #' @importFrom purrr map_lgl 97 | NULL 98 | 99 | #' @importFrom purrr map_dbl 100 | NULL 101 | 102 | #' @importFrom purrr map_int 103 | NULL 104 | 105 | #' @importFrom purrr map_chr 106 | NULL 107 | 108 | #' @importFrom purrr imap_dfr 109 | NULL 110 | 111 | #' @importFrom purrr reduce 112 | NULL 113 | 114 | 115 | #### unpivotr #### 116 | 117 | #' @importFrom unpivotr enhead 118 | NULL 119 | 120 | #### rlang #### 121 | 122 | #' @importFrom rlang abort 123 | NULL 124 | 125 | #' @importFrom rlang warn 126 | NULL 127 | -------------------------------------------------------------------------------- /R/utils-pipe.R: -------------------------------------------------------------------------------- 1 | #' Pipe operator 2 | #' 3 | #' See \code{magrittr::\link[magrittr:pipe]{\%>\%}} for details. 4 | #' 5 | #' @name %>% 6 | #' @rdname pipe 7 | #' @keywords internal 8 | #' @export 9 | #' @importFrom magrittr %>% 10 | #' @usage lhs \%>\% rhs 11 | NULL 12 | -------------------------------------------------------------------------------- /R/validate_cells.R: -------------------------------------------------------------------------------- 1 | #' Validate cell-DF 2 | #' 3 | #' @param dat An R object 4 | #' 5 | #' @return Logical scalar. If the value is `FALSE`, it will contain attribute named `msg`. 6 | #' @details It checks for following facts: 7 | #' * Whether `dat` is a `data.frame` 8 | #' * Whether `dat` conforms to `rc_df` format 9 | #' * Whether `dat` conforms to `cell_df` format 10 | #' If all the checks are passed then it returns `TRUE` else it returns `FALSE` with `msg` attribute, indicating reason for validation failure. 11 | #' 12 | #' @export 13 | #' @keywords internal 14 | #' 15 | #' @examples 16 | #' # returns TRUE 17 | #' validate_cells(tibble::tibble(row = 1, col = 2, data_type = "numeric", value = "1")) 18 | #' 19 | #' # this is FALSE 20 | #' chk <- validate_cells(tibble::tibble(row = 1, col = 2, data_type = "numeric")) 21 | #' 22 | #' # msg 23 | #' attr(chk, "msg") 24 | validate_cells <- function(dat) { 25 | msg <- character(0) 26 | 27 | if (!inherits(dat, "data.frame")) { 28 | msg <- msg %>% c("data.frame expected") 29 | } 30 | 31 | if (!is_conforms_to_rcdf(dat)) { 32 | msg <- msg %>% c("does not conforms to rc_df format") 33 | } 34 | 35 | if (!all(hasName(dat, c("row", "col", "data_type", "value")))) { 36 | msg <- msg %>% c("does not has required columns") 37 | } 38 | 39 | # optional test if the dat is VA classified 40 | if (hasName(dat, "type")) { 41 | dtyl <- dat$type %>% 42 | setdiff(c("value", "attribute", "empty")) %>% 43 | length() 44 | if (dtyl != 0) { 45 | msg <- msg %>% c("type column present but content differs from value, attribute, empty") 46 | } 47 | } 48 | 49 | if (hasName(dat, "data_type")) { 50 | dtl <- dat$data_type %>% 51 | setdiff(c("numeric", "character")) %>% 52 | length() 53 | 54 | if (dtl != 0) { 55 | msg <- msg %>% c("data_type has fields apart from numeric, character") 56 | } 57 | 58 | if (!rlang::is_atomic(dat$data_type)) { 59 | msg <- msg %>% c("data_type is not atomic") 60 | } else { 61 | if (!is.character(dat$data_type)) { 62 | msg <- msg %>% c("data_type must be character") 63 | } 64 | } 65 | } 66 | 67 | 68 | if (hasName(dat, "value")) { 69 | if (!rlang::is_atomic(dat$value)) { 70 | msg <- msg %>% c("value is not atomic") 71 | } else { 72 | if (!is.character(dat$value)) { 73 | msg <- msg %>% c("value must be character") 74 | } 75 | } 76 | } 77 | 78 | 79 | if (length(msg) == 0) { 80 | TRUE 81 | } else { 82 | res <- FALSE 83 | attr(res, "msg") <- msg 84 | res 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /R/validate_read_cell_part_object.R: -------------------------------------------------------------------------------- 1 | 2 | validate_read_cell_part_object <- function(x, level) { 3 | stage <- attr(x, "read_cells_stage") 4 | if (missing(level)) { 5 | level <- stage 6 | } 7 | 8 | if (is.null(level)) { 9 | abort("read_cell_part object is not valid") 10 | } 11 | 12 | if (length(level) != 1) { 13 | abort("read_cell_part object is not valid") 14 | } 15 | 16 | if (is.numeric(level)) { 17 | level <- as.integer(level) 18 | if (level >= 1 & level <= length(read_cell_task_orders)) { 19 | level <- read_cell_task_orders[level] 20 | } 21 | } 22 | 23 | if (is.character(level)) { 24 | if (level %in% read_cell_task_orders) { 25 | if (level == read_cell_task_orders[1]) { 26 | # NULL is possible 27 | if (is.null(x)) { 28 | return(list(chk = TRUE, level = level)) 29 | } 30 | # a df 31 | if (is.data.frame(x)) { 32 | if (nrow(x) > 0) { 33 | return(list(chk = TRUE, level = level)) 34 | } 35 | } 36 | # list of dfs 37 | if (is.list(x)) { 38 | if (length(x) > 0) { 39 | if (is.data.frame(x[[1]])) { 40 | return(list(chk = TRUE, level = level)) 41 | } 42 | } 43 | } 44 | } 45 | if (level == read_cell_task_orders[2]) { 46 | if (is.list(x)) { 47 | if (length(x) > 0) { 48 | if (x %>% map_lgl(is_cell_df) %>% all()) { 49 | return(list(chk = TRUE, level = level)) 50 | } 51 | } 52 | } 53 | } 54 | if (level == read_cell_task_orders[3]) { 55 | if (is.list(x)) { 56 | if (length(x) > 0) { 57 | if (x %>% map_lgl(is_cell_df) %>% all()) { 58 | if (x %>% map_lgl(~ hasName(.x, "type")) %>% all()) { 59 | return(list(chk = TRUE, level = level)) 60 | } 61 | } 62 | } 63 | } 64 | } 65 | if (level == read_cell_task_orders[4]) { 66 | if (is.list(x)) { 67 | if (length(x) > 0) { 68 | if (x %>% map_lgl(~ inherits(.x, cell_df_analysis_class[1])) %>% all()) { 69 | return(list(chk = TRUE, level = level)) 70 | } 71 | } 72 | } 73 | } 74 | if (level == read_cell_task_orders[5]) { 75 | if (is.data.frame(x)) { 76 | if (all(utils::hasName(x, setdiff(defcols, "table_tag")))) { 77 | return(list(chk = TRUE, level = level)) 78 | } 79 | } 80 | } 81 | } else { 82 | abort(paste0(level, " is not a valid level.")) 83 | } 84 | } 85 | 86 | return(list(chk = FALSE, level = level)) 87 | } 88 | -------------------------------------------------------------------------------- /R/value_attribute_classify.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | #' Value/Attribute Classifier 4 | #' 5 | #' @description After [`as_cell_df`][as_cell_df()] (entry point to `tidycells`) you may need to use this function 6 | #' or individual _Value/Attribute Classifier_-functions as listed below in _"see also"_ - section. 7 | #' 8 | #' Here the idea is to classify all cells into either `value`, `attribute`, `empty` which will be 9 | #' used by [`analyze_cells`][analyze_cells()] for further processing. 10 | #' 11 | #' @param d a Cell DF 12 | #' @param classifier a classifier 13 | #' 14 | #' @return a Cell DF with Value/Attribute Classification. The underlying tibble will contain an extra column named `type`. 15 | #' @export 16 | #' @details In order to understand the data orientation and detect data-blocks Cell DF requires additional column named `type`. 17 | #' This `type` column potentially contains either `value`, `attribute`, `empty`. The `value` are given corresponding to cells 18 | #' with observations in it. The tag, `attribute` is for the identifier of these cells. Lastly, `empty` cells are useless cells or 19 | #' cells with no meaningful information. 20 | #' 21 | #' For `classifier` following options are present: 22 | #' * `basic_classifier` : naive classifier which recode `data_type`. 23 | #' * `sample_based_classifier` : sample-based classifier. 24 | #' * `numeric_values_classifier` : considers number like cells as values. 25 | #' 26 | #' Each of the above are available as individual functions. Those can also be directly applied on a `cell-df`. 27 | #' However, it is recommended to use `value_attribute_classify` as it tests for integrity after classification. 28 | #' 29 | #' @examples 30 | #' 31 | #' iris %>% 32 | #' as_cell_df() %>% 33 | #' sample_based_classifier(value_sample = "setosa") %>% 34 | #' plot() 35 | #' 36 | #' iris %>% 37 | #' as_cell_df() %>% 38 | #' sample_based_classifier(value_sample = "setosa") %>% 39 | #' numeric_values_classifier() %>% 40 | #' plot() 41 | #' 42 | #' if (rlang::is_installed("tidyxl")) { 43 | #' cdn <- system.file("extdata", "RBI_HBS_Table_No_166.xlsx", package = "tidycells") %>% 44 | #' tidyxl::xlsx_cells() 45 | #' cdn <- cdn %>% 46 | #' dplyr::filter(sheet == sheet[1]) %>% 47 | #' as_cell_df() 48 | #' 49 | #' # all of these are same except value_attribute_classify will perform validate_cells once again 50 | #' cd1 <- sample_based_classifier(cdn, value_sample = "APR") 51 | #' cd2 <- sample_based_classifier(value_sample = "APR")(cdn) 52 | #' cd3 <- value_attribute_classify(cdn, 53 | #' classifier = sample_based_classifier(value_sample = "APR") 54 | #' ) 55 | #' # see it 56 | #' plot(cd3) 57 | #' } 58 | #' @seealso 59 | #' Individual classifier functions: 60 | #' * [basic_classifier][basic_classifier()] 61 | #' * [sample_based_classifier][sample_based_classifier()] 62 | #' * [numeric_values_classifier][numeric_values_classifier()], 63 | #' 64 | #' For interactive Value/Attribute Classification check [visual_va_classify][visual_va_classify()] 65 | #' 66 | value_attribute_classify <- function(d, classifier = basic_classifier()) { 67 | if (!is_cell_df(d)) { 68 | abort("Cell DF Expected") 69 | } 70 | 71 | classifier_try <- try(purrr::as_mapper(classifier), silent = TRUE) 72 | if (!inherits(classifier_try, "try-error")) { 73 | classifier <- classifier_try 74 | } 75 | 76 | if (!is.function(classifier)) { 77 | abort("classifier is not a function") 78 | } 79 | 80 | cd <- classifier(d) 81 | 82 | chk <- validate_cells(cd) 83 | 84 | if (!chk) { 85 | abort(paste0(attr(chk, "msg"), collapse = "\n")) 86 | } 87 | 88 | cd 89 | } 90 | -------------------------------------------------------------------------------- /R/visual_functions.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | #' Interactive operations and Visualizations 4 | #' 5 | #' @param d a cell_df. 6 | #' @description These functions require \code{\link[shiny:shiny-package]{shiny}} and `miniUI`. 7 | #' Each of these modules has option for interactive plots (using `plotly`). 8 | #' However, for the same you need to have `plotly` installed. 9 | #' 10 | #' @details Each of these functions are designed for interactive sessions only. 11 | #' Arguments to these functions are optional as it tries to pick those from global environment. 12 | #' If there are multiple variables defined in global environment which are of same desired class then you will get option to choose 13 | #' one of them. These are also available as \href{https://rstudio.github.io/rstudioaddins/}{RStudio Addin} 14 | #' (if you are using RStudio IDE you'll get those addin in the RStudio addin panel). 15 | #' 16 | #' Following are the names used in RStudio Addin 17 | #' * _Crop Cells_: `visual_crop` 18 | #' * _Cell Analysis Inspection_: `visual_data_block_inspection` 19 | #' * _Orientation Modification_: `visual_orientation_modification` 20 | #' * _Traceback Composition_: `visual_traceback` 21 | #' * _Classify Value/Attribute_: `visual_va_classify` 22 | #' 23 | #' For each of the modules it tries to load and unload `shiny` before and after the execution of the module. 24 | #' You can disable the same by setting `options(AutoUnloadShiny = FALSE)`. 25 | #' 26 | #' @return `visual_crop` returns a cell_df after the interactive operations. 27 | #' @export 28 | #' @keywords internal 29 | #' @rdname visual_functions 30 | visual_crop <- function(d) { 31 | if (missing(d)) { 32 | d <- global_object_picker("cell_df") 33 | } 34 | 35 | shiny_check() 36 | dout <- shiny_app_crop(d) 37 | shiny_unload() 38 | 39 | return(invisible(dout)) 40 | } 41 | 42 | #' @return `visual_va_classify` returns a cell_df after the interactive operations. 43 | #' @export 44 | #' @keywords internal 45 | #' @rdname visual_functions 46 | #' @seealso [`value_attribute_classify`][value_attribute_classify()] 47 | visual_va_classify <- function(d) { 48 | if (missing(d)) { 49 | d <- global_object_picker("cell_df") 50 | } 51 | 52 | shiny_check() 53 | dout <- shiny_app_va_classify(d) 54 | shiny_unload() 55 | 56 | return(invisible(dout)) 57 | } 58 | 59 | #' @param x a cell_analysis. 60 | #' @export 61 | #' @keywords internal 62 | #' @rdname visual_functions 63 | visual_data_block_inspection <- function(x) { 64 | if (missing(x)) { 65 | x <- global_object_picker("cell_analysis") 66 | } 67 | 68 | shiny_check() 69 | shiny_app_data_block(x) 70 | shiny_unload() 71 | 72 | return(invisible(NULL)) 73 | } 74 | 75 | #' @return `visual_orientation_modification` returns a cell_analysis after the interactive operations. 76 | #' @export 77 | #' @keywords internal 78 | #' @rdname visual_functions 79 | visual_orientation_modification <- function(x) { 80 | if (missing(x)) { 81 | x <- global_object_picker("cell_analysis") 82 | } 83 | 84 | shiny_check() 85 | this_out <- shiny_app_orientation_modification(x) 86 | shiny_unload() 87 | 88 | return(invisible(this_out)) 89 | } 90 | 91 | #' @param dcomp a composition (a tibble rendered by [`compose_cells`][compose_cells()]). 92 | #' @return `visual_traceback` returns a composed tidy data (as a tibble). 93 | #' @details `visual_traceback` requires package `DT`. 94 | #' If `DT` is not present you may use [`cell_composition_traceback`][cell_composition_traceback()] 95 | #' 96 | #' @export 97 | #' @keywords internal 98 | #' @rdname visual_functions 99 | #' @seealso [`cell_composition_traceback`][cell_composition_traceback()] 100 | visual_traceback <- function(x, dcomp) { 101 | if (!DT_present()) { 102 | abort("package `DT` is required") 103 | } 104 | 105 | if (missing(x)) { 106 | x <- global_object_picker("cell_analysis") 107 | } 108 | 109 | shiny_check() 110 | this_out <- shiny_app_traceback(x, dcomp) 111 | shiny_unload() 112 | 113 | return(invisible(this_out)) 114 | } 115 | -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | destination: docs 2 | -------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | # DO NOT CHANGE the "init" and "install" sections below 2 | 3 | # Download script file from GitHub 4 | init: 5 | ps: | 6 | $ErrorActionPreference = "Stop" 7 | Invoke-WebRequest http://raw.github.com/krlmlr/r-appveyor/master/scripts/appveyor-tool.ps1 -OutFile "..\appveyor-tool.ps1" 8 | Import-Module '..\appveyor-tool.ps1' 9 | 10 | install: 11 | ps: Bootstrap 12 | 13 | cache: 14 | - C:\RLibrary 15 | 16 | 17 | environment: 18 | NOT_CRAN: true 19 | 20 | global: 21 | WARNINGS_ARE_ERRORS: 1 22 | PKGTYPE: both 23 | NOT_CRAN: true 24 | USE_RTOOLS: true 25 | 26 | matrix: 27 | - R_VERSION: release 28 | R_ARCH: x64 29 | 30 | - R_VERSION: devel 31 | 32 | - R_VERSION: patched 33 | 34 | - R_VERSION: oldrel 35 | RTOOLS_VERSION: 33 36 | CRAN: http://cran.rstudio.com 37 | 38 | 39 | build_script: 40 | - travis-tool.sh install_deps 41 | 42 | test_script: 43 | - travis-tool.sh run_tests 44 | 45 | on_failure: 46 | - 7z a failure.zip *.Rcheck\* 47 | - appveyor PushArtifact failure.zip 48 | 49 | artifacts: 50 | - path: '*.Rcheck\**\*.log' 51 | name: Logs 52 | 53 | - path: '*.Rcheck\**\*.out' 54 | name: Logs 55 | 56 | - path: '*.Rcheck\**\*.fail' 57 | name: Logs 58 | 59 | - path: '*.Rcheck\**\*.Rout' 60 | name: Logs 61 | 62 | - path: '\*_*.tar.gz' 63 | name: Bits 64 | 65 | - path: '\*_*.zip' 66 | name: Bits 67 | 68 | notifications: 69 | - provider: Email 70 | to: 71 | - nil.gayen@gmail.com 72 | subject: 'r-rudra tidycells build {{status}}' 73 | message: "https://ci.appveyor.com/project/bedantaguru/tidycells" 74 | on_build_success: false 75 | on_build_failure: true 76 | on_build_status_changed: true 77 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | comment: false 2 | 3 | coverage: 4 | status: 5 | project: 6 | default: 7 | target: auto 8 | threshold: 1% 9 | patch: 10 | default: 11 | target: auto 12 | threshold: 1% 13 | -------------------------------------------------------------------------------- /cran-comments.md: -------------------------------------------------------------------------------- 1 | ## Test environments 2 | 3 | ### Local 4 | 5 | * Windows 10 x64 (build 17134) [x86_64-w64-mingw32/x64 (64-bit)] 6 | * R version 3.6.1 (2019-07-05) 7 | * R version 3.6.2 (2019-12-12) 8 | 9 | ### WinBuilder 10 | 11 | * Windows [x86_64-w64-mingw32 (64-bit)] 12 | * R version 3.5.3 (2019-03-11) 13 | * R version 3.6.2 (2019-12-12) 14 | * R Under development (unstable) (2020-01-03 r77629) 15 | 16 | ### Travis 17 | 18 | * macOS High Sierra 10.13.6 [x86_64-apple-darwin15.6.0 (64-bit)] 19 | * (R: release) R version 3.6.2 (2019-12-12) 20 | * Ubuntu 16.04.6 LTS [x86_64-pc-linux-gnu (64-bit)] 21 | * (R: oldrel) R version 3.5.3 (2017-01-27) 22 | * (R: release) R version 3.6.1 (2017-01-27) 23 | * (R: devel) R Under development (unstable) (2020-01-03 r77628) 24 | 25 | ### AppVeyor 26 | 27 | * Windows Server 2012 R2 x64 (build 9600) [x86_64-w64-mingw32 (64-bit)] 28 | * (release) R version 3.6.2 (2019-12-12) 29 | * (devel) R Under development (unstable) (2020-01-03 r77629) 30 | * (patched) R version 3.6.2 Patched (2020-01-03 r77629) 31 | * (oldrel) R version 3.5.3 (2019-03-11) 32 | 33 | ### RHub 34 | 35 | #### Successful Builds 36 | 37 | * CentOS 6 with Redhat Developer Toolset [x86_64-redhat-linux-gnu (64-bit)] 38 | * (R from EPEL) R version 3.5.2 (2018-12-20) 39 | * Fedora Linux [x86_64-pc-linux-gnu (64-bit)] 40 | * (R-devel GCC) R Under development (unstable) (2020-01-03 r77629) 41 | * (R-devel clang gfortran) R Under development (unstable) (2020-01-03 r77629) 42 | * macOS 10.11 El Capitan [x86_64-apple-darwin15.6.0 (64-bit)] 43 | * (R-release) R version 3.6.2 (2019-12-12) 44 | * Oracle Solaris 10 x86 32 bit [i386-pc-solaris2.10 (32-bit)] 45 | * (R-patched) R version 3.6.0 (2019-04-26) 46 | * Windows Server 2008 R2 SP1 [x86_64-w64-mingw32 (64-bit)] 47 | * (R-release 32/64 bit) R version 3.6.2 (2019-12-12) 48 | * (R-patched 32/64 bit) R version 3.6.2 Patched (2019-12-12 r77564) 49 | * (R-oldrel 32/64 bit) R version 3.5.3 (2019-03-11) 50 | * (R-devel 32/64 bit) R Under development (unstable) (2019-11-08 r77393) 51 | * Windows Server 2012 [x86_64-w64-mingw32 (64-bit)] 52 | * (R-devel Rtools4.0 32/64 bit) R version 4.0.0 Under development (Testing Rtools) (2019-09-30 r77236) 53 | 54 | See other builds in [Dev-Notes](https://github.com/r-rudra/tidycells/blob/master/dev-notes.md#r-hub-other-builds) 55 | 56 | ## R CMD check results 57 | 58 | 0 errors | 0 warnings | 0 note 59 | 60 | * **This is a update of tidycells.** 61 | * **There are _NO_ references describing the methods in this package.** 62 | 63 | **Note**: The methods in the package are purely based on experiences and knowledge of the author and not based on any published article. Hence there are no references to attach currently. 64 | 65 | Also, note that the methods used in the package are novel in this field (as per the knowledge of the author). Any potential similarity with other related work is completely unintentional. 66 | 67 | 68 | ## Downstream dependencies 69 | 70 | * No downstream dependencies currently 71 | 72 | ## (_Optional_) Other Notes 73 | 74 | Also Check [Dev-Notes](https://github.com/r-rudra/tidycells/blob/master/dev-notes.md) 75 | -------------------------------------------------------------------------------- /docs/articles/ext/compose_cells_cli1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-rudra/tidycells/fdf37ecea15ef6513a935845c8d9a408d6fb30c1/docs/articles/ext/compose_cells_cli1.png -------------------------------------------------------------------------------- /docs/articles/ext/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-rudra/tidycells/fdf37ecea15ef6513a935845c8d9a408d6fb30c1/docs/articles/ext/logo.png -------------------------------------------------------------------------------- /docs/articles/ext/marks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-rudra/tidycells/fdf37ecea15ef6513a935845c8d9a408d6fb30c1/docs/articles/ext/marks.png -------------------------------------------------------------------------------- /docs/articles/ext/read_cells_out.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-rudra/tidycells/fdf37ecea15ef6513a935845c8d9a408d6fb30c1/docs/articles/ext/read_cells_out.png -------------------------------------------------------------------------------- /docs/articles/ext/v12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-rudra/tidycells/fdf37ecea15ef6513a935845c8d9a408d6fb30c1/docs/articles/ext/v12.png -------------------------------------------------------------------------------- /docs/articles/ext/v34.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-rudra/tidycells/fdf37ecea15ef6513a935845c8d9a408d6fb30c1/docs/articles/ext/v34.png -------------------------------------------------------------------------------- /docs/articles/ext/v56.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-rudra/tidycells/fdf37ecea15ef6513a935845c8d9a408d6fb30c1/docs/articles/ext/v56.png -------------------------------------------------------------------------------- /docs/articles/tidycells-intro_files/figure-html/unnamed-chunk-15-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-rudra/tidycells/fdf37ecea15ef6513a935845c8d9a408d6fb30c1/docs/articles/tidycells-intro_files/figure-html/unnamed-chunk-15-1.png -------------------------------------------------------------------------------- /docs/articles/tidycells-intro_files/figure-html/unnamed-chunk-21-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-rudra/tidycells/fdf37ecea15ef6513a935845c8d9a408d6fb30c1/docs/articles/tidycells-intro_files/figure-html/unnamed-chunk-21-1.png -------------------------------------------------------------------------------- /docs/articles/tidycells-intro_files/figure-html/unnamed-chunk-22-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-rudra/tidycells/fdf37ecea15ef6513a935845c8d9a408d6fb30c1/docs/articles/tidycells-intro_files/figure-html/unnamed-chunk-22-1.png -------------------------------------------------------------------------------- /docs/articles/tidycells-intro_files/figure-html/unnamed-chunk-25-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-rudra/tidycells/fdf37ecea15ef6513a935845c8d9a408d6fb30c1/docs/articles/tidycells-intro_files/figure-html/unnamed-chunk-25-1.png -------------------------------------------------------------------------------- /docs/articles/tidycells-intro_files/figure-html/unnamed-chunk-30-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-rudra/tidycells/fdf37ecea15ef6513a935845c8d9a408d6fb30c1/docs/articles/tidycells-intro_files/figure-html/unnamed-chunk-30-1.png -------------------------------------------------------------------------------- /docs/articles/tidycells-intro_files/figure-html/unnamed-chunk-30-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-rudra/tidycells/fdf37ecea15ef6513a935845c8d9a408d6fb30c1/docs/articles/tidycells-intro_files/figure-html/unnamed-chunk-30-2.png -------------------------------------------------------------------------------- /docs/docsearch.js: -------------------------------------------------------------------------------- 1 | $(function() { 2 | 3 | // register a handler to move the focus to the search bar 4 | // upon pressing shift + "/" (i.e. "?") 5 | $(document).on('keydown', function(e) { 6 | if (e.shiftKey && e.keyCode == 191) { 7 | e.preventDefault(); 8 | $("#search-input").focus(); 9 | } 10 | }); 11 | 12 | $(document).ready(function() { 13 | // do keyword highlighting 14 | /* modified from https://jsfiddle.net/julmot/bL6bb5oo/ */ 15 | var mark = function() { 16 | 17 | var referrer = document.URL ; 18 | var paramKey = "q" ; 19 | 20 | if (referrer.indexOf("?") !== -1) { 21 | var qs = referrer.substr(referrer.indexOf('?') + 1); 22 | var qs_noanchor = qs.split('#')[0]; 23 | var qsa = qs_noanchor.split('&'); 24 | var keyword = ""; 25 | 26 | for (var i = 0; i < qsa.length; i++) { 27 | var currentParam = qsa[i].split('='); 28 | 29 | if (currentParam.length !== 2) { 30 | continue; 31 | } 32 | 33 | if (currentParam[0] == paramKey) { 34 | keyword = decodeURIComponent(currentParam[1].replace(/\+/g, "%20")); 35 | } 36 | } 37 | 38 | if (keyword !== "") { 39 | $(".contents").unmark({ 40 | done: function() { 41 | $(".contents").mark(keyword); 42 | } 43 | }); 44 | } 45 | } 46 | }; 47 | 48 | mark(); 49 | }); 50 | }); 51 | 52 | /* Search term highlighting ------------------------------*/ 53 | 54 | function matchedWords(hit) { 55 | var words = []; 56 | 57 | var hierarchy = hit._highlightResult.hierarchy; 58 | // loop to fetch from lvl0, lvl1, etc. 59 | for (var idx in hierarchy) { 60 | words = words.concat(hierarchy[idx].matchedWords); 61 | } 62 | 63 | var content = hit._highlightResult.content; 64 | if (content) { 65 | words = words.concat(content.matchedWords); 66 | } 67 | 68 | // return unique words 69 | var words_uniq = [...new Set(words)]; 70 | return words_uniq; 71 | } 72 | 73 | function updateHitURL(hit) { 74 | 75 | var words = matchedWords(hit); 76 | var url = ""; 77 | 78 | if (hit.anchor) { 79 | url = hit.url_without_anchor + '?q=' + escape(words.join(" ")) + '#' + hit.anchor; 80 | } else { 81 | url = hit.url + '?q=' + escape(words.join(" ")); 82 | } 83 | 84 | return url; 85 | } 86 | -------------------------------------------------------------------------------- /docs/link.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 8 | 12 | 13 | -------------------------------------------------------------------------------- /docs/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-rudra/tidycells/fdf37ecea15ef6513a935845c8d9a408d6fb30c1/docs/logo.png -------------------------------------------------------------------------------- /docs/pkgdown.js: -------------------------------------------------------------------------------- 1 | /* http://gregfranko.com/blog/jquery-best-practices/ */ 2 | (function($) { 3 | $(function() { 4 | 5 | $('.navbar-fixed-top').headroom(); 6 | 7 | $('body').css('padding-top', $('.navbar').height() + 10); 8 | $(window).resize(function(){ 9 | $('body').css('padding-top', $('.navbar').height() + 10); 10 | }); 11 | 12 | $('body').scrollspy({ 13 | target: '#sidebar', 14 | offset: 60 15 | }); 16 | 17 | $('[data-toggle="tooltip"]').tooltip(); 18 | 19 | var cur_path = paths(location.pathname); 20 | var links = $("#navbar ul li a"); 21 | var max_length = -1; 22 | var pos = -1; 23 | for (var i = 0; i < links.length; i++) { 24 | if (links[i].getAttribute("href") === "#") 25 | continue; 26 | // Ignore external links 27 | if (links[i].host !== location.host) 28 | continue; 29 | 30 | var nav_path = paths(links[i].pathname); 31 | 32 | var length = prefix_length(nav_path, cur_path); 33 | if (length > max_length) { 34 | max_length = length; 35 | pos = i; 36 | } 37 | } 38 | 39 | // Add class to parent
  • , and enclosing
  • if in dropdown 40 | if (pos >= 0) { 41 | var menu_anchor = $(links[pos]); 42 | menu_anchor.parent().addClass("active"); 43 | menu_anchor.closest("li.dropdown").addClass("active"); 44 | } 45 | }); 46 | 47 | function paths(pathname) { 48 | var pieces = pathname.split("/"); 49 | pieces.shift(); // always starts with / 50 | 51 | var end = pieces[pieces.length - 1]; 52 | if (end === "index.html" || end === "") 53 | pieces.pop(); 54 | return(pieces); 55 | } 56 | 57 | // Returns -1 if not found 58 | function prefix_length(needle, haystack) { 59 | if (needle.length > haystack.length) 60 | return(-1); 61 | 62 | // Special case for length-0 haystack, since for loop won't run 63 | if (haystack.length === 0) { 64 | return(needle.length === 0 ? 0 : -1); 65 | } 66 | 67 | for (var i = 0; i < haystack.length; i++) { 68 | if (needle[i] != haystack[i]) 69 | return(i); 70 | } 71 | 72 | return(haystack.length); 73 | } 74 | 75 | /* Clipboard --------------------------*/ 76 | 77 | function changeTooltipMessage(element, msg) { 78 | var tooltipOriginalTitle=element.getAttribute('data-original-title'); 79 | element.setAttribute('data-original-title', msg); 80 | $(element).tooltip('show'); 81 | element.setAttribute('data-original-title', tooltipOriginalTitle); 82 | } 83 | 84 | if(ClipboardJS.isSupported()) { 85 | $(document).ready(function() { 86 | var copyButton = ""; 87 | 88 | $(".examples, div.sourceCode").addClass("hasCopyButton"); 89 | 90 | // Insert copy buttons: 91 | $(copyButton).prependTo(".hasCopyButton"); 92 | 93 | // Initialize tooltips: 94 | $('.btn-copy-ex').tooltip({container: 'body'}); 95 | 96 | // Initialize clipboard: 97 | var clipboardBtnCopies = new ClipboardJS('[data-clipboard-copy]', { 98 | text: function(trigger) { 99 | return trigger.parentNode.textContent; 100 | } 101 | }); 102 | 103 | clipboardBtnCopies.on('success', function(e) { 104 | changeTooltipMessage(e.trigger, 'Copied!'); 105 | e.clearSelection(); 106 | }); 107 | 108 | clipboardBtnCopies.on('error', function() { 109 | changeTooltipMessage(e.trigger,'Press Ctrl+C or Command+C to copy'); 110 | }); 111 | }); 112 | } 113 | })(window.jQuery || window.$) 114 | -------------------------------------------------------------------------------- /docs/pkgdown.yml: -------------------------------------------------------------------------------- 1 | pandoc: 2.7.2 2 | pkgdown: 1.4.1 3 | pkgdown_sha: ~ 4 | articles: 5 | tidycells-intro: tidycells-intro.html 6 | 7 | -------------------------------------------------------------------------------- /docs/reference/analyze_cells-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-rudra/tidycells/fdf37ecea15ef6513a935845c8d9a408d6fb30c1/docs/reference/analyze_cells-1.png -------------------------------------------------------------------------------- /docs/reference/analyze_cells-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-rudra/tidycells/fdf37ecea15ef6513a935845c8d9a408d6fb30c1/docs/reference/analyze_cells-2.png -------------------------------------------------------------------------------- /docs/reference/value_attribute_classify-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-rudra/tidycells/fdf37ecea15ef6513a935845c8d9a408d6fb30c1/docs/reference/value_attribute_classify-1.png -------------------------------------------------------------------------------- /docs/reference/value_attribute_classify-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-rudra/tidycells/fdf37ecea15ef6513a935845c8d9a408d6fb30c1/docs/reference/value_attribute_classify-2.png -------------------------------------------------------------------------------- /docs/reference/value_attribute_classify-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-rudra/tidycells/fdf37ecea15ef6513a935845c8d9a408d6fb30c1/docs/reference/value_attribute_classify-3.png -------------------------------------------------------------------------------- /inst/extdata/RBI_HBS_Table_No_166.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-rudra/tidycells/fdf37ecea15ef6513a935845c8d9a408d6fb30c1/inst/extdata/RBI_HBS_Table_No_166.xlsx -------------------------------------------------------------------------------- /inst/extdata/marks.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-rudra/tidycells/fdf37ecea15ef6513a935845c8d9a408d6fb30c1/inst/extdata/marks.xlsx -------------------------------------------------------------------------------- /inst/extdata/marks_cells.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-rudra/tidycells/fdf37ecea15ef6513a935845c8d9a408d6fb30c1/inst/extdata/marks_cells.rds -------------------------------------------------------------------------------- /inst/extdata/messy/csv.docx: -------------------------------------------------------------------------------- 1 | Kid Name,Weight,Age 2 | Nakshatra,12,1.5 3 | Titas,16,6 4 | -------------------------------------------------------------------------------- /inst/extdata/messy/docx.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-rudra/tidycells/fdf37ecea15ef6513a935845c8d9a408d6fb30c1/inst/extdata/messy/docx.html -------------------------------------------------------------------------------- /inst/extdata/messy/html.csv: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 |
    Kid NameWeightAge
    Nakshatra121.5
    Titas166
    47 | 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /inst/extdata/messy/pdf.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-rudra/tidycells/fdf37ecea15ef6513a935845c8d9a408d6fb30c1/inst/extdata/messy/pdf.docx -------------------------------------------------------------------------------- /inst/extdata/messy/xls.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-rudra/tidycells/fdf37ecea15ef6513a935845c8d9a408d6fb30c1/inst/extdata/messy/xls.pdf -------------------------------------------------------------------------------- /inst/extdata/messy/xlsx.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-rudra/tidycells/fdf37ecea15ef6513a935845c8d9a408d6fb30c1/inst/extdata/messy/xlsx.csv -------------------------------------------------------------------------------- /inst/extdata/untidy.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-rudra/tidycells/fdf37ecea15ef6513a935845c8d9a408d6fb30c1/inst/extdata/untidy.xlsx -------------------------------------------------------------------------------- /inst/rstudio/addins.dcf: -------------------------------------------------------------------------------- 1 | Name: Crop Cells 2 | Description: Interactive way to crop or delete cells 3 | Binding: visual_crop 4 | Interactive: true 5 | 6 | Name: Cell Analysis Inspection 7 | Description: Interactive way to inspect cell analysis 8 | Binding: visual_data_block_inspection 9 | Interactive: true 10 | 11 | Name: Orientation Modification 12 | Description: Interactive way to modify attribute orientation in a cell analysis 13 | Binding: visual_orientation_modification 14 | Interactive: true 15 | 16 | Name: Traceback Composition 17 | Description: Interactive way to see how the cell analysis is composed 18 | Binding: visual_traceback 19 | Interactive: true 20 | 21 | Name: Classify Value/Attribute 22 | Description: Interactive way to classify cells in value/attribute 23 | Binding: visual_va_classify 24 | Interactive: true 25 | 26 | -------------------------------------------------------------------------------- /logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-rudra/tidycells/fdf37ecea15ef6513a935845c8d9a408d6fb30c1/logo.png -------------------------------------------------------------------------------- /man/analyse_cells.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/multi_names.R 3 | \name{analyse_cells} 4 | \alias{analyse_cells} 5 | \title{Analyse Cells} 6 | \usage{ 7 | analyse_cells(d, silent = TRUE) 8 | } 9 | \arguments{ 10 | \item{d}{A \code{\link[=cell_df-class]{cell_df}} after \code{\link[=value_attribute_classify]{Value Attribute Classification}} done} 11 | 12 | \item{silent}{logical scalar indicating whether to raise a warning if heuristic detection fails. (Default TRUE).} 13 | } 14 | \value{ 15 | Detailed analysis of the cell data structure. 16 | Which will be a \code{\link[=cell_analysis-class]{cell_analysis}} class object. 17 | } 18 | \description{ 19 | Note that this is an other name for \code{tidycells::analyze_cells}. 20 | After \code{\link[=value_attribute_classify]{Value Attribute Classification}} done on a \code{\link[=cell_df-class]{cell_df}} next task to do is 21 | analyze it's contents for data block detection, attribute orientation identification etc. The function \code{analyze_cells} (and also \code{analyse_cells}) 22 | does the same for you. 23 | 24 | \strong{Note}: 25 | \if{html}{If you are not sure about what package functions actually do or how they work together, 26 | please start with \href{../doc/tidycells-intro.html}{\code{vignette("tidycells-intro")}}.} 27 | \if{latex}{If you are not sure about what package functions actually do or how they work together, 28 | please start with \code{vignette("tidycells-intro")}.} 29 | } 30 | \details{ 31 | it returns detailed analysis of the data structure including data block detection, attribute orientation detection etc. 32 | The argument \code{silent} is set to \code{TRUE} by default, as the warning will be given whenever the \code{\link[=cell_analysis-class]{cell_analysis}} is printed. 33 | 34 | After this step one may like to do : 35 | \itemize{ 36 | \item \code{\link[=compose_cells]{compose_cells}} 37 | } 38 | 39 | If in an interactive session, following additional functions can be helpful for interactive visualizations: 40 | \itemize{ 41 | \item \code{\link[=visual_data_block_inspection]{visual_data_block_inspection}} 42 | \item \code{\link[=visual_orientation_modification]{visual_orientation_modification}} 43 | \item \code{\link[=visual_traceback]{visual_traceback}} 44 | } 45 | } 46 | \seealso{ 47 | \code{\link[=compose_cells]{compose_cells}}, \code{\link[=collate_columns]{collate_columns}}, \code{\link[=analyze_cells]{analyze_cells}} 48 | } 49 | \keyword{internal} 50 | -------------------------------------------------------------------------------- /man/analyze_cells.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/analyze_cells.R 3 | \name{analyze_cells} 4 | \alias{analyze_cells} 5 | \title{Analyze Cells} 6 | \usage{ 7 | analyze_cells(d, silent = TRUE) 8 | } 9 | \arguments{ 10 | \item{d}{A \code{\link[=cell_df-class]{cell_df}} after \code{\link[=value_attribute_classify]{Value Attribute Classification}} done} 11 | 12 | \item{silent}{logical scalar indicating whether to raise a warning if heuristic detection fails. (Default TRUE).} 13 | } 14 | \value{ 15 | Detailed analysis of the cell data structure. 16 | Which will be a \code{\link[=cell_analysis-class]{cell_analysis}} class object. 17 | } 18 | \description{ 19 | After \code{\link[=value_attribute_classify]{Value Attribute Classification}} done on a \code{\link[=cell_df-class]{cell_df}} next task to do is 20 | analyze it's contents for data block detection, attribute orientation identification etc. The function \code{analyze_cells} (and also \code{analyse_cells}) 21 | does the same for you. 22 | 23 | \strong{Note}: 24 | \if{html}{If you are not sure about what package functions actually do or how they work together, 25 | please start with \href{../doc/tidycells-intro.html}{\code{vignette("tidycells-intro")}}.} 26 | \if{latex}{If you are not sure about what package functions actually do or how they work together, 27 | please start with \code{vignette("tidycells-intro")}.} 28 | } 29 | \details{ 30 | it returns detailed analysis of the data structure including data block detection, attribute orientation detection etc. 31 | The argument \code{silent} is set to \code{TRUE} by default, as the warning will be given whenever the \code{\link[=cell_analysis-class]{cell_analysis}} is printed. 32 | 33 | After this step one may like to do : 34 | \itemize{ 35 | \item \code{\link[=compose_cells]{compose_cells}} 36 | } 37 | 38 | If in an interactive session, following additional functions can be helpful for interactive visualizations: 39 | \itemize{ 40 | \item \code{\link[=visual_data_block_inspection]{visual_data_block_inspection}} 41 | \item \code{\link[=visual_orientation_modification]{visual_orientation_modification}} 42 | \item \code{\link[=visual_traceback]{visual_traceback}} 43 | } 44 | } 45 | \examples{ 46 | d <- structure(c( 47 | "block 1", "", "C", "D", "", "block 2", "", "C", 48 | "D", "", "A", "1", "2", "", "", "A", "10", "20", "", "B", "3", 49 | "4", "", "", "B", "30", "40" 50 | ), .Dim = c(9L, 3L)) 51 | d <- as.data.frame(d) 52 | cd <- as_cell_df(d) \%>\% numeric_values_classifier() 53 | 54 | # see it 55 | cd \%>\% plot(adaptive_txt_size = FALSE) 56 | ca <- analyze_cells(cd) 57 | 58 | # look at the plot for detected directions 59 | plot(ca) 60 | } 61 | \seealso{ 62 | \code{\link[=compose_cells]{compose_cells}}, \code{\link[=collate_columns]{collate_columns}} 63 | } 64 | -------------------------------------------------------------------------------- /man/as_cell_df.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/as_cell_df.R 3 | \name{as_cell_df} 4 | \alias{as_cell_df} 5 | \title{Transform data into Cell-DF Structure} 6 | \usage{ 7 | as_cell_df(d, take_row_names = FALSE, take_col_names = FALSE) 8 | } 9 | \arguments{ 10 | \item{d}{the data (either a matrix with column name or a data.frame)} 11 | 12 | \item{take_row_names}{consider row names as separate cells 13 | (applicable only for data with no (row, col) information). Default is \code{FALSE}.} 14 | 15 | \item{take_col_names}{consider column names as separate cells 16 | (applicable only for data with no (row, col) information). Default is \code{FALSE}.} 17 | } 18 | \value{ 19 | An object of class \code{\link[=cell_df-class]{cell_df}}. 20 | 21 | \strong{Note}: After this, you may like to do \code{\link[=value_attribute_classify]{Value Attribute Classification}}. 22 | } 23 | \description{ 24 | Transform an R object (mostly matrix or data.frame) into a \code{\link[=cell_df-class]{cell_df}} 25 | for further processing in other \code{tidycells} functions. 26 | } 27 | \examples{ 28 | 29 | as_cell_df(iris) 30 | 31 | # consider column name as cell 32 | as_cell_df(iris, take_col_names = TRUE) 33 | 34 | # if the data is already in a similar format it will not further transform 35 | # which is not true for ---> unpivotr::as_cells 36 | # check ---> unpivotr::as_cells(iris) \%>\% unpivotr::as_cells() 37 | unpivotr::as_cells(iris) \%>\% as_cell_df() 38 | } 39 | \seealso{ 40 | \itemize{ 41 | \item \code{\link[=validate_cells]{validate_cells}} which is used to validate \code{cell_df}. 42 | \item \code{\link[unpivotr:as_cells]{as_cells}} from \code{unpivotr} package. 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /man/basic_classifier.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/value_attribute_classifiers.R 3 | \name{basic_classifier} 4 | \alias{basic_classifier} 5 | \alias{basic_classifier.default} 6 | \alias{basic_classifier.cell_df} 7 | \title{Naive \code{data_type} Based Value/Attribute Classifier} 8 | \usage{ 9 | basic_classifier(x, ...) 10 | 11 | \method{basic_classifier}{default}(x, ...) 12 | 13 | \method{basic_classifier}{cell_df}(x, ...) 14 | } 15 | \arguments{ 16 | \item{x}{Either a \code{\link[=cell_df-class]{cell_df}} or leave blank for function output 17 | (required by \code{\link[=value_attribute_classify]{value_attribute_classify}})} 18 | } 19 | \description{ 20 | Naive \code{data_type} Based Value/Attribute Classifier 21 | } 22 | \seealso{ 23 | \link[=value_attribute_classify]{value_attribute_classify} 24 | } 25 | \keyword{internal} 26 | -------------------------------------------------------------------------------- /man/cell_analysis-class.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/class-cell_df_analysis.R 3 | \name{cell_analysis-class} 4 | \alias{cell_analysis-class} 5 | \title{\code{cell_analysis} class} 6 | \description{ 7 | The \code{cell_analysis} class is based on list, 8 | created in order to store analysis of cell level information (\code{\link[=cell_df-class]{cell_df}}). 9 | } 10 | \section{Properties of \code{cell_analysis}}{ 11 | 12 | 13 | Objects of class \code{cell_analysis} have following named nodes: 14 | \itemize{ 15 | \item \code{cells} : Contains information about \code{cell_group_type} in terms of (data, minor and major attributes).(a tibble) 16 | \item \code{sections} : Contains boundaries of each data block. (a tibble) 17 | \item \code{details} : a list containing further information 18 | \item \code{cell_df} : The original \code{\link[=cell_df-class]{cell_df}} which is passed for processing 19 | } 20 | } 21 | 22 | \section{Applicable methods on \code{cell_analysis}}{ 23 | 24 | \itemize{ 25 | \item \code{print}: Prints identifier of the class and the number of blocks (and potential issues if any). 26 | \item \code{plot}: Plots (using \code{\link[ggplot2:ggplot]{ggplot2}}) the data-block information. 27 | } 28 | } 29 | 30 | \keyword{internal} 31 | -------------------------------------------------------------------------------- /man/cell_composition_traceback.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cell_trace_plot.R 3 | \name{cell_composition_traceback} 4 | \alias{cell_composition_traceback} 5 | \title{Display static composition traceback plot} 6 | \usage{ 7 | cell_composition_traceback(ca, dc, trace_row = 1L) 8 | } 9 | \arguments{ 10 | \item{ca}{A cell analysis} 11 | 12 | \item{dc}{(optional) a composed tidy data.frame} 13 | 14 | \item{trace_row}{the row number to trace. (Default 1)} 15 | } 16 | \description{ 17 | Display static composition traceback plot 18 | } 19 | \keyword{internal} 20 | -------------------------------------------------------------------------------- /man/cell_df-class.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/class-cell_df.R 3 | \name{cell_df-class} 4 | \alias{cell_df-class} 5 | \title{\code{cell_df} class} 6 | \description{ 7 | The \code{cell_df} class is a subclass of \code{\link[tibble:tibble]{tbl_df}} and \code{\link[base:data.frame]{data.frame}}, 8 | created in order to store cell level information. 9 | } 10 | \section{Properties of \code{cell_df}}{ 11 | 12 | 13 | Objects of class \code{cell_df} have: 14 | \itemize{ 15 | \item A \code{class} attribute of \code{c("cell_df", "cells", "rc_df", "tbl_df", "tbl", "data.frame")}. 16 | \item Two column of \code{integer} type named \code{row} and \code{col}. All entries must be positive and not \code{NA}. 17 | This represents a cell address. 18 | \item A column of \code{character} type named \code{data_type}. Which can contain only \code{numeric} or \code{character} as entries. 19 | This represents the data type of the cell (classified to only two categories). 20 | \item A column of \code{character} type named \code{value}. This stores value of corresponding cells. 21 | \item Apart from these columns it can contain other columns. 22 | However, (\code{row}, \code{col}) together should identify the row of the \code{cell_df} uniquely. 23 | } 24 | } 25 | 26 | \section{Applicable methods on \code{cell_df}}{ 27 | 28 | \itemize{ 29 | \item \code{print}: Prints identifier of the class and the content of the underlying tibble. 30 | \item \code{summary}: Calculates basic stats like number of rows and columns, number of characters and numeric fields, density etc. 31 | \item \code{plot}: Plots (using \code{\link[ggplot2:ggplot]{ggplot2}}) the cell information as ordinary table. 32 | \item \code{as.matrix}: This will transform the data back into tabular form and create a character matrix. 33 | \item \code{as.data.frame}: Similar to \code{as.matrix} this will also transform the data back into tabular form. 34 | } 35 | } 36 | 37 | \seealso{ 38 | The function \code{\link[=validate_cells]{validate_cells}} which is used to validate \code{cell_df}. 39 | 40 | The \code{as.matrix} and \code{as.data.frame} method is similar to \code{\link[unpivotr:rectify]{unpivotr::rectify}}. 41 | 42 | Object of this class is also compatible to most of the functions from \code{\link[unpivotr:unpivotr-package]{unpivotr}} package. 43 | } 44 | \keyword{internal} 45 | -------------------------------------------------------------------------------- /man/collate_columns.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/collate_columns.R 3 | \name{collate_columns} 4 | \alias{collate_columns} 5 | \title{Collate Columns Based on Content} 6 | \usage{ 7 | collate_columns( 8 | composed_data, 9 | combine_threshold = 1, 10 | rest_cols = Inf, 11 | retain_other_cols = FALSE, 12 | retain_cell_address = FALSE 13 | ) 14 | } 15 | \arguments{ 16 | \item{composed_data}{output of \code{\link[=compose_cells]{compose_cells}} (preferably not processed)} 17 | 18 | \item{combine_threshold}{a numerical threshold (between 0-1) for content-based collation of columns. (Default 1)} 19 | 20 | \item{rest_cols}{number of rest columns (beyond \code{combine_threshold} joins these many numbers of columns to keep)} 21 | 22 | \item{retain_other_cols}{whether to keep other intermediate (and possibly not so important) columns. (Default \code{FALSE})} 23 | 24 | \item{retain_cell_address}{whether to keep columns like (\code{row}, \code{col}, \code{data_block}). 25 | This may be required for \code{\link[=cell_composition_traceback]{traceback}} (Default \code{FALSE})} 26 | } 27 | \value{ 28 | A column collated data.frame 29 | } 30 | \description{ 31 | After \code{\link[=compose_cells]{compose_cells}}, this function rearranges and rename attribute-columns in order to 32 | make columns properly aligned, based on the content of the columns. 33 | } 34 | \details{ 35 | \itemize{ 36 | \item \strong{Dependency on \emph{stringdist}}: If you have \code{\link[stringdist:stringdist-package]{stringdist}} installed, 37 | the approximate string matching will be enhanced. There may be variations in outcome if you have \code{stringdist} 38 | vs if you don't have it. 39 | \item \strong{Possibility of randomness}: If the attribute column is containing many distinct values, then a column representative sample will be drawn. 40 | Hence it is always recommended to \code{\link[base:set.seed]{set.seed}} if reproducibility is a matter of concern. 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /man/compose_cells.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/compose_cells.R 3 | \name{compose_cells} 4 | \alias{compose_cells} 5 | \title{Compose a Cell Analysis to a tidy form} 6 | \usage{ 7 | compose_cells( 8 | ca, 9 | post_process = TRUE, 10 | attr_sep = " :: ", 11 | discard_raw_cols = FALSE, 12 | print_attribute_overview = FALSE, 13 | silent = FALSE 14 | ) 15 | } 16 | \arguments{ 17 | \item{ca}{a cell_analysis to process} 18 | 19 | \item{post_process}{logical scalar. If disabled a list will be returned without performing post-processing. (Default \code{TRUE})} 20 | 21 | \item{attr_sep}{a character string to separate the attributes. (Default is \verb{::})} 22 | 23 | \item{discard_raw_cols}{logical scalar. If enabled only main processed columns will be returned. (Default \code{FALSE})} 24 | 25 | \item{print_attribute_overview}{print the overview of the attributes (4 distinct values from each attribute of each block)} 26 | 27 | \item{silent}{whether to suppress warning message on compose failure (Default \code{FALSE})} 28 | } 29 | \value{ 30 | a data.frame (as tibble) in tidy form. 31 | } 32 | \description{ 33 | After \code{\link[=analyze_cells]{analyze_cells}} carried out, you may like to use this function in order to 34 | stitch the cells together as per the analyzed results, to form a meaningful structural representation (like tidy format). 35 | } 36 | \examples{ 37 | cd <- 1:(9) \%>\% 38 | matrix(nrow = 3) \%>\% 39 | as_cell_df() 40 | cd <- sample_based_classifier(cd, attribute_sample = "1") 41 | cd <- cd \%>\% dplyr::filter(value != "1") 42 | ca <- analyze_cells(cd) 43 | 44 | compose_cells(ca) 45 | } 46 | -------------------------------------------------------------------------------- /man/get_direction.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/get_direction.R 3 | \name{get_direction} 4 | \alias{get_direction} 5 | \title{get optimal direction (in terms of \code{unpivotr} directions)} 6 | \usage{ 7 | get_direction(d_part) 8 | } 9 | \arguments{ 10 | \item{d_part}{parts of \code{data_attr_map_raw}} 11 | } 12 | \value{ 13 | a string value denoting direction 14 | } 15 | \description{ 16 | get optimal direction (in terms of \code{unpivotr} directions) 17 | } 18 | \details{ 19 | Used internally 20 | } 21 | \keyword{internal} 22 | -------------------------------------------------------------------------------- /man/get_direction_df.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/get_direction_df.R 3 | \name{get_direction_df} 4 | \alias{get_direction_df} 5 | \title{Get Directional Orientation for Attributes corresponding to a given data 6 | group} 7 | \usage{ 8 | get_direction_df(dp, datt, allow_inside = FALSE) 9 | } 10 | \arguments{ 11 | \item{dp}{single data group boundary information} 12 | 13 | \item{datt}{attribute data (including attribute group id)} 14 | } 15 | \value{ 16 | Directional Orientation for Attributes 17 | } 18 | \description{ 19 | Get Directional Orientation for Attributes corresponding to a given data 20 | group 21 | } 22 | \details{ 23 | Used internally by get_data_block_information function 24 | } 25 | \keyword{internal} 26 | -------------------------------------------------------------------------------- /man/get_direction_metric.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/get_direction_metric.R 3 | \name{get_direction_metric} 4 | \alias{get_direction_metric} 5 | \title{get direction metric} 6 | \usage{ 7 | get_direction_metric(d1, a1, direction) 8 | } 9 | \arguments{ 10 | \item{d1}{part of d_part with data_gid} 11 | 12 | \item{a1}{part of d_part with attr_gid} 13 | 14 | \item{direction}{direction name (compatible with \code{unpivotr}) 15 | should be one of \code{\link[=get_unpivotr_direction_names]{get_unpivotr_direction_names}}} 16 | } 17 | \value{ 18 | a scaled fraction denoting coverage (1 means full coverage) for the supplied direction. 19 | } 20 | \description{ 21 | get direction metric 22 | } 23 | \details{ 24 | Used internally by \code{\link[=get_direction]{get_direction}} function 25 | } 26 | \keyword{internal} 27 | -------------------------------------------------------------------------------- /man/get_group_id.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/get_group_id.R 3 | \name{get_group_id} 4 | \alias{get_group_id} 5 | \title{Get Group ID for joined cells} 6 | \usage{ 7 | get_group_id(dat) 8 | } 9 | \arguments{ 10 | \item{dat}{the (data or attribute) cells (in at least row-col format)} 11 | } 12 | \value{ 13 | Group ID attached information in a list 14 | } 15 | \description{ 16 | Get Group ID for joined cells 17 | } 18 | \details{ 19 | Used internally by get_data_block_information function 20 | } 21 | \keyword{internal} 22 | -------------------------------------------------------------------------------- /man/get_unpivotr_direction_names.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/get_unpivotr_direction_names.R 3 | \name{get_unpivotr_direction_names} 4 | \alias{get_unpivotr_direction_names} 5 | \title{get unpivotr direction names} 6 | \usage{ 7 | get_unpivotr_direction_names() 8 | } 9 | \value{ 10 | directions as used in unpivotr package with directional grouping 11 | } 12 | \description{ 13 | get unpivotr direction names 14 | } 15 | \details{ 16 | Kept for compatibility. Used internally by get_direction function. 17 | } 18 | \keyword{internal} 19 | -------------------------------------------------------------------------------- /man/numeric_values_classifier.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/value_attribute_classifiers.R 3 | \name{numeric_values_classifier} 4 | \alias{numeric_values_classifier} 5 | \alias{numeric_values_classifier.default} 6 | \alias{numeric_values_classifier.cell_df} 7 | \title{Value/Attribute Classifier which considers number like cells as values} 8 | \usage{ 9 | numeric_values_classifier(x, ..., allow_chars, verbose = FALSE) 10 | 11 | \method{numeric_values_classifier}{default}(x, ..., allow_chars = NULL, verbose = FALSE) 12 | 13 | \method{numeric_values_classifier}{cell_df}(x, ..., allow_chars, verbose = FALSE) 14 | } 15 | \arguments{ 16 | \item{x}{Either a \code{\link[=cell_df-class]{cell_df}} or 17 | leave blank for function output (required by \code{\link[=value_attribute_classify]{value_attribute_classify}})} 18 | 19 | \item{allow_chars}{(Optional) Allow characters together with numbers. (a character vector)} 20 | 21 | \item{verbose}{Logical scalar for printing discovered cells (which gets new type by this rule)} 22 | } 23 | \description{ 24 | Value/Attribute Classifier which considers number like cells as values 25 | } 26 | \seealso{ 27 | \link[=value_attribute_classify]{value_attribute_classify} 28 | } 29 | \keyword{internal} 30 | -------------------------------------------------------------------------------- /man/pipe.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils-pipe.R 3 | \name{\%>\%} 4 | \alias{\%>\%} 5 | \title{Pipe operator} 6 | \usage{ 7 | lhs \%>\% rhs 8 | } 9 | \description{ 10 | See \code{magrittr::\link[magrittr:pipe]{\%>\%}} for details. 11 | } 12 | \keyword{internal} 13 | -------------------------------------------------------------------------------- /man/read_cell_part-class.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/class-read_cell_part.R 3 | \name{read_cell_part-class} 4 | \alias{read_cell_part-class} 5 | \title{\code{read_cell_part} class} 6 | \description{ 7 | The \code{read_cell_part} class is based on list, 8 | created in order to store (possible) intermediate \code{\link[=read_cells]{read_cells}} output in detailed format. 9 | 10 | \strong{Note:} This class is designed mainly for \emph{internal use}. 11 | It is recommended to manipulate objects of this class only for cases where desired output is not coming. 12 | } 13 | \section{Properties of \code{read_cell_part}}{ 14 | 15 | 16 | Objects of class \code{read_cell_part} \strong{may} have following named nodes (the nodes may change based on stage): 17 | \itemize{ 18 | \item \code{file_name} : file name which was given to process 19 | \item \code{stage} : stage at which it was last processed. 20 | \item \code{info} : a list containing further information on type and content of the file (provided the file got read) 21 | \item \code{is_empty} : whether the file contains no 'known tabular information' 22 | \item \code{cell_list} : list of \code{\link[=cell_df-class]{cell_df}} 23 | (possibly after \verb{Value Attribute Classification} done if stage is higher than \code{make_cells}) 24 | \item \code{cell_analysis_list} : list of \code{\link[=cell_analysis-class]{cell_analysis}} 25 | \item \code{final_composition} : final composition (a \code{tibble}) with only main columns 26 | (or all column if \code{compose_main_cols_only = FALSE}) 27 | \item \code{final_composition_main} : only appear if \code{compose_main_cols_only = FALSE}. 28 | This holds final composition (a \code{tibble}) with only main columns 29 | } 30 | } 31 | 32 | \section{Applicable methods on \code{read_cell_part}}{ 33 | 34 | \itemize{ 35 | \item \code{print}: Prints identifier of the class and the stage at which it is last processed. 36 | } 37 | } 38 | 39 | \keyword{internal} 40 | -------------------------------------------------------------------------------- /man/read_cells.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/read_cells.R 3 | \name{read_cells} 4 | \alias{read_cells} 5 | \title{Read Cells from file} 6 | \usage{ 7 | read_cells( 8 | x, 9 | at_level = c("collate", "detect_and_read", "make_cells", "va_classify", "analyze", 10 | "compose"), 11 | omit = NULL, 12 | simplify = TRUE, 13 | compose_main_cols_only = TRUE, 14 | from_level, 15 | silent = TRUE, 16 | ... 17 | ) 18 | } 19 | \arguments{ 20 | \item{x}{either a valid file path or a \code{\link[=read_cell_part-class]{read_cell_part}}} 21 | 22 | \item{at_level}{till which level to process. 23 | Should be one of \code{detect_and_read}, \code{make_cells}, \code{va_classify}, \code{analyze}, \code{compose}, \code{collate}. 24 | Or simply a number (like 1 means \code{detect_and_read}, 5 means \code{compose}).} 25 | 26 | \item{omit}{(Optional) the file-types to omit. A character vector.} 27 | 28 | \item{simplify}{whether to simplify the output. (Default \code{TRUE}). If set to \code{FALSE} a \code{\link[=read_cell_part-class]{read_cell_part}} 29 | will be returned.} 30 | 31 | \item{compose_main_cols_only}{whether to compose main columns only. (Default \code{TRUE}).} 32 | 33 | \item{from_level}{(Optional) override start level. (\code{read_cells} will process after \code{from_level})} 34 | 35 | \item{silent}{if \code{TRUE} no message will be displayed.(Default \code{TRUE})} 36 | 37 | \item{...}{further arguments} 38 | } 39 | \value{ 40 | If \code{simplify=TRUE} then different kind of object is returned in different levels (depends on \code{at_level}). 41 | If \code{at_level="compose"} then only final tibble is returned otherwise if the output is not \code{NULL} an attribute will be present 42 | named \code{"read_cells_stage"}. 43 | 44 | If \code{simplify=FALSE} then it will return a \code{\link[=read_cell_part-class]{read_cell_part}} which you can process manually 45 | and continue again with \code{read_cells} (perhaps then \code{from_level} may be useful). 46 | } 47 | \description{ 48 | This function is designed to read cell level information 49 | (and the finally \link[=analyze_cells]{analyze}, \link[=compose_cells]{compose} and \link[=collate_columns]{collate_columns}) 50 | from many file types like xls, pdf, doc etc. 51 | This is a wrapper function to functions from multiple packages. The support for a specific file is dependent on 52 | the installed packages. To see the list of supported files and potentially required packages (if any) just 53 | run \code{read_cells()} in the console. This function supports the file format based on content and not based on just the file 54 | extension. That means if a file is saved as pdf and then the extension is removed (or extension modified to say \code{.xlsx}) 55 | then also the \code{read_cells} will detect it as pdf and read its content. 56 | 57 | \strong{Note} : 58 | \itemize{ 59 | \item \code{read_cells} is supposed to work for any kind of data. However, if it fails in intermediate stage it will raise 60 | a warning and give results till successfully processed stage. 61 | \item The heuristic-algorithm are not well-optimized (yet) so may be slow on large files. 62 | \item If the target table has numerical values as data and text as their attribute (identifier of the data elements), 63 | straight forward method is sufficient in the majority of situations. Otherwise, you may need to utilize other functions. 64 | } 65 | 66 | \strong{A Word of Warning} : 67 | 68 | \emph{The functions used inside \code{read_cells} are heuristic-algorithm based. Thus, outcomes may be unexpected. 69 | It is recommend to try \code{read_cells} on the target file. If the outcome is expected., it is fine. 70 | If not try again with \code{read_cells(file_name, at_level = "compose")}. If after that also the output is not as expected 71 | then other functions are required to be used. At that time start again with \code{read_cells(file_name, at_level = "make_cells")} 72 | and proceed to further functions.} 73 | } 74 | \details{ 75 | It performs following set of actions if called with default \code{at_level}. 76 | \itemize{ 77 | \item \strong{detect_and_read}: Detect file type based on content and attempt to read the same in a format suitable to convert as \code{\link[=cell_df-class]{cell_df}}. 78 | \item \strong{make_cells}: Convert the file content to \code{\link[=cell_df-class]{cell_df}} using \code{\link[=as_cell_df]{as_cell_df}}. 79 | \item \strong{va_classify}: Run \link[=value_attribute_classify]{Value Attribute Classification} using \code{\link[=numeric_values_classifier]{numeric_values_classifier}}. 80 | \item \strong{analyze}: Analyze the cells using \code{\link[=analyze_cells]{analyze_cells}}. 81 | \item \strong{compose}: Compose the cell-analysis to a tidy form using \code{\link[=compose_cells]{compose_cells}}. 82 | \item \strong{collate}: Finally, collate columns based on content using \code{\link[=collate_columns]{collate_columns}}. 83 | } 84 | 85 | \if{html}{ 86 | 87 | Here is the flowchart of the same: 88 | 89 | \figure{read_cells.svg}{options: width=400} 90 | 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /man/read_cells_internal.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/read_cells.R 3 | \name{read_cells.read_cell_part} 4 | \alias{read_cells.read_cell_part} 5 | \alias{read_cells.character} 6 | \alias{read_cells.default} 7 | \alias{read_cells.NULL} 8 | \title{Read Cells from file} 9 | \usage{ 10 | \method{read_cells}{read_cell_part}( 11 | x, 12 | at_level = c("collate", "detect_and_read", "make_cells", "va_classify", "analyze", 13 | "compose"), 14 | omit = NULL, 15 | simplify = TRUE, 16 | compose_main_cols_only = TRUE, 17 | from_level, 18 | silent = TRUE, 19 | ... 20 | ) 21 | 22 | \method{read_cells}{character}( 23 | x, 24 | at_level = c("collate", "detect_and_read", "make_cells", "va_classify", "analyze", 25 | "compose"), 26 | omit = NULL, 27 | simplify = TRUE, 28 | compose_main_cols_only = TRUE, 29 | ... 30 | ) 31 | 32 | \method{read_cells}{default}( 33 | x, 34 | at_level = c("collate", "detect_and_read", "make_cells", "va_classify", "analyze", 35 | "compose"), 36 | omit = NULL, 37 | simplify = TRUE, 38 | compose_main_cols_only = TRUE, 39 | from_level, 40 | ... 41 | ) 42 | 43 | \method{read_cells}{`NULL`}(x, ...) 44 | } 45 | \arguments{ 46 | \item{x}{either a valid file path or a \code{\link[=read_cell_part-class]{read_cell_part}}} 47 | 48 | \item{at_level}{till which level to process. 49 | Should be one of \code{detect_and_read}, \code{make_cells}, \code{va_classify}, \code{analyze}, \code{compose}. Or simply a number.} 50 | 51 | \item{omit}{(Optional) the file-types to omit. A character vector.} 52 | 53 | \item{simplify}{whether to simplify the output. (Default \code{TRUE}). If set to \code{FALSE} a \code{\link[=read_cell_part-class]{read_cell_part}} 54 | will be returned.} 55 | 56 | \item{compose_main_cols_only}{whether to compose main columns only. (Default \code{TRUE}).} 57 | 58 | \item{from_level}{(Optional) override start level. (\code{read_cells} will process after \code{from_level})} 59 | 60 | \item{silent}{if \code{TRUE} no message will be displayed.(Default \code{TRUE})} 61 | 62 | \item{...}{further arguments} 63 | } 64 | \description{ 65 | Read Cells from file 66 | } 67 | \seealso{ 68 | \code{\link[=read_cells]{read_cells}} 69 | } 70 | \keyword{internal} 71 | -------------------------------------------------------------------------------- /man/sample_based_classifier.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/value_attribute_classifiers.R 3 | \name{sample_based_classifier} 4 | \alias{sample_based_classifier} 5 | \alias{sample_based_classifier.default} 6 | \alias{sample_based_classifier.cell_df} 7 | \title{Sample Based Value/Attribute Classifier} 8 | \usage{ 9 | sample_based_classifier( 10 | x, 11 | ..., 12 | value_sample, 13 | attribute_sample, 14 | empty_sample, 15 | partial_match = FALSE, 16 | case_insensitive = FALSE, 17 | verbose = FALSE 18 | ) 19 | 20 | \method{sample_based_classifier}{default}( 21 | x, 22 | ..., 23 | value_sample = NULL, 24 | attribute_sample = NULL, 25 | empty_sample = NULL, 26 | partial_match = FALSE, 27 | case_insensitive = FALSE, 28 | verbose = FALSE 29 | ) 30 | 31 | \method{sample_based_classifier}{cell_df}( 32 | x, 33 | ..., 34 | value_sample, 35 | attribute_sample, 36 | empty_sample, 37 | partial_match = FALSE, 38 | case_insensitive = FALSE, 39 | verbose = FALSE 40 | ) 41 | } 42 | \arguments{ 43 | \item{x}{Either a \code{\link[=cell_df-class]{cell_df}} or leave blank for function output 44 | (required by \code{\link[=value_attribute_classify]{value_attribute_classify}})} 45 | 46 | \item{value_sample}{Sample of values (an optional \code{character} vector)} 47 | 48 | \item{attribute_sample}{Sample of attribute (an optional \code{character} vector)} 49 | 50 | \item{empty_sample}{Sample of empty cells (an optional \code{character} vector)} 51 | 52 | \item{partial_match}{Logical scalar indicating whether partial match is allowed. (Default \code{FALSE})} 53 | 54 | \item{case_insensitive}{Logical scalar indicating whether match will be case insensitive. (Default \code{FALSE})} 55 | 56 | \item{verbose}{Logical scalar for printing discovered cells (which gets new type by this rule)} 57 | } 58 | \description{ 59 | Sample Based Value/Attribute Classifier 60 | } 61 | \seealso{ 62 | \link[=value_attribute_classify]{value_attribute_classify} 63 | } 64 | \keyword{internal} 65 | -------------------------------------------------------------------------------- /man/tidycells-package.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tidycells-package.R 3 | \docType{package} 4 | \name{tidycells-package} 5 | \alias{tidycells} 6 | \alias{tidycells-package} 7 | \title{\code{tidycells} package} 8 | \description{ 9 | \strong{Read Tabular Data from Diverse Sources and Easily Make Them Tidy} 10 | } 11 | \details{ 12 | The package provides utilities to read, cells from complex tabular data 13 | and heuristic detection based structural assignment of those cells to a 14 | columnar or tidy format. Read functionality has the ability to read (in 15 | a unified manner) structured, partially structured or unstructured 16 | tabular data (usually spreadsheets for public data dissemination and 17 | aimed for common human understanding) from various types of documents. 18 | The tabular information is read as cells. The 'structure assignment' 19 | functionality has both supervised and unsupervised way of assigning 20 | cells data to columnar/tidy format. Multiple disconnected blocks of 21 | tables in a single sheet are also handled appropriately. These tools are 22 | suitable for unattended conversation of (maybe a pile of) messy tables 23 | (like government data) into a consumable format(usable for further 24 | analysis and data wrangling). 25 | 26 | To get started check out \href{../doc/tidycells-intro.html}{\code{vignette("tidycells-intro")}}. 27 | } 28 | \seealso{ 29 | Useful links: 30 | \itemize{ 31 | \item \url{https://r-rudra.github.io/tidycells/} 32 | \item \url{https://github.com/r-rudra/tidycells} 33 | \item Report bugs at \url{https://github.com/r-rudra/tidycells/issues} 34 | } 35 | 36 | } 37 | \author{ 38 | \strong{Maintainer}: Indranil Gayen \email{nil.gayen@gmail.com} (\href{https://orcid.org/0000-0003-0197-1944}{ORCID}) 39 | 40 | } 41 | \keyword{internal} 42 | -------------------------------------------------------------------------------- /man/validate_cells.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/validate_cells.R 3 | \name{validate_cells} 4 | \alias{validate_cells} 5 | \title{Validate cell-DF} 6 | \usage{ 7 | validate_cells(dat) 8 | } 9 | \arguments{ 10 | \item{dat}{An R object} 11 | } 12 | \value{ 13 | Logical scalar. If the value is \code{FALSE}, it will contain attribute named \code{msg}. 14 | } 15 | \description{ 16 | Validate cell-DF 17 | } 18 | \details{ 19 | It checks for following facts: 20 | \itemize{ 21 | \item Whether \code{dat} is a \code{data.frame} 22 | \item Whether \code{dat} conforms to \code{rc_df} format 23 | \item Whether \code{dat} conforms to \code{cell_df} format 24 | If all the checks are passed then it returns \code{TRUE} else it returns \code{FALSE} with \code{msg} attribute, indicating reason for validation failure. 25 | } 26 | } 27 | \examples{ 28 | # returns TRUE 29 | validate_cells(tibble::tibble(row = 1, col = 2, data_type = "numeric", value = "1")) 30 | 31 | # this is FALSE 32 | chk <- validate_cells(tibble::tibble(row = 1, col = 2, data_type = "numeric")) 33 | 34 | # msg 35 | attr(chk, "msg") 36 | } 37 | \keyword{internal} 38 | -------------------------------------------------------------------------------- /man/value_attribute_classify.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/value_attribute_classify.R 3 | \name{value_attribute_classify} 4 | \alias{value_attribute_classify} 5 | \title{Value/Attribute Classifier} 6 | \usage{ 7 | value_attribute_classify(d, classifier = basic_classifier()) 8 | } 9 | \arguments{ 10 | \item{d}{a Cell DF} 11 | 12 | \item{classifier}{a classifier} 13 | } 14 | \value{ 15 | a Cell DF with Value/Attribute Classification. The underlying tibble will contain an extra column named \code{type}. 16 | } 17 | \description{ 18 | After \code{\link[=as_cell_df]{as_cell_df}} (entry point to \code{tidycells}) you may need to use this function 19 | or individual \emph{Value/Attribute Classifier}-functions as listed below in \emph{"see also"} - section. 20 | 21 | Here the idea is to classify all cells into either \code{value}, \code{attribute}, \code{empty} which will be 22 | used by \code{\link[=analyze_cells]{analyze_cells}} for further processing. 23 | } 24 | \details{ 25 | In order to understand the data orientation and detect data-blocks Cell DF requires additional column named \code{type}. 26 | This \code{type} column potentially contains either \code{value}, \code{attribute}, \code{empty}. The \code{value} are given corresponding to cells 27 | with observations in it. The tag, \code{attribute} is for the identifier of these cells. Lastly, \code{empty} cells are useless cells or 28 | cells with no meaningful information. 29 | 30 | For \code{classifier} following options are present: 31 | \itemize{ 32 | \item \code{basic_classifier} : naive classifier which recode \code{data_type}. 33 | \item \code{sample_based_classifier} : sample-based classifier. 34 | \item \code{numeric_values_classifier} : considers number like cells as values. 35 | } 36 | 37 | Each of the above are available as individual functions. Those can also be directly applied on a \code{cell-df}. 38 | However, it is recommended to use \code{value_attribute_classify} as it tests for integrity after classification. 39 | } 40 | \examples{ 41 | 42 | iris \%>\% 43 | as_cell_df() \%>\% 44 | sample_based_classifier(value_sample = "setosa") \%>\% 45 | plot() 46 | 47 | iris \%>\% 48 | as_cell_df() \%>\% 49 | sample_based_classifier(value_sample = "setosa") \%>\% 50 | numeric_values_classifier() \%>\% 51 | plot() 52 | 53 | if (rlang::is_installed("tidyxl")) { 54 | cdn <- system.file("extdata", "RBI_HBS_Table_No_166.xlsx", package = "tidycells") \%>\% 55 | tidyxl::xlsx_cells() 56 | cdn <- cdn \%>\% 57 | dplyr::filter(sheet == sheet[1]) \%>\% 58 | as_cell_df() 59 | 60 | # all of these are same except value_attribute_classify will perform validate_cells once again 61 | cd1 <- sample_based_classifier(cdn, value_sample = "APR") 62 | cd2 <- sample_based_classifier(value_sample = "APR")(cdn) 63 | cd3 <- value_attribute_classify(cdn, 64 | classifier = sample_based_classifier(value_sample = "APR") 65 | ) 66 | # see it 67 | plot(cd3) 68 | } 69 | } 70 | \seealso{ 71 | Individual classifier functions: 72 | \itemize{ 73 | \item \link[=basic_classifier]{basic_classifier} 74 | \item \link[=sample_based_classifier]{sample_based_classifier} 75 | \item \link[=numeric_values_classifier]{numeric_values_classifier}, 76 | } 77 | 78 | For interactive Value/Attribute Classification check \link[=visual_va_classify]{visual_va_classify} 79 | } 80 | -------------------------------------------------------------------------------- /man/visual_functions.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/visual_functions.R 3 | \name{visual_crop} 4 | \alias{visual_crop} 5 | \alias{visual_va_classify} 6 | \alias{visual_data_block_inspection} 7 | \alias{visual_orientation_modification} 8 | \alias{visual_traceback} 9 | \title{Interactive operations and Visualizations} 10 | \usage{ 11 | visual_crop(d) 12 | 13 | visual_va_classify(d) 14 | 15 | visual_data_block_inspection(x) 16 | 17 | visual_orientation_modification(x) 18 | 19 | visual_traceback(x, dcomp) 20 | } 21 | \arguments{ 22 | \item{d}{a cell_df.} 23 | 24 | \item{x}{a cell_analysis.} 25 | 26 | \item{dcomp}{a composition (a tibble rendered by \code{\link[=compose_cells]{compose_cells}}).} 27 | } 28 | \value{ 29 | \code{visual_crop} returns a cell_df after the interactive operations. 30 | 31 | \code{visual_va_classify} returns a cell_df after the interactive operations. 32 | 33 | \code{visual_orientation_modification} returns a cell_analysis after the interactive operations. 34 | 35 | \code{visual_traceback} returns a composed tidy data (as a tibble). 36 | } 37 | \description{ 38 | These functions require \code{\link[shiny:shiny-package]{shiny}} and \code{miniUI}. 39 | Each of these modules has option for interactive plots (using \code{plotly}). 40 | However, for the same you need to have \code{plotly} installed. 41 | } 42 | \details{ 43 | Each of these functions are designed for interactive sessions only. 44 | Arguments to these functions are optional as it tries to pick those from global environment. 45 | If there are multiple variables defined in global environment which are of same desired class then you will get option to choose 46 | one of them. These are also available as \href{https://rstudio.github.io/rstudioaddins/}{RStudio Addin} 47 | (if you are using RStudio IDE you'll get those addin in the RStudio addin panel). 48 | 49 | Following are the names used in RStudio Addin 50 | \itemize{ 51 | \item \emph{Crop Cells}: \code{visual_crop} 52 | \item \emph{Cell Analysis Inspection}: \code{visual_data_block_inspection} 53 | \item \emph{Orientation Modification}: \code{visual_orientation_modification} 54 | \item \emph{Traceback Composition}: \code{visual_traceback} 55 | \item \emph{Classify Value/Attribute}: \code{visual_va_classify} 56 | } 57 | 58 | For each of the modules it tries to load and unload \code{shiny} before and after the execution of the module. 59 | You can disable the same by setting \code{options(AutoUnloadShiny = FALSE)}. 60 | 61 | \code{visual_traceback} requires package \code{DT}. 62 | If \code{DT} is not present you may use \code{\link[=cell_composition_traceback]{cell_composition_traceback}} 63 | } 64 | \seealso{ 65 | \code{\link[=value_attribute_classify]{value_attribute_classify}} 66 | 67 | \code{\link[=cell_composition_traceback]{cell_composition_traceback}} 68 | } 69 | \keyword{internal} 70 | -------------------------------------------------------------------------------- /pkgdown/favicon/apple-touch-icon-120x120.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-rudra/tidycells/fdf37ecea15ef6513a935845c8d9a408d6fb30c1/pkgdown/favicon/apple-touch-icon-120x120.png -------------------------------------------------------------------------------- /pkgdown/favicon/apple-touch-icon-60x60.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-rudra/tidycells/fdf37ecea15ef6513a935845c8d9a408d6fb30c1/pkgdown/favicon/apple-touch-icon-60x60.png -------------------------------------------------------------------------------- /pkgdown/favicon/apple-touch-icon-76x76.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-rudra/tidycells/fdf37ecea15ef6513a935845c8d9a408d6fb30c1/pkgdown/favicon/apple-touch-icon-76x76.png -------------------------------------------------------------------------------- /pkgdown/favicon/apple-touch-icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-rudra/tidycells/fdf37ecea15ef6513a935845c8d9a408d6fb30c1/pkgdown/favicon/apple-touch-icon.png -------------------------------------------------------------------------------- /pkgdown/favicon/favicon-16x16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-rudra/tidycells/fdf37ecea15ef6513a935845c8d9a408d6fb30c1/pkgdown/favicon/favicon-16x16.png -------------------------------------------------------------------------------- /pkgdown/favicon/favicon-32x32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-rudra/tidycells/fdf37ecea15ef6513a935845c8d9a408d6fb30c1/pkgdown/favicon/favicon-32x32.png -------------------------------------------------------------------------------- /pkgdown/favicon/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-rudra/tidycells/fdf37ecea15ef6513a935845c8d9a408d6fb30c1/pkgdown/favicon/favicon.ico -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(tidycells) 3 | 4 | 5 | test_check("tidycells") 6 | -------------------------------------------------------------------------------- /tests/testthat/test-VA_classifier.R: -------------------------------------------------------------------------------- 1 | test_that("numeric_values_classifier works", { 2 | skip_if_not_installed("tidyxl") 3 | cd <- system.file("extdata", "untidy.xlsx", package = "tidycells") %>% 4 | tidyxl::xlsx_cells() 5 | cd <- as_cell_df(cd) 6 | 7 | cdn <- system.file("extdata", "RBI_HBS_Table_No_166.xlsx", package = "tidycells") %>% 8 | tidyxl::xlsx_cells() 9 | cdn <- cdn %>% 10 | dplyr::filter(sheet == sheet[1]) %>% 11 | as_cell_df() 12 | 13 | cd1 <- numeric_values_classifier(cd) 14 | cd2 <- numeric_values_classifier()(cd) 15 | cd3 <- value_attribute_classify(cd, classifier = numeric_values_classifier()) 16 | 17 | cdn1 <- numeric_values_classifier(cdn) 18 | 19 | expect_output( 20 | numeric_values_classifier(cd, verbose = TRUE), 21 | "New cells detected as different type\n# A tibble: 579 x 6" 22 | ) 23 | expect_output( 24 | numeric_values_classifier(cdn, verbose = TRUE), 25 | "New cells detected as different type\n# A tibble: 50 x 6" 26 | ) 27 | expect_output( 28 | summary(cdn1), 29 | "Types: 838 values, 181 attributes and 50 empty cells" 30 | ) 31 | expect_output( 32 | print(cdn1), 33 | "With Value/Attribute Classification" 34 | ) 35 | expect_identical(cd1, cd2) 36 | expect_identical(cd1, cd3) 37 | expect_equal(c(table(cd1$type)), c("attribute" = 244, "value" = 579)) 38 | }) 39 | 40 | 41 | test_that("sample_based_classifier works", { 42 | skip_if_not_installed("tidyxl") 43 | cdn <- system.file("extdata", "RBI_HBS_Table_No_166.xlsx", package = "tidycells") %>% 44 | tidyxl::xlsx_cells() 45 | cdn <- cdn %>% 46 | dplyr::filter(sheet == sheet[1]) %>% 47 | as_cell_df() 48 | 49 | cd1 <- sample_based_classifier(cdn, value_sample = "APR") 50 | cd2 <- sample_based_classifier(value_sample = "APR")(cdn) 51 | cd3 <- value_attribute_classify(cdn, classifier = sample_based_classifier(value_sample = "APR")) 52 | 53 | 54 | cdn0 <- cdn %>% numeric_values_classifier() 55 | cdn2 <- cdn0 %>% sample_based_classifier(value_sample = "APR") 56 | cdn3 <- cdn0 %>% sample_based_classifier(value_sample = "aug") 57 | 58 | cdn4 <- cdn0 %>% 59 | sample_based_classifier(attribute_sample = "APR") %>% 60 | # revese mapping 61 | sample_based_classifier(value_sample = "APR") 62 | 63 | expect_output(summary(cdn2), "929 values, 90 attributes and 50 empty cells") 64 | expect_output(summary(cdn3), "838 values, 181 attributes and 50 empty cells") 65 | expect_identical(summary(cdn3, no_print = TRUE), summary(cdn0, no_print = TRUE)) 66 | expect_output(summary(cdn4), "929 values, 90 attributes and 50 empty cells") 67 | expect_identical(cd1, cd2) 68 | expect_identical(cd1, cd3) 69 | expect_equal(c(table(cd1$type)), c("attribute" = 137, "value" = 932)) 70 | }) 71 | -------------------------------------------------------------------------------- /tests/testthat/test-analyze_cells.R: -------------------------------------------------------------------------------- 1 | test_that("analyze_cell works: base", { 2 | d <- structure(c( 3 | "block 1", "", "C", "D", "", "block 2", "", "C", 4 | "D", "", "A", "1", "2", "", "", "A", "10", "20", "", "B", "3", 5 | "4", "", "", "B", "30", "40" 6 | ), .Dim = c(9L, 3L)) 7 | d <- as.data.frame(d) 8 | cd <- as_cell_df(d) %>% numeric_values_classifier() 9 | ca <- analyze_cells(cd) 10 | 11 | expect_output(print(ca), "Total blocks: 2") 12 | expect_equal(ca$sections %>% 13 | dplyr::select(-gid) %>% 14 | unlist() %>% 15 | unique() %>% 16 | sort(), c(1, 2, 3, 4, 6, 9)) 17 | expect_identical(ca$cell_df, cd) 18 | expect_equal(names(ca$details), c( 19 | "attr_details", "data_details", 20 | "data_attr_map_raw", "definiteness_checks" 21 | )) 22 | }) 23 | 24 | 25 | test_that("analyze_cell works: tidyxl", { 26 | skip_if_not_installed("tidyxl") 27 | 28 | cd <- system.file("extdata", "untidy.xlsx", package = "tidycells", mustWork = TRUE) %>% 29 | tidyxl::xlsx_cells() %>% 30 | as_cell_df() 31 | 32 | cd <- numeric_values_classifier(cd) 33 | 34 | ca <- analyze_cells(cd) 35 | 36 | dc <- compose_cells(ca, discard_raw_cols = TRUE) 37 | 38 | dc_sel <- dc %>% 39 | dplyr::select(-row, -col, -data_block, -value) %>% 40 | unique() 41 | 42 | expect_equal(ca$details$data_attr_map_raw$data_gid %>% unique() %>% length(), 9) 43 | expect_equal( 44 | dc_sel$minor_1 %>% unique() %>% sort(), 45 | c( 46 | "ABOVE LEFT", "ABOVE LEFT border", "BELOW RIGHT", "BELOW RIGHT border", 47 | "NNE WSW", "NNW WNW :: NNE WSW", "SSE ESE", "SSE ESE Disconnected", 48 | "SSW ENE" 49 | ) 50 | ) 51 | }) 52 | -------------------------------------------------------------------------------- /tests/testthat/test-as_cell_df.R: -------------------------------------------------------------------------------- 1 | 2 | test_that("as_cell_df() works on tidyxl::xlsx_cells on single sheet", { 3 | skip_if_not_installed("tidyxl") 4 | cd <- system.file("extdata", "untidy.xlsx", package = "tidycells") %>% 5 | tidyxl::xlsx_cells() 6 | d0 <- as_cell_df(cd) 7 | 8 | cd <- system.file("extdata", "RBI_HBS_Table_No_166.xlsx", package = "tidycells") %>% 9 | tidyxl::xlsx_cells() 10 | 11 | d1 <- cd %>% 12 | dplyr::filter(sheet == sheet[1]) %>% 13 | as_cell_df() 14 | 15 | expect_identical(d0, as_cell_df(d0)) 16 | expect_true(inherits(d0, "cell_df")) 17 | expect_output(summary(d0), "88 x 25") 18 | expect_true(inherits(d1, "cell_df")) 19 | expect_output(summary(d1), "231 characters and 838 numbers") 20 | }) 21 | 22 | test_that("as_cell_df() does not work on tidyxl::xlsx_cells on multiple sheets", { 23 | skip_if_not_installed("tidyxl") 24 | cd <- system.file("extdata", "RBI_HBS_Table_No_166.xlsx", package = "tidycells") %>% 25 | tidyxl::xlsx_cells() 26 | 27 | expect_error(as_cell_df(cd)) 28 | }) 29 | 30 | test_that("as_cell_df() works on readr::melt_csv", { 31 | skip_if_not_installed("readr") 32 | skip_if_not_installed("datasets") 33 | tf <- tempfile() 34 | write.csv(head(iris, n = 7), tf, row.names = FALSE) 35 | d0 <- readr::melt_csv(tf) %>% as_cell_df() 36 | unlink(tf) 37 | 38 | expect_identical(d0, as_cell_df(d0)) 39 | expect_true(inherits(d0, "cell_df")) 40 | expect_output(summary(d0), "8 x 5") 41 | }) 42 | 43 | test_that("as_cell_df() works on unpivotr::as_cells", { 44 | d0 <- unpivotr::as_cells(data.frame(x = 1:26, letters)) %>% as_cell_df() 45 | 46 | expect_identical(d0, as_cell_df(d0)) 47 | expect_true(inherits(d0, "cell_df")) 48 | expect_output(summary(d0), "26 characters and 26 numbers") 49 | }) 50 | 51 | test_that("as_cell_df() works on data.frame", { 52 | d0 <- as_cell_df(data.frame(x = 1:26, y = letters)) 53 | 54 | expect_identical(d0, as_cell_df(d0)) 55 | expect_true(inherits(d0, "cell_df")) 56 | expect_output(summary(d0), "26 characters and 26 numbers") 57 | 58 | skip_if_not_installed("datasets") 59 | d1 <- as_cell_df(datasets::iris) 60 | expect_true(inherits(d1, "cell_df")) 61 | }) 62 | -------------------------------------------------------------------------------- /tests/testthat/test-cells_to_df.R: -------------------------------------------------------------------------------- 1 | test_that("cell_df to DF (and vice versa) works", { 2 | d0 <- data.frame(x = 1:26, letters, stringsAsFactors = FALSE) 3 | d1 <- as_cell_df(d0) 4 | 5 | d2 <- as.data.frame(d1) 6 | d3 <- as_cell_df(d2) 7 | d4 <- as.data.frame(d3) 8 | 9 | expect_equal(dim(d1), c(52, 4)) 10 | expect_equal(dim(d3), c(52, 4)) 11 | expect_equal(dim(d2), dim(d0)) 12 | expect_equal(dim(d4), dim(d0)) 13 | expect_identical(d2, d4) 14 | expect_identical(d3, as_cell_df(d4)) 15 | }) 16 | -------------------------------------------------------------------------------- /tests/testthat/test-collate_columns.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | test_that("collate_columns works", { 5 | d <- system.file("extdata", "marks_cells.rds", package = "tidycells", mustWork = TRUE) %>% 6 | readRDS() 7 | d <- numeric_values_classifier(d) 8 | da <- analyze_cells(d) 9 | 10 | expect_output( 11 | dc1 <- compose_cells(da, print_attribute_overview = TRUE), 12 | "Nakshatra Gayen, Titas Gupta, Ujjaini Gayen, Utsyo Roy" 13 | ) 14 | dc2 <- compose_cells(da, post_process = FALSE) 15 | cc1 <- collate_columns(dc1) 16 | cc2 <- collate_columns(dc2) 17 | 18 | expect_equal(cc1, cc2) 19 | expect_identical( 20 | cc1 %>% unlist() %>% unique() %>% sort(), 21 | cc2 %>% unlist() %>% unique() %>% sort() 22 | ) 23 | 24 | # all School should appear in single column 25 | expect_equal( 26 | cc1 %>% purrr::map_lgl(~ any(stringr::str_detect(.x, "School"))) %>% sum(), 27 | 1 28 | ) 29 | # all Male/Female should appear in single column 30 | expect_equal( 31 | cc1 %>% purrr::map_lgl(~ any(stringr::str_detect(.x, "Male"))) %>% sum(), 32 | 1 33 | ) 34 | # strict test (may be dropped in future) 35 | expect_equal( 36 | cc1 %>% map(~ .x %>% 37 | unique() %>% 38 | length()) %>% unlist() %>% sort() %>% as.numeric(), 39 | c(1, 2, 3, 3, 12, 12) 40 | ) 41 | }) 42 | 43 | 44 | 45 | 46 | test_that("collate_columns colname convention works", { 47 | expect_error(collate_columns(NULL), "Given composed_data has no known format") 48 | expect_error(collate_columns(tibble()), "Given composed_data has no known format") 49 | expect_error(collate_columns(tibble(col = 1, value = 1, data_block = 1, table_tag = 1, major_1 = 1)), "Given composed_data has no known format") 50 | expect_error(collate_columns(tibble(row = 1, value = 1, data_block = 1, table_tag = 1, major_1 = 1)), "Given composed_data has no known format") 51 | expect_error(collate_columns(tibble(row = 1, col = 1, data_block = 1, table_tag = 1, major_1 = 1)), "Given composed_data has no known format") 52 | expect_error(collate_columns(tibble(row = 1, col = 1, value = 1, table_tag = 1, major_1 = 1)), "Given composed_data has no known format") 53 | }) 54 | -------------------------------------------------------------------------------- /tests/testthat/test-optional_package.R: -------------------------------------------------------------------------------- 1 | test_that("optional_package dependency test", { 2 | fold <- system.file("extdata", "messy", package = "tidycells", mustWork = TRUE) 3 | dm <- tibble::tibble(fn = list.files(fold, full.names = TRUE)) 4 | 5 | dm <- dm %>% 6 | dplyr::mutate(original = dm$fn %>% 7 | purrr::map_chr(~ basename(.x) %>% 8 | stringr::str_split("\\.") %>% 9 | purrr::map_chr(1))) 10 | 11 | # both option for csv must result in same output (on test data at least) 12 | if (is_available("readr")) { 13 | d1 <- read_cells(dm$fn[dm$original == "csv"]) 14 | not_available("readr") 15 | d2 <- read_cells(dm$fn[dm$original == "csv"]) 16 | not_available() 17 | expect_equal(d1, d2) 18 | } 19 | 20 | # both option for xls must result in same output (on test data at least) 21 | if (is_available("xlsx") & is_available("readxl")) { 22 | d1 <- read_cells(dm$fn[dm$original == "xls"]) 23 | not_available("xlsx") 24 | expect_message(d2 <- read_cells(dm$fn[dm$original == "xls"]), "readxl") 25 | expect_equal(d1, d2) 26 | not_available(c("readxl", "docxtractr")) 27 | not_available("cli") 28 | expect_output(possible_to_support(), "readxl") 29 | expect_output(possible_to_support(), "docxtractr") 30 | expect_null(read_cells(dm$fn[dm$original == "xls"])) 31 | not_available() 32 | } 33 | }) 34 | -------------------------------------------------------------------------------- /tests/testthat/test-print_cell_analysis.R: -------------------------------------------------------------------------------- 1 | test_that("print cell-analysis works works", { 2 | x <- head(iris) 3 | cd <- x %>% 4 | as_cell_df(take_col_names = TRUE) %>% 5 | numeric_values_classifier() 6 | ca <- analyze_cells(cd) 7 | 8 | # following should run without error or warning 9 | wnow <- options("warn")[[1]] 10 | options(warn = 2) 11 | 12 | plot(ca) 13 | plot(ca, adaptive_txt_size = FALSE, no_txt = FALSE) 14 | plot(ca, 15 | prior_plot = plot(cd, no_plot = TRUE, adaptive_txt_size = FALSE) 16 | ) 17 | plot(ca, 18 | prior_plot = plot(cd, no_plot = TRUE, adaptive_txt_size = FALSE), 19 | gids = 0 20 | ) 21 | plot(ca, 22 | zoom_selected_gids = TRUE, 23 | dat_att_boundary = TRUE, block_boundary = FALSE 24 | ) 25 | plot(ca, plot_issues = TRUE) 26 | 27 | # also the traceback plot works 28 | cell_composition_traceback(ca, trace_row = 1) 29 | cell_composition_traceback(ca, trace_row = 10) 30 | 31 | options(warn = wnow) 32 | 33 | expect_true(inherits(plot(ca, no_plot = TRUE), "ggplot")) 34 | }) 35 | -------------------------------------------------------------------------------- /tests/testthat/test-print_cell_df.R: -------------------------------------------------------------------------------- 1 | test_that("print cell-df works", { 2 | x <- head(iris) 3 | cd <- x %>% as_cell_df() 4 | cdv <- cd %>% basic_classifier() 5 | g1 <- plot(cd, no_plot = TRUE) 6 | g2 <- plot(cdv, no_plot = TRUE) 7 | # following should run without error or warning 8 | wnow <- options("warn")[[1]] 9 | options(warn = 2) 10 | 11 | plot(cdv, no_fill = TRUE) 12 | plot(cdv, adaptive_txt_size = FALSE) 13 | plot(cdv, no_txt = TRUE) 14 | 15 | options(warn = wnow) 16 | 17 | expect_equal(rlang::as_name(g1$mapping$fill), "data_type") 18 | expect_equal(rlang::as_name(g2$mapping$fill), "type") 19 | }) 20 | -------------------------------------------------------------------------------- /tests/testthat/test-read_cells_real.R: -------------------------------------------------------------------------------- 1 | 2 | test_that("read_cells on real data works I", { 3 | skip_if_not_installed("tidyxl") 4 | 5 | dcpi <- system.file("extdata", "RBI_HBS_Table_No_166.xlsx", package = "tidycells", mustWork = TRUE) %>% 6 | read_cells(at_level = "compose") 7 | 8 | chk1 <- dcpi$major_1 %>% 9 | unique() %>% 10 | stringr::str_detect("[a-zA-Z]") 11 | chk1 <- all(!chk1) 12 | expect_equal(unique(dcpi$minor_2), "Year/Month") 13 | expect_true(chk1) 14 | expect_equal( 15 | dcpi$major_2 %>% unique() %>% tolower() %>% sort(), 16 | month.abb %>% tolower() %>% sort() 17 | ) 18 | }) 19 | 20 | 21 | test_that("sample analysis on real data works II", { 22 | dwss <- readRDS("testdata/WSS.rds") 23 | 24 | da <- analyze_cells(dwss) 25 | dcmp <- compose_cells(da, discard_raw_cols = TRUE) 26 | 27 | chk1 <- dcmp$major_1 %>% 28 | unique() %>% 29 | stringr::str_detect("[a-zA-Z]") 30 | chk1 <- all(!chk1) 31 | 32 | expect_true(chk1) 33 | expect_equal( 34 | dcmp$major_2 %>% unique() %>% sort() %>% .[[15]], 35 | "3 Rupee Securities (including Treasury Bills)" 36 | ) 37 | }) 38 | 39 | 40 | test_that("sample analysis on real data works III", { 41 | cd <- readRDS("testdata/enron_from_unpivotr_processed.rds") 42 | 43 | ca <- analyze_cells(cd) 44 | expect_warning(dcmp <- compose_cells_raw(ca, discard_raw_cols = TRUE, ask_user = FALSE), "failed to compose") 45 | 46 | 47 | expect_equal( 48 | dcmp$data_block %>% unique() %>% length(), 49 | 7 50 | ) 51 | expect_equal( 52 | dcmp$major_1 %>% unique() %>% sort() %>% .[[6]], 53 | "Cash :: to :: 2002-03-01" 54 | ) 55 | }) 56 | -------------------------------------------------------------------------------- /tests/testthat/test-use_cases.R: -------------------------------------------------------------------------------- 1 | 2 | # I'll add few important masked data and expected outcome to make the logic robust 3 | # ex1 added just after CRAN release. Thanks to the user for this contribution 4 | 5 | test_that("use cases from users works - I", { 6 | cd <- readRDS("testusecases/ex1.rds") 7 | ca <- analyze_cells(cd) 8 | d0 <- ca$details$data_attr_map_raw %>% distinct(attr_gid, data_gid, direction, direction_group, attr_group) 9 | chk_str <- d0 %>% 10 | filter(attr_group == "major") %>% 11 | distinct(direction, direction_group) %>% 12 | arrange(direction, direction_group) %>% 13 | summarise(paste0(direction, ",", direction_group, collapse = ";")) %>% 14 | pull() 15 | expect_output(print(ca), "Total blocks: 1") 16 | expect_equal(nrow(ca$details$data_details$group_id_boundary), 1) 17 | expect_equal(chk_str, "E,WE;N,NS;W,WE") 18 | }) 19 | -------------------------------------------------------------------------------- /tests/testthat/testdata/WSS.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-rudra/tidycells/fdf37ecea15ef6513a935845c8d9a408d6fb30c1/tests/testthat/testdata/WSS.rds -------------------------------------------------------------------------------- /tests/testthat/testdata/badcells.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-rudra/tidycells/fdf37ecea15ef6513a935845c8d9a408d6fb30c1/tests/testthat/testdata/badcells.rds -------------------------------------------------------------------------------- /tests/testthat/testdata/doc.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-rudra/tidycells/fdf37ecea15ef6513a935845c8d9a408d6fb30c1/tests/testthat/testdata/doc.doc -------------------------------------------------------------------------------- /tests/testthat/testdata/enron_from_unpivotr_processed.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-rudra/tidycells/fdf37ecea15ef6513a935845c8d9a408d6fb30c1/tests/testthat/testdata/enron_from_unpivotr_processed.rds -------------------------------------------------------------------------------- /tests/testthat/testdata/test.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-rudra/tidycells/fdf37ecea15ef6513a935845c8d9a408d6fb30c1/tests/testthat/testdata/test.xls -------------------------------------------------------------------------------- /tests/testthat/testusecases/ex1.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-rudra/tidycells/fdf37ecea15ef6513a935845c8d9a408d6fb30c1/tests/testthat/testusecases/ex1.rds -------------------------------------------------------------------------------- /tidycells.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: No 4 | SaveWorkspace: No 5 | AlwaysSaveHistory: No 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | PackageCheckArgs: --as-cran 22 | PackageRoxygenize: rd,collate,namespace 23 | 24 | QuitChildProcessesOnExit: Yes 25 | -------------------------------------------------------------------------------- /vignettes/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | *.R 3 | -------------------------------------------------------------------------------- /vignettes/ext/compose_cells_cli1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-rudra/tidycells/fdf37ecea15ef6513a935845c8d9a408d6fb30c1/vignettes/ext/compose_cells_cli1.png -------------------------------------------------------------------------------- /vignettes/ext/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-rudra/tidycells/fdf37ecea15ef6513a935845c8d9a408d6fb30c1/vignettes/ext/logo.png -------------------------------------------------------------------------------- /vignettes/ext/marks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-rudra/tidycells/fdf37ecea15ef6513a935845c8d9a408d6fb30c1/vignettes/ext/marks.png -------------------------------------------------------------------------------- /vignettes/ext/read_cells_out.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-rudra/tidycells/fdf37ecea15ef6513a935845c8d9a408d6fb30c1/vignettes/ext/read_cells_out.png -------------------------------------------------------------------------------- /vignettes/ext/v12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-rudra/tidycells/fdf37ecea15ef6513a935845c8d9a408d6fb30c1/vignettes/ext/v12.png -------------------------------------------------------------------------------- /vignettes/ext/v34.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-rudra/tidycells/fdf37ecea15ef6513a935845c8d9a408d6fb30c1/vignettes/ext/v34.png -------------------------------------------------------------------------------- /vignettes/ext/v56.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-rudra/tidycells/fdf37ecea15ef6513a935845c8d9a408d6fb30c1/vignettes/ext/v56.png --------------------------------------------------------------------------------