├── get_HCP.R
├── analysis.py
├── .gitignore
├── functions.py
└── get_agreements.R


/get_HCP.R:
--------------------------------------------------------------------------------
library(dplyr)
library(httr)
library(rvest)

df <- read.csv(file = 'data/FWS_Species_Data_Explorer_HCPs.csv', header = TRUE, stringsAsFactors = FALSE)
base <- 'https://ecos.fws.gov'

# out_df <- data.frame('file' = c(), 'id' = c(), 'title' = c())
for(i in 1:nrow(df)){
  # columns 7-9 of the FWS export hold the plan id, plan title, and the relative link to the plan page
  id <- df[i, 7]
  title <- df[i, 8]
  suffix <- df[i, 9]
  url <- paste(base, suffix, sep = '')

  # collect the href of every link to a PDF on the plan page
  page <- read_html(url)
  links <- html_nodes(page, 'a')
  pdfs <- links[grep('.pdf', links, fixed = TRUE)] # fixed = TRUE so the '.' is a literal dot, not a regex wildcard
  files <- html_attr(pdfs, 'href')

  if(length(pdfs) == 0){
    row <- data.frame('file' = NA, 'id' = id, 'title' = title)
  }else{
    row <- data.frame('file' = files, 'id' = id, 'title' = title)
  }

  if(i == 1){
    out_df <- row
  }else{
    out_df <- bind_rows(out_df, row)
  }

}

write.csv(out_df, file = 'data/HCP_docs_8Jun21.csv')

--------------------------------------------------------------------------------
/analysis.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 25 16:51:04 2021

@author: MEvans
"""
import argparse
import pytesseract
import pandas as pd
from os import walk, path
from functions import process_file

parser = argparse.ArgumentParser()
parser.add_argument('--directory', '-d', required = True, help = 'root directory to look for files for OCR', type = str)
parser.add_argument('--outDir', '-o', required = True, help = 'root directory into which to copy files', type = str)
parser.add_argument('--tessExc', '-t', required = True, help = 'location of tesseract executable', type = str)
args = parser.parse_args()

# point pytesseract at the Tesseract executable supplied on the command line
pytesseract.pytesseract.tesseract_cmd = args.tessExc

# walk the input tree and run process_file() on every PDF found
rows = []
for root, dirs, files in walk(args.directory):
    for file in files:
        if path.splitext(file)[1] == '.pdf':
            row = process_file(root, file, args.outDir)
            rows.append(row)

df = pd.DataFrame(rows, columns = ['file', 'region', 'hcp', 'npages', 'ocr'])
df.to_csv(f'{args.outDir}/metadata.csv')
--------------------------------------------------------------------------------
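analysis.py is meant to be run from the command line. A hypothetical invocation, in which every path is a placeholder rather than a file shipped with this repo:

    python analysis.py -d data/HCPs -o output -t "C:/Program Files/Tesseract-OCR/tesseract.exe"

process_file() (defined in functions.py) recovers the 'region' and 'hcp' columns of metadata.csv from the last three components of each PDF's parent path, so the tree passed to --directory is assumed to be laid out as <directory>/<region>/<hcp>/<plan>.pdf.
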
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# Data files
data/
output/
--------------------------------------------------------------------------------
/functions.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 25 16:47:29 2021

@author: MEvans
"""

import PyPDF2
from pytesseract import image_to_string
import cv2
import os
from os import path
import numpy as np
from pdf2image import convert_from_path
from os.path import join
from shutil import copyfile

def check_file(file):
    """Get the number of pages in a PDF and check if text is OCR'd
    Parameters:
        file (str): path to file
    Return:
        tpl (int, bool): number of pages, does the first page of file contain text?
    """
    # creating a pdf file object
    pdfFileObj = open(file, 'rb')

    # creating a pdf reader object
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)

    # count the pages and test the first page for a text layer (skip encrypted files)
    if not pdfReader.isEncrypted:
        npages = pdfReader.numPages
        print(file, 'contained', npages, 'pages')

        # creating a page object
        pageObj = pdfReader.getPage(0)

        text = pageObj.extractText()

        hasText = len(text) > 0

    else:
        # encrypted files can't be inspected here; report them as having text so they are copied unchanged
        hasText = True
        npages = 0

    # close the pdf file object
    pdfFileObj.close()

    return npages, hasText

def ocr_file(file, out, preprocess = 'thresh'):
    """Rasterize a PDF, OCR each page with Tesseract, and write the text to <out>/<basename>.txt"""
    # get the basename of the file
    base = path.splitext(path.basename(file))[0]
    print('file basename', base)
    # we have pdfs coming in, so convert to images; pdf2image returns a list of PIL images
    pages = convert_from_path(file)

    output = ''
    for page in pages:
        # PIL images aren't in the format needed by opencv, so convert to a numpy array first
        array = np.array(page)
        print('shape of current page array', array.shape)
        gray = cv2.cvtColor(array, cv2.COLOR_RGB2GRAY)

        if preprocess == 'thresh':
            # Otsu's method chooses the threshold automatically; THRESH_BINARY alone with a
            # threshold of 0 would turn nearly every pixel white
            gray = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]

        elif preprocess == 'blur':
            gray = cv2.medianBlur(gray, 3)

        text = image_to_string(gray)
        output = output + text

    with open(f'{out}/{base}.txt', 'w') as f:
        f.write(output)

def process_file(root, file, out):
    """Check one PDF and either copy it (if it already has a text layer) or OCR it into the output tree."""
    infile = join(root, file)
    # the input tree is assumed to be laid out as .../<region>/<hcp>/<file>.pdf
    parts = root.split('/')
    region = parts[-3]
    hcp = parts[-2]
    outpath = join(out, '/'.join(parts[-3:]))
    outfile = join(outpath, file)

    os.makedirs(outpath, exist_ok = True)

    # check_file takes the full filepath and returns the number of pages and whether text was extracted
    npages, hasText = check_file(infile)
    # if text was extracted, copy the original file over to the output tree unchanged
    if hasText:
        print('copying', infile, 'to', outfile)
        copyfile(infile, outfile)
        ocr = 'Yes'
    else:
        print('ocr-ing', outfile)
        ocr_file(infile, outpath)
        ocr = 'No'

    # 'ocr' records whether the PDF already contained a text layer
    row = [file, region, hcp, npages, ocr]
    return row
--------------------------------------------------------------------------------
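The helpers above can also be called one file at a time. A minimal sketch, assuming Tesseract is already on the PATH (analysis.py instead sets pytesseract.pytesseract.tesseract_cmd explicitly) and using placeholder names example_plan.pdf and output that do not ship with this repo; preprocess accepts 'thresh' (Otsu binarization, the default) or 'blur' (median filter):

    from functions import check_file, ocr_file

    npages, has_text = check_file('example_plan.pdf')
    if has_text:
        print(npages, 'pages with an existing text layer; no OCR needed')
    else:
        # rasterizes every page, runs Tesseract, and writes output/example_plan.txt
        ocr_file('example_plan.pdf', 'output', preprocess = 'thresh')
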
/get_agreements.R:
--------------------------------------------------------------------------------
library(dplyr)
library(httr)
library(rvest)
library(ecosscraper)

# first pass: scrape conservation agreements (CCAA, CCA, HCP, SHA) from each listed species' ECOS page
TECP_table <- get_TECP_table()
spp_agmt <- data.frame(Type = character(0), Plan = character(0), Link = character(0), Spp = character(0))
for(i in 1:nrow(TECP_table)){
  url <- TECP_table$Species_Page[i]
  species <- TECP_table$Scientific_Name[i]
  page <- read_html(url)
  tables <- try(html_nodes(page, "table"))
  if(length(grep("(CCAA Plan)", tables)) > 0){
    CCAA_table <- tables[grep("(CCAA)", tables)]
    CCAAs <- html_table(CCAA_table)[[1]][1][,1]
    CCAA_links <- html_nodes(CCAA_table, "a")
    CCAA_df <- data.frame(Type = rep("CCAA", length(CCAAs)),
                          Plan = CCAAs,
                          Link = html_attr(CCAA_links, "href"),
                          Spp = rep(species, length(CCAAs)))
    spp_agmt <- rbind(spp_agmt, CCAA_df)
  }
  if(length(grep("(CCA Plan)", tables)) > 0){
    CCA_table <- tables[grep("(CCA )", tables)]
    CCAs <- html_table(CCA_table)[[1]][1][,1]
    CCA_links <- html_nodes(CCA_table, "a")
    CCA_df <- data.frame(Type = rep("CCA", length(CCAs)),
                         Plan = CCAs,
                         Link = html_attr(CCA_links, "href"),
                         Spp = rep(species, length(CCAs)))
    spp_agmt <- rbind(spp_agmt, CCA_df)
  }
  if(length(grep("(HCP Plan)", tables)) > 0){
    HCP_table <- tables[grep("(HCP Plan)", tables)]
    HCPs <- html_table(HCP_table)[[1]][1][,1]
    HCP_links <- html_nodes(HCP_table, "a")
    HCP_df <- data.frame(Type = rep("HCP", length(HCPs)),
                         Plan = HCPs,
                         Link = html_attr(HCP_links, "href"),
                         Spp = rep(species, length(HCPs)))
    spp_agmt <- rbind(spp_agmt, HCP_df)
  }
  if(length(grep("(SHA Plan)", tables)) > 0){
    SHA_table <- tables[grep("(SHA)", tables)]
    SHAs <- html_table(SHA_table)[[1]][1][,1]
    SHA_links <- html_nodes(SHA_table, "a")
    SHA_df <- data.frame(Type = rep("SHA", length(SHAs)),
                         Plan = SHAs,
                         Link = html_attr(SHA_links, "href"),
                         Spp = rep(species, length(SHAs)))
    spp_agmt <- rbind(spp_agmt, SHA_df)
  }
  rm(tables)
}
spp_agmt$ScrapeDate <- Sys.Date()

# one row per plan link found via species pages
plans_spp <- group_by(spp_agmt, Link) %>%
  summarise(Plan = first(Plan), Type = first(Type))

# pull the plan-level data for each link found via species pages
plans_from_spp <- data.frame()
for(i in 1:nrow(plans_spp)){
  url <- paste("https://ecos.fws.gov", plans_spp$Link[i], sep = "")
  page <- read_html(url)
  tab <- ecosscraper::get_conservation_plan_data(url, "")
  plans_from_spp <- bind_rows(plans_from_spp, as.data.frame(tab))
}

# unfinished helper stub; commented out so that page_agmt below is created at the top level
# get_conservation_plan_links <- function(types, method = "spp")

# second pass: get plan links from the ECOS regional conservation plan pages
page_agmt <- data.frame(Type = character(0), Plan = character(0), Link = character(0))
for(i in c("HCP", "SHA", "CCA", "CCAA")){
  url <- paste("https://ecos.fws.gov/ecp0/conservationPlan/region?region=9&type=", i, sep = "")
  page <- read_html(url)
  opt_nodes <- html_nodes(page, "option")
  ids <- html_attr(opt_nodes, "value")
  names <- html_text(opt_nodes)
  links <- paste("https://ecos.fws.gov/ecp0/conservationPlan/plan?plan_id=", ids, sep = "")
  type <- i
  df <- data.frame(Type = type, Plan = names, Link = links)
  page_agmt <- bind_rows(page_agmt, df)
  rm(df)
}

# one row per plan link found via the regional pages
plans_page <- group_by(page_agmt, Link) %>%
  summarise(Plan = first(Plan), Type = first(Type))

page_not_spp <- plans_page[!plans_page$Link %in% plans_spp$Link, ]
spp_not_page <- plans_spp[!plans_spp$Link %in% plans_page$Link, ]

# pull plan-level data for links found only via the regional pages
plans_from_pages <- data.frame()
for(i in 1:nrow(page_not_spp)){
  url <- page_not_spp$Link[i]
  page <- read_html(url)
  tab <- ecosscraper::get_conservation_plan_data(url, "")
  plans_from_pages <- bind_rows(plans_from_pages, as.data.frame(tab))
}

diff <- function(m1, m2){
  d <- m1 - m2
  z <- (d - mean(d))/sd(d)
  p <- pnorm(abs(z), lower.tail = FALSE)
  return(list("d" = d, "z" = z, "p" = p))
}

weights <- function(d, p){
  d2 <- p*d
  z2 <- (d2 - mean(d2))/sd(d2)
  p2 <- pnorm(abs(z2), lower.tail = FALSE)
  return(list("d" = d2, "z" = z2, "p" = p2))
}

# assumes objects dif1 ... dif10 exist in the workspace (lists with $d and $p, e.g. outputs of diff())
for (i in 1:10){
  di <- get(paste("dif", i, sep = ""))
  dif <- weights(di$d, di$p)
}

# fill in the agreement type for the combined plan table; plans_all, links_spp, and links_page
# are assumed to exist in the workspace (combined and per-source versions of the tables built above)
plans_all$Type <- vapply(1:nrow(plans_all), function(i){
  type <- links_spp$Type[links_spp$Plan == plans_all$Plan_Name[i]]
  if (length(type) == 0) {type <- links_page$Type[links_page$Plan == plans_all$Plan_Name[i]]}
  if (length(type) == 0) {type <- ""}
  # take the first match so vapply always receives a length-1 result
  return(type[1])},
  USE.NAMES = FALSE, FUN.VALUE = character(1))

--------------------------------------------------------------------------------