├── get_HCP.R
├── analysis.py
├── .gitignore
├── functions.py
└── get_agreements.R


/get_HCP.R:
--------------------------------------------------------------------------------
library(dplyr)
library(httr)
library(rvest)

df <- read.csv(file = 'data/FWS_Species_Data_Explorer_HCPs.csv', header = TRUE, stringsAsFactors = FALSE)
base <- 'https://ecos.fws.gov'

# out_df <- data.frame('file' = c(), 'id' = c(), 'title' = c())
for(i in 1:nrow(df)){
  # columns 7-9 of the FWS export hold the plan id, plan title, and the relative link to the plan page
  id <- df[i, 7]
  title <- df[i, 8]
  suffix <- df[i, 9]
  url <- paste(base, suffix, sep = '')

  # collect the href of every link to a PDF on the plan page
  page <- read_html(url)
  links <- html_nodes(page, 'a')
  pdfs <- links[grep('.pdf', links, fixed = TRUE)] # fixed = TRUE so the '.' is a literal dot, not a regex wildcard
  files <- html_attr(pdfs, 'href')

  if(length(pdfs) == 0){
    row <- data.frame('file' = NA, 'id' = id, 'title' = title)
  }else{
    row <- data.frame('file' = files, 'id' = id, 'title' = title)
  }

  if(i == 1){
    out_df <- row
  }else{
    out_df <- bind_rows(out_df, row)
  }

}

write.csv(out_df, file = 'data/HCP_docs_8Jun21.csv')

--------------------------------------------------------------------------------
/analysis.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 25 16:51:04 2021

@author: MEvans
"""
import argparse
import pytesseract
import pandas as pd
from os import walk, path
from functions import process_file

parser = argparse.ArgumentParser()
parser.add_argument('--directory', '-d', required = True, help = 'root directory to look for files for OCR', type = str)
parser.add_argument('--outDir', '-o', required = True, help = 'root directory into which to copy files', type = str)
parser.add_argument('--tessExc', '-t', required = True, help = 'location of tesseract executable', type = str)
args = parser.parse_args()

# point pytesseract at the Tesseract executable supplied on the command line
pytesseract.pytesseract.tesseract_cmd = args.tessExc

# walk the input tree and run process_file() on every PDF found
rows = []
for root, dirs, files in walk(args.directory):
    for file in files:
        if path.splitext(file)[1] == '.pdf':
            row = process_file(root, file, args.outDir)
            rows.append(row)

df = pd.DataFrame(rows, columns = ['file', 'region', 'hcp', 'npages', 'ocr'])
df.to_csv(f'{args.outDir}/metadata.csv')
--------------------------------------------------------------------------------
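analysis.py is meant to be run from the command line. A hypothetical invocation, in which every path is a placeholder rather than a file shipped with this repo:

    python analysis.py -d data/HCPs -o output -t "C:/Program Files/Tesseract-OCR/tesseract.exe"

process_file() (defined in functions.py) recovers the 'region' and 'hcp' columns of metadata.csv from the last three components of each PDF's parent path, so the tree passed to --directory is assumed to be laid out as <directory>/<region>/<hcp>/<plan>.pdf.
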
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# Data files
data/
output/
--------------------------------------------------------------------------------
/functions.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 25 16:47:29 2021

@author: MEvans
"""

import PyPDF2
from pytesseract import image_to_string
import cv2
import os
from os import path
import numpy as np
from pdf2image import convert_from_path
from os.path import join
from shutil import copyfile

def check_file(file):
    """Get the number of pages in a PDF and check if text is OCR'd
    Parameters:
        file (str): path to file
    Return:
        tpl (int, bool): number of pages, does the first page of file contain text?
    """
    # creating a pdf file object
    pdfFileObj = open(file, 'rb')

    # creating a pdf reader object
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)

    # count the pages and test the first page for a text layer (skip encrypted files)
    if not pdfReader.isEncrypted:
        npages = pdfReader.numPages
        print(file, 'contained', npages, 'pages')

        # creating a page object
        pageObj = pdfReader.getPage(0)

        text = pageObj.extractText()

        hasText = len(text) > 0

    else:
        # encrypted files can't be inspected here; report them as having text so they are copied unchanged
        hasText = True
        npages = 0

    # close the pdf file object
    pdfFileObj.close()

    return npages, hasText

def ocr_file(file, out, preprocess = 'thresh'):
    """Rasterize a PDF, OCR each page with Tesseract, and write the text to <out>/<basename>.txt"""
    # get the basename of the file
    base = path.splitext(path.basename(file))[0]
    print('file basename', base)
    # we have pdfs coming in, so convert to images; pdf2image returns a list of PIL images
    pages = convert_from_path(file)

    output = ''
    for page in pages:
        # PIL images aren't in the format needed by opencv, so convert to a numpy array first
        array = np.array(page)
        print('shape of current page array', array.shape)
        gray = cv2.cvtColor(array, cv2.COLOR_RGB2GRAY)

        if preprocess == 'thresh':
            # Otsu's method chooses the threshold automatically; THRESH_BINARY alone with a
            # threshold of 0 would turn nearly every pixel white
            gray = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]

        elif preprocess == 'blur':
            gray = cv2.medianBlur(gray, 3)

        text = image_to_string(gray)
        output = output + text

    with open(f'{out}/{base}.txt', 'w') as f:
        f.write(output)

def process_file(root, file, out):
    """Check one PDF and either copy it (if it already has a text layer) or OCR it into the output tree."""
    infile = join(root, file)
    # the input tree is assumed to be laid out as .../<region>/<hcp>/<file>.pdf
    parts = root.split('/')
    region = parts[-3]
    hcp = parts[-2]
    outpath = join(out, '/'.join(parts[-3:]))
    outfile = join(outpath, file)

    os.makedirs(outpath, exist_ok = True)

    # check_file takes the full filepath and returns the number of pages and whether text was extracted
    npages, hasText = check_file(infile)
    # if text was extracted, copy the original file over to the output tree unchanged
    if hasText:
        print('copying', infile, 'to', outfile)
        copyfile(infile, outfile)
        ocr = 'Yes'
    else:
        print('ocr-ing', outfile)
        ocr_file(infile, outpath)
        ocr = 'No'

    # 'ocr' records whether the PDF already contained a text layer
    row = [file, region, hcp, npages, ocr]
    return row
--------------------------------------------------------------------------------
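The helpers above can also be called one file at a time. A minimal sketch, assuming Tesseract is already on the PATH (analysis.py instead sets pytesseract.pytesseract.tesseract_cmd explicitly) and using placeholder names example_plan.pdf and output that do not ship with this repo; preprocess accepts 'thresh' (Otsu binarization, the default) or 'blur' (median filter):

    from functions import check_file, ocr_file

    npages, has_text = check_file('example_plan.pdf')
    if has_text:
        print(npages, 'pages with an existing text layer; no OCR needed')
    else:
        # rasterizes every page, runs Tesseract, and writes output/example_plan.txt
        ocr_file('example_plan.pdf', 'output', preprocess = 'thresh')
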
/get_agreements.R:
--------------------------------------------------------------------------------
library(dplyr)
library(httr)
library(rvest)
library(ecosscraper)

# first pass: scrape conservation agreements (CCAA, CCA, HCP, SHA) from each listed species' ECOS page
TECP_table <- get_TECP_table()
spp_agmt <- data.frame(Type = character(0), Plan = character(0), Link = character(0), Spp = character(0))
for(i in 1:nrow(TECP_table)){
  url <- TECP_table$Species_Page[i]
  species <- TECP_table$Scientific_Name[i]
  page <- read_html(url)
  tables <- try(html_nodes(page, "table"))
  if(length(grep("(CCAA Plan)", tables)) > 0){
    CCAA_table <- tables[grep("(CCAA)", tables)]
    CCAAs <- html_table(CCAA_table)[[1]][1][,1]
    CCAA_links <- html_nodes(CCAA_table, "a")
    CCAA_df <- data.frame(Type = rep("CCAA", length(CCAAs)),
                          Plan = CCAAs,
                          Link = html_attr(CCAA_links, "href"),
                          Spp = rep(species, length(CCAAs)))
    spp_agmt <- rbind(spp_agmt, CCAA_df)
  }
  if(length(grep("(CCA Plan)", tables)) > 0){
    CCA_table <- tables[grep("(CCA )", tables)]
    CCAs <- html_table(CCA_table)[[1]][1][,1]
    CCA_links <- html_nodes(CCA_table, "a")
    CCA_df <- data.frame(Type = rep("CCA", length(CCAs)),
                         Plan = CCAs,
                         Link = html_attr(CCA_links, "href"),
                         Spp = rep(species, length(CCAs)))
    spp_agmt <- rbind(spp_agmt, CCA_df)
  }
  if(length(grep("(HCP Plan)", tables)) > 0){
    HCP_table <- tables[grep("(HCP Plan)", tables)]
    HCPs <- html_table(HCP_table)[[1]][1][,1]
    HCP_links <- html_nodes(HCP_table, "a")
    HCP_df <- data.frame(Type = rep("HCP", length(HCPs)),
                         Plan = HCPs,
                         Link = html_attr(HCP_links, "href"),
                         Spp = rep(species, length(HCPs)))
    spp_agmt <- rbind(spp_agmt, HCP_df)
  }
  if(length(grep("(SHA Plan)", tables)) > 0){
    SHA_table <- tables[grep("(SHA)", tables)]
    SHAs <- html_table(SHA_table)[[1]][1][,1]
    SHA_links <- html_nodes(SHA_table, "a")
    SHA_df <- data.frame(Type = rep("SHA", length(SHAs)),
                         Plan = SHAs,
                         Link = html_attr(SHA_links, "href"),
                         Spp = rep(species, length(SHAs)))
    spp_agmt <- rbind(spp_agmt, SHA_df)
  }
  rm(tables)
}
spp_agmt$ScrapeDate <- Sys.Date()

# one row per plan link found via species pages
plans_spp <- group_by(spp_agmt, Link) %>%
  summarise(Plan = first(Plan), Type = first(Type))

# pull the plan-level data for each link found via species pages
plans_from_spp <- data.frame()
for(i in 1:nrow(plans_spp)){
  url <- paste("https://ecos.fws.gov", plans_spp$Link[i], sep = "")
  page <- read_html(url)
  tab <- ecosscraper::get_conservation_plan_data(url, "")
  plans_from_spp <- bind_rows(plans_from_spp, as.data.frame(tab))
}

# unfinished helper stub; commented out so that page_agmt below is created at the top level
# get_conservation_plan_links <- function(types, method = "spp")

# second pass: get plan links from the ECOS regional conservation plan pages
page_agmt <- data.frame(Type = character(0), Plan = character(0), Link = character(0))
for(i in c("HCP", "SHA", "CCA", "CCAA")){
  url <- paste("https://ecos.fws.gov/ecp0/conservationPlan/region?region=9&type=", i, sep = "")
  page <- read_html(url)
  opt_nodes <- html_nodes(page, "option")
  ids <- html_attr(opt_nodes, "value")
  names <- html_text(opt_nodes)
  links <- paste("https://ecos.fws.gov/ecp0/conservationPlan/plan?plan_id=", ids, sep = "")
  type <- i
  df <- data.frame(Type = type, Plan = names, Link = links)
  page_agmt <- bind_rows(page_agmt, df)
  rm(df)
}

# one row per plan link found via the regional pages
plans_page <- group_by(page_agmt, Link) %>%
  summarise(Plan = first(Plan), Type = first(Type))

page_not_spp <- plans_page[!plans_page$Link %in% plans_spp$Link, ]
spp_not_page <- plans_spp[!plans_spp$Link %in% plans_page$Link, ]

# pull plan-level data for links found only via the regional pages
plans_from_pages <- data.frame()
for(i in 1:nrow(page_not_spp)){
  url <- page_not_spp$Link[i]
  page <- read_html(url)
  tab <- ecosscraper::get_conservation_plan_data(url, "")
  plans_from_pages <- bind_rows(plans_from_pages, as.data.frame(tab))
}

diff <- function(m1, m2){
  d <- m1 - m2
  z <- (d - mean(d))/sd(d)
  p <- pnorm(abs(z), lower.tail = FALSE)
  return(list("d" = d, "z" = z, "p" = p))
}

weights <- function(d, p){
  d2 <- p*d
  z2 <- (d2 - mean(d2))/sd(d2)
  p2 <- pnorm(abs(z2), lower.tail = FALSE)
  return(list("d" = d2, "z" = z2, "p" = p2))
}

# assumes objects dif1 ... dif10 exist in the workspace (lists with $d and $p, e.g. outputs of diff())
for (i in 1:10){
  di <- get(paste("dif", i, sep = ""))
  dif <- weights(di$d, di$p)
}

# fill in the agreement type for the combined plan table; plans_all, links_spp, and links_page
# are assumed to exist in the workspace (combined and per-source versions of the tables built above)
plans_all$Type <- vapply(1:nrow(plans_all), function(i){
  type <- links_spp$Type[links_spp$Plan == plans_all$Plan_Name[i]]
  if (length(type) == 0) {type <- links_page$Type[links_page$Plan == plans_all$Plan_Name[i]]}
  if (length(type) == 0) {type <- ""}
  # take the first match so vapply always receives a length-1 result
  return(type[1])},
  USE.NAMES = FALSE, FUN.VALUE = character(1))

--------------------------------------------------------------------------------