├── requirements.txt
├── README.md
└── scripts
    ├── helper_scripts
        ├── movestuff.sh
        ├── README.md
        └── gdrive_upload_bulk.py
    ├── experimental
        ├── README.md
        └── Trigraph_Finder.py
    ├── NLP
        ├── wordripper.py
        ├── sentence_tokenize.py
        ├── stopword_filter.py
        ├── speechparts.py
        ├── bulk-wordripper.py
        ├── bulk-speechparts.py
        └── README.md
    └── OCR
        ├── local OCR processing
            ├── striptext.sh
            └── OCR.sh
        ├── google cloud processing
            ├── client_secret.json
            └── OCR_drive.py
        └── README.md


/requirements.txt:
--------------------------------------------------------------------------------
1 | ocrmypdf
2 | tesseract-ocr
3 | unpaper
4 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # KrangTools
2 | KrangTools is a set of scripts that can be used in text processing for OCR and NLP pipeline building.
3 | 


--------------------------------------------------------------------------------
/scripts/helper_scripts/movestuff.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | for file in $(ls -p | grep -v / | head -15000)
3 | do
4 | # Change this value
5 | mv $file /dir/foo
6 | done
7 | 


--------------------------------------------------------------------------------
/scripts/experimental/README.md:
--------------------------------------------------------------------------------
1 | # Experimental! All scripts in this file are considered experimental! They won't hurt anything, but they also may not function cleanly
2 | 


--------------------------------------------------------------------------------
/scripts/NLP/wordripper.py:
--------------------------------------------------------------------------------
1 | import io
2 | from nltk.tokenize import sent_tokenize, word_tokenize
3 | 
4 | with io.open('filename.txt', 'r', encoding="UTF8") as myfile:
5 |     data=myfile.read()
6 | 
7 | print(word_tokenize(data))
8 | 


--------------------------------------------------------------------------------
/scripts/OCR/local OCR processing/striptext.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | 
3 | #This one-liner will rip raw text files from PDFs producing a text file with the same name in the same directory
4 | for file in *.pdf; do pdftotext "$file" "$file.txt"; done
5 | 


--------------------------------------------------------------------------------
/scripts/NLP/sentence_tokenize.py:
--------------------------------------------------------------------------------
1 | import io
2 | from nltk.tokenize import sent_tokenize, word_tokenize
3 | 
4 | with io.open('filename.txt', 'r', encoding="UTF8") as myfile:
5 |     data=myfile.read().replace('\n', '')
6 | 
7 | print(sent_tokenize(data))
8 | 


--------------------------------------------------------------------------------
/scripts/NLP/stopword_filter.py:
--------------------------------------------------------------------------------
1 | from nltk.corpus import stopwords
2 | from nltk.tokenize import word_tokenize
3 | 
4 | filename = "filename.txt"
5 | file = open(filename, "r")
6 | for line in file:
7 |   stop_words = set(stopwords.words("english"))
8 |   print(stop_words)
9 | 


--------------------------------------------------------------------------------
/scripts/NLP/speechparts.py:
--------------------------------------------------------------------------------
 1 | import nltk
 2 | import io
 3 | from nltk.tokenize import sent_tokenize, word_tokenize
 4 | 
 5 | with io.open('filename.txt', 'r', encoding="UTF8") as myfile:
 6 |     data=myfile.read().replace('\n', '')
 7 | 
 8 | text = word_tokenize(data)
 9 | finished = nltk.pos_tag(text)
10 | 
11 | print(finished)
12 | 


--------------------------------------------------------------------------------
/scripts/OCR/local OCR processing/OCR.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | for f in *.pdf;
 4 | do
 5 |   echo "Running OCR on "$f""
 6 |   ocrmypdf -v --deskew --clean --clean-final "$f" "$f";
 7 | 
 8 |   echo "moving ocr'd file"
 9 | #  Make sure this value is changed to fit your destination!
10 |   mv "$f" /root/OCRd
11 | 
12 |   echo "finished"
13 | done
14 | 


--------------------------------------------------------------------------------
/scripts/NLP/bulk-wordripper.py:
--------------------------------------------------------------------------------
 1 | import io
 2 | import os
 3 | from nltk.tokenize import sent_tokenize, word_tokenize
 4 | 
 5 | for filename in os.listdir("."):
 6 |     if filename.endswith(".txt"):
 7 |         with io.open(filename, 'r', encoding="UTF8") as f:
 8 |             data=f.read()
 9 |             print(word_tokenize(data))
10 |     else:
11 |         continue
12 | 


--------------------------------------------------------------------------------
/scripts/OCR/google cloud processing/client_secret.json:
--------------------------------------------------------------------------------
1 | {"installed":{"client_id":"<api_key","project_id":"list-test-225819","auth_uri":"https://accounts.google.com/o/oauth2/auth","token_uri":"https://www.googleapis.com/oauth2/v3/token","auth_provider_x509_cert_url":"https://www.googleapis.com/oauth2/v1/certs","client_secret":"<fill_in_here","redirect_uris":["urn:ietf:wg:oauth:2.0:oob","http://localhost"]}}
2 | 


--------------------------------------------------------------------------------
/scripts/helper_scripts/README.md:
--------------------------------------------------------------------------------
 1 | # Helper Scripts
 2 | These scripts will help do bulk actions that you may need to do during pre-processing of the files you're working with
 3 | 
 4 | ## movestuff.sh
 5 | This script is helpful for moving large volumes of files from one place to another.
 6 | 
 7 | ## gdrive_upload_bulk
 8 | Uploads multiple files to your google drive. Adapated from https://github.com/kshcherban/gdrive_uploader
 9 | 
10 | 


--------------------------------------------------------------------------------
/scripts/NLP/bulk-speechparts.py:
--------------------------------------------------------------------------------
 1 | import nltk
 2 | import io
 3 | import os
 4 | 
 5 | from nltk.tokenize import sent_tokenize, word_tokenize
 6 | 
 7 | for filename in os.listdir("."):
 8 |     if filename.endswith(".txt"):
 9 |         with io.open(filename, 'r', encoding="UTF8") as f:
10 |           data=f.read().replace('\n', '')
11 |           text = word_tokenize(data)
12 |           finished = nltk.pos_tag(text)
13 |           print(finished)
14 | 


--------------------------------------------------------------------------------
/scripts/NLP/README.md:
--------------------------------------------------------------------------------
 1 | # NLP Scripts
 2 | 
 3 | ## sentence_tokenize.py
 4 | rips out whole sentences from raw text files.
 5 | 
 6 | ## speechparts.py
 7 | Performs part-of-speech tagging.
 8 | 
 9 | ### bulk-speechparts.py
10 | Bulk part-of-speech tagging script. 
11 | 
12 | ## stopword_filter.py
13 | Removes stopwords from text.
14 | 
15 | ## wordripper.py
16 | Word tokenizer.
17 | 
18 | ### Bulk-wordripper.py
19 | Rips tokenized words out of the current directory of text files
20 | 


--------------------------------------------------------------------------------
/scripts/experimental/Trigraph_Finder.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import nltk
 3 | import io
 4 | import os
 5 | 
 6 | from nltk.tokenize import sent_tokenize, word_tokenize
 7 | 
 8 | for filename in os.listdir("."):
 9 |     if filename.endswith(".txt"):
10 |         with io.open(filename, 'r', encoding="UTF8") as f:
11 |           data=f.read().replace('\n', '')
12 |           text = word_tokenize(data)
13 |           finished = nltk.pos_tag(text)
14 | #          print(finished)
15 | 
16 | for line in finished:
17 |     match = re.search('[A-Z]{4,}', 'NNP'', line)
18 |     print(match)
19 | #    if match:
20 | #        new_line=match.group() + '\n'
21 | #        print(match)
22 | 


--------------------------------------------------------------------------------
/scripts/OCR/README.md:
--------------------------------------------------------------------------------
 1 | # OCR Scripts
 2 | 
 3 | These scripts are for OCR activation and a little bulk file management.
 4 | 
 5 | It may seem silly to have scripts that do simple things like moving a certain number of files from one directory to another, but with the volume of files you may be dealing with, your arguments list may be too long for processing.
 6 | 
 7 | In order to install system requirements, use the following command:
 8 | xargs -rxa requirements.txt -- sudo apt-get install --
 9 | 
10 | In order to use this script, you will need to install the programs inside of requirements.txt
11 | 
12 | The file structure this works around is as follows:
13 | ```
14 | |/root/
15 | -|OCRd
16 | -|foo/
17 | --|OCR.sh
18 | --|<.PDF files to be OCR'd>
19 | ```
20 | 
21 | ## OCR.sh
22 | Recommend you use this with tmux, especially if you're processing through a high volume of files, most especially if your files have a high number of pages in them.
23 | 
24 | ## striptext.sh
25 | This script will help rip raw text out of your PDFs. It will dump raw text files out with the same name as the original file.
26 | 
27 | ## OCR_Drive.py
28 | This script supports the Google docs upload method. It's much less resource intensive and much faster per document, but there are things you should take note of before using it:
29 | 
30 | THIS SCRIPT WILL REPLACE THE ORIGINAL PDF FILE! Make sure you have copies if you want to keep the originals intact!
31 | Be aware that this will only work for documents under 5mb in size and you will need an active network connection to upload and download the pdf files you wish to process.
32 | 
33 | You will also need to have a google drive api key in order to make this work.
34 | 
35 | Credit for the singule document processing script goes out to https://tanaikech.github.io/2017/05/02/ocr-using-google-drive-api/
36 | Great thanks to @glitchliz for helping with the failover protections!
37 | 


--------------------------------------------------------------------------------
/scripts/OCR/google cloud processing/OCR_drive.py:
--------------------------------------------------------------------------------
 1 | from __future__ import print_function
 2 | import httplib2
 3 | import os
 4 | import io
 5 | import shutil
 6 | import time
 7 | 
 8 | from apiclient import discovery
 9 | from oauth2client import client
10 | from oauth2client import tools
11 | from oauth2client.file import Storage
12 | from apiclient.http import MediaFileUpload, MediaIoBaseDownload
13 | 
14 | start = time.time()
15 | 
16 | try:
17 |     import argparse
18 |     flags = argparse.ArgumentParser(parents=[tools.argparser]).parse_args()
19 | except ImportError:
20 |     flags = None
21 | 
22 | # If modifying these scopes, delete your previously saved credentials
23 | # at ~/.credentials/drive-python-quickstart.json
24 | SCOPES = 'https://www.googleapis.com/auth/drive'
25 | CLIENT_SECRET_FILE = 'client_secret.json'
26 | APPLICATION_NAME = 'Drive API Python Quickstart'
27 | 
28 | 
29 | def get_credentials():
30 |     """Gets valid user credentials from storage.
31 |     If nothing has been stored, or if the stored credentials are invalid,
32 |     the OAuth2 flow is completed to obtain the new credentials.
33 |     Returns:
34 |         Credentials, the obtained credential.
35 |     """
36 |     credential_path = os.path.join("./", 'drive-python-quickstart.json')
37 |     store = Storage(credential_path)
38 |     credentials = store.get()
39 |     if not credentials or credentials.invalid:
40 |         flow = client.flow_from_clientsecrets(CLIENT_SECRET_FILE, SCOPES)
41 |         flow.user_agent = APPLICATION_NAME
42 |         if flags:
43 |             credentials = tools.run_flow(flow, store, flags)
44 |         else:  # Needed only for compatibility with Python 2.6
45 |             credentials = tools.run(flow, store)
46 |         print('Storing credentials to ' + credential_path)
47 |     return credentials
48 | 
49 | 
50 | def main():
51 |     for filename in os.listdir("."):
52 |         if filename.endswith(".pdf"):
53 |             credentials = get_credentials()
54 |             http = credentials.authorize(httplib2.Http())
55 |             service = discovery.build('drive', 'v3', http=http)
56 | 
57 |             imgfile = filename  # Image with texts (png, jpg, bmp, gif, pdf)
58 |             txtfile = filename  # Text file outputted by OCR
59 | # change these values for the path of the locally stored files, and where you want them moved after processing
60 | 	    path = '/media/Archives/CREST/copycopy/'
61 | 	    finDir = '/media/Archives/CREST/donedone/'
62 |             rejDir = '/media/Archives/CREST/rejected'
63 | 
64 | # This will help you if the script ever dies while processing a certain file
65 |             print (filename)
66 |             try:
67 | 
68 | 
69 |             	mime = 'application/vnd.google-apps.document'
70 |             	res = service.files().create(
71 |                 	body={
72 |                     	'name': imgfile,
73 |              	       	'mimeType': mime
74 | #            	'uploadType': multipart
75 |                 },
76 |                 media_body=MediaFileUpload(imgfile, mimetype=mime, resumable=True)
77 |             ).execute()
78 | 
79 |             	downloader = MediaIoBaseDownload(
80 |                	    io.FileIO(txtfile, 'wb'),
81 |                	    service.files().export_media(fileId=res['id'], mimeType="text/plain")
82 |             )
83 |             	done = False
84 |             	while done is False:
85 |                 	status, done = downloader.next_chunk()
86 |            		shutil.move(os.path.join(path, filename), finDir)
87 |             	service.files().delete(fileId=res['id']).execute()
88 |             except:
89 |             	shutil.move(os.path.join(path, filename), rejDir)
90 |             print("Done.")
91 | 
92 |         end = time.time()
93 |         print("I have been running for", end - start, "seconds")
94 | 
95 | if __name__ == '__main__':
96 |     main()
97 | 


--------------------------------------------------------------------------------
/scripts/helper_scripts/gdrive_upload_bulk.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import sys
  3 | import logging
  4 | import httplib2
  5 | from mimetypes import guess_type
  6 | 
  7 | # Following libraries can be installed by executing:
  8 | # sudo pip install --upgrade google-api-python-client
  9 | from apiclient.discovery import build
 10 | from apiclient.http import MediaFileUpload
 11 | from apiclient.errors import ResumableUploadError
 12 | from oauth2client.client import OAuth2WebServerFlow
 13 | from oauth2client.file import Storage
 14 | 
 15 | credentials = ""
 16 | 
 17 | # Log only oauth2client errors
 18 | logging.basicConfig(level="ERROR")
 19 | 
 20 | # Path to token json file, it should be in same directory as script
 21 | token_file = sys.path[0] + '/auth_token.txt'
 22 | 
 23 | # Copy your credentials from the APIs Console
 24 | CLIENT_ID = ''
 25 | CLIENT_SECRET = ''
 26 | # Check https://developers.google.com/drive/scopes for all available scopes
 27 | OAUTH_SCOPE = 'https://www.googleapis.com/auth/drive.file'
 28 | # Redirect URI for installed apps, can be left as is
 29 | REDIRECT_URI = 'urn:ietf:wg:oauth:2.0:oob'
 30 | 
 31 | for filename in os.listdir("."):
 32 |         if filename.endswith(".pdf"):
 33 |         	# Get mime type and name of given file
 34 | 			def file_ops(filename):
 35 | 			    mime_type = guess_type(filename)[0]
 36 | 			    mime_type = mime_type if mime_type else 'text/plain'
 37 | #			    file_name = filename.split('/')[-1]
 38 | #			    return file_name, mime_type
 39 | 
 40 | 
 41 | 			def create_token_file(token_file):
 42 | 			# Run through the OAuth flow and retrieve credentials
 43 | 			    flow = OAuth2WebServerFlow(
 44 | 			        CLIENT_ID,
 45 | 			        CLIENT_SECRET,
 46 | 			        OAUTH_SCOPE,
 47 | 			        redirect_uri=REDIRECT_URI
 48 | 			        )
 49 | 			    authorize_url = flow.step1_get_authorize_url()
 50 | 			    print('Go to the following link in your browser: ' + authorize_url)
 51 | 			    code = raw_input('Enter verification code: ').strip()
 52 | 			    credentials = flow.step2_exchange(code)
 53 | 			    storage = Storage(token_file)
 54 | 			    storage.put(credentials)
 55 | 			    return storage
 56 | 
 57 | 
 58 | 			def authorize(token_file, storage):
 59 | 			# Get credentials
 60 | 		    	    if storage is None:
 61 | 			        storage = Storage(token_file)
 62 | 			        credentials = storage.get()
 63 | 			# Create an httplib2.Http object and authorize it with our credentials
 64 | 			    http = httplib2.Http()
 65 | 			    credentials.refresh(http)
 66 | 			    http = credentials.authorize(http)
 67 | 			    return http
 68 | 
 69 | 
 70 | 			def upload_file(filename, mime_type):
 71 | 			# Create Google Drive service instance
 72 | 			    drive_service = build('drive', 'v2', http=http)
 73 | 			# File body description
 74 | 			    media_body = MediaFileUpload(filename,
 75 | 			                                 mimetype=mime_type,
 76 | 			                                 resumable=True)
 77 | 			    body = {
 78 | 			        'title': filename,
 79 | 			        'description': 'backup',
 80 | 			        'mimeType': mime_type,
 81 | 			    }
 82 | 			# Permissions body description: anyone who has link can upload
 83 | 			# Other permissions can be found at https://developers.google.com/drive/v2/reference/permissions
 84 | 			    permissions = {
 85 | 			        'role': 'reader',
 86 | 			        'type': 'anyone',
 87 | 			        'value': None,
 88 | 			        'withLink': True
 89 | 			    }
 90 | 			# Insert a file
 91 | 			    file = drive_service.files().insert(body=body, media_body=media_body).execute()
 92 | 			# Insert new permissions
 93 | 			    drive_service.permissions().insert(fileId=file['id'], body=permissions).execute()
 94 | 			# Define file instance and get url for download
 95 | 			    file = drive_service.files().get(fileId=file['id']).execute()
 96 | 			    download_url = file.get('webContentLink')
 97 | 			    return download_url
 98 | 
 99 | 			if __name__ == '__main__':
100 | 			    try:
101 | 			        with open(filename) as f: pass
102 | 			    except IOError as e:
103 | 			        print(e)
104 | 			        sys.exit(1)
105 | 			# Check if token file exists, if not create it by requesting authorization code
106 | 			try:
107 | 			    with open(token_file) as f: pass
108 | 			except IOError:
109 | 			    http = authorize(token_file, create_token_file(token_file))
110 | 			# Authorize, get file parameters, upload file and print out result URL for download
111 | 			http = authorize(token_file, None)
112 | 			mime_type = file_ops(filename)
113 | 			# Sometimes API fails to retrieve starting URI, we wrap it.
114 | 			try:
115 | 			    print(upload_file(filename, mime_type))
116 | 			except ResumableUploadError as e:
117 | 			    print("Error occured while first upload try:", e)
118 | 			    print("Trying one more time.")
119 | 			    print(upload_file(filename, mime_type))
120 | 


--------------------------------------------------------------------------------