├── .gitignore
├── Website_Cleaning_NLP.ipynb
├── readme.md
└── requirements.txt

/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | scraped_pages/
3 | .ipynb_checkpoints/
4 | 
--------------------------------------------------------------------------------
/Website_Cleaning_NLP.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "markdown",
5 |    "metadata": {},
6 |    "source": [
7 |     "## Extracting Text"
8 |    ]
9 |   },
10 |   {
11 |    "cell_type": "code",
12 |    "execution_count": null,
13 |    "metadata": {},
14 |    "outputs": [],
15 |    "source": [
16 |     "import os\n",
17 |     "import re\n",
18 |     "from boilerpy3 import extractors\n",
19 |     "\n",
20 |     "# Condenses all repeating newline characters into a single newline character\n",
21 |     "def condense_newline(text):\n",
22 |     "    return '\\n'.join([p for p in re.split('\\n|\\r', text) if len(p) > 0])\n",
23 |     "\n",
24 |     "# Returns the text from an HTML file\n",
25 |     "def parse_html(html_path):\n",
26 |     "    # Text extraction with boilerpy3\n",
27 |     "    html_extractor = extractors.ArticleExtractor()\n",
28 |     "    return condense_newline(html_extractor.get_content_from_file(html_path))\n",
29 |     "\n",
30 |     "# Extracts the text from all HTML files in a specified directory\n",
31 |     "def html_to_text(folder):\n",
32 |     "    parsed_texts = []\n",
33 |     "    filepaths = os.listdir(folder)\n",
34 |     "\n",
35 |     "    for filepath in filepaths:\n",
36 |     "        filepath_full = os.path.join(folder, filepath)\n",
37 |     "        if filepath_full.endswith(\".html\"):\n",
38 |     "            parsed_texts.append(parse_html(filepath_full))\n",
39 |     "    return parsed_texts\n",
40 |     "\n",
41 |     "# Path to the folder containing the scraped website HTML files\n",
42 |     "scraped_dir = './scraped_pages'\n",
43 |     "parsed_texts = html_to_text(scraped_dir)\n",
44 |     "\n",
45 |     "from bs4 import BeautifulSoup"
46 |    ]
47 |   },
48 |   {
49 |    "cell_type": "code",
50 |    "execution_count": null,
51 |    "metadata": {},
52 |    "outputs": [],
53 |    "source": [
54 |     "# Returns the text from an HTML file based on specified tags\n",
55 |     "def parse_html(html_path):\n",
56 |     "    with open(html_path, 'r') as fr:\n",
57 |     "        html_content = fr.read()\n",
58 |     "    soup = BeautifulSoup(html_content, 'html.parser')\n",
59 |     "\n",
60 |     "    # Check that the file is valid HTML\n",
61 |     "    if not soup.find():\n",
62 |     "        raise ValueError(\"File is not a valid HTML file\")\n",
63 |     "\n",
64 |     "    # Check the language of the file (guard against pages without a <head>)\n",
65 |     "    tag_meta_language = soup.head.find(\"meta\", attrs={\"http-equiv\": \"content-language\"}) if soup.head else None\n",
66 |     "    if tag_meta_language:\n",
67 |     "        document_language = tag_meta_language[\"content\"]\n",
68 |     "        if document_language and document_language not in [\"en\", \"en-us\", \"en-US\"]:\n",
69 |     "            raise ValueError(\"Language {} is not English\".format(document_language))\n",
70 |     "\n",
71 |     "    # Get text from the specified tags. Add more tags if necessary.\n",
72 |     "    TAGS = ['p']\n",
73 |     "    return ' '.join([condense_newline(tag.text) for tag in soup.find_all(TAGS)])"
74 |    ]
75 |   },
76 |   {
77 |    "cell_type": "markdown",
78 |    "metadata": {},
79 |    "source": [
80 |     "## Large N-Gram Cleaning"
81 |    ]
82 |   },
83 |   {
84 |    "cell_type": "code",
85 |    "execution_count": null,
86 |    "metadata": {},
87 |    "outputs": [],
88 |    "source": [
89 |     "import nltk\n",
90 |     "nltk.download('punkt')\n",
91 |     "import matplotlib.pyplot as plt\n",
92 |     "from nltk.util import ngrams\n",
93 |     "from nltk.tokenize import word_tokenize\n",
94 |     "\n",
95 |     "# Helper function for generating n-grams\n",
96 |     "def extract_ngrams_sentences(sentences, num):\n",
97 |     "    all_grams = []\n",
98 |     "    for sentence in sentences:\n",
99 |     "        n_grams = ngrams(sentence, num)\n",
100 |     "        all_grams += [' '.join(grams) for grams in n_grams]\n",
101 |     "    return all_grams\n",
102 |     "\n",
103 |     "# Splits text up by newline and period\n",
104 |     "def split_by_newline_and_period(pages):\n",
105 |     "    sentences = []\n",
106 |     "    for page in pages:\n",
107 |     "        sentences += re.split('\\n|\\. ', page)\n",
108 |     "    return sentences\n",
109 |     "\n",
110 |     "# Break the dataset up into sentences, split by newline characters and periods\n",
111 |     "sentences = split_by_newline_and_period(parsed_texts)\n",
112 |     "\n",
113 |     "# Add unwanted strings or regex patterns to this list\n",
114 |     "filter_strs = []\n",
115 |     "\n",
116 |     "# Filter out unwanted strings\n",
117 |     "sentences = [x for x in sentences\n",
118 |     "             if not any([re.search(filter_str, x, re.IGNORECASE)\n",
119 |     "                         for filter_str in filter_strs])]\n",
120 |     "\n",
121 |     "# Tokenize the sentences\n",
122 |     "tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]\n",
123 |     "\n",
124 |     "# Adjust NGRAM_SIZE to capture unwanted phrases\n",
125 |     "NGRAM_SIZE = 15\n",
126 |     "ngrams_all = extract_ngrams_sentences(tokenized_sentences, NGRAM_SIZE)\n",
127 |     "\n",
128 |     "# Sort the n-grams by most common\n",
129 |     "n_gram_all = nltk.FreqDist(ngrams_all).most_common()\n",
130 |     "\n",
131 |     "# Print out the top 10 most common n-grams\n",
132 |     "print(f'{NGRAM_SIZE}-Gram Frequencies')\n",
133 |     "for gram, count in n_gram_all[:10]:\n",
134 |     "    print(f'{count}\\t\\\"{gram}\\\"')\n",
135 |     "\n",
136 |     "# Plot the distribution of n-grams\n",
137 |     "plt.subplots(figsize=(15,5))\n",
138 |     "\n",
139 |     "plt.plot([count for _, count in n_gram_all])\n",
140 |     "plt.xlabel('n-gram')\n",
141 |     "plt.ylabel('frequency')\n",
142 |     "plt.title(f'{NGRAM_SIZE}-Gram Frequencies')\n",
143 |     "plt.show()"
144 |    ]
145 |   },
146 |   {
147 |    "cell_type": "markdown",
148 |    "metadata": {},
149 |    "source": [
150 |     "## Punctuation, Capitalization, and Tokenization"
151 |    ]
152 |   },
153 |   {
154 |    "cell_type": "code",
155 |    "execution_count": null,
156 |    "metadata": {},
157 |    "outputs": [],
158 |    "source": [
159 |     "import gensim\n",
160 |     "import string\n",
161 |     "\n",
162 |     "# Uses gensim to process the sentences\n",
163 |     "def sentence_to_words(sentences):\n",
164 |     "    for sentence in sentences:\n",
165 |     "        sentence_tokenized = gensim.utils.simple_preprocess(sentence,\n",
166 |     "                                                            deacc=True,\n",
167 |     "                                                            min_len=2,\n",
168 |     "                                                            max_len=15)\n",
169 |     "\n",
170 |     "        # Make sure we don't yield empty arrays\n",
171 |     "        if len(sentence_tokenized) > 0:\n",
172 |     "            yield sentence_tokenized\n",
173 |     "\n",
174 |     "# Process the sentences manually\n",
175 |     "def sentence_to_words_from_scratch(sentences):\n",
176 |     "    for sentence in sentences:\n",
177 |     "        sentence_tokenized = [token.lower() for token in\n",
178 |     "                              word_tokenize(sentence.translate(str.maketrans('','',string.punctuation)))]\n",
179 |     "\n",
180 |     "        # Make sure we don't yield empty arrays\n",
181 |     "        if len(sentence_tokenized) > 0:\n",
182 |     "            yield sentence_tokenized\n",
183 |     "\n",
184 |     "sentences = list(sentence_to_words(sentences))"
185 |    ]
186 |   },
187 |   {
188 |    "cell_type": "markdown",
189 |    "metadata": {},
190 |    "source": [
191 |     "## Stop Word Removal, Lemmatizing, and Stemming"
192 |    ]
193 |   },
194 |   {
195 |    "cell_type": "code",
196 |    "execution_count": null,
197 |    "metadata": {},
198 |    "outputs": [],
199 |    "source": [
200 |     "from nltk.corpus import stopwords\n",
201 |     "from nltk.stem import WordNetLemmatizer\n",
202 |     "from nltk.stem import SnowballStemmer\n",
203 |     "nltk.download('stopwords')\n",
204 |     "nltk.download('wordnet')\n",
205 |     "\n",
206 |     "# Remove all stopwords\n",
207 |     "stop_words = stopwords.words('english')\n",
208 |     "def remove_stopwords(tokenized_sentences):\n",
209 |     "    for sentence in tokenized_sentences:\n",
210 |     "        yield([token for token in sentence if token not in stop_words])\n",
211 |     "\n",
212 |     "# Lemmatize all words\n",
213 |     "wordnet_lemmatizer = WordNetLemmatizer()\n",
214 |     "def lemmatize_words(tokenized_sentences):\n",
215 |     "    for sentence in tokenized_sentences:\n",
216 |     "        yield([wordnet_lemmatizer.lemmatize(token) for token in sentence])\n",
217 |     "\n",
218 |     "snowball_stemmer = SnowballStemmer('english')\n",
219 |     "def stem_words(tokenized_sentences):\n",
220 |     "    for sentence in tokenized_sentences:\n",
221 |     "        yield([snowball_stemmer.stem(token) for token in sentence])\n",
222 |     "\n",
223 |     "sentences = list(remove_stopwords(sentences))\n",
224 |     "sentences = list(lemmatize_words(sentences))\n",
225 |     "sentences = list(stem_words(sentences))"
226 |    ]
227 |   }
228 |  ],
229 |  "metadata": {
230 |   "kernelspec": {
231 |    "display_name": "Python 3",
232 |    "language": "python",
233 |    "name": "python3"
234 |   },
235 |   "language_info": {
236 |    "codemirror_mode": {
237 |     "name": "ipython",
238 |     "version": 3
239 |    },
240 |    "file_extension": ".py",
241 |    "mimetype": "text/x-python",
242 |    "name": "python",
243 |    "nbconvert_exporter": "python",
244 |    "pygments_lexer": "ipython3",
245 |    "version": "3.8.5"
246 |   }
247 |  },
248 |  "nbformat": 4,
249 |  "nbformat_minor": 4
250 | }
251 | 
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 | # HTML Data Cleaning in Python for NLP
2 | 
3 | This notebook contains the steps for preprocessing your scraped website data.
4 | 
5 | To install the requirements, use the following command:
6 | 
7 |     pip install -r requirements.txt
8 | 
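9 | To sanity-check the extraction step on a single scraped page, here is a minimal sketch (it assumes a hypothetical file `scraped_pages/example.html`; point it at one of your own scraped files):
10 | 
11 |     from boilerpy3 import extractors
12 | 
13 |     # Extract the main article text from one scraped HTML file
14 |     extractor = extractors.ArticleExtractor()
15 |     text = extractor.get_content_from_file('scraped_pages/example.html')  # hypothetical example path
16 |     print(text[:500])
17 | 
18 | The notebook then walks through large n-gram cleaning, tokenization, stop word removal, lemmatizing, and stemming on the extracted text.
19 | 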
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | boilerpy3
2 | nltk
3 | matplotlib
4 | beautifulsoup4
5 | gensim
--------------------------------------------------------------------------------