├── .gitignore
├── Website_Cleaning_NLP.ipynb
├── readme.md
└── requirements.txt

/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | scraped_pages/
3 | .ipynb_checkpoints/
4 | 
--------------------------------------------------------------------------------
/Website_Cleaning_NLP.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "markdown",
5 |    "metadata": {},
6 |    "source": [
7 |     "## Extracting Text"
8 |    ]
9 |   },
10 |   {
11 |    "cell_type": "code",
12 |    "execution_count": null,
13 |    "metadata": {},
14 |    "outputs": [],
15 |    "source": [
16 |     "import os\n",
17 |     "import re\n",
18 |     "from boilerpy3 import extractors\n",
19 |     "\n",
20 |     "# Condenses all repeating newline characters into a single newline character\n",
21 |     "def condense_newline(text):\n",
22 |     "    return '\\n'.join([p for p in re.split('\\n|\\r', text) if len(p) > 0])\n",
23 |     "\n",
24 |     "# Returns the text from an HTML file\n",
25 |     "def parse_html(html_path):\n",
26 |     "    # Text extraction with boilerpy3\n",
27 |     "    html_extractor = extractors.ArticleExtractor()\n",
28 |     "    return condense_newline(html_extractor.get_content_from_file(html_path))\n",
29 |     "\n",
30 |     "# Extracts the text from all HTML files in a specified directory\n",
31 |     "def html_to_text(folder):\n",
32 |     "    parsed_texts = []\n",
33 |     "    filepaths = os.listdir(folder)\n",
34 |     "\n",
35 |     "    for filepath in filepaths:\n",
36 |     "        filepath_full = os.path.join(folder, filepath)\n",
37 |     "        if filepath_full.endswith(\".html\"):\n",
38 |     "            parsed_texts.append(parse_html(filepath_full))\n",
39 |     "    return parsed_texts\n",
40 |     "\n",
41 |     "# Path to the folder containing the scraped website HTML files\n",
42 |     "scraped_dir = './scraped_pages'\n",
43 |     "parsed_texts = html_to_text(scraped_dir)\n",
44 |     "\n",
45 |     "from bs4 import BeautifulSoup"
46 |    ]
47 |   },
48 |   {
49 |    "cell_type": "code",
50 |    "execution_count": null,
51 |    "metadata": {},
52 |    "outputs": [],
53 |    "source": [
54 |     "# Returns the text from an HTML file based on specified tags\n",
55 |     "def parse_html(html_path):\n",
56 |     "    with open(html_path, 'r') as fr:\n",
57 |     "        html_content = fr.read()\n",
58 |     "    soup = BeautifulSoup(html_content, 'html.parser')\n",
59 |     "\n",
60 |     "    # Check that the file is valid HTML\n",
61 |     "    if not soup.find():\n",
62 |     "        raise ValueError(\"File is not a valid HTML file\")\n",
63 |     "\n",
64 |     "    # Check the language of the file (guard against pages without a <head>)\n",
65 |     "    tag_meta_language = soup.head.find(\"meta\", attrs={\"http-equiv\": \"content-language\"}) if soup.head else None\n",
66 |     "    if tag_meta_language:\n",
67 |     "        document_language = tag_meta_language[\"content\"]\n",
68 |     "        if document_language and document_language not in [\"en\", \"en-us\", \"en-US\"]:\n",
69 |     "            raise ValueError(\"Language {} is not English\".format(document_language))\n",
70 |     "\n",
71 |     "    # Get text from the specified tags. Add more tags if necessary.\n",
72 |     "    TAGS = ['p']\n",
73 |     "    return ' '.join([condense_newline(tag.text) for tag in soup.find_all(TAGS)])"
74 |    ]
75 |   },
76 |   {
77 |    "cell_type": "markdown",
78 |    "metadata": {},
79 |    "source": [
80 |     "## Large N-Gram Cleaning"
81 |    ]
82 |   },
83 |   {
84 |    "cell_type": "code",
85 |    "execution_count": null,
86 |    "metadata": {},
87 |    "outputs": [],
88 |    "source": [
89 |     "import nltk\n",
90 |     "nltk.download('punkt')\n",
91 |     "import matplotlib.pyplot as plt\n",
92 |     "from nltk.util import ngrams\n",
93 |     "from nltk.tokenize import word_tokenize\n",
94 |     "\n",
95 |     "# Helper function for generating n-grams\n",
96 |     "def extract_ngrams_sentences(sentences, num):\n",
97 |     "    all_grams = []\n",
98 |     "    for sentence in sentences:\n",
99 |     "        n_grams = ngrams(sentence, num)\n",
100 |     "        all_grams += [' '.join(grams) for grams in n_grams]\n",
101 |     "    return all_grams\n",
102 |     "\n",
103 |     "# Splits text up by newline and period\n",
104 |     "def split_by_newline_and_period(pages):\n",
105 |     "    sentences = []\n",
106 |     "    for page in pages:\n",
107 |     "        sentences += re.split('\\n|\\. ', page)\n",
108 |     "    return sentences\n",
109 |     "\n",
110 |     "# Break the dataset up into sentences, split by newline characters and periods\n",
111 |     "sentences = split_by_newline_and_period(parsed_texts)\n",
112 |     "\n",
113 |     "# Add unwanted strings or regex patterns to this list\n",
114 |     "filter_strs = []\n",
115 |     "\n",
116 |     "# Filter out unwanted strings\n",
117 |     "sentences = [x for x in sentences\n",
118 |     "             if not any([re.search(filter_str, x, re.IGNORECASE)\n",
119 |     "                         for filter_str in filter_strs])]\n",
120 |     "\n",
121 |     "# Tokenize the sentences\n",
122 |     "tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]\n",
123 |     "\n",
124 |     "# Adjust NGRAM_SIZE to capture unwanted phrases\n",
125 |     "NGRAM_SIZE = 15\n",
126 |     "ngrams_all = extract_ngrams_sentences(tokenized_sentences, NGRAM_SIZE)\n",
127 |     "\n",
128 |     "# Sort the n-grams by most common\n",
129 |     "n_gram_all = nltk.FreqDist(ngrams_all).most_common()\n",
130 |     "\n",
131 |     "# Print out the top 10 most common n-grams\n",
132 |     "print(f'{NGRAM_SIZE}-Gram Frequencies')\n",
133 |     "for gram, count in n_gram_all[:10]:\n",
134 |     "    print(f'{count}\\t\\\"{gram}\\\"')\n",
135 |     "\n",
136 |     "# Plot the distribution of n-grams\n",
137 |     "plt.subplots(figsize=(15,5))\n",
138 |     "\n",
139 |     "plt.plot([count for _, count in n_gram_all])\n",
140 |     "plt.xlabel('n-gram')\n",
141 |     "plt.ylabel('frequency')\n",
142 |     "plt.title(f'{NGRAM_SIZE}-Gram Frequencies')\n",
143 |     "plt.show()"
144 |    ]
145 |   },
146 |   {
147 |    "cell_type": "markdown",
148 |    "metadata": {},
149 |    "source": [
150 |     "## Punctuation, Capitalization, and Tokenization"
151 |    ]
152 |   },
153 |   {
154 |    "cell_type": "code",
155 |    "execution_count": null,
156 |    "metadata": {},
157 |    "outputs": [],
158 |    "source": [
159 |     "import gensim\n",
160 |     "import string\n",
161 |     "\n",
162 |     "# Uses gensim to process the sentences\n",
163 |     "def sentence_to_words(sentences):\n",
164 |     "    for sentence in sentences:\n",
165 |     "        sentence_tokenized = gensim.utils.simple_preprocess(sentence,\n",
166 |     "                                                            deacc=True,\n",
167 |     "                                                            min_len=2,\n",
168 |     "                                                            max_len=15)\n",
169 |     "\n",
170 |     "        # Make sure we don't yield empty arrays\n",
171 |     "        if len(sentence_tokenized) > 0:\n",
172 |     "            yield sentence_tokenized\n",
173 |     "\n",
174 |     "# Process the sentences manually\n",
175 |     "def sentence_to_words_from_scratch(sentences):\n",
176 |     "    for sentence in sentences:\n",
177 |     "        sentence_tokenized = [token.lower() for token in\n",
178 |     "                              word_tokenize(sentence.translate(str.maketrans('','',string.punctuation)))]\n",
179 |     "\n",
180 |     "        # Make sure we don't yield empty arrays\n",
181 |     "        if len(sentence_tokenized) > 0:\n",
182 |     "            yield sentence_tokenized\n",
183 |     "\n",
184 |     "sentences = list(sentence_to_words(sentences))"
185 |    ]
186 |   },
187 |   {
188 |    "cell_type": "markdown",
189 |    "metadata": {},
190 |    "source": [
191 |     "## Stop Word Removal, Lemmatizing, and Stemming"
192 |    ]
193 |   },
194 |   {
195 |    "cell_type": "code",
196 |    "execution_count": null,
197 |    "metadata": {},
198 |    "outputs": [],
199 |    "source": [
200 |     "from nltk.corpus import stopwords\n",
201 |     "from nltk.stem import WordNetLemmatizer\n",
202 |     "from nltk.stem import SnowballStemmer\n",
203 |     "nltk.download('stopwords')\n",
204 |     "nltk.download('wordnet')\n",
205 |     "\n",
206 |     "# Remove all stopwords\n",
207 |     "stop_words = stopwords.words('english')\n",
208 |     "def remove_stopwords(tokenized_sentences):\n",
209 |     "    for sentence in tokenized_sentences:\n",
210 |     "        yield([token for token in sentence if token not in stop_words])\n",
211 |     "\n",
212 |     "# Lemmatize all words\n",
213 |     "wordnet_lemmatizer = WordNetLemmatizer()\n",
214 |     "def lemmatize_words(tokenized_sentences):\n",
215 |     "    for sentence in tokenized_sentences:\n",
216 |     "        yield([wordnet_lemmatizer.lemmatize(token) for token in sentence])\n",
217 |     "\n",
218 |     "snowball_stemmer = SnowballStemmer('english')\n",
219 |     "def stem_words(tokenized_sentences):\n",
220 |     "    for sentence in tokenized_sentences:\n",
221 |     "        yield([snowball_stemmer.stem(token) for token in sentence])\n",
222 |     "\n",
223 |     "sentences = list(remove_stopwords(sentences))\n",
224 |     "sentences = list(lemmatize_words(sentences))\n",
225 |     "sentences = list(stem_words(sentences))"
226 |    ]
227 |   }
228 |  ],
229 |  "metadata": {
230 |   "kernelspec": {
231 |    "display_name": "Python 3",
232 |    "language": "python",
233 |    "name": "python3"
234 |   },
235 |   "language_info": {
236 |    "codemirror_mode": {
237 |     "name": "ipython",
238 |     "version": 3
239 |    },
240 |    "file_extension": ".py",
241 |    "mimetype": "text/x-python",
242 |    "name": "python",
243 |    "nbconvert_exporter": "python",
244 |    "pygments_lexer": "ipython3",
245 |    "version": "3.8.5"
246 |   }
247 |  },
248 |  "nbformat": 4,
249 |  "nbformat_minor": 4
250 | }
251 | 
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 | # HTML Data Cleaning in Python for NLP
2 | 
3 | This notebook contains the steps for preprocessing your scraped website data.
4 | 
5 | To install the requirements, use the following command:
6 | 
7 |     pip install -r requirements.txt
8 | 
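9 | To sanity-check the extraction step on a single scraped page, here is a minimal sketch (it assumes a hypothetical file `scraped_pages/example.html`; point it at one of your own scraped files):
10 | 
11 |     from boilerpy3 import extractors
12 | 
13 |     # Extract the main article text from one scraped HTML file
14 |     extractor = extractors.ArticleExtractor()
15 |     text = extractor.get_content_from_file('scraped_pages/example.html')  # hypothetical example path
16 |     print(text[:500])
17 | 
18 | The notebook then walks through large n-gram cleaning, tokenization, stop word removal, lemmatizing, and stemming on the extracted text.
19 | 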
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | boilerpy3
2 | nltk
3 | matplotlib
4 | beautifulsoup4
5 | gensim
--------------------------------------------------------------------------------