├── README.md ├── poetry.lock ├── pyproject.toml ├── tests ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-38.pyc │ └── test_web_scraper.cpython-38-pytest-6.1.2.pyc └── test_web_scraper.py └── web_scraper ├── __init__.py ├── __pycache__ ├── __init__.cpython-38.pyc └── secraper.cpython-38.pyc └── secraper.py /README.md: -------------------------------------------------------------------------------- 1 | # web-scraper 2 | -------------------------------------------------------------------------------- /poetry.lock: -------------------------------------------------------------------------------- 1 | [[package]] 2 | name = "atomicwrites" 3 | version = "1.4.0" 4 | description = "Atomic file writes." 5 | category = "dev" 6 | optional = false 7 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" 8 | 9 | [[package]] 10 | name = "attrs" 11 | version = "20.3.0" 12 | description = "Classes Without Boilerplate" 13 | category = "dev" 14 | optional = false 15 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" 16 | 17 | [package.extras] 18 | dev = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "zope.interface", "furo", "sphinx", "pre-commit"] 19 | docs = ["furo", "sphinx", "zope.interface"] 20 | tests = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "zope.interface"] 21 | tests_no_zope = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six"] 22 | 23 | [[package]] 24 | name = "beautifulsoup4" 25 | version = "4.9.3" 26 | description = "Screen-scraping library" 27 | category = "main" 28 | optional = false 29 | python-versions = "*" 30 | 31 | [package.dependencies] 32 | soupsieve = {version = ">1.2", markers = "python_version >= \"3.0\""} 33 | 34 | [package.extras] 35 | html5lib = ["html5lib"] 36 | lxml = ["lxml"] 37 | 38 | [[package]] 39 | name = "bs4" 40 | version = "0.0.1" 41 | description = "Dummy package for Beautiful Soup" 42 | category = "main" 43 | 
optional = false 44 | python-versions = "*" 45 | 46 | [package.dependencies] 47 | beautifulsoup4 = "*" 48 | 49 | [[package]] 50 | name = "certifi" 51 | version = "2020.12.5" 52 | description = "Python package for providing Mozilla's CA Bundle." 53 | category = "main" 54 | optional = false 55 | python-versions = "*" 56 | 57 | [[package]] 58 | name = "chardet" 59 | version = "4.0.0" 60 | description = "Universal encoding detector for Python 2 and 3" 61 | category = "main" 62 | optional = false 63 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" 64 | 65 | [[package]] 66 | name = "colorama" 67 | version = "0.4.4" 68 | description = "Cross-platform colored terminal text." 69 | category = "dev" 70 | optional = false 71 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" 72 | 73 | [[package]] 74 | name = "idna" 75 | version = "2.10" 76 | description = "Internationalized Domain Names in Applications (IDNA)" 77 | category = "main" 78 | optional = false 79 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" 80 | 81 | [[package]] 82 | name = "more-itertools" 83 | version = "8.6.0" 84 | description = "More routines for operating on iterables, beyond itertools" 85 | category = "dev" 86 | optional = false 87 | python-versions = ">=3.5" 88 | 89 | [[package]] 90 | name = "packaging" 91 | version = "20.8" 92 | description = "Core utilities for Python packages" 93 | category = "dev" 94 | optional = false 95 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" 96 | 97 | [package.dependencies] 98 | pyparsing = ">=2.0.2" 99 | 100 | [[package]] 101 | name = "pluggy" 102 | version = "0.13.1" 103 | description = "plugin and hook calling mechanisms for python" 104 | category = "dev" 105 | optional = false 106 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" 107 | 108 | [package.extras] 109 | dev = ["pre-commit", "tox"] 110 | 111 | [[package]] 112 | name = "py" 113 | version = "1.10.0" 114 | description = 
"library with cross-python path, ini-parsing, io, code, log facilities" 115 | category = "dev" 116 | optional = false 117 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" 118 | 119 | [[package]] 120 | name = "pyparsing" 121 | version = "2.4.7" 122 | description = "Python parsing module" 123 | category = "dev" 124 | optional = false 125 | python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" 126 | 127 | [[package]] 128 | name = "pytest" 129 | version = "5.4.3" 130 | description = "pytest: simple powerful testing with Python" 131 | category = "dev" 132 | optional = false 133 | python-versions = ">=3.5" 134 | 135 | [package.dependencies] 136 | atomicwrites = {version = ">=1.0", markers = "sys_platform == \"win32\""} 137 | attrs = ">=17.4.0" 138 | colorama = {version = "*", markers = "sys_platform == \"win32\""} 139 | more-itertools = ">=4.0.0" 140 | packaging = "*" 141 | pluggy = ">=0.12,<1.0" 142 | py = ">=1.5.0" 143 | wcwidth = "*" 144 | 145 | [package.extras] 146 | checkqa-mypy = ["mypy (==v0.761)"] 147 | testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "requests", "xmlschema"] 148 | 149 | [[package]] 150 | name = "requests" 151 | version = "2.25.1" 152 | description = "Python HTTP for Humans." 153 | category = "main" 154 | optional = false 155 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" 156 | 157 | [package.dependencies] 158 | certifi = ">=2017.4.17" 159 | chardet = ">=3.0.2,<5" 160 | idna = ">=2.5,<3" 161 | urllib3 = ">=1.21.1,<1.27" 162 | 163 | [package.extras] 164 | security = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)"] 165 | socks = ["PySocks (>=1.5.6,!=1.5.7)", "win-inet-pton"] 166 | 167 | [[package]] 168 | name = "soupsieve" 169 | version = "2.1" 170 | description = "A modern CSS selector implementation for Beautiful Soup." 
171 | category = "main" 172 | optional = false 173 | python-versions = ">=3.5" 174 | 175 | [[package]] 176 | name = "urllib3" 177 | version = "1.26.2" 178 | description = "HTTP library with thread-safe connection pooling, file post, and more." 179 | category = "main" 180 | optional = false 181 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, <4" 182 | 183 | [package.extras] 184 | brotli = ["brotlipy (>=0.6.0)"] 185 | secure = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "certifi", "ipaddress"] 186 | socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] 187 | 188 | [[package]] 189 | name = "wcwidth" 190 | version = "0.2.5" 191 | description = "Measures the displayed width of unicode strings in a terminal" 192 | category = "dev" 193 | optional = false 194 | python-versions = "*" 195 | 196 | [metadata] 197 | lock-version = "1.1" 198 | python-versions = "^3.8" 199 | content-hash = "518438860ca1326bd6a52c1647e5306e38da80bea2a31c395ea04803fd96bc99" 200 | 201 | [metadata.files] 202 | atomicwrites = [ 203 | {file = "atomicwrites-1.4.0-py2.py3-none-any.whl", hash = "sha256:6d1784dea7c0c8d4a5172b6c620f40b6e4cbfdf96d783691f2e1302a7b88e197"}, 204 | {file = "atomicwrites-1.4.0.tar.gz", hash = "sha256:ae70396ad1a434f9c7046fd2dd196fc04b12f9e91ffb859164193be8b6168a7a"}, 205 | ] 206 | attrs = [ 207 | {file = "attrs-20.3.0-py2.py3-none-any.whl", hash = "sha256:31b2eced602aa8423c2aea9c76a724617ed67cf9513173fd3a4f03e3a929c7e6"}, 208 | {file = "attrs-20.3.0.tar.gz", hash = "sha256:832aa3cde19744e49938b91fea06d69ecb9e649c93ba974535d08ad92164f700"}, 209 | ] 210 | beautifulsoup4 = [ 211 | {file = "beautifulsoup4-4.9.3-py2-none-any.whl", hash = "sha256:4c98143716ef1cb40bf7f39a8e3eec8f8b009509e74904ba3a7b315431577e35"}, 212 | {file = "beautifulsoup4-4.9.3-py3-none-any.whl", hash = "sha256:fff47e031e34ec82bf17e00da8f592fe7de69aeea38be00523c04623c04fb666"}, 213 | {file = "beautifulsoup4-4.9.3.tar.gz", hash = 
"sha256:84729e322ad1d5b4d25f805bfa05b902dd96450f43842c4e99067d5e1369eb25"}, 214 | ] 215 | bs4 = [ 216 | {file = "bs4-0.0.1.tar.gz", hash = "sha256:36ecea1fd7cc5c0c6e4a1ff075df26d50da647b75376626cc186e2212886dd3a"}, 217 | ] 218 | certifi = [ 219 | {file = "certifi-2020.12.5-py2.py3-none-any.whl", hash = "sha256:719a74fb9e33b9bd44cc7f3a8d94bc35e4049deebe19ba7d8e108280cfd59830"}, 220 | {file = "certifi-2020.12.5.tar.gz", hash = "sha256:1a4995114262bffbc2413b159f2a1a480c969de6e6eb13ee966d470af86af59c"}, 221 | ] 222 | chardet = [ 223 | {file = "chardet-4.0.0-py2.py3-none-any.whl", hash = "sha256:f864054d66fd9118f2e67044ac8981a54775ec5b67aed0441892edb553d21da5"}, 224 | {file = "chardet-4.0.0.tar.gz", hash = "sha256:0d6f53a15db4120f2b08c94f11e7d93d2c911ee118b6b30a04ec3ee8310179fa"}, 225 | ] 226 | colorama = [ 227 | {file = "colorama-0.4.4-py2.py3-none-any.whl", hash = "sha256:9f47eda37229f68eee03b24b9748937c7dc3868f906e8ba69fbcbdd3bc5dc3e2"}, 228 | {file = "colorama-0.4.4.tar.gz", hash = "sha256:5941b2b48a20143d2267e95b1c2a7603ce057ee39fd88e7329b0c292aa16869b"}, 229 | ] 230 | idna = [ 231 | {file = "idna-2.10-py2.py3-none-any.whl", hash = "sha256:b97d804b1e9b523befed77c48dacec60e6dcb0b5391d57af6a65a312a90648c0"}, 232 | {file = "idna-2.10.tar.gz", hash = "sha256:b307872f855b18632ce0c21c5e45be78c0ea7ae4c15c828c20788b26921eb3f6"}, 233 | ] 234 | more-itertools = [ 235 | {file = "more-itertools-8.6.0.tar.gz", hash = "sha256:b3a9005928e5bed54076e6e549c792b306fddfe72b2d1d22dd63d42d5d3899cf"}, 236 | {file = "more_itertools-8.6.0-py3-none-any.whl", hash = "sha256:8e1a2a43b2f2727425f2b5839587ae37093f19153dc26c0927d1048ff6557330"}, 237 | ] 238 | packaging = [ 239 | {file = "packaging-20.8-py2.py3-none-any.whl", hash = "sha256:24e0da08660a87484d1602c30bb4902d74816b6985b93de36926f5bc95741858"}, 240 | {file = "packaging-20.8.tar.gz", hash = "sha256:78598185a7008a470d64526a8059de9aaa449238f280fc9eb6b13ba6c4109093"}, 241 | ] 242 | pluggy = [ 243 | {file = 
"pluggy-0.13.1-py2.py3-none-any.whl", hash = "sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d"}, 244 | {file = "pluggy-0.13.1.tar.gz", hash = "sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0"}, 245 | ] 246 | py = [ 247 | {file = "py-1.10.0-py2.py3-none-any.whl", hash = "sha256:3b80836aa6d1feeaa108e046da6423ab8f6ceda6468545ae8d02d9d58d18818a"}, 248 | {file = "py-1.10.0.tar.gz", hash = "sha256:21b81bda15b66ef5e1a777a21c4dcd9c20ad3efd0b3f817e7a809035269e1bd3"}, 249 | ] 250 | pyparsing = [ 251 | {file = "pyparsing-2.4.7-py2.py3-none-any.whl", hash = "sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b"}, 252 | {file = "pyparsing-2.4.7.tar.gz", hash = "sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1"}, 253 | ] 254 | pytest = [ 255 | {file = "pytest-5.4.3-py3-none-any.whl", hash = "sha256:5c0db86b698e8f170ba4582a492248919255fcd4c79b1ee64ace34301fb589a1"}, 256 | {file = "pytest-5.4.3.tar.gz", hash = "sha256:7979331bfcba207414f5e1263b5a0f8f521d0f457318836a7355531ed1a4c7d8"}, 257 | ] 258 | requests = [ 259 | {file = "requests-2.25.1-py2.py3-none-any.whl", hash = "sha256:c210084e36a42ae6b9219e00e48287def368a26d03a048ddad7bfee44f75871e"}, 260 | {file = "requests-2.25.1.tar.gz", hash = "sha256:27973dd4a904a4f13b263a19c866c13b92a39ed1c964655f025f3f8d3d75b804"}, 261 | ] 262 | soupsieve = [ 263 | {file = "soupsieve-2.1-py3-none-any.whl", hash = "sha256:4bb21a6ee4707bf43b61230e80740e71bfe56e55d1f1f50924b087bb2975c851"}, 264 | {file = "soupsieve-2.1.tar.gz", hash = "sha256:6dc52924dc0bc710a5d16794e6b3480b2c7c08b07729505feab2b2c16661ff6e"}, 265 | ] 266 | urllib3 = [ 267 | {file = "urllib3-1.26.2-py2.py3-none-any.whl", hash = "sha256:d8ff90d979214d7b4f8ce956e80f4028fc6860e4431f731ea4a8c08f23f99473"}, 268 | {file = "urllib3-1.26.2.tar.gz", hash = "sha256:19188f96923873c92ccb987120ec4acaa12f0461fa9ce5d3d0772bc965a39e08"}, 269 | ] 270 | wcwidth = [ 271 | {file = 
"wcwidth-0.2.5-py2.py3-none-any.whl", hash = "sha256:beb4802a9cebb9144e99086eff703a642a13d6a0052920003a230f3294bbe784"}, 272 | {file = "wcwidth-0.2.5.tar.gz", hash = "sha256:c4d647b99872929fdb7bdcaa4fbe7f01413ed3d98077df798530e5b04f116c83"}, 273 | ] 274 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "web-scraper" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["Hamza Rashed "] 6 | 7 | [tool.poetry.dependencies] 8 | python = "^3.8" 9 | bs4 = "^0.0.1" 10 | requests = "^2.25.1" 11 | 12 | [tool.poetry.dev-dependencies] 13 | pytest = "^5.2" 14 | 15 | [build-system] 16 | requires = ["poetry-core>=1.0.0"] 17 | build-backend = "poetry.core.masonry.api" 18 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hamza-Rashed/web-scraper/4ffc26e1708dd65cb7ef55b37a4cfc07307a0fca/tests/__init__.py -------------------------------------------------------------------------------- /tests/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hamza-Rashed/web-scraper/4ffc26e1708dd65cb7ef55b37a4cfc07307a0fca/tests/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /tests/__pycache__/test_web_scraper.cpython-38-pytest-6.1.2.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hamza-Rashed/web-scraper/4ffc26e1708dd65cb7ef55b37a4cfc07307a0fca/tests/__pycache__/test_web_scraper.cpython-38-pytest-6.1.2.pyc -------------------------------------------------------------------------------- /tests/test_web_scraper.py: 
-------------------------------------------------------------------------------- 1 | from web_scraper import __version__ 2 | from web_scraper.secraper import * 3 | 4 | def test_version(): 5 | assert __version__ == '0.1.0' 6 | 7 | def test_citations_counting(): 8 | assert get_citations_needed_count("https://en.wikipedia.org/wiki/History_of_Mexico") == 5 9 | 10 | def test_citations_report(): 11 | assert get_citations_needed_report("https://en.wikipedia.org/wiki/History_of_Mexico") == ['The first people to settle in Mexico encountered a climate far milder than the current one. In particular, the Valley of Mexico contained several large paleo-lakes (known collectively as Lake Texcoco) surrounded by dense forest. Deer were found in this area, but most fauna were small land animals and fish and other lacustrine animals were found in the lake region.[citation needed][7] Such conditions encouraged the initial pursuit of a hunter-gatherer existence.\n', 'The Mexica people arrived in the Valley of Mexico in 1248 AD. They had migrated from the deserts north of the Rio Grande[citation needed] over a period traditionally said to have been 100 years. They may have thought of themselves as the heirs to the prestigious civilizations that had preceded them.[citation needed] What the Aztec initially lacked in political power, they made up for with ambition and military skill. In 1325, they established the biggest city in the world at that time, Tenochtitlan.\n', 'The Spanish had no intention to turn over Tenochtitlan to the Tlaxcalteca. While Tlaxcalteca troops continued to help the Spaniards, and Tlaxcala received better treatment than other indigenous nations, the Spanish eventually disowned the treaty. 
Forty years after the conquest, the Tlaxcalteca had to pay the same tax as any other indigenous community.[citation needed]\n', "During the three centuries of colonial rule, fewer than 700,000 Spaniards, most of them men, settled in Mexico.[citation needed] Europeans, Africans, and indigenous intermixed, creating a mixed-race casta population in a process known as mestizaje. Mestizos, people of mixed European-indigenous ancestry, constitute the majority of Mexico's population.\n"] 12 | -------------------------------------------------------------------------------- /web_scraper/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.1.0' 2 | -------------------------------------------------------------------------------- /web_scraper/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hamza-Rashed/web-scraper/4ffc26e1708dd65cb7ef55b37a4cfc07307a0fca/web_scraper/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /web_scraper/__pycache__/secraper.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hamza-Rashed/web-scraper/4ffc26e1708dd65cb7ef55b37a4cfc07307a0fca/web_scraper/__pycache__/secraper.cpython-38.pyc -------------------------------------------------------------------------------- /web_scraper/secraper.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | 4 | URL = 'https://en.wikipedia.org/wiki/History_of_Mexico' 5 | page = requests.get(URL) 6 | def get_citations_needed_count(soup): 7 | page = requests.get(URL) 8 | results = BeautifulSoup(page.content, 'html.parser').find(id="bodyContent").find_all('a', title='Wikipedia:Citation needed') 9 | return len(results) 10 | 11 | 
def _as_soup(source):
    """Return a parsed document for *source*.

    A ``str`` is treated as a URL: it is downloaded with ``requests`` and
    parsed with ``html.parser``.  Any other object is assumed to already be
    a parsed BeautifulSoup tree and is returned unchanged.

    Raises:
        requests.RequestException: if the download fails or the server
            answers with an HTTP error status.
    """
    if isinstance(source, str):
        response = requests.get(source, timeout=30)
        # Fail loudly on 4xx/5xx instead of scraping an error page.
        response.raise_for_status()
        return BeautifulSoup(response.content, 'html.parser')
    return source


def _texts_with_citation_needed(source, tag_name):
    """Collect the text of every *tag_name* element that needs a citation.

    Only elements inside the node with ``id="bodyContent"`` are considered.
    An element qualifies when it contains an ``<a>`` whose ``title`` is
    ``"Wikipedia:Citation needed"``.  Returns an empty list when the
    document has no ``bodyContent`` node.
    """
    body = _as_soup(source).find(id="bodyContent")
    if body is None:
        return []
    return [
        element.text
        for element in body.find_all(tag_name)
        if element.find('a', {"title": "Wikipedia:Citation needed"})
    ]


def get_citations_needed_report(soup):
    """Return the text of each paragraph that contains a "citation needed" link.

    Args:
        soup: either the URL of the page to scrape (``str``) or an
            already-parsed BeautifulSoup document.

    Returns:
        A list with the ``.text`` of every ``<p>`` under ``bodyContent``
        that holds a "Wikipedia:Citation needed" anchor, in document order.
    """
    return _texts_with_citation_needed(soup, 'p')


def get_citations_needed_by_section(soup):
    """Return the text of each section that contains a "citation needed" link.

    Args:
        soup: either the URL of the page to scrape (``str``) or an
            already-parsed BeautifulSoup document.

    Returns:
        A list with the ``.text`` of every ``<section>`` under
        ``bodyContent`` that holds a "Wikipedia:Citation needed" anchor,
        in document order.
    """
    return _texts_with_citation_needed(soup, 'section')


if __name__ == "__main__":

    # Parse the module-level response once and reuse the tree below, so the
    # two reports do not each trigger another download.
    soup = BeautifulSoup(page.content, 'html.parser')

    print ("citations needed: ", get_citations_needed_count(URL))
    print(get_citations_needed_report(soup))
    print(get_citations_needed_by_section(soup))