├── python
├── lxml-tutorial
│ ├── src
│ │ ├── requirements.txt
│ │ ├── sample.xml
│ │ ├── reading_html.py
│ │ ├── input.html
│ │ ├── list_of_countries.py
│ │ ├── countries_flags.py
│ │ ├── countries.py
│ │ └── creating_xml_html.py
│ └── README.md
├── playwright-web-scraping
│ ├── python
│ │ ├── requirements.txt
│ │ └── books.py
│ ├── node
│ │ ├── package.json
│ │ ├── book.js
│ │ └── package-lock.json
│ └── README.md
├── Rotating-Proxies-With-Python
│ ├── requirements.txt
│ ├── no_proxy.py
│ ├── single_proxy.py
│ ├── rotating_multiple_proxies.py
│ └── rotating_multiple_proxies_async.py
├── building-scraping-pipeline-apache-airflow
│ ├── setup.py
│ ├── pusher.py
│ ├── puller.py
│ ├── bootstrap.py
│ ├── DAG
│ │ ├── setup.py
│ │ ├── push-pull.py
│ │ └── scrape.py
│ ├── oxylabs.py
│ └── messenger.py
├── Pagination-With-Python
│ ├── images
│ │ ├── load_more_button.png
│ │ ├── next_button_example.png
│ │ ├── next_button_locate.png
│ │ ├── pager_without_next.png
│ │ ├── scroll_html_response.png
│ │ ├── scroll_json_response.png
│ │ ├── next_button_example_page2.png
│ │ ├── next_button_example_page3.png
│ │ └── scroll_json_response_has_next.png
│ ├── infinite_scroll_json.py
│ ├── no_next_button.py
│ ├── next_button.py
│ ├── load_more_json.py
│ └── infinite_scroll_html.py
├── Price-Parsing-Tutorial
│ └── images
│ │ └── Preview-of-RegEx.png
├── Scraping-Dynamic-JavaScript-Ajax-Websites-With-BeautifulSoup
│ ├── images
│ │ ├── libribox.png
│ │ ├── command_menu.png
│ │ ├── author_markup.png
│ │ ├── json_embedded.png
│ │ ├── infinite_scroll.png
│ │ ├── dynamic_site_no_js.png
│ │ └── infinite_scroll_no_js.png
│ ├── data_in_same_page.py
│ ├── selenium_example.py
│ ├── selenium_bs4.py
│ └── selenium_bs4_headless.py
├── Python-Web-Scraping-Tutorial
│ ├── webscraping_5lines.py
│ ├── web_scraping_toc.csv
│ ├── python_toc.csv
│ └── wiki_toc.py
├── beautiful-soup-parsing-tutorial
│ ├── content-tags.py
│ ├── finding-all-tags.py
│ ├── traversing-tags.py
│ ├── export-to-csv.py
│ └── README.md
├── News-Article-Scraper
│ ├── JavaScript
│ │ ├── package.json
│ │ ├── extract_article_links.js
│ │ ├── news_article_scraper.js
│ │ └── package-lock.json
│ └── Python
│ │ ├── extract_article_links.py
│ │ └── news_article_scraper.py
├── automate-competitors-benchmark-analysis
│ ├── src
│ │ ├── get_serp.py
│ │ ├── off_page_metrics.py
│ │ ├── page_speed_metrics.py
│ │ └── get_top_urls.py
│ └── README.md
├── regex-web-scraping
│ ├── demo.py
│ └── README.md
├── how-to-make-web-scraping-faster
│ ├── sync-scraping.py
│ ├── multiproc-scraping.py
│ ├── multithread-scraping.py
│ ├── async-scraping.py
│ └── README.md
├── pandas-read-html-tables
│ └── src
│ │ └── population.html
├── how-to-build-a-price-tracker
│ ├── tracker.py
│ └── README.md
├── news-scraping
│ └── README.md
├── scrape-images-from-website
│ └── img-scraper.py
├── Web-Scraping-With-Selenium
│ └── books_selenium.py
└── web-scraping-machine-learning
│ └── README.md
├── other
└── curl-with-proxy
│ ├── src
│ ├── _curlrc
│ ├── socks_proxy.sh
│ ├── one_time_proxy.sh
│ └── env_variables.sh
│ ├── simple_proxy.sh
│ └── README.md
├── VBA
└── Web Scraping With Excel VBA Guide
│ ├── images
│ ├── image1.png
│ ├── image2.png
│ ├── image3.png
│ ├── image4.png
│ ├── image5.png
│ ├── image6.png
│ ├── image7.png
│ ├── image8.png
│ └── image9.png
│ ├── src
│ ├── automate_ie.vb
│ └── scrape_quotes.vb
│ └── README.md
├── javascript
├── rotating-proxies-javascript
│ ├── package.json
│ ├── proxy_list.csv
│ ├── no_proxy.js
│ ├── single_proxy_axios.js
│ ├── rotating_proxies.js
│ └── README.md
├── puppeteer-on-aws-lambda
│ ├── demo.js
│ └── README.md
├── node-js-fetch-api
│ ├── axios-post.js
│ ├── fetch-post.js
│ └── README.md
├── puppeteer-tutorial
│ └── bnb.js
├── how-to-build-web-scraper
│ └── web_scraper.js
├── javascript-web-scraping
│ └── books.js
└── playwright-web-scraping
│ └── README.md
├── csharp
└── csharp-web-scraping
│ ├── Program.cs
│ ├── Export-to-csv.cs
│ ├── GetBookLinks.cs
│ ├── GetBookDetails.cs
│ └── README.md
├── r
└── web-scraping-r
│ ├── src
│ ├── dynamic_rvest.R
│ ├── download_images_rvest.R
│ ├── static_rvest.R
│ └── dynamic_rselenium.R
│ └── README.md
├── README.md
├── golang
└── golang-web-scraper
│ ├── src
│ ├── go.mod
│ ├── books.go
│ └── go.sum
│ └── README.md
├── ruby
└── webscraping-with-ruby
│ └── README.md
└── php
└── web-scraping-php
└── README.md
/python/lxml-tutorial/src/requirements.txt:
--------------------------------------------------------------------------------
1 | lxml
2 | requests
--------------------------------------------------------------------------------
/other/curl-with-proxy/src/_curlrc:
--------------------------------------------------------------------------------
1 | proxy="http://user:pwd@127.0.0.1:1234"
--------------------------------------------------------------------------------
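curl reads this file automatically when it is saved as ~/.curlrc on Linux/macOS (or _curlrc in the user's profile directory on Windows), so once it is in place a plain request such as curl "http://httpbin.org/ip" is routed through the configured proxy. The credentials and address shown are placeholders.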
/python/playwright-web-scraping/python/requirements.txt:
--------------------------------------------------------------------------------
1 | playwright
2 |
--------------------------------------------------------------------------------
/python/Rotating-Proxies-With-Python/requirements.txt:
--------------------------------------------------------------------------------
1 | aiohttp==3.8.1
2 | requests==2.27.1
3 |
--------------------------------------------------------------------------------
/other/curl-with-proxy/src/socks_proxy.sh:
--------------------------------------------------------------------------------
1 | curl -x "socks5://user:pwd@127.0.0.1:1234" "http://httpbin.org/ip"
--------------------------------------------------------------------------------
/python/playwright-web-scraping/node/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "dependencies": {
3 | "playwright": "^1.27.0"
4 | }
5 | }
6 |
--------------------------------------------------------------------------------
/python/building-scraping-pipeline-apache-airflow/setup.py:
--------------------------------------------------------------------------------
1 | from bootstrap import queue
2 |
3 | success = queue.setup()
4 | if not success:
5 | exit(1)
--------------------------------------------------------------------------------
/python/Rotating-Proxies-With-Python/no_proxy.py:
--------------------------------------------------------------------------------
1 | import requests
2 |
3 | response = requests.get('https://ip.oxylabs.io/location')
4 | print(response.text)
5 |
--------------------------------------------------------------------------------
/VBA/Web Scraping With Excel VBA Guide/images/image1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oxylabs/web-scraping-tutorials/HEAD/VBA/Web Scraping With Excel VBA Guide/images/image1.png
--------------------------------------------------------------------------------
/VBA/Web Scraping With Excel VBA Guide/images/image2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oxylabs/web-scraping-tutorials/HEAD/VBA/Web Scraping With Excel VBA Guide/images/image2.png
--------------------------------------------------------------------------------
/VBA/Web Scraping With Excel VBA Guide/images/image3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oxylabs/web-scraping-tutorials/HEAD/VBA/Web Scraping With Excel VBA Guide/images/image3.png
--------------------------------------------------------------------------------
/VBA/Web Scraping With Excel VBA Guide/images/image4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oxylabs/web-scraping-tutorials/HEAD/VBA/Web Scraping With Excel VBA Guide/images/image4.png
--------------------------------------------------------------------------------
/VBA/Web Scraping With Excel VBA Guide/images/image5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oxylabs/web-scraping-tutorials/HEAD/VBA/Web Scraping With Excel VBA Guide/images/image5.png
--------------------------------------------------------------------------------
/VBA/Web Scraping With Excel VBA Guide/images/image6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oxylabs/web-scraping-tutorials/HEAD/VBA/Web Scraping With Excel VBA Guide/images/image6.png
--------------------------------------------------------------------------------
/VBA/Web Scraping With Excel VBA Guide/images/image7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oxylabs/web-scraping-tutorials/HEAD/VBA/Web Scraping With Excel VBA Guide/images/image7.png
--------------------------------------------------------------------------------
/VBA/Web Scraping With Excel VBA Guide/images/image8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oxylabs/web-scraping-tutorials/HEAD/VBA/Web Scraping With Excel VBA Guide/images/image8.png
--------------------------------------------------------------------------------
/VBA/Web Scraping With Excel VBA Guide/images/image9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oxylabs/web-scraping-tutorials/HEAD/VBA/Web Scraping With Excel VBA Guide/images/image9.png
--------------------------------------------------------------------------------
/other/curl-with-proxy/simple_proxy.sh:
--------------------------------------------------------------------------------
1 | curl --proxy "http://user:pwd@127.0.0.1:1234" "http://httpbin.org/ip"
2 | # or
3 | curl --proxy "user:pwd@127.0.0.1:1234" "http://httpbin.org/ip"
--------------------------------------------------------------------------------
/python/Pagination-With-Python/images/load_more_button.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oxylabs/web-scraping-tutorials/HEAD/python/Pagination-With-Python/images/load_more_button.png
--------------------------------------------------------------------------------
/python/Price-Parsing-Tutorial/images/Preview-of-RegEx.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oxylabs/web-scraping-tutorials/HEAD/python/Price-Parsing-Tutorial/images/Preview-of-RegEx.png
--------------------------------------------------------------------------------
/javascript/rotating-proxies-javascript/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "dependencies": {
3 | "async-csv": "^2.1.3",
4 | "axios": "^0.24.0",
5 | "puppeteer": "^13.1.0"
6 | }
7 | }
8 |
--------------------------------------------------------------------------------
/other/curl-with-proxy/src/one_time_proxy.sh:
--------------------------------------------------------------------------------
1 | curl --proxy "http://user:pwd@1.0.0.1:8090" "http://httpbin.org/ip"
2 | # OR
3 | curl -x "http://user:pwd@1.0.0.1:8090" "http://httpbin.org/ip"
--------------------------------------------------------------------------------
/python/Pagination-With-Python/images/next_button_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oxylabs/web-scraping-tutorials/HEAD/python/Pagination-With-Python/images/next_button_example.png
--------------------------------------------------------------------------------
/python/Pagination-With-Python/images/next_button_locate.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oxylabs/web-scraping-tutorials/HEAD/python/Pagination-With-Python/images/next_button_locate.png
--------------------------------------------------------------------------------
/python/Pagination-With-Python/images/pager_without_next.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oxylabs/web-scraping-tutorials/HEAD/python/Pagination-With-Python/images/pager_without_next.png
--------------------------------------------------------------------------------
/python/Pagination-With-Python/images/scroll_html_response.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oxylabs/web-scraping-tutorials/HEAD/python/Pagination-With-Python/images/scroll_html_response.png
--------------------------------------------------------------------------------
/python/Pagination-With-Python/images/scroll_json_response.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oxylabs/web-scraping-tutorials/HEAD/python/Pagination-With-Python/images/scroll_json_response.png
--------------------------------------------------------------------------------
/python/Pagination-With-Python/images/next_button_example_page2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oxylabs/web-scraping-tutorials/HEAD/python/Pagination-With-Python/images/next_button_example_page2.png
--------------------------------------------------------------------------------
/python/Pagination-With-Python/images/next_button_example_page3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oxylabs/web-scraping-tutorials/HEAD/python/Pagination-With-Python/images/next_button_example_page3.png
--------------------------------------------------------------------------------
/python/Pagination-With-Python/images/scroll_json_response_has_next.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oxylabs/web-scraping-tutorials/HEAD/python/Pagination-With-Python/images/scroll_json_response_has_next.png
--------------------------------------------------------------------------------
/python/lxml-tutorial/src/sample.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
--------------------------------------------------------------------------------
/javascript/rotating-proxies-javascript/proxy_list.csv:
--------------------------------------------------------------------------------
1 | 20.94.229.106,80
2 | 209.141.55.228,80
3 | 103.149.162.194,80
4 | 206.253.164.122,80
5 | 49.206.233.104,80
6 | 199.19.226.12,80
7 | 206.253.164.198,80
8 | 38.94.111.208,80
9 |
--------------------------------------------------------------------------------
/python/Scraping-Dynamic-JavaScript-Ajax-Websites-With-BeautifulSoup/images/libribox.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oxylabs/web-scraping-tutorials/HEAD/python/Scraping-Dynamic-JavaScript-Ajax-Websites-With-BeautifulSoup/images/libribox.png
--------------------------------------------------------------------------------
/csharp/csharp-web-scraping/Program.cs:
--------------------------------------------------------------------------------
1 | // Parses the URL and returns HtmlDocument object
2 | static HtmlDocument GetDocument(string url)
3 | {
4 | HtmlWeb web = new HtmlWeb();
5 | HtmlDocument doc = web.Load(url);
6 | return doc;
7 | }
--------------------------------------------------------------------------------
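The C# snippets in csharp-web-scraping are excerpts: they rely on the HtmlAgilityPack and CsvHelper NuGet packages and omit the enclosing class and using directives. A minimal set of directives for them would look roughly like this (assumed; not part of the repository excerpts):

    using System;
    using System.Collections.Generic;
    using System.Globalization;
    using System.IO;
    using CsvHelper;
    using HtmlAgilityPack;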
/python/Scraping-Dynamic-JavaScript-Ajax-Websites-With-BeautifulSoup/images/command_menu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oxylabs/web-scraping-tutorials/HEAD/python/Scraping-Dynamic-JavaScript-Ajax-Websites-With-BeautifulSoup/images/command_menu.png
--------------------------------------------------------------------------------
/python/Scraping-Dynamic-JavaScript-Ajax-Websites-With-BeautifulSoup/images/author_markup.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oxylabs/web-scraping-tutorials/HEAD/python/Scraping-Dynamic-JavaScript-Ajax-Websites-With-BeautifulSoup/images/author_markup.png
--------------------------------------------------------------------------------
/python/Scraping-Dynamic-JavaScript-Ajax-Websites-With-BeautifulSoup/images/json_embedded.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oxylabs/web-scraping-tutorials/HEAD/python/Scraping-Dynamic-JavaScript-Ajax-Websites-With-BeautifulSoup/images/json_embedded.png
--------------------------------------------------------------------------------
/python/Scraping-Dynamic-JavaScript-Ajax-Websites-With-BeautifulSoup/images/infinite_scroll.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oxylabs/web-scraping-tutorials/HEAD/python/Scraping-Dynamic-JavaScript-Ajax-Websites-With-BeautifulSoup/images/infinite_scroll.png
--------------------------------------------------------------------------------
/python/Scraping-Dynamic-JavaScript-Ajax-Websites-With-BeautifulSoup/images/dynamic_site_no_js.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oxylabs/web-scraping-tutorials/HEAD/python/Scraping-Dynamic-JavaScript-Ajax-Websites-With-BeautifulSoup/images/dynamic_site_no_js.png
--------------------------------------------------------------------------------
/python/Python-Web-Scraping-Tutorial/webscraping_5lines.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | response = requests.get("https://en.wikipedia.org/wiki/Web_scraping")
4 | bs = BeautifulSoup(response.text, "lxml")
5 | print(bs.find("p").text)
6 |
--------------------------------------------------------------------------------
/python/Scraping-Dynamic-JavaScript-Ajax-Websites-With-BeautifulSoup/images/infinite_scroll_no_js.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oxylabs/web-scraping-tutorials/HEAD/python/Scraping-Dynamic-JavaScript-Ajax-Websites-With-BeautifulSoup/images/infinite_scroll_no_js.png
--------------------------------------------------------------------------------
/other/curl-with-proxy/src/env_variables.sh:
--------------------------------------------------------------------------------
1 | # Enable Proxy
2 | export http_proxy="http://user:pwd@127.0.0.1:1234"
3 | export https_proxy="http://user:pwd@127.0.0.1:1234"
4 |
5 | curl "http://httpbin.org/ip"
6 |
7 | # Disable proxy
8 | unset http_proxy
9 | unset https_proxy
--------------------------------------------------------------------------------
/VBA/Web Scraping With Excel VBA Guide/src/automate_ie.vb:
--------------------------------------------------------------------------------
1 | Sub scrape_quotes()
2 | Dim browser As InternetExplorer
3 | Dim page As HTMLDocument
4 | Set browser = New InternetExplorer
5 | browser.Visible = True
6 | browser.navigate ("https://quotes.toscrape.com")
7 | End Sub
--------------------------------------------------------------------------------
/csharp/csharp-web-scraping/Export-to-csv.cs:
--------------------------------------------------------------------------------
1 | static void exportToCSV(List<Book> books)
2 | {
3 | using (var writer = new StreamWriter("./books.csv"))
4 | using (var csv = new CsvWriter(writer, CultureInfo.InvariantCulture))
5 | {
6 | csv.WriteRecords(books);
7 | }
8 | }
--------------------------------------------------------------------------------
/python/beautiful-soup-parsing-tutorial/content-tags.py:
--------------------------------------------------------------------------------
1 | from bs4 import BeautifulSoup
2 |
3 | with open('index.html', 'r') as f:
4 | contents = f.read()
5 | soup = BeautifulSoup(contents, features="html.parser")
6 |
7 | print(soup.h2)
8 | print(soup.p)
9 | print(soup.li)
--------------------------------------------------------------------------------
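The beautiful-soup-parsing-tutorial scripts all parse a local index.html that is not included in this listing. A minimal stand-in containing the tags they look for (h2, p, li) could look like the following; the actual file used in the tutorial may differ:

    <html>
      <body>
        <h2>Names</h2>
        <p>A short list of names:</p>
        <ul>
          <li>Ada Lovelace</li>
          <li>Grace Hopper</li>
        </ul>
      </body>
    </html>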
/python/beautiful-soup-parsing-tutorial/finding-all-tags.py:
--------------------------------------------------------------------------------
1 | from bs4 import BeautifulSoup
2 |
3 | with open('index.html', 'r') as f:
4 | contents = f.read()
5 | soup = BeautifulSoup(contents, features="html.parser")
6 |
7 | for tag in soup.find_all('li'):
8 | print(tag.text)
--------------------------------------------------------------------------------
/javascript/puppeteer-on-aws-lambda/demo.js:
--------------------------------------------------------------------------------
1 | const browser = await chromium.puppeteer
2 | .launch({
3 | args: chromium.args,
4 | defaultViewport: chromium.defaultViewport,
5 | executablePath: await chromium.executablePath,
6 | headless: chromium.headless
7 | });
--------------------------------------------------------------------------------
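This fragment appears to assume the chrome-aws-lambda package (typically paired with puppeteer-core), which provides the chromium object used above via const chromium = require("chrome-aws-lambda"); the require line itself is not shown in the excerpt.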
/python/beautiful-soup-parsing-tutorial/traversing-tags.py:
--------------------------------------------------------------------------------
1 | from bs4 import BeautifulSoup
2 |
3 | with open('index.html', 'r') as f:
4 | contents = f.read()
5 | soup = BeautifulSoup(contents, features="html.parser")
6 |
7 | for child in soup.descendants:
8 | if child.name:
9 | print(child.name)
--------------------------------------------------------------------------------
/python/lxml-tutorial/src/reading_html.py:
--------------------------------------------------------------------------------
1 | from lxml import html
2 | with open('input.html') as f:
3 | html_string = f.read()
4 | tree = html.fromstring(html_string)
5 | para = tree.xpath('//p/text()')
6 | for e in para:
7 | print(e)
8 |
9 | # Output
10 | # This HTML is XML Compliant!
11 | # This is the second paragraph
--------------------------------------------------------------------------------
/python/lxml-tutorial/src/input.html:
--------------------------------------------------------------------------------
1 | <html>
2 |   <head>
3 |     <title>This is Page Title</title>
4 |   </head>
5 |   <body>
6 |     <h1 style="font-size:20pt" id="head">Hello World!</h1>
7 |     <p id="firstPara">This HTML is XML Compliant!</p>
8 |     <p id="secondPara">This is the second paragraph.</p>
9 |   </body>
10 | </html>
--------------------------------------------------------------------------------
/r/web-scraping-r/src/dynamic_rvest.R:
--------------------------------------------------------------------------------
1 | library(rvest)
2 | library(httr)
3 | library(jsonlite)
4 |
5 | url <- "https://quotes.toscrape.com/api/quotes?page=1"
6 | page <- read_html(GET(url, timeout(10)))
7 | jsontext <- page %>% html_element("p") %>% html_text()
8 | r_object <- jsontext %>% fromJSON()
9 | print(r_object$quotes)
10 |
--------------------------------------------------------------------------------
/python/beautiful-soup-parsing-tutorial/export-to-csv.py:
--------------------------------------------------------------------------------
1 | from bs4 import BeautifulSoup
2 | import pandas as pd
3 |
4 | with open('index.html', 'r') as f:
5 | contents = f.read()
6 |
7 | soup = BeautifulSoup(contents, features="html.parser")
8 | results = soup.find_all('li')
9 |
10 | df = pd.DataFrame({'Names': results})
11 | df.to_csv('names.csv', index=False, encoding='utf-8')
--------------------------------------------------------------------------------
/r/web-scraping-r/src/download_images_rvest.R:
--------------------------------------------------------------------------------
1 | library(rvest)
2 | library(dplyr)
3 |
4 | url = "https://en.wikipedia.org/wiki/Eiffel_Tower"
5 | page <- read_html(url)
6 | image_element <- page %>% html_element(".thumbborder")
7 | image_url <- image_element %>% html_attr("src")
8 | image_url <- url_absolute(image_url, url)
9 |
10 |
11 | download.file(image_url, destfile = basename("paris.jpg"))
12 |
--------------------------------------------------------------------------------
/python/lxml-tutorial/src/list_of_countries.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from lxml import html
3 |
4 | response = requests.get('https://en.wikipedia.org/wiki/List_of_countries_by_population_in_2010')
5 |
6 | tree = html.fromstring(response.text)
7 |
8 | countries = tree.xpath('//span[@class="flagicon"]')
9 | for country in countries:
10 | print(country.xpath('./following-sibling::a/text()')[0])
11 |
--------------------------------------------------------------------------------
/python/News-Article-Scraper/JavaScript/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "code",
3 | "version": "1.0.0",
4 | "description": "",
5 | "main": "get_links_from_sitemap_cheerio.js",
6 | "scripts": {
7 | "test": "echo \"Error: no test specified\" && exit 1"
8 | },
9 | "keywords": [],
10 | "author": "",
11 | "license": "ISC",
12 | "dependencies": {
13 | "axios": "^0.21.1",
14 | "cheerio": "^1.0.0-rc.10"
15 | }
16 | }
17 |
--------------------------------------------------------------------------------
/javascript/node-js-fetch-api/axios-post.js:
--------------------------------------------------------------------------------
1 | const axios = require('axios');
2 | const url = 'https://httpbin.org/post'
3 | const data = {
4 | x: 1920,
5 | y: 1080,
6 | };
7 | const customHeaders = {
8 | "Content-Type": "application/json",
9 | }
10 | axios.post(url, data, {
11 | headers: customHeaders,
12 | })
13 | .then(({ data }) => {
14 | console.log(data);
15 | })
16 | .catch((error) => {
17 | console.error(error);
18 | });
--------------------------------------------------------------------------------
/python/lxml-tutorial/src/countries_flags.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from lxml import html
3 |
4 | response = requests.get(
5 | 'https://en.wikipedia.org/wiki/List_of_countries_by_population_in_2010')
6 |
7 | tree = html.fromstring(response.text)
8 | countries = tree.xpath('//span[@class="flagicon"]')
9 | for country in countries:
10 | flag = country.xpath('./img/@src')[0]
11 | country = country.xpath('./following-sibling::a/text()')[0]
12 | print(country, ":", flag)
13 |
--------------------------------------------------------------------------------
/r/web-scraping-r/src/static_rvest.R:
--------------------------------------------------------------------------------
1 | library(rvest)
2 | library(dplyr)
3 |
4 | httr::set_config(httr::user_agent("Mozilla/5.0 (Macintosh; Chrome/96.0.4664.45"))
5 |
6 | link = "https://en.wikipedia.org/wiki/List_of_ISO_3166_country_codes"
7 | df = read_html(link) %>%
8 | html_element("table.sortable") %>%
9 | html_table(header = FALSE)
10 |
11 | # take column names from second row
12 | names(df) <- df[2,]
13 | # drop first two rows
14 | df = df[-1:-2,]
15 | View(df)
16 |
17 |
18 |
--------------------------------------------------------------------------------
/javascript/node-js-fetch-api/fetch-post.js:
--------------------------------------------------------------------------------
1 | const url = 'https://httpbin.org/post'
2 | const data = {
3 | x: 1920,
4 | y: 1080,
5 | };
6 | const customHeaders = {
7 | "Content-Type": "application/json",
8 | }
9 |
10 | fetch(url, {
11 | method: "POST",
12 | headers: customHeaders,
13 | body: JSON.stringify(data),
14 | })
15 | .then((response) => response.json())
16 | .then((data) => {
17 | console.log(data);
18 | })
19 | .catch((error) => {
20 | console.error(error);
21 | });
--------------------------------------------------------------------------------
/csharp/csharp-web-scraping/GetBookLinks.cs:
--------------------------------------------------------------------------------
1 | static List<string> GetBookLinks(string url)
2 | {
3 | var bookLinks = new List<string>();
4 | HtmlDocument doc = GetDocument(url);
5 | HtmlNodeCollection linkNodes = doc.DocumentNode.SelectNodes("//h3/a");
6 | var baseUri = new Uri(url);
7 | foreach (var link in linkNodes)
8 | {
9 | string href = link.Attributes["href"].Value;
10 | bookLinks.Add(new Uri(baseUri, href).AbsoluteUri);
11 | }
12 | return bookLinks;
13 | }
--------------------------------------------------------------------------------
/python/Python-Web-Scraping-Tutorial/web_scraping_toc.csv:
--------------------------------------------------------------------------------
1 | heading_number,heading_text
2 | 1,History
3 | 2,Techniques
4 | 2.1,Human copy-and-paste
5 | 2.2,Text pattern matching
6 | 2.3,HTTP programming
7 | 2.4,HTML parsing
8 | 2.5,DOM parsing
9 | 2.6,Vertical aggregation
10 | 2.7,Semantic annotation recognizing
11 | 2.8,Computer vision web-page analysis
12 | 3,Software
13 | 4,Legal issues
14 | 4.1,United States
15 | 4.2,The EU
16 | 4.3,Australia
17 | 4.4,India
18 | 5,Methods to prevent web scraping
19 | 6,See also
20 | 7,References
21 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [](https://oxylabs.io/pages/gitoxy?utm_source=877&utm_medium=affiliate&groupid=877&utm_content=web-scraping-tutorials-github&transaction_id=102f49063ab94276ae8f116d224b67)
2 |
3 | [](https://discord.gg/Pds3gBmKMH) [](https://www.youtube.com/@oxylabs)
4 |
--------------------------------------------------------------------------------
/python/Rotating-Proxies-With-Python/single_proxy.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from requests.exceptions import ProxyError, ReadTimeout, ConnectTimeout
3 |
4 | PROXY = 'http://2.56.215.247:3128'
5 | TIMEOUT_IN_SECONDS = 10
6 |
7 | scheme_proxy_map = {
8 | 'https': PROXY,
9 | }
10 | try:
11 | response = requests.get(
12 | 'https://ip.oxylabs.io', proxies=scheme_proxy_map, timeout=TIMEOUT_IN_SECONDS
13 | )
14 | except (ProxyError, ReadTimeout, ConnectTimeout) as error:
15 | print('Unable to connect to the proxy: ', error)
16 | else:
17 | print(response.text)
18 |
--------------------------------------------------------------------------------
/javascript/rotating-proxies-javascript/no_proxy.js:
--------------------------------------------------------------------------------
1 | // import axios
2 | const axios = require("axios");
3 |
4 | // Create and execute a new Promise
5 | (async function () {
6 | try {
7 | // This URL returns the IP address
8 | const url = `https://httpbin.org/ip`;
9 |
10 | // call the GET method on the URL
11 | const response = await axios.get(url);
12 |
13 | // print the response data, which is the IP address
14 | console.log(response.data);
15 | } catch (err) {
16 |
17 | // print the error message
18 | console.error(err);
19 | }
20 | })();
21 |
22 |
--------------------------------------------------------------------------------
/python/Scraping-Dynamic-JavaScript-Ajax-Websites-With-BeautifulSoup/data_in_same_page.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | import re
4 | import json
5 |
6 | response = requests.get('https://quotes.toscrape.com/js/')
7 | soup = BeautifulSoup(response.text, "lxml")
8 | script_tag = soup.find("script", src=None)
9 | pattern = "var data =(.+?);\n"
10 | raw_data = re.findall(pattern, script_tag.string, re.S)
11 | if raw_data:
12 | data = json.loads(raw_data[0])
13 | # prints whole data
14 | print(data)
15 |
16 | # prints only the author
17 | for i in data:
18 | print(i['author']['name'])
19 |
--------------------------------------------------------------------------------
/csharp/csharp-web-scraping/GetBookDetails.cs:
--------------------------------------------------------------------------------
1 | static List<Book> GetBookDetails(List<string> urls)
2 | {
3 | var books = new List<Book>();
4 | foreach (var url in urls)
5 | {
6 | HtmlDocument document = GetDocument(url);
7 | var titleXPath = "//h1";
8 | var priceXPath = "//div[contains(@class,\"product_main\")]/p[@class=\"price_color\"]";
9 | var book = new Book();
10 | book.Title = document.DocumentNode.SelectSingleNode(titleXPath).InnerText;
11 | book.Price = document.DocumentNode.SelectSingleNode(priceXPath).InnerText;
12 | books.Add(book);
13 | }
14 | return books;
15 | }
--------------------------------------------------------------------------------
/python/lxml-tutorial/src/countries.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from lxml import html
3 |
4 | response = requests.get('https://en.wikipedia.org/wiki/List_of_countries_by_population_in_2010')
5 |
6 | tree = html.fromstring(response.text)
7 | countries = tree.xpath('//span[@class="flagicon"]')
8 | print()
9 | for country in countries:
10 | flag = country.xpath('./img/@src')[0]
11 | country = country.xpath('./following-sibling::a/text()')[0]
12 | print(country, flag)
13 |
14 | # countries = tree.xpath('//span[@class="flagicon"]')
15 | # for country in countries:
16 | # print(country.xpath('./following-sibling::a/text()')[0])
17 |
--------------------------------------------------------------------------------
/python/Pagination-With-Python/infinite_scroll_json.py:
--------------------------------------------------------------------------------
1 | # Handling infinite scroll pages with a JSON response
2 | import requests
3 |
4 |
5 | def process_pages():
6 | url = 'http://quotes.toscrape.com/api/quotes?page={}'
7 | page_numer = 1
8 | while True:
9 | response = requests.get(url.format(page_numer))
10 | data = response.json()
11 | # Process data
12 | # ...
13 | print(response.url) # only for debug
14 | if data.get('has_next'):
15 | page_numer += 1
16 | else:
17 | break
18 |
19 |
20 | if __name__ == '__main__':
21 | process_pages()
22 |
--------------------------------------------------------------------------------
/python/building-scraping-pipeline-apache-airflow/pusher.py:
--------------------------------------------------------------------------------
1 | from bootstrap import queue, client
2 |
3 | jobs = client.create_jobs([
4 | 'https://books.toscrape.com/catalogue/sapiens-a-brief-history-of-humankind_996/index.html',
5 | 'https://books.toscrape.com/catalogue/sharp-objects_997/index.html',
6 | 'https://books.toscrape.com/catalogue/soumission_998/index.html',
7 | 'https://books.toscrape.com/catalogue/tipping-the-velvet_999/index.html',
8 | 'https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html',
9 | ])
10 |
11 | for job in jobs['queries']:
12 | queue.push(job['id'])
13 | print('job id: %s' % job['id'])
--------------------------------------------------------------------------------
/python/automate-competitors-benchmark-analysis/src/get_serp.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import pandas as pd
3 |
4 |
5 | keyword = ""
6 |
7 | payload = {
8 | "source": "SEARCH_ENGINE_search",
9 | "domain": "com",
10 | "query": keyword,
11 | "parse": "true",
12 | }
13 |
14 | response = requests.request(
15 | "POST",
16 | "https://realtime.oxylabs.io/v1/queries",
17 | auth=("", ""),
18 | json=payload,
19 | )
20 |
21 | list_comparison = [
22 | [x["url"], x["title"]]
23 | for x in response.json()["results"][0]["content"]["results"]["organic"]
24 | ]
25 |
26 | print(list_comparison)
27 |
--------------------------------------------------------------------------------
/python/News-Article-Scraper/Python/extract_article_links.py:
--------------------------------------------------------------------------------
1 | from bs4 import BeautifulSoup
2 | import requests
3 |
4 |
5 | def parse_sitemap() -> list:
6 | response = requests.get("https://www.example.com/sitemap.xml")
7 | if response.status_code != 200:
8 | return None
9 | xml_as_str = response.text
10 |
11 | soup = BeautifulSoup(xml_as_str, "lxml")
12 | loc_elements = soup.find_all("loc")
13 | links = []
14 | for loc in loc_elements:
15 | links.append(loc.text)
16 |
17 | print(f'Found {len(links)} links')
18 | return links
19 |
20 |
21 | if __name__ == '__main__':
22 | links = parse_sitemap()
23 |
--------------------------------------------------------------------------------
/python/News-Article-Scraper/JavaScript/extract_article_links.js:
--------------------------------------------------------------------------------
1 | const cheerio = require("cheerio");
2 | const axios = require("axios");
3 | url = `https://www.patrika.com/googlenewssitemap1.xml`;
4 | let links = [];
5 | async function getLinks() {
6 | try {
7 | const response = await axios.get(url);
8 | const $ = cheerio.load(response.data, { xmlMode: true });
9 | all_loc = $('loc')
10 | all_loc.each(function () {
11 | links.push($(this).text())
12 | })
13 | console.log(links.length + ' links found.')
14 |
15 | } catch (error) {
16 | console.error(error);
17 | }
18 | }
19 | getLinks();
20 |
--------------------------------------------------------------------------------
/python/automate-competitors-benchmark-analysis/src/off_page_metrics.py:
--------------------------------------------------------------------------------
1 | import time
2 | from mozscape import Mozscape
3 |
4 | client = Mozscape("", "")
5 |
6 | for y in list_comparison:  # list_comparison comes from get_serp.py
7 | try:
8 | print("Getting MOZ results for: " + y[0])
9 | domainAuthority = client.urlMetrics(y[0])
10 | y.extend([domainAuthority["ueid"], domainAuthority["uid"], domainAuthority["pda"]])
11 | except Exception as e:
12 | print(e)
13 | time.sleep(10) # Retry once after 10 seconds.
14 | domainAuthority = client.urlMetrics(y[0])
15 | y.extend([domainAuthority["ueid"], domainAuthority["uid"], domainAuthority["pda"]])
--------------------------------------------------------------------------------
/python/Scraping-Dynamic-JavaScript-Ajax-Websites-With-BeautifulSoup/selenium_example.py:
--------------------------------------------------------------------------------
1 | from selenium.webdriver import Chrome
2 |
3 | # update executable_path as required
4 | driver = Chrome(executable_path='c:/driver/chromedriver.exe')
5 |
6 | driver.get('https://quotes.toscrape.com/js/')
7 |
8 | try:
9 | # print first author
10 | author_element = driver.find_element_by_tag_name("small")
11 | print(author_element.text)
12 |
13 | # print all authors
14 | all_author_elements = driver.find_elements_by_tag_name("small")
15 | for element in all_author_elements:
16 | print(element.text)
17 | finally:
18 | # always close the browser
19 | driver.quit()
20 |
--------------------------------------------------------------------------------
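The Selenium snippets in this folder use the Selenium 3 style API. In Selenium 4 the executable_path argument and the find_element_by_* helpers were removed, so an equivalent call would look roughly like this (driver path assumed, as above):

    from selenium.webdriver import Chrome
    from selenium.webdriver.chrome.service import Service
    from selenium.webdriver.common.by import By

    driver = Chrome(service=Service('c:/driver/chromedriver.exe'))
    driver.get('https://quotes.toscrape.com/js/')
    # print the first author, as in the original example
    print(driver.find_element(By.TAG_NAME, 'small').text)
    driver.quit()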
/python/building-scraping-pipeline-apache-airflow/puller.py:
--------------------------------------------------------------------------------
1 | from pprint import pprint
2 | from bootstrap import queue, client
3 |
4 | queue_item = queue.pull()
5 | if not queue_item:
6 | print('No jobs left in the queue, exiting')
7 | exit(0)
8 |
9 | if not client.is_status_done(queue_item['job_id']):
10 | queue.touch(queue_item['job_id'])
11 | print('Job is not yet finished, skipping')
12 | exit(0)
13 |
14 | content_list = client.fetch_content_list(queue_item['job_id'])
15 | if content_list is None:
16 | print('Job no longer exists in oxy')
17 | queue.delete(queue_item['job_id'])
18 | exit(0)
19 |
20 | queue.complete(queue_item['job_id'])
21 |
22 | for content in content_list:
23 | pprint(content)
--------------------------------------------------------------------------------
/VBA/Web Scraping With Excel VBA Guide/src/scrape_quotes.vb:
--------------------------------------------------------------------------------
1 | Sub scrape_quotes()
2 | Dim browser As InternetExplorer
3 | Dim page As HTMLDocument
4 | Dim quotes As Object
5 | Dim authors As Object
6 |
7 | Set browser = New InternetExplorer
8 | browser.Visible = True
9 | browser.navigate ("https://quotes.toscrape.com")
10 | Do While browser.Busy: Loop
11 |
12 | Set page = browser.document
13 | Set quotes = page.getElementsByClassName("quote")
14 | Set authors = page.getElementsByClassName("author")
15 |
16 | For num = 1 To 5
17 | Cells(num, 1).Value = quotes.Item(num).innerText
18 | Cells(num, 2).Value = authors.Item(num).innerText
19 | Next num
20 |
21 | browser.Quit
22 | End Sub
23 |
--------------------------------------------------------------------------------
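Both VBA macros use early binding to InternetExplorer and HTMLDocument, so the Microsoft Internet Controls and Microsoft HTML Object Library references must be enabled in the VBA editor (Tools > References) before the code compiles.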
/python/Pagination-With-Python/no_next_button.py:
--------------------------------------------------------------------------------
1 | # Handling pagination without a Next button (page-number links)
2 | import requests
3 | from bs4 import BeautifulSoup
4 | from urllib.parse import urljoin
5 |
6 |
7 | def process_pages():
8 | url = 'https://www.gosc.pl/doc/791526.Zaloz-zbroje'
9 | response = requests.get(url)
10 | soup = BeautifulSoup(response.text, 'lxml')
11 | page_link_el = soup.select('.pgr_nrs a')
12 | # process first page
13 | for link_el in page_link_el:
14 | link = urljoin(url, link_el.get('href'))
15 | response = requests.get(link)
16 | soup = BeautifulSoup(response.text, 'lxml')
17 | print(response.url)
18 | # process remaining pages
19 |
20 |
21 | if __name__ == '__main__':
22 | process_pages()
23 |
--------------------------------------------------------------------------------
/python/Scraping-Dynamic-JavaScript-Ajax-Websites-With-BeautifulSoup/selenium_bs4.py:
--------------------------------------------------------------------------------
1 | from selenium.webdriver import Chrome
2 | from bs4 import BeautifulSoup
3 | # update executable_path as required
4 | driver = Chrome(executable_path='c:/driver/chromedriver.exe')
5 |
6 | driver.get('https://quotes.toscrape.com/js/')
7 |
8 | try:
9 | soup = BeautifulSoup(driver.page_source, "lxml")
10 | # print first author
11 | author_element = soup.find("small", class_="author")
12 | print(author_element.text)
13 |
14 | # print all authors
15 | all_author_elements = soup.find_all("small", class_="author")
16 | for element in all_author_elements:
17 | print(element.text)
18 | finally:
19 | # always close the browser
20 | driver.quit()
21 |
--------------------------------------------------------------------------------
/python/lxml-tutorial/src/creating_xml_html.py:
--------------------------------------------------------------------------------
1 |
2 | from lxml import etree
3 |
4 | root = etree.Element("html")
5 | head = etree.SubElement(root, "head")
6 | title = etree.SubElement(head, "title")
7 | title.text = "This is Page Title"
8 | body = etree.SubElement(root, "body")
9 | heading = etree.SubElement(body, "h1", style="font-size:20pt", id="head")
10 | heading.text = "Hello World!"
11 | para = etree.SubElement(body, "p", id="firstPara")
12 | para.text = "This HTML is XML Compliant!"
13 | para = etree.SubElement(body, "p", id="secondPara")
14 | para.text = "This is the second paragraph."
15 |
16 | etree.dump(root) # prints everything to console. Use for debug only
17 |
18 |
19 | with open('input.html', 'wb') as f:
20 | f.write(etree.tostring(root, pretty_print=True))
21 |
--------------------------------------------------------------------------------
/javascript/puppeteer-tutorial/bnb.js:
--------------------------------------------------------------------------------
1 | const puppeteer = require("puppeteer");
2 | (async () => {
3 | let url = "https://www.airbnb.com/s/homes?refinement_paths%5B%5D=%2Fhomes&search_type=section_navigation&property_type_id%5B%5D=8";
4 | const browser = await puppeteer.launch();
5 | const page = await browser.newPage();
6 | await page.goto(url);
7 | data = await page.evaluate(() => {
8 | root = Array.from(document.querySelectorAll("#FMP-target [itemprop='itemListElement']"));
9 | hotels = root.map(hotel => ({
10 | Name: hotel.querySelector('ol').parentElement.nextElementSibling.textContent,
11 | Photo: hotel.querySelector("img").getAttribute("src")
12 | }));
13 | return hotels;
14 | });
15 | console.log(data);
16 | await browser.close();
17 | })();
--------------------------------------------------------------------------------
/python/building-scraping-pipeline-apache-airflow/bootstrap.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import psycopg2
4 |
5 | from messenger import Queue
6 | from oxylabs import Client
7 |
8 | DB_HOST = os.getenv('DB_HOST', 'postgres')
9 | DB_USER = os.getenv('DB_USER', 'airflow')
10 | DB_PASS = os.getenv('DB_PASS', 'airflow')
11 | DB_NAME = os.getenv('DB_NAME', 'scraper')
12 | OXYLABS_USERNAME = os.getenv('OXYLABS_USERNAME', 'your-oxylabs-username')
13 | OXYLABS_PASSWORD = os.getenv('OXYLABS_PASSWORD', 'your-oxylabs-password')
14 |
15 | connection = psycopg2.connect(
16 | host=DB_HOST,
17 | user=DB_USER,
18 | password=DB_PASS,
19 | database=DB_NAME
20 | )
21 |
22 | queue = Queue(
23 | connection
24 | )
25 |
26 | client = Client(
27 | OXYLABS_USERNAME,
28 | OXYLABS_PASSWORD,
29 | )
--------------------------------------------------------------------------------
/python/regex-web-scraping/demo.py:
--------------------------------------------------------------------------------
1 | # Importing the required libraries.
2 | import requests
3 | from bs4 import BeautifulSoup
4 | import re
5 |
6 | # Requesting the HTML from the web page.
7 | page = requests.get("https://books.toscrape.com/")
8 |
9 | # Selecting the data.
10 | soup = BeautifulSoup(page.content, "html.parser")
11 | content = soup.find_all(class_="product_pod")
12 | content = str(content)
13 |
14 | # Processing the data using Regular Expressions.
15 | re_titles = r'title="(.*?)">'
16 | titles_list = re.findall(re_titles, content)
17 | re_prices = "£(.*?)</p>"
18 | price_list = re.findall(re_prices, content)
19 |
20 | # Saving the output.
21 | with open("output.txt", "w") as f:
22 | for title, price in zip(titles_list, price_list):
23 | f.write(title + "\t" + price + "\n")
--------------------------------------------------------------------------------
/python/Python-Web-Scraping-Tutorial/python_toc.csv:
--------------------------------------------------------------------------------
1 | heading_number,heading_text
2 | 1,History
3 | 2,Design philosophy and features
4 | 3,Syntax and semantics
5 | 3.1,Indentation
6 | 3.2,Statements and control flow
7 | 3.3,Expressions
8 | 3.4,Methods
9 | 3.5,Typing
10 | 3.6,Arithmetic operations
11 | 4,Programming examples
12 | 5,Libraries
13 | 6,Development environments
14 | 7,Implementations
15 | 7.1,Reference implementation
16 | 7.2,Other implementations
17 | 7.3,Unsupported implementations
18 | 7.4,Cross-compilers to other languages
19 | 7.5,Performance
20 | 8,Development
21 | 9,API documentation generators
22 | 10,Naming
23 | 11,Uses
24 | 12,Languages influenced by Python
25 | 13,See also
26 | 14,References
27 | 14.1,Sources
28 | 15,Further reading
29 | 16,External links
30 |
--------------------------------------------------------------------------------
/python/Rotating-Proxies-With-Python/rotating_multiple_proxies.py:
--------------------------------------------------------------------------------
1 | import csv
2 |
3 | import requests
4 | from requests.exceptions import ProxyError, ReadTimeout, ConnectTimeout
5 |
6 | TIMEOUT_IN_SECONDS = 10
7 | CSV_FILENAME = 'proxies.csv'
8 |
9 | with open(CSV_FILENAME) as open_file:
10 | reader = csv.reader(open_file)
11 | for csv_row in reader:
12 | scheme_proxy_map = {
13 | 'https': csv_row[0],
14 | }
15 |
16 | try:
17 | response = requests.get(
18 | 'https://ip.oxylabs.io',
19 | proxies=scheme_proxy_map,
20 | timeout=TIMEOUT_IN_SECONDS,
21 | )
22 | except (ProxyError, ReadTimeout, ConnectTimeout) as error:
23 | pass
24 | else:
25 | print(response.text)
26 |
--------------------------------------------------------------------------------
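rotating_multiple_proxies.py reads its proxies from a proxies.csv file that is not part of this listing. Since only the first column of each row is used, an assumed format is one proxy URL per line, for example:

    http://user:pwd@127.0.0.1:3128
    http://user:pwd@127.0.0.2:3128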
/python/building-scraping-pipeline-apache-airflow/DAG/setup.py:
--------------------------------------------------------------------------------
1 | from datetime import timedelta
2 |
3 | import pendulum
4 | from airflow import DAG
5 | from airflow.operators.bash import BashOperator
6 |
7 | default_args = {
8 | 'owner': 'airflow',
9 | 'depends_on_past': False,
10 | 'retries': 2,
11 | 'retry_delay': timedelta(hours=3),
12 | }
13 | with DAG(
14 | 'setup',
15 | default_args=default_args,
16 | schedule_interval='@once',
17 | description='Setup',
18 | start_date=pendulum.datetime(2022, 5, 1, tz='UTC'),
19 | dagrun_timeout=timedelta(minutes=1),
20 | tags=['scrape', 'database'],
21 | catchup=False
22 | ) as dag:
23 | setup_task = BashOperator(
24 | task_id='setup',
25 | bash_command='python /opt/airflow/src/setup.py',
26 | )
--------------------------------------------------------------------------------
/python/playwright-web-scraping/node/book.js:
--------------------------------------------------------------------------------
1 | const playwright = require('playwright');
2 |
3 | (async () => {
4 | const browser = await playwright.chromium.launch();
5 | const page = await browser.newPage();
6 | await page.goto('https://books.toscrape.com/');
7 | const books = await page.$$eval('.product_pod', all_items => {
8 | const data = [];
9 | all_items.forEach(book => {
10 | const name = book.querySelector('h3').innerText;
11 | const price = book.querySelector('.price_color').innerText;
12 | const stock = book.querySelector('.availability').innerText;
13 | data.push({ name, price, stock});
14 | });
15 | return data;
16 | });
17 | console.log(books);
18 | console.log(books.length);
19 | await browser.close();
20 | })();
21 |
--------------------------------------------------------------------------------
/python/Pagination-With-Python/next_button.py:
--------------------------------------------------------------------------------
1 | # Handling pages with Next button
2 | import requests
3 | from bs4 import BeautifulSoup
4 | from urllib.parse import urljoin
5 |
6 |
7 | def process_pages():
8 | url = 'http://books.toscrape.com/catalogue/category/books/fantasy_19/index.html'
9 |
10 | while True:
11 | response = requests.get(url)
12 | soup = BeautifulSoup(response.text, "lxml")
13 |
14 | footer_element = soup.select_one('li.current')
15 | print(footer_element.text.strip())
16 |
17 | # Pagination
18 | next_page_element = soup.select_one('li.next > a')
19 | if next_page_element:
20 | next_page_url = next_page_element.get('href')
21 | url = urljoin(url, next_page_url)
22 | else:
23 | break
24 |
25 |
26 | if __name__ == '__main__':
27 | process_pages()
28 |
--------------------------------------------------------------------------------
/python/Scraping-Dynamic-JavaScript-Ajax-Websites-With-BeautifulSoup/selenium_bs4_headless.py:
--------------------------------------------------------------------------------
1 | from selenium.webdriver import Chrome, ChromeOptions
2 | from bs4 import BeautifulSoup
3 |
4 | # Hide the browser
5 | options = ChromeOptions()
6 | options.headless = True
7 |
8 | # update executable_path as required
9 | driver = Chrome(executable_path='c:/driver/chromedriver.exe', options=options)
10 |
11 | driver.get('https://quotes.toscrape.com/js/')
12 |
13 | try:
14 | soup = BeautifulSoup(driver.page_source, "lxml")
15 | # print first author
16 | author_element = soup.find("small", class_="author")
17 | print(author_element.text)
18 |
19 | # print all authors
20 | all_author_elements = soup.find_all("small", class_="author")
21 | for element in all_author_elements:
22 | print(element.text)
23 | finally:
24 | # always close the browser
25 | driver.quit()
26 |
--------------------------------------------------------------------------------
/python/how-to-make-web-scraping-faster/sync-scraping.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import re
3 | import time
4 | import requests
5 |
6 | def get_links():
7 | links = []
8 | with open("links.csv", "r") as f:
9 | reader = csv.reader(f)
10 | for i, row in enumerate(reader):
11 | links.append(row[0])
12 |
13 | return links
14 |
15 | def get_response(session, url):
16 | with session.get(url) as resp:
17 | print('.', end='', flush=True)
18 | text = resp.text
19 | exp = r'(<title>).*(<\/title>)'
20 | return re.search(exp, text,flags=re.DOTALL).group(0)
21 |
22 | def main():
23 | start_time = time.time()
24 | with requests.Session() as session:
25 | results = []
26 | for url in get_links():
27 | result = get_response(session, url)
28 | print(result)
29 |
30 | print(f"{(time.time() - start_time):.2f} seconds")
31 |
32 | main()
--------------------------------------------------------------------------------
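The scripts in how-to-make-web-scraping-faster read their target URLs from a links.csv file that is not included here; the assumed format is one URL per row in the first column, for example:

    https://books.toscrape.com/catalogue/page-1.html
    https://books.toscrape.com/catalogue/page-2.html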
/python/Pagination-With-Python/load_more_json.py:
--------------------------------------------------------------------------------
1 | # Handling pages with load more button with JSON
2 | import requests
3 | from bs4 import BeautifulSoup
4 | import math
5 |
6 |
7 | def process_pages():
8 | url = 'https://smarthistory.org/wp-json/smthstapi/v1/objects?tag=938&page={}'
9 | headers = {
10 | 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36',
11 | }
12 | page_numer = 1
13 | while True:
14 | response = requests.get(url.format(page_numer), headers=headers)
15 | data = response.json()
16 | # Process data
17 | # ...
18 | print(response.url) # only for debug
19 | if data.get('remaining') and int(data.get('remaining')) > 0:
20 | page_numer += 1
21 | else:
22 | break
23 |
24 |
25 | if __name__ == '__main__':
26 | process_pages()
27 |
--------------------------------------------------------------------------------
/javascript/how-to-build-web-scraper/web_scraper.js:
--------------------------------------------------------------------------------
1 | const fs = require("fs");
2 | const j2cp = require("json2csv").Parser;
3 | const axios = require("axios");
4 | const cheerio = require("cheerio");
5 |
6 | const wiki_python = "https://en.wikipedia.org/wiki/Python_(programming_language)";
7 |
8 | async function getWikiTOC(url) {
9 | try {
10 | const response = await axios.get(url);
11 | const $ = cheerio.load(response.data);
12 |
13 | const TOC = $("li.toclevel-1");
14 | let toc_data = [];
15 | TOC.each(function () {
16 | level = $(this).find("span.tocnumber").first().text();
17 | text = $(this).find("span.toctext").first().text();
18 | toc_data.push({ level, text });
19 | });
20 | const parser = new j2cp();
21 | const csv = parser.parse(toc_data);
22 | fs.writeFileSync("./wiki_toc.csv", csv);
23 | } catch (err) {
24 | console.error(err);
25 | }
26 | }
27 |
28 | getWikiTOC(wiki_python);
29 |
--------------------------------------------------------------------------------
/golang/golang-web-scraper/src/go.mod:
--------------------------------------------------------------------------------
1 | module oxylabs.io/web-scraping-with-go
2 |
3 | go 1.19
4 |
5 | require (
6 | github.com/PuerkitoBio/goquery v1.8.0 // indirect
7 | github.com/andybalholm/cascadia v1.3.1 // indirect
8 | github.com/antchfx/htmlquery v1.2.5 // indirect
9 | github.com/antchfx/xmlquery v1.3.12 // indirect
10 | github.com/antchfx/xpath v1.2.1 // indirect
11 | github.com/gobwas/glob v0.2.3 // indirect
12 | github.com/gocolly/colly v1.2.0 // indirect
13 | github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e // indirect
14 | github.com/golang/protobuf v1.3.1 // indirect
15 | github.com/kennygrant/sanitize v1.2.4 // indirect
16 | github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca // indirect
17 | github.com/temoto/robotstxt v1.1.2 // indirect
18 | golang.org/x/net v0.0.0-20221004154528-8021a29435af // indirect
19 | golang.org/x/text v0.3.7 // indirect
20 | google.golang.org/appengine v1.6.7 // indirect
21 | )
22 |
--------------------------------------------------------------------------------
/python/how-to-make-web-scraping-faster/multiproc-scraping.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import re
3 | import time
4 | import requests
5 | from multiprocessing import Pool
6 |
7 | def get_links():
8 | links = []
9 | with open("links.csv", "r") as f:
10 | reader = csv.reader(f)
11 | for i, row in enumerate(reader):
12 | links.append(row[0])
13 |
14 | return links
15 |
16 | def get_response(url):
17 | resp = requests.get(url)
18 | print('.', end='', flush=True)
19 | text = resp.text
20 |
21 | exp = r'(<title>).*(<\/title>)'
22 | return re.search(exp, text, flags=re.DOTALL).group(0)
23 |
24 | def main():
25 | start_time = time.time()
26 | links = get_links()
27 |
28 | with Pool(100) as p:
29 | results = p.map(get_response, links)
30 |
31 | for result in results:
32 | print(result)
33 |
34 | print(f"{(time.time() - start_time):.2f} seconds")
--------------------------------------------------------------------------------
/python/how-to-make-web-scraping-faster/multithread-scraping.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import re
3 | import time
4 | import requests
5 | from concurrent.futures import ThreadPoolExecutor
6 |
7 | def get_links():
8 | links = []
9 | with open("links.csv", "r") as f:
10 | reader = csv.reader(f)
11 | for i, row in enumerate(reader):
12 | links.append(row[0])
13 |
14 | return links
15 |
16 | def get_response(url):
17 | resp = requests.get(url)
18 | print('.', end='', flush=True)
19 | text = resp.text
20 |
21 |     exp = r'(<title>).*(<\/title>)'
22 | return re.search(exp, text, flags=re.DOTALL).group(0)
23 |
24 | def main():
25 | start_time = time.time()
26 | links = get_links()
27 |
28 | with ThreadPoolExecutor(max_workers=100) as p:
29 | results = p.map(get_response, links)
30 |
31 | for result in results:
32 | print(result)
33 |
34 |     print(f"{(time.time() - start_time):.2f} seconds")
35 |
36 |
37 | if __name__ == '__main__':
38 |     main()
--------------------------------------------------------------------------------
/python/playwright-web-scraping/python/books.py:
--------------------------------------------------------------------------------
1 | from playwright.async_api import async_playwright
2 | import asyncio
3 |
4 |
5 | async def main():
6 | async with async_playwright() as pw:
7 | browser = await pw.chromium.launch()
8 | page = await browser.new_page()
9 | await page.goto('https://books.toscrape.com')
10 |
11 | all_items = await page.query_selector_all('.product_pod')
12 | books = []
13 | for item in all_items:
14 | book = {}
15 | name_el = await item.query_selector('h3')
16 | book['name'] = await name_el.inner_text()
17 | price_el = await item.query_selector('.price_color')
18 | book['price'] = await price_el.inner_text()
19 | stock_el = await item.query_selector('.availability')
20 | book['stock'] = await stock_el.inner_text()
21 | books.append(book)
22 | print(books)
23 | await browser.close()
24 |
25 | if __name__ == '__main__':
26 | asyncio.run(main())
--------------------------------------------------------------------------------
/python/building-scraping-pipeline-apache-airflow/DAG/push-pull.py:
--------------------------------------------------------------------------------
1 | from datetime import timedelta
2 |
3 | import pendulum
4 | from airflow import DAG
5 | from airflow.operators.bash import BashOperator
6 |
7 | default_args = {
8 | 'owner': 'airflow',
9 | 'depends_on_past': True,
10 | 'retries': 2,
11 | 'retry_delay': timedelta(hours=3),
12 | }
13 | with DAG(
14 | 'push_pull',
15 | default_args=default_args,
16 | schedule_interval='@daily',
17 | description='Push-Pull workflow',
18 | start_date=pendulum.datetime(2022, 5, 1, tz='UTC'),
19 | dagrun_timeout=timedelta(minutes=1),
20 | tags=['scrape', 'database'],
21 | catchup=False
22 | ) as dag:
23 | task_push = BashOperator(
24 | task_id='push',
25 | bash_command='python /opt/airflow/src/pusher.py',
26 | )
27 |
28 | task_pull = BashOperator(
29 | task_id='pull',
30 | bash_command='python /opt/airflow/src/puller.py'
31 | )
32 |
33 | task_push.set_downstream(task_pull)
--------------------------------------------------------------------------------
/javascript/rotating-proxies-javascript/single_proxy_axios.js:
--------------------------------------------------------------------------------
1 | // Import axios
2 | const axios = require("axios");
3 |
4 | // Create and execute a new Promise
5 | (async function () {
6 | try {
7 |
8 |         // Proxy without authentication
9 |         const proxy_no_auth = {
10 | host: '206.253.164.122',
11 | port: 80
12 | }
13 |
14 | // Proxy with authentication
15 |         const proxy_with_auth = {
16 | host: '46.138.246.248',
17 | port: 8088,
18 | auth: {
19 | username: 'USERNAME',
20 | password: 'PASSWORD'
21 | }
22 | }
23 | const url = `https://httpbin.org/ip`;
24 |
25 | // Call the GET method on the URL with proxy information
26 | const response = await axios.get(url, {
27 | proxy: proxy_no_auth
28 | });
29 | // Print effective IP address
30 | console.log(response.data);
31 | } catch (err) {
32 |
33 | //Log the error message
34 | console.error(err);
35 | }
36 | })();
37 |
38 |
--------------------------------------------------------------------------------
/python/how-to-make-web-scraping-faster/async-scraping.py:
--------------------------------------------------------------------------------
1 | import aiohttp
2 | import asyncio
3 | import csv
4 | import re
5 | import time
6 |
7 | def get_links():
8 | links = []
9 | with open("links.csv", "r") as f:
10 | reader = csv.reader(f)
11 | for i, row in enumerate(reader):
12 | links.append(row[0])
13 |
14 | return links
15 |
16 | async def get_response(session, url):
17 | async with session.get(url) as resp:
18 | text = await resp.text()
19 |
20 |         exp = r'(<title>).*(<\/title>)'
21 |         return re.search(exp, text, flags=re.DOTALL).group(0)
22 |
23 | async def main():
24 | start_time = time.time()
25 | async with aiohttp.ClientSession() as session:
26 |
27 | tasks = []
28 | for url in get_links():
29 | tasks.append(asyncio.create_task(get_response(session, url)))
30 |
31 | results = await asyncio.gather(*tasks)
32 | for result in results:
33 | print(result)
34 |
35 | print(f"{(time.time() - start_time):.2f} seconds")
36 |
37 |
38 | asyncio.run(main())
--------------------------------------------------------------------------------
/python/News-Article-Scraper/JavaScript/news_article_scraper.js:
--------------------------------------------------------------------------------
1 | const cheerio = require("cheerio");
2 | const axios = require("axios");
3 | url = `https://www.example.com/sitemap.xml`;
4 | let links = [];
5 | async function getLinks() {
6 | try {
7 | const response = await axios.get(url);
8 | const $ = cheerio.load(response.data, { xmlMode: true });
9 | all_loc = $('loc');
10 | all_loc.each(function () {
11 | links.push($(this).text());
12 | })
13 | console.log(links.length + ' links found.');
14 | links.forEach(async function (story_link) {
15 | try {
16 | let story = await axios.get(story_link);
17 | let $ = cheerio.load(story.data);
18 | heading = $('h1').text()
19 | body = $('.complete-story p').text()
20 |
21 | } catch (error) {
22 | console.error('internal\n' + error)
23 | }
24 | })
25 |
26 | } catch (error) {
27 | console.error(error);
28 | }
29 | }
30 | getLinks();
31 |
--------------------------------------------------------------------------------
/python/Pagination-With-Python/infinite_scroll_html.py:
--------------------------------------------------------------------------------
1 | # Handling infinite scroll pages that return HTML responses
2 | import requests
3 | from bs4 import BeautifulSoup
4 | import math
5 |
6 |
7 | def process_pages():
8 | index_page = 'https://techinstr.myshopify.com/collections/all'
9 | url = 'https://techinstr.myshopify.com/collections/all?page={}'
10 |
11 | session = requests.session()
12 | response = session.get(index_page)
13 | soup = BeautifulSoup(response.text, "lxml")
14 | count_element = soup.select_one('.filters-toolbar__product-count')
15 | count_str = count_element.text.replace('products', '')
16 | count = int(count_str)
17 | # Process page 1 data here
18 | page_count = math.ceil(count/8)
19 |     for page_number in range(2, page_count+1):
20 |         response = session.get(url.format(page_number))
21 | soup = BeautifulSoup(response.text, "lxml")
22 | first_product = soup.select_one('.product-card:nth-child(1) > a > span')
23 | print(first_product.text.strip())
24 |
25 |
26 | if __name__ == '__main__':
27 | process_pages()
28 |
--------------------------------------------------------------------------------
/python/automate-competitors-benchmark-analysis/src/page_speed_metrics.py:
--------------------------------------------------------------------------------
1 | import requests
2 |
3 | pagespeed_key = ""
4 |
5 | # list_comparison is assumed to be built by an earlier step of the pipeline; each row starts with the page URL
6 | for y in list_comparison:
7 | try:
8 |
9 | print("Getting results for: " + y[0])
10 | url = "https://www.googleapis.com/pagespeedonline/v5/runPagespeed?url=" + y[0] + "&strategy=mobile&locale=en&key=" + pagespeed_key
11 | response = requests.request("GET", url)
12 | data = response.json()
13 |
14 | overall_score = data["lighthouseResult"]["categories"]["performance"]["score"] * 100
15 | fcp = data["loadingExperience"]["metrics"]["FIRST_CONTENTFUL_PAINT_MS"]["percentile"]/1000
16 | fid = data["loadingExperience"]["metrics"]["FIRST_INPUT_DELAY_MS"]["percentile"]/1000
17 | lcp = data["loadingExperience"]["metrics"]["LARGEST_CONTENTFUL_PAINT_MS"]["percentile"]
18 | cls = data["loadingExperience"]["metrics"]["CUMULATIVE_LAYOUT_SHIFT_SCORE"]["percentile"]/100
19 |
20 |
21 |
22 | y.extend([fcp, fid, lcp, cls, overall_score])
23 |
24 | except Exception as e:
25 | print(e)
26 |         y.extend(["No data"] * 5)
--------------------------------------------------------------------------------
/python/Rotating-Proxies-With-Python/rotating_multiple_proxies_async.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import aiohttp
3 | import asyncio
4 |
5 | CSV_FILENAME = 'proxies.csv'
6 | URL_TO_CHECK = 'https://ip.oxylabs.io'
7 | TIMEOUT_IN_SECONDS = 10
8 |
9 |
10 | async def check_proxy(url, proxy):
11 | try:
12 | session_timeout = aiohttp.ClientTimeout(
13 | total=None, sock_connect=TIMEOUT_IN_SECONDS, sock_read=TIMEOUT_IN_SECONDS
14 | )
15 | async with aiohttp.ClientSession(timeout=session_timeout) as session:
16 | async with session.get(
17 | url, proxy=proxy, timeout=TIMEOUT_IN_SECONDS
18 | ) as resp:
19 | print(await resp.text())
20 | except Exception as error:
21 | print('Proxy responded with an error: ', error)
22 | return
23 |
24 |
25 | async def main():
26 | tasks = []
27 | with open(CSV_FILENAME) as open_file:
28 | reader = csv.reader(open_file)
29 | for csv_row in reader:
30 | task = asyncio.create_task(check_proxy(URL_TO_CHECK, csv_row[0]))
31 | tasks.append(task)
32 |
33 | await asyncio.gather(*tasks)
34 |
35 |
36 | asyncio.run(main())
37 |
--------------------------------------------------------------------------------
/golang/golang-web-scraper/src/books.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "encoding/csv"
5 | "fmt"
6 | "log"
7 | "os"
8 |
9 | "github.com/gocolly/colly"
10 | )
11 |
12 | type Book struct {
13 | Title string
14 | Price string
15 | }
16 |
17 | func main() {
18 | file, err := os.Create("export.csv")
19 | if err != nil {
20 | log.Fatal(err)
21 | }
22 | defer file.Close()
23 | writer := csv.NewWriter(file)
24 | defer writer.Flush()
25 | headers := []string{"Title", "Price"}
26 | writer.Write(headers)
27 |
28 | c := colly.NewCollector(
29 | colly.AllowedDomains("books.toscrape.com"),
30 | )
31 |
32 | c.OnRequest(func(r *colly.Request) {
33 | fmt.Println("Visiting: ", r.URL.String())
34 | })
35 |
36 | c.OnHTML(".next > a", func(e *colly.HTMLElement) {
37 | nextPage := e.Request.AbsoluteURL(e.Attr("href"))
38 | c.Visit(nextPage)
39 | })
40 |
41 | c.OnHTML(".product_pod", func(e *colly.HTMLElement) {
42 | book := Book{}
43 | book.Title = e.ChildAttr(".image_container img", "alt")
44 | book.Price = e.ChildText(".price_color")
45 | row := []string{book.Title, book.Price}
46 | writer.Write(row)
47 | })
48 |
49 | startUrl := "https://books.toscrape.com/"
50 | c.Visit(startUrl)
51 | }
52 |
--------------------------------------------------------------------------------
/javascript/javascript-web-scraping/books.js:
--------------------------------------------------------------------------------
1 | const fs = require("fs");
2 | const j2cp = require("json2csv").Parser;
3 | const axios = require("axios");
4 | const cheerio = require("cheerio");
5 |
6 | const mystery = "http://books.toscrape.com/catalogue/category/books/mystery_3/index.html";
7 |
8 | const books_data = [];
9 |
10 | async function getBooks(url) {
11 | try {
12 | const response = await axios.get(url);
13 | const $ = cheerio.load(response.data);
14 |
15 | const books = $("article");
16 | books.each(function () {
17 | title = $(this).find("h3 a").text();
18 | price = $(this).find(".price_color").text();
19 | stock = $(this).find(".availability").text().trim();
20 | books_data.push({ title, price, stock });
21 | });
22 | // console.log(books_data);
23 | const baseUrl = "http://books.toscrape.com/catalogue/category/books/mystery_3/";
24 | if ($(".next a").length > 0) {
25 | next = baseUrl + $(".next a").attr("href");
26 | getBooks(next);
27 | } else {
28 | const parser = new j2cp();
29 | const csv = parser.parse(books_data);
30 | fs.writeFileSync("./books.csv", csv);
31 | }
32 | } catch (err) {
33 | console.error(err);
34 | }
35 | }
36 |
37 | getBooks(mystery);
--------------------------------------------------------------------------------
/python/automate-competitors-benchmark-analysis/src/get_top_urls.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | # list_comparison and keyword are assumed to be defined by an earlier step of the pipeline
4 | for y in list_comparison:
5 | try:
6 | print("Scraping: " + y[0])
7 | html = requests.request("get", y[0])
8 |         soup = BeautifulSoup(html.text, "html.parser")
9 |
10 | try:
11 | metatitle = (soup.find("title")).get_text()
12 | except Exception:
13 | metatitle = ""
14 |
15 | try:
16 | metadescription = soup.find("meta", attrs={"name": "description"})["content"]
17 | except Exception:
18 | metadescription = ""
19 |
20 | try:
21 | h1 = soup.find("h1").get_text()
22 | except Exception:
23 | h1 = ""
24 |
25 | paragraph = [a.get_text() for a in soup.find_all('p')]
26 | text_length = sum(len(a) for a in paragraph)
27 | text_counter = sum(a.lower().count(keyword) for a in paragraph)
28 | metatitle_occurrence = keyword in metatitle.lower()
29 | h1_occurrence = keyword in h1.lower()
30 | metatitle_equal = metatitle == y[1]
31 | y.extend([metatitle, metatitle_equal, metadescription, h1, paragraph, text_length, text_counter, metatitle_occurrence, h1_occurrence])
32 |
33 | except Exception as e:
34 | print(e)
35 | y.extend(["No data"]*9)
--------------------------------------------------------------------------------
/r/web-scraping-r/src/dynamic_rselenium.R:
--------------------------------------------------------------------------------
1 | # install.packages("RSelenium")
2 | # install.packages("dplyr")
3 |
4 | library(RSelenium)
5 | library(dplyr)
6 |
7 | # Method 1
8 | # Install chromedriver
9 | # Documentation at https://cran.r-project.org/web/packages/RSelenium/RSelenium.pdf page 16
10 |
11 | rD <- rsDriver(browser="chrome", port=9511L, verbose=F)
12 | remDr <- rD[["client"]]
13 |
14 | # Method 2
15 | # Run the following from terminal. Docker required.
16 | # docker pull selenium/standalone-firefox
17 | # docker run -d -p 4445:4444 selenium/standalone-firefox
18 | # Run
19 | # remDr <- remoteDriver(
20 | # remoteServerAddr = "localhost",
21 | # port = 4445L,
22 | # browserName = "firefox"
23 | # )
24 | # remDr$open()
25 |
26 | remDr$navigate("https://books.toscrape.com/")
27 | remDr$getCurrentUrl()
28 |
29 | titleElements <- remDr$findElements(using = "xpath", "//article//img")
30 | titles <- sapply(titleElements, function(x){x$getElementAttribute("alt")[[1]]})
31 | pricesElements <- remDr$findElements(using = "xpath", "//*[@class='price_color']")
32 | prices <- sapply(pricesElements, function(x){x$getElementText()[[1]]})
33 | stockElements <- remDr$findElements(using = "xpath", "//*[@class='instock availability']")
34 | stocks <- sapply(stockElements, function(x){x$getElementText()[[1]]})
35 |
36 | df <- data.frame(titles, prices, stocks)
37 | remDr$close()
38 |
39 |
40 | write.csv(df, "./books_selenium.csv")
41 |
42 |
--------------------------------------------------------------------------------
/python/News-Article-Scraper/Python/news_article_scraper.py:
--------------------------------------------------------------------------------
1 | from bs4 import BeautifulSoup
2 | import requests
3 | import csv
4 |
5 |
6 | def parse_sitemap() -> list:
7 | response = requests.get("https://www.example.com/sitemap.xml")
8 | if response.status_code != 200:
9 | return None
10 | xml_as_str = response.text
11 |
12 | soup = BeautifulSoup(xml_as_str, "lxml")
13 | loc_elements = soup.find_all("loc")
14 | links = []
15 | for loc in loc_elements:
16 | links.append(loc.text)
17 |
18 | print(f'Found {len(links)} links')
19 | return links
20 |
21 |
22 | def parse_articles(links: list):
23 | s = requests.Session()
24 | with open("news.csv", "w", encoding="utf-8", newline="") as f:
25 | writer = csv.DictWriter(f, fieldnames=['Heading', 'Body'])
26 | writer.writeheader()
27 | for link in links:
28 | response = s.get(link)
29 | soup = BeautifulSoup(response.text, "lxml")
30 | heading = soup.select_one('h1').text
31 | para = []
32 | for p in soup.select('.complete-story p'):
33 | para.append(p.text)
34 | body = '\n'.join(para)
35 | writer.writerow({'Heading': heading,
36 | 'Body': body
37 | })
38 |
39 |
40 | if __name__ == '__main__':
41 | links = parse_sitemap()
42 | parse_articles(links)
43 |
--------------------------------------------------------------------------------
/javascript/rotating-proxies-javascript/rotating_proxies.js:
--------------------------------------------------------------------------------
1 | const csv = require('async-csv');
2 | const fs = require('fs').promises;
3 | const axios = require("axios");
4 |
5 | (async () => {
6 | // Read file from disk:
7 | const csvFile = await fs.readFile('proxy_list.csv');
8 |
9 | // Convert CSV string into rows:
10 | const data = await csv.parse(csvFile);
11 | await Promise.all(data.map(async (item) => {
12 | try {
13 |
14 |             // Build the proxy object from the current CSV row
15 |             // (assumes the first column holds the proxy as "host:port")
16 |             const [host, port] = item[0].split(':');
17 |             const proxy = {
18 |                 host: host,
19 |                 port: parseInt(port, 10)
20 |             }
21 |
22 |             // For proxies that need credentials, add an auth block, e.g.:
23 |             // proxy.auth = { username: 'USERNAME', password: 'PASSWORD' }
24 |
29 |
30 | // This URL returns the IP
31 | const url = `https://httpbin.org/ip`;
32 |
33 | // Call the GET method on the URL with proxy information
34 | const response = await axios.get(url, {
35 |                 proxy: proxy
36 | });
37 | // Print effective IP address
38 | console.log(response.data);
39 | } catch (err) {
40 |
41 | // Log failed proxy
42 | console.log('Proxy Failed: ' + item[0]);
43 | }
44 | }));
45 |
46 | })();
47 |
--------------------------------------------------------------------------------
/python/Python-Web-Scraping-Tutorial/wiki_toc.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import requests
3 | from bs4 import BeautifulSoup
5 |
6 |
7 | def get_data(url):
8 | response = requests.get(url)
9 | soup = BeautifulSoup(response.text, 'lxml')
10 | table_of_contents = soup.find("div", id="toc")
11 | headings = table_of_contents.find_all("li")
12 | data = []
13 | for heading in headings:
14 | heading_text = heading.find("span", class_="toctext").text
15 | heading_number = heading.find("span", class_="tocnumber").text
16 | data.append({
17 | 'heading_number': heading_number,
18 | 'heading_text': heading_text,
19 | })
20 | return data
21 |
22 |
23 | def export_data(data, file_name):
24 | with open(file_name, "w", newline="") as file:
25 | writer = csv.DictWriter(file, fieldnames=['heading_number', 'heading_text'])
26 | writer.writeheader()
27 | writer.writerows(data)
28 |
29 |
30 | def main():
31 | url_to_parse = "https://en.wikipedia.org/wiki/Python_(programming_language)"
32 | file_name = "python_toc.csv"
33 | data = get_data(url_to_parse)
34 | export_data(data, file_name)
35 |
36 | url_to_parse = "https://en.wikipedia.org/wiki/Web_scraping"
37 | file_name = "web_scraping_toc.csv"
38 | data = get_data(url_to_parse)
39 | export_data(data, file_name)
40 |
41 | print('Done')
42 |
43 |
44 | if __name__ == '__main__':
45 | main()
46 |
--------------------------------------------------------------------------------
/python/pandas-read-html-tables/src/population.html:
--------------------------------------------------------------------------------
1 | <!DOCTYPE html>
2 | <html lang="en">
3 | <head>
4 |     <meta charset="UTF-8">
5 |     <title>Document</title>
6 | </head>
7 | <body>
8 |     <table>
9 |         <thead>
10 |             <tr>
11 |                 <th>Sequence</th>
12 |                 <th>Country</th>
13 |                 <th>Population</th>
14 |                 <th>Updated</th>
15 |             </tr>
16 |         </thead>
17 |         <tbody>
18 |             <tr>
19 |                 <td>1</td>
20 |                 <td>China</td>
21 |                 <td>1,439,323,776</td>
22 |                 <td>1-Dec-2020</td>
23 |             </tr>
24 |             <tr>
25 |                 <td>2</td>
26 |                 <td>India</td>
27 |                 <td>1,380,004,385</td>
28 |                 <td>1-Dec-2020</td>
29 |             </tr>
30 |             <tr>
31 |                 <td>3</td>
32 |                 <td>United States</td>
33 |                 <td>331,002,651</td>
34 |                 <td>1-Dec-2020</td>
35 |             </tr>
36 |         </tbody>
37 |     </table>
38 | </body>
39 | </html>
--------------------------------------------------------------------------------
/python/building-scraping-pipeline-apache-airflow/oxylabs.py:
--------------------------------------------------------------------------------
1 | import requests
2 |
3 | JOB_STATUS_DONE = 'done'
4 |
5 | HTTP_NO_CONTENT = 204
6 |
7 |
8 | class Client:
9 | def __init__(self, username, password):
10 | self.username = username
11 | self.password = password
12 |
13 | def create_jobs(self, urls):
14 | payload = {
15 | 'source': 'universal_ecommerce',
16 | 'url': urls
17 | }
18 |
19 | response = requests.request(
20 | 'POST',
21 | 'https://data.oxylabs.io/v1/queries/batch',
22 | auth=(self.username, self.password),
23 | json=payload,
24 | )
25 |
26 | return response.json()
27 |
28 | def is_status_done(self, job_id):
29 | job_status_response = requests.request(
30 | method='GET',
31 |             url='https://data.oxylabs.io/v1/queries/%s' % job_id,
32 | auth=(self.username, self.password),
33 | )
34 |
35 | job_status_data = job_status_response.json()
36 |
37 | return job_status_data['status'] == JOB_STATUS_DONE
38 |
39 | def fetch_content_list(self, job_id):
40 | job_result_response = requests.request(
41 | method='GET',
42 |             url='https://data.oxylabs.io/v1/queries/%s/results' % job_id,
43 | auth=(self.username, self.password),
44 | )
45 | if job_result_response.status_code == HTTP_NO_CONTENT:
46 | return None
47 |
48 | job_results_json = job_result_response.json()
49 |
50 | return job_results_json['results']
--------------------------------------------------------------------------------
/python/how-to-build-a-price-tracker/tracker.py:
--------------------------------------------------------------------------------
1 | import smtplib
2 | import pandas as pd
3 | import requests
4 | from bs4 import BeautifulSoup
5 | from price_parser import Price
6 |
7 | PRODUCT_URL_CSV = "products.csv"
8 | SAVE_TO_CSV = True
9 | PRICES_CSV = "prices.csv"
10 | SEND_MAIL = True
11 |
12 | def get_urls(csv_file):
13 | df = pd.read_csv(csv_file)
14 | return df
15 |
16 | def get_response(url):
17 | response = requests.get(url)
18 | return response.text
19 |
20 | def get_price(html):
21 | soup = BeautifulSoup(html, "lxml")
22 | el = soup.select_one(".price_color")
23 | price = Price.fromstring(el.text)
24 | return price.amount_float
25 |
26 | def process_products(df):
27 | updated_products = []
28 | for product in df.to_dict("records"):
29 | html = get_response(product["url"])
30 | product["price"] = get_price(html)
31 | product["alert"] = product["price"] < product["alert_price"]
32 | updated_products.append(product)
33 | return pd.DataFrame(updated_products)
34 |
35 | def get_mail(df):
36 | subject = "Price Drop Alert"
37 | body = df[df["alert"]].to_string()
38 | subject_and_message = f"Subject:{subject}\n\n{body}"
39 | return subject_and_message
40 |
41 | def send_mail(df):
42 | message_text = get_mail(df)
43 | with smtplib.SMTP("smtp.server.address", 587) as smtp:
44 | smtp.starttls()
45 |         smtp.login(mail_user, mail_pass)  # mail_user, mail_pass, and mail_to must be set to your SMTP credentials
46 | smtp.sendmail(mail_user, mail_to, message_text)
47 |
48 | def main():
49 | df = get_urls(PRODUCT_URL_CSV)
50 | df_updated = process_products(df)
51 | if SAVE_TO_CSV:
52 | df_updated.to_csv(PRICES_CSV, index=False, mode="a")
53 | if SEND_MAIL:
54 | send_mail(df_updated)
55 |
56 |
57 | if __name__ == "__main__":
58 |     main()
--------------------------------------------------------------------------------
/python/news-scraping/README.md:
--------------------------------------------------------------------------------
1 | # News Scraping
2 |
3 | [playwright](https://github.com/topics/playwright) [Proxy](https://github.com/topics/Proxy)
4 |
5 | - [Fetch HTML Page](#fetch-html-page)
6 | - [Parsing HTML](#parsing-html)
7 | - [Extracting Text](#extracting-text)
8 |
9 | This article discusses everything you need to know about news scraping, including the benefits and use cases of news scraping as well as how you can use Python to create an article scraper.
10 |
11 | For a detailed explanation, see our [blog post](https://oxy.yt/YrD0).
12 |
13 |
14 |
15 | ## Fetch HTML Page
16 |
17 | ```shell
18 | pip3 install requests
19 | ```
20 |
21 | Create a new Python file and enter the following code:
22 |
23 | ```python
24 | import requests
25 | response = requests.get('https://quotes.toscrape.com')
26 |
27 | print(response.text) # Prints the entire HTML of the webpage.
28 | ```
29 |
30 | ## Parsing HTML
31 |
32 | ```shell
33 | pip3 install lxml beautifulsoup4
34 | ```
35 |
36 | ```python
37 | from bs4 import BeautifulSoup
38 | response = requests.get('https://quotes.toscrape.com')
39 | soup = BeautifulSoup(response.text, 'lxml')
40 |
41 | title = soup.find('title')
42 | ```
43 |
44 | ## Extracting Text
45 |
46 | ```python
47 | print(title.get_text()) # Prints page title.
48 | ```
49 |
50 | ### Fine Tuning
51 |
52 | ```python
53 | soup.find('small',itemprop="author")
54 | ```
55 |
56 | ```python
57 | soup.find('small',class_="author")
58 | ```
59 |
60 | ### Extracting Headlines
61 |
62 | ```python
63 | headlines = soup.find_all(itemprop="text")
64 |
65 | for headline in headlines:
66 | print(headline.get_text())
67 | ```
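
Putting the snippets above together, here is a minimal end-to-end sketch that fetches the demo page, parses it, and prints the page title followed by each extracted block of text:

```python
import requests
from bs4 import BeautifulSoup

# Fetch the page used throughout this tutorial
response = requests.get('https://quotes.toscrape.com')

# Parse the HTML
soup = BeautifulSoup(response.text, 'lxml')

# Print the page title
print(soup.find('title').get_text())

# Print every element marked up with itemprop="text"
for headline in soup.find_all(itemprop='text'):
    print(headline.get_text())
```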
68 |
69 |
70 |
71 | If you wish to find out more about News Scraping, see our [blog post](https://oxy.yt/YrD0).
72 |
--------------------------------------------------------------------------------
/python/playwright-web-scraping/node/package-lock.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "node",
3 | "lockfileVersion": 2,
4 | "requires": true,
5 | "packages": {
6 | "": {
7 | "dependencies": {
8 | "playwright": "^1.27.0"
9 | }
10 | },
11 | "node_modules/playwright": {
12 | "version": "1.27.0",
13 | "resolved": "https://registry.npmjs.org/playwright/-/playwright-1.27.0.tgz",
14 | "integrity": "sha512-F+0+0RD03LS+KdNAMMp63OBzu+NwYYLd52pKLczuSlTsV5b/SLkUoNhSfzDFngEFOuRL2gk0LlfGW3mKiUBk6w==",
15 | "hasInstallScript": true,
16 | "dependencies": {
17 | "playwright-core": "1.27.0"
18 | },
19 | "bin": {
20 | "playwright": "cli.js"
21 | },
22 | "engines": {
23 | "node": ">=14"
24 | }
25 | },
26 | "node_modules/playwright-core": {
27 | "version": "1.27.0",
28 | "resolved": "https://registry.npmjs.org/playwright-core/-/playwright-core-1.27.0.tgz",
29 | "integrity": "sha512-VBKaaFUVKDo3akW+o4DwbK1ZyXh46tcSwQKPK3lruh8IJd5feu55XVZx4vOkbb2uqrNdIF51sgsadYT533SdpA==",
30 | "bin": {
31 | "playwright": "cli.js"
32 | },
33 | "engines": {
34 | "node": ">=14"
35 | }
36 | }
37 | },
38 | "dependencies": {
39 | "playwright": {
40 | "version": "1.27.0",
41 | "resolved": "https://registry.npmjs.org/playwright/-/playwright-1.27.0.tgz",
42 | "integrity": "sha512-F+0+0RD03LS+KdNAMMp63OBzu+NwYYLd52pKLczuSlTsV5b/SLkUoNhSfzDFngEFOuRL2gk0LlfGW3mKiUBk6w==",
43 | "requires": {
44 | "playwright-core": "1.27.0"
45 | }
46 | },
47 | "playwright-core": {
48 | "version": "1.27.0",
49 | "resolved": "https://registry.npmjs.org/playwright-core/-/playwright-core-1.27.0.tgz",
50 | "integrity": "sha512-VBKaaFUVKDo3akW+o4DwbK1ZyXh46tcSwQKPK3lruh8IJd5feu55XVZx4vOkbb2uqrNdIF51sgsadYT533SdpA=="
51 | }
52 | }
53 | }
54 |
--------------------------------------------------------------------------------
/python/scrape-images-from-website/img-scraper.py:
--------------------------------------------------------------------------------
1 | import io
2 | import pathlib
3 | import hashlib
4 | import pandas as pd
5 | import requests
6 | from bs4 import BeautifulSoup
7 | from PIL import Image
8 | from selenium import webdriver
9 |
10 |
11 | def get_content_from_url(url):
12 | driver = webdriver.Chrome() # add "executable_path=" if driver not in running directory
13 | driver.get(url)
14 | driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
15 | page_content = driver.page_source
16 | driver.quit() # We do not need the browser instance for further steps.
17 | return page_content
18 |
19 |
20 | def parse_image_urls(content, classes, location, source):
21 |     soup = BeautifulSoup(content, "html.parser")
22 | results = []
23 | for a in soup.findAll(attrs={"class": classes}):
24 |         name = a.find(location)
25 |         if name.get(source) not in results:
26 |             results.append(name.get(source))
27 | return results
28 |
29 |
30 | def save_urls_to_csv(image_urls):
31 | df = pd.DataFrame({"links": image_urls})
32 | df.to_csv("links.csv", index=False, encoding="utf-8")
33 |
34 |
35 | def get_and_save_image_to_file(image_url, output_dir):
36 | response = requests.get(image_url, headers={"User-agent": "Mozilla/5.0"})
37 | image_content = response.content
38 | image_file = io.BytesIO(image_content)
39 | image = Image.open(image_file).convert("RGB")
40 | filename = hashlib.sha1(image_content).hexdigest()[:10] + ".png"
41 | file_path = output_dir / filename
42 | image.save(file_path, "PNG", quality=80)
43 |
44 |
45 | def main():
46 | url = "https://your.url/here?yes=brilliant"
47 | content = get_content_from_url(url)
48 | image_urls = parse_image_urls(
49 | content=content, classes="blog-card__link", location="img", source="src",
50 | )
51 | save_urls_to_csv(image_urls)
52 |
53 | for image_url in image_urls:
54 | get_and_save_image_to_file(
55 | image_url, output_dir=pathlib.Path("nix/path/to/test"),
56 | )
57 |
58 |
59 | if __name__ == "__main__":  # only executes when the script is run directly
60 | main()
--------------------------------------------------------------------------------
/python/Web-Scraping-With-Selenium/books_selenium.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from selenium.webdriver import Chrome, ChromeOptions
3 | from selenium.webdriver.common.by import By
4 | from selenium.webdriver.support.ui import WebDriverWait
5 | from selenium.webdriver.support import expected_conditions as EC
6 | from selenium.webdriver.common.keys import Keys
7 |
8 | CHROME_DRIVER_PATH = 'c:/WebDrivers/chromedriver.exe'
9 | HOMEPAGE = "http://books.toscrape.com"
10 |
11 |
12 | def get_data(url, categories):
13 | browser_options = ChromeOptions()
14 | browser_options.headless = True
15 |
16 | driver = Chrome(executable_path=CHROME_DRIVER_PATH, options=browser_options)
17 | driver.get(url)
18 | driver.implicitly_wait(10)
19 | data = []
20 | for category in categories:
21 |         humor = driver.find_element(By.XPATH, f"//a[contains(text(),'{category}')]")
22 | humor.click()
23 |
24 | try:
25 | books = WebDriverWait(driver, 10).until(
26 | EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.product_pod'))
27 | )
28 | except Exception as e:
29 | raise e
30 |
31 | for book in books:
32 |             title = book.find_element(By.CSS_SELECTOR, "h3 > a")
33 |             price = book.find_element(By.CSS_SELECTOR, ".price_color")
34 |             stock = book.find_element(By.CSS_SELECTOR, ".instock.availability")
35 | data.append({
36 | 'title': title.get_attribute("title"),
37 | 'price': price.text,
38 | 'stock': stock.text,
39 | 'Category': category
40 | })
41 |
42 | driver.get(url)
43 |
44 | driver.quit()
45 | return data
46 |
47 |
48 | def export_csv(data):
49 | df = pd.DataFrame(data)
50 | # Apply transformations if needed
51 | df.to_csv("books_exported.csv", index=False)
52 | print(df) # DEBUG
53 |
54 |
55 | def main():
56 | data = get_data(url=HOMEPAGE, categories=["Humor", "Art"])
57 | export_csv(data)
58 | print('DONE')
59 |
60 |
61 | if __name__ == '__main__':
62 | main()
63 |
--------------------------------------------------------------------------------
/python/building-scraping-pipeline-apache-airflow/DAG/scrape.py:
--------------------------------------------------------------------------------
1 | from datetime import timedelta
2 |
3 | import pendulum
4 | from airflow import DAG
5 | from airflow.operators.bash import BashOperator
6 | from airflow.operators.python import ShortCircuitOperator
7 |
8 | default_args = {
9 | 'owner': 'airflow',
10 | 'depends_on_past': True,
11 | 'retries': 2,
12 | 'retry_delay': timedelta(hours=3),
13 | }
14 | with DAG(
15 | 'scrape',
16 | default_args=default_args,
17 | schedule_interval='* * * * *',
18 | description='Scrape the website',
19 | start_date=pendulum.datetime(2022, 5, 1, tz='UTC'),
20 | dagrun_timeout=timedelta(minutes=1),
21 | tags=['scrape', 'oxylabs', 'push', 'pull'],
22 | catchup=False
23 | ) as dag:
24 | trigger_always = ShortCircuitOperator(
25 | task_id='always',
26 | python_callable=lambda prev_start_date_success: prev_start_date_success is not None,
27 | provide_context=True,
28 | dag=dag
29 | )
30 |
31 | trigger_once = ShortCircuitOperator(
32 | task_id='once',
33 | python_callable=lambda prev_start_date_success: prev_start_date_success is None,
34 | provide_context=True,
35 | dag=dag
36 | )
37 |
38 | setup_task = BashOperator(
39 | task_id='setup',
40 | bash_command='python /opt/airflow/src/setup.py',
41 | )
42 |
43 | trigger_once.set_downstream(setup_task)
44 | def is_midnight(logical_date):
45 | return logical_date.hour == 0 and logical_date.minute == 0
46 |
47 | trigger_once_per_day = ShortCircuitOperator(
48 | task_id='once_per_day',
49 | python_callable=is_midnight,
50 | provide_context=True,
51 | dag=dag
52 | )
53 |
54 | task_push = BashOperator(
55 | task_id='push',
56 | bash_command='python /opt/airflow/src/pusher.py',
57 | )
58 | trigger_once_per_day.set_downstream(task_push)
59 |
60 | task_pull = BashOperator(
61 | task_id='pull',
62 | bash_command='python /opt/airflow/src/puller.py'
63 | )
64 |
65 | trigger_always.set_downstream(task_pull)
66 | trigger_always.set_downstream(trigger_once_per_day)
67 |
--------------------------------------------------------------------------------
/javascript/puppeteer-on-aws-lambda/README.md:
--------------------------------------------------------------------------------
1 | # Puppeteer on AWS Lambda
2 |
3 | ## Problem #1 – Puppeteer is too big to push to Lambda
4 |
5 | AWS Lambda has a 50 MB limit on the zip file you push directly to it. Because it installs Chromium, the Puppeteer package is significantly larger than that. However, this 50 MB limit doesn’t apply when you load the function from S3! See the documentation [here](https://docs.aws.amazon.com/lambda/latest/dg/gettingstarted-limits.html).
6 |
7 | AWS Lambda quotas can be tight for Puppeteer:
8 |
9 | 
10 |
11 | The 250 MB unzipped limit can be bypassed by uploading directly from an S3 bucket. So I create a bucket in S3, use a Node script to upload the archive to S3, and then update my Lambda code from that bucket. The npm scripts look something like this:
12 |
13 | ```json
14 | "zip": "npm run build && 7z a -r function.zip ./dist/* node_modules/",
15 | "sendToLambda": "npm run zip && aws s3 cp function.zip s3://chrome-aws && rm function.zip && aws lambda update-function-code --function-name puppeteer-examples --s3-bucket chrome-aws --s3-key function.zip"
16 | ```
17 |
18 | ## Problem #2 – Puppeteer on AWS Lambda doesn’t work
19 |
20 | By default, Linux (including AWS Lambda) doesn’t include the necessary libraries required to allow Puppeteer to function.
21 |
22 | Fortunately, there already exists a package of Chromium built for AWS Lambda. You can find it [here](https://www.npmjs.com/package/chrome-aws-lambda). You will need to install it, along with puppeteer-core, in the function that you are sending to Lambda.
23 |
24 | The regular Puppeteer package will not be needed and, in fact, counts against your 250 MB limit.
25 |
26 | ```bash
27 | npm i --save chrome-aws-lambda puppeteer-core
28 | ```
29 |
30 | And then, when you are setting it up to launch a browser from Puppeteer, it will look like this:
31 |
32 | ```javascript
33 | const browser = await chromium.puppeteer
34 | .launch({
35 | args: chromium.args,
36 | defaultViewport: chromium.defaultViewport,
37 | executablePath: await chromium.executablePath,
38 | headless: chromium.headless
39 | });
40 | ```
41 |
42 | ## Final note
43 |
44 | Puppeteer requires more memory than a regular script, so keep an eye on your max memory usage. When using Puppeteer, I recommend at least 512 MB on your AWS Lambda function.
45 | Also, don’t forget to run `await browser.close()` at the end of your script. Otherwise, you may end up with your function running until timeout for no reason because the browser is still alive and waiting for commands.
46 |
--------------------------------------------------------------------------------
/python/building-scraping-pipeline-apache-airflow/messenger.py:
--------------------------------------------------------------------------------
1 | import atexit
2 |
3 | import psycopg2.extras
4 |
5 | STATUS_PENDING = 'pending'
6 | STATUS_COMPLETE = 'complete'
7 | STATUS_DELETED = 'deleted'
8 |
9 |
10 | class Queue:
11 | def __init__(self, connection):
12 | self.connection = connection
13 |
14 | atexit.register(self.cleanup)
15 |
16 | def setup(self):
17 | cursor = self.connection.cursor()
18 |
19 | cursor.execute('''
20 | select table_name
21 | from information_schema.tables
22 | where table_schema='public'
23 | and table_type='BASE TABLE'
24 | ''')
25 | for cursor_result in cursor:
26 | if cursor_result[0] == 'queue':
27 | print('Table already exists')
28 | return False
29 |
30 | cursor.execute('''
31 | create sequence queue_seq;
32 |
33 | create table queue (
34 | id int check (id > 0) primary key default nextval ('queue_seq'),
35 | created_at timestamp(0) not null DEFAULT CURRENT_TIMESTAMP,
36 | updated_at timestamp(0) not null DEFAULT CURRENT_TIMESTAMP,
37 | status varchar(255) not null DEFAULT 'pending',
38 | job_id varchar(255)
39 | )
40 | ''')
41 |
42 | return True
43 |
44 | def push(self, job_id):
45 | self.__execute_and_commit(
46 | 'insert into queue (job_id) values (%s)',
47 | [job_id]
48 | )
49 |
50 | def pull(self):
51 | cursor = self.connection.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
52 |
53 | cursor.execute('start transaction')
54 | cursor.execute(
55 | '''
56 | select * from queue where status = %s and
57 | updated_at < now() - interval '10 second'
58 | order by random()
59 | limit 1
60 | for update
61 | ''',
62 | [STATUS_PENDING]
63 | )
64 | return cursor.fetchone()
65 |
66 | def delete(self, job_id):
67 | self.__change_status(job_id, STATUS_DELETED)
68 |
69 | def complete(self, job_id):
70 | self.__change_status(job_id, STATUS_COMPLETE)
71 |
72 | def touch(self, job_id):
73 | self.__execute_and_commit(
74 | 'update queue set updated_at = now() where job_id = %s',
75 | [job_id]
76 | )
77 |
78 | def __change_status(self, job_id, status):
79 | self.__execute_and_commit(
80 | 'update queue set status = %s where job_id = %s',
81 | [status, job_id]
82 | )
83 |
84 | def __execute_and_commit(self, sql, val):
85 | cursor = self.connection.cursor()
86 | cursor.execute(sql, val)
87 |
88 | self.connection.commit()
89 |
90 | def cleanup(self):
91 | self.connection.commit()
--------------------------------------------------------------------------------
/r/web-scraping-r/README.md:
--------------------------------------------------------------------------------
1 | # Web Scraping With R
2 |
3 | [r](https://github.com/topics/r) [web-scraping](https://github.com/topics/web-scraping)
4 |
5 | - [Installing requirements](#installing-requirements)
6 | - [Web scraping with rvest](#web-scraping-with-rvest)
7 | - [Web scraping with RSelenium](#web-scraping-with-rselenium)
8 |
9 |
10 | This tutorial covers the basics of web scraping with R. We’ll begin with the scraping of static pages and shift the focus to the techniques that can be used for scraping data from dynamic websites that use JavaScript to render the content.
11 |
12 | For a detailed explanation, see [this blog post](https://oxy.yt/1r8m).
13 |
14 | ## Installing requirements
15 |
16 | For macOS, run the following:
17 |
18 | ```shell
19 | brew install r
20 | brew install --cask r-studio
21 |
22 | ```
23 |
24 | For Windows, run the following:
25 |
26 | ```batch
27 | choco install r.project
28 | choco install r.studio
29 | ```
30 |
31 | ### Installing required libraries
32 |
33 | ```R
34 | install.packages("rvest")
35 | install.packages("dplyr")
36 | ```
37 |
38 | ## Web scraping with rvest
39 |
40 | ```R
41 | library(rvest)
42 | link = "https://en.wikipedia.org/wiki/List_of_ISO_3166_country_codes"
43 | page = read_html(link)
44 |
45 | ```
46 |
47 | ### Parsing HTML Content
48 |
49 | ```R
50 | page %>% html_elements(css="")
51 | page %>% html_elements(xpath="")
52 | ```
53 |
54 |
55 |
56 | 
57 |
58 | For the above page, use the following:
59 |
60 | ```R
61 | htmlElement <- page %>% html_element("table.sortable")
62 | ```
63 |
64 | ### Saving data to a data frame
65 |
66 | ```R
67 | df <- html_table(htmlElement, header = FALSE)
68 | names(df) <- df[2,]
69 | df = df[-1:-2,]
70 | ```
71 |
72 | ### Exporting data frame to a CSV file
73 |
74 | ```R
75 | write.csv(df, "iso_codes.csv")
76 | ```
77 |
78 | ### Downloading Images
79 |
80 | ```R
81 | page <- read_html(url)
82 | image_element <- page %>% html_element(".thumbborder")
83 | image_url <- image_element %>% html_attr("src")
84 | download.file(image_url, destfile = basename("paris.jpg"))
85 | ```
86 |
87 | ### Scrape Dynamic Pages with Rvest
88 |
89 | Find the API endpoint and use it as follows (GET() and timeout() come from the httr package):
90 | ```R
91 | page <- read_html(GET(api_url, timeout(10)))
92 | jsontext <- page %>% html_element("p") %>% html_text()
93 | ```
94 | For a complete example, see [dynamic_rvest.R](src/dynamic_rvest.R).
95 |
96 | ## Web scraping with RSelenium
97 |
98 | ```R
99 | install.packages("RSelenium")
100 | library(RSelenium)
101 |
102 | ```
103 |
104 | ### Starting Selenium
105 |
106 | #### Method 1
107 |
108 | ```R
109 | # Method 1
110 | rD <- rsDriver(browser="chrome", port=9515L, verbose=FALSE)
111 | remDr <- rD[["client"]]
112 |
113 | ```
114 |
115 | #### Method 2
116 |
117 | ```shell
118 | docker run -d -p 4445:4444 selenium/standalone-firefox
119 | ```
120 |
121 | ```R
122 | remDr <- remoteDriver(
123 | remoteServerAddr = "localhost",
124 | port = 4445L,
125 | browserName = "firefox"
126 | )
127 | remDr$open()
128 | ```
129 |
130 | ### Working with elements in Selenium
131 |
132 | ```R
133 | remDr$navigate("https://books.toscrape.com/catalogue/category/books/science-fiction_16")
134 | ```
135 |
136 | 
137 |
138 | ```R
139 | titleElements <- remDr$findElements(using = "xpath", "//article//img")
140 | titles <- sapply(titleElements, function(x){x$getElementAttribute("alt")[[1]]})
141 |
142 | pricesElements <- remDr$findElements(using = "xpath", "//*[@class='price_color']")
143 | prices <- sapply(pricesElements, function(x){x$getElementText()[[1]]})
144 |
145 | stockElements <- remDr$findElements(using = "xpath", "//*[@class='instock availability']")
146 | stocks <- sapply(stockElements, function(x){x$getElementText()[[1]]})
147 |
148 | ```
149 |
150 | ### Creating a data frame
151 |
152 | ```R
153 | df <- data.frame(titles, prices, stocks)
154 | ```
155 |
156 | #### Save CSV
157 |
158 | ```R
159 | write.csv(df, "books.csv")
160 | ```
161 |
162 | If you wish to find out more about web scraping with R, see our [blog post](https://oxy.yt/1r8m).
163 |
--------------------------------------------------------------------------------
/python/regex-web-scraping/README.md:
--------------------------------------------------------------------------------
1 | # Web Scraping With RegEx
2 |
3 | ## Creating a virtual environment
4 |
5 | ```bash
6 | python3 -m venv scrapingdemo
7 | ```
8 |
9 | ```bash
10 | source ./scrapingdemo/bin/activate
11 | ```
12 |
13 | ## Installing requirements
14 |
15 | ```bash
16 | pip install requests
17 | ```
18 |
19 | ```bash
20 | pip install beautifulsoup4
21 | ```
22 |
23 | ## Importing the required libraries
24 |
25 | ```python
26 | import requests
27 | from bs4 import BeautifulSoup
28 | import re
29 | ```
30 |
31 | ## Sending the GET request
32 |
33 | Use the Requests library to send a request to a web page from which you want to scrape the data. In this case, https://books.toscrape.com/. To commence, enter the following:
34 |
35 | ```python
36 | page = requests.get('https://books.toscrape.com/')
37 | ```
38 |
39 | ## Selecting data
40 |
41 | First, create a Beautiful Soup object and pass the page content received from your request during initialization, along with the parser type. As you’re working with HTML, use `html.parser` as the parser type.
42 |
43 | 
44 |
45 | By inspecting the elements (right-click and select inspect element) in a browser, you can see that each book title and price are presented inside an `article` element with the class called `product_pod`. Use Beautiful Soup to get all the data inside these elements, and then convert it to a string:
46 |
47 | ```python
48 | soup = BeautifulSoup(page.content, 'html.parser')
49 | content = soup.find_all(class_='product_pod')
50 | content = str(content)
51 | ```
52 |
53 | ## Processing the data using RegEx
54 |
55 | Since the acquired content has a lot of unnecessary data, create two regular expressions to get only the desired data.
56 |
57 | 
58 |
59 | ### Expression # 1
60 | ### Finding the pattern
61 |
62 | First, inspect the title of the book to find the pattern. You can see above that every title is present after the text `title=` in the format `title="Titlename"`.
63 |
64 | ### Generating the expression
65 |
66 | Then, create an expression that returns the data inside quotations after the `title=` by specifying `"(.*?)"`.
67 |
68 | The first expression is as follows:
69 |
70 | ```python
71 | re_titles = r'title="(.*?)">'
72 | ```
73 |
74 | ### Expression # 2
75 | ### Finding the pattern
76 |
77 | First, inspect the price of the book. Every price is present after the `£` symbol, in the format `£price`, and before the closing paragraph tag `</p>`.
78 |
79 | ### Generating the expression
80 |
81 | Then, create an expression that returns the data after the `£` and before the closing `</p>` by specifying `£(.*?)</p>`.
82 |
83 | The second expression is as follows:
84 |
85 | ```python
86 | re_prices = '£(.*?)</p>'
87 | ```
88 |
89 | To conclude, use the expressions with `re.findall` to find the substrings matching the patterns. Lastly, save them in the variables `title_list` and `price_list`.
90 |
91 | ```python
92 | titles_list = re.findall(re_titles, content)
93 | price_list = re.findall(re_prices, content)
94 | ```
95 |
96 | ## Saving the output
97 |
98 | To save the output, loop over the pairs for the titles and prices and write them to the `output.txt` file.
99 |
100 | ```python
101 | with open("output.txt", "w") as f:
102 | for title, price in zip(titles_list, price_list):
103 | f.write(title + "\t" + price + "\n")
104 | ```
105 |
106 | 
107 |
108 | Putting everything together, this is the complete code that can be run by calling `python demo.py`:
109 |
110 | ```python
111 | # Importing the required libraries.
112 | import requests
113 | from bs4 import BeautifulSoup
114 | import re
115 |
116 | # Requesting the HTML from the web page.
117 | page = requests.get("https://books.toscrape.com/")
118 |
119 | # Selecting the data.
120 | soup = BeautifulSoup(page.content, "html.parser")
121 | content = soup.find_all(class_="product_pod")
122 | content = str(content)
123 |
124 | # Processing the data using Regular Expressions.
125 | re_titles = r'title="(.*?)">'
126 | titles_list = re.findall(re_titles, content)
127 | re_prices = "£(.*?)</p>"
128 | price_list = re.findall(re_prices, content)
129 |
130 | # Saving the output.
131 | with open("output.txt", "w") as f:
132 | for title, price in zip(titles_list, price_list):
133 | f.write(title + "\t" + price + "\n")
134 |
135 | ```
136 |
--------------------------------------------------------------------------------
/golang/golang-web-scraper/src/go.sum:
--------------------------------------------------------------------------------
1 | github.com/PuerkitoBio/goquery v1.8.0 h1:PJTF7AmFCFKk1N6V6jmKfrNH9tV5pNE6lZMkG0gta/U=
2 | github.com/PuerkitoBio/goquery v1.8.0/go.mod h1:ypIiRMtY7COPGk+I/YbZLbxsxn9g5ejnI2HSMtkjZvI=
3 | github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x004T2c=
4 | github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA=
5 | github.com/antchfx/htmlquery v1.2.5 h1:1lXnx46/1wtv1E/kzmH8vrfMuUKYgkdDBA9pIdMJnk4=
6 | github.com/antchfx/htmlquery v1.2.5/go.mod h1:2MCVBzYVafPBmKbrmwB9F5xdd+IEgRY61ci2oOsOQVw=
7 | github.com/antchfx/xmlquery v1.3.12 h1:6TMGpdjpO/P8VhjnaYPXuqT3qyJ/VsqoyNTmJzNBTQ4=
8 | github.com/antchfx/xmlquery v1.3.12/go.mod h1:3w2RvQvTz+DaT5fSgsELkSJcdNgkmg6vuXDEuhdwsPQ=
9 | github.com/antchfx/xpath v1.2.1 h1:qhp4EW6aCOVr5XIkT+l6LJ9ck/JsUH/yyauNgTQkBF8=
10 | github.com/antchfx/xpath v1.2.1/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs=
11 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
12 | github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y=
13 | github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8=
14 | github.com/gocolly/colly v1.2.0 h1:qRz9YAn8FIH0qzgNUw+HT9UN7wm1oF9OBAilwEWpyrI=
15 | github.com/gocolly/colly v1.2.0/go.mod h1:Hof5T3ZswNVsOHYmba1u03W65HDWgpV5HifSuueE0EA=
16 | github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e h1:1r7pUrabqp18hOBcwBwiTsbnFeTZHV9eER/QT5JVZxY=
17 | github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
18 | github.com/golang/protobuf v1.3.1 h1:YF8+flBXS5eO826T4nzqPrxfhQThhXl0YzfuUPu4SBg=
19 | github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
20 | github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o=
21 | github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak=
22 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
23 | github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca h1:NugYot0LIVPxTvN8n+Kvkn6TrbMyxQiuvKdEwFdR9vI=
24 | github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU=
25 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
26 | github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
27 | github.com/temoto/robotstxt v1.1.2 h1:W2pOjSJ6SWvldyEuiFXNxz3xZ8aiWX5LbfDiOFd7Fxg=
28 | github.com/temoto/robotstxt v1.1.2/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo=
29 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
30 | golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks=
31 | golang.org/x/net v0.0.0-20200421231249-e086a090c8fd/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
32 | golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
33 | golang.org/x/net v0.0.0-20220127200216-cd36cc0744dd/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk=
34 | golang.org/x/net v0.0.0-20221004154528-8021a29435af h1:wv66FM3rLZGPdxpYL+ApnDe2HzHcTFta3z5nsc13wI4=
35 | golang.org/x/net v0.0.0-20221004154528-8021a29435af/go.mod h1:YDH+HFinaLZZlnHAfSS6ZXJJ9M9t4Dl22yv3iI2vPwk=
36 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
37 | golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
38 | golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
39 | golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
40 | golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
41 | golang.org/x/sys v0.0.0-20211216021012-1d35b9e2eb4e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
42 | golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
43 | golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
44 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
45 | golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk=
46 | golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
47 | golang.org/x/text v0.3.7 h1:olpwvP2KacW1ZWvsR7uQhoyTYvKAupfQrRGBFM352Gk=
48 | golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
49 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
50 | google.golang.org/appengine v1.6.7 h1:FZR1q0exgwxzPzp/aF+VccGrSfxfPpkBqjIIEq3ru6c=
51 | google.golang.org/appengine v1.6.7/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc=
52 |
--------------------------------------------------------------------------------
/golang/golang-web-scraper/README.md:
--------------------------------------------------------------------------------
1 | # Building a Web Scraper in Golang
2 |
3 | [go](https://github.com/topics/go) [web-scraping](https://github.com/topics/web-scraping)
4 | - [Installing Go](#installing-go)
5 | - [Parsing HTML with Colly](#parsing-html-with-colly)
6 | - [Handling pagination](#handling-pagination)
7 | - [Writing data to a CSV file](#writing-data-to-a-csv-file)
8 |
9 | Web scraping is an automated process of data extraction from a website. As a tool, a web scraper collects and exports data to a more usable format (JSON, CSV) for further analysis. Building a scraper could be complicated, requiring guidance and practical examples. A vast majority of web scraping tutorials concentrate on the most popular scraping languages, such as JavaScript, PHP, and, more often than not – Python. This time let’s take a look at Golang.
10 |
11 | Golang, or Go, is designed to leverage the static typing and run-time efficiency of C with the usability of Python and JavaScript, plus added features such as high-performance networking and multiprocessing. It’s also compiled and excels at concurrency, making it quick.
12 |
13 | This article will guide you through the process of writing a fast and efficient Golang web scraper.
14 |
15 | For a detailed explanation, [see this blog post](https://oxy.yt/IrPZ).
16 |
17 | ## Installing Go
18 |
19 | ```shell
20 | # macOS
21 | brew install go
22 |
23 | # Windows
24 | choco install golang
25 | ```
26 |
27 | ## Parsing HTML with Colly
28 |
29 | ```shell
30 | go mod init oxylabs.io/web-scraping-with-go
31 | go get github.com/gocolly/colly
32 |
33 | ```
34 |
35 |
36 |
37 | ```go
38 | //books.go
39 |
40 | package main
41 |
42 | import (
43 | "encoding/csv"
44 | "fmt"
45 | "log"
46 | "os"
47 |
48 | "github.com/gocolly/colly"
49 | )
50 | func main() {
51 | // Scraping code here
52 | fmt.Println("Done")
53 | }
54 | ```
55 |
56 | ### Sending HTTP requests with Colly
57 |
58 |
59 |
60 | ```go
61 | c := colly.NewCollector(
62 | colly.AllowedDomains("books.toscrape.com"),
63 | )
64 | c.OnRequest(func(r *colly.Request) {
65 | fmt.Println("Visiting", r.URL)
66 | })
67 | c.OnResponse(func(r *colly.Response) {
68 | fmt.Println(r.StatusCode)
69 | })
70 | ```
71 |
72 | ### Locating HTML elements via CSS selector
73 |
74 | ```go
75 | func main() {
76 | c := colly.NewCollector(
77 | colly.AllowedDomains("books.toscrape.com"),
78 | )
79 |
80 | c.OnHTML("title", func(e *colly.HTMLElement) {
81 | fmt.Println(e.Text)
82 | })
83 |
84 | c.OnResponse(func(r *colly.Response) {
85 | fmt.Println(r.StatusCode)
86 | })
87 |
88 | c.OnRequest(func(r *colly.Request) {
89 | fmt.Println("Visiting", r.URL)
90 | })
91 |
92 | c.Visit("https://books.toscrape.com/")
93 | }
94 | ```
95 |
96 | ### Extracting the HTML elements
97 |
98 | 
99 |
100 | ```go
101 | type Book struct {
102 | Title string
103 | Price string
104 | }
105 | c.OnHTML(".product_pod", func(e *colly.HTMLElement) {
106 | book := Book{}
107 | book.Title = e.ChildAttr(".image_container img", "alt")
108 | book.Price = e.ChildText(".price_color")
109 | fmt.Println(book.Title, book.Price)
110 | })
111 | ```
112 |
113 | ## Handling pagination
114 |
115 | ```go
116 | c.OnHTML(".next > a", func(e *colly.HTMLElement) {
117 | nextPage := e.Request.AbsoluteURL(e.Attr("href"))
118 | c.Visit(nextPage)
119 | })
120 | ```
121 |
122 | ## Writing data to a CSV file
123 |
124 | ```go
125 | func crawl() {
126 | file, err := os.Create("export2.csv")
127 | if err != nil {
128 | log.Fatal(err)
129 | }
130 | defer file.Close()
131 | writer := csv.NewWriter(file)
132 | defer writer.Flush()
133 | headers := []string{"Title", "Price"}
134 | writer.Write(headers)
135 |
136 | c := colly.NewCollector(
137 | colly.AllowedDomains("books.toscrape.com"),
138 | )
139 |
140 | c.OnRequest(func(r *colly.Request) {
141 | fmt.Println("Visiting: ", r.URL.String())
142 | })
143 |
144 | c.OnHTML(".next > a", func(e *colly.HTMLElement) {
145 | nextPage := e.Request.AbsoluteURL(e.Attr("href"))
146 | c.Visit(nextPage)
147 | })
148 |
149 | c.OnHTML(".product_pod", func(e *colly.HTMLElement) {
150 | book := Book{}
151 | book.Title = e.ChildAttr(".image_container img", "alt")
152 | book.Price = e.ChildText(".price_color")
153 | row := []string{book.Title, book.Price}
154 | writer.Write(row)
155 | })
156 |
157 | startUrl := fmt.Sprintf("https://books.toscrape.com/")
158 | c.Visit(startUrl)
159 | }
160 |
161 | ```
162 |
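To run the crawler end to end, `crawl()` needs to be invoked from `main()`. A minimal sketch, assuming the `crawl()` function and `Book` struct shown above live in the same `books.go` file:

```go
func main() {
	// Kick off the crawl defined above; it visits every catalogue page
	// and writes the scraped rows to export2.csv.
	crawl()
	fmt.Println("Done")
}
```
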
163 | #### Run the file
164 |
165 | ```shell
166 | go run books.go
167 | ```
168 |
169 |
170 |
171 | If you wish to find out more about web scraping with Go, see our [blog post](https://oxy.yt/IrPZ).
172 |
--------------------------------------------------------------------------------
/python/playwright-web-scraping/README.md:
--------------------------------------------------------------------------------
1 | # Web Scraping With Playwright
2 |
3 | [
](https://github.com/topics/playwright) [
](https://github.com/topics/web-scraping)
4 |
5 | - [Support for proxies in Playwright](#support-for-proxies-in-playwright)
6 | - [Basic scraping with Playwright](#basic-scraping-with-playwright)
7 | - [Web Scraping](#web-scraping)
8 |
9 | This article covers web scraping with Playwright: configuring proxies, launching a browser, and extracting data from a dynamic website using both Node.js and Python.
10 |
11 | For a detailed explanation, see our [blog post](https://oxy.yt/erHw).
12 |
13 |
14 | ## Support for proxies in Playwright
15 |
16 | #### Without Proxy
17 |
18 | ```javascript
19 |
20 | // Node.js
21 |
22 | const { chromium } = require('playwright');
23 | const browser = await chromium.launch();
24 | ```
25 |
26 |
27 |
28 | ```python
29 |
30 | # Python
31 |
32 | from playwright.async_api import async_playwright
33 | import asyncio
34 | async with async_playwright() as p:
35 | browser = await p.chromium.launch()
36 | ```
37 |
38 | #### With Proxy
39 |
40 | ```javascript
41 | // Node.js
42 | const launchOptions = {
43 | proxy: {
44 |         server: '123.123.123.123:80'
45 | },
46 | headless: false
47 | }
48 | const browser = await chromium.launch(launchOptions);
49 | ```
50 |
51 |
52 |
53 | ```python
54 | # Python
55 | proxy_to_use = {
56 | 'server': '123.123.123.123:80'
57 | }
58 | browser = await p.chromium.launch(proxy=proxy_to_use, headless=False)
59 | ```
60 |
61 | ## Basic scraping with Playwright
62 |
63 | ### Node.js
64 |
65 | ```shell
66 | npm init -y
67 | npm install playwright
68 | ```
69 |
70 | ```javascript
71 | const playwright = require('playwright');
72 | (async () => {
73 | const browser = await playwright.chromium.launch({
74 | headless: false // Show the browser.
75 | });
76 |
77 | const page = await browser.newPage();
78 | await page.goto('https://books.toscrape.com/');
79 |   await page.waitForTimeout(1000); // wait for 1 second
80 | await browser.close();
81 | })();
82 | ```
83 |
84 | ### Python
85 |
86 | ```shell
87 | pip install playwright
playwright install  # downloads the browser binaries Playwright controls
88 | ```
89 |
90 |
91 |
92 | ```python
93 | from playwright.async_api import async_playwright
94 | import asyncio
95 |
96 | async def main():
97 | async with async_playwright() as pw:
98 | browser = await pw.chromium.launch(
99 | headless=False # Show the browser
100 | )
101 | page = await browser.new_page()
102 | await page.goto('https://books.toscrape.com/')
103 | # Data Extraction Code Here
104 | await page.wait_for_timeout(1000) # Wait for 1 second
105 | await browser.close()
106 |
107 | if __name__ == '__main__':
108 | asyncio.run(main())
109 | ```
110 |
111 | ## Web Scraping
112 |
113 |
114 |
115 | 
116 |
117 | #### Node.js
118 |
119 | ```javascript
120 | const playwright = require('playwright');
121 |
122 | (async () => {
123 | const browser = await playwright.chromium.launch();
124 | const page = await browser.newPage();
125 | await page.goto('https://books.toscrape.com/');
126 | const books = await page.$$eval('.product_pod', all_items => {
127 | const data = [];
128 | all_items.forEach(book => {
129 | const name = book.querySelector('h3').innerText;
130 | const price = book.querySelector('.price_color').innerText;
131 | const stock = book.querySelector('.availability').innerText;
132 | data.push({ name, price, stock});
133 | });
134 | return data;
135 | });
136 | console.log(books);
137 | await browser.close();
138 | })();
139 | ```
140 |
141 | #### Python
142 |
143 | ```python
144 | from playwright.async_api import async_playwright
145 | import asyncio
146 |
147 |
148 | async def main():
149 | async with async_playwright() as pw:
150 | browser = await pw.chromium.launch()
151 | page = await browser.new_page()
152 | await page.goto('https://books.toscrape.com')
153 |
154 | all_items = await page.query_selector_all('.product_pod')
155 | books = []
156 | for item in all_items:
157 | book = {}
158 | name_el = await item.query_selector('h3')
159 | book['name'] = await name_el.inner_text()
160 | price_el = await item.query_selector('.price_color')
161 | book['price'] = await price_el.inner_text()
162 | stock_el = await item.query_selector('.availability')
163 | book['stock'] = await stock_el.inner_text()
164 | books.append(book)
165 | print(books)
166 | await browser.close()
167 |
168 | if __name__ == '__main__':
169 | asyncio.run(main())
170 | ```
171 |
172 | If you wish to find out more about Web Scraping With Playwright, see our [blog post](https://oxy.yt/erHw).
173 |
--------------------------------------------------------------------------------
/javascript/playwright-web-scraping/README.md:
--------------------------------------------------------------------------------
1 | # Web Scraping With Playwright
2 |
3 | [
](https://github.com/topics/playwright) [
](https://github.com/topics/web-scraping)
4 |
5 | - [Support for proxies in Playwright](#support-for-proxies-in-playwright)
6 | - [Basic scraping with Playwright](#basic-scraping-with-playwright)
7 | - [Web Scraping](#web-scraping)
8 |
9 | This article covers web scraping with Playwright: configuring proxies, launching a browser, and extracting data from a dynamic website using both Node.js and Python.
10 |
11 | For a detailed explanation, see our [blog post](https://oxy.yt/erHw).
12 |
13 |
14 | ## Support for proxies in Playwright
15 |
16 | #### Without Proxy
17 |
18 | ```javascript
19 |
20 | // Node.js
21 |
22 | const { chromium } = require('playwright');
23 | const browser = await chromium.launch();
24 | ```
25 |
26 |
27 |
28 | ```python
29 |
30 | # Python
31 |
32 | from playwright.async_api import async_playwright
33 | import asyncio
34 | async with async_playwright() as p:
35 | browser = await p.chromium.launch()
36 | ```
37 |
38 | #### With Proxy
39 |
40 | ```javascript
41 | // Node.js
42 | const launchOptions = {
43 | proxy: {
44 |         server: '123.123.123.123:80'
45 | },
46 | headless: false
47 | }
48 | const browser = await chromium.launch(launchOptions);
49 | ```
50 |
51 |
52 |
53 | ```python
54 | # Python
55 | proxy_to_use = {
56 | 'server': '123.123.123.123:80'
57 | }
58 | browser = await p.chromium.launch(proxy=proxy_to_use, headless=False)
59 | ```
60 |
61 | ## Basic scraping with Playwright
62 |
63 | ### Node.js
64 |
65 | ```shell
66 | npm init -y
67 | npm install playwright
68 | ```
69 |
70 | ```javascript
71 | const playwright = require('playwright');
72 | (async () => {
73 | const browser = await playwright.chromium.launch({
74 | headless: false // Show the browser.
75 | });
76 |
77 | const page = await browser.newPage();
78 | await page.goto('https://books.toscrape.com/');
79 |   await page.waitForTimeout(1000); // wait for 1 second
80 | await browser.close();
81 | })();
82 | ```
83 |
84 | ### Python
85 |
86 | ```shell
87 | pip install playwright
playwright install  # downloads the browser binaries Playwright controls
88 | ```
89 |
90 |
91 |
92 | ```python
93 | from playwright.async_api import async_playwright
94 | import asyncio
95 |
96 | async def main():
97 | async with async_playwright() as pw:
98 | browser = await pw.chromium.launch(
99 | headless=False # Show the browser
100 | )
101 | page = await browser.new_page()
102 | await page.goto('https://books.toscrape.com/')
103 | # Data Extraction Code Here
104 | await page.wait_for_timeout(1000) # Wait for 1 second
105 | await browser.close()
106 |
107 | if __name__ == '__main__':
108 | asyncio.run(main())
109 | ```
110 |
111 | ## Web Scraping
112 |
113 |
114 |
115 | 
116 |
117 | #### Node.js
118 |
119 | ```javascript
120 | const playwright = require('playwright');
121 |
122 | (async () => {
123 | const browser = await playwright.chromium.launch();
124 | const page = await browser.newPage();
125 | await page.goto('https://books.toscrape.com/');
126 | const books = await page.$$eval('.product_pod', all_items => {
127 | const data = [];
128 | all_items.forEach(book => {
129 | const name = book.querySelector('h3').innerText;
130 | const price = book.querySelector('.price_color').innerText;
131 | const stock = book.querySelector('.availability').innerText;
132 | data.push({ name, price, stock});
133 | });
134 | return data;
135 | });
136 | console.log(books);
137 | await browser.close();
138 | })();
139 | ```
140 |
141 | #### Python
142 |
143 | ```python
144 | from playwright.async_api import async_playwright
145 | import asyncio
146 |
147 |
148 | async def main():
149 | async with async_playwright() as pw:
150 | browser = await pw.chromium.launch()
151 | page = await browser.new_page()
152 | await page.goto('https://books.toscrape.com')
153 |
154 | all_items = await page.query_selector_all('.product_pod')
155 | books = []
156 | for item in all_items:
157 | book = {}
158 | name_el = await item.query_selector('h3')
159 | book['name'] = await name_el.inner_text()
160 | price_el = await item.query_selector('.price_color')
161 | book['price'] = await price_el.inner_text()
162 | stock_el = await item.query_selector('.availability')
163 | book['stock'] = await stock_el.inner_text()
164 | books.append(book)
165 | print(books)
166 | await browser.close()
167 |
168 | if __name__ == '__main__':
169 | asyncio.run(main())
170 | ```
171 |
172 | If you wish to find out more about Web Scraping With Playwright, see our [blog post](https://oxy.yt/erHw).
173 |
--------------------------------------------------------------------------------
/ruby/webscraping-with-ruby/README.md:
--------------------------------------------------------------------------------
1 | # Web Scraping With Ruby
2 |
3 | [
](https://github.com/topics/ruby) [
](https://github.com/topics/web-scraping)
4 |
5 | - [Installing Ruby](#installing-ruby)
6 | - [Scraping static pages](#scraping-static-pages)
7 | - [Scraping dynamic pages](#scraping-dynamic-pages)
8 |
9 | Ruby is a time-tested, open-source programming language. Its first version was released in 1996, while the latest major version, Ruby 3, was released in 2020. This article covers tools and techniques for web scraping with Ruby that work with this latest version.
10 |
11 | We’ll begin with a step-by-step overview of scraping static public web pages and then shift our focus to scraping dynamic pages. While the first approach works with most websites, it will not work with dynamic pages that use JavaScript to render their content. To handle these sites, we’ll look at headless browsers.
12 |
13 |
14 | For a detailed explanation, see our [blog post](https://oxy.yt/Dr5a).
15 |
16 |
17 | ## Installing Ruby
18 |
19 | To install Ruby on **Windows**, run the following:
20 |
21 | ```batch
22 | choco install ruby
23 | ```
24 |
25 | To install Ruby on **macOS**, use a package manager such as [Homebrew](https://brew.sh/). Enter the following in the terminal:
26 |
27 | ```shell
28 | brew install ruby
29 | ```
30 |
31 | For **Linux**, use the package manager for your distro. For example, run the following for Ubuntu:
32 |
33 | ```shell
34 | sudo apt install ruby-full
35 | ```
36 |
37 | ## Scraping static pages
38 |
39 | In this section, we’ll write a web scraper that can scrape data from [https://books.toscrape.com](https://books.toscrape.com/). It is a dummy book store for practicing web scraping with static websites.
40 |
41 | ### Installing required gems
42 |
43 | ```shell
44 | gem install httparty
45 | gem install nokogiri
46 | gem install csv
47 | ```
48 |
49 | ### Making an HTTP request
50 |
51 | ```ruby
52 | require 'httparty'
53 | response = HTTParty.get('https://books.toscrape.com/')
54 | if response.code == 200
55 | puts response.body
56 | else
57 | puts "Error: #{response.code}"
58 | exit
59 | end
60 | ```
61 |
62 | ### Parsing HTML with Nokogiri
63 |
64 | ```ruby
65 | require 'nokogiri'
66 | document = Nokogiri::HTML4(response.body)
67 | ```
68 |
69 | 
70 |
71 | ```ruby
72 | books = []
73 | 50.times do |i|
74 | url = "https://books.toscrape.com/catalogue/page-#{i + 1}.html"
75 | response = HTTParty.get(url)
76 | document = Nokogiri::HTML(response.body)
77 | all_book_containers = document.css('article.product_pod')
78 |
79 | all_book_containers.each do |container|
80 | title = container.css('.image_container > a > img').first['alt']
81 | price = container.css('.price_color').text.delete('^0-9.')
82 | availability = container.css('.availability').text.strip
83 | book = [title, price, availability]
84 | books << book
85 | end
86 |
87 | end
88 | ```
89 |
90 | ### Writing scraped data to a CSV file
91 |
92 | ```ruby
93 | require 'csv'
94 | CSV.open('books.csv', 'w+',
95 | write_headers: true,
96 | headers: %w[Title Price Availability]) do |csv|
97 |
98 | 50.times do |i|
99 | url = "https://books.toscrape.com/catalogue/page-#{i + 1}.html"
100 | response = HTTParty.get(url)
101 | document = Nokogiri::HTML(response.body)
102 | all_book_containers = document.css('article.product_pod')
103 |
104 | all_book_containers.each do |container|
105 | title = container.css('h3 a').first['title']
106 | price = container.css('.price_color').text.delete('^0-9.')
107 | availability = container.css('.availability').text.strip
108 | book = [title, price, availability]
109 |
110 | csv << book
111 | end
112 | end
113 | end
114 | ```
115 |
116 | ## Scraping dynamic pages
117 |
118 | ### Required installation
119 |
120 | ```shell
121 | gem install selenium-webdriver
122 | gem install csv
123 | ```
124 |
125 | ### Loading a dynamic website
126 |
127 | ```ruby
128 | require 'selenium-webdriver'
129 |
130 | driver = Selenium::WebDriver.for(:chrome)
131 | ```
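
The driver can then be pointed at the page to scrape. A minimal sketch, assuming the JavaScript-rendered quotes demo at quotes.toscrape.com that the rest of this section works with:

```ruby
# Open the dynamic page in the automated Chrome session.
driver.navigate.to 'https://quotes.toscrape.com/js/'
```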
132 |
133 | ### Locating HTML elements via CSS selectors
134 |
135 | ```ruby
136 | document = Nokogiri::HTML(driver.page_source)
137 | ```
138 |
139 | 
140 |
141 | ```ruby
142 | quotes = []
143 | quote_elements = driver.find_elements(css: '.quote')
144 | quote_elements.each do |quote_el|
145 | quote_text = quote_el.find_element(css: '.text').attribute('textContent')
146 | author = quote_el.find_element(css: '.author').attribute('textContent')
147 | quotes << [quote_text, author]
148 | end
149 | ```
150 |
151 | ### Handling pagination
152 |
153 | ```ruby
154 | quotes = []
155 | while true do
156 | quote_elements = driver.find_elements(css: '.quote')
157 | quote_elements.each do |quote_el|
158 | quote_text = quote_el.find_element(css: '.text').attribute('textContent')
159 | author = quote_el.find_element(css: '.author').attribute('textContent')
160 | quotes << [quote_text, author]
161 | end
162 | begin
163 | driver.find_element(css: '.next >a').click
164 | rescue
165 | break # Next button not found
166 | end
167 | end
168 | ```
169 |
170 | ### Creating a CSV file
171 |
172 | ```ruby
173 | require 'csv'
174 |
175 | CSV.open('quotes.csv', 'w+', write_headers: true,
176 | headers: %w[Quote Author]) do |csv|
177 | quotes.each do |quote|
178 | csv << quote
179 | end
180 | end
181 | ```
182 |
183 | If you wish to find out more about web scraping with Ruby, see our [blog post](https://oxy.yt/Dr5a).
184 |
--------------------------------------------------------------------------------
/php/web-scraping-php/README.md:
--------------------------------------------------------------------------------
1 | # Web Scraping With PHP
2 |
3 | [
](https://github.com/topics/php) [
](https://github.com/topics/web-scraping)
4 |
5 | - [Installing Prerequisites](#installing-prerequisites)
6 | - [Making an HTTP GET request](#making-an-http-get-request)
7 | - [Web scraping in PHP with Goutte](#web-scraping-in-php-with-goutte)
8 | - [Web scraping with Symfony Panther](#web-scraping-with-symfony-panther)
9 |
10 | PHP is a general-purpose scripting language and one of the most popular options for web development. For example, WordPress, the most common content management system to create websites, is built using PHP.
11 |
12 | PHP offers various building blocks required to build a web scraper, although it can quickly become an increasingly complicated task. Conveniently, there are many open-source libraries that can make web scraping with PHP more accessible.
13 |
14 | This article will guide you through the step-by-step process of writing various PHP web scraping routines that can extract public data from static and dynamic web pages.
15 |
16 | For a detailed explanation, see our [blog post](https://oxy.yt/Jr3d).
17 |
18 | ## Installing Prerequisites
19 |
20 | ```sh
21 | # Windows
22 | choco install php
23 | choco install composer
24 | ```
25 |
26 | or
27 |
28 | ```sh
29 | # macOS
30 | brew install php
31 | brew install composer
32 | ```
33 |
34 | ## Making an HTTP GET request
35 |
36 | ```php
37 | <?php
38 | $html = file_get_contents('https://books.toscrape.com/');
39 | echo $html;
40 | ```
41 |
42 | ## Web scraping in PHP with Goutte
43 |
44 | ```sh
45 | composer init --no-interaction --require="php >=7.1"
46 | composer require fabpot/goutte
47 | composer update
48 | ```
49 |
50 | ```php
51 | <?php
52 | require 'vendor/autoload.php';
53 | use Goutte\Client;
54 | $client = new Client();
55 | $crawler = $client->request('GET', 'https://books.toscrape.com');
56 | echo $crawler->html();
57 | ```
58 |
59 | ### Locating HTML elements via CSS Selectors
60 |
61 | ```php
62 | echo $crawler->filter('title')->text(); //CSS
63 | echo $crawler->filterXPath('//title')->text(); //XPath
64 |
65 | ```
66 |
67 | ### Extracting the elements
68 |
69 | ```php
70 | function scrapePage($url, $client){
71 | $crawler = $client->request('GET', $url);
72 | $crawler->filter('.product_pod')->each(function ($node) {
73 | $title = $node->filter('.image_container img')->attr('alt');
74 | $price = $node->filter('.price_color')->text();
75 | echo $title . "-" . $price . PHP_EOL;
76 | });
77 | }
78 | ```
79 |
80 |
81 |
82 | ### Handling pagination
83 |
84 | ```php
85 | function scrapePage($url, $client, $file)
86 | {
87 | //...
88 | // Handling Pagination
89 | try {
90 | $next_page = $crawler->filter('.next > a')->attr('href');
91 | } catch (InvalidArgumentException) { //Next page not found
92 | return null;
93 | }
94 | return "https://books.toscrape.com/catalogue/" . $next_page;
95 | }
96 |
97 | ```
98 |
99 | ### Writing Data to CSV
100 |
101 | ```php
102 | function scrapePage($url, $client, $file)
103 | {
104 | $crawler = $client->request('GET', $url);
105 | $crawler->filter('.product_pod')->each(function ($node) use ($file) {
106 | $title = $node->filter('.image_container img')->attr('alt');
107 | $price = $node->filter('.price_color')->text();
108 | fputcsv($file, [$title, $price]);
109 | });
110 | try {
111 | $next_page = $crawler->filter('.next > a')->attr('href');
112 | } catch (InvalidArgumentException) { //Next page not found
113 | return null;
114 | }
115 | return "https://books.toscrape.com/catalogue/" . $next_page;
116 | }
117 |
118 | $client = new Client();
119 | $file = fopen("books.csv", "a");
120 | $nextUrl = "https://books.toscrape.com/catalogue/page-1.html";
121 |
122 | while ($nextUrl) {
123 |     echo "<h2>" . $nextUrl . "</h2>" . PHP_EOL;
124 | $nextUrl = scrapePage($nextUrl, $client, $file);
125 | }
126 | fclose($file);
127 | ```
128 |
129 |
130 |
131 | ## Web scraping with Symfony Panther
132 |
133 | ```sh
134 | composer init --no-interaction --require="php >=7.1"
135 | composer require symfony/panther
136 | composer update
137 |
138 | brew install chromedriver
139 | ```
140 |
141 | ### Sending HTTP requests with Panther
142 |
143 | ```php
144 | <?php
145 | require 'vendor/autoload.php';
146 | use \Symfony\Component\Panther\Client;
147 | $client = Client::createChromeClient();
148 | $crawler = $client->get('https://quotes.toscrape.com/js/');
149 | ```
150 |
151 | ### Locating HTML elements via CSS Selectors
152 |
153 | ```php
154 | $crawler = $client->waitFor('.quote');
155 | $crawler->filter('.quote')->each(function ($node) {
156 | $author = $node->filter('.author')->text();
157 | $quote = $node->filter('.text')->text();
158 |     echo $author . " - " . $quote;
159 | });
160 | ```
161 |
162 | ### Handling pagination
163 |
164 | ```php
165 | while (true) {
166 | $crawler = $client->waitFor('.quote');
167 | …
168 | try {
169 | $client->clickLink('Next');
170 | } catch (Exception) {
171 | break;
172 | }
173 | }
174 | ```
175 |
176 | ### Writing data to a CSV file
177 |
178 | ```php
179 | $file = fopen("quotes.csv", "a");
180 | while (true) {
181 | $crawler = $client->waitFor('.quote');
182 | $crawler->filter('.quote')->each(function ($node) use ($file) {
183 | $author = $node->filter('.author')->text();
184 | $quote = $node->filter('.text')->text();
185 | fputcsv($file, [$author, $quote]);
186 | });
187 | try {
188 | $client->clickLink('Next');
189 | } catch (Exception) {
190 | break;
191 | }
192 | }
193 | fclose($file);
194 | ```
195 |
196 |
197 |
198 | If you wish to find out more about web scraping with PHP, see our [blog post](https://oxy.yt/Jr3d).
199 |
--------------------------------------------------------------------------------
/javascript/rotating-proxies-javascript/README.md:
--------------------------------------------------------------------------------
1 | # Rotating-Proxies-with-JavaScript
2 |
3 | [
](https://github.com/topics/javascript) [
](https://github.com/topics/web-scraping) [
](https://github.com/topics/rotating-proxies)
4 |
5 | - [Requirements](#requirements)
6 | - [Finding Current IP Address](#finding-current-ip-address)
7 | - [Using a Proxy](#using-a-proxy)
8 | - [Rotating Multiple Proxies](#rotating-multiple-proxies)
9 |
10 | ## Requirements
11 |
12 | In this tutorial, we will be using [Axios](https://github.com/axios/axios) to make requests. If needed, the code can be easily modified for other libraries as well.
13 |
14 | Open the terminal and run the following command to initiate a new Node project:
15 |
16 | ```shell
17 | npm init -y
18 | ```
19 |
20 | The next step is to install Axios by running the following command:
21 |
22 | ```sh
23 | npm install axios
24 | ```
25 |
26 | ## Finding Current IP Address
27 |
28 | To check if the proxy works properly, first, we need a basic code that prints the current IP address.
29 |
30 | The website http://httpbin.org/ip is appropriate for this purpose as it returns IP addresses in a clean format.
31 |
32 | Create a new JavaScript file and make changes as outlined below.
33 |
34 | The first step would be to import `axios`.
35 |
36 | ```JavaScript
37 | const axios = require("axios");
38 | ```
39 | Next, call the `get()` method and send the URL of the target website.
40 |
41 | ```javascript
42 | const url = 'https://httpbin.org/ip';
43 | const response = await axios.get(url);
44 | ```
45 |
46 | To see the data returned by the server, access the `data` attribute of the `response` object:
47 |
48 | ```JavaScript
49 | console.log(response.data);
50 | // Prints current IP
51 | ```
52 |
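Because `await` is only valid inside an `async` function, the snippets above need a small wrapper when run as a standalone script. A minimal sketch of such a script (the bundled no_proxy.js is the reference implementation):

```javascript
const axios = require('axios');

(async () => {
    const url = 'https://httpbin.org/ip';
    const response = await axios.get(url);
    console.log(response.data); // Prints the current IP address
})();
```
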
53 | For the complete implementation, see the [no_proxy.js](no_proxy.js) file.
54 |
55 | ## Using a Proxy
56 |
57 | For this example, we are going to use a proxy with IP 46.138.246.248 and port 8088.
58 |
59 | Axios can handle proxies directly. The proxy information needs to be sent as the second parameter of the `get()` method.
60 |
61 | The proxy object should have a `host` and `port`. See an example:
62 |
63 | ```JavaScript
64 | proxy_no_auth = {
65 | host: '46.138.246.248',
66 | port: 8088
67 | }
68 | ```
69 |
70 | If proxies need authentication, simply add an `auth` object with `username` and `password`.
71 |
72 | ```javascript
73 | proxy_with_auth = {
74 | host: '46.138.246.248',
75 | port: 8088,
76 | auth: {
77 | username: 'USERNAME',
78 | password: 'PASSWORD'
79 | }
80 | }
81 | ```
82 |
83 | This `proxy_no_auth` or `proxy_with_auth` object can then be sent with the `get` method.
84 |
85 | ```javascript
86 | const response = await axios.get(url, {
87 | proxy: proxy_no_auth
88 | });
89 | ```
90 |
91 | Run this code from the terminal to see the effective IP address.
92 |
93 | You will notice that now, instead of your original IP, the IP address of the proxy is printed.
94 |
95 | ```sh
96 | node single_proxy_axios.js
97 | // Prints {'origin': '46.138.246.248'}
98 | ```
99 |
100 | See the complete implementation in the [single_proxy_axios.js](single_proxy_axios.js) file.
101 |
102 | ## Rotating Multiple Proxies
103 |
104 | If multiple proxies are available, it is possible to rotate proxies with JavaScript.
105 |
106 | Some websites allow downloading a list of proxies as CSV or similar format.
107 |
108 | In this example, we will be working with a file downloaded from one of the free websites.
109 |
110 | This file contains the proxies in the following format. Note that the IP address and port are separated by a comma.
111 |
112 | ```
113 | 20.94.229.106,80
114 | 209.141.55.228,80
115 | 103.149.162.194,80
116 | 206.253.164.122,80
117 | 200.98.114.237,8888
118 | 193.164.131.202,7890
119 | 98.12.195.129,44
120 | 49.206.233.104,80
121 | ```
122 |
123 | To get a rotating IP proxy using this file, first, we need to read this CSV file in asynchronous code.
124 |
125 | To read CSV file asynchronously, install the package [async-csv](https://www.npmjs.com/package/async-csv).
126 |
127 | ```sh
128 | npm install async-csv
129 | ```
130 |
131 | We will also need the `fs` package, which does not need a separate install.
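
For reference, the imports at the top of the file could look like this (a sketch; the promise-based `fs` API is used so that the file can be read with `await`):

```javascript
const csv = require('async-csv');
const fs = require('fs').promises;
```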
132 |
133 | After the imports, use the following lines of code to read the CSV file.
134 |
135 | ```javascript
136 | // Read file from disk:
137 | const csvFile = await fs.readFile('proxy_list.csv');
138 |
139 | // Convert CSV string into rows:
140 | const data = await csv.parse(csvFile);
141 | ```
142 |
143 | The `data` object is an `Array` that contains each row as an `Array`.
144 |
145 | We can loop over all these rows using the `map` function.
146 |
147 | Note that in the loop, we will use Axios’ `get` method to request the same URL, each time with a different proxy.
148 |
149 | The `get` method of Axios is asynchronous. If we simply called the `map` function of `data`, the requests would be fired off without anything waiting for them to complete.
150 |
151 | Instead, we wrap the mapped promises in `Promise.all` and await the result as follows:
152 |
153 | ```JavaScript
154 | await Promise.all(data.map(async (item) => {
155 | // More async code here
156 | }));
157 | ```
158 |
159 | It is time to create the `proxy` object. The structure will be as explained in the earlier section.
160 |
161 | ```javascript
162 | // Create the Proxy object:
163 | proxy_no_auth = {
164 | host: item[0],
165 | port: item[1]
166 | };
167 | ```
168 |
169 | The lines above convert each row from the `[ '20.94.229.106', '80' ]` format to the `{ host: '20.94.229.106', port: '80' }` format.
170 |
171 | Next, call the `get` method and send the proxy object.
172 |
173 | ```javascript
174 | const url = 'https://httpbin.org/ip';
175 | const response = await axios.get(url, {
176 | proxy: proxy_no_auth
177 | });
178 | ```
179 |
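Putting the pieces together, the rotation loop could look roughly like this (a condensed sketch; rotating_proxies.js in this folder is the reference implementation):

```javascript
const axios = require('axios');
const csv = require('async-csv');
const fs = require('fs').promises;

(async () => {
    // Read the proxy list and split it into [host, port] rows.
    const csvFile = await fs.readFile('proxy_list.csv');
    const data = await csv.parse(csvFile.toString());

    await Promise.all(data.map(async (item) => {
        const proxy_no_auth = { host: item[0], port: item[1] };
        try {
            const response = await axios.get('https://httpbin.org/ip', { proxy: proxy_no_auth });
            console.log(response.data); // Each working proxy prints a different origin IP.
        } catch (error) {
            console.log(`Proxy ${item[0]}:${item[1]} failed: ${error.message}`);
        }
    }));
})();
```
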
180 | For the complete code, please see the [rotating_proxies.js](rotating_proxies.js) file.
181 |
--------------------------------------------------------------------------------
/python/automate-competitors-benchmark-analysis/README.md:
--------------------------------------------------------------------------------
1 | # How to Automate Competitors’ & Benchmark Analysis With Python
2 |
3 | - [Using Oxylabs’ solution to retrieve the SERPs results](#using-oxylabs-solution-to-retrieve-the-serps-results)
4 | - [Scraping URLs of the top results](#scraping-urls-of-the-top-results)
5 | - [Obtaining the off-page metrics](#obtaining-the-off-page-metrics)
6 | - [Obtaining the Page Speed metrics](#obtaining-the-page-speed-metrics)
7 | - [Converting Python list into a dataframe and exporting it as an Excel file](#converting-python-list-into-a-dataframe-and-exporting-it-as-an-excel-file)
8 |
9 | Doing competitor or benchmark analysis for SEO can be a burdensome task, as it requires taking into account many factors that usually have to be extracted from different data sources.
10 |
11 | The purpose of this article is to help you automate the data extraction processes as much as possible. After learning how to do this, you can dedicate your time to what matters: the analysis itself and coming up with actionable insights to strategize.
12 |
13 | For a detailed explanation, see our [blog post](https://oxy.yt/erEh).
14 |
15 | ## Using Oxylabs’ solution to retrieve the SERPs results
16 |
17 | ```python
18 | import requests
19 |
20 | keyword = ""
21 |
22 | payload = {
23 | "source": "SEARCH_ENGINE_search",
24 | "domain": "com",
25 | "query": keyword,
26 | "parse": "true",
27 | }
28 |
29 | response = requests.request(
30 | "POST",
31 | "https://realtime.oxylabs.io/v1/queries",
32 | auth=("", ""),
33 | json=payload,
34 | )
35 |
36 | list_comparison = [
37 | [x["url"], x["title"]]
38 | for x in response.json()["results"][0]["content"]["results"]["organic"]
39 | ]
40 | ```
41 |
42 | Viewing the results:
43 |
44 | ```python
45 | >>> print(list_comparison)
46 | [
47 | ["https://example.com/result/example-link", "Example Link - Example"],
48 | ["https://more-examples.net", "Homepage - More Examples"],
49 | ["https://you-searched-for.com/query=your_keyword", "You Searched for 'your_keyword'. Analyze your search now!"],
50 | ]
51 | ```
52 |
53 | ## Scraping URLs of the top results
54 |
55 | ```python
56 | import requests
57 | from bs4 import BeautifulSoup
58 |
59 | for y in list_comparison:
60 | try:
61 | print("Scraping: " + y[0])
62 | html = requests.request("get", y[0])
63 | soup = BeautifulSoup(html.text)
64 |
65 | try:
66 | metatitle = (soup.find("title")).get_text()
67 | except Exception:
68 | metatitle = ""
69 |
70 | try:
71 | metadescription = soup.find("meta", attrs={"name": "description"})["content"]
72 | except Exception:
73 | metadescription = ""
74 |
75 | try:
76 | h1 = soup.find("h1").get_text()
77 | except Exception:
78 | h1 = ""
79 |
80 | paragraph = [a.get_text() for a in soup.find_all('p')]
81 | text_length = sum(len(a) for a in paragraph)
82 | text_counter = sum(a.lower().count(keyword) for a in paragraph)
83 | metatitle_occurrence = keyword in metatitle.lower()
84 | h1_occurrence = keyword in h1.lower()
85 | metatitle_equal = metatitle == y[1]
86 | y.extend([metatitle, metatitle_equal, metadescription, h1, paragraph, text_length, text_counter, metatitle_occurrence, h1_occurrence])
87 |
88 | except Exception as e:
89 | print(e)
90 | y.extend(["No data"]*9)
91 | ```
92 |
93 | ## Obtaining the off-page metrics
94 |
95 | ```python
96 | import time
97 | from mozscape import Mozscape
98 |
99 | client = Mozscape("", "")
100 |
101 | for y in list_comparison:
102 | try:
103 | print("Getting MOZ results for: " + y[0])
104 | domainAuthority = client.urlMetrics(y[0])
105 | y.extend([domainAuthority["ueid"], domainAuthority["uid"], domainAuthority["pda"]])
106 | except Exception as e:
107 | print(e)
108 | time.sleep(10) # Retry once after 10 seconds.
109 | domainAuthority = client.urlMetrics(y[0])
110 | y.extend([domainAuthority["ueid"], domainAuthority["uid"], domainAuthority["pda"]])
111 | ```
112 |
113 | ## Obtaining the Page Speed metrics
114 |
115 | ```python
116 | import json
117 |
118 | pagespeed_key = ""
119 |
120 |
121 | for y in list_comparison:
122 | try:
123 |
124 | print("Getting results for: " + y[0])
125 | url = "https://www.googleapis.com/pagespeedonline/v5/runPagespeed?url=" + y[0] + "&strategy=mobile&locale=en&key=" + pagespeed_key
126 | response = requests.request("GET", url)
127 | data = response.json()
128 |
129 | overall_score = data["lighthouseResult"]["categories"]["performance"]["score"] * 100
130 | fcp = data["loadingExperience"]["metrics"]["FIRST_CONTENTFUL_PAINT_MS"]["percentile"]/1000
131 | fid = data["loadingExperience"]["metrics"]["FIRST_INPUT_DELAY_MS"]["percentile"]/1000
132 | lcp = data["loadingExperience"]["metrics"]["LARGEST_CONTENTFUL_PAINT_MS"]["percentile"]
133 | cls = data["loadingExperience"]["metrics"]["CUMULATIVE_LAYOUT_SHIFT_SCORE"]["percentile"]/100
134 |
135 |
136 |
137 | y.extend([fcp, fid, lcp, cls, overall_score])
138 |
139 | except Exception as e:
140 | print(e)
141 | y.extend(["No data", "No data", "No data", "No data", overall_score])
142 | ```
143 |
144 | ## Converting Python list into a dataframe and exporting it as an Excel file
145 |
146 | ```python
147 | import pandas as pd
148 |
149 | df = pd.DataFrame(list_comparison)
150 | df.columns = ["URL","Metatitle SERPs", "Metatitle Onpage","Metatitle Equal", "Metadescription", "H1", "Paragraphs", "Text Length", "Keyword Occurrences Paragraph", "Metatitle Occurrence", "Metadescription Occurrence", "Equity Backlinks MOZ", "Total Backlinks MOZ", "Domain Authority", "FCP", "FID","LCP","CLS","Overall Score"]
151 | df.to_excel('.xlsx', header=True, index=False)
152 | ```
153 |
154 | If you wish to find out more, see our [blog post](https://oxy.yt/erEh).
155 |
--------------------------------------------------------------------------------
/python/News-Article-Scraper/JavaScript/package-lock.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "code",
3 | "version": "1.0.0",
4 | "lockfileVersion": 1,
5 | "requires": true,
6 | "dependencies": {
7 | "axios": {
8 | "version": "0.21.1",
9 | "resolved": "https://registry.npmjs.org/axios/-/axios-0.21.1.tgz",
10 | "integrity": "sha512-dKQiRHxGD9PPRIUNIWvZhPTPpl1rf/OxTYKsqKUDjBwYylTvV7SjSHJb9ratfyzM6wCdLCOYLzs73qpg5c4iGA==",
11 | "requires": {
12 | "follow-redirects": "^1.10.0"
13 | }
14 | },
15 | "boolbase": {
16 | "version": "1.0.0",
17 | "resolved": "https://registry.npmjs.org/boolbase/-/boolbase-1.0.0.tgz",
18 | "integrity": "sha1-aN/1++YMUes3cl6p4+0xDcwed24="
19 | },
20 | "cheerio": {
21 | "version": "1.0.0-rc.10",
22 | "resolved": "https://registry.npmjs.org/cheerio/-/cheerio-1.0.0-rc.10.tgz",
23 | "integrity": "sha512-g0J0q/O6mW8z5zxQ3A8E8J1hUgp4SMOvEoW/x84OwyHKe/Zccz83PVT4y5Crcr530FV6NgmKI1qvGTKVl9XXVw==",
24 | "requires": {
25 | "cheerio-select": "^1.5.0",
26 | "dom-serializer": "^1.3.2",
27 | "domhandler": "^4.2.0",
28 | "htmlparser2": "^6.1.0",
29 | "parse5": "^6.0.1",
30 | "parse5-htmlparser2-tree-adapter": "^6.0.1",
31 | "tslib": "^2.2.0"
32 | }
33 | },
34 | "cheerio-select": {
35 | "version": "1.5.0",
36 | "resolved": "https://registry.npmjs.org/cheerio-select/-/cheerio-select-1.5.0.tgz",
37 | "integrity": "sha512-qocaHPv5ypefh6YNxvnbABM07KMxExbtbfuJoIie3iZXX1ERwYmJcIiRrr9H05ucQP1k28dav8rpdDgjQd8drg==",
38 | "requires": {
39 | "css-select": "^4.1.3",
40 | "css-what": "^5.0.1",
41 | "domelementtype": "^2.2.0",
42 | "domhandler": "^4.2.0",
43 | "domutils": "^2.7.0"
44 | }
45 | },
46 | "css-select": {
47 | "version": "4.1.3",
48 | "resolved": "https://registry.npmjs.org/css-select/-/css-select-4.1.3.tgz",
49 | "integrity": "sha512-gT3wBNd9Nj49rAbmtFHj1cljIAOLYSX1nZ8CB7TBO3INYckygm5B7LISU/szY//YmdiSLbJvDLOx9VnMVpMBxA==",
50 | "requires": {
51 | "boolbase": "^1.0.0",
52 | "css-what": "^5.0.0",
53 | "domhandler": "^4.2.0",
54 | "domutils": "^2.6.0",
55 | "nth-check": "^2.0.0"
56 | }
57 | },
58 | "css-what": {
59 | "version": "5.0.1",
60 | "resolved": "https://registry.npmjs.org/css-what/-/css-what-5.0.1.tgz",
61 | "integrity": "sha512-FYDTSHb/7KXsWICVsxdmiExPjCfRC4qRFBdVwv7Ax9hMnvMmEjP9RfxTEZ3qPZGmADDn2vAKSo9UcN1jKVYscg=="
62 | },
63 | "dom-serializer": {
64 | "version": "1.3.2",
65 | "resolved": "https://registry.npmjs.org/dom-serializer/-/dom-serializer-1.3.2.tgz",
66 | "integrity": "sha512-5c54Bk5Dw4qAxNOI1pFEizPSjVsx5+bpJKmL2kPn8JhBUq2q09tTCa3mjijun2NfK78NMouDYNMBkOrPZiS+ig==",
67 | "requires": {
68 | "domelementtype": "^2.0.1",
69 | "domhandler": "^4.2.0",
70 | "entities": "^2.0.0"
71 | }
72 | },
73 | "domelementtype": {
74 | "version": "2.2.0",
75 | "resolved": "https://registry.npmjs.org/domelementtype/-/domelementtype-2.2.0.tgz",
76 | "integrity": "sha512-DtBMo82pv1dFtUmHyr48beiuq792Sxohr+8Hm9zoxklYPfa6n0Z3Byjj2IV7bmr2IyqClnqEQhfgHJJ5QF0R5A=="
77 | },
78 | "domhandler": {
79 | "version": "4.2.0",
80 | "resolved": "https://registry.npmjs.org/domhandler/-/domhandler-4.2.0.tgz",
81 | "integrity": "sha512-zk7sgt970kzPks2Bf+dwT/PLzghLnsivb9CcxkvR8Mzr66Olr0Ofd8neSbglHJHaHa2MadfoSdNlKYAaafmWfA==",
82 | "requires": {
83 | "domelementtype": "^2.2.0"
84 | }
85 | },
86 | "domutils": {
87 | "version": "2.7.0",
88 | "resolved": "https://registry.npmjs.org/domutils/-/domutils-2.7.0.tgz",
89 | "integrity": "sha512-8eaHa17IwJUPAiB+SoTYBo5mCdeMgdcAoXJ59m6DT1vw+5iLS3gNoqYaRowaBKtGVrOF1Jz4yDTgYKLK2kvfJg==",
90 | "requires": {
91 | "dom-serializer": "^1.0.1",
92 | "domelementtype": "^2.2.0",
93 | "domhandler": "^4.2.0"
94 | }
95 | },
96 | "entities": {
97 | "version": "2.2.0",
98 | "resolved": "https://registry.npmjs.org/entities/-/entities-2.2.0.tgz",
99 | "integrity": "sha512-p92if5Nz619I0w+akJrLZH0MX0Pb5DX39XOwQTtXSdQQOaYH03S1uIQp4mhOZtAXrxq4ViO67YTiLBo2638o9A=="
100 | },
101 | "follow-redirects": {
102 | "version": "1.14.1",
103 | "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.14.1.tgz",
104 | "integrity": "sha512-HWqDgT7ZEkqRzBvc2s64vSZ/hfOceEol3ac/7tKwzuvEyWx3/4UegXh5oBOIotkGsObyk3xznnSRVADBgWSQVg=="
105 | },
106 | "htmlparser2": {
107 | "version": "6.1.0",
108 | "resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-6.1.0.tgz",
109 | "integrity": "sha512-gyyPk6rgonLFEDGoeRgQNaEUvdJ4ktTmmUh/h2t7s+M8oPpIPxgNACWa+6ESR57kXstwqPiCut0V8NRpcwgU7A==",
110 | "requires": {
111 | "domelementtype": "^2.0.1",
112 | "domhandler": "^4.0.0",
113 | "domutils": "^2.5.2",
114 | "entities": "^2.0.0"
115 | }
116 | },
117 | "nth-check": {
118 | "version": "2.0.0",
119 | "resolved": "https://registry.npmjs.org/nth-check/-/nth-check-2.0.0.tgz",
120 | "integrity": "sha512-i4sc/Kj8htBrAiH1viZ0TgU8Y5XqCaV/FziYK6TBczxmeKm3AEFWqqF3195yKudrarqy7Zu80Ra5dobFjn9X/Q==",
121 | "requires": {
122 | "boolbase": "^1.0.0"
123 | }
124 | },
125 | "parse5": {
126 | "version": "6.0.1",
127 | "resolved": "https://registry.npmjs.org/parse5/-/parse5-6.0.1.tgz",
128 | "integrity": "sha512-Ofn/CTFzRGTTxwpNEs9PP93gXShHcTq255nzRYSKe8AkVpZY7e1fpmTfOyoIvjP5HG7Z2ZM7VS9PPhQGW2pOpw=="
129 | },
130 | "parse5-htmlparser2-tree-adapter": {
131 | "version": "6.0.1",
132 | "resolved": "https://registry.npmjs.org/parse5-htmlparser2-tree-adapter/-/parse5-htmlparser2-tree-adapter-6.0.1.tgz",
133 | "integrity": "sha512-qPuWvbLgvDGilKc5BoicRovlT4MtYT6JfJyBOMDsKoiT+GiuP5qyrPCnR9HcPECIJJmZh5jRndyNThnhhb/vlA==",
134 | "requires": {
135 | "parse5": "^6.0.1"
136 | }
137 | },
138 | "tslib": {
139 | "version": "2.3.0",
140 | "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.3.0.tgz",
141 | "integrity": "sha512-N82ooyxVNm6h1riLCoyS9e3fuJ3AMG2zIZs2Gd1ATcSFjSA23Q0fzjjZeh0jbJvWVDZ0cJT8yaNNaaXHzueNjg=="
142 | }
143 | }
144 | }
145 |
--------------------------------------------------------------------------------
/python/beautiful-soup-parsing-tutorial/README.md:
--------------------------------------------------------------------------------
1 | # Using Python and Beautiful Soup to Parse Data: Intro Tutorial
2 |
3 | ## Installing Beautiful Soup
4 |
5 | ```bash
6 | pip install BeautifulSoup4
7 | ```
8 |
9 | ## Getting started
10 |
11 | A sample HTML file will help demonstrate the main methods of how Beautiful Soup parses data. This file is much simpler than your average modern website; however, it will be sufficient for the scope of this tutorial.
12 |
13 | ```html
14 | <html>
15 |
16 |     <head>
17 |         <title>What is a Proxy?</title>
18 |         <meta charset="utf-8">
19 |     </head>
20 |
21 |     <body>
22 |         <h2>Proxy types</h2>
23 |
24 |         <p>
25 |             There are many different ways to categorize proxies. However, two of
26 |             the most popular types are residential and data center proxies. Here is a list of the most common types.
27 |         </p>
28 |
29 |         <ul id="proxytypes">
30 |             <li>Residential proxies</li>
31 |             <li>Datacenter proxies</li>
32 |             <li>Shared proxies</li>
33 |             <li>Semi-dedicated proxies</li>
34 |             <li>Private proxies</li>
35 |         </ul>
36 |
37 |     </body>
38 | </html>
39 | ```
40 |
41 | ## Traversing for HTML tags
42 |
43 | First, we can use Beautiful Soup to extract a list of all the tags used in our sample HTML file. For this, we will use the soup.descendants generator.
44 |
45 | ```python
46 | from bs4 import BeautifulSoup
47 |
48 | with open('index.html', 'r') as f:
49 | contents = f.read()
50 |
51 | soup = BeautifulSoup(contents, features="html.parser")
52 |
53 | for child in soup.descendants:
54 |
55 | if child.name:
56 | print(child.name)
57 | ```
58 |
59 | After running this code (right-click the code and click “Run”), you should get the output below:
60 |
61 | ```html
62 | html
63 | head
64 | title
65 | meta
66 | body
67 | h2
68 | p
69 | ul
70 | li
71 | li
72 | li
73 | li
74 | li
75 | ```
76 |
77 | What just happened? Beautiful Soup traversed our HTML file and printed all the HTML tags that it has found sequentially. Let’s take a quick look at what each line did.
78 |
79 | ```python
80 | from bs4 import BeautifulSoup
81 | ```
82 |
83 | This tells Python to use the Beautiful Soup library.
84 |
85 | ```python
86 | with open('index.html', 'r') as f:
87 | contents = f.read()
88 | ```
89 |
90 | And this code, as you could probably guess, gives an instruction to open our sample HTML file and read its contents.
91 |
92 | ```python
93 | soup = BeautifulSoup(contents, features="html.parser")
94 | ```
95 |
96 | This line creates a BeautifulSoup object from the file contents, using Python’s built-in HTML parser (`html.parser`). Other parsers, such as lxml, could also be used, but lxml is a separate external library, and for the purpose of this tutorial the built-in parser will do just fine.
97 |
98 | ```python
99 | for child in soup.descendants:
100 |
101 | if child.name:
102 | print(child.name)
103 | ```
104 |
105 | The final pieces of code, namely the soup.descendants generator, instruct Beautiful Soup to look for HTML tags and print them in the PyCharm console. The results can also easily be exported to a .csv file but we will get to this later.
106 |
107 | ## Getting the full content of tags
108 |
109 | To get the content of tags, this is what we can do:
110 |
111 | ```python
112 | from bs4 import BeautifulSoup
113 |
114 | with open('index.html', 'r') as f:
115 | contents = f.read()
116 |
117 | soup = BeautifulSoup(contents, features="html.parser")
118 |
119 | print(soup.h2)
120 | print(soup.p)
121 | print(soup.li)
122 | ```
123 |
124 | This is a simple instruction that outputs the HTML tag with its full content in the specified order. Here’s what the output should look like:
125 |
126 | ```html
127 | <h2>Proxy types</h2>
128 | <p>
129 |     There are many different ways to categorize proxies. However, two of the most popular types are residential and data center proxies. Here is a list of the most common types.
130 | </p>
131 | <li>Residential proxies</li>
132 | ```
133 |
134 | You could also remove the HTML tags and print text only, by using, for example:
135 |
136 | ```python
137 | print(soup.li.text)
138 | ```
139 |
140 | Which in our case will give the following output:
141 |
142 | ```html
143 | Residential proxies
144 | ```
145 |
146 | Note that this only prints the first instance of the specified tag. Let’s continue to see how to find elements by ID or using the find_all method to filter elements by specific criteria.
147 |
148 | ## Using Beautiful Soup to find elements by ID
149 |
150 | We can use two similar ways to find elements by ID:
151 |
152 | ```python
153 | print(soup.find('ul', attrs={'id': 'proxytypes'}))
154 | ```
155 |
156 | or
157 |
158 | ```python
159 | print(soup.find('ul', id='proxytypes'))
160 | ```
161 |
162 | Both of these will output the same result in the Python Console:
163 |
164 | ```html
165 | <ul id="proxytypes">
166 | <li>Residential proxies</li>
167 | <li>Datacenter proxies</li>
168 | <li>Shared proxies</li>
169 | <li>Semi-dedicated proxies</li>
170 | <li>Private proxies</li>
171 | </ul>
172 | ```
173 |
174 | ## Finding all specified tags and extracting text
175 |
176 | The find_all method is a great way to extract specific data from an HTML file. It accepts many criteria that make it a flexible tool allowing us to filter data in convenient ways. Yet for this tutorial we do not need anything more complex. Let’s find all items of our list and print them as text only:
177 |
178 | ```python
179 | for tag in soup.find_all('li'):
180 | print(tag.text)
181 | ```
182 |
183 | This is what the full code should look like:
184 |
185 | ```python
186 | from bs4 import BeautifulSoup
187 |
188 | with open('index.html', 'r') as f:
189 | contents = f.read()
190 |
191 | soup = BeautifulSoup(contents, features="html.parser")
192 |
193 | for tag in soup.find_all('li'):
194 | print(tag.text)
195 | ```
196 |
197 | And here’s the output:
198 |
199 | ```
200 | Residential proxies
201 | Datacenter proxies
202 | Shared proxies
203 | Semi-dedicated proxies
204 | Private proxies
205 | ```
206 |
207 | ## Exporting data to a .csv file
208 | We will use the pandas library to build and export the table of results. Install it with pip:
209 | ```bash
210 | pip install pandas
211 | ```
212 |
213 | Add this line to the beginning of your code to import the library:
214 |
215 | ```python
216 | import pandas as pd
217 | ```
218 |
219 | Going further, let’s add some lines that will export the list we extracted earlier to a .csv file. This is what our full code should look like:
220 |
221 | ```python
222 | from bs4 import BeautifulSoup
223 | import pandas as pd
224 |
225 | with open('index.html', 'r') as f:
226 | contents = f.read()
227 |
228 | soup = BeautifulSoup(contents, features="html.parser")
229 | results = soup.find_all('li')
230 |
231 | df = pd.DataFrame({'Names': results})
232 | df.to_csv('names.csv', index=False, encoding='utf-8')
233 | ```
234 |
235 | What happened here? Let’s take a look:
236 |
237 | ```python
238 | results = soup.find_all('li')
239 | ```
240 |
241 | This line finds all instances of the `<li>` tag and stores them in the results object.
242 |
243 | ```python
244 | df = pd.DataFrame({'Names': results})
245 | df.to_csv('names.csv', index=False, encoding='utf-8')
246 | ```
247 | These two lines load the extracted results into a pandas DataFrame with a single “Names” column and write it to names.csv, omitting the index column and using UTF-8 encoding.
--------------------------------------------------------------------------------
/other/curl-with-proxy/README.md:
--------------------------------------------------------------------------------
1 | # How to Use cURL With Proxy
2 |
3 | [
](https://github.com/topics/curl) [
](https://github.com/topics/proxy)
4 |
5 | - [What is cURL?](#what-is-curl)
6 | - [Installation](#installation)
7 | - [What you need to connect to a proxy](#what-you-need-to-connect-to-a-proxy)
8 | - [Command line argument to set proxy in cURL](#command-line-argument-to-set-proxy-in-curl)
9 | - [Using environment variables](#using-environment-variables)
10 | - [Configure cURL to always use proxy](#configure-curl-to-always-use-proxy)
11 | - [Ignore or override proxy for one request](#ignore-or-override-proxy-for-one-request)
12 | - [Bonus tip – turning proxies off and on quickly](#bonus-tip--turning-proxies-off-and-on-quickly)
13 | - [cURL socks proxy](#curl-socks-proxy)
14 |
15 | This step-by-step guide explains how to use cURL, or simply curl, with proxy servers. It covers everything from installation to the various options for setting a proxy.
16 |
17 | For a detailed explanation, see our [blog post](https://oxy.yt/ArRn).
18 |
19 | ## What is cURL?
20 |
21 | cURL is a command-line tool for sending and receiving data using URLs.
22 |
23 | ```shell
24 | curl https://www.google.com
25 | ```
26 |
27 | The question “[what is cURL](https://oxy.yt/ArRn)?” is also answered in one of our previous articles. We recommend reading it if you want to learn how it became such a universal asset.
28 |
29 | ## Installation
30 |
31 | cURL ships with many Linux distributions, with macOS, and now with Windows 10 as well.
32 |
33 | If your Linux distribution does not include it, you can install it with your package manager. For example, on Ubuntu, open the terminal and run this command:
34 |
35 | ```shell
36 | sudo apt install curl
37 | ```
38 |
39 | If you are running an older version of Windows, or if you want to install an alternate version, you can download curl from the [official download page](https://curl.se/download.html).
40 |
41 | ## What you need to connect to a proxy
42 |
43 | Irrespective of which proxy service you use, you will need the following information to use a proxy:
44 |
45 | - proxy server address
46 | - port
47 | - protocol
48 | - username (if authentication is required)
49 | - password (if authentication is required)
50 |
51 | In this tutorial, we are going to assume that the proxy server is **127.0.0.1**, the port is **1234**, the username is **user**, and the password is **pwd**. We will look into multiple examples covering various protocols.
52 |
53 | ## Command line argument to set proxy in cURL
54 |
55 | Open the terminal, type the following command, and press Enter:
56 |
57 | ```shell
58 | curl --help
59 | ```
60 |
61 | The output is going to be a huge list of options. One of them is going to look like this:
62 |
63 | ```shell
64 | -x, --proxy [protocol://]host[:port]
65 | ```
66 |
67 | Note that **x** is lowercase and case-sensitive. The proxy details can be supplied using either the **-x** or the **--proxy** switch; both mean the same thing. The following two curl-with-proxy commands are equivalent:
68 |
69 | ```shell
70 | curl -x "http://user:pwd@127.0.0.1:1234" "http://httpbin.org/ip"
71 | ```
72 |
73 | or
74 |
75 | ```shell
76 | curl --proxy "http://user:pwd@127.0.0.1:1234" "http://httpbin.org/ip"
77 | ```
78 |
79 | **NOTE.** If there are SSL certificate errors, add **-k** (note the small **k**) to the **curl** command. This will allow insecure server connections when using SSL.
80 |
81 | ```shell
82 | curl --proxy "http://user:pwd@127.0.0.1:1234" "http://httpbin.org/ip" -k
83 | ```
84 |
85 | Another thing to note here is that the default proxy protocol is http. Thus, the following two commands do exactly the same thing:
86 |
87 | ```shell
88 | curl --proxy "http://user:pwd@127.0.0.1:1234" "http://httpbin.org/ip"
89 | curl --proxy "user:pwd@127.0.0.1:1234" "http://httpbin.org/ip"
90 | ```
91 |
92 | ## Using environment variables
93 |
94 | Another way to use proxy with curl is to set the environment variables **http_proxy** and **https_proxy**.
95 |
96 | ```shell
97 | export http_proxy="http://user:pwd@127.0.0.1:1234"
98 | export https_proxy="http://user:pwd@127.0.0.1:1234"
99 | ```
100 |
101 | After running these two commands, run **curl** normally.
102 |
103 | ```shell
104 | curl "http://httpbin.org/ip"
105 | ```
106 |
107 | To stop using the proxy, unset these two variables:
108 |
109 | ```shell
110 | unset http_proxy
111 | unset https_proxy
112 | ```
113 |
114 | ## Configure cURL to always use proxy
115 |
116 | If you want a proxy for curl but not for other programs, this can be achieved by creating a [curl config file](https://everything.curl.dev/cmdline/cmdline-configfile).
117 |
118 | For Linux and macOS, open the terminal and navigate to your home directory. If there is already a **.curlrc** file, open it; if not, create a new one. Here is the set of commands to run:
119 |
120 | ```shell
121 | cd ~
122 | nano .curlrc
123 | ```
124 |
125 | In this file, add this line:
126 |
127 | ```shell
128 | proxy="http://user:pwd@127.0.0.1:1234"
129 | ```
130 |
131 | Save the file. Now curl with proxy is ready to be used.
132 |
133 | Simply run **curl** normally and it will read the proxy from the **.curlrc** file.
134 |
135 | ```shell
136 | curl "http://httpbin.org/ip"
137 | ```
138 |
139 | On Windows, the file is named **_curlrc**. This file can be placed in the **%APPDATA%** directory.
140 |
141 | To find the exact path of **%APPDATA%**, open command prompt and run the following command:
142 |
143 | ```shell
144 | echo %APPDATA%
145 | ```
146 |
147 | This directory will be something like **C:\Users\<username>\AppData\Roaming**. Go to this directory, create a new file named **_curlrc**, and set the proxy by adding this line:
148 |
149 | ```shell
150 | proxy="http://user:pwd@127.0.0.1:1234"
151 | ```
152 |
153 | ## Ignore or override proxy for one request
154 |
155 | To override the proxy for one request, set the new proxy using the **-x** or **--proxy** switch as usual:
156 |
157 | ```shell
158 | curl --proxy "http://user:pwd@1.0.0.1:8090" "http://httpbin.org/ip"
159 | ```
160 |
161 | ## Bonus tip – turning proxies off and on quickly
162 |
163 | You can create aliases in your **.bashrc** file to set and unset the proxy variables. For example, open the **.bashrc** file in any editor and add these lines:
164 |
165 | ```shell
166 | alias proxyon="export http_proxy='http://user:pwd@127.0.0.1:1234';export https_proxy='http://user:pwd@127.0.0.1:1234'"
167 | alias proxyoff="unset http_proxy;unset https_proxy"
168 | ```
169 |
170 | After adding these lines, save **.bashrc** and reload it so that the shell picks up the changes. To do this, run this command in the terminal:
171 |
172 | ```shell
173 | . ~/.bashrc
174 | ```
175 |
176 | Now, whenever you need a proxy, you can quickly turn it on, run one or more curl commands, and then turn it off again like this:
177 |
178 | ```shell
179 | proxyon
180 | curl "http://httpbin.org/ip"
181 | curl "http://google.com"
182 | proxyoff
183 | ```
184 |
185 | ## cURL socks proxy
186 |
187 | If the proxy server uses the SOCKS protocol, the syntax remains the same:
188 |
189 | ```shell
190 | curl -x "socks5://user:pwd@127.0.0.1:1234" "http://httpbin.org/ip"
191 | ```
192 |
193 | If you wish to find out more about How to Use cURL With Proxy, see our [blog post](https://oxy.yt/ArRn).
194 |
--------------------------------------------------------------------------------
/VBA/Web Scraping With Excel VBA Guide/README.md:
--------------------------------------------------------------------------------
1 | # Web Scraping With Excel VBA
2 |
3 | [](https://oxylabs.go2cloud.org/aff_c?offer_id=7&aff_id=877&url_id=112)
4 |
5 |
6 | - [Prerequisites](#prerequisites)
7 | - [Step 1 - Open Microsoft Excel](#step-1---open-microsoft-excel)
8 | - [Step 2 - Go to Option to enable developer menu](#step-2---go-to-option-to-enable-developer-menu)
9 | - [Step 3 - Select Customize Ribbon](#step-3----select-customize-ribbon)
10 | - [Step 4 - Open Visual Basic Application Dialog](#step-4---open-visual-basic-application-dialog)
11 | - [Step 5 - Insert a new Module](#step-5---insert-a-new-module)
12 | - [Step 6 - Add new references](#step-6---add-new-references)
13 | - [Step 7 - Automate Microsoft Edge to Open a website](#step-7---automate-microsoft-edge-to-open-a-website)
14 | - [Step 8 - Scrape Data using VBA Script & Save it to Excel](#step-8---scrape-data-using-vba-script-and-save-it-to-excel)
15 | - [Output](#output)
16 | - [Source Code](#source-code)
17 |
18 | In this tutorial, we'll focus on how to perform Excel web scraping using
19 | VBA. We’ll briefly go through the installation and preparation of the
20 | environment and then write a scraper using VBA macro to successfully
21 | fetch data from a web page into Excel.
22 |
23 | See the full [blog post](https://oxylabs.io/blog/web-scraping-excel-vba) for a detailed
24 | explanation of VBA and its use in web scraping.
25 |
26 | Before we begin, let’s make sure we’ve installed all the prerequisites
27 | and set up our environment properly so that it will be easier to follow
28 | along.
29 |
30 | ## Prerequisites
31 |
32 | We’ll be using Windows 10 and Microsoft Office 10.
33 | However, the steps will be the same or similar for other versions of
34 | Windows. You’ll only need a computer with Windows Operating System. In
35 | addition, it’s necessary to install Microsoft Office if you don’t have
36 | it already. Detailed installation instructions can be found in
37 | [Microsoft's Official
38 | documentation](https://www.microsoft.com/en-us/download/office.aspx).
39 |
40 | Now that you’ve installed MS Office, follow the steps below to set up
41 | the development environment and scrape the public data you want.
42 |
43 | ## Step 1 - Open Microsoft Excel
44 |
45 | From the start menu or Cortana search, find Microsoft Excel and open the application. You will see a similar interface as below:
46 |
47 | Click on File
48 |
49 | 
50 |
51 | ## Step 2 - Go to Option to enable developer menu
52 |
53 | By default, Excel doesn’t show the developer button in the top ribbon. To enable this we will have to go to “Options” from the File menu.
54 |
55 | 
56 |
57 | ## Step 3 - Select Customize Ribbon
58 |
59 | Once you click the “Options”, a dialog will pop up, from the side menu select “Customize Ribbon”. Click on the check box next to “developer”. Make sure it is ticked and then click on Ok.
60 |
61 | 
62 |
63 | ## Step 4 - Open Visual Basic Application Dialog
64 |
65 | Now you will see a new developer button on the top ribbon, clicking on it will expand the developer menu. From the menu, select “Visual Basic”
66 |
67 | 
68 |
69 | ## Step 5 - Insert a new Module
70 |
Once you click on “Visual Basic”, it will open a new window like the one below:
72 |
73 | 
74 |
Click on “Insert” and select “Module” to insert a new module. It will open the module editor.
76 |
77 | 
78 |
79 | ## Step 6 - Add new references
80 |
81 |
From the top menu, select `Tools > References...`. It will open a new window like the one below. Scroll through the list of available references and find Microsoft HTML Object Library and Microsoft Internet Controls. Tick the check boxes next to both of them to enable these references. Once you are done, click OK.
83 |
84 | 
85 |
That’s it! Our development environment is all set. Let’s write our first Excel VBA scraper.
87 |
88 | ## Step 7 - Automate Microsoft Edge to Open a website
89 |
In this step, we will update our newly created module to open the following website: https://quotes.toscrape.com. In the module editor, let’s write the below code:
91 |
92 | ```vb
Sub scrape_quotes()
    Dim browser As InternetExplorer
    Dim page As HTMLDocument
    Set browser = New InternetExplorer
    browser.Visible = True
    browser.navigate ("https://quotes.toscrape.com")
End Sub
100 | ```
101 |
102 | We are defining a subroutine named `scrape_quotes()`. This function will be executed when we run this script. Inside the subroutine, we are defining two objects `browser` and `page`.
103 |
The `browser` object will allow us to interact with Microsoft Edge. We also set the browser as visible so that we can see it in action. The `browser.navigate()` method tells the VBA browser object to open the URL. The output will be similar to this:
105 |
106 | 
107 |
>💡 Note: You might be wondering why we are writing `InternetExplorer` to interact with Microsoft Edge. VBA initially only supported Internet Explorer-based automation, but after Microsoft discontinued Internet Explorer, it deployed updates so that VBA’s InternetExplorer module can drive the Microsoft Edge browser in IE Mode without any issues. The above code will also work on older Windows versions that still have Internet Explorer available instead of Edge.
109 |
110 | ## Step 8 - Scrape Data using VBA Script and Save it to Excel
111 |
Now, we will scrape the quotes and authors from the website. For simplicity, we will store the data in the first sheet of the Excel spreadsheet and grab the top 5 quotes for now.
113 |
We will begin by defining two new objects: one for quotes and the other for authors.
115 |
116 | ```vb
117 | Dim quotes As Object
118 | Dim authors As Object
119 | ```
120 |
After navigating to the website, we will also add a short pause using a Do While loop so that the website loads properly.
122 |
123 | ```vb
124 | Do While browser.Busy: Loop
125 | ```
126 |
127 | Next we will grab the quotes and authors from the HTML document:
128 |
129 | ```vb
130 | Set page = browser.document
131 | Set quotes = page.getElementsByClassName("quote")
132 | Set authors = page.getElementsByClassName("author")
133 | ```
134 |
Then, we will use a For loop to populate the Excel rows with the extracted data by calling the `Cells()` function and passing the row and column position:
136 |
137 | ```vb
For num = 1 To 5
    Cells(num, 1).Value = quotes.Item(num - 1).innerText ' Item is zero-based
    Cells(num, 2).Value = authors.Item(num - 1).innerText
Next num
142 | ```
143 |
Finally, we will close the browser by calling the `Quit` method. This will close the browser window.
145 |
146 | ```vb
147 | browser.Quit
148 | ```
149 |
150 | ## Output
151 |
Now, if we run the script again, it will open Microsoft Edge, browse to the quotes.toscrape.com website, grab the top 5 quotes from the list, and save them to the current Excel file’s first sheet.
153 |
154 | 
155 |
156 | ## Source Code
157 |
158 | The full source code is given below:
159 |
160 | ```vb
Sub scrape_quotes()
    Dim browser As InternetExplorer
    Dim page As HTMLDocument
    Dim quotes As Object
    Dim authors As Object

    Set browser = New InternetExplorer
    browser.Visible = True
    browser.navigate ("https://quotes.toscrape.com")
    Do While browser.Busy: Loop

    Set page = browser.document
    Set quotes = page.getElementsByClassName("quote")
    Set authors = page.getElementsByClassName("author")

    For num = 1 To 5
        Cells(num, 1).Value = quotes.Item(num - 1).innerText ' Item is zero-based
        Cells(num, 2).Value = authors.Item(num - 1).innerText
    Next num

    browser.Quit
End Sub
183 | ```
184 |
185 |
186 |
--------------------------------------------------------------------------------
/python/how-to-build-a-price-tracker/README.md:
--------------------------------------------------------------------------------
1 | # How to Build a Price Tracker With Python
2 |
3 | ## Project requirements
4 |
5 | The following price monitoring script works with Python version 3.6 and above. The recommended libraries are as follows:
6 |
7 | `Requests` – for sending HTTP requests. In other words, for downloading web pages without a browser. It’s the essential library for the upcoming price monitoring script.
8 |
9 | `BeautifulSoup` – for querying the HTML for specific elements. It’s a wrapper over a parser library.
10 |
11 | `lxml` – for parsing the HTML. An HTML retrieved by the Requests library is a string that requires parsing into a Python object before querying. Instead of directly using this library, we’ll use BeautifulSoup as a wrapper for a more straightforward API.
12 |
`Price-parser` – a library useful for every price monitoring script. It helps to extract the price component from a string that contains it; a short example follows this list of libraries.
14 |
15 | `smtplib` – for sending emails.
16 |
17 | `Pandas` – for filtering product data and reading and writing CSV files.
18 |
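To get a quick feel for what Price-parser does, here is a minimal standalone sketch (the input string is made up for illustration):

```python
from price_parser import Price

# Parse a price out of an arbitrary string
price = Price.fromstring("Sale price: $29.99")
print(price.amount_float)  # 29.99
print(price.currency)      # $
```
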
19 | Optionally, creating a virtual environment will keep the whole process more organized:
20 |
21 | ```bash
22 | $ python3 -m venv .venv
23 | $ source .venv/bin/activate
24 | ```
25 |
26 | To install the dependencies, open the terminal and run the following command:
27 |
28 | ```bash
$ pip install pandas requests beautifulsoup4 price-parser lxml
30 | ```
31 |
Note that the `smtplib` library is part of the Python Standard Library and doesn’t need to be installed separately.
33 |
34 | Once the installation is complete, create a new Python file and add the following imports:
35 |
36 | ```python
37 | import smtplib
38 | import pandas as pd
39 | import requests
40 | from bs4 import BeautifulSoup
41 | from price_parser import Price
42 | ```
43 |
44 | Additionally, add the following lines for initial configuration:
45 |
46 | ```python
47 | PRODUCT_URL_CSV = "products.csv"
48 | SAVE_TO_CSV = True
PRICES_CSV = "prices.csv"
50 | SEND_MAIL = True
51 | ```
52 |
53 | The CSV that contains the target URLs is supplied as `PRODUCT_URL_CSV`.
54 |
55 | If the `SAVE_TO_CSV` flag is set to `True`, the fetched prices will be saved to the CSV file specified as `PRICES_CSV`.
56 |
57 | `SEND_MAIL` is a flag that can be set to `True` to send email alerts.
58 |
59 | ## Reading a list of product URLs
60 |
61 | The easiest way to store and manage the product URLs is to keep them in a CSV or JSON file. This time we’ll use CSV as it’s easily updatable using a text editor or spreadsheet application.
62 |
The CSV should contain at least two fields — `url` and `alert_price`. The product’s title can be extracted from the product URL or stored in the same CSV file. If the price monitor finds the product price dropping below the value of the `alert_price` field, it’ll trigger an email alert.
64 |
65 | 
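
For illustration, a hypothetical `products.csv` with this structure could look like the snippet below (the product names, URLs, and alert prices here are just examples; use your own targets):

```csv
product,url,alert_price
A Light in the Attic,https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html,50
Tipping the Velvet,https://books.toscrape.com/catalogue/tipping-the-velvet_999/index.html,45
```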
66 |
67 | The CSV file can be read and converted to a dictionary object using Pandas. Let’s wrap this up in a simple function:
68 |
69 | ```python
def get_urls(csv_file):
    df = pd.read_csv(csv_file)
    return df
73 | ```
74 |
The function will return a Pandas DataFrame object that contains three columns — product, URL, and alert_price (see the image above).
76 |
77 | ## Scraping the prices
78 |
79 | The initial step is to loop over the target URLs.
80 |
81 | Note that the `get_urls()` returns a DataFrame object.
82 |
To run a loop, first use the `to_dict()` method of Pandas. When `to_dict()` is called with the `"records"` parameter, it converts the DataFrame into a list of dictionaries.
84 |
85 | Run a loop over each dictionary as follows:
86 |
87 | ```python
def process_products(df):
    for product in df.to_dict("records"):
        pass  # product["url"] is the URL
91 | ```
92 |
93 | We’ll revisit this method after writing two additional functions. The first function is to get the HTML and the second function is to extract the price from it.
94 |
To get the HTML from the response for each URL, use the following function:
96 |
97 | ```python
def get_response(url):
    response = requests.get(url)
    return response.text
101 | ```
102 |
Next, create a BeautifulSoup object from the response and locate the price element using a CSS selector. Use the Price-parser library to extract the price as a float for comparison with the alert price. If you want to better understand how the Price-parser library works, head over to our GitHub repository for examples.
104 |
105 | The following function will extract the price from the given HTML, returning it as a float:
106 |
107 | ```python
def get_price(html):
    soup = BeautifulSoup(html, "lxml")
    el = soup.select_one(".price_color")
    price = Price.fromstring(el.text)
    return price.amount_float
113 | ```
114 |
115 | Note that the CSS selector used in this example is specific to the scraping target. If you are working with any other site, this is the only place where you would have to change the code.
116 |
117 | We’re using BeautifulSoup to locate an element containing the price via CSS selectors. The element is stored in the `el` variable. The text attribute of the `el` tag, `el.text`, contains the price and currency symbol. Price-parser parses this string to extract the price as a float value.
118 |
119 | There is more than one product URL in the DataFrame object. Let’s loop over all the rows and update the DataFrame with new information.
120 |
121 | The easiest approach is to convert each row into a dictionary. This way, you can read the URL, call the `get_price()` function, and update the required fields.
122 |
We’ll add two new keys — the extracted price (`price`) and a boolean value (`alert`) that is later used to filter rows for sending an email.
124 |
125 | The `process_products()` function can now be extended to demonstrate the aforementioned sequence:
126 |
127 | ```python
def process_products(df):
    updated_products = []
    for product in df.to_dict("records"):
        html = get_response(product["url"])
        product["price"] = get_price(html)
        product["alert"] = product["price"] < product["alert_price"]
        updated_products.append(product)
    return pd.DataFrame(updated_products)
136 | ```
137 |
138 | This function will return a new DataFrame object containing the product URL and a name read from the CSV. Additionally, it includes the price and alert flag used to send an email on a price drop.
139 |
140 | ## Saving the output
The final DataFrame containing the updated product data can be saved as CSV using a simple call to the `to_csv()` function.
142 |
143 | Additionally, we’ll check the `SAVE_TO_CSV` flag as follows:
144 |
145 | ```python
if SAVE_TO_CSV:
    df_updated.to_csv(PRICES_CSV, mode="a")
148 | ```
149 |
You’ll notice that the mode is set to "a", which stands for “append”, ensuring that new data is appended if the CSV file already exists.
151 |
152 | ## Sending email alerts
153 |
Optionally, you can send an email alert on a price drop based on the alert flag. First, create a function that filters the data frame and returns the email’s subject and body:
155 |
156 | ```python
def get_mail(df):
    subject = "Price Drop Alert"
    body = df[df["alert"]].to_string()
    subject_and_message = f"Subject:{subject}\n\n{body}"
    return subject_and_message
162 | ```
163 |
164 | Now, using `smtplib`, create another function that sends alert emails:
165 |
166 | ```python
def send_mail(df):
    message_text = get_mail(df)
    with smtplib.SMTP("smtp.server.address", 587) as smtp:
        smtp.starttls()
        smtp.login(mail_user, mail_pass)
        smtp.sendmail(mail_user, mail_to, message_text)
173 | ```
174 |
175 | This code snippet assumes that you’ll set the variables `mail_user`, `mail_pass`, and `mail_to`.
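
These variables are placeholders; a minimal sketch of setting them (with made-up values) might look like this:

```python
# Placeholder SMTP credentials and recipient; replace with your own values
mail_user = "alerts@example.com"   # account used to send the alert
mail_pass = "your-app-password"    # password or app-specific token
mail_to = "you@example.com"        # recipient of the alert
```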
176 |
177 | Putting everything together, this is the main function:
178 |
179 | ```python
def main():
    df = get_urls(PRODUCT_URL_CSV)
    df_updated = process_products(df)
    if SAVE_TO_CSV:
        df_updated.to_csv(PRICES_CSV, index=False, mode="a")
    if SEND_MAIL:
        send_mail(df_updated)
187 | ```
188 |
189 | Execute this function to run the entire code.
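
For example, a minimal entry point that calls `main()` when the script is run directly could look like this:

```python
if __name__ == "__main__":
    main()
```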
190 |
If you wish to run this automatically at certain intervals, use a cron job on macOS/Linux or Task Scheduler on Windows.
192 |
193 | Alternatively, you can also deploy this price monitoring script on any cloud service environment.
194 |
--------------------------------------------------------------------------------
/python/lxml-tutorial/README.md:
--------------------------------------------------------------------------------
1 | # lxml Tutorial: XML Processing and Web Scraping With lxml
2 |
[lxml](https://github.com/topics/lxml) [web-scraping](https://github.com/topics/web-scraping)
4 |
5 | - [Installation](#installation)
6 | - [Creating a simple XML document](#creating-a-simple-xml-document)
7 | - [The Element class](#the-element-class)
8 | - [The SubElement class](#the-subelement-class)
9 | - [Setting text and attributes](#setting-text-and-attributes)
10 | - [Parse an XML file using LXML in Python](#parse-an-xml-file-using-lxml-in-python)
11 | - [Finding elements in XML](#finding-elements-in-xml)
12 | - [Handling HTML with lxml.html](#handling-html-with-lxmlhtml)
13 | - [lxml web scraping tutorial](#lxml-web-scraping-tutorial)
14 | - [Conclusion](#conclusion)
15 |
In this lxml Python tutorial, we will explore the lxml library. We will go through the basics of creating XML documents and then jump into processing XML and HTML documents. Finally, we will put together all the pieces and see how to extract data using lxml.
17 |
18 | For a detailed explanation, see our [blog post](https://oxy.yt/BrAk).
19 |
20 | ## Installation
21 |
22 | The best way to download and install the lxml library is to use the pip package manager. This works on Windows, Mac, and Linux:
23 |
24 | ```shell
25 | pip3 install lxml
26 | ```
27 |
28 | ## Creating a simple XML document
29 |
30 | A very simple XML document would look like this:
31 |
32 | ```xml
<root>
    <branch>
        <branch_one>
        </branch_one>
        <branch_one>
        </branch_one>
    </branch>
</root>
41 | ```
42 |
43 | ## The Element class
44 |
To create an XML document using Python lxml, the first step is to import the `etree` module of lxml:
46 |
47 | ```python
48 | >>> from lxml import etree
49 | ```
50 |
In this example, we will create an HTML document, which is XML compliant. It means that the root element will be named `html`:
52 |
53 | ```python
54 | >>> root = etree.Element("html")
55 | ```
56 |
Similarly, every HTML document will have a `head` and a `body`:
58 |
59 | ```python
60 | >>> head = etree.Element("head")
61 | >>> body = etree.Element("body")
62 | ```
63 |
To create parent-child relationships, we can simply use the `append()` method:
65 |
66 | ```python
67 | >>> root.append(head)
68 | >>> root.append(body)
69 | ```
70 |
This document can be serialized and printed to the terminal with the help of the `tostring()` function:
72 |
73 | ```python
74 | >>> print(etree.tostring(root, pretty_print=True).decode())
75 | ```
76 |
77 | ## The SubElement class
78 |
79 | Creating an `Element` object and calling the `append()` function can make the code messy and unreadable. The easiest way is to use the `SubElement` type:
80 |
81 | ```python
82 | body = etree.Element("body")
83 | root.append(body)
84 |
85 | # is same as
86 |
87 | body = etree.SubElement(root,"body")
88 | ```
89 |
90 | ## Setting text and attributes
91 |
Here is an example of setting the text of an element:
93 |
94 | ```python
95 | para = etree.SubElement(body, "p")
96 | para.text="Hello World!"
97 | ```
98 |
Similarly, attributes can be set using the key-value convention:
100 |
101 | ```python
102 | para.set("style", "font-size:20pt")
103 | ```
104 |
One thing to note here is that attributes can also be passed in the constructor of `SubElement`:
106 |
107 | ```python
108 | para = etree.SubElement(body, "p", style="font-size:20pt", id="firstPara")
109 | para.text = "Hello World!"
110 | ```
111 |
112 | Here is the complete code:
113 |
114 | ```python
115 | from lxml import etree
116 |
117 | root = etree.Element("html")
118 | head = etree.SubElement(root, "head")
119 | title = etree.SubElement(head, "title")
120 | title.text = "This is Page Title"
121 | body = etree.SubElement(root, "body")
122 | heading = etree.SubElement(body, "h1", style="font-size:20pt", id="head")
123 | heading.text = "Hello World!"
124 | para = etree.SubElement(body, "p", id="firstPara")
125 | para.text = "This HTML is XML Compliant!"
126 | para = etree.SubElement(body, "p", id="secondPara")
127 | para.text = "This is the second paragraph."
128 |
129 | etree.dump(root) # prints everything to console. Use for debug only
130 | ```
131 |
132 | Add the following lines at the bottom of the snippet and run it again:
133 |
134 | ```python
with open('input.html', 'wb') as f:
    f.write(etree.tostring(root, pretty_print=True))
137 | ```
138 |
139 | ## Parse an XML file using LXML in Python
140 |
141 | Save the following snippet as input.html.
142 |
143 | ```html
<html>
  <head>
    <title>This is Page Title</title>
  </head>
  <body>
    <h1 style="font-size:20pt" id="head">Hello World!</h1>
    <p id="firstPara">This HTML is XML Compliant!</p>
    <p id="secondPara">This is the second paragraph.</p>
  </body>
</html>
154 | ```
155 |
156 | To get the root element, simply call the `getroot()` method.
157 |
158 | ```python
159 | from lxml import etree
160 |
161 | tree = etree.parse('input.html')
162 | elem = tree.getroot()
163 | etree.dump(elem) #prints file contents to console
164 | ```
165 |
The lxml.etree module exposes another method that can be used to parse contents from a valid XML string — `fromstring()`:
167 |
168 | ```python
xml = '<html><body>Hello</body></html>'
170 | root = etree.fromstring(xml)
171 | etree.dump(root)
172 | ```
173 |
174 | If you want to dig deeper into parsing, we have already written a tutorial on [BeautifulSoup](https://oxylabs.io/blog/beautiful-soup-parsing-tutorial), a Python package used for parsing HTML and XML documents.
175 |
176 | ## Finding elements in XML
177 |
178 | Broadly, there are two ways of finding elements using the Python lxml library. The first is by using the Python lxml querying languages: XPath and ElementPath.
179 |
180 | ```python
181 | tree = etree.parse('input.html')
182 | elem = tree.getroot()
183 | para = elem.find('body/p')
184 | etree.dump(para)
185 |
186 | # Output
# <p id="firstPara">This HTML is XML Compliant!</p>
188 | ```
189 |
190 | Similarly, `findall()` will return a list of all the elements matching the selector.
191 |
192 | ```python
193 | elem = tree.getroot()
194 | para = elem.findall('body/p')
for e in para:
    etree.dump(e)

# Outputs
# <p id="firstPara">This HTML is XML Compliant!</p>
# <p id="secondPara">This is the second paragraph.</p>
201 | ```
202 |
Another way of selecting the elements is by using XPath directly:
204 |
205 | ```python
206 | para = elem.xpath('//p/text()')
for e in para:
    print(e)
209 |
210 | # Output
211 | # This HTML is XML Compliant!
212 | # This is the second paragraph.
213 | ```
214 |
215 | ## Handling HTML with lxml.html
216 |
217 | Here is the code to print all paragraphs from the same HTML file.
218 |
219 | ```python
220 | from lxml import html
with open('input.html') as f:
    html_string = f.read()
tree = html.fromstring(html_string)
para = tree.xpath('//p/text()')
for e in para:
    print(e)
227 |
228 | # Output
229 | # This HTML is XML Compliant!
# This is the second paragraph.
231 | ```
232 |
233 | ## lxml web scraping tutorial
234 |
235 | Now that we know how to parse and find elements in XML and HTML, the only missing piece is getting the HTML of a web page.
236 |
237 | For this, the Requests library is a great choice:
238 |
```shell
240 | pip install requests
241 | ```
242 |
Once the requests library is installed, the HTML of any web page can be retrieved using the `get()` method. Here is an example:
244 |
245 | ```python
246 | import requests
247 |
248 | response = requests.get('http://books.toscrape.com/')
249 | print(response.text)
250 | # prints source HTML
251 | ```
252 |
253 | Here is a quick example that prints a list of countries from Wikipedia:
254 |
255 | ```python
256 | import requests
257 | from lxml import html
258 |
259 | response = requests.get('https://en.wikipedia.org/wiki/List_of_countries_by_population_in_2010')
260 |
261 | tree = html.fromstring(response.text)
262 | countries = tree.xpath('//span[@class="flagicon"]')
for country in countries:
    print(country.xpath('./following-sibling::a/text()')[0])
265 | ```
266 |
267 | The following modified code prints the country name and image URL of the flag.
268 |
269 | ```python
for country in countries:
    flag = country.xpath('./img/@src')[0]
    country = country.xpath('./following-sibling::a/text()')[0]
    print(country, flag)
274 | ```
275 |
276 | ## Conclusion
277 |
278 | If you wish to find out more about XML Processing and Web Scraping With lxml, see our [blog post](https://oxy.yt/BrAk).
279 |
--------------------------------------------------------------------------------
/python/how-to-make-web-scraping-faster/README.md:
--------------------------------------------------------------------------------
1 | # How to Make Web Scraping Faster – Python Tutorial
2 |
3 | ## How do you speed up web scraping in Python?
4 |
5 | There are a few possible approaches that can help increase the scraping speed:
6 |
7 | * Multiprocessing
8 |
9 | * Multithreading
10 |
11 | * Asyncio
12 |
However, let’s first take a look at unoptimized code to make sure the difference between all the approaches is clear.
14 |
15 | ## Web scraping without optimization
16 |
17 | We will be scraping 1000 books from books.toscrape.com. This website is a dummy book store that is perfect for learning.
18 |
19 | ## Preparation
20 |
The first step is to extract all 1,000 links to the books and store them in a CSV file. Run this code file to create the links.csv file. You will need to install the requests and Beautiful Soup packages for this code to work.
22 |
23 | ```python
24 | import requests
25 | from bs4 import BeautifulSoup
26 | from urllib.parse import urljoin
27 |
def fetch_links(url="https://books.toscrape.com/", links=[]):
    r = requests.get(url)
    print(r.url, flush=True)
    soup = BeautifulSoup(r.text, "html.parser")

    for link in soup.select("h3 a"):
        links.append(urljoin(url, link.get("href")))

    next_page = soup.select_one("li.next a")
    if next_page:
        return fetch_links(urljoin(url, next_page.get("href")), links)
    else:
        return links

def refresh_links():
    links = fetch_links()

    with open('links.csv', 'w') as f:
        for link in links:
            f.write(link + '\n')

refresh_links()
52 | ```
53 |
54 | The fetch_links function will retrieve all the links, and refresh_links() will store the output in a file. We skipped sending the user agent as this is a test site. However, you can do so easily using the requests library.
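
For instance, a short sketch of sending a custom user agent with requests (the header value here is just an example) would be:

```python
headers = {"User-Agent": "Mozilla/5.0 (compatible; my-scraper/1.0)"}
r = requests.get("https://books.toscrape.com/", headers=headers)
```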
55 |
56 | ## Writing unoptimized web scraper
57 |
We will focus on optimizing the scraping of these 1,000 pages in Python.
59 |
60 | First, install the requests library using pip:
61 |
62 | ```bash
63 | pip install requests
64 | ```
65 |
To keep things simple, we will use regular expressions to extract the title element of the page. Note the `get_links` function that loads the URLs we saved in the previous step.
67 |
68 | ```python
69 | import csv
70 | import re
71 | import time
72 | import requests
73 |
def get_links():
    links = []
    with open("links.csv", "r") as f:
        reader = csv.reader(f)
        for i, row in enumerate(reader):
            links.append(row[0])

    return links

def get_response(session, url):
    with session.get(url) as resp:
        print('.', end='', flush=True)
        text = resp.text
        exp = r'(<title>).*(<\/title>)'
        return re.search(exp, text, flags=re.DOTALL).group(0)

def main():
    start_time = time.time()
    with requests.Session() as session:
        results = []
        for url in get_links():
            result = get_response(session, url)
            print(result)

    print(f"{(time.time() - start_time):.2f} seconds")

main()
101 | ```
102 |
The code without optimization took 288.62 seconds.
104 |
105 | ## Web scraping using multiprocessing
106 |
107 | Multiprocessing, as the name suggests, is utilizing more than one processor. Most modern computers have more than one CPU core, if not multiple CPUs. Using the multiprocessing module, included with the Python standard library, we can write code that uses all these cores.
108 |
109 | For example, if we have an 8-core CPU, we can essentially write code that can split the task into eight different processes where each process runs in a separate CPU core.
110 |
Note that this approach is more suitable when the code is CPU-bound. We will still see some improvements in our case, though.
112 |
The first step is to import `Pool` and `cpu_count` from the multiprocessing module:
114 |
115 | ```python
from multiprocessing import Pool, cpu_count
117 | ```
118 |
The other changes are required in both the `get_response` and `main` functions.
120 |
121 | ```python
def get_response(url):
    resp = requests.get(url)
    print('.', end='', flush=True)
    text = resp.text

    exp = r'(<title>).*(<\/title>)'
    return re.search(exp, text, flags=re.DOTALL).group(0)

def main():
    start_time = time.time()
    links = get_links()

    with Pool(cpu_count()) as p:
        results = p.map(get_response, links)

    for result in results:
        print(result)

    print(f"{(time.time() - start_time):.2f} seconds")
141 | ```
142 |
The most critical line of the code is where we create a Pool. Note that we are using the `cpu_count()` function to get the number of CPU cores dynamically. This ensures that the code runs on every machine without any change.
144 |
In our example, the execution time came down to about 142 seconds from 288 seconds on a machine with eight cores. This, as expected, is not a vast improvement. Remember that multiprocessing is suitable when the code is CPU-bound. Our code is I/O-bound; thus, we don’t see much improvement.
146 |
147 | ## Web scraping using multithreading
148 |
Multithreading is a great option to optimize web scraping code. A thread is essentially a separate flow of execution. Operating systems typically spawn hundreds of threads and switch the CPU time among these. The switching is so fast that we get the illusion of multitasking. The operating system controls this switching, and it cannot be customized.
150 |
151 | Using the `concurrent.futures` module of Python, we can customize how many threads we create to optimize our code. There is only one huge caveat: managing threads can become messy and error-prone as the code becomes more complex.
152 |
153 | To change our code to utilize multithreading, minimal changes are needed.
154 |
155 | First, import `ThreadPoolExecutor`.
156 |
157 | ```python
158 | from concurrent.futures import ThreadPoolExecutor
159 | ```
160 |
Next, instead of creating a `Pool`, create a `ThreadPoolExecutor`:
162 |
163 | ```python
with ThreadPoolExecutor(max_workers=100) as p:
    results = p.map(get_response, links)
166 | ```
167 |
Note that you have to specify `max_workers`. This number will depend on the complexity of the code. Setting it too high can hurt performance because of the overhead of creating and managing that many threads.
169 |
With this change, the code completed execution in 12.10 seconds.
171 |
172 | For reference, the unoptimized code took 288 seconds. This is a massive improvement.
173 |
174 | ## Asyncio for asynchronous programming
175 |
Asynchronous coding using the asyncio module is essentially cooperative multitasking: instead of the operating system, the code itself controls the context switching. It also makes coding more effortless and less error-prone. Specifically, for web scraping projects, this is the most suitable approach.
177 |
178 | This approach requires quite a lot of changes. First, the requests library will not work. Instead, we will use the aiohttp library for web scraping in Python. This requires a separate installation:
179 |
180 | ```bash
181 | python3 -m pip install aiohttp
182 | ```
183 |
184 | Next, import `asyncio` and `aiohttp` modules.
185 |
186 | ```python
187 | import aiohttp
188 | import asyncio
189 | ```
190 |
The `get_response()` function now needs to change to a coroutine. Also, we will be using the same session for every request. Optionally, you can send the user agent if needed.
192 |
193 | Note the use of `async` and `await` keywords.
194 |
195 | ```python
async def get_response(session, url):
    async with session.get(url) as resp:
        text = await resp.text()

        exp = r'(<title>).*(<\/title>)'
        return re.search(exp, text, flags=re.DOTALL).group(0)
202 | ```
203 |
204 | The most significant changes are in the `main()` function.
205 |
206 | First, it needs to change to a coroutine. Next, we will use `aiohttp.ClientSession` to create the session object. Most importantly, we will need to create tasks for all the links. Finally, all the tasks will be sent to an event loop using the `asyncio.gather` method.
207 |
208 | ```python
async def main():
    start_time = time.time()
    async with aiohttp.ClientSession() as session:

        tasks = []
        for url in get_links():
            tasks.append(asyncio.create_task(get_response(session, url)))

        results = await asyncio.gather(*tasks)
        for result in results:
            print(result)

    print(f"{(time.time() - start_time):.2f} seconds")
222 | ```
223 |
Lastly, to run the `main()` coroutine, we would need to use `asyncio.run(main())`.
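
Putting it together, a minimal entry point could look like this:

```python
if __name__ == "__main__":
    asyncio.run(main())
```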
225 |
226 | This execution took 9.43 seconds.
227 |
228 | As you can see, the asyncio approach was the fastest. This, however, requires an entirely new way of thinking. If you have experience with async-await in any programming language, you will find it familiar.
229 |
--------------------------------------------------------------------------------
/csharp/csharp-web-scraping/README.md:
--------------------------------------------------------------------------------
1 | # Web Scraping With C#
2 |
3 | ## Setup Development environment
4 |
5 | ```bash
6 | dotnet --version
7 | ```
8 | ## Project Structure and Dependencies
9 |
10 | ```bash
11 | dotnet new console
12 | ```
13 |
14 | ```bash
15 | dotnet add package HtmlAgilityPack
16 | ```
17 |
18 | ```bash
19 | dotnet add package CsvHelper
20 | ```
21 |
22 | ## Download and Parse Web Pages
23 |
The first step of any web scraping program is to download the HTML of a web page. This HTML will be a string that you’ll need to convert into an object that can be processed further. The latter part is called parsing. Html Agility Pack can load and parse HTML from local files, HTML strings, any URL, and even a browser.
25 |
26 | In our case, all we need to do is get HTML from a URL. Instead of using .NET native functions, Html Agility Pack provides a convenient class – HtmlWeb. This class offers a Load function that can take a URL and return an instance of the HtmlDocument class, which is also part of the package we use. With this information, we can write a function that takes a URL and returns an instance of HtmlDocument.
27 |
Open the `Program.cs` file and enter this function in the `Program` class:
29 |
30 | ```csharp
31 | // Parses the URL and returns HtmlDocument object
32 | static HtmlDocument GetDocument(string url)
{
    HtmlWeb web = new HtmlWeb();
    HtmlDocument doc = web.Load(url);
    return doc;
}
38 | ```
39 |
40 | With this, the first step of the code is complete. The next step is to parse the document.
41 |
42 | ## Parsing the HTML: Getting Book Links
43 |
In this part of the code, we’ll be extracting the required information from the web page. At this stage, the document is an object of type HtmlDocument. This class exposes two functions to select the elements. Both functions accept XPath as input and return HtmlNode or HtmlNodeCollection. Here is the signature of these two functions:
45 |
46 | ```csharp
47 | public HtmlNodeCollection SelectNodes(string xpath);
48 | ```
49 |
50 | ```csharp
51 | public HtmlNode SelectSingleNode(string xpath);
52 | ```
53 |
54 | Let’s discuss `SelectNodes` first.
55 |
For this example – a C# web scraper – we are going to scrape all the book details from the Mystery category of books.toscrape.com. First, the category page needs to be parsed so that all the links to the books can be extracted. To do that, open the page in the browser, right-click any of the book links and click Inspect. This will open the Developer Tools.
57 |
After spending some time with the markup, your XPath to select the book links should be something like this:
59 |
```
61 | //h3/a
62 | ```
63 |
64 | This XPath can now be passed to the `SelectNodes` function.
65 |
66 | ```csharp
67 | HtmlDocument doc = GetDocument(url);
68 | HtmlNodeCollection linkNodes = doc.DocumentNode.SelectNodes("//h3/a");
69 | ```
70 |
71 | Note that the `SelectNodes` function is being called by the `DocumentNode` attribute of the `HtmlDocument`.
72 |
73 | The variable `linkNodes` is a collection. We can write a `foreach` loop over it and get the `href` from each link one by one. There is one tiny problem that we need to take care of – the links on the page are relative. Hence, they need to be converted into an absolute URL before we can scrape these extracted links.
74 |
75 | For converting the relative URLs, we can make use of the `Uri` class. We can use this constructor to get a `Uri` object with an absolute URL.
76 |
77 | ```csharp
78 | Uri(Uri baseUri, string? relativeUri);
79 | ```
80 |
81 | Once we have the Uri object, we can simply check the `AbsoluteUri` property to get the complete URL.
82 |
83 | We can write all this in a function to keep the code organized.
84 |
85 | ```csharp
static List<string> GetBookLinks(string url)
{
    var bookLinks = new List<string>();
    HtmlDocument doc = GetDocument(url);
    HtmlNodeCollection linkNodes = doc.DocumentNode.SelectNodes("//h3/a");
    var baseUri = new Uri(url);
    foreach (var link in linkNodes)
    {
        string href = link.Attributes["href"].Value;
        bookLinks.Add(new Uri(baseUri, href).AbsoluteUri);
    }
    return bookLinks;
}
99 | ```
100 |
In this function, we are starting with an empty `List<string>` object. In the `foreach` loop, we are adding all the links to this object and returning it.
102 |
103 | Now, it’s time to modify the `Main()` function so that we can test the C# code that we have written so far. Modify the function so that it looks like this:
104 |
105 | ```csharp
static void Main(string[] args)
{
    var bookLinks = GetBookLinks("http://books.toscrape.com/catalogue/category/books/mystery_3/index.html");
    Console.WriteLine("Found {0} links", bookLinks.Count);
}
111 | ```
112 |
113 | To run this code, open the terminal and navigate to the directory which contains this file, and type in the following:
114 |
115 | ```bash
116 | dotnet run
117 | ```
118 |
119 | The output should be as follows:
120 |
121 | ```
122 | Found 20 links
123 | ```
124 |
125 | Let’s move to the next part where we will be processing all the links to get the book data.
126 |
127 | ## Parsing the HTML: Getting Book Details
128 |
At this point, we have a list of strings that contain the URLs of the books. We can simply write a loop that will first get the document using the `GetDocument` function that we’ve already written. After that, we’ll use the `SelectSingleNode` function to extract the title and the price of the book.
130 |
131 | To keep the data organized, let’s start with a class. This class will represent a book. This class will have two properties – Title and Price. It will look like this:
132 |
133 | ```csharp
public class Book
{
    public string Title { get; set; }
    public string Price { get; set; }
}
139 | ```
140 |
Now, open a book page in the browser and create the XPath for the title – `//h1`. Creating an XPath for the price is a little trickier because the additional books at the bottom have the same class applied.
142 |
143 | 
144 |
145 | The XPath of the price will be this:
146 |
147 | ```
148 | //div[contains(@class,"product_main")]/p[@class="price_color"]
149 | ```
150 |
Note that the XPath contains double quotes. In C#, we will have to escape these characters by prefixing them with a backslash.
152 |
153 | Now we can use the `SelectSingleNode` function to get the Node, and then employ the `InnerText` property to get the text contained in the element. We can organize everything in a function as follows:
154 |
155 | ```csharp
static List<Book> GetBookDetails(List<string> urls)
{
    var books = new List<Book>();
    foreach (var url in urls)
    {
        HtmlDocument document = GetDocument(url);
        var titleXPath = "//h1";
        var priceXPath = "//div[contains(@class,\"product_main\")]/p[@class=\"price_color\"]";
        var book = new Book();
        book.Title = document.DocumentNode.SelectSingleNode(titleXPath).InnerText;
        book.Price = document.DocumentNode.SelectSingleNode(priceXPath).InnerText;
        books.Add(book);
    }
    return books;
}
171 | ```
172 |
173 | This function will return a list of `Book` objects. It’s time to update the `Main()` function as well:
174 |
175 | ```csharp
static void Main(string[] args)
{
    var bookLinks = GetBookLinks("http://books.toscrape.com/catalogue/category/books/mystery_3/index.html");
    Console.WriteLine("Found {0} links", bookLinks.Count);
    var books = GetBookDetails(bookLinks);
}
182 | ```
183 |
184 | ## Exporting Data
185 | If you haven’t yet installed the `CsvHelper`, you can do this by running the command `dotnet add package CsvHelper` from within the terminal.
186 |
The export function is pretty straightforward. First, we need to create a `StreamWriter` and pass the CSV file name as the parameter. Next, we will use this object to create a `CsvWriter`. Finally, we can use the `WriteRecords` function to write all the books in just one line of code.
188 |
189 | To ensure that all the resources are closed properly, we can use the `using` block. We can also wrap everything in a function as follows:
190 |
191 | ```csharp
static void exportToCSV(List<Book> books)
{
    using (var writer = new StreamWriter("./books.csv"))
    using (var csv = new CsvWriter(writer, CultureInfo.InvariantCulture))
    {
        csv.WriteRecords(books);
    }
}
200 | ```
201 |
202 | Finally, we can call this function from the `Main()` function:
203 |
204 | ```csharp
static void Main(string[] args)
{
    var bookLinks = GetBookLinks("http://books.toscrape.com/catalogue/category/books/mystery_3/index.html");
    var books = GetBookDetails(bookLinks);
    exportToCSV(books);
}
211 | ```
212 |
213 | That’s it! To run this code, open the terminal and run the following command:
214 |
215 | ```bash
216 | dotnet run
217 | ```
218 |
219 | Within seconds, you will have a `books.csv` file created.
220 |
--------------------------------------------------------------------------------
/python/web-scraping-machine-learning/README.md:
--------------------------------------------------------------------------------
1 | # Web Scraping for Machine Learning
2 |
3 | ## Project requirements
4 |
5 | ```bash
6 | $ python3 -m pip install requests_html beautifulsoup4
7 | ```
8 |
9 | ```bash
$ python3 -m pip install pandas numpy matplotlib seaborn tensorflow scikit-learn
11 | ```
12 |
13 | ## Extracting the data
14 |
For machine learning projects, Jupyter Notebook is a great choice as it makes it easy to run and rerun a few lines of code at a time. Moreover, the plots appear in the same notebook.
16 |
17 | Begin with importing required libraries as follows:
18 |
19 | ```python
20 | from requests_html import HTMLSession
21 | import pandas as pd
22 | ```
23 |
24 | For web scraping, we only need `Requests-HTML`. The primary reason is that `Requests-HTML` is a powerful library that can handle all our web scraping tasks, such as extracting the HTML code from websites and parsing this code into Python objects. Further benefits come from the library’s ability to function as an HTML parser, meaning collecting data and labeling can be performed using the same library.
25 |
26 | Next, we use Pandas for loading the data in a DataFrame for further processing.
27 |
28 | In the next cell, create a session and get the response from your target URL.
29 |
30 | ```python
31 | url = 'http://your-target-url'
32 | session = HTMLSession()
33 | r = session.get(url)
34 | ```
35 |
36 | After this, use XPath to select the desired data. It’ll be easier if each row is represented as a dictionary where the key is the column name. All these dictionaries can then be added to a list.
37 |
38 | ```python
rows = r.html.xpath('//table/tbody/tr')
symbol = 'AAPL'
data = []
for row in rows:
    if len(row.xpath('.//td')) < 7:
        continue
    data.append({
        'Symbol': symbol,
        'Date': row.xpath('.//td[1]/span/text()')[0],
        'Open': row.xpath('.//td[2]/span/text()')[0],
        'High': row.xpath('.//td[3]/span/text()')[0],
        'Low': row.xpath('.//td[4]/span/text()')[0],
        'Close': row.xpath('.//td[5]/span/text()')[0],
        'Adj Close': row.xpath('.//td[6]/span/text()')[0],
        'Volume': row.xpath('.//td[7]/span/text()')[0]
    })
55 | ```
56 |
The scraped results are stored in the `data` variable, which is a list of dictionaries that can easily be converted to a data frame. Building each row as a dictionary, with column names as keys, also completes the vital step of data labeling.
58 |
59 | 
60 |
61 | The provided example’s data frame is not yet ready for the machine learning step. It still needs additional cleaning.
62 |
63 | ## Cleaning the data
64 |
Now that the data has been collected using web scraping, we need to clean it up. Before doing anything else, it’s recommended to verify the data frame by running `df.info()`.
66 |
67 | 
68 |
As evident from the above screenshot, all the columns have the `object` data type. For machine learning algorithms, these should be numbers.
70 |
71 | Dates can be handled using `Pandas.to_datetime`. It’ll take a series and convert the values to `datetime`. This can then be used as follows:
72 |
73 | ```python
74 | df['Date'] = pd.to_datetime(df['Date'])
75 | ```
76 |
The remaining columns, however, are not automatically converted to numbers because they contain comma separators.
78 |
Thankfully, there are multiple ways to handle this. The easiest one is to remove the comma by calling the `str.replace()` function. The `astype` function can also be called in the same line, which will then return a `float`.
80 |
81 | ```python
str_cols = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']
83 | df[str_cols]=df[str_cols].replace(',', '', regex=True).astype(float)
84 | ```
85 |
Finally, if there are any `None` or `NaN` values, they can be deleted by calling `dropna()`:
87 |
88 | ```python
89 | df.dropna(inplace=True)
90 | ```
91 |
92 | As the last step, set the `Date` column as the index and preview the data frame.
93 |
94 | ```python
95 | df = df.set_index('Date')
96 | df.head()
97 | ```
98 |
99 | 
100 |
101 | The data frame is now clean and ready to be sent to the machine learning model.
102 |
103 | ## Visualizing the data
104 |
105 | Before we begin the section on machine learning, let’s have a quick look at the closing price trend.
106 |
107 | First, import the packages and set the plot styles:
108 |
109 | ```python
110 | import matplotlib.pyplot as plt
111 | import seaborn as sns
112 | sns.set_style('darkgrid')
113 | plt.style.use("ggplot")
114 | ```
115 |
Next, enter the following lines to plot `Adj Close`, which is the adjusted closing price:
117 |
118 | ```python
119 | plt.figure(figsize=(15, 6))
120 | df['Adj Close'].plot()
121 | plt.ylabel('Adj Close')
122 | plt.xlabel(None)
123 | plt.title('Closing Price of AAPL')
124 | ```
125 |
126 | 
127 |
128 | ## Preparing data for machine learning
129 |
130 | The first step to machine learning is the selection of features and values we want to predict.
131 |
In this example, the adjusted close is derived from the closing price, so the two are strongly related. Therefore, we’ll ignore the `Close` column and focus on predicting `Adj Close`.
133 |
134 | The features are usually stored in a variable named `X` and the values that we want to predict are stored in a variable `y`.
135 |
136 | ```python
137 | features = ['Open', 'High', 'Low', 'Volume']
138 | y = df.filter(['Adj Close'])
139 | ```
140 |
The next step we have to consider is feature scaling. It’s used to normalize the features, i.e., the independent variables. Within our example, we can use `MinMaxScaler`. This class is part of the preprocessing module of the scikit-learn library.
142 |
First, we’ll create an object of this class. Then, we’ll fit and transform the values using the `fit_transform` method as follows:
144 |
145 | ```python
146 | from sklearn.preprocessing import MinMaxScaler
147 | scaler = MinMaxScaler()
148 | X = scaler.fit_transform(df[features])
149 | ```
150 |
151 | The next step is splitting the data we have received into two datasets, test and training.
152 |
The example we’re working with is time-series data, i.e., data that changes over a time period, which requires specialized handling when splitting. The `TimeSeriesSplit` function from SKLearn’s `model_selection` module will be what we need here.
154 |
155 | ```python
from sklearn.model_selection import TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=10)
for train_index, test_index in tscv.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
161 | ```
162 |
Our approach will be creating a neural network that uses an LSTM, or Long Short-Term Memory, layer. LSTM expects a 3-dimensional input with information about the batch size, timesteps, and input dimensions. We need to reshape the features as follows:
164 |
165 | ```python
166 | X_train = X_train.reshape(X_train.shape[0], 1, X_train.shape[1])
167 | X_test = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])
168 | ```
169 |
170 | ## Training the model and predictions
171 | We’re now ready to create a model. Import the `Sequential` model, `LSTM` layer, and `Dense` layer from Keras as follows:
172 |
173 | ```python
174 | from keras.models import Sequential
175 | from keras.layers import LSTM, Dense
176 | ```
177 |
178 | Continue by creating an instance of the Sequential model and adding two layers. The first layer will be an LSTM with 32 units while the second will be a Dense layer.
179 |
180 | ```python
181 | model = Sequential()
182 | model.add(LSTM(32, activation='relu', return_sequences=False))
183 | model.add(Dense(1))
184 | model.compile(loss='mean_squared_error', optimizer='adam')
185 | ```
186 |
187 | The model can be trained with the following line of code:
188 |
189 | ```python
190 | model.fit(X_train, y_train, epochs=100, batch_size=8)
191 | ```
192 |
The predictions can then be made using this line of code:
194 |
195 | ```python
y_pred = model.predict(X_test)
197 | ```
198 |
199 | Finally, let’s plot the actual values and predicted values with the following:
200 |
201 | ```python
202 | plt.figure(figsize=(15, 6))
203 | plt.plot(y_test, label='Actual Value')
204 | plt.plot(y_pred, label='Predicted Value')
205 | plt.ylabel('Adjusted Close (Scaled)')
206 | plt.xlabel('Time Scale')
207 | plt.legend()
208 | ```
209 |
210 | 
211 |
--------------------------------------------------------------------------------
/javascript/node-js-fetch-api/README.md:
--------------------------------------------------------------------------------
1 | # How to Make HTTP Requests in Node.js With Fetch API
2 |
3 | ## What is Fetch API
4 |
5 | Fetch API is an application programming interface for fetching network resources. It facilitates making HTTP requests such as GET, POST, etc.
6 |
7 | Fetch API supports new standards, such as Promise, resulting in cleaner code that doesn’t require callbacks.
8 |
9 | The native support for the Fetch API exists in all major browsers. JavaScript developers rely on the node-fetch package for the server-side code. The package is wildly popular, with millions of downloads every week.
10 |
11 | Node.js has released experimental support for the Fetch API with version 17.5. Since then, you can write your server-side JavaScript code that uses the Fetch API without installing a third-party library. To do so, run the following command:
12 |
13 | ```node
14 | node --experimental-fetch your_code.js
15 | ```
16 |
17 | ## How to use Fetch API
18 |
19 | For the following examples, a dummy website will be used as a target. As the Fetch API returns a Promise object, you can use the fetch-then syntax. To see Node Fetch in action, create a file using a code editor and enter the following lines of code:
20 |
21 | ```javascript
fetch('https://quotes.toscrape.com/random')
  .then((response) => response.text())
  .then((body) => {
    console.log(body);
  });
27 | ```
28 |
29 | This code sends an HTTP GET request and prints the HTML.
30 |
31 | To explain it further, the `fetch()` method returns a Promise object. The first `then()` extracts the text from the response, and the second `then()` prints the response HTML.
32 |
33 | Save it as `quotes.js`, open the terminal, and run the following:
34 |
35 | ```node
36 | node --experimental-fetch quotes.js
37 | ```
38 |
39 | It'll print the HTML of the page. Additionally, it may also print a warning that Fetch is an experimental feature.
40 |
41 | The same code for Node Fetch can also be written using the `async-await` syntax as follows:
42 |
43 | ```javascript
(async () => {
  const response = await fetch('https://quotes.toscrape.com/random');
  const body = await response.text();
  console.log(body);
})();
49 | ```
50 |
51 | If you want to extend the code to create a web scraper, you can install a parser such as `Cheerio` and extract specific elements. The following example extracts a quote:
52 |
53 | ```javascript
54 | const cheerio = require("cheerio");
55 |
fetch('https://quotes.toscrape.com/random')
  .then((response) => response.text())
  .then((body) => {
    const $ = cheerio.load(body);
    console.log($('.text').text());
  });
63 | ```
64 |
65 | If you want to learn more about web scraping with JavaScript and `Node.js`, see this blog post.
66 |
67 | ## HTTP headers in Fetch API
68 |
69 | Now, let's talk about the response headers. The response object contains all of the response headers in the `response.headers` collection. If you wish to print the response headers, you can do so as follows:
70 |
71 | ```javascript
const url = 'https://httpbin.org/get';
fetch(url)
  .then(response => {
    for (const pair of response.headers) {
      console.log(`${pair[0]}: ${pair[1]}`);
    }
    return response.text();
  }).then(data => {
    console.log(data);
  });
82 | ```
83 |
84 | While running this code using Node.js, you’ll see all of the response headers as expected. However, things will be unexpectedly different when running in the browser. If a server you attempt to query has CORS headers enabled, your browser will limit the headers you can access for security reasons.
85 |
You’ll only be able to access the following headers: `Cache-Control`, `Content-Language`, `Content-Type`, `Expires`, `Last-Modified`, and `Pragma`. Read more about it here.
87 |
88 | It’s also possible to send custom request headers using the second parameter of `fetch()`, where various options can be set, including headers. The following example shows how to send a custom user-agent in the HTTP request:
89 |
90 | ```javascript
const url = 'https://httpbin.org/get';
fetch(url, {
  headers: {
    "User-Agent": "My User Agent",
  },
})
  .then((response) => response.json())
  .then(data => {
    console.log(data);
  });
101 | ```
102 |
103 | As discussed in the next section, the second parameter can be used for additional functionality.
104 |
105 | ## Sending POST requests
106 |
107 | The default request method used by the Fetch API is GET. However, it’s possible to send a POST request as follows:
108 |
109 | ```javascript
fetch(url, {method: "POST"})
111 | ```
112 |
113 | Let’s practice sending some dummy data to a test website. You’ll need to convert the data you want to send in the HTTP POST request into a string:
114 |
115 | ```javascript
const url = 'https://httpbin.org/post';
const data = {
  x: 1920,
  y: 1080,
};
const customHeaders = {
  "Content-Type": "application/json",
};

fetch(url, {
  method: "POST",
  headers: customHeaders,
  body: JSON.stringify(data),
})
  .then((response) => response.json())
  .then((data) => {
    console.log(data);
  });
134 | ```
135 |
Notice how we set `method: "POST"` and use `JSON.stringify(data)` to convert the data into a string.
137 |
138 | Similarly, you can also use the HTTP methods such as `DELETE`, `PUT`, etc.
139 |
## Exception handling

As the Node Fetch API returns a Promise object, you can use the `fetch-then-catch` convention to handle errors:
142 |
143 | ```javascript
fetch('https://invalid_url')
  .then((response) => response.text())
  .then((body) => {
    console.log(body);
  }).catch((error) => {
    console.error('error in execution', error);
  });
151 | ```
152 |
If you’re using the `async-await` syntax, you can handle errors with the `try-catch` block as follows:
154 |
155 | ```javascript
(async () => {
  try {
    const response = await fetch('https://invalid_url');
    const body = await response.text();
    console.log(body);
  } catch (error) {
    console.error(error);
  }
})();
165 | ```
166 |
167 | ## Axios vs Fetch API
168 |
169 | Axios is a popular Node package for making HTTP `GET` and `POST` requests with ease. Make sure to check our tutorial on web scraping with JavaScript and Node.js to see a practical example of Axios.
170 |
171 | To send a GET request, call the `get()` method as follows:
172 |
173 | ```javascript
174 | const response = await axios.get(url);
175 | ```
176 |
177 | Similarly, to send a POST request, call the `post()` method as follows:
178 |
179 | ```javascript
180 | const response = await axios.post(url);
181 | ```
182 |
183 | Let's take an example to see how the Node Fetch API differs from Axios. Send a `POST` request to https://httpbin.org/post with JSON data. The important things to note here are the following:
184 |
185 | * JSON data.
186 |
187 | * Custom request headers.
188 |
189 | * The response will be in JSON format
190 |
Writing the same code using Axios and the Fetch API will highlight the differences.
192 |
193 | The following code uses Axios:
194 |
195 | ```javascript
const axios = require('axios');
const url = 'https://httpbin.org/post';
const data = {
  x: 1920,
  y: 1080,
};
const customHeaders = {
  "Content-Type": "application/json",
};
axios.post(url, data, {
  headers: customHeaders,
})
  .then(({ data }) => {
    console.log(data);
  })
  .catch((error) => {
    console.error(error);
  });
214 | ```
215 |
216 | And the code below uses Fetch API:
217 |
218 | ```javascript
const url = 'https://httpbin.org/post';
const data = {
  x: 1920,
  y: 1080,
};
const customHeaders = {
  "Content-Type": "application/json",
};

fetch(url, {
  method: "POST",
  headers: customHeaders,
  body: JSON.stringify(data),
})
  .then((response) => response.json())
  .then((data) => {
    console.log(data);
  })
  .catch((error) => {
    console.error(error);
  });
240 | ```
241 |
242 | Both of these code snippets will produce the same output.
243 |
244 | As evident from the examples above, here are the differences between Axios and Fetch API:
245 |
246 | * Fetch API uses the `body` property of the request, while Axios uses the `data` property.
247 |
248 | * Using Axios, JSON data can be sent directly, while Fetch API requires the conversion to a string.
249 |
250 | * Axios can handle JSON directly. The Fetch API requires the `response.json()` method to be called first to get the response in JSON format.
251 |
* With Axios, the response body is always exposed via the `data` property, while with the Fetch API the parsed body can be assigned to a variable with any name.
253 |
* Axios provides an easy way to monitor upload and download progress using progress events. There is no direct method for this in the Fetch API.
255 |
256 | * Fetch API does not support interceptors, while Axios does.
257 |
258 | * Fetch API allows streaming of a response, while Axios doesn’t.
259 |
--------------------------------------------------------------------------------