├── Beautiful Soup ├── beautifulsoup.csv └── bs-scraping.py ├── DBM ├── read_sqlite.py └── sqlite.py ├── README.md └── Selenium Web Scraping ├── amazon ├── amazon.db ├── amazon_alchemy.db ├── chromedriver.exe ├── dbm.py ├── product_info_amazon.csv ├── run_amazon.bat └── scraping-amazon.py └── lazada ├── assets └── stylesheet.css ├── chromedriver.exe ├── dashboard.py ├── dbm.py ├── lazada_alchemy.db ├── product_info.csv ├── run_lazada.bat ├── scraping_lazada.py └── smtp_alert.py /Beautiful Soup/beautifulsoup.csv: -------------------------------------------------------------------------------- 1 | Country,City,Notes 2 | ,, 3 | United Arab Emirates,Abu Dhabi," 4 | " 5 | Nigeria,Abuja,"Lagos was the capital from 1914 to 1991. 6 | " 7 | Ghana,Accra," 8 | " 9 | Pitcairn Islands,Adamstown,"British Overseas Territory 10 | " 11 | Ethiopia,Addis Ababa," 12 | " 13 | Algeria,Algiers," 14 | " 15 | Niue,Alofi,"Self-governing in free association with New Zealand 16 | " 17 | Jordan,Amman," 18 | " 19 | Netherlands,Amsterdam (official)The Hague (de facto),"The Dutch constitution refers to Amsterdam as the ""capital"". The Dutch government is located in The Hague, which also hosts the monarch, government ministries, judiciary and diplomatic missions. See Capital of the Netherlands. 20 | " 21 | Andorra,Andorra la Vella," 22 | " 23 | Turkey,Ankara," 24 | " 25 | Madagascar,Antananarivo," 26 | " 27 | Samoa,Apia," 28 | " 29 | Turkmenistan,Ashgabat," 30 | " 31 | Eritrea,Asmara," 32 | " 33 | Paraguay,Asunción," 34 | " 35 | Greece,Athens," 36 | " 37 | Cook Islands,Avarua,"Self-governing in free association with New Zealand. 
38 | " 39 | Iraq,Baghdad," 40 | " 41 | Azerbaijan,Baku," 42 | " 43 | Mali,Bamako," 44 | " 45 | Brunei,Bandar Seri Begawan," 46 | " 47 | Thailand,Bangkok," 48 | " 49 | Central African Republic,Bangui," 50 | " 51 | Gambia,Banjul," 52 | " 53 | Saint Kitts and Nevis,Basseterre," 54 | " 55 | China,Beijing,"See also: List of historical capitals of China 56 | " 57 | Lebanon,Beirut," 58 | " 59 | Serbia,Belgrade," 60 | " 61 | Belize,Belmopan," 62 | " 63 | Germany,Berlin," 64 | " 65 | Switzerland,Bern,"De facto capital 66 | " 67 | Kyrgyzstan,Bishkek," 68 | " 69 | Guinea-Bissau,Bissau," 70 | " 71 | Colombia,Bogotá," 72 | " 73 | Brazil,Brasília," 74 | " 75 | Slovakia,Bratislava," 76 | " 77 | Republic of the Congo,Brazzaville," 78 | " 79 | Barbados,Bridgetown," 80 | " 81 | Belgium,Brussels,"Also the de facto capital of the  European Union 82 | " 83 | Romania,Bucharest," 84 | " 85 | Hungary,Budapest," 86 | " 87 | Argentina,Buenos Aires," 88 | " 89 | Egypt,Cairo," 90 | " 91 | Australia,Canberra," 92 | " 93 | Venezuela,Caracas," 94 | " 95 | Saint Lucia,Castries," 96 | " 97 | United States Virgin Islands,Charlotte Amalie,"Territory of the U.S. 98 | " 99 | Moldova,Chișinău," 100 | " 101 | Turks and Caicos Islands,Cockburn Town,"British Overseas Territory 102 | " 103 | Guinea,Conakry," 104 | " 105 | Denmark,Copenhagen," 106 | " 107 | Senegal,Dakar," 108 | " 109 | Syria,Damascus," 110 | " 111 | Bangladesh,Dhaka," 112 | " 113 | East Timor,Dili," 114 | " 115 | Djibouti,Djibouti," 116 | " 117 | Tanzania,"Dodoma (official, legislative)Dar es Salaam (de facto, judicial)","Dar es Salaam, the former capital and largest city continues to serve as the de facto seat of government hosting the presidency, government ministries, judiciary and diplomatic missions. 
118 | " 119 | Qatar,Doha," 120 | " 121 | Isle of Man,Douglas,"British Crown Dependency 122 | " 123 | Ireland,Dublin," 124 | " 125 | Tajikistan,Dushanbe," 126 | " 127 | Tristan da Cunha,Edinburgh of the Seven Seas,"Part of the British Overseas Territory of Saint Helena, Ascension and Tristan da Cunha 128 | " 129 | Sahrawi Arab Democratic Republic [c],El Aioun (declared)Tifariti (de facto),"The Sahrawi Republic, recognised by 82 states, claims the mostly Moroccan controlled Western Sahara. Moroccan controlled territory includes El Aaiún, and Morocco claims all Sahrawi controlled territory. Tifariti is now the temporary capital (instead of Bir Lehlou earlier) of the SADR. Some government and military structures reside in exile in Tindouf, Algeria.[1] See Politics of the Sahrawi Arab Democratic Republic. 130 | " 131 | Akrotiri and Dhekelia,Episkopi Cantonment,"British Overseas Territory 132 | " 133 | Christmas Island,Flying Fish Cove,"External territory of Australia 134 | " 135 | Sierra Leone,Freetown," 136 | " 137 | Tuvalu,Funafuti," 138 | " 139 | Botswana,Gaborone," 140 | " 141 | Cayman Islands,George Town,"British Overseas Territory 142 | " 143 | Ascension Island,Georgetown,"Part of the British Overseas Territory of Saint Helena, Ascension and Tristan da Cunha 144 | " 145 | Guyana,Georgetown," 146 | " 147 | Gibraltar,Gibraltar,"British Overseas Territory 148 | " 149 | Burundi,Gitega,"Bujumbura was the capital from 1962 to 2018 150 | " 151 | Guatemala,Guatemala City," 152 | " 153 | Saint Barthélemy,Gustavia,"Overseas collectivity of France 154 | " 155 | Guam,Hagåtña,"Territory of the United States 156 | " 157 | Bermuda,Hamilton,"British Overseas Territory 158 | " 159 | Easter Island,Hanga Roa,"Special territory of Chile 160 | " 161 | Vietnam,Hanoi," 162 | " 163 | Zimbabwe,Harare," 164 | " 165 | Somaliland,Hargeisa,"Unrecognized and self-declared state, de jure part of Somalia. 
166 | " 167 | Cuba,Havana," 168 | " 169 | Finland,Helsinki," 170 | " 171 | Solomon Islands,Honiara," 172 | " 173 | Pakistan,Islamabad,"Karachi was selected as the first capital of Pakistan and served as such until the capital was shifted to Rawalpindi in 1958. Rawalpindi served as an interim capital for almost a decade until the construction in Islamabad was complete. 174 | " 175 | Indonesia,Jakarta," 176 | " 177 | Saint Helena,Jamestown,"Part of the British Overseas Territory of Saint Helena, Ascension and Tristan da Cunha 178 | " 179 | Syrian opposition,Jarabulus,"See Syrian Civil War. 180 | " 181 | Israel (de facto) Palestine (claimed),Jerusalem (disputed),"The Jerusalem Law states that ""Jerusalem, complete and united, is the capital of Israel"" and the city serves as the seat of the Israeli government and its institutions. United Nations Security Council Resolution 478 declared the Jerusalem Law ""null and void"" and called on member states to withdraw their diplomatic missions from Jerusalem. Most countries currently maintain their embassies in and around Tel Aviv. The United States recognized Jerusalem as the capital and moved its embassy there in May 2018. [2] Many countries officially adhere to the proposal that Jerusalem have international status, as called for in the 1947 Partition Plan.[3] The State of Palestine, a de jure state, claims East Jerusalem as its capital. See Status of Jerusalem and Positions on Jerusalem. 
182 | " 183 | South Sudan,Juba," 184 | " 185 | Afghanistan,Kabul," 186 | " 187 | Uganda,Kampala," 188 | " 189 | Nepal,Kathmandu," 190 | " 191 | Sudan,Khartoum," 192 | " 193 | Ukraine,Kiev," 194 | " 195 | Rwanda,Kigali," 196 | " 197 | South Georgia and the South Sandwich Islands,King Edward Point,"British Overseas Territory 198 | " 199 | Jamaica,Kingston," 200 | " 201 | Norfolk Island,Kingston,"External territory of Australia 202 | " 203 | Saint Vincent and the Grenadines,Kingstown," 204 | " 205 | Democratic Republic of the Congo,Kinshasa," 206 | " 207 | Malaysia,"Kuala Lumpur (official, legislative and royal)Putrajaya (administrative and judicial)"," 208 | " 209 | Kuwait,Kuwait City," 210 | " 211 | Gabon,Libreville," 212 | " 213 | Malawi,Lilongwe," 214 | " 215 | Peru,Lima,"Cusco is declared as the ""Historical Capital"" (Spanish: Capital Historica), a merely symbolic statement, by Article 49 of the Peruvian Constitution. 216 | " 217 | Portugal,Lisbon," 218 | " 219 | Slovenia,Ljubljana," 220 | " 221 | Togo,Lomé," 222 | " 223 | United Kingdom,London,"Prior to the Acts of Union in 1707, London was the capital of England only; Edinburgh was the capital of the Kingdom of Scotland 224 | " 225 | Angola,Luanda," 226 | " 227 | Zambia,Lusaka," 228 | " 229 | Luxembourg,Luxembourg," 230 | " 231 | Spain,Madrid," 232 | " 233 | Marshall Islands,Majuro,"Self-governing in free association with United States. 
234 | " 235 | Equatorial Guinea,Malabo," 236 | " 237 | Maldives,Malé," 238 | " 239 | Nicaragua,Managua," 240 | " 241 | Bahrain,Manama," 242 | " 243 | Philippines,Manila," 244 | " 245 | Mozambique,Maputo," 246 | " 247 | Saint Martin,Marigot,"Overseas collectivity of France 248 | " 249 | Lesotho,Maseru," 250 | " 251 | Wallis and Futuna,Mata-Utu,"Overseas collectivity of France 252 | " 253 | Eswatini (Swaziland),Mbabane (administrative)Lobamba (royal and legislative)," 254 | " 255 | Mexico,Mexico City," 256 | " 257 | Belarus,Minsk," 258 | " 259 | Somalia,Mogadishu," 260 | " 261 | Monaco,Monaco,"City-state 262 | " 263 | Liberia,Monrovia," 264 | " 265 | Uruguay,Montevideo," 266 | " 267 | Comoros,Moroni," 268 | " 269 | Russia,Moscow," 270 | " 271 | Oman,Muscat," 272 | " 273 | Kenya,Nairobi," 274 | " 275 | Bahamas,Nassau," 276 | " 277 | Myanmar,Naypyidaw," 278 | " 279 | Chad,N'Djamena," 280 | " 281 | India,New Delhi,"Kolkata was the capital of India until 1911 during the British Raj 282 | " 283 | Palau,Ngerulmud,"Self-governing in free association with United States. 284 | " 285 | Niger,Niamey," 286 | " 287 | Cyprus,Nicosia," 288 | " 289 | Northern Cyprus,Nicosia,"De facto independent state that is recognised only by Turkey. Northern Cyprus is claimed in whole by the Republic of Cyprus.[4] 290 | " 291 | Mauritania,Nouakchott," 292 | " 293 | New Caledonia,Nouméa,"Sui generis collectivity of France 294 | " 295 | Tonga,Nukuʻalofa," 296 | " 297 | Kazakhstan,Nur-Sultan,"Formerly known as Astana; the name was changed on 20 March 2019. 
298 | " 299 | Greenland,Nuuk,"Self-governing country within the Kingdom of Denmark 300 | " 301 | Aruba,Oranjestad,"Self-governing country within the Kingdom of the Netherlands 302 | " 303 | Norway,Oslo," 304 | " 305 | Canada,Ottawa," 306 | " 307 | Burkina Faso,Ouagadougou," 308 | " 309 | American Samoa,Pago Pago,"Territory of the United States 310 | " 311 | Federated States of Micronesia,Palikir,"Self-governing in free association with United States. 312 | " 313 | Panama,Panama City," 314 | " 315 | French Polynesia,Papeete,"Overseas collectivity of France 316 | " 317 | Suriname,Paramaribo," 318 | " 319 | France,Paris," 320 | " 321 | Sint Maarten,Philipsburg,"Self-governing country within the Kingdom of the Netherlands 322 | " 323 | Cambodia,Phnom Penh," 324 | " 325 | Montserrat,Plymouth (official)Brades Estate (de facto),"British Overseas Territory. Plymouth was abandoned after the eruption of the Soufriere Hills volcano in 1997. Government offices since then have been moved to Brades Estate, which is in the northwestern part of Montserrat. 326 | " 327 | Montenegro,"Podgorica (official)Cetinje (Old Royal Capital, present seat of the President)"," 328 | " 329 | Mauritius,Port Louis," 330 | " 331 | Papua New Guinea,Port Moresby," 332 | " 333 | Vanuatu,Port Vila," 334 | " 335 | Haiti,Port-au-Prince," 336 | " 337 | Trinidad and Tobago,Port of Spain," 338 | " 339 | Benin,Porto-Novo (official)Cotonou (de facto)," 340 | " 341 | Czech Republic,Prague," 342 | " 343 | Cape Verde,Praia," 344 | " 345 | South Africa,Pretoria (executive)Bloemfontein (judicial)Cape Town (legislative)," 346 | " 347 | Kosovo[g],Pristina,"De facto independent state that is recognised by 112 UN member states and by Taiwan. Claimed in whole by the Republic of Serbia as part of its Autonomous Province of Kosovo and Metohija. Republic of Kosovo has de facto control over most of the territory, with limited control in North Kosovo. 
348 | " 349 | North Korea,Pyongyang," 350 | " 351 | Ecuador,Quito,"Highest official capital (2,850 m).[5] 352 | " 353 | Morocco,Rabat," 354 | " 355 | Palestine,Ramallah,"De facto capital 356 | " 357 | Iceland,Reykjavík," 358 | " 359 | Latvia,Riga," 360 | " 361 | Saudi Arabia,Riyadh," 362 | " 363 | British Virgin Islands,Road Town,"British Overseas Territory 364 | " 365 | Italy,Rome," 366 | " 367 | Dominica,Roseau," 368 | " 369 | Northern Mariana Islands,Saipan,"Territory of the United States 370 | " 371 | Costa Rica,San José," 372 | " 373 | Puerto Rico,San Juan,"Territory of the United States 374 | " 375 | San Marino,San Marino," 376 | " 377 | El Salvador,San Salvador," 378 | " 379 | Yemen,"Sana'a (de jure)Aden (de facto, temporary)","Sana'a has been occupied by Houthis rebels since February 2015. See also: Yemeni Civil War (2015–present). 380 | " 381 | Chile,Santiago (official)Valparaíso (legislative),"The National Congress of Chile is located in Valparaíso 382 | " 383 | Dominican Republic,Santo Domingo," 384 | " 385 | São Tomé and Príncipe,São Tomé," 386 | " 387 | Bosnia and Herzegovina,Sarajevo," 388 | " 389 | South Korea,Seoul," 390 | " 391 | Singapore,Singapore,"City-state 392 | " 393 | North Macedonia,Skopje," 394 | " 395 | Bulgaria,Sofia," 396 | " 397 | Sri Lanka,"Sri Jayawardenepura Kotte (official)Colombo (executive, judicial)","Also known as ""Kotte"". Until the 1980s, the capital was Colombo, where many important governmental institutions still remain and which is still designated as the commercial capital of Sri Lanka. 398 | " 399 | Grenada,St. George's," 400 | " 401 | Jersey,St. Helier,"British Crown Dependency 402 | " 403 | Antigua and Barbuda,St. John's," 404 | " 405 | Guernsey,St. Peter Port,"British Crown Dependency 406 | " 407 | Saint Pierre and Miquelon,St. 
Pierre,"Overseas collectivity of France 408 | " 409 | Falkland Islands,Stanley,"British Overseas Territory 410 | " 411 | Artsakh,Stepanakert,"The self-declared country remains diplomatically unrecognised by UN-member states, including Armenia. Transnistria, South Ossetia, and Abkhazia, all UN non-member states, recognise the state. Claimed in whole by Azerbaijan. 412 | " 413 | Sweden,Stockholm," 414 | " 415 | Bolivia,Sucre (constitutional)La Paz (administrative),"La Paz is the highest administrative capital (3,650 m), higher than Quito.[5] 416 | " 417 | Abkhazia,Sukhumi,"De facto independent state recognised by Russia, Nauru, Nicaragua, Venezuela, South Ossetia and Transnistria. Claimed in whole by Republic of Georgia as the Autonomous Republic of Abkhazia. 418 | " 419 | Fiji,Suva," 420 | " 421 | Taiwan,Taipei,"Officially the Republic of China (ROC), it has been competing for recognition with the People's Republic of China (PRC) as the sole Chinese government since 1949. Taiwan controls the island of Taiwan and its associated islands, Quemoy, Matsu, the Pratas, and part of the Spratly Islands[Note 1] The territory of Taiwan is claimed in whole by the People's Republic of China.[Note 2] The Republic of China participates in the World Health Organization and a number of non-UN international organizations such as the World Trade Organization, International Olympic Committee and others under a variety of pseudonyms, most commonly Chinese Taipei. 422 | " 423 | Estonia,Tallinn," 424 | " 425 | Kiribati,Tarawa," 426 | " 427 | Uzbekistan,Tashkent," 428 | " 429 | Georgia,Tbilisi (official)Kutaisi (legislative)," 430 | " 431 | Honduras,Tegucigalpa," 432 | " 433 | Iran,Tehran," 434 | " 435 | Bhutan,Thimphu," 436 | " 437 | Albania,Tirana," 438 | " 439 | Transnistria,Tiraspol,"De facto independent state, not recognized by any UN-member, but by Abkhazia and South Ossetia. Claimed in whole by the Republic of Moldova as the Territorial Unit of Transnistria. 
440 | " 441 | Japan,Tokyo," 442 | " 443 | Faroe Islands,Tórshavn,"Self-governing country within the Kingdom of Denmark 444 | " 445 | Libya,Tripoli," 446 | " 447 | South Ossetia,Tskhinvali,"De facto independent state recognised by Russia, Nicaragua, Nauru, Venezuela, Abkhazia and Transnistria. Claimed in whole by the Republic of Georgia as the Provisional Administrative Entity of South Ossetia. 448 | " 449 | Tunisia,Tunis," 450 | " 451 | Mongolia,Ulaanbaatar," 452 | " 453 | Liechtenstein,Vaduz," 454 | " 455 | Malta,Valletta," 456 | " 457 | Anguilla,The Valley,"British Overseas Territory 458 | " 459 | Vatican City,Vatican City,"City-state 460 | " 461 | Seychelles,Victoria," 462 | " 463 | Austria,Vienna," 464 | " 465 | Laos,Vientiane," 466 | " 467 | Lithuania,Vilnius," 468 | " 469 | Poland,Warsaw," 470 | " 471 | United States,Washington," 472 | " 473 | New Zealand,Wellington,"See also: Capital of New Zealand 474 | " 475 | Cocos (Keeling) Islands,West Island,"External territory of Australia 476 | " 477 | Curaçao,Willemstad,"Self-governing country within the Kingdom of the Netherlands 478 | " 479 | Namibia,Windhoek," 480 | " 481 | Ivory Coast,Yamoussoukro (official)Abidjan (former capital; still has many government offices)," 482 | " 483 | Cameroon,Yaoundé," 484 | " 485 | Nauru,Yaren (de facto),"Country does not have an official capital; however, the government offices are in Yaren. 
486 | " 487 | Armenia,Yerevan," 488 | " 489 | Croatia,Zagreb," 490 | " 491 | -------------------------------------------------------------------------------- /Beautiful Soup/bs-scraping.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import requests 3 | import pandas as pd 4 | 5 | # Add header and url 6 | headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36'} 7 | # url = "https://en.wikipedia.org/wiki/List_of_national_capitals" 8 | url = "https://www.amazon.com/" 9 | r = requests.get(url) 10 | 11 | # Initiate beautiful and list element to extract all the rows in the table 12 | soup = BeautifulSoup(r.content, "html.parser") 13 | table = soup.find_all('table')[1] 14 | print("table is ", table) 15 | rows = table.find_all('tr') 16 | row_list = list() 17 | 18 | # Iterate through all of the rows in table and get through each of the cell to append it into rows and row_list 19 | for tr in rows: 20 | td = tr.find_all('td') 21 | row = [i.text for i in td] 22 | row_list.append(row) 23 | print("Row is ", row) 24 | 25 | # Create Pandas Dataframe and print it 26 | df_bs = pd.DataFrame(row_list,columns=['City','Country','Notes']) 27 | df_bs.set_index('Country',inplace=True) 28 | print(df_bs.head()) 29 | 30 | # Exporting the data into csv 31 | df_bs.to_csv('beautifulsoup.csv') 32 | 33 | -------------------------------------------------------------------------------- /DBM/read_sqlite.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import sqlite3 3 | 4 | 5 | conn = sqlite3.connect("amazon.db") 6 | 7 | def read_sql(): 8 | df = pd.read_sql_query("select * from amazon_product;", conn) 9 | print(df) 10 | return df 11 | 12 | def write_sql(id,link,product_title,product_price,category): 13 | c = conn.cursor() 14 | c.execute("INSERT INTO amazon_product VALUES " 15 | 
"(CURRENT_TIMESTAMP,id, link,product_title,product_price,category)") 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /DBM/sqlite.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | 3 | sqlite_file = 'amazon.db' # name of the sqlite database file 4 | 5 | # Connecting to the database file 6 | conn = sqlite3.connect(sqlite_file) 7 | c = conn.cursor() 8 | 9 | column_names=['id','link','product_title','product_price','category'] 10 | 11 | # Creating the new SQLite table with 5 column 12 | c.execute(''' 13 | CREATE TABLE amazon_product ( 14 | time date_time , 15 | id INTEGER , 16 | link TEXT NOT NULL, 17 | product_title TEXT NOT NULL, 18 | product_price DOUBLE NOT NULL, 19 | category TEXT NOT NULL, 20 | PRIMARY KEY (time, id) 21 | ); 22 | ''') 23 | 24 | # c.execute('ALTER TABLE amazon_product ADD CONSTRAINT PK_PROD PRIMARY KEY (time, id);') 25 | 26 | try: 27 | c.execute("INSERT INTO amazon_product VALUES (CURRENT_TIMESTAMP,1, 'www.test1.com','product_test_1','23.3','testing')") 28 | c.execute("INSERT INTO amazon_product VALUES (CURRENT_TIMESTAMP,2, 'www.test2.com','product_test_2','43.3','testing')") 29 | except sqlite3.IntegrityError: 30 | print('ERROR: ID already exists in PRIMARY KEY column ') 31 | 32 | 33 | # Committing changes and closing the connection to the database file 34 | conn.commit() 35 | conn.close() -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Web Scraping 2 | Web scraping is a very powerful tool to learn for any data professional. With web scraping the entire internet becomes your database. 
This repository shows how to parse a web page into a data file (CSV) using a Python package called BeautifulSoup. There are two ways to extract data from a website: 3 | 4 | ## Use the API of the website (best way) The data on websites is unstructured, whereas an API, when one exists, returns it in a structured form. 5 | Sadly, not all websites provide an API. 6 | 7 | ## Web Scraping: Web scraping is an automated method used to extract useful information from websites; it focuses on the transformation of unstructured data (HTML format) on the web into structured data. 8 | **STEPS**: To extract data using web scraping with Python, you need to follow these basic steps: 9 | * Find the URL that you want to scrape 10 | * Check whether it is legal to scrape that website: go to www.URL/robots.txt. If you are using Scrapy, you do not need to worry, because it automatically follows only the allowed links when ROBOTSTXT_OBEY=True is set in settings.py. 11 | * Inspect the website 12 | * Find the data you want to extract 13 | * Write the code 14 | * Run the code and extract the data 15 | * Store the data in the required format 16 | 17 | ## Need of Web Scraping 18 | * **Price Comparison**: Services such as ParseHub use web scraping to collect data from online shopping websites and use it to compare the prices of products. 19 | * **Email address gathering**: Many companies that use email as a medium for marketing use web scraping to collect email IDs and then send bulk emails. 20 | * **Travel recommendation**: Scrape a few travel recommendation sites, pull out comments about various things to do, and see which property is getting a lot of positive responses from the users! The list of use cases is endless. 21 | * **Social Media Scraping**: Web scraping is used to collect data from social media websites such as Twitter to find out what’s trending. 22 | * **Research and Development**: Web scraping is used to collect large sets of data (statistics, general information, temperature, etc.) from websites, which are analyzed and used to carry out surveys or for R&D. 
23 | * **Job listings**: Details regarding job openings, interviews are collected from different websites and then listed in one place so that it is easily accessible to the user. 24 | 25 | ## Is Web Scraping legal? 26 | To know whether a website allows web scraping or not, you can look at the website’s “robots.txt” file. You can find this file by appending “/robots.txt” to the URL that you want to scrape. 27 | -------------------------------------------------------------------------------- /Selenium Web Scraping/amazon/amazon.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NinjaDevOps0831/Web-Scraping/fac364ec106433d00c9dfd5c1b86b9ac386990d9/Selenium Web Scraping/amazon/amazon.db -------------------------------------------------------------------------------- /Selenium Web Scraping/amazon/amazon_alchemy.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NinjaDevOps0831/Web-Scraping/fac364ec106433d00c9dfd5c1b86b9ac386990d9/Selenium Web Scraping/amazon/amazon_alchemy.db -------------------------------------------------------------------------------- /Selenium Web Scraping/amazon/chromedriver.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NinjaDevOps0831/Web-Scraping/fac364ec106433d00c9dfd5c1b86b9ac386990d9/Selenium Web Scraping/amazon/chromedriver.exe -------------------------------------------------------------------------------- /Selenium Web Scraping/amazon/dbm.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import sqlite3 3 | from sqlalchemy import create_engine 4 | 5 | conn = sqlite3.connect("amazon.db") 6 | disk_engine = create_engine('sqlite:///amazon_alchemy.db') 7 | c = conn.cursor() 8 | 9 | def read(): 10 | df = pd.read_sql_query("select * from amazon_product;", conn) 11 | print(df) 
12 | return df 13 | 14 | def write_values(id,link,product_title,product_price,category): 15 | c.execute("INSERT INTO amazon_product VALUES " 16 | "(CURRENT_TIMESTAMP,id, link,product_title,product_price,category)") 17 | 18 | def write_from_df_with_sqlite3(df): 19 | for index, row in df.iterrows(): 20 | c.execute( 21 | ''' 22 | INSERT INTO amazon_product VALUES 23 | (CURRENT_TIMESTAMP,?,?,?,?,?) 24 | ''', 25 | (row['id'], row['link'],row['product_title'],row['product_price'], 26 | row['category']) 27 | ) 28 | 29 | def write_from_df_with_alchemy(df): 30 | df.to_sql('amazon_product', disk_engine, if_exists='append') 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /Selenium Web Scraping/amazon/product_info_amazon.csv: -------------------------------------------------------------------------------- 1 | id,link,product_title,product_price,category 2 | 0,https://www.amazon.com/Schick-Hydrate-Refill-Blades-Refills/dp/B00I1F2I3I?ref_=Oct_TopRatedC_13271080011_3&pf_rd_p=c85fdb71-727d-58f5-9209-98379c35a68f&pf_rd_s=merchandised-search-6&pf_rd_t=101&pf_rd_i=13271080011&pf_rd_m=ATVPDKIKX0DER&pf_rd_r=GMTB1EXXMWY98TYQ6G8H&pf_rd_r=GMTB1EXXMWY98TYQ6G8H&pf_rd_p=c85fdb71-727d-58f5-9209-98379c35a68f,"Schick Hydro Sense Hydrate Mens Razor Blade Refill With Skin Guards, Includes 12 Razor Blades Refills",$22.49,Beauty & Personal Care> Shave & Hair Removal> Men's> Razors & Blades> Cartridges & Refills 3 | 1,https://www.amazon.com/Gillette-Fusion-ProShield-Refills-Razors/dp/B0168MB6SS?ref_=Oct_DLandingS_PC_7e8aa158_2&smid=ATVPDKIKX0DER&th=1,"Gillette Fusion ProShield Chill Men's Razor Blade Refills, 8 Refills, Mens Razors / Blades",$27.31,Beauty & Personal Care> Shave & Hair Removal> Men's> Razors & Blades> Cartridges & Refills 4 | 
"""Scrape title, price and category for a fixed list of Amazon product pages.

The results are collected into a DataFrame (index named ``id``) and appended
to the SQLite database through the project-local ``dbm`` module.
"""
from random import shuffle
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

import dbm  # NOTE: the dbm.py next to this script, not the stdlib module


# Seconds to wait for the product title to become visible on each page.
PAGE_LOAD_TIMEOUT = 30

# Price locations tried in order; the first one present on the page wins.
PRICE_XPATHS = (
    '//*[@id="priceblock_snsprice_Based"]/span',
    '//*[@id="priceblock_ourprice"]',
)

COLUMNS = ['link', 'product_title', 'product_price', 'category']

link_list = [
    'https://www.amazon.com/Gillette-Fusion-ProShield-Refills-Razors/dp/B0168MB6SS?ref_=Oct_DLandingS_PC_7e8aa158_2&smid=ATVPDKIKX0DER&th=1',
    'https://www.amazon.com/Gillette-Mach3-Razor-Blades-Refills/dp/B0039LMTBA?ref_=Oct_BSellerC_13271080011_1&pf_rd_p=c85fdb71-727d-58f5-9209-98379c35a68f&pf_rd_s=merchandised-search-6&pf_rd_t=101&pf_rd_i=13271080011&pf_rd_m=ATVPDKIKX0DER&pf_rd_r=GMTB1EXXMWY98TYQ6G8H&th=1',
    'https://www.amazon.com/Gillette-Mach3-Handle-Refills-Packaging/dp/B06X9V77XY?ref_=Oct_RAsinC_Ajax_13271080011_2&pf_rd_r=GMTB1EXXMWY98TYQ6G8H&pf_rd_p=c85fdb71-727d-58f5-9209-98379c35a68f&pf_rd_s=merchandised-search-6&pf_rd_t=101&pf_rd_i=13271080011&pf_rd_m=ATVPDKIKX0DER',
    'https://www.amazon.com/Made-Shaving-Razor-Blades-12-Count/dp/B07N7SFZ9S?ref_=Oct_TopRatedC_13271080011_0&pf_rd_p=c85fdb71-727d-58f5-9209-98379c35a68f&pf_rd_s=merchandised-search-6&pf_rd_t=101&pf_rd_i=13271080011&pf_rd_m=ATVPDKIKX0DER&pf_rd_r=GMTB1EXXMWY98TYQ6G8H&th=1',
    'https://www.amazon.com/Schick-Hydrate-Refill-Blades-Refills/dp/B00I1F2I3I?ref_=Oct_TopRatedC_13271080011_3&pf_rd_p=c85fdb71-727d-58f5-9209-98379c35a68f&pf_rd_s=merchandised-search-6&pf_rd_t=101&pf_rd_i=13271080011&pf_rd_m=ATVPDKIKX0DER&pf_rd_r=GMTB1EXXMWY98TYQ6G8H&pf_rd_r=GMTB1EXXMWY98TYQ6G8H&pf_rd_p=c85fdb71-727d-58f5-9209-98379c35a68f',
]


def create_driver():
    """Start Chrome in incognito mode."""
    option = webdriver.ChromeOptions()
    # The switch needs two ASCII hyphens and no leading space; the previous
    # ' — incognito' (space + em dash) was not a recognised Chrome switch.
    option.add_argument('--incognito')
    return webdriver.Chrome(executable_path='chromedriver', chrome_options=option)


def find_price(driver):
    """Return the price text from the first matching locator, or ``None``."""
    for xpath in PRICE_XPATHS:
        try:
            return driver.find_element(By.XPATH, xpath).text
        except NoSuchElementException:
            continue
    return None


def scrape_product(driver, link):
    """Open ``link`` and return its scraped fields as a dict.

    Returns ``None`` when the page does not show its title within
    ``PAGE_LOAD_TIMEOUT`` seconds, so the caller can skip the link.  The
    previous code quit the driver on timeout and then kept using it.
    """
    driver.get(link)
    try:
        WebDriverWait(driver, PAGE_LOAD_TIMEOUT).until(
            EC.visibility_of_element_located((By.ID, "productTitle"))
        )
    except TimeoutException:
        print('Timed out waiting for page to load')
        return None

    product_title = driver.find_element(By.ID, 'productTitle').text
    print("product title ", product_title)

    product_price = find_price(driver)
    print("product price ", product_price)

    # The breadcrumb links, joined with '> ', form the category path.
    breadcrumb_container = driver.find_element(
        By.XPATH, '//*[@id="wayfinding-breadcrumbs_container"]'
    )
    category = '> '.join(
        element.text
        for element in breadcrumb_container.find_elements(By.CLASS_NAME, 'a-link-normal')
    )
    print("category ", category)

    return {
        'link': link,
        'product_title': product_title,
        'product_price': product_price,
        'category': category,
    }


def main():
    """Scrape every link, report the elapsed time and store the results."""
    # Visit the pages in random order to look less like a fixed crawl.
    links = list(link_list)
    shuffle(links)

    start = time.time()
    driver = create_driver()
    try:
        scraped = [scrape_product(driver, link) for link in links]
    finally:
        # Always close the browser, also when a page raises.
        driver.quit()

    # Keep each link next to the values scraped from it; skipped pages are
    # dropped so every column has the same length.
    records = [record for record in scraped if record is not None]
    df_product = pd.DataFrame(records, columns=COLUMNS)
    df_product.index.name = 'id'
    print(df_product.head())

    end = time.time()
    print("For {} links, the time taken is {}".format(len(links), end - start))

    # Alternatives kept in dbm / pandas: df_product.to_csv(...) or
    # dbm.write_from_df_with_sqlite3(df_product).
    dbm.write_from_df_with_alchemy(df_product)


if __name__ == '__main__':
    main()
–––––––––––––––––––––––––––––––––––––––––––––––––– */ 37 | .container { 38 | position: relative; 39 | width: 100%; 40 | max-width: 960px; 41 | margin: 0 auto; 42 | padding: 0 20px; 43 | box-sizing: border-box; } 44 | .column, 45 | .columns { 46 | width: 100%; 47 | float: left; 48 | box-sizing: border-box; } 49 | 50 | /* For devices larger than 400px */ 51 | @media (min-width: 400px) { 52 | .container { 53 | width: 85%; 54 | padding: 0; } 55 | } 56 | 57 | /* For devices larger than 550px */ 58 | @media (min-width: 550px) { 59 | .container { 60 | width: 80%; } 61 | .column, 62 | .columns { 63 | margin-left: 4%; } 64 | .column:first-child, 65 | .columns:first-child { 66 | margin-left: 0; } 67 | 68 | .one.column, 69 | .one.columns { width: 4.66666666667%; } 70 | .two.columns { width: 13.3333333333%; } 71 | .three.columns { width: 22%; } 72 | .four.columns { width: 30.6666666667%; } 73 | .five.columns { width: 39.3333333333%; } 74 | .six.columns { width: 48%; } 75 | .seven.columns { width: 56.6666666667%; } 76 | .eight.columns { width: 65.3333333333%; } 77 | .nine.columns { width: 74.0%; } 78 | .ten.columns { width: 82.6666666667%; } 79 | .eleven.columns { width: 91.3333333333%; } 80 | .twelve.columns { width: 100%; margin-left: 0; } 81 | 82 | .one-third.column { width: 30.6666666667%; } 83 | .two-thirds.column { width: 65.3333333333%; } 84 | 85 | .one-half.column { width: 48%; } 86 | 87 | /* Offsets */ 88 | .offset-by-one.column, 89 | .offset-by-one.columns { margin-left: 8.66666666667%; } 90 | .offset-by-two.column, 91 | .offset-by-two.columns { margin-left: 17.3333333333%; } 92 | .offset-by-three.column, 93 | .offset-by-three.columns { margin-left: 26%; } 94 | .offset-by-four.column, 95 | .offset-by-four.columns { margin-left: 34.6666666667%; } 96 | .offset-by-five.column, 97 | .offset-by-five.columns { margin-left: 43.3333333333%; } 98 | .offset-by-six.column, 99 | .offset-by-six.columns { margin-left: 52%; } 100 | .offset-by-seven.column, 101 | 
.offset-by-seven.columns { margin-left: 60.6666666667%; } 102 | .offset-by-eight.column, 103 | .offset-by-eight.columns { margin-left: 69.3333333333%; } 104 | .offset-by-nine.column, 105 | .offset-by-nine.columns { margin-left: 78.0%; } 106 | .offset-by-ten.column, 107 | .offset-by-ten.columns { margin-left: 86.6666666667%; } 108 | .offset-by-eleven.column, 109 | .offset-by-eleven.columns { margin-left: 95.3333333333%; } 110 | 111 | .offset-by-one-third.column, 112 | .offset-by-one-third.columns { margin-left: 34.6666666667%; } 113 | .offset-by-two-thirds.column, 114 | .offset-by-two-thirds.columns { margin-left: 69.3333333333%; } 115 | 116 | .offset-by-one-half.column, 117 | .offset-by-one-half.columns { margin-left: 52%; } 118 | 119 | } 120 | 121 | 122 | /* Base Styles 123 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 124 | /* NOTE 125 | html is set to 62.5% so that all the REM measurements throughout Skeleton 126 | are based on 10px sizing. So basically 1.5rem = 15px :) */ 127 | html { 128 | font-size: 62.5%; } 129 | body { 130 | font-size: 1.5em; /* currently ems cause chrome bug misinterpreting rems on body element */ 131 | line-height: 1.6; 132 | font-weight: 400; 133 | font-family: "Open Sans", "HelveticaNeue", "Helvetica Neue", Helvetica, Arial, sans-serif; 134 | color: rgb(50, 50, 50); } 135 | 136 | 137 | /* Typography 138 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 139 | h1, h2, h3, h4, h5, h6 { 140 | margin-top: 10; 141 | margin-bottom: 10; 142 | font-weight: 300; } 143 | h1 { font-size: 4.5rem; line-height: 1.2; letter-spacing: -.1rem; margin-bottom: 3rem; } 144 | h2 { font-size: 3.6rem; line-height: 1.25; letter-spacing: -.1rem; margin-bottom: 3rem; margin-top: 3rem;} 145 | h3 { font-size: 3.0rem; line-height: 1.3; letter-spacing: -.1rem; margin-bottom: 3rem; margin-top: 3rem;} 146 | h4 { font-size: 2.6rem; line-height: 1.35; letter-spacing: -.08rem; margin-bottom: 3rem; margin-top: 3rem;} 147 | h5 { font-size: 2.2rem; 
line-height: 1.5; letter-spacing: -.05rem; margin-bottom: 3rem; margin-top: 3rem;} 148 | h6 { font-size: 2.0rem; line-height: 1.6; letter-spacing: 0; margin-bottom: 3rem; margin-top: 3rem;} 149 | 150 | p { 151 | margin-top: 10; } 152 | 153 | 154 | /* Blockquotes 155 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 156 | blockquote { 157 | border-left: 4px lightgrey solid; 158 | padding-left: 1rem; 159 | margin-top: 2rem; 160 | margin-bottom: 2rem; 161 | margin-left: 0rem; 162 | } 163 | 164 | 165 | /* Links 166 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 167 | a { 168 | color: #1EAEDB; 169 | text-decoration: underline; 170 | cursor: pointer;} 171 | a:hover { 172 | color: #0FA0CE; } 173 | 174 | 175 | /* Buttons 176 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 177 | .button, 178 | button, 179 | input[type="submit"], 180 | input[type="reset"], 181 | input[type="button"] { 182 | display: inline-block; 183 | height: 38px; 184 | padding: 0 30px; 185 | color: #555; 186 | text-align: center; 187 | font-size: 11px; 188 | font-weight: 600; 189 | line-height: 38px; 190 | letter-spacing: .1rem; 191 | text-transform: uppercase; 192 | text-decoration: none; 193 | white-space: nowrap; 194 | background-color: transparent; 195 | border-radius: 4px; 196 | border: 1px solid #bbb; 197 | cursor: pointer; 198 | box-sizing: border-box; } 199 | .button:hover, 200 | button:hover, 201 | input[type="submit"]:hover, 202 | input[type="reset"]:hover, 203 | input[type="button"]:hover, 204 | .button:focus, 205 | button:focus, 206 | input[type="submit"]:focus, 207 | input[type="reset"]:focus, 208 | input[type="button"]:focus { 209 | color: #333; 210 | border-color: #888; 211 | outline: 0; } 212 | .button.button-primary, 213 | button.button-primary, 214 | input[type="submit"].button-primary, 215 | input[type="reset"].button-primary, 216 | input[type="button"].button-primary { 217 | color: #FFF; 218 | background-color: #33C3F0; 219 | border-color: #33C3F0; } 220 | 
.button.button-primary:hover, 221 | button.button-primary:hover, 222 | input[type="submit"].button-primary:hover, 223 | input[type="reset"].button-primary:hover, 224 | input[type="button"].button-primary:hover, 225 | .button.button-primary:focus, 226 | button.button-primary:focus, 227 | input[type="submit"].button-primary:focus, 228 | input[type="reset"].button-primary:focus, 229 | input[type="button"].button-primary:focus { 230 | color: #FFF; 231 | background-color: #1EAEDB; 232 | border-color: #1EAEDB; } 233 | 234 | 235 | /* Forms 236 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 237 | input[type="email"], 238 | input[type="number"], 239 | input[type="search"], 240 | input[type="text"], 241 | input[type="tel"], 242 | input[type="url"], 243 | input[type="password"], 244 | textarea, 245 | select { 246 | height: 38px; 247 | padding: 6px 10px; /* The 6px vertically centers text on FF, ignored by Webkit */ 248 | background-color: #fff; 249 | border: 1px solid #D1D1D1; 250 | border-radius: 4px; 251 | box-shadow: none; 252 | box-sizing: border-box; 253 | font-family: inherit; 254 | font-size: inherit; /*https://stackoverflow.com/questions/6080413/why-doesnt-input-inherit-the-font-from-body*/} 255 | /* Removes awkward default styles on some inputs for iOS */ 256 | input[type="email"], 257 | input[type="number"], 258 | input[type="search"], 259 | input[type="text"], 260 | input[type="tel"], 261 | input[type="url"], 262 | input[type="password"], 263 | textarea { 264 | -webkit-appearance: none; 265 | -moz-appearance: none; 266 | appearance: none; } 267 | textarea { 268 | min-height: 65px; 269 | padding-top: 6px; 270 | padding-bottom: 6px; } 271 | input[type="email"]:focus, 272 | input[type="number"]:focus, 273 | input[type="search"]:focus, 274 | input[type="text"]:focus, 275 | input[type="tel"]:focus, 276 | input[type="url"]:focus, 277 | input[type="password"]:focus, 278 | textarea:focus, 279 | select:focus { 280 | border: 1px solid #33C3F0; 281 | outline: 0; } 
282 | label, 283 | legend { 284 | display: block; 285 | margin-bottom: 0px; } 286 | fieldset { 287 | padding: 0; 288 | border-width: 0; } 289 | input[type="checkbox"], 290 | input[type="radio"] { 291 | display: inline; } 292 | label > .label-body { 293 | display: inline-block; 294 | margin-left: .5rem; 295 | font-weight: normal; } 296 | 297 | 298 | /* Lists 299 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 300 | ul { 301 | list-style: circle inside; } 302 | ol { 303 | list-style: decimal inside; } 304 | ol, ul { 305 | padding-left: 0; 306 | margin-top: 0; } 307 | ul ul, 308 | ul ol, 309 | ol ol, 310 | ol ul { 311 | margin: 1.5rem 0 1.5rem 3rem; 312 | font-size: 90%; } 313 | li { 314 | margin-bottom: 1rem; } 315 | 316 | 317 | /* Tables 318 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 319 | table { 320 | border-collapse: collapse; 321 | } 322 | th, 323 | td { 324 | padding: 12px 15px; 325 | text-align: left; 326 | border-bottom: 1px solid #E1E1E1; } 327 | th:first-child, 328 | td:first-child { 329 | padding-left: 0; } 330 | th:last-child, 331 | td:last-child { 332 | padding-right: 0; } 333 | 334 | 335 | /* Spacing 336 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 337 | button, 338 | .button { 339 | margin-bottom: 0rem; } 340 | input, 341 | textarea, 342 | select, 343 | fieldset { 344 | margin-bottom: 0rem; } 345 | pre, 346 | dl, 347 | figure, 348 | table, 349 | form { 350 | margin-top: 3rem; 351 | margin-bottom: 3rem; } 352 | p, 353 | ul, 354 | ol { 355 | margin-bottom: 0.75rem; } 356 | 357 | /* Utilities 358 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 359 | .u-full-width { 360 | width: 100%; 361 | box-sizing: border-box; } 362 | .u-max-full-width { 363 | max-width: 100%; 364 | box-sizing: border-box; } 365 | .u-pull-right { 366 | float: right; } 367 | .u-pull-left { 368 | float: left; } 369 | 370 | 371 | /* Misc 372 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 373 | hr { 374 | margin-top: 3rem; 375 | 
"""Dash dashboard charting scraped product prices and rating counts.

Data comes from ``dbm.read()``; the frame is expected to carry the columns
used below (``index``, ``product_title``, ``product_price``, ``rating_count``
and ``datetime``).
"""
import re

import dash
from dash.dependencies import Input, Output
import dash_core_components as dcc
import dash_html_components as html
import pandas as pd
import plotly.graph_objs as go

import dbm  # NOTE: the dbm.py next to this script, not the stdlib module
import smtp_alert


# Set up the app
app = dash.Dash(__name__)
server = app.server

# "(41)" -> "41"
_RATING_COUNT_RE = re.compile(r'\((\d+)\)')
# First decimal number inside a price such as "$1,234.50".
_PRICE_RE = re.compile(r'-?\d+(?:\.\d+)?')


def create_dict_list_of_product():
    """Return dropdown options, one per distinct product title."""
    return [
        {'value': product_title, 'label': product_title}
        for product_title in product_df.product_title.unique()
    ]


def dict_product_list(dict_list):
    """Return the ``'value'`` entry of every option dict in ``dict_list``."""
    return [option.get('value') for option in dict_list]


product_df = dbm.read()
dict_products = create_dict_list_of_product()


def _filter_selected(selected_dropdown_value):
    """Return the rows of ``product_df`` whose title is selected.

    A cleared multi-select can arrive as ``None``; treat it as "nothing
    selected" instead of letting ``isin`` raise.
    """
    selected = selected_dropdown_value or []
    return product_df[product_df['product_title'].isin(selected)]


def _sort_by_datetime(frame, ascending=True, leading=()):
    """Sort ``frame`` chronologically, optionally after ``leading`` columns.

    ``datetime`` is stored as an ``mm/dd/YYYY`` string, which does not sort
    chronologically as text, so it is parsed before sorting.
    """
    when = pd.to_datetime(frame['datetime'], errors='coerce')
    return (
        frame.assign(_when=when)
        .sort_values([*leading, '_when'], ascending=ascending)
        .drop(columns='_when')
    )


def _parse_price(price):
    """Return ``price`` as a float, or ``None`` when it holds no number."""
    if isinstance(price, (int, float)):
        return float(price)
    match = _PRICE_RE.search(str(price).replace(',', ''))
    return float(match.group()) if match else None


app.layout = html.Div([
    html.Div([
        html.H1('Price Optimization Dashboard'),
        html.H2('Choose a product name'),
        dcc.Dropdown(
            id='product-dropdown',
            options=dict_products,
            multi=True,
            value=["Ben & Jerry's Wake and No Bake Cookie Dough Core Ice Cream", "Brewdog Punk IPA"]
        ),
        dcc.Graph(
            id='product-like-bar'
        )
    ], style={'width': '40%', 'display': 'inline-block'}),
    html.Div([
        html.H2('All product info'),
        html.Table(id='my-table'),
        html.P(''),
    ], style={'width': '55%', 'float': 'right', 'display': 'inline-block'}),
    html.Div([
        html.H2('price graph'),
        dcc.Graph(id='product-trend-graph'),
        html.P('')
    ], style={'width': '100%', 'display': 'inline-block'}),
    html.Div(id='hidden-email-alert', style={'display': 'none'})
])


# Renamed from a second ``update_graph``: the two callbacks shared one name,
# so this one was shadowed at module level.
@app.callback(Output('product-like-bar', 'figure'), [Input('product-dropdown', 'value')])
def update_rating_bar(selected_dropdown_value):
    """Build the horizontal bar chart of the latest rating count per product."""
    product_df_filter = _filter_selected(selected_dropdown_value)

    # Newest row first, then keep one row per 'index' value.
    latest = _sort_by_datetime(product_df_filter, ascending=False)
    latest = latest.drop_duplicates(['index'])

    def format_rating(rating):
        return _RATING_COUNT_RE.sub(r'\1', rating)

    # Work on a Series instead of assigning into a filtered slice.
    rating_count = latest['rating_count'].apply(format_rating)

    figure = {
        'data': [go.Bar(
            y=latest.product_title,
            x=rating_count,
            orientation='h'
        )],
        'layout': go.Layout(
            title='Product Rating Trends',
            yaxis=dict(
                automargin=True
            )
        )
    }
    return figure


@app.callback(Output('product-trend-graph', 'figure'), [Input('product-dropdown', 'value')])
def update_graph(selected_dropdown_value):
    """Build the price-over-time figure for the selected products."""
    product_df_filter = _filter_selected(selected_dropdown_value)

    data = timeline_top_product_filtered(product_df_filter, selected_dropdown_value)
    layout = dict(
        title='Product Price Trends',
        xaxis=dict(title='datetime'),
        yaxis=dict(title='Price'),
    )
    return dict(data=data, layout=layout)


def timeline_top_product_filtered(top_product_filtered_df, selected_dropdown_value):
    """Return one ``go.Scatter`` trace per selected product title.

    Prices and timestamps are stored as text; they are parsed so the axes are
    numeric / chronological rather than categorical.
    """
    trace_list = []
    for value in selected_dropdown_value or []:
        rows = top_product_filtered_df[top_product_filtered_df['product_title'] == value]
        rows = rows.assign(
            _when=pd.to_datetime(rows['datetime'], errors='coerce'),
            _price=rows['product_price'].map(_parse_price),
        ).sort_values('_when')
        trace_list.append(go.Scatter(
            y=rows['_price'],
            x=rows['_when'],
            name=value
        ))
    return trace_list


@app.callback(Output('my-table', 'children'), [Input('product-dropdown', 'value')])
def generate_table(selected_dropdown_value, max_rows=20):
    """Return header and body rows (at most ``max_rows``) for the info table."""
    product_df_filter = _sort_by_datetime(
        _filter_selected(selected_dropdown_value), leading=('index',)
    )
    columns = list(product_df_filter.columns)

    header = [html.Tr([html.Th(col) for col in columns])]
    body = [
        html.Tr([html.Td(row[col]) for col in columns])
        for _, row in product_df_filter.head(max_rows).iterrows()
    ]
    return header + body


# NOTE(review): the output used to target the div's 'id' property, which this
# callback then set to None; 'children' keeps the component addressable.
@app.callback(Output('hidden-email-alert', 'children'), [Input('product-dropdown', 'value')])
def send_alert(selected_dropdown_value):
    """Send an e-mail alert for each selected product whose price dropped.

    Compares the chronologically first and last stored prices as numbers;
    comparing the raw "$..." strings ordered "$10.00" before "$5.86".
    """
    for product_title in selected_dropdown_value or []:
        product_df_specific = _sort_by_datetime(
            product_df[product_df['product_title'] == product_title]
        )
        if product_df_specific.empty:
            # A selected title (e.g. a default dropdown value) may have no rows.
            continue

        original_price = product_df_specific.product_price.values[0]
        latest_price = product_df_specific.product_price.values[-1]
        print(product_title, original_price, latest_price)

        original_amount = _parse_price(original_price)
        latest_amount = _parse_price(latest_price)
        if original_amount is None or latest_amount is None:
            continue
        if latest_amount < original_amount:
            smtp_alert.send_alert_of_price_reduction(product_title, original_price, latest_price)
    return None


if __name__ == '__main__':
    app.run_server(debug=True)
# One INSERT shared by both raw-sqlite writers: SQLite supplies the timestamp,
# the remaining five columns are bound as parameters.
_INSERT_PRODUCT_SQL = (
    "INSERT INTO lazada_product VALUES "
    "(CURRENT_TIMESTAMP, ?, ?, ?, ?, ?)"
)


def write_values(id, link, product_title, product_price, category):
    """Insert one product row into ``lazada_product``.

    The values are bound with ``?`` placeholders.  The previous version put
    the Python parameter *names* inside the SQL text, so SQLite treated them
    as column references and the statement could never store the arguments.

    NOTE(review): the column set (link / category) mirrors the Amazon scraper;
    the Lazada scraper produces pack_size / rating_count instead - confirm the
    intended table layout before using this path.
    """
    c.execute(
        _INSERT_PRODUCT_SQL,
        (id, link, product_title, product_price, category),
    )
    # Without a commit the implicit transaction opened by the INSERT is
    # discarded when the connection closes.
    c.connection.commit()


def write_from_df_with_sqlite3(df):
    """Insert every row of ``df`` into ``lazada_product``.

    Args:
        df: DataFrame with ``link``, ``product_title``, ``product_price`` and
            ``category`` columns.  The id is read from an ``id`` column when
            there is one, otherwise from the index.
    """
    rows = [
        (
            row['id'] if 'id' in row else index,
            row['link'],
            row['product_title'],
            row['product_price'],
            row['category'],
        )
        for index, row in df.iterrows()
    ]
    if not rows:
        return
    c.executemany(_INSERT_PRODUCT_SQL, rows)
    c.connection.commit()


def write_from_df_with_alchemy(df):
    """Append ``df`` plus a ``datetime`` column to ``lazada_product``.

    The timestamp is today's date formatted as ``mm/dd/YYYY``.  It is added to
    a copy, so the caller's DataFrame is no longer modified.
    """
    stamped = df.assign(datetime=pd.Timestamp("today").strftime("%m/%d/%Y"))

    # Appending the results to lazada_product
    stamped.to_sql('lazada_product', disk_engine, if_exists='append')
ml,$69.00,(0) 10 | 8,Ben & Jerry's Non-Dairy Chocolate Fudge Brownie Ice Cream,458 ml,$11.92,(1) 11 | 9,Dove Dermaseries Gentle Cleansing Body Wash,250 ml,$11.81,(0) 12 | 10,NESCAFE GOLD Pure Soluble Coffee 200g (NEW),200 g,$13.91,(1) 13 | 11,Kellogg's Coco Pop with We Bare Bear Figurine,400 g,$4.92,(1) 14 | 12,Carlsberg Lager Beer 12 X 320Ml,12 × 320 ml,$22.80,(55) 15 | 13,Corona Extra Beer - Case,24 × 355 ml,$68.00,(5) 16 | 14,Wyeth Nutrition S-26 GOLD PROMISE Stage 4 Growing-up Formula 2'-FL 1.6KG,1.6 kg,$59.14,(8) 17 | 15,Canon Business High Grade 80gsm A4 Paper (Carton) - Printer Paper - By Steve and Leif,2500 per pack,$29.00,(5) 18 | 16,Walch Antibacterial Hand Wash Moisturizing Twin Pack,2 × 525 ml,$6.95,(8) 19 | 17,Coco Life Coconut Water - Case,12 × 330 ml,$13.95,(44) 20 | 18,US Red Cherries,250 g,$5.95,(0) 21 | 19,Crunchy Fresh Extra Fine Green Beans,150 g,$2.20,(8) 22 | 20,Dragonfruits,2 per pack,$2.25,(60) 23 | 21,Global Seasons Yellow Flesh Watermelon,4 kg,$5.62,(2) 24 | 22,Red Dragon Fruit,1 per pack,$2.30,(97) 25 | 23,Prime Asia 2+1 Hass Avocados,3 per pack,$4.95,(168) 26 | 24,RedMart German Salami,100 g,$6.00,(0) 27 | 25,RedMart Pepperoni Salami,100 g,$6.00,(0) 28 | 26,RedMart Grass Fed Angus Beef Sirloins (2 pcs) - New Zealand,500 g,$23.50,(26) 29 | 27,RedMart Grass Fed Angus Ribeye Beef (2 pcs) - New Zealand,500 g,$26.00,(73) 30 | 28,RedMart Grassfed New Zealand Lamb French Rack - New Zealand,450 g,$15.57,(27) 31 | 29,World's Cellar Champagne Brut,750 ml,$46.36,(1) 32 | 30,Casillero del Diablo Sauvignon Blanc,750 ml,$24.37,(0) 33 | 31,Casillero del Diablo Summer Edition Rosé,750 ml,$24.37,(0) 34 | 32,Casillero del Diablo Pinot Noir,750 ml,$24.37,(4) 35 | 33,Casillero del Diablo Carmenere,750 ml,$24.37,(3) 36 | 34,Casillero del Diablo Pinot Grigio,750 ml,$24.37,(1) 37 | 35,Casillero del Diablo Chardonnay,750 ml,$24.37,(1) 38 | 36,Agromar Cabrales Blue Cheese Pate,100 g,$12.50,(0) 39 | 37,SONS Burrata Cheese,120 g,$8.90,(10) 40 | 38,Jones The Grocer 
Barbers Vintage Cheddar Cheese,150 g,$10.20,(0) 41 | 39,Societe Caves Baragnaudes Roquefort PDO - By Le Petit Depot,150 g,$13.50,(0) 42 | 40,Jones The Grocer Brillat Savarin Cheese,150 g,$15.75,(0) 43 | 41,The Cheese Shop Cabrales,,$14.00,(0) 44 | 42,Persil Anti-Bacterial Low Suds Powder Detergent,4.5 kg,$14.01,(9) 45 | 43,Comfort Elegance Luxury Nature Concentrate Fabric Softener,1.6 L,$8.91,(1) 46 | 44,Comfort Ultra Blossom Fresh Concentrated Fabric Conditioner Refill Pouch,800 ml,$3.30,(13) 47 | 45,Breeze Power Clean Liquid Detergent Refill,1.8 kg,$4.27,(14) 48 | 46,Seventh Generation Emerald Cypress And Fir Toilet Bowl Cleaner,946 ml,$8.01,(0) 49 | 47,Comfort Touch of Love Fabric Conditioner,5 L,$7.54,(36) 50 | 48,Sweet Meadow Creamed Clover Honey,500 g,$15.90,(0) 51 | 49,Kellogg's Chocolate Crunchy Oat Granola,380 g,$5.93,(3) 52 | 50,Smucker's Sugar Free Orange Marmalade,361 g,$5.31,(0) 53 | 51,Kellogg's Crunchy Nut Oat Granola Choco Hazelnut,380 g,$5.93,(14) 54 | 52,Kellogg's X Hershey's Choco Crunch,500 g,$17.02,(2) 55 | 53,Sainsbury's 100-Percent Smooth Almond Butter,340 g,$9.00,(17) 56 | 54,Enfamil A+ Stage 2 Infant Formula Baby Milk Powder (6M+),1.8 kg,$105.55,(27) 57 | 55,Pampers Baby Dry Tape Diapers M - Case,4 × 64 per pack,$69.12,(40) 58 | 56,Pampers Baby Dry Tape Diapers XL,40 per pack,$24.00,(18) 59 | 57,Pampers Baby Dry Tape Diapers Newborn - Case,4 × 90 per pack,$69.12,(17) 60 | 58,Pampers Premium Care Tape Diapers M,48 per pack,$21.90,(7) 61 | 59,Pampers Premium Care Tape Diapers S,60 per pack,$23.90,(10) 62 | 60,Primal Freeze Dried Canine Pork Formula - Value Box,4 × 397 g,$159.90,(0) 63 | 61,Cesar Chicken Pate Dog Food,100 g,$1.80,(2) 64 | 62,Ginger and Bear Dog Walking - Deluxe Training Pouch - Mocha,1 per pack,$33.00,(1) 65 | 63,Whimzees Dog Chew - Stix M,12 per pack,$23.10,(0) 66 | 64,Nutrience SubZero Dog Fraser Valley 10KG,10 kg,$170.00,(0) 67 | 65,Primal Freeze Dried Canine Pork Formula-14oz,397 g,$88.00,(0) 68 | 66,CK 13A 3-Way Adaptor 
With Neon CK 8396N,1 per pack,$4.90,(0) 69 | 67,Sellery 59-018K Nylon Calbe Ties Size:3.6mmx180mm(Black),1 per pack,$3.92,(0) 70 | 68,Steve & Leif Child Safety Gate Extension (20CM),1 per pack,$29.90,(0) 71 | 69,3M Comfort Grip Glove L (Yellow),1 per pack,$10.90,(0) 72 | 70,3M Comfort Grip Glove L (Green),1 per pack,$10.90,(0) 73 | 71,3M Comfort Grip Glove L (Blue),1 per pack,$10.90,(0) 74 | 72,Pukka Herbal Ayurveda Pukka Organic Majestic Matcha Chai Latte (4 x 90 g) - By Wholesome Harvest,90 g,$18.90,(0) 75 | 73,GardenScent Organic Elder Flowers Tea,50 g,$12.80,(0) 76 | 74,Gryphon Nymph of The Nile Tea,20 per pack,$20.95,(1) 77 | 75,Ahmad Fruitytea Selection Tea,120 g,$13.50,(0) 78 | 76,Basilur Tea Specialty Classics Gift Collection,60 per pack,$18.00,(1) 79 | 77,GardenScent Organic Nettle Leaf Tea,50 g,$12.80,(0) 80 | 78,Haepio Ssamjang Seasoned Bean Paste,170 g,$1.90,(1) 81 | 79,Nissin Cup Noodles Spicy Seafood Instant Noodles,75 g,$1.30,(33) 82 | 80,Nongshim Champong Spicy Squid Seafood Instant Noodles Cup,67 g,$2.05,(15) 83 | 81,Youki Dashi Powder,35 g,$4.50,(1) 84 | 82,Nongshim Chapagetti Black Instant Ramen Noodles 5s,700 g,$6.48,(11) 85 | 83,Nissin No.1 Dried Soba Noodles,200 g,$2.90,(0) 86 | 84,Want Want Senbei Rice Crackers,92 g,$2.05,(73) 87 | 85,Oreo Vanilla Cream Filled Chocolate Sandwich Cookies Multipack,9 × 29.4 g,$1.95,(61) 88 | 86,Pringles Sour Cream and Onion Potato Crisps Chips,147 g,$2.70,(43) 89 | 87,Nature's Wonders Baked USA Walnuts,200 g,$6.55,(34) 90 | 88,Kettle Honey Dijon Potato Chips,142 g,$3.96,(103) 91 | 89,Pringles Original Potato Crisps Chips,147 g,$2.70,(30) 92 | 90,Ben & Jerry's Non-Dairy Coffee Caramel Fudge Ice Cream,473 ml,$11.92,(0) 93 | 91,Ben & Jerry's Caramel Almond Brittle Dairy Free Ice Cream,473 ml,$11.92,(5) 94 | 92,Ben & Jerry's Coconut Seven Layer Bar Dairy Free Ice Cream,473 ml,$11.92,(15) 95 | 93,Ben & Jerry's Peanut Butter And Cookies Dairy Free Ice Cream,473 ml,$11.92,(14) 96 | 94,Ben & Jerry's Low-Calorie Peanut 
Butter Dough Ice Cream,458 ml,$11.92,(6) 97 | 95,Ben & Jerry's Low-Calorie Caramel Cookie Fix Ice Cream,458 ml,$11.92,(3) 98 | 96,Aveda Invati Scalp Revitalizer,150 ml,$90.20,(1) 99 | 97,Biofinest Castor Organic Oil - USA Imported - 100% Pure Organic Carrier Oil Oil,10 ml,$11.94,(2) 100 | 98,Daeng Gi Meo Ri Vitalizing Shampoo,500 ml,$38.80,(2) 101 | 99,Naturigin 2.0 Black Hair Colour Dye,115 ml,$29.05,(1) 102 | 100,Biofinest Dead Sea Mud Mask - With Shea Butter Aloe Vera Collagen Face Mask,250 g,$17.97,(0) 103 | 101,Alteya Organics Organic Bulgarian Rose Water,500 ml,$39.92,(2) 104 | 102,Kotex Slim Overnight Sanitary Pads 28Cm,20 per pack,$5.95,(16) 105 | 103,U by Kotex Feminine Care Applicator Tampons Super,16 per pack,$10.95,(5) 106 | 104,Huggies Silver Pants XXL,34 × 1 per pack,$14.95,(6) 107 | 105,Huggies Platinum Diapers L,54 per pack,$26.95,(3) 108 | 106,Kotex Air Super Ultrathin Wing Day Sanitary Pads 24 Cm,18 per pack,$5.95,(12) 109 | 107,Huggies Platinum Diapers Newborn 60s,60 per pack,$21.00,(1) 110 | -------------------------------------------------------------------------------- /Selenium Web Scraping/lazada/run_lazada.bat: -------------------------------------------------------------------------------- 1 | C:\new_software\finance\Scripts\python.exe "C:/new_software/Web Scraping/Web-Scraping/Selenium Web Scraping/lazada/scraping_lazada.py" 2 | pause 3 | -------------------------------------------------------------------------------- /Selenium Web Scraping/lazada/scraping_lazada.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from selenium.webdriver.common.by import By 3 | from selenium.webdriver.support.ui import WebDriverWait 4 | from selenium.webdriver.support import expected_conditions as EC 5 | from selenium.common.exceptions import TimeoutException 6 | 7 | import pandas as pd 8 | import dbm 9 | 10 | 11 | # Run the argument with incognito 12 | option = 
webdriver.ChromeOptions()  # right-hand side of the `option = ` assignment that ends the preceding line
# The switch must be spelled with two ASCII hyphens; the previous value
# (' — incognito', an em dash padded with spaces) is not a valid Chrome switch.
option.add_argument('--incognito')
# `options=` is the supported keyword; `chrome_options=` is its deprecated alias.
driver = webdriver.Chrome(executable_path='chromedriver', options=option)

# Everything that talks to the browser runs inside try/finally so the Chrome
# and chromedriver processes are shut down even when a lookup fails.
try:
    driver.get('https://www.lazada.sg/#')

    # Wait up to 30 seconds for the first top-level category to become visible.
    timeout = 30
    try:
        WebDriverWait(driver, timeout).until(
            EC.visibility_of_element_located((By.ID, "Level_1_Category_No1")))
    except TimeoutException:
        print('Timed out waiting for page to load')
        # Nothing below can work without the page, so stop here instead of
        # carrying on with a closed browser (the finally clause quits it).
        raise SystemExit(1)

    # Text of the first top-level category entry.
    category_element = driver.find_element(By.ID, 'Level_1_Category_No1').text
    print("category element ",category_element)

    # Take the list of li in the category menu's ul.
    list_category_elements = driver.find_element(
        By.XPATH, '//*[@id="J_icms-5000498-1511516689962"]/div/ul')
    links = list_category_elements.find_elements(By.CLASS_NAME, "lzd-site-menu-root-item")
    print('length of links are: ', len(links))
    for link in links:
        print("element in list ", link.text)

    # Open the third channel link. A plain element.click() may fail when the
    # element is not actionable, so move to it first and click through
    # ActionChains.
    element = driver.find_elements(By.CLASS_NAME, 'J_ChannelsLink')[2]
    webdriver.ActionChains(driver).move_to_element(element).click(element).perform()

    # try:
    #     WebDriverWait(driver, timeout).until(EC.visibility_of_element_located((By.CLASS_NAME, "title_wrapper")))
    # except TimeoutException:
    #     print('Timed out waiting for page to load')
    #     driver.quit()

    # Once we are in, print the title of every product on the page.
    for title in driver.find_elements(By.CLASS_NAME, 'title'):
        print(title.text)

    # Collect title, pack size, price and rating count per product container.
    data = {'product_title': [], 'pack_size': [], 'product_price': [], 'rating_count': []}
    for container in driver.find_elements(By.CLASS_NAME, 'product_container'):
        data['product_title'].append(container.find_element(By.CLASS_NAME, 'title').text)
        data['pack_size'].append(container.find_element(By.CLASS_NAME, 'pack_size').text)
        data['product_price'].append(container.find_element(By.CLASS_NAME, 'product_price').text)
        data['rating_count'].append(container.find_element(By.CLASS_NAME, 'ratings_count').text)

    df_product = pd.DataFrame.from_dict(data)
    print(df_product.head())
finally:
    driver.quit()

# -------------------------------EXPORT and SAVE-------------------------------
# Exporting the data into csv
# df_product.to_csv('product_info_lazada.csv')

# Inserting into sqlite
# dbm.write_from_df_with_sqlite3(df_product)

# Inserting into sqlite with alchemy
dbm.write_from_df_with_alchemy(df_product)
# -------------------------------------------------------------------------------- /Selenium Web
# Scraping/lazada/smtp_alert.py: --------------------------------------------------------------------------------
import smtplib
import configparser
import re
from email.message import EmailMessage

# Sender credentials are read once, at import time, from the [EMAIL] section
# of email_properties.ini. A missing file or key raises KeyError here.
config = configparser.ConfigParser()
config.read('email_properties.ini')
gmail_user = config['EMAIL']['user']
gmail_password = config['EMAIL']['password']


def send_alert_of_price_reduction(product_title = '',original_price = '',latest_price = ''):
    """Email a price-drop alert through Gmail's SMTP-over-SSL endpoint.

    Args:
        product_title: Name of the product; when non-empty it is placed on
            the first line of the message body.
        original_price: Previously recorded price, inserted into the body as-is.
        latest_price: Newly observed price, inserted into the body as-is.

    Returns:
        None. Sending is best-effort: any failure is printed, not raised.
    """
    recipients = ['vincentkernn@gmail.com']

    body = 'Alert for reduce in price from %s to %s' % (original_price, latest_price)
    if product_title:
        # Say which product the alert is about. EmailMessage encodes the body
        # as UTF-8 when needed, so the title does not have to be stripped of
        # non-ASCII characters first.
        body = '%s\n%s' % (product_title, body)

    # Build real RFC 5322 headers. The old hand-formatted string began with a
    # blank line (pushing the header lines into the body), had no "Subject:"
    # header, and rendered the recipient list as a Python list repr.
    message = EmailMessage()
    message['From'] = gmail_user
    message['To'] = ', '.join(recipients)
    message['Subject'] = 'Alert: your product price is reduced'
    message.set_content(body)

    try:
        # The context manager closes the connection even when login or
        # sending raises, so a failed attempt no longer leaks the socket.
        with smtplib.SMTP_SSL('smtp.gmail.com', 465) as server:
            server.login(gmail_user, gmail_password)
            server.send_message(message)

        print ('Email sent!')
    except Exception as e:
        # Deliberately broad: a failed alert must not abort the caller.
        print(e)
        print ('Something went wrong...')
# --------------------------------------------------------------------------------