├── LICENSE
├── betscrape3.py
├── betscrape2.py
├── betscrape.py
└── README.md


/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2023 Hans Alemão
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/betscrape3.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | from kthread_sleep import sleep
 3 | from seleniumbase import Driver
 4 | import pandas as pd
 5 | from selenium.webdriver.common.by import By
 6 | from selenium.webdriver.support import expected_conditions
 7 | from selenium.webdriver.support.wait import WebDriverWait
 8 | from a_selenium2df import get_df
 9 | from PrettyColorPrinter import add_printer
10 | add_printer(1)
11 | 
12 | def obter_dataframe(query='*'):
13 |     df = pd.DataFrame()
14 |     while df.empty:
15 |         df = get_df(
16 |             driver,
17 |             By,
18 |             WebDriverWait,
19 |             expected_conditions,
20 |             queryselector=query,
21 |             with_methods=True,
22 |         )
23 |     return df
24 | 
25 | 
26 | 
27 | driver = Driver(uc=True)
28 | driver.get("https://www.bet365.com/#/IP/B1")
29 | df = obter_dataframe()
30 | df.loc[df.aa_classList.str.contains(
31 |     'iip-IntroductoryPopup_Cross', regex=False, na=False)].se_click.iloc[0]()
32 | sleep(2)
33 | df.loc[df.aa_classList.str.contains(
34 |     'ccm-CookieConsentPopup_Accept', regex=False, na=False)].se_click.iloc[0]()
35 | df3 = obter_dataframe(query='div.ovm-Fixture_Container')
36 | df3.loc[df3.aa_innerText.str.split('\n').str[2:].apply(
37 |     lambda x: True if re.match(r'^[\d:]+Ç\d+Ç\d+Ç\d+Ç[\d.]+Ç[\d.]', 'Ç'.join(x)) else False)
38 | ].aa_innerText.str.split(
39 |     '\n').apply(pd.Series).reset_index(drop=True).to_excel('c:\\testbet365_3.xlsx')
40 | 


--------------------------------------------------------------------------------
/betscrape2.py:
--------------------------------------------------------------------------------
 1 | # pip install pandas selenium a_selenium2df PrettyColorPrinter
 2 | # important: seleniumbase must be installed like this:
 3 | # python.exe -m pip install -U seleniumbase
 4 | # https://www.youtube.com/watch?v=uVkT61OQTPs
 5 | 
 6 | from seleniumbase import Driver
 7 | import pandas as pd
 8 | from selenium.webdriver.common.by import By
 9 | from selenium.webdriver.support import expected_conditions
10 | from selenium.webdriver.support.wait import WebDriverWait
11 | from a_selenium2df import get_df
12 | from PrettyColorPrinter import add_printer
13 | 
14 | add_printer(1)
15 | driver = Driver(uc=True)
16 | driver.get("https://www.bet365.com/#/AC/B1/C1/D1002/E88638566/G40/")
17 | df = pd.DataFrame()
18 | while df.empty:
19 |     df = get_df(
20 |         driver,
21 |         By,
22 |         WebDriverWait,
23 |         expected_conditions,
24 |         queryselector="*",
25 |         with_methods=True,
26 |     )
27 | 
28 | df.loc[df.aa_textContent.str.contains("^Aceitar$", regex=True, na=False)].iloc[
29 |     0
30 | ].se_click()
31 | dfteams = df.loc[
32 |     df.aa_innerHTML.str.contains("rcl-ParticipantFixtureDetails_LhsContainer", na=False)
33 |     & ~df.aa_offsetLeft.isna()
34 |     & df.aa_textContent.str.contains(r"^\d+:\d\d", regex=True, na=False)
35 | ]
36 | data = list(
37 |     reversed(
38 |         df.loc[
39 |             df.aa_classList.str.contains(
40 |                 "gl-MarketGroupContainer", na=False, regex=False
41 |             )
42 |         ]
43 |         .aa_innerText.iloc[0]
44 |         .splitlines()
45 |     )
46 | )
47 | data2 = [x for x in data if x not in ["1", "2", "X"]]
48 | passos = len(dfteams)
49 | df2 = pd.DataFrame(
50 |     reversed(
51 |         [list(reversed(data2[x : x + passos])) for x in range(0, len(data2), passos)][
52 |             :4
53 |         ]
54 |     )
55 | ).T
56 | df3 = dfteams.aa_innerText.str.split("\n", expand=True, regex=False)[[0, 1, 2]].rename(
57 |     columns={0: "horario", 1: "team1", 2: "team2"}
58 | )
59 | df2 = df2.rename(columns={0: "vencedor1", 1: "empate", 2: "vencedor2", 3: "numero"})
60 | dffinal = pd.concat([df2.reset_index(drop=True), df3.reset_index(drop=True)], axis=1)
61 | print(dffinal)
62 | 


--------------------------------------------------------------------------------
/betscrape.py:
--------------------------------------------------------------------------------
  1 | import time
  2 | from LatinFixer import LatinFix
  3 | from pandasmemuc import MeMuc
  4 | from PrettyColorPrinter import add_printer  # optional
  5 | from a_pandas_ex_bs4df_lite import pd_add_bs4_to_df_lite
  6 | import pandas as pd
  7 | from a_pandas_ex_apply_ignore_exceptions import pd_add_apply_ignore_exceptions
  8 | 
  9 | pd_add_apply_ignore_exceptions()  # presumably what provides ds_apply_ignore (used in download_data) — confirm
 10 | add_printer(1)  # PrettyColorPrinter hook; its import above is marked optional
 11 | pd_add_bs4_to_df_lite()  # presumably what provides pd.Q_bs4_to_df_lite (used in download_data) — confirm
 12 | devices = MeMuc()  # shared handle the functions below use to reach a device by index
 13 | 
 14 | 
 15 | def wait_for_element_and_click(fu, deviceindex, timeout=60, click=True):
 16 |     df2 = pd.DataFrame()
 17 |     timeoutfinal = time.time() + timeout
 18 |     while df2.empty:
 19 |         try:
 20 |             if time.time().__gt__(timeoutfinal):
 21 |                 raise TimeoutError("Timeout!!")
 22 |             uia = devices.get_ui_automator_df(deviceindex)
 23 |             df2 = uia.loc[fu(uia)]
 24 |         except AttributeError:
 25 |             continue
 26 |     if click:
 27 |         df2.iloc[0].ff_bb_tap_exact_center()
 28 | 
 29 | 
 30 | def get_mhtml_file(deviceindex):
 31 |     newest_archive = (
 32 |         [
 33 |             x
 34 |             for x in devices.iloc[
 35 |                 deviceindex
 36 |             ].bb_adbtools.aa_execute_multiple_adb_shell_commands(
 37 |                 "ls -t /sdcard/Download/"
 38 |             )
 39 |             if b"bet365" in x and b"mhtml" in x
 40 |         ][0]
 41 |         .decode("utf-8")
 42 |         .strip()
 43 |     )
 44 |     htmlcode = b"".join(
 45 |         devices.iloc[deviceindex].bb_adbtools.aa_execute_multiple_adb_shell_commands(
 46 |             [f'cat "/sdcard/Download/{newest_archive}"']
 47 |         )
 48 |     ).replace(b"\r\n", b"\n")
 49 |     return htmlcode
 50 | 
 51 | 
 52 | def download_data(
 53 |     link="https://www.bet365.com/#/AC/B1/C1/D1002/E88369731/G40/",
 54 |     deviceindex=5,
 55 |     timeout=60,
 56 | ):
 57 | 
 58 |     devices.iloc[deviceindex].bb_adbtools.aa_open_website(link)
 59 | 
 60 |     wait_for_element_and_click(
 61 |         fu=lambda uia: uia.bb_content_desc == "More options",
 62 |         deviceindex=deviceindex,
 63 |         timeout=timeout,
 64 |     )
 65 |     wait_for_element_and_click(
 66 |         fu=lambda uia: uia.bb_content_desc == "Download",
 67 |         deviceindex=deviceindex,
 68 |         timeout=timeout,
 69 |     )
 70 |     wait_for_element_and_click(
 71 |         fu=lambda uia: uia.bb_resource_id == "com.android.chrome:id/positive_button",
 72 |         deviceindex=deviceindex,
 73 |         timeout=timeout,
 74 |     )
 75 |     wait_for_element_and_click(
 76 |         fu=lambda uia: (uia.bb_class == "android.widget.Button") & (uia.bb_text == "Open"),
 77 |         deviceindex=deviceindex,
 78 |         timeout=timeout,
 79 |         click=False,
 80 |     )
 81 | 
 82 |     htmlcode = get_mhtml_file(deviceindex)
 83 | 
 84 |     df = pd.Q_bs4_to_df_lite(htmlcode, parser="lxml")
 85 | 
 86 |     teams = (
 87 |         df.loc[df.aa_value == "rcl-ParticipantFixtureDetails_TeamNames"]
 88 |         .ds_apply_ignore(
 89 |             [pd.NA, pd.NA],
 90 |             lambda x: [
 91 |                 LatinFix(x.aa_contents[0].text).apply_wrong_chars(),
 92 |                 LatinFix(x.aa_contents[1].text).apply_wrong_chars(),
 93 |             ],
 94 |             axis=1,
 95 |             result_type="expand",
 96 |         )
 97 |         .rename(columns={0: "team1", 1: "team2"})
 98 |         .reset_index(drop=True)
 99 |     )
100 | 
101 |     bookcloses_cat = (
102 |         df.loc[df.aa_value == "rcl-ParticipantFixtureDetails_Details"]
103 |         .ds_apply_ignore(
104 |             [
105 |                 pd.NA,
106 |                 pd.NA,
107 |             ],
108 |             lambda x: [y.text for y in x.aa_contents][:2],
109 |             axis=1,
110 |             result_type="expand",
111 |         )
112 |         .rename(columns={0: "book_closes", 1: "category_type"})
113 |         .reset_index(drop=True)
114 |     )
115 |     betdata = pd.DataFrame(
116 |         (
117 |             df.loc[
118 |                 (df.aa_name == "span") & (df.aa_value == "sgl-ParticipantOddsOnly80_Odds")
119 |             ].ds_apply_ignore(pd.NA, lambda x: float(x.aa_contents[0].text), axis=1)
120 |         )
121 |         .__array__()
122 |         .reshape((3, len(teams)))
123 |         .T,
124 |         columns=["bet_1", "bet_x", "bet_2"],
125 |     )
126 |     return pd.concat([teams, bookcloses_cat, betdata], axis=1)
127 | 
128 | 
129 | 
130 | serie_a = download_data(
131 |     link="https://www.bet365.com/#/AC/B1/C1/D1002/E88369731/G40/",
132 |     deviceindex=5,
133 |     timeout=60,
134 | )
135 | print(serie_a)
136 | serie_b = download_data(
137 |     link="https://www.bet365.com/#/AC/B1/C1/D1002/E88638566/G40/",
138 |     deviceindex=5,
139 |     timeout=60,
140 | )
141 | print(serie_b)


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | ## I worked on 4 new web scraping methods which I am going to explain in Portuguese and English!
  2 | 
  3 | [![YT](https://i.ytimg.com/vi/vGTeLTHiKB8/maxresdefault.jpg)](https://www.youtube.com/watch?v=vGTeLTHiKB8)
  4 | [https://www.youtube.com/watch?v=vGTeLTHiKB8](https://www.youtube.com/watch?v=vGTeLTHiKB8)
  5 | 
  6 | [![YT](https://i.ytimg.com/vi/oNPTG6YPEeM/maxresdefault.jpg)](https://www.youtube.com/watch?v=oNPTG6YPEeM)
  7 | [https://www.youtube.com/watch?v=oNPTG6YPEeM](https://www.youtube.com/watch?v=oNPTG6YPEeM)
  8 | 
  9 | [![YT](https://i.ytimg.com/vi/Yw8IykeHKLQ/maxresdefault.jpg)](https://www.youtube.com/watch?v=Yw8IykeHKLQ)
 10 | [https://www.youtube.com/watch?v=Yw8IykeHKLQ](https://www.youtube.com/watch?v=Yw8IykeHKLQ)
 11 | 
 12 | 
 13 | # Web Scraping no site bet365.com
 14 | 
 15 | 
 16 | [![YT](https://i.ytimg.com/vi/wtzj8yi5mPQ/maxresdefault.jpg)](https://www.youtube.com/watch?v=wtzj8yi5mPQ)
 17 | [https://www.youtube.com/watch?v=wtzj8yi5mPQ](https://www.youtube.com/watch?v=wtzj8yi5mPQ)
 18 | 
 19 | 
 20 | # Parte 3 - Usando SeleniumBase
 21 | 
 22 | [![YT](https://i.ytimg.com/vi/wtzj8yi5mPQ/maxresdefault.jpg)](https://www.youtube.com/watch?v=wtzj8yi5mPQ)
 23 | [https://www.youtube.com/watch?v=wtzj8yi5mPQ](https://www.youtube.com/watch?v=wtzj8yi5mPQ)
 24 | 
 25 | Neste vídeo, vou mostrar como realizar web scraping no site Bet365 para obter quaisquer dados de qualquer categoria. Uso como exemplo jogos de futebol ao vivo, mas o método pode ser aplicado em qualquer outra parte do site. Diferente dos outros vídeos (https://www.youtube.com/watch?v=Xfw4QaJZ2t0 e https://www.youtube.com/watch?v=uVkT61OQTPs), não planejei nada antes de gravar para você entender como começar uma raspagem de dados do zero e lidar com problemas no caminho. Utilizaremos Python, SeleniumBase, a_selenium2df e algumas bibliotecas auxiliares para automatizar a extração de informações.
 26 | 
 27 | Exploraremos como configurar o ambiente de scraping e como usar o Selenium para interagir com o site Bet365. Ao longo do vídeo, vou explicar cada parte do código para que você possa entender o processo.
 28 | 
 29 | Discutirei também como selecionar elementos específicos na página usando seletores CSS e como extrair os dados desejados. Além disso, abordarei como lidar com elementos pop-up e consentimento de cookies durante o scraping.
 30 | 
 31 | Finalmente, demonstrarei como salvar os dados coletados em um DataFrame do Pandas e exportá-los para um arquivo Excel. Você verá como organizar os dados de maneira limpa e eficiente.
 32 | 
 33 | # Parte 2 - Usando SeleniumBase
 34 | 
 35 | [![YT](https://i.ytimg.com/vi/uVkT61OQTPs/maxresdefault.jpg)](https://www.youtube.com/watch?v=uVkT61OQTPs)
 36 | [https://www.youtube.com/watch?v=uVkT61OQTPs](https://www.youtube.com/watch?v=uVkT61OQTPs)
 37 | 
 38 | Neste vídeo educativo, vamos explorar como utilizar a biblioteca seleniumbase em conjunto com o Python para coletar informações de um site de apostas, o bet365.com. A raspagem de dados é uma técnica amplamente utilizada para extrair informações relevantes de páginas da web de forma automatizada.
 39 | 
 40 | 🔗 Código Utilizado:
 41 | https://github.com/hansalemaos/bet365_web_scraping/raw/main/betscrape2.py
 42 | 
 43 | 📌 Neste tutorial, você vai aprender:
 44 | - Como configurar o ambiente de raspagem de dados com o Selenium em Python.
 45 | - Como acessar o site bet365.com e interagir com os elementos da página.
 46 | - Como extrair dados específicos, como horários de jogos e equipes envolvidas.
 47 | - Como manipular e organizar os dados coletados utilizando a biblioteca Pandas.
 48 | - Como criar um DataFrame contendo informações relevantes para análise.
 49 | 
 50 | Este tutorial é estritamente para fins educativos e destina-se a explorar as capacidades do Selenium e da programação em Python. Lembre-se de respeitar os termos de uso e políticas de qualquer site ao realizar raspagem de dados.
 51 | 
 52 | 🔔 Se você é novo na raspagem de dados ou deseja aprender mais sobre programação Python, este vídeo é para você! Deixe suas perguntas e comentários abaixo.
 53 | 
 54 | 👍 Gostou do tutorial? Deixe o seu like e inscreva-se no canal para mais conteúdo relacionado a programação e tecnologia.
 55 | 
 56 | 
 57 | ### Pacotes para instalar 
 58 | 
 59 | pip install pandas selenium a_selenium2df PrettyColorPrinter
 60 | 
 61 | importante: tem que instalar seleniumbase assim:
 62 | python.exe -m pip install -U seleniumbase
 63 | 
 64 | 
 65 | 
 66 | https://github.com/hansalemaos/PrettyColorPrinter
 67 | 
 68 | https://github.com/hansalemaos/a_selenium2df
 69 | 
 70 | https://github.com/pandas-dev/pandas
 71 | 
 72 | https://github.com/SeleniumHQ/selenium
 73 | 
 74 | https://github.com/seleniumbase/SeleniumBase
 75 | 
 76 | 
 77 | # Parte 1 - Usando ADB
 78 | 
 79 | [![YT](https://i.ytimg.com/vi/Xfw4QaJZ2t0/maxresdefault.jpg)](https://www.youtube.com/watch?v=Xfw4QaJZ2t0)
 80 | [https://www.youtube.com/watch?v=Xfw4QaJZ2t0](https://www.youtube.com/watch?v=Xfw4QaJZ2t0)
 81 | 
 82 | Neste tutorial educacional, você vai descobrir como realizar web scraping no site Bet365 para extrair 
 83 | informações valiosas sobre apostas esportivas. Compartilho um código Python poderoso que permite 
 84 | automatizar esse processo e obter dados relevantes em questão de minutos.
 85 | 
 86 | 📋 O que você vai aprender neste tutorial:
 87 | 
 88 | - Como utilizar bibliotecas Python para automatizar o processo de web scraping.
 89 | - Extrair informações detalhadas sobre partidas esportivas, incluindo times, odds e datas de fechamento das apostas.
 90 | - Utilizar técnicas avançadas para lidar com elementos dinâmicos da página, garantindo uma extração precisa.
 91 | 
 92 | ## Pré-requisitos:
 93 | 
 94 | ### Python 3.10 / Anaconda / Windows 
 95 | 
 96 | [![YT](https://i.ytimg.com/vi/I696ytDkdXo/maxresdefault.jpg)](https://www.youtube.com/watch?v=I696ytDkdXo)
 97 | [https://www.youtube.com/watch?v=I696ytDkdXo](https://www.youtube.com/watch?v=I696ytDkdXo)
 98 | 
 99 | ### Memu Play 9 (também pode usar outro emulador ou seu celular, mas precisa ajustar o código e usar https://github.com/hansalemaos/adbkit em vez de pandasmemuc )
100 | 
101 | Como tirar a propaganda do Memu Play 
102 | 
103 | [![YT](https://i.ytimg.com/vi/esCPIwwQJ1o/maxresdefault.jpg)](https://www.youtube.com/watch?v=esCPIwwQJ1o)
104 | [https://www.youtube.com/watch?v=esCPIwwQJ1o](https://www.youtube.com/watch?v=esCPIwwQJ1o)
105 | 
106 | ### Pacotes para instalar 
107 | 
108 | pip install LatinFixer pandasmemuc PrettyColorPrinter a_pandas_ex_bs4df_lite a_pandas_ex_apply_ignore_exceptions lxml pandas
109 | 
110 | https://github.com/hansalemaos/LatinFixer
111 | 
112 | https://github.com/hansalemaos/pandasmemuc
113 | 
114 | https://github.com/hansalemaos/PrettyColorPrinter
115 | 
116 | https://github.com/hansalemaos/a_pandas_ex_bs4df_lite
117 | 
118 | https://github.com/hansalemaos/a_pandas_ex_apply_ignore_exceptions
119 | 
120 | https://pypi.org/project/lxml/
121 | 
122 | https://pypi.org/project/pandas/
123 | 
124 | 
125 | 
126 | 
127 | 


--------------------------------------------------------------------------------