├── exam.py ├── facebook_search_results.xlsx ├── facebooksdk.py ├── goo.py ├── index.py ├── lancer_p.py ├── matched_links.xlsx ├── ness.py ├── open_url.txt ├── pyQt.py ├── scene.qml ├── search_engine.py └── tkinter_ui.py /exam.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | 4 | url = "https://www.freelancer.com/projects/unity-3d/Unity-expert-needed-36573434/details" 5 | keyword = "expert" 6 | 7 | response = requests.get(url) 8 | page_content = response.content 9 | 10 | soup = BeautifulSoup(page_content, 'html.parser') 11 | keyword_instances = soup.find_all(string=lambda string: keyword in string.lower()) 12 | 13 | num_instances = len(keyword_instances) 14 | 15 | print(f"The keyword '{keyword}' appears {num_instances} times on the page.") -------------------------------------------------------------------------------- /facebook_search_results.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/groupofstars/Web_scraping/f1c30fa65e69e04310d7950b56dea0c8c8134833/facebook_search_results.xlsx -------------------------------------------------------------------------------- /facebooksdk.py: -------------------------------------------------------------------------------- 1 | from elasticsearch import Elasticsearch 2 | 3 | es = Elasticsearch() 4 | es.indices.create(index='facebook') 5 | 6 | import facebook 7 | 8 | graph = facebook.GraphAPI(access_token='your-access-token', version='11.0') 9 | 10 | posts = graph.get_object(id='me', fields='posts')['posts']['data'] 11 | 12 | for post in posts: 13 | es.index( 14 | index='facebook', 15 | doc_type='post', 16 | body={ 17 | 'id': post['id'], 18 | 'message': post.get('message', ''), 19 | 'created_time': post['created_time'] 20 | } 21 | ) 22 | query = { 23 | 'query': { 24 | 'match': { 25 | 'message': 'keyword' 26 | } 27 | } 28 | } 29 | 30 | results = es.search(index='facebook', doc_type='post', body=query)['hits']['hits'] 31 | 32 | for hit in results: 33 | print(hit['_source']['message']) -------------------------------------------------------------------------------- /goo.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import pandas as pd 4 | 5 | # Define the search query and URL 6 | query = 'Facebook' 7 | url = f'https://registry.elevategreece.gov.gr?q={query}' 8 | 9 | # Set the headers to simulate a browser request 10 | headers = { 11 | 12 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36' 13 | } 14 | 15 | # Send a GET request with the query and headers 16 | response = requests.get(url, headers=headers) 17 | 18 | # Create a BeautifulSoup object from the response content 19 | soup = BeautifulSoup(response.content, 'html.parser') 20 | 21 | # Find all search result items in the page 22 | results = soup.find_all('div', class_='g') 23 | 24 | # Parse the title and link from each search result 25 | items = [] 26 | for r in results: 27 | # Skip any non-search-result DIVs 28 | if not r.find('a'): 29 | continue 30 | 31 | # Extract the title and link from the search result 32 | title = r.find('h3').get_text() 33 | url = r.find('a')['href'] 34 | 35 | # Only include Facebook pages in the results 36 | # if url.startswith('https://www.facebook.com/'): 37 | items.append({'title': title, 'url': url}) 38 | 39 | # Write the results to an Excel file 40 | df = pd.DataFrame(items, columns=['title', 'url']) 41 | df.to_excel('facebook_search_results.xlsx', index=False) -------------------------------------------------------------------------------- /index.py: -------------------------------------------------------------------------------- 1 | import wx 2 | 3 | class MyFrame(wx.Frame): 4 | def __init__(self): 5 | super().__init__(parent=None, title='URL Search') 6 | panel = wx.Panel(self) 7 | 8 | # Create UI elements 9 | url_label = wx.StaticText(panel, label='Enter URL:') 10 | self.url_input = wx.TextCtrl(panel) 11 | search_button = wx.Button(panel, label='Search') 12 | self.result_display = wx.TextCtrl(panel, style=wx.TE_MULTILINE|wx.TE_READONLY) 13 | accept_button = wx.Button(panel, id=wx.ID_OK) 14 | cancel_button = wx.Button(panel, id=wx.ID_CANCEL) 15 | 16 | # Add UI elements to a sizer 17 | vbox = wx.BoxSizer(wx.VERTICAL) 18 | hbox1 = wx.BoxSizer(wx.HORIZONTAL) 19 | hbox2 = wx.BoxSizer(wx.HORIZONTAL) 20 | hbox3 = wx.BoxSizer(wx.HORIZONTAL) 21 | hbox1.Add(url_label, flag=wx.RIGHT, border=8) 22 | hbox1.Add(self.url_input, proportion=1) 23 | hbox1.Add(search_button) 24 | hbox2.Add(self.result_display, proportion=1, flag=wx.EXPAND) 25 | hbox3.Add(accept_button) 26 | hbox3.Add(cancel_button, flag=wx.LEFT, border=5) 27 | vbox.Add(hbox1, flag=wx.EXPAND|wx.LEFT|wx.RIGHT|wx.TOP, border=10) 28 | vbox.Add(hbox2, proportion=1, flag=wx.EXPAND|wx.LEFT|wx.RIGHT|wx.TOP|wx.BOTTOM, border=10) 29 | vbox.Add(hbox3, flag=wx.ALIGN_RIGHT|wx.RIGHT|wx.BOTTOM, border=10) 30 | panel.SetSizer(vbox) 31 | 32 | if __name__ == '__main__': 33 | app = wx.App() 34 | frame = MyFrame() 35 | frame.Show() 36 | app.MainLoop() -------------------------------------------------------------------------------- /lancer_p.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import re 4 | 5 | # Send a GET request to the Facebook Developers website 6 | 7 | # access_token = 'insert_access_token_here' 8 | base_url = 'https://www.freelancer.com' 9 | 10 | # Create API endpoint by combining base URL with desired parameters 11 | endpoint = f"{base_url}" 12 | try: 13 | response = requests.get(endpoint) 14 | # .json() 15 | except requests.exceptions.RequestException as e: 16 | print('Error:', e) 17 | 18 | # url = "https://developers.facebook.com/docs/" 19 | # response = requests.get(url) 20 | 21 | # Parse the HTML content with BeautifulSoup 22 | if response.status_code == 200: 23 | soup = BeautifulSoup(response.content, 'html.parser') 24 | # continue parsing HTML content here 25 | else: 26 | print('Error:', response.status_code) 27 | # Extract the relevant information (page titles and URLs) 28 | links = [] 29 | for link in soup.find_all("div"): 30 | title = link.get_text() 31 | # print(title) 32 | url = link.get("href") 33 | # if url.startswith("/details/"): 34 | links.append({"title": title, "url": url}) 35 | 36 | # Search for matches based on target keywords 37 | target_keywords = ["PHP", "multiple"] 38 | matched_links = [] 39 | for link in links: 40 | for keyword in target_keywords: 41 | if re.search(keyword, link["title"], re.IGNORECASE): 42 | matched_links.append(link) 43 | 44 | # Print the matched links 45 | for link in links: 46 | print(f'{link["title"]}: {url} -- {link["url"]}') 47 | 48 | 49 | import pandas as pd 50 | # Rest of the code goes here... 51 | 52 | # Create a pandas DataFrame with the matched_links list 53 | df = pd.DataFrame(links) 54 | 55 | # Write the DataFrame to an Excel file 56 | df.to_excel("matched_links.xlsx", index=False) -------------------------------------------------------------------------------- /matched_links.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/groupofstars/Web_scraping/f1c30fa65e69e04310d7950b56dea0c8c8134833/matched_links.xlsx -------------------------------------------------------------------------------- /ness.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import re 4 | 5 | access_token = 'insert_access_token_here' 6 | base_url = 'https://graph.facebook.com' 7 | 8 | # Create API endpoint by combining base URL with desired parameters 9 | endpoint = f"{base_url}/me/feed?fields=message,created_time&access_token={access_token}" 10 | response = requests.get(endpoint).json() 11 | 12 | # Extract message and created_time fields from each post 13 | post_data = [(post['message'], post['created_time']) for post in response['data']] 14 | 15 | # Search for matches based on target keywords 16 | target_keywords = ["API", "SDK"] 17 | matched_links = [] 18 | for message, _ in post_data: 19 | # Parse the message content with BeautifulSoup 20 | soup = BeautifulSoup(message, "html.parser") 21 | links = soup.find_all("a") 22 | 23 | # Extract the title and URL from each link 24 | for link in links: 25 | title = link.get_text() 26 | url = link.get("href") 27 | if url and any(keyword.lower() in title.lower() for keyword in target_keywords): 28 | matched_links.append({"title": title, "url": url}) 29 | 30 | # Print the matched links 31 | for link in matched_links: 32 | print(f'{link["title"]}: {link["url"]}') -------------------------------------------------------------------------------- /open_url.txt: -------------------------------------------------------------------------------- 1 | import tkinter as tk 2 | import webbrowser 3 | 4 | root = tk.Tk() 5 | root.title("Find Key Words - Find All My Likes, Comments") 6 | root.geometry("800x600") 7 | root.resizable(True, True) 8 | 9 | url_input = tk.StringVar(value="developers.facebook.com/docs/") 10 | result_list = tk.Listbox(root) 11 | 12 | # Set the initial search results 13 | results = ["jhom", "knenti", "df"] 14 | for result in results: 15 | result_list.insert(tk.END, result) 16 | 17 | def search(): 18 | url = url_input.get() 19 | # perform search based on url input 20 | # display results in result_list 21 | 22 | def accept(): 23 | # perform action when Accept button is clicked 24 | pass 25 | 26 | def cancel(): 27 | # perform action when Cancel button is clicked 28 | pass 29 | 30 | def open_link(event): 31 | # open link in default web browser when double clicked 32 | index = result_list.curselection()[0] 33 | url = result_list.get(index) 34 | webbrowser.open_new(url) 35 | 36 | def remove_item(index): 37 | result_list.delete(index) 38 | 39 | def update_item(index): 40 | result_list.itemconfigure(index, bg="yellow") 41 | 42 | def comment_item(index): 43 | # perform action to add a comment to the selected item 44 | pass 45 | 46 | url_label = tk.Label(root, text="URL:") 47 | url_entry = tk.Entry(root, textvariable=url_input, width=70) 48 | 49 | search_button = tk.Button(root, text="Search", command=search,width=20,height=2) 50 | accept_button = tk.Button(root, text="Accept", command=accept,width=20,height=2) 51 | cancel_button = tk.Button(root, text="Cancel", command=cancel,width=20,height=2) 52 | 53 | url_label.grid(row=0, column=0, padx=10, pady=10) 54 | url_entry.grid(row=0, column=1, padx=10, pady=10, sticky="w") 55 | 56 | result_list.grid(row=1, column=0, columnspan=3, padx=10, pady=10, sticky="nesw") 57 | 58 | remove_buttons = [] 59 | update_buttons = [] 60 | comment_buttons = [] 61 | 62 | for i in range(len(results)): 63 | remove_button = tk.Button(root, text="Remove", command=lambda index=i: remove_item(index)) 64 | update_button = tk.Button(root, text="Update", command=lambda index=i: update_item(index)) 65 | comment_button = tk.Button(root, text="Comment", command=lambda index=i: comment_item(index)) 66 | 67 | remove_buttons.append(remove_button) 68 | update_buttons.append(update_button) 69 | comment_buttons.append(comment_button) 70 | 71 | result_list.itemconfig(i, fg="blue") 72 | result_list.itemconfig(i, font="-weight bold") 73 | 74 | remove_button.grid(row=i+2, column=0, padx=10, pady=5) 75 | update_button.grid(row=i+2, column=1, padx=10, pady=5) 76 | comment_button.grid(row=i+2, column=2, padx=10, pady=5) 77 | 78 | search_button.grid(row=len(results)+2, column=0, padx=10, pady=10) 79 | accept_button.grid(row=len(results)+2, column=1, padx=10, pady=10) 80 | cancel_button.grid(row=len(results)+2, column=2, padx=10, pady=10) 81 | 82 | root.mainloop() -------------------------------------------------------------------------------- /pyQt.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from PyQt5.QtWidgets import QApplication, QWidget, QLabel, QLineEdit, QPushButton, QVBoxLayout 3 | 4 | class MyWindow(QWidget): 5 | def __init__(self): 6 | super().__init__() 7 | 8 | # Set window title and geometry 9 | self.setWindowTitle("Web Search") 10 | self.setGeometry(100, 100, 400, 300) 11 | 12 | # Create UI widgets 13 | url_label = QLabel("Enter URL:") 14 | self.url_input = QLineEdit() 15 | self.search_button = QPushButton("Search") 16 | self.cancel_button = QPushButton("Cancel") 17 | self.search_result_label = QLabel("Search Results") 18 | 19 | # Create layout and add widgets 20 | layout = QVBoxLayout() 21 | layout.addWidget(url_label) 22 | layout.addWidget(self.url_input) 23 | layout.addWidget(self.search_button) 24 | layout.addWidget(self.cancel_button) 25 | layout.addWidget(self.search_result_label) 26 | 27 | # Set the layout for the window 28 | self.setLayout(layout) 29 | 30 | if __name__ == "__main__": 31 | app = QApplication(sys.argv) 32 | window = MyWindow() 33 | window.show() 34 | sys.exit(app.exec_()) -------------------------------------------------------------------------------- /scene.qml: -------------------------------------------------------------------------------- 1 | import QtQuick 2.0 as CoreItems 2 | 3 | CoreItems.Rectangle { 4 | width: 100; height: 100 5 | 6 | CoreItems.Text { text: "Hello, world!" } 7 | 8 | // WRONG! No namespace prefix - the Text type won't be found 9 | Text { text: "Hello, world!" } 10 | } -------------------------------------------------------------------------------- /search_engine.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import re 4 | 5 | # Send a GET request to the Facebook Developers website 6 | url = "https://developers.facebook.com/docs/" 7 | response = requests.get(url) 8 | 9 | # Parse the HTML content with BeautifulSoup 10 | soup = BeautifulSoup(response.content, "html.parser") 11 | 12 | # Extract the relevant information (page titles and URLs) 13 | links = [] 14 | for link in soup.find_all("a"): 15 | title = link.get_text() 16 | url = link.get("href") 17 | if url.startswith("/docs/"): 18 | links.append({"title": title, "url": url}) 19 | 20 | # Search for matches based on target keywords 21 | target_keywords = ["API", "SDK"] 22 | matched_links = [] 23 | for link in links: 24 | for keyword in target_keywords: 25 | if re.search(keyword, link["title"], re.IGNORECASE): 26 | matched_links.append(link) 27 | 28 | # Print the matched links 29 | for link in matched_links: 30 | print(f'{link["title"]}: {url}{link["url"]}') -------------------------------------------------------------------------------- /tkinter_ui.py: -------------------------------------------------------------------------------- 1 | import tkinter as tk 2 | import tkinter.ttk as ttk 3 | import pandas as pd 4 | 5 | class App: 6 | def __init__(self, master): 7 | self.input_label = tk.Label(master, text="Enter URL:") 8 | self.input_label.grid(row=0, column=0, pady=5, sticky=tk.W) 9 | 10 | self.input_entry = tk.Entry(master, width=40) 11 | self.input_entry.grid(row=0, column=1, padx=5, pady=5, sticky=tk.W) 12 | 13 | self.search_button = tk.Button(master, text="Search", command=self.search_url) 14 | self.search_button.grid(row=0, column=2, padx=5, pady=5, sticky=tk.W) 15 | 16 | self.function_label = tk.Label(master, text="Function:") 17 | self.function_label.grid(row=1, column=0, pady=5, sticky=tk.W) 18 | 19 | self.function_combobox = ttk.Combobox(master, values=["None", "h1", "h2", "h3", "comments", "other special"]) 20 | self.function_combobox.current(0) 21 | self.function_combobox.grid(row=1, column=1, padx=5, pady=5, sticky=tk.W) 22 | 23 | self.result_label = tk.Label(master, text="Results:") 24 | self.result_label.grid(row=2, column=0, pady=5, sticky=tk.W) 25 | 26 | self.result_listbox = tk.Listbox(master, width=60, height=10) 27 | self.result_listbox.grid(row=3, column=0, columnspan=3, padx=5, pady=5, sticky=tk.W+tk.E) 28 | 29 | self.delete_button = tk.Button(master, text="Delete", state=tk.DISABLED, command=self.delete_url) 30 | self.delete_button.grid(row=4, column=0, padx=5, pady=5, sticky=tk.W) 31 | 32 | self.update_button = tk.Button(master, text="Update", state=tk.DISABLED, command=self.update_url) 33 | self.update_button.grid(row=4, column=1, padx=5, pady=5, sticky=tk.W) 34 | 35 | self.save_button = tk.Button(master, text="Save", state=tk.DISABLED, command=self.save_results) 36 | self.save_button.grid(row=4, column=2, padx=5, pady=5, sticky=tk.W) 37 | 38 | self.result_listbox.bind('', self.enable_buttons) 39 | self.result_listbox.bind('', self.click_url) 40 | 41 | self.results = {} 42 | 43 | def search_url(self): 44 | url = self.input_entry.get() 45 | function = self.function_combobox.get() 46 | 47 | # Do something with the URL and function 48 | # Then add the results to the listbox with its corresponding index stored in the self.results dictionary 49 | index = self.result_listbox.index(tk.END) 50 | self.results[index] = {"url": url, "function": function} 51 | self.result_listbox.insert(tk.END, f"Result {index+1}") 52 | 53 | # Enable save button 54 | self.save_button.config(state=tk.NORMAL) 55 | 56 | def enable_buttons(self, event): 57 | self.delete_button.config(state=tk.NORMAL) 58 | self.update_button.config(state=tk.NORMAL) 59 | 60 | def click_url(self, event): 61 | selected_item = self.result_listbox.curselection() 62 | if selected_item: 63 | index = selected_item[0] 64 | url = self.results[index]["url"] 65 | print(f"Opening URL: {url}") # Replace this line with your code to open the URL in a browser 66 | 67 | def delete_url(self): 68 | selected_item = self.result_listbox.curselection() 69 | if selected_item: 70 | index = selected_item[0] 71 | self.result_listbox.delete(index) 72 | del self.results[index] 73 | for i in range(index, self.result_listbox.size()): 74 | self.results[i] = self.results[i+1] 75 | self.results.pop(self.result_listbox.size(), None) 76 | 77 | def update_url(self): 78 | selected_item = self.result_listbox.curselection() 79 | if selected_item: 80 | index = selected_item[0] 81 | self.result_listbox.itemconfigure(index, bg='yellow') 82 | new_url = "http://newurl.com" # Replace this line with your code to prompt the user for a new URL 83 | self.results[index]["url"] = new_url 84 | self.result_listbox.delete(index) 85 | self.result_listbox.insert(index, f"Result {index+1}") 86 | 87 | def save_results(self): 88 | data = [] 89 | for index in self.results: 90 | data.append([self.results[index]["url"], self.results[index]["function"]]) 91 | df = pd.DataFrame(data, columns=['URL', 'Function']) 92 | df.to_excel('results.xlsx', index=False) 93 | 94 | root = tk.Tk() 95 | root.title("URL Search Results") 96 | 97 | app = App(root) 98 | 99 | root.mainloop() --------------------------------------------------------------------------------