├── exam.py
├── facebook_search_results.xlsx
├── facebooksdk.py
├── goo.py
├── index.py
├── lancer_p.py
├── matched_links.xlsx
├── ness.py
├── open_url.txt
├── pyQt.py
├── scene.qml
├── search_engine.py
└── tkinter_ui.py


/exam.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | from bs4 import BeautifulSoup
 3 | 
 4 | url = "https://www.freelancer.com/projects/unity-3d/Unity-expert-needed-36573434/details"
 5 | keyword = "expert"
 6 | 
 7 | response = requests.get(url)
 8 | page_content = response.content
 9 | 
10 | soup = BeautifulSoup(page_content, 'html.parser')
11 | keyword_instances = soup.find_all(string=lambda string: keyword in string.lower())
12 | 
13 | num_instances = len(keyword_instances)
14 | 
15 | print(f"The keyword '{keyword}' appears {num_instances} times on the page.")


--------------------------------------------------------------------------------
/facebook_search_results.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/groupofstars/Web_scraping/f1c30fa65e69e04310d7950b56dea0c8c8134833/facebook_search_results.xlsx


--------------------------------------------------------------------------------
/facebooksdk.py:
--------------------------------------------------------------------------------
 1 | from elasticsearch import Elasticsearch
 2 | 
 3 | es = Elasticsearch()
 4 | es.indices.create(index='facebook')
 5 | 
 6 | import facebook
 7 | 
 8 | graph = facebook.GraphAPI(access_token='your-access-token', version='11.0')
 9 | 
10 | posts = graph.get_object(id='me', fields='posts')['posts']['data']
11 | 
12 | for post in posts:
13 |   es.index(
14 |     index='facebook',
15 |     doc_type='post',
16 |     body={
17 |       'id': post['id'],
18 |       'message': post.get('message', ''),
19 |       'created_time': post['created_time']
20 |     }
21 |   )
22 |   query = {
23 |     'query': {
24 |         'match': {
25 |         'message': 'keyword'
26 |         }
27 |     }
28 |   }
29 | 
30 | results = es.search(index='facebook', doc_type='post', body=query)['hits']['hits']
31 | 
32 | for hit in results:
33 |   print(hit['_source']['message'])


--------------------------------------------------------------------------------
/goo.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | from bs4 import BeautifulSoup
 3 | import pandas as pd
 4 | 
 5 | # Define the search query and URL
 6 | query = 'Facebook'
 7 | url = f'https://registry.elevategreece.gov.gr?q={query}'
 8 | 
 9 | # Set the headers to simulate a browser request
10 | headers = {
11 |     
12 |     'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36'
13 | }
14 | 
15 | # Send a GET request with the query and headers
16 | response = requests.get(url, headers=headers)
17 | 
18 | # Create a BeautifulSoup object from the response content
19 | soup = BeautifulSoup(response.content, 'html.parser')
20 | 
21 | # Find all search result items in the page
22 | results = soup.find_all('div', class_='g')
23 | 
24 | # Parse the title and link from each search result
25 | items = []
26 | for r in results:
27 |     # Skip any non-search-result DIVs
28 |     if not r.find('a'):
29 |         continue
30 |     
31 |     # Extract the title and link from the search result
32 |     title = r.find('h3').get_text()
33 |     url = r.find('a')['href']
34 |     
35 |     # Only include Facebook pages in the results
36 |     # if url.startswith('https://www.facebook.com/'):
37 |     items.append({'title': title, 'url': url})
38 | 
39 | # Write the results to an Excel file
40 | df = pd.DataFrame(items, columns=['title', 'url'])
41 | df.to_excel('facebook_search_results.xlsx', index=False)


--------------------------------------------------------------------------------
/index.py:
--------------------------------------------------------------------------------
 1 | import wx
 2 | 
 3 | class MyFrame(wx.Frame):
 4 |     def __init__(self):
 5 |         super().__init__(parent=None, title='URL Search')
 6 |         panel = wx.Panel(self)
 7 |         
 8 |         # Create UI elements
 9 |         url_label = wx.StaticText(panel, label='Enter URL:')
10 |         self.url_input = wx.TextCtrl(panel)
11 |         search_button = wx.Button(panel, label='Search')
12 |         self.result_display = wx.TextCtrl(panel, style=wx.TE_MULTILINE|wx.TE_READONLY)
13 |         accept_button = wx.Button(panel, id=wx.ID_OK)
14 |         cancel_button = wx.Button(panel, id=wx.ID_CANCEL)
15 |         
16 |         # Add UI elements to a sizer
17 |         vbox = wx.BoxSizer(wx.VERTICAL)
18 |         hbox1 = wx.BoxSizer(wx.HORIZONTAL)
19 |         hbox2 = wx.BoxSizer(wx.HORIZONTAL)
20 |         hbox3 = wx.BoxSizer(wx.HORIZONTAL)
21 |         hbox1.Add(url_label, flag=wx.RIGHT, border=8)
22 |         hbox1.Add(self.url_input, proportion=1)
23 |         hbox1.Add(search_button)
24 |         hbox2.Add(self.result_display, proportion=1, flag=wx.EXPAND)
25 |         hbox3.Add(accept_button)
26 |         hbox3.Add(cancel_button, flag=wx.LEFT, border=5)
27 |         vbox.Add(hbox1, flag=wx.EXPAND|wx.LEFT|wx.RIGHT|wx.TOP, border=10)
28 |         vbox.Add(hbox2, proportion=1, flag=wx.EXPAND|wx.LEFT|wx.RIGHT|wx.TOP|wx.BOTTOM, border=10)
29 |         vbox.Add(hbox3, flag=wx.ALIGN_RIGHT|wx.RIGHT|wx.BOTTOM, border=10)
30 |         panel.SetSizer(vbox)
31 |         
if __name__ == '__main__':
    # Create the application object first; wx widgets require it to exist.
    application = wx.App()
    main_frame = MyFrame()
    main_frame.Show()
    # Blocks here until the last window is closed.
    application.MainLoop()


--------------------------------------------------------------------------------
/lancer_p.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | from bs4 import BeautifulSoup
 3 | import re
 4 | 
 5 | # Send a GET request to the Facebook Developers website
 6 | 
 7 | # access_token = 'insert_access_token_here'
 8 | base_url = 'https://www.freelancer.com'
 9 | 
10 |     # Create API endpoint by combining base URL with desired parameters
11 | endpoint = f"{base_url}"
12 | try:
13 |     response = requests.get(endpoint)
14 |     # .json()
15 | except requests.exceptions.RequestException as e:
16 |     print('Error:', e)
17 | 
18 | # url = "https://developers.facebook.com/docs/"
19 | # response = requests.get(url)
20 | 
21 | # Parse the HTML content with BeautifulSoup
22 | if response.status_code == 200:
23 |     soup = BeautifulSoup(response.content, 'html.parser')
24 |     # continue parsing HTML content here
25 | else:
26 |     print('Error:', response.status_code)
27 | # Extract the relevant information (page titles and URLs)
28 | links = []
29 | for link in soup.find_all("div"):
30 |     title = link.get_text()
31 |     # print(title)
32 |     url = link.get("href")
33 |     # if url.startswith("/details/"):
34 |     links.append({"title": title, "url": url})
35 | 
36 | # Search for matches based on target keywords
37 | target_keywords = ["PHP", "multiple"]
38 | matched_links = []
39 | for link in links:
40 |     for keyword in target_keywords:
41 |         if re.search(keyword, link["title"], re.IGNORECASE):
42 |             matched_links.append(link)
43 | 
44 | # Print the matched links
45 | for link in  links:
46 |     print(f'{link["title"]}: {url} -- {link["url"]}')
47 | 
48 | 
49 | import pandas as pd
50 | # Rest of the code goes here...
51 | 
52 | # Create a pandas DataFrame with the matched_links list
53 | df = pd.DataFrame(links)
54 | 
55 | # Write the DataFrame to an Excel file
56 | df.to_excel("matched_links.xlsx", index=False)


--------------------------------------------------------------------------------
/matched_links.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/groupofstars/Web_scraping/f1c30fa65e69e04310d7950b56dea0c8c8134833/matched_links.xlsx


--------------------------------------------------------------------------------
/ness.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | from bs4 import BeautifulSoup
 3 | import re
 4 | 
 5 | access_token = 'insert_access_token_here'
 6 | base_url = 'https://graph.facebook.com'
 7 | 
 8 | # Create API endpoint by combining base URL with desired parameters
 9 | endpoint = f"{base_url}/me/feed?fields=message,created_time&access_token={access_token}"
10 | response = requests.get(endpoint).json()
11 | 
12 | # Extract message and created_time fields from each post
13 | post_data = [(post['message'], post['created_time']) for post in response['data']]
14 | 
15 | # Search for matches based on target keywords
16 | target_keywords = ["API", "SDK"]
17 | matched_links = []
18 | for message, _ in post_data:
19 |     # Parse the message content with BeautifulSoup
20 |     soup = BeautifulSoup(message, "html.parser")
21 |     links = soup.find_all("a")
22 | 
23 |     # Extract the title and URL from each link
24 |     for link in links:
25 |         title = link.get_text()
26 |         url = link.get("href")
27 |         if url and any(keyword.lower() in title.lower() for keyword in target_keywords):
28 |             matched_links.append({"title": title, "url": url})
29 | 
30 | # Print the matched links
31 | for link in matched_links:
32 |     print(f'{link["title"]}: {link["url"]}')


--------------------------------------------------------------------------------
/open_url.txt:
--------------------------------------------------------------------------------
 1 | import tkinter as tk
 2 | import webbrowser
 3 | 
 4 | root = tk.Tk()
 5 | root.title("Find Key Words - Find All My Likes, Comments")
 6 | root.geometry("800x600")
 7 | root.resizable(True, True)
 8 | 
 9 | url_input = tk.StringVar(value="developers.facebook.com/docs/")
10 | result_list = tk.Listbox(root)
11 | 
12 | # Set the initial search results
13 | results = ["jhom", "knenti", "df"]
14 | for result in results:
15 |     result_list.insert(tk.END, result)
16 | 
17 | def search():
18 |     url = url_input.get()
19 |     # perform search based on url input
20 |     # display results in result_list
21 | 
22 | def accept():
23 |     # perform action when Accept button is clicked
24 |     pass
25 | 
26 | def cancel():
27 |     # perform action when Cancel button is clicked
28 |     pass
29 | 
30 | def open_link(event):
31 |     # open link in default web browser when double clicked
32 |     index = result_list.curselection()[0]
33 |     url = result_list.get(index)
34 |     webbrowser.open_new(url)
35 | 
36 | def remove_item(index):
37 |     result_list.delete(index)
38 | 
39 | def update_item(index):
40 |     result_list.itemconfigure(index, bg="yellow")
41 | 
42 | def comment_item(index):
43 |     # perform action to add a comment to the selected item
44 |     pass
45 | 
46 | url_label = tk.Label(root, text="URL:")
47 | url_entry = tk.Entry(root, textvariable=url_input, width=70)
48 | 
49 | search_button = tk.Button(root, text="Search", command=search,width=20,height=2)
50 | accept_button = tk.Button(root, text="Accept", command=accept,width=20,height=2)
51 | cancel_button = tk.Button(root, text="Cancel", command=cancel,width=20,height=2)
52 | 
53 | url_label.grid(row=0, column=0, padx=10, pady=10)
54 | url_entry.grid(row=0, column=1, padx=10, pady=10, sticky="w")
55 | 
56 | result_list.grid(row=1, column=0, columnspan=3, padx=10, pady=10, sticky="nesw")
57 | 
58 | remove_buttons = []
59 | update_buttons = []
60 | comment_buttons = []
61 | 
62 | for i in range(len(results)):
63 |     remove_button = tk.Button(root, text="Remove", command=lambda index=i: remove_item(index))
64 |     update_button = tk.Button(root, text="Update", command=lambda index=i: update_item(index))
65 |     comment_button = tk.Button(root, text="Comment", command=lambda index=i: comment_item(index))
66 | 
67 |     remove_buttons.append(remove_button)
68 |     update_buttons.append(update_button)
69 |     comment_buttons.append(comment_button)
70 | 
71 |     result_list.itemconfig(i, fg="blue")
72 |     result_list.itemconfig(i, font="-weight bold")
73 | 
74 |     remove_button.grid(row=i+2, column=0, padx=10, pady=5)
75 |     update_button.grid(row=i+2, column=1, padx=10, pady=5)
76 |     comment_button.grid(row=i+2, column=2, padx=10, pady=5)
77 | 
78 | search_button.grid(row=len(results)+2, column=0, padx=10, pady=10)
79 | accept_button.grid(row=len(results)+2, column=1, padx=10, pady=10)
80 | cancel_button.grid(row=len(results)+2, column=2, padx=10, pady=10)
81 | 
82 | root.mainloop()


--------------------------------------------------------------------------------
/pyQt.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | from PyQt5.QtWidgets import QApplication, QWidget, QLabel, QLineEdit, QPushButton, QVBoxLayout
 3 | 
 4 | class MyWindow(QWidget):
 5 |     def __init__(self):
 6 |         super().__init__()
 7 |         
 8 |         # Set window title and geometry
 9 |         self.setWindowTitle("Web Search")
10 |         self.setGeometry(100, 100, 400, 300)
11 |         
12 |         # Create UI widgets
13 |         url_label = QLabel("Enter URL:")
14 |         self.url_input = QLineEdit()
15 |         self.search_button = QPushButton("Search")
16 |         self.cancel_button = QPushButton("Cancel")
17 |         self.search_result_label = QLabel("Search Results")
18 |         
19 |         # Create layout and add widgets
20 |         layout = QVBoxLayout()
21 |         layout.addWidget(url_label)
22 |         layout.addWidget(self.url_input)
23 |         layout.addWidget(self.search_button)
24 |         layout.addWidget(self.cancel_button)
25 |         layout.addWidget(self.search_result_label)
26 |         
27 |         # Set the layout for the window
28 |         self.setLayout(layout)
29 | 
if __name__ == "__main__":
    # QApplication must exist before any widget is created.
    application = QApplication(sys.argv)
    main_window = MyWindow()
    main_window.show()
    # Run the event loop and pass its return code on as the process exit status.
    exit_code = application.exec_()
    sys.exit(exit_code)


--------------------------------------------------------------------------------
/scene.qml:
--------------------------------------------------------------------------------
 1 | import QtQuick 2.0 as CoreItems
 2 | 
 3 | CoreItems.Rectangle {
 4 |     width: 100; height: 100
 5 | 
 6 |     CoreItems.Text { text: "Hello, world!" }
 7 | 
 8 |     // WRONG! No namespace prefix - the Text type won't be found
 9 |     Text { text: "Hello, world!" }
10 | }


--------------------------------------------------------------------------------
/search_engine.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | from bs4 import BeautifulSoup
 3 | import re
 4 | 
 5 | # Send a GET request to the Facebook Developers website
 6 | url = "https://developers.facebook.com/docs/"
 7 | response = requests.get(url)
 8 | 
 9 | # Parse the HTML content with BeautifulSoup
10 | soup = BeautifulSoup(response.content, "html.parser")
11 | 
12 | # Extract the relevant information (page titles and URLs)
13 | links = []
14 | for link in soup.find_all("a"):
15 |     title = link.get_text()
16 |     url = link.get("href")
17 |     if url.startswith("/docs/"):
18 |         links.append({"title": title, "url": url})
19 | 
20 | # Search for matches based on target keywords
21 | target_keywords = ["API", "SDK"]
22 | matched_links = []
23 | for link in links:
24 |     for keyword in target_keywords:
25 |         if re.search(keyword, link["title"], re.IGNORECASE):
26 |             matched_links.append(link)
27 | 
28 | # Print the matched links
29 | for link in matched_links:
30 |     print(f'{link["title"]}: {url}{link["url"]}')


--------------------------------------------------------------------------------
/tkinter_ui.py:
--------------------------------------------------------------------------------
 1 | import tkinter as tk
 2 | import tkinter.ttk as ttk
 3 | import pandas as pd
 4 | 
 5 | class App:
 6 |     def __init__(self, master):
 7 |         self.input_label = tk.Label(master, text="Enter URL:")
 8 |         self.input_label.grid(row=0, column=0, pady=5, sticky=tk.W)
 9 | 
10 |         self.input_entry = tk.Entry(master, width=40)
11 |         self.input_entry.grid(row=0, column=1, padx=5, pady=5, sticky=tk.W)
12 | 
13 |         self.search_button = tk.Button(master, text="Search", command=self.search_url)
14 |         self.search_button.grid(row=0, column=2, padx=5, pady=5, sticky=tk.W)
15 | 
16 |         self.function_label = tk.Label(master, text="Function:")
17 |         self.function_label.grid(row=1, column=0, pady=5, sticky=tk.W)
18 | 
19 |         self.function_combobox = ttk.Combobox(master, values=["None", "h1", "h2", "h3", "comments", "other special"])
20 |         self.function_combobox.current(0)
21 |         self.function_combobox.grid(row=1, column=1, padx=5, pady=5, sticky=tk.W)
22 | 
23 |         self.result_label = tk.Label(master, text="Results:")
24 |         self.result_label.grid(row=2, column=0, pady=5, sticky=tk.W)
25 | 
26 |         self.result_listbox = tk.Listbox(master, width=60, height=10)
27 |         self.result_listbox.grid(row=3, column=0, columnspan=3, padx=5, pady=5, sticky=tk.W+tk.E)
28 | 
29 |         self.delete_button = tk.Button(master, text="Delete", state=tk.DISABLED, command=self.delete_url)
30 |         self.delete_button.grid(row=4, column=0, padx=5, pady=5, sticky=tk.W)
31 | 
32 |         self.update_button = tk.Button(master, text="Update", state=tk.DISABLED, command=self.update_url)
33 |         self.update_button.grid(row=4, column=1, padx=5, pady=5, sticky=tk.W)
34 | 
35 |         self.save_button = tk.Button(master, text="Save", state=tk.DISABLED, command=self.save_results)
36 |         self.save_button.grid(row=4, column=2, padx=5, pady=5, sticky=tk.W)
37 | 
38 |         self.result_listbox.bind('<Double-Button-1>', self.enable_buttons)
39 |         self.result_listbox.bind('<ButtonRelease-1>', self.click_url)
40 | 
41 |         self.results = {}
42 | 
43 |     def search_url(self):
44 |         url = self.input_entry.get()
45 |         function = self.function_combobox.get()
46 | 
47 |         # Do something with the URL and function
48 |         # Then add the results to the listbox with its corresponding index stored in the self.results dictionary
49 |         index = self.result_listbox.index(tk.END)
50 |         self.results[index] = {"url": url, "function": function}
51 |         self.result_listbox.insert(tk.END, f"Result {index+1}")
52 | 
53 |         # Enable save button
54 |         self.save_button.config(state=tk.NORMAL)
55 | 
56 |     def enable_buttons(self, event):
57 |         self.delete_button.config(state=tk.NORMAL)
58 |         self.update_button.config(state=tk.NORMAL)
59 | 
60 |     def click_url(self, event):
61 |         selected_item = self.result_listbox.curselection()
62 |         if selected_item:
63 |             index = selected_item[0]
64 |             url = self.results[index]["url"]
65 |             print(f"Opening URL: {url}") # Replace this line with your code to open the URL in a browser
66 | 
67 |     def delete_url(self):
68 |         selected_item = self.result_listbox.curselection()
69 |         if selected_item:
70 |             index = selected_item[0]
71 |             self.result_listbox.delete(index)
72 |             del self.results[index]
73 |             for i in range(index, self.result_listbox.size()):
74 |                 self.results[i] = self.results[i+1]
75 |             self.results.pop(self.result_listbox.size(), None)
76 | 
77 |     def update_url(self):
78 |         selected_item = self.result_listbox.curselection()
79 |         if selected_item:
80 |             index = selected_item[0]
81 |             self.result_listbox.itemconfigure(index, bg='yellow')
82 |             new_url = "http://newurl.com" # Replace this line with your code to prompt the user for a new URL
83 |             self.results[index]["url"] = new_url
84 |             self.result_listbox.delete(index)
85 |             self.result_listbox.insert(index, f"Result {index+1}")
86 | 
87 |     def save_results(self):
88 |         data = []
89 |         for index in self.results:
90 |             data.append([self.results[index]["url"], self.results[index]["function"]])
91 |         df = pd.DataFrame(data, columns=['URL', 'Function'])
92 |         df.to_excel('results.xlsx', index=False)
93 | 
# Create the main window, populate it with the App widgets and hand control to
# the Tk event loop (returns when the window is closed).
root = tk.Tk()
root.wm_title("URL Search Results")
app = App(root)
root.mainloop()


--------------------------------------------------------------------------------