├── .gitignore
├── dataCollection
│   ├── last_toot_id.txt
│   ├── __pycache__
│   │   └── loadData.cpython-39.pyc
│   └── loadData.py
├── requirements.txt
├── images
│   ├── gantt.PNG
│   ├── airflow2.PNG
│   └── workflow.png
├── GDPR Compliance.pdf
├── Commands.txt
├── mapReduce
│   └── python
│       ├── mapper.py
│       └── reducer.py
├── airFlowDAG
│   └── mastadon_dag.py
├── README.md
├── loadingIntoHbase
│   └── insertion.py
└── analysis.ipynb
/.gitignore:
--------------------------------------------------------------------------------
1 | # Environment file
2 | .env
--------------------------------------------------------------------------------
/dataCollection/last_toot_id.txt:
--------------------------------------------------------------------------------
1 | 111291292042375273
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | Mastodon.py
2 | python-dotenv
3 | hdfs
4 | pandas
5 | happybase
--------------------------------------------------------------------------------
/images/gantt.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SAAD-BEN/Mastadon_data_analysis_Airflow_Hadoop_Hbase/HEAD/images/gantt.PNG
--------------------------------------------------------------------------------
/GDPR Compliance.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SAAD-BEN/Mastadon_data_analysis_Airflow_Hadoop_Hbase/HEAD/GDPR Compliance.pdf
--------------------------------------------------------------------------------
/images/airflow2.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SAAD-BEN/Mastadon_data_analysis_Airflow_Hadoop_Hbase/HEAD/images/airflow2.PNG
--------------------------------------------------------------------------------
/images/workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SAAD-BEN/Mastadon_data_analysis_Airflow_Hadoop_Hbase/HEAD/images/workflow.png
--------------------------------------------------------------------------------
/dataCollection/__pycache__/loadData.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SAAD-BEN/Mastadon_data_analysis_Airflow_Hadoop_Hbase/HEAD/dataCollection/__pycache__/loadData.cpython-39.pyc
--------------------------------------------------------------------------------
/Commands.txt:
--------------------------------------------------------------------------------
1 | sudo pip install -r /home/project/Mastadon_data_analysis_Airflow_Hadoop_Hbase/requirements.txt
2 | sudo python3 /home/project/Mastadon_data_analysis_Airflow_Hadoop_Hbase/dataCollection/loadData.py
3 |
4 | hadoop fs -rm -r /raw/
5 |
6 | # Job run
7 | hadoop jar /usr/local/hadoop/share/hadoop/tools/lib/hadoop-streaming-3.3.6.jar -mapper /home/project/Mastadon_data_analysis_Airflow_Hadoop_Hbase/mapReduce/python/mapper.py -reducer /home/project/Mastadon_data_analysis_Airflow_Hadoop_Hbase/mapReduce/python/reducer.py -input /raw/2023-10-21/19-0-posts.json -output /test/anass32/
8 | hadoop fs -cat /test/anass32/part-00000
9 |
10 | #hbase
11 | /usr/local/Hbase/bin/start-hbase.sh
12 | /usr/local/Hbase/bin/stop-hbase.sh
13 |
14 | /usr/local/Hbase/bin/hbase-daemon.sh start thrift
15 | /usr/local/Hbase/bin/hbase-daemon.sh stop thrift
16 |
17 | sudo /usr/local/Hbase/bin/hbase shell
18 |
19 | # Airflow
20 | /AirFlow/airflow-environment/airflow scheduler
21 | /AirFlow/airflow-environment/airflow webserver -p 8080
22 |
23 |
24 | mapred streaming -files /home/project/Mastadon_data_analysis_Airflow_Hadoop_Hbase/mapReduce/python/mapper.py,/home/project/Mastadon_data_analysis_Airflow_Hadoop_Hbase/mapReduce/python/reducer.py -mapper mapper.py -reducer reducer.py -input /raw/2023-10-21/19-0-posts.json
--------------------------------------------------------------------------------
/mapReduce/python/mapper.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import json
3 | import sys
4 | import re
5 | from urllib.parse import urlparse
6 |
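# Reads one JSON-encoded toot per line from stdin and emits tab-separated
# "key\tvalue" pairs (user:, croissance:, language:, emoji:, website:, tag:
# and the fixed toot_with_media key) for the streaming reducer to aggregate.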
7 | def process_data(input_data):
8 | pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
9 |
10 | for line in input_data:
11 | try:
12 | data = json.loads(line)
13 | toot_with_media_dict = {}
14 | toot_with_media_id = 'toot_with_media'
15 |
16 | created_at = data["created_at"].split(' ')[1].split('+')[0]
17 | account = data.get("account")
18 | media_attachments = data.get("media_attachments")
19 | emojis = data.get("emojis") if data.get("emojis") else []
20 | websites = data.get("content") if data.get("content") else []
21 | tags = data.get("tags") if data.get("tags") else []
22 |
23 | if account:
24 | user_id = 'user:' + str(account.get('id'))
25 | followers = int(account.get('followers_count', 0))
26 | reblogs_count = data.get('reblogs_count', 0)
27 | favourites_count = data.get('favourites_count', 0)
28 |
29 | engagement_rate = (reblogs_count + favourites_count) / followers if followers > 0 else 0
30 |
31 | croissance_id = "croissance:" + account.get('created_at').split('-')[0] + '-' + account.get('created_at').split('-')[1]
32 |
33 | user_data = {
34 | "date": created_at,
35 | "followers": followers,
36 | "engagement_rate": engagement_rate
37 | }
38 |
39 | croissance_data = {"value": 1, "user_id": user_id}
40 |
41 | # Emit key-value pairs for the reducer
42 | print(f"{user_id}\t{user_data}")
43 | print(f"{croissance_id}\t{croissance_data}")
44 |
45 | if media_attachments:
46 | toot_with_media_dict["value"] = 1
47 | print(f"{toot_with_media_id}\t{toot_with_media_dict}")
48 |
49 | language_id = "language:" + (data.get('language') or "unknown")  # guard: language can be None
50 | language_data = {"value": 1}
51 | print(f"{language_id}\t{language_data}")
52 |
53 | if emojis != []:
54 | for emoji in emojis:
55 | emoji_id = "emoji:" + emoji.get('shortcode')
56 | emoji_data = {"value": 1}
57 | print(f"{emoji_id}\t{emoji_data}")
58 |
59 | if websites != []:
60 | urls = re.search(pattern, websites)
61 | if urls:
62 | website_id = "website:" + urlparse(urls.group(0)).netloc
63 | website_data = {"value": 1}
64 | print(f"{website_id}\t{website_data}")
65 |
66 | if tags != []:
67 | for tag in tags:
68 | tag_id = "tag:" + tag.get('name')
69 | tag_data = {"value": 1}
70 | print(f"{tag_id}\t{tag_data}")
71 |
72 | except Exception as e:
73 | # Log exceptions to standard error
74 | print(f"Error: {str(e)}", file=sys.stderr)
75 |
76 | # Read toots from standard input when run as the Hadoop streaming mapper
77 | if __name__ == "__main__":
78 | input_data = sys.stdin
79 | process_data(input_data)
80 |
--------------------------------------------------------------------------------
/airFlowDAG/mastadon_dag.py:
--------------------------------------------------------------------------------
1 | from airflow import DAG
2 | from airflow.operators.python import PythonOperator
3 | from airflow.models import Variable
4 | from datetime import datetime, timedelta
5 | from airflow.utils.email import send_email
6 | import subprocess
7 | import sys
8 | sys.path.insert(0, '/home/project/Mastadon_data_analysis_Airflow_Hadoop_Hbase/dataCollection')
9 | sys.path.insert(0, '/home/project/Mastadon_data_analysis_Airflow_Hadoop_Hbase/loadingIntoHbase')
10 | from loadData import retrieve_and_save_mastodon_data
11 | from insertion import insert_data_into_hbase
12 |
13 | def success_callback(context):
14 | send_email(
15 | to=["saad.bentaleb08@gmail.com"],
16 | subject="Mastodon Data Pipeline Succeeded",
17 | html_content="The Mastodon Data Pipeline has succeeded."
18 | )
19 |
20 | def failure_callback(context):
21 | send_email(
22 | to=["saad.bentaleb08@gmail.com"],
23 | subject="Mastodon Data Pipeline Failed",
24 | html_content="The Mastodon Data Pipeline has failed."
25 | )
26 |
27 | default_args = {
28 | 'owner': 'admin',
29 | 'start_date': datetime(2023, 10, 23),
30 | 'retries': 0, # Set to 0 to disable retries for the DAG
31 | }
32 |
33 | with DAG('mastodon_data_pipeline1', default_args=default_args, schedule_interval='@daily', catchup=False, on_success_callback=success_callback, on_failure_callback=failure_callback) as dag:
34 | def set_data_path(**kwargs):
35 | data_path = retrieve_and_save_mastodon_data() # Run the data collection function
36 | processed_path = '/processed/' + datetime.now().strftime('%Y-%m-%d/%H-%M') + '/'
37 | Variable.set("data_path", data_path)
38 | Variable.set("processed_path", processed_path)
39 |
40 | # Create PythonOperator tasks
41 | retrieve_and_save_mastodon_data_task = PythonOperator(
42 | task_id='retrieve_and_save_mastodon_data',
43 | provide_context=True,
44 | python_callable=set_data_path,
45 | dag=dag,
46 | )
47 |
48 | def run_map_reduce(**kwargs):
49 | data_path = Variable.get("data_path") # Retrieve the data path from the variable
50 | output_path = Variable.get("processed_path") # Retrieve the processed path from the variable
51 | # Use subprocess to run Hadoop MapReduce job with the provided data path
52 | hadoop_command = f"hadoop jar /usr/local/hadoop/share/hadoop/tools/lib/hadoop-streaming-3.3.6.jar " \
53 | f"-mapper /home/project/Mastadon_data_analysis_Airflow_Hadoop_Hbase/mapReduce/python/mapper.py " \
54 | f"-reducer /home/project/Mastadon_data_analysis_Airflow_Hadoop_Hbase/mapReduce/python/reducer.py " \
55 | f"-input {data_path} " \
56 | f"-output {output_path}"
57 | subprocess.run(hadoop_command, shell=True, check=True)  # raise if the streaming job fails so Airflow marks the task as failed
58 |
59 | run_map_reduce_task = PythonOperator(
60 | task_id='run_map_reduce',
61 | provide_context=True,
62 | python_callable=run_map_reduce,
63 | dag=dag,
64 | )
65 |
66 | def run_hbase_insertion(**kwargs):
67 | processed_path = Variable.get("processed_path")
68 | insert_data_into_hbase(processed_path)
69 |
70 | run_hbase_insertion_task = PythonOperator(
71 | task_id='run_hbase_insertion',
72 | provide_context=True,
73 | python_callable=run_hbase_insertion,
74 | dag=dag,
75 | )
76 |
77 | retrieve_and_save_mastodon_data_task >> run_map_reduce_task >> run_hbase_insertion_task
78 |
79 | if __name__ == "__main__":
80 | dag.cli()
81 |
82 |
--------------------------------------------------------------------------------
/dataCollection/loadData.py:
--------------------------------------------------------------------------------
1 | from mastodon import Mastodon
2 | from dotenv import load_dotenv
3 | import os
4 | from hdfs import InsecureClient
5 | import datetime
6 | import time
7 | import json
8 |
9 | load_dotenv()
10 |
11 | def retrieve_and_save_mastodon_data():
12 | # Connect to the mastodon API
13 | mastodon = Mastodon(
14 | client_id=os.getenv('Client_key'),
15 | client_secret=os.getenv('Client_secret'),
16 | access_token=os.getenv('Access_token'),
17 | api_base_url="https://mastodon.social"
18 | )
19 |
20 | # Initialize an HDFS client
21 | hdfs_client = InsecureClient('http://localhost:9870', user='hadoop')
22 |
23 | # Get the current date and time
24 | now = datetime.datetime.now()
25 | directory_path = '/raw/' + str(now.year) + '-' + str(now.month) + '-' + str(now.day)
26 |
27 | # Check if the directory already exists
28 | if not hdfs_client.status(directory_path, strict=False):
29 | hdfs_client.makedirs(directory_path)
30 |
31 | # Define the HDFS path where you want to save the data
32 | hdfs_path = directory_path + '/' + str(now.hour) + '-' + str(now.minute)
33 |
34 | # Retrieve the last toot ID from a local file or start with None
35 | try:
36 | with open('/home/project/Mastadon_data_analysis_Airflow_Hadoop_Hbase/dataCollection/last_toot_id.txt', 'r', encoding='utf-8') as reader:
37 | last_toot_id = reader.read().strip()
38 | except FileNotFoundError:
39 | last_toot_id = None
40 |
41 | public_posts = []
42 |
43 | # Specify the duration of the data collection window in seconds
44 | duration = 20
45 |
46 | # Get the current time
47 | start_time = time.time()
48 |
49 | while True:
50 | # Stop once the collection window has elapsed
51 | if time.time() - start_time >= duration:
52 | break
53 | # Retrieve public posts newer than the last seen toot
54 | new = mastodon.timeline_public(limit=40, since_id=last_toot_id)
55 |
56 | # Append the current run's public posts to the list
57 | public_posts.extend(new)
58 | print(f'Number of posts retrieved: {str(len(public_posts))}', end='\r')
59 |
60 | # Update the last_toot_id
61 | if public_posts:
62 | latest_toot = public_posts[0]
63 | last_toot_id = str(latest_toot['id'])
64 |
65 | class CustomJSONEncoder(json.JSONEncoder):
66 | def default(self, o):
67 | if isinstance(o, datetime.datetime):
68 | # Convert datetime to a string representation
69 | return o.strftime('%Y-%m-%d %H:%M:%S %z')
70 | elif hasattr(o, '__dict__'):
71 | # Handle other objects with __dict__ attribute
72 | return o.__dict__
73 | return super().default(o)
74 |
75 | formatted_data = []
76 | for obj in public_posts:
77 | formatted_obj = json.dumps(obj, separators=(',', ':'), default=str, cls=CustomJSONEncoder)
78 | formatted_data.append(formatted_obj)
79 |
80 | # Convert the formatted data to a string
81 | formatted_data_str = '\n'.join(formatted_data)
82 |
83 | # Save the preprocessed data to HDFS
84 | with hdfs_client.write(hdfs_path + '-posts.json', encoding='utf-8') as writer:
85 | writer.write(formatted_data_str)
86 |
87 | print('Data saved successfully to HDFS: ' + hdfs_path + '-posts.json')
88 |
89 | # After retrieving the public posts, you can save the latest toot_id to a local file.
90 | if public_posts:
91 | latest_toot = public_posts[0] # Assuming the latest toot is at the first position
92 | latest_toot_id = latest_toot['id']
93 |
94 | # Convert latest_toot_id to a string
95 | latest_toot_id_str = str(latest_toot_id)
96 |
97 | # Define the path to the local file
98 | local_file_path = '/home/project/Mastadon_data_analysis_Airflow_Hadoop_Hbase/dataCollection/last_toot_id.txt'
99 |
100 | # Update or create the local file with the latest_toot_id
101 | with open(local_file_path, 'w', encoding='utf-8') as writer:
102 | writer.write(latest_toot_id_str)
103 |
104 |
105 | return hdfs_path + '-posts.json'
106 |
107 | if __name__ == '__main__':
108 | retrieve_and_save_mastodon_data()
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Mastodon Data Pipeline README
2 |
3 | This README provides an overview of a data pipeline project that consists of four main phases: data extraction from the Mastodon API, data processing with Hadoop MapReduce (Hadoop Streaming with Python mapper and reducer scripts), data storage in HBase, and orchestration with Apache Airflow for automated daily execution. The project collects and analyzes data from Mastodon, a federated social media platform.
4 |
5 | ## Folder structure
6 | ```
7 | Repository Root
8 | ├── airFlowDag
9 | │ ├── mastadon_dag.py
10 | │
11 | ├── dataCollection
12 | │ ├── last_toot_id.txt
13 | │ ├── loadData.py
14 | │ ├── public_posts.json
15 | │ ├── tests.ipynb
16 | │
17 | ├── images
18 | │ ├── airflow2.PNG
19 | │ ├── gantt.PNG
20 | │ ├── workflow.png
21 | │
22 | ├── loadingIntoHbase
23 | │ ├── insertion.py
24 | │
25 | ├── mapReduce
26 | │ ├── Python
27 | │ │ ├── mapper.py
28 | │ │ ├── reducer.py
29 | │
34 | ├── README.md
35 | ├── analysis.ipynb
36 | ├── Commands.txt
37 | ├── GDPR Compliance.pdf
38 | ├── requirements.txt
39 |
40 | ```
41 |
42 | ## Project Overview
43 |
44 |
45 | As a Data Developer, your role in this project is to set up an automated pipeline that addresses the challenges illustrated in the workflow below:
46 | ---
47 | 
48 | ---
49 |
50 | ### Phase 1: Data Collection
51 |
52 | - **Data Extraction:** Utilize the Mastodon API with your access tokens to gather raw data from the Mastodon platform.
53 |
54 | - **Raw Data Storage:** Store the raw data in a distributed file system such as HDFS for scalability.
55 |
56 | - **HDFS Data Lake Modeling:** Define the data lake schema for HDFS.
57 |
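A minimal sketch of this collection step, assuming the same `Mastodon.py` and `hdfs` clients used in `dataCollection/loadData.py`, with credentials read from the `.env` file and an illustrative raw-layer path layout:

```
# Hedged sketch of Phase 1: pull recent public toots and land them in HDFS as JSON lines.
import datetime
import json
import os

from dotenv import load_dotenv
from hdfs import InsecureClient
from mastodon import Mastodon

load_dotenv()
mastodon = Mastodon(access_token=os.getenv('Access_token'), api_base_url="https://mastodon.social")
hdfs_client = InsecureClient('http://localhost:9870', user='hadoop')

# Raw zone layout: /raw/<date>/<hour>-<minute>-posts.json (one JSON object per line)
now = datetime.datetime.now()
hdfs_path = f"/raw/{now.year}-{now.month}-{now.day}/{now.hour}-{now.minute}-posts.json"

toots = mastodon.timeline_public(limit=40)
with hdfs_client.write(hdfs_path, encoding='utf-8') as writer:
    writer.write('\n'.join(json.dumps(t, default=str) for t in toots))
```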
58 | ### Phase 2: Data Processing with MapReduce
59 |
60 | - **Mapper:** Process the input data and generate key-value pairs based on the desired metrics (e.g., user followers, engagement rate, URLs, emojis, etc.).
61 |
62 | - **Reducer:** Aggregate the key-value pairs produced by the mapper.
63 |
64 | - **MapReduce Job Execution:** Use Hadoop streaming API to execute the MapReduce task, providing the mapper and reducer scripts as inputs.
65 |
66 | - **Monitoring:** Keep track of job progress through the Hadoop web UI.
67 |
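The job itself is submitted with the Hadoop Streaming jar (see `Commands.txt` for the exact `hadoop jar ... hadoop-streaming-3.3.6.jar` invocation). Before submitting, the mapper can be smoke-tested locally; a small sketch, assuming it is run from `mapReduce/python/` and using a hand-made, illustrative toot:

```
# Pipe one sample toot (fields mirror what mapper.py reads) through the mapper
# and print the emitted tab-separated key/value pairs.
import json

from mapper import process_data  # mapReduce/python/mapper.py

sample_toot = {
    "created_at": "2023-10-21 19:00:00+00:00",
    "account": {"id": 1007156, "followers_count": 120, "created_at": "2021-03-15"},
    "reblogs_count": 3,
    "favourites_count": 9,
    "media_attachments": [],
    "emojis": [],
    "content": "<p>https://example.org/post</p>",
    "tags": [{"name": "michigan"}],
    "language": "en",
}

# Prints lines such as "user:1007156\t{...}" and "tag:michigan\t{'value': 1}"
process_data([json.dumps(sample_toot)])
```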
68 | ### Phase 3: Data Storage in HBase
69 |
70 | - **HBase Schema Design:** Design the HBase tables schema based on the information you want to extract.
71 |
72 | | **Table Name** | **Description** | **Schema** |
73 | |--------------------------|-----------------------------------------------|-------------------------------------------|
74 | | `language_table` | Stores language counts. | Row Key: language code (e.g., 'en', 'es') <br> Columns: `data:count` (number of toots per language) <br> Timestamp: record timestamp |
75 | | `user_table` | Stores user information, engagement rates, and followers. | Row Key: user ID (e.g., '1007156', '10106') <br> Columns: `data:engagement_rate`, `data:followers` <br> Timestamp: record timestamp |
76 | | `croissance_table` | Records user creation counts by month. | Row Key: month (e.g., '2007-07', '2008-01') <br> Columns: `data:count` (number of user creations) <br> Timestamp: record timestamp |
77 | | `url_table` | Tracks mentions of external websites. | Row Key: website domain (e.g., 'a.gup.pe', 'abcgazetesi.com') <br> Columns: `data:count` (number of mentions) <br> Timestamp: record timestamp |
78 | | `toot_with_media_table` | Records the count of toots with media content. | Row Key: fixed key 'toot_with_media' <br> Columns: `data:count` (number of toots with media) <br> Timestamp: record timestamp |
79 | | `emoji_table` | Stores counts of used custom emojis. | Row Key: emoji shortcode <br> Columns: `data:count` (number of uses) <br> Timestamp: record timestamp |
80 | | `tag_table` | Stores counts of used tags. | Row Key: tag name (e.g., '10yrsago', '17thcentury') <br> Columns: `data:count` (number of times the tag is used) <br> Timestamp: record timestamp |
81 |
82 | - **Best Practices:** Follow best practices for row key design, column family design, compression, bloom filters, batch inserts, etc.
83 |
84 | - **Table Creation:** Create the necessary tables in HBase.
85 |
86 | ```
87 | # HBase connection settings
88 | hbase_host = 'localhost' # Replace with your HBase host
89 | hbase_port = 9090 # Default HBase port
90 | # Connect to HBase
91 | connection = happybase.Connection(host=hbase_host, port=hbase_port)
92 |
93 | tables_list = ["user_table", "croissance_table", "language_table", "toot_with_media_table", "emoji_table", "url_table", "tag_table"]
94 | tables = connection.tables()
95 | # remove the b' and ' from the table names
96 | tables = [table_name.decode() for table_name in tables]
97 |
98 | # Create the tables if they do not exist
99 | for table_name in tables_list:
100 | if table_name not in tables:
101 | connection.create_table(table_name, {'data': dict()})
102 |
103 | connection.close()
104 | ```
105 |
106 | - **Data Insertion:** Populate the output from the reducer into HBase tables using a Python HBase client or your preferred method.
107 |
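As a rough sketch of what `loadingIntoHbase/insertion.py` does for each reducer record, a single line such as `tag:michigan\t{'count': 56}` ends up in HBase roughly like this (host and port are the local Thrift defaults used elsewhere in the project):

```
# Write one aggregated count into the 'data' column family of tag_table.
import happybase

connection = happybase.Connection(host='localhost', port=9090)  # HBase Thrift server
tag_table = connection.table('tag_table')

row_key, count = 'michigan', 56  # parsed from a reducer output line
tag_table.put(row_key.encode(), {b'data:count': str(count).encode()})
connection.close()
```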
108 | ### Phase 4: Orchestration with Apache Airflow
109 |
110 | - **Workflow Orchestration:** Define a Directed Acyclic Graph (DAG) to orchestrate the entire workflow.
111 |
112 | - **Task Creation:** Create tasks for running the MapReduce job and storing results in HBase.
113 |
114 | - **Monitoring and Error Handling:** Monitor progress and manage errors or failures.
115 |
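A condensed skeleton of that orchestration is shown below; the complete DAG, including the Airflow Variables used to pass HDFS paths between tasks and the e-mail callbacks, lives in `airFlowDAG/mastadon_dag.py` (the input/output paths here are illustrative):

```
# Daily pipeline: collect toots -> Hadoop Streaming MapReduce -> insert into HBase.
import subprocess
import sys
from datetime import datetime

from airflow import DAG
from airflow.operators.python import PythonOperator

sys.path.insert(0, '/home/project/Mastadon_data_analysis_Airflow_Hadoop_Hbase/dataCollection')
sys.path.insert(0, '/home/project/Mastadon_data_analysis_Airflow_Hadoop_Hbase/loadingIntoHbase')
from loadData import retrieve_and_save_mastodon_data
from insertion import insert_data_into_hbase

RAW = '/raw/2023-10-21/19-0-posts.json'        # illustrative; the real DAG uses Airflow Variables
PROCESSED = '/processed/2023-10-21/19-0/'

def run_map_reduce(**kwargs):
    subprocess.run(
        "hadoop jar /usr/local/hadoop/share/hadoop/tools/lib/hadoop-streaming-3.3.6.jar "
        "-mapper /home/project/Mastadon_data_analysis_Airflow_Hadoop_Hbase/mapReduce/python/mapper.py "
        "-reducer /home/project/Mastadon_data_analysis_Airflow_Hadoop_Hbase/mapReduce/python/reducer.py "
        f"-input {RAW} -output {PROCESSED}",
        shell=True, check=True,
    )

with DAG('mastodon_data_pipeline1', start_date=datetime(2023, 10, 23),
         schedule_interval='@daily', catchup=False) as dag:
    collect = PythonOperator(task_id='collect_toots', python_callable=retrieve_and_save_mastodon_data)
    map_reduce = PythonOperator(task_id='run_map_reduce', python_callable=run_map_reduce)
    insert = PythonOperator(task_id='run_hbase_insertion',
                            python_callable=lambda **kw: insert_data_into_hbase(PROCESSED))

    collect >> map_reduce >> insert
```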
116 | ### Phase 5: Data Analysis
117 |
118 | After successfully completing the previous phases, you can perform data analysis. The `analysis.ipynb` notebook queries the HBase tables to explore top followed users, engagement rates, user growth by month, most-cited websites, languages, media usage, and tags.
119 |
120 | ### Phase 6: Workflow Execution
121 |
122 | In the Apache Airflow web interface, activate the DAG, monitor DAG execution progress, and check logs for any issues. Once the DAG is complete, review the results in HBase.
123 |
124 | #### Airflow run details
125 |
126 | 
127 |
128 | ### Phase 7: Optimization and Monitoring
129 |
130 | Optimize MapReduce scripts for better performance. Monitor HBase for storage issues and set up alerts in Airflow for task failures. Regularly monitor Hadoop using its respective web interface.
131 |
132 | ### Phase 8: Data Access Rights Configuration Updates
133 |
134 | Update API tokens if organizational roles change, ensuring they have the necessary permissions for data retrieval.
135 |
136 | ### Phase 9: Scheduling and Compliance
137 |
138 | Ensure that DAGs are scheduled at appropriate intervals for data refresh. Update the data processing log to ensure GDPR compliance by documenting all personal data from Mastodon and how it's processed.
139 |
--------------------------------------------------------------------------------
/mapReduce/python/reducer.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import sys
3 | import json
4 | # import happybase
5 |
6 | # # Initialize a connection to HBase
7 | # connection = happybase.Connection('localhost', 9090)
8 |
9 | # # Define the HBase tables
10 | # user_table = connection.table('user_data')
11 | # croissance_table = connection.table('croissance_data')
12 | # language_table = connection.table('language_data')
13 | # toot_with_media_table = connection.table('toot_with_media_data')
14 | # emoji_table = connection.table('emoji_data')
15 | # url_table = connection.table('website_data')
16 |
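# Hadoop Streaming delivers the mapper output sorted by key, so each "current_*"
# block below only has to compare consecutive lines to aggregate per-key counts
# (and, for users, the average engagement rate and latest follower count).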
17 | current_user = None
18 | current_croissance = None
19 | current_language = None
20 | current_toot_with_media = None
21 | current_emoji = None
22 | current_url = None
23 | current_tag = None
24 |
25 | earliest_followers = float('inf')
26 | old_time = "00:00:00"
27 |
28 | engagement_rate_sum = 0.0000
29 | croissance_sum = 0
30 | language_sum = 0
31 | toot_with_media_sum = 0
32 | emoji_sum = 0
33 | url_sum = 0
34 | tag_sum = 0
35 | count = 1
36 |
37 | unique_users_croissance = []
38 |
39 | user_dict = {}
40 | croissance_dict = {}
41 | language_dict = {}
42 | toot_with_media_dict = {}
43 | emoji_dict = {}
44 | url_dict = {}
45 | tag_dict = {}
46 |
47 | for line in sys.stdin:
48 | key = line.strip().split('\t')[0]
49 | key = key.strip()
50 | # if key starts with user, it's a user record
51 | if key.startswith("user"):
52 | user, strdata = line.strip().split('\t')
53 | # string to dict
54 | data = eval(strdata.strip())
55 | followers = int(data["followers"])
56 | time = data['date']
57 | engagement_rate = float(data["engagement_rate"])
58 |
59 | if current_user == user:
60 | if time > old_time:
61 | old_time = time
62 | earliest_followers = followers
63 | engagement_rate_sum += engagement_rate
64 | count += 1
65 | else:
66 | if current_user is not None:
67 | if count > 0:
68 | engagement_rate_avg = engagement_rate_sum / count
69 | user_dict["engagement_rate"] = engagement_rate_avg
70 | user_dict["followers"] = earliest_followers
71 | print(f"{current_user}\t{user_dict}")
72 | current_user = user
73 | earliest_followers = followers
74 | old_time = time
75 | engagement_rate_sum = engagement_rate
76 | count = 1
77 |
78 | # if key starts with croissance, it's a croissance record
79 | elif key.startswith("croissance"):
80 | croissance, strdata = line.strip().split('\t')
81 | data = eval(strdata.strip())
82 | user_id = data["user_id"]
83 | if croissance == current_croissance:
84 | if current_croissance is not None:
85 | if user_id not in unique_users_croissance:
86 | unique_users_croissance.append(user_id)
87 | croissance_sum += data["value"]
88 | elif croissance != current_croissance:
89 | if current_croissance is not None:
90 | croissance_dict["count"] = croissance_sum
91 | print(f"{current_croissance}\t{croissance_dict}")
92 | current_croissance = croissance
93 | croissance_sum = 1
94 | unique_users_croissance = [user_id]
95 |
96 | #if key starts with language, it's a language record
97 | elif key.startswith("language"):
98 | language, strdata = line.strip().split('\t')
99 | data = eval(strdata.strip())
100 | if language == current_language:
101 | if current_language is not None:
102 | language_sum += data["value"]
103 | elif language != current_language:
104 | if current_language is not None:
105 | language_dict["count"] = language_sum
106 | print(f"{current_language}\t{language_dict}")
107 | current_language = language
108 | language_sum = 1
109 |
110 | elif key.startswith("toot_with_media"):
111 | toot_with_media, strdata = line.strip().split('\t')
112 | data = eval(strdata.strip())
113 | if toot_with_media == current_toot_with_media:
114 | if current_toot_with_media is not None:
115 | toot_with_media_sum += data["value"]
116 | elif toot_with_media != current_toot_with_media:
117 | if current_toot_with_media is not None:
118 | toot_with_media_dict["count"] = toot_with_media_sum
119 | print(f"{current_toot_with_media}\t{toot_with_media_dict}")
120 | current_toot_with_media = toot_with_media
121 | toot_with_media_sum = 1
122 |
123 | elif key.startswith("emoji"):
124 | emoji_id , strdata = line.strip().split('\t')
125 | data = eval(strdata.strip())
126 | if emoji_id == current_emoji:
127 | if current_emoji is not None:
128 | emoji_sum += data["value"]
129 | elif emoji_id != current_emoji:
130 | if current_emoji is not None:
131 | emoji_dict["count"] = emoji_sum
132 | print(f"{current_emoji}\t{emoji_dict}")
133 | current_emoji = emoji_id
134 | emoji_sum = 1
135 |
136 | elif key.startswith("website"):
137 | website_id , strdata = line.strip().split('\t')
138 | data = eval(strdata.strip())
139 | if website_id == current_url:
140 | if current_url is not None:
141 | url_sum += data["value"]
142 | elif website_id != current_url:
143 | if current_url is not None:
144 | url_dict["count"] = url_sum
145 | print(f"{current_url}\t{url_dict}")
146 | current_url = website_id
147 | url_sum = 1
148 |
149 | elif key.startswith("tag"):
150 | tag_id , strdata = line.strip().split('\t')
151 | data = eval(strdata.strip())
152 | if tag_id == current_tag:
153 | if current_tag is not None:
154 | tag_sum += data["value"]
155 | elif tag_id != current_tag:
156 | if current_tag is not None:
157 | tag_dict["count"] = tag_sum
158 | print(f"{current_tag}\t{tag_dict}")
159 | current_tag = tag_id
160 | tag_sum = 1
161 |
162 | # Print the last data
163 | if current_croissance is not None:
164 | croissance_dict["count"] = croissance_sum
165 | print(f"{current_croissance}\t{croissance_dict}")
166 |
167 | if current_user is not None:
168 | if count > 0:
169 | user_dict.update({"engagement_rate": engagement_rate_sum / count, "followers": earliest_followers})
170 | print(f"{current_user}\t{user_dict}")
171 |
172 | if current_language is not None:
173 | language_dict["count"] = language_sum
174 | print(f"{current_language}\t{language_dict}")
175 |
176 | if current_toot_with_media is not None:
177 | toot_with_media_dict["count"] = toot_with_media_sum
178 | print(f"{current_toot_with_media}\t{toot_with_media_dict}")
179 |
180 | if current_emoji is not None:
181 | emoji_dict["count"] = emoji_sum
182 | print(f"{current_emoji}\t{emoji_dict}")
183 |
184 | if current_url is not None:
185 | url_dict["count"] = url_sum
186 | print(f"{current_url}\t{url_dict}")
187 |
188 | if current_tag is not None:
189 | tag_dict["count"] = tag_sum
190 | print(f"{current_tag}\t{tag_dict}")
--------------------------------------------------------------------------------
/loadingIntoHbase/insertion.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from hdfs import InsecureClient
3 | import json
4 | import happybase
5 |
6 |
7 |
8 |
9 | # Read the reducer output files from HDFS and insert each record into its HBase table
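# Each input line has the form "<prefix>:<row_key>\t{...}" exactly as printed by
# mapReduce/python/reducer.py, e.g. "tag:michigan\t{'count': 56}" or
# "user:1007156\t{'engagement_rate': 0.1, 'followers': 120}"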
10 | def insert_data_into_hbase(data_folder_path):
11 |
12 | # HBase connection settings
13 | hbase_host = 'localhost' # Replace with your HBase host
14 | hbase_port = 9090 # Default HBase port
15 | # Connect to HBase
16 | connection = happybase.Connection(host=hbase_host, port=hbase_port)
17 |
18 | tables_list = ["user_table", "croissance_table", "language_table", "toot_with_media_table", "emoji_table", "url_table", "tag_table"]
19 | tables = connection.tables()
20 | # remove the b' and ' from the table names
21 | tables = [table_name.decode() for table_name in tables]
22 |
23 | # Create the tables if they do not exist
24 | for table_name in tables_list:
25 | if table_name not in tables:
26 | connection.create_table(table_name, {'data': dict()})
27 | # Get the tables
28 | user_table = connection.table('user_table')
29 | croissance_table = connection.table('croissance_table')
30 | language_table = connection.table('language_table')
31 | toot_with_media_table = connection.table('toot_with_media_table')
32 | emoji_table = connection.table('emoji_table')
33 | url_table = connection.table('url_table')
34 | tag_table = connection.table('tag_table')
35 |
36 | # Initialize an HDFS client
37 | hdfs_client = InsecureClient('http://localhost:9870', user='hadoop')
38 | # get files from the data folder that start with part
39 | files = hdfs_client.list(data_folder_path)
40 | # get only the files that start with part
41 | files = [file for file in files if file.startswith('part')]
42 | # loop through the files
43 | for file in files:
44 | # get the file path
45 | file_path = data_folder_path + '/' + file
46 | # open the file
47 | with hdfs_client.read(file_path, encoding='utf-8') as reader:
48 | # read the file
49 | data = reader.read()
50 | # split the file into lines
51 | lines = data.splitlines()
52 | # loop through the lines
53 | for line in lines:
54 | # split the line into key and value
55 | key, value = line.strip().split('\t')
56 | # split the key into key and id
57 | # convert the value to a dictionary
58 | value = eval(value.strip())
59 | # if the key is user
60 | if key.startswith('user'):
61 | key, id = key.strip().split(':')
62 | # get the engagement rate
63 | engagement_rate = value['engagement_rate']
64 | # get the followers
65 | followers = value['followers']
66 | user_dict = {
67 | "engagement_rate": engagement_rate,
68 | "followers": followers
69 | }
70 | try:
71 | # Store the user data in HBase
72 | user_table.put(str(id).encode(), {b'data:engagement_rate': str(user_dict["engagement_rate"]).encode()})
73 | user_table.put(str(id).encode(), {b'data:followers': str(user_dict["followers"]).encode()})
74 | except Exception as e:
75 | # Log exceptions to standard error
76 | print(f"Error: {str(e)}")
77 | elif key.startswith('croissance'):
78 | key, id = key.strip().split(':')
79 | croissance_dict = {
80 | "count": value['count']
81 | }
82 | try:
83 | # Store the user data in HBase
84 | croissance_table.put(str(id).encode(), {b'data:count': str(croissance_dict["count"]).encode()})
85 | except Exception as e:
86 | # Log exceptions to standard error
87 | print(f"Error: {str(e)}")
88 | elif key.startswith('language'):
89 | key, id = key.strip().split(':')
90 | language_dict = {
91 | "count": value['count']
92 | }
93 | try:
94 | # Store the user data in HBase
95 | language_table.put(str(id).encode(), {b'data:count': str(language_dict["count"]).encode()})
96 | except Exception as e:
97 | # Log exceptions to standard error
98 | print(f"Error: {str(e)}")
99 | elif key.startswith('toot_with_media'):
100 | id = key.strip()
101 | toot_with_media_dict = {
102 | "count": value['count']
103 | }
104 | try:
105 | # Store the user data in HBase
106 | toot_with_media_table.put(str(id).encode(), {b'data:count': str(toot_with_media_dict["count"]).encode()})
107 | except Exception as e:
108 | # Log exceptions to standard error
109 | print(f"Error: {str(e)}")
110 | elif key.startswith('emoji'):
111 | key, id = key.strip().split(':')
112 | emoji_dict = {
113 | "count": value['count']
114 | }
115 | try:
116 | # Store the user data in HBase
117 | emoji_table.put(str(id).encode(), {b'data:count': str(emoji_dict["count"]).encode()})
118 | except Exception as e:
119 | # Log exceptions to standard error
120 | print(f"Error: {str(e)}")
121 | elif key.startswith('website'):
122 | key, id = key.strip().split(':', 1)  # keep the whole domain even if it includes a port
123 | url_dict = {
124 | "count": value['count']
125 | }
126 | try:
127 | # Store the user data in HBase
128 | url_table.put(str(id).encode(), {b'data:count': str(url_dict["count"]).encode()})
129 | except Exception as e:
130 | # Log exceptions to standard error
131 | print(f"Error: {str(e)}")
132 | elif key.startswith('tag'):
133 | key, id = key.strip().split(':')
134 | tag_dict = {
135 | "count": value['count']
136 | }
137 | try:
138 | # Store the user data in HBase
139 | tag_table.put(str(id).encode(), {b'data:count': str(tag_dict["count"]).encode()})
140 | except Exception as e:
141 | # Log exceptions to standard error
142 | print(f"Error: {str(e)}")
143 |
144 |
145 | connection.close()
146 |
147 |
148 | if __name__ == "__main__":
149 | # Get the data path from the command line
150 | data_path = sys.argv[1]
151 | # Process the data
152 | insert_data_into_hbase(data_path)
--------------------------------------------------------------------------------
/analysis.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 4,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import happybase"
10 | ]
11 | },
12 | {
13 | "cell_type": "markdown",
14 | "metadata": {},
15 | "source": [
16 | "# User Analysis\n",
17 | "---"
18 | ]
19 | },
20 | {
21 | "cell_type": "markdown",
22 | "metadata": {},
23 | "source": [
24 | "## Top 3 Followed users"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": 21,
30 | "metadata": {},
31 | "outputs": [
32 | {
33 | "name": "stdout",
34 | "output_type": "stream",
35 | "text": [
36 | "User ID: 109999724950289416, Followers: 3340001\n",
37 | "User ID: 109999725002952105, Followers: 2410000\n",
38 | "User ID: 109718180885037597, Followers: 2330004\n"
39 | ]
40 | }
41 | ],
42 | "source": [
43 | "# Connect to HBase\n",
44 | "# HBase connection settings\n",
45 | "hbase_host = 'localhost' # Replace with your HBase host\n",
46 | "hbase_port = 9090 # Default HBase port\n",
47 | "# Connect to HBase\n",
48 | "connection = happybase.Connection(host=hbase_host, port=hbase_port)\n",
49 | "\n",
50 | "table = connection.table('user_table')\n",
51 | "\n",
52 | "# Scan the table to retrieve user data\n",
53 | "scan_result = table.scan(columns=['data:followers'])\n",
54 | "\n",
55 | "# Create a dictionary to store user followers\n",
56 | "user_followers = {}\n",
57 | "\n",
58 | "# Process the scan result\n",
59 | "for key, data in scan_result:\n",
60 | " user_id = key.decode('utf-8')\n",
61 | " followers = int(data[b'data:followers'].decode('utf-8'))\n",
62 | " user_followers[user_id] = followers\n",
63 | "\n",
64 | "# Close the HBase connection\n",
65 | "connection.close()\n",
66 | "\n",
67 | "# Sort the users by followers and get the top users\n",
68 | "top_users = sorted(user_followers.items(), key=lambda item: item[1], reverse=True)[:3]\n",
69 | "\n",
70 | "# Print the top users and their followers\n",
71 | "for user_id, followers in top_users:\n",
72 | " print(f\"User ID: {user_id}, Followers: {followers}\")\n"
73 | ]
74 | },
75 | {
76 | "cell_type": "markdown",
77 | "metadata": {},
78 | "source": [
79 | "## Users with highest engagement rate"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": 27,
85 | "metadata": {},
86 | "outputs": [
87 | {
88 | "name": "stdout",
89 | "output_type": "stream",
90 | "text": [
91 | "User ID: 110643001753379072, Engagement Rate: 19.05 %\n",
92 | "User ID: 111243750378556518, Engagement Rate: 7.14 %\n",
93 | "User ID: 110657506981999016, Engagement Rate: 5.62 %\n"
94 | ]
95 | }
96 | ],
97 | "source": [
98 | "# Connect to HBase\n",
99 | "# HBase connection settings\n",
100 | "hbase_host = 'localhost' # Replace with your HBase host\n",
101 | "hbase_port = 9090 # Default HBase port\n",
102 | "# Connect to HBase\n",
103 | "connection = happybase.Connection(host=hbase_host, port=hbase_port)\n",
104 | "\n",
105 | "# Select the table\n",
106 | "table = connection.table('user_table')\n",
107 | "\n",
108 | "# Scan the table to retrieve user data\n",
109 | "scan_result = table.scan(columns=['data:engagement_rate'])\n",
110 | "\n",
111 | "# Create a dictionary to store user engagement rates\n",
112 | "user_engagement = {}\n",
113 | "\n",
114 | "# Process the scan result\n",
115 | "for key, data in scan_result:\n",
116 | " user_id = key.decode('utf-8')\n",
117 | " engagement_rate = float(data[b'data:engagement_rate'].decode('utf-8'))\n",
118 | " user_engagement[user_id] = engagement_rate\n",
119 | "\n",
120 | "# Close the HBase connection\n",
121 | "connection.close()\n",
122 | "\n",
123 | "# Sort the users by engagement rate and get the top users\n",
124 | "top_users = sorted(user_engagement.items(), key=lambda item: item[1], reverse=True)[:3]\n",
125 | "\n",
126 | "# Print the top users and their engagement rates\n",
127 | "for user_id, engagement_rate in top_users:\n",
128 | " eng = float(engagement_rate) * 100\n",
129 | " print(f\"User ID: {user_id}, Engagement Rate: {eng:.2f} %\")"
130 | ]
131 | },
132 | {
133 | "cell_type": "markdown",
134 | "metadata": {},
135 | "source": [
136 | "## User growth over time"
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": 28,
142 | "metadata": {},
143 | "outputs": [
144 | {
145 | "name": "stdout",
146 | "output_type": "stream",
147 | "text": [
148 | "Month: 2022-11, User Count: 45\n",
149 | "Month: 2022-12, User Count: 14\n",
150 | "Month: 2022-04, User Count: 13\n"
151 | ]
152 | }
153 | ],
154 | "source": [
155 | "# Connect to HBase\n",
156 | "# HBase connection settings\n",
157 | "hbase_host = 'localhost' # Replace with your HBase host\n",
158 | "hbase_port = 9090 # Default HBase port\n",
159 | "# Connect to HBase\n",
160 | "connection = happybase.Connection(host=hbase_host, port=hbase_port)\n",
161 | "\n",
162 | "# Select the table\n",
163 | "table = connection.table('croissance_table')\n",
164 | "\n",
165 | "# Scan the table to retrieve data\n",
166 | "scan_result = table.scan(columns=['data:count'])\n",
167 | "\n",
168 | "# Create a dictionary to store month-wise user counts\n",
169 | "month_user_counts = {}\n",
170 | "\n",
171 | "# Process the scan result\n",
172 | "for key, data in scan_result:\n",
173 | " month = key.decode('utf-8')\n",
174 | " count = int(data[b'data:count'].decode('utf-8'))\n",
175 | " month_user_counts[month] = count\n",
176 | "\n",
177 | "# Close the HBase connection\n",
178 | "connection.close()\n",
179 | "\n",
180 | "# Sort the months by user counts and get the top 3\n",
181 | "top_months = sorted(month_user_counts.items(), key=lambda item: item[1], reverse=True)[:3]\n",
182 | "\n",
183 | "# Print the top 3 months and their user counts\n",
184 | "for month, count in top_months:\n",
185 | " print(f\"Month: {month}, User Count: {count}\")"
186 | ]
187 | },
188 | {
189 | "cell_type": "markdown",
190 | "metadata": {},
191 | "source": [
192 | "# Content analysis\n",
193 | "---"
194 | ]
195 | },
196 | {
197 | "cell_type": "markdown",
198 | "metadata": {},
199 | "source": [
200 | "## Top 3 websites"
201 | ]
202 | },
203 | {
204 | "cell_type": "code",
205 | "execution_count": 29,
206 | "metadata": {},
207 | "outputs": [
208 | {
209 | "name": "stdout",
210 | "output_type": "stream",
211 | "text": [
212 | "Website: mastodon.social, Mention Count: 137\n",
213 | "Website: www.telam.com.ar, Mention Count: 70\n",
214 | "Website: twitter.com, Mention Count: 58\n"
215 | ]
216 | }
217 | ],
218 | "source": [
219 | "# Connect to HBase\n",
220 | "# HBase connection settings\n",
221 | "hbase_host = 'localhost' # Replace with your HBase host\n",
222 | "hbase_port = 9090 # Default HBase port\n",
223 | "# Connect to HBase\n",
224 | "connection = happybase.Connection(host=hbase_host, port=hbase_port)\n",
225 | "\n",
226 | "# Select the table\n",
227 | "table = connection.table('url_table')\n",
228 | "\n",
229 | "# Scan the table to retrieve data\n",
230 | "scan_result = table.scan(columns=['data:count'])\n",
231 | "\n",
232 | "# Create a dictionary to store website mention counts\n",
233 | "website_counts = {}\n",
234 | "\n",
235 | "# Process the scan result\n",
236 | "for key, data in scan_result:\n",
237 | " website = key.decode('utf-8')\n",
238 | " count = int(data[b'data:count'].decode('utf-8'))\n",
239 | " website_counts[website] = count\n",
240 | "\n",
241 | "# Close the HBase connection\n",
242 | "connection.close()\n",
243 | "\n",
244 | "# Sort the websites by mention counts and get the top 3\n",
245 | "top_websites = sorted(website_counts.items(), key=lambda item: item[1], reverse=True)[:3]\n",
246 | "\n",
247 | "# Print the top 3 websites and their mention counts\n",
248 | "for website, count in top_websites:\n",
249 | " print(f\"Website: {website}, Mention Count: {count}\")"
250 | ]
251 | },
252 | {
253 | "cell_type": "markdown",
254 | "metadata": {},
255 | "source": [
256 | "# Language analysis\n",
257 | "---"
258 | ]
259 | },
260 | {
261 | "cell_type": "markdown",
262 | "metadata": {},
263 | "source": [
264 | "## Most used languages"
265 | ]
266 | },
267 | {
268 | "cell_type": "code",
269 | "execution_count": 30,
270 | "metadata": {},
271 | "outputs": [
272 | {
273 | "name": "stdout",
274 | "output_type": "stream",
275 | "text": [
276 | "Language: en, Count: 1677\n",
277 | "Language: de, Count: 313\n",
278 | "Language: es, Count: 115\n"
279 | ]
280 | }
281 | ],
282 | "source": [
283 | "# Connect to HBase\n",
284 | "# HBase connection settings\n",
285 | "hbase_host = 'localhost' # Replace with your HBase host\n",
286 | "hbase_port = 9090 # Default HBase port\n",
287 | "# Connect to HBase\n",
288 | "connection = happybase.Connection(host=hbase_host, port=hbase_port)\n",
289 | "\n",
290 | "\n",
291 | "# Select the table\n",
292 | "table = connection.table('language_table')\n",
293 | "\n",
294 | "# Scan the table to retrieve language data\n",
295 | "scan_result = table.scan(columns=['data:count'])\n",
296 | "\n",
297 | "# Create a dictionary to store language counts\n",
298 | "language_counts = {}\n",
299 | "\n",
300 | "# Process the scan result\n",
301 | "for key, data in scan_result:\n",
302 | " language = key.decode('utf-8')\n",
303 | " count = int(data[b'data:count'].decode('utf-8'))\n",
304 | " language_counts[language] = count\n",
305 | "\n",
306 | "# Close the HBase connection\n",
307 | "connection.close()\n",
308 | "\n",
309 | "# Sort the language counts and get the top 3\n",
310 | "top_languages = sorted(language_counts.items(), key=lambda item: item[1], reverse=True)[:3]\n",
311 | "\n",
312 | "# Print the top 3 languages and their counts\n",
313 | "for language, count in top_languages:\n",
314 | " print(f\"Language: {language}, Count: {count}\")"
315 | ]
316 | },
317 | {
318 | "cell_type": "markdown",
319 | "metadata": {},
320 | "source": [
321 | "# Media engagement\n",
322 | "---"
323 | ]
324 | },
325 | {
326 | "cell_type": "markdown",
327 | "metadata": {},
328 | "source": [
329 | "## Count of posts with media"
330 | ]
331 | },
332 | {
333 | "cell_type": "code",
334 | "execution_count": 35,
335 | "metadata": {},
336 | "outputs": [
337 | {
338 | "name": "stdout",
339 | "output_type": "stream",
340 | "text": [
341 | "There is 619 Posts with media\n"
342 | ]
343 | }
344 | ],
345 | "source": [
346 | "# Connect to HBase\n",
347 | "# HBase connection settings\n",
348 | "hbase_host = 'localhost' # Replace with your HBase host\n",
349 | "hbase_port = 9090 # Default HBase port\n",
350 | "# Connect to HBase\n",
351 | "connection = happybase.Connection(host=hbase_host, port=hbase_port)\n",
352 | "\n",
353 | "# Select the table\n",
354 | "table = connection.table('toot_with_media_table')\n",
355 | "\n",
356 | "# Specify the row key you want to retrieve\n",
357 | "row_key = b'toot_with_media' # Use bytes for the row key\n",
358 | "\n",
359 | "# Use the get method to retrieve the row\n",
360 | "row_data = table.row(row_key)\n",
361 | "\n",
362 | "# Close the HBase connection\n",
363 | "connection.close()\n",
364 | "\n",
365 | "# Print or process the retrieved data\n",
366 | "if row_data:\n",
367 | " for column, cell in row_data.items():\n",
368 | " print(f\"There is {cell.decode()} Posts with media\")\n",
369 | "else:\n",
370 | " print(f\"Row with key '{row_key.decode('utf-8')}' not found in the table.\")"
371 | ]
372 | },
373 | {
374 | "cell_type": "markdown",
375 | "metadata": {},
376 | "source": [
377 | "# Tags and mentions analysis\n",
378 | "---"
379 | ]
380 | },
381 | {
382 | "cell_type": "markdown",
383 | "metadata": {},
384 | "source": [
385 | "## Most used tags"
386 | ]
387 | },
388 | {
389 | "cell_type": "code",
390 | "execution_count": 38,
391 | "metadata": {},
392 | "outputs": [
393 | {
394 | "name": "stdout",
395 | "output_type": "stream",
396 | "text": [
397 | "Tag: michigan, Count: 56\n",
398 | "Tag: press, Count: 46\n",
399 | "Tag: genocide, Count: 42\n"
400 | ]
401 | }
402 | ],
403 | "source": [
404 | "# Connect to HBase\n",
405 | "# HBase connection settings\n",
406 | "hbase_host = 'localhost' # Replace with your HBase host\n",
407 | "hbase_port = 9090 # Default HBase port\n",
408 | "# Connect to HBase\n",
409 | "connection = happybase.Connection(host=hbase_host, port=hbase_port)\n",
410 | "\n",
411 | "\n",
412 | "# Select the table\n",
413 | "table = connection.table('tag_table')\n",
414 | "\n",
415 | "# Scan the table to retrieve data\n",
416 | "scan_result = table.scan(columns=['data:count'])\n",
417 | "\n",
418 | "# Create a dictionary to store tag counts\n",
419 | "tag_counts = {}\n",
420 | "\n",
421 | "# Process the scan result\n",
422 | "for key, data in scan_result:\n",
423 | " tag = key.decode('utf-8')\n",
424 | " count = int(data[b'data:count'].decode('utf-8'))\n",
425 | " tag_counts[tag] = count\n",
426 | "\n",
427 | "# Close the HBase connection\n",
428 | "connection.close()\n",
429 | "\n",
430 | "# Sort the tags by count and get the top 3\n",
431 | "top_tags = sorted(tag_counts.items(), key=lambda item: item[1], reverse=True)[:3]\n",
432 | "\n",
433 | "# Print the top 3 tags and their counts\n",
434 | "for tag, count in top_tags:\n",
435 | " print(f\"Tag: {tag}, Count: {count}\")"
436 | ]
437 | }
438 | ],
439 | "metadata": {
440 | "kernelspec": {
441 | "display_name": "Python 3",
442 | "language": "python",
443 | "name": "python3"
444 | },
445 | "language_info": {
446 | "codemirror_mode": {
447 | "name": "ipython",
448 | "version": 3
449 | },
450 | "file_extension": ".py",
451 | "mimetype": "text/x-python",
452 | "name": "python",
453 | "nbconvert_exporter": "python",
454 | "pygments_lexer": "ipython3",
455 | "version": "3.10.12"
456 | }
457 | },
458 | "nbformat": 4,
459 | "nbformat_minor": 2
460 | }
461 |
--------------------------------------------------------------------------------