├── .gitignore ├── dataCollection ├── last_toot_id.txt ├── __pycache__ │ └── loadData.cpython-39.pyc └── loadData.py ├── requirements.txt ├── images ├── gantt.PNG ├── airflow2.PNG └── workflow.png ├── GDPR Compliance.pdf ├── Commands.txt ├── mapReduce └── python │ ├── mapper.py │ └── reducer.py ├── airFlowDAG └── mastadon_dag.py ├── README.md ├── loadingIntoHbase └── insertion.py └── analysis.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | # Environment file 2 | .env -------------------------------------------------------------------------------- /dataCollection/last_toot_id.txt: -------------------------------------------------------------------------------- 1 | 111291292042375273 -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Mastodon.py 2 | python-dotenv 3 | hdfs 4 | pandas 5 | happybase -------------------------------------------------------------------------------- /images/gantt.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SAAD-BEN/Mastadon_data_analysis_Airflow_Hadoop_Hbase/HEAD/images/gantt.PNG -------------------------------------------------------------------------------- /GDPR Compliance.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SAAD-BEN/Mastadon_data_analysis_Airflow_Hadoop_Hbase/HEAD/GDPR Compliance.pdf -------------------------------------------------------------------------------- /images/airflow2.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SAAD-BEN/Mastadon_data_analysis_Airflow_Hadoop_Hbase/HEAD/images/airflow2.PNG -------------------------------------------------------------------------------- /images/workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SAAD-BEN/Mastadon_data_analysis_Airflow_Hadoop_Hbase/HEAD/images/workflow.png -------------------------------------------------------------------------------- /dataCollection/__pycache__/loadData.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SAAD-BEN/Mastadon_data_analysis_Airflow_Hadoop_Hbase/HEAD/dataCollection/__pycache__/loadData.cpython-39.pyc -------------------------------------------------------------------------------- /Commands.txt: -------------------------------------------------------------------------------- 1 | sudo pip install -r /home/project/Mastadon_data_analysis_Airflow_Hadoop_Hbase/requirements.txt 2 | sudo python3 /home/project/Mastadon_data_analysis_Airflow_Hadoop_Hbase/dataCollection/loadData.py 3 | 4 | hadoop fs -rm -r /raw/ 5 | 6 | # Job run 7 | hadoop jar /usr/local/hadoop/share/hadoop/tools/lib/hadoop-streaming-3.3.6.jar -mapper /home/project/Mastadon_data_analysis_Airflow_Hadoop_Hbase/mapReduce/python/mapper.py -reducer /home/project/Mastadon_data_analysis_Airflow_Hadoop_Hbase/mapReduce/python/reducer.py -input /raw/2023-10-21/19-0-posts.json -output /test/anass32/ 8 | hadoop fs -cat /test/anass30/part-00000 9 | 10 | #hbase 11 | /usr/local/Hbase/bin/start-hbase.sh 12 | /usr/local/Hbase/bin/stop-hbase.sh 13 | 14 | /usr/local/Hbase/bin/hbase-daemon.sh start thrift 15 | /usr/local/Hbase/bin/hbase-daemon.sh stop thrift 16 | 17 
| sudo /usr/local/Hbase/bin/hbase shell 18 | 19 | # Airflow 20 | /AirFlow/airflow-environment/airflow scheduler 21 | /AirFlow/airflow-environment/airflow webserver -p 8080 22 | 23 | 24 | mapred streaming -files /home/project/Mastadon_data_analysis_Airflow_Hadoop_Hbase/mapReduce/python/mapper.py,/home/project/Mastadon_data_analysis_Airflow_Hadoop_Hbase/mapReduce/python/reducer.py -mapper mapper.py -reducer reducer.py -input /raw/2023-10-21/19-0-posts.json -------------------------------------------------------------------------------- /mapReduce/python/mapper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import json 3 | import sys 4 | import re 5 | from urllib.parse import urlparse 6 | 7 | def process_data(input_data): 8 | pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+' 9 | 10 | for line in input_data: 11 | try: 12 | data = json.loads(line) 13 | toot_with_media_dict = {} 14 | toot_with_media_id = 'toot_with_media' 15 | 16 | created_at = data["created_at"].split(' ')[1].split('+')[0] 17 | account = data.get("account") 18 | media_attachments = data.get("media_attachments") 19 | emojis = data.get("emojis") if data.get("emojis") else [] 20 | websites = data.get("content") if data.get("content") else [] 21 | tags = data.get("tags") if data.get("tags") else [] 22 | 23 | if account: 24 | user_id = 'user:' + str(account.get('id')) 25 | followers = int(account.get('followers_count', 0)) 26 | reblogs_count = data.get('reblogs_count', 0) 27 | favourites_count = data.get('favourites_count', 0) 28 | 29 | engagement_rate = (reblogs_count + favourites_count) / followers if followers > 0 else 0 30 | 31 | croissance_id = "croissance:" + account.get('created_at').split('-')[0] + '-' + account.get('created_at').split('-')[1] 32 | 33 | user_data = { 34 | "date": created_at, 35 | "followers": followers, 36 | "engagement_rate": engagement_rate 37 | } 38 | 39 | croissance_data = {"value": 1, "user_id": user_id} 40 | 41 | # Emit key-value pairs for the reducer 42 | print(f"{user_id}\t{user_data}") 43 | print(f"{croissance_id}\t{croissance_data}") 44 | 45 | if media_attachments: 46 | toot_with_media_dict["value"] = 1 47 | print(f"{toot_with_media_id}\t{toot_with_media_dict}") 48 | 49 | language_id = "language:" + data.get('language') 50 | language_data = {"value": 1} 51 | print(f"{language_id}\t{language_data}") 52 | 53 | if emojis != []: 54 | for emoji in emojis: 55 | emoji_id = "emoji:" + emoji.get('shortcode') 56 | emoji_data = {"value": 1} 57 | print(f"{emoji_id}\t{emoji_data}") 58 | 59 | if websites != []: 60 | urls = re.search(pattern, websites) 61 | if urls: 62 | website_id = "website:" + urlparse(urls.group(0)).netloc 63 | website_data = {"value": 1} 64 | print(f"{website_id}\t{website_data}") 65 | 66 | if tags != []: 67 | for tag in tags: 68 | tag_id = "tag:" + tag.get('name') 69 | tag_data = {"value": 1} 70 | print(f"{tag_id}\t{tag_data}") 71 | 72 | except Exception as e: 73 | # Log exceptions to standard error 74 | print(f"Error: {str(e)}", file=sys.stderr) 75 | 76 | # Example usage of the process_data function 77 | if __name__ == "__main__": 78 | input_data = sys.stdin 79 | process_data(input_data) 80 | -------------------------------------------------------------------------------- /airFlowDAG/mastadon_dag.py: -------------------------------------------------------------------------------- 1 | from airflow import DAG 2 | from airflow.operators.python_operator import PythonOperator 3 | from 
airflow.models import Variable 4 | from datetime import datetime, timedelta 5 | from airflow.utils.email import send_email # Airflow's e-mail helper (uses the SMTP settings from airflow.cfg) 6 | import subprocess 7 | import sys 8 | sys.path.insert(0, '/home/project/Mastadon_data_analysis_Airflow_Hadoop_Hbase/dataCollection') 9 | sys.path.insert(0, '/home/project/Mastadon_data_analysis_Airflow_Hadoop_Hbase/loadingIntoHbase') 10 | from loadData import retrieve_and_save_mastodon_data 11 | from insertion import insert_data_into_hbase 12 | 13 | def success_callback(context): 14 | send_email( 15 | to=["saad.bentaleb08@gmail.com"], 16 | subject="Mastodon Data Pipeline Succeeded", 17 | html_content="The Mastodon Data Pipeline has succeeded." 18 | ) 19 | 20 | def failure_callback(context): 21 | send_email( 22 | to=["saad.bentaleb08@gmail.com"], 23 | subject="Mastodon Data Pipeline Failed", 24 | html_content="The Mastodon Data Pipeline has failed." 25 | ) 26 | 27 | default_args = { 28 | 'owner': 'admin', 29 | 'start_date': datetime(2023, 10, 23), 30 | 'retries': 0, # Set to 0 to disable retries for the DAG 31 | } 32 | 33 | with DAG('mastodon_data_pipeline1', default_args=default_args, schedule_interval='@daily', on_success_callback=success_callback, on_failure_callback=failure_callback) as dag: 34 | def set_data_path(**kwargs): 35 | data_path = retrieve_and_save_mastodon_data() # Run the data collection function 36 | processed_path = '/processed/' + datetime.now().strftime('%Y-%m-%d/%H-%M') + '/' 37 | Variable.set("data_path", data_path) 38 | Variable.set("processed_path", processed_path) 39 | 40 | # Create PythonOperator tasks 41 | retrieve_and_save_mastodon_data_task = PythonOperator( 42 | task_id='mastodon_data_pipeline1', 43 | provide_context=True, 44 | python_callable=set_data_path, 45 | dag=dag, 46 | ) 47 | 48 | def run_map_reduce(**kwargs): 49 | data_path = Variable.get("data_path") # Retrieve the data path from the variable 50 | output_path = Variable.get("processed_path") # Retrieve the processed path from the variable 51 | # Use subprocess to run Hadoop MapReduce job with the provided data path 52 | hadoop_command = f"hadoop jar /usr/local/hadoop/share/hadoop/tools/lib/hadoop-streaming-3.3.6.jar " \ 53 | f"-mapper /home/project/Mastadon_data_analysis_Airflow_Hadoop_Hbase/mapReduce/python/mapper.py " \ 54 | f"-reducer /home/project/Mastadon_data_analysis_Airflow_Hadoop_Hbase/mapReduce/python/reducer.py " \ 55 | f"-input {data_path} " \ 56 | f"-output {output_path}" 57 | subprocess.run(hadoop_command, shell=True) 58 | 59 | run_map_reduce_task = PythonOperator( 60 | task_id='run_map_reduce', 61 | provide_context=True, 62 | python_callable=run_map_reduce, 63 | dag=dag, 64 | ) 65 | 66 | def run_hbase_insertion(**kwargs): 67 | processed_path = Variable.get("processed_path") 68 | insert_data_into_hbase(processed_path) 69 | 70 | run_hbase_insertion_task = PythonOperator( 71 | task_id='run_hbase_insertion', 72 | provide_context=True, 73 | python_callable=run_hbase_insertion, 74 | dag=dag, 75 | ) 76 | 77 | retrieve_and_save_mastodon_data_task >> run_map_reduce_task >> run_hbase_insertion_task 78 | 79 | if __name__ == "__main__": 80 | dag.cli() 81 | 82 | -------------------------------------------------------------------------------- /dataCollection/loadData.py: -------------------------------------------------------------------------------- 1 | from mastodon import Mastodon 2 | from dotenv import load_dotenv 3 | import os 4 | from hdfs import InsecureClient 5 | import datetime 6 | import time 7 | import json 8 | 9 | load_dotenv() 10 | 11 | def retrieve_and_save_mastodon_data(): 12 | # Connect to the Mastodon API 13 |
mastodon = Mastodon( 14 | client_id=os.getenv('Client_key'), 15 | client_secret=os.getenv('Client_secret'), 16 | access_token=os.getenv('Access_token'), 17 | api_base_url="https://mastodon.social" 18 | ) 19 | 20 | # Initialize an HDFS client 21 | hdfs_client = InsecureClient('http://localhost:9870', user='hadoop') 22 | 23 | # Get the current date and time 24 | now = datetime.datetime.now() 25 | directory_path = '/raw/' + str(now.year) + '-' + str(now.month) + '-' + str(now.day) 26 | 27 | # Check if the directory already exists 28 | if not hdfs_client.status(directory_path, strict=False): 29 | hdfs_client.makedirs(directory_path) 30 | 31 | # Define the HDFS path where you want to save the data 32 | hdfs_path = directory_path + '/' + str(now.hour) + '-' + str(now.minute) 33 | 34 | # Retrieve the last toot ID from a local file or start with None 35 | try: 36 | with open('/home/project/Mastadon_data_analysis_Airflow_Hadoop_Hbase/dataCollection/last_toot_id.txt', 'r', encoding='utf-8') as reader: 37 | last_toot_id = reader.read().strip() 38 | except FileNotFoundError: 39 | last_toot_id = None 40 | 41 | public_posts = [] 42 | 43 | # Specify the duration of collecting data in minutes 44 | duration = 20 # Seconds 45 | 46 | # Get the current time 47 | start_time = time.time() 48 | 49 | while True: 50 | # Check if 10 minutes have passed 51 | if time.time() - start_time >= duration: 52 | break # Retrieve public posts 53 | 54 | new = mastodon.timeline_public(limit=40, since_id=last_toot_id) 55 | 56 | # Append the current run's public posts to the list 57 | public_posts.extend(new) 58 | print(f'Number of posts retrieved: {str(len(public_posts))}', end='\r') 59 | 60 | # Update the last_toot_id 61 | if public_posts: 62 | latest_toot = public_posts[0] 63 | last_toot_id = str(latest_toot['id']) 64 | 65 | class CustomJSONEncoder(json.JSONEncoder): 66 | def default(self, o): 67 | if isinstance(o, datetime.datetime): 68 | # Convert datetime to a string representation 69 | return o.strftime('%Y-%m-%d %H:%M:%S %z') 70 | elif hasattr(o, '__dict__'): 71 | # Handle other objects with __dict__ attribute 72 | return o.__dict__ 73 | return super().default(o) 74 | 75 | formatted_data = [] 76 | for obj in public_posts: 77 | formatted_obj = json.dumps(obj, separators=(',', ':'), default=str, cls=CustomJSONEncoder) 78 | formatted_data.append(formatted_obj) 79 | 80 | # Convert the formatted data to a string 81 | formatted_data_str = '\n'.join(formatted_data) 82 | 83 | # Save the preprocessed data to HDFS 84 | with hdfs_client.write(hdfs_path + '-posts.json', encoding='utf-8') as writer: 85 | writer.write(formatted_data_str) 86 | 87 | print('Data saved successfully to HDFS: ' + hdfs_path + '-posts.json') 88 | 89 | # After retrieving the public posts, you can save the latest toot_id to a local file. 
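# (The saved ID is read back at the start of the next run and passed to
# timeline_public(since_id=...), so each execution only collects toots that
# are newer than the ones already written to HDFS.)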
90 | if public_posts: 91 | latest_toot = public_posts[0] # Assuming the latest toot is at the first position 92 | latest_toot_id = latest_toot['id'] 93 | 94 | # Convert latest_toot_id to a string 95 | latest_toot_id_str = str(latest_toot_id) 96 | 97 | # Define the path to the local file 98 | local_file_path = '/home/project/Mastadon_data_analysis_Airflow_Hadoop_Hbase/dataCollection/last_toot_id.txt' 99 | 100 | # Update or create the local file with the latest_toot_id 101 | with open(local_file_path, 'w', encoding='utf-8') as writer: 102 | writer.write(latest_toot_id_str) 103 | 104 | 105 | return hdfs_path + '-posts.json' 106 | 107 | if __name__ == '__main__': 108 | retrieve_and_save_mastodon_data() -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Mastodon Data Pipeline README 2 | 3 | This README provides an overview of a data pipeline project that consists of four main phases: data extraction from the Mastodon API, data processing with Hadoop MapReduce using Python streaming, data storage in HBase, and orchestration with Apache Airflow for automated daily execution. The project aims to collect and analyze data from the Mastodon platform, a federated social media platform. 4 | 5 | ## Folder structure 6 | ``` 7 | Repository Root 8 | ├── airFlowDAG 9 | │ ├── mastadon_dag.py 10 | │ 11 | ├── dataCollection 12 | │ ├── last_toot_id.txt 13 | │ ├── loadData.py 14 | │ ├── __pycache__ 15 | │ │ └── loadData.cpython-39.pyc 16 | │ 17 | ├── images 18 | │ ├── airflow2.PNG 19 | │ ├── gantt.PNG 20 | │ ├── workflow.png 21 | │ 22 | ├── loadingIntoHbase 23 | │ ├── insertion.py 24 | │ 25 | ├── mapReduce 26 | │ ├── python 27 | │ │ ├── mapper.py 28 | │ │ ├── reducer.py 29 | │ 30 | ├── .gitignore 31 | ├── Commands.txt 32 | ├── GDPR Compliance.pdf 33 | ├── README.md 34 | ├── analysis.ipynb 35 | ├── requirements.txt 36 | 37 | 38 | 39 | 40 | ``` 41 | 42 | ## Project Overview 43 | 44 | 45 | As a Data Developer, your role in this project is to set up an automated pipeline covering the workflow shown below: 46 | --- 47 | ![Workflow](/images/workflow.png) 48 | --- 49 | 50 | ### Phase 1: Data Collection 51 | 52 | - **Data Extraction:** Utilize the Mastodon API with your access tokens to gather raw data from the Mastodon platform. 53 | 54 | - **Raw Data Storage:** Store the raw data in a distributed file system such as HDFS for scalability. 55 | 56 | - **HDFS Data Lake Modeling:** Define the data lake schema for HDFS. 57 | 58 | ### Phase 2: Data Processing with MapReduce 59 | 60 | - **Mapper:** Process the input data and generate key-value pairs for the desired metrics (user followers, engagement rate, URLs, emojis, etc.). 61 | 62 | - **Reducer:** Aggregate the key-value pairs produced by the mapper. 63 | 64 | - **MapReduce Job Execution:** Use the Hadoop Streaming API to execute the MapReduce job, providing the mapper and reducer scripts as inputs. 65 | 66 | - **Monitoring:** Keep track of job progress through the Hadoop web UI. 67 | 68 | ### Phase 3: Data Storage in HBase 69 | 70 | - **HBase Schema Design:** Design the HBase table schemas based on the information you want to extract. 71 | 72 | | **Table Name** | **Description** | **Schema** | 73 | |--------------------------|-----------------------------------------------|-------------------------------------------| 74 | | `language_table` | Stores language counts. | - Row Key: Language code (e.g., 'en', 'es')<br>
- Columns:
- `data:count`: Count of users/data.
- Timestamp: Record timestamp. | 75 | | `user_table` | Stores user information, engagement rates, and followers. | - Row Key: User ID (e.g., '1007156', '10106')
- Columns:
- `data:engagement_rate`: User's engagement rate.
- `data:followers`: Number of followers.
- Timestamp: Record timestamp. | 76 | | `croissance_table` | Records user creation counts by month. | - Row Key: Month (e.g., '2007-07', '2008-01')
- Columns:
- `data:count`: Count of user creations.
- Timestamp: Record timestamp. | 77 | | `url_table` | Tracks mentions of external websites. | - Row Key: Website URL (e.g., 'a.gup.pe', 'abcgazetesi.com')
- Columns:
- `data:count`: Count of mentions.
- Timestamp: Record timestamp. | 78 | | `toot_with_media_table` | Records the count of toots with media content. | - Row Key: Fixed key 'toot_with_media'
- Columns:
- `data:count`: Count of toots with media.
- Timestamp: Record timestamp. | 79 | | `tag_table` | Stores counts of used tags. | - Row Key: Tag name (e.g., '10yrsago', '17thcentury')
- Columns:
- `data:count`: Count of tag usage.
- Timestamp: Record timestamp. | 80 | 81 | 82 | - **Best Practices:** Follow best practices for row key design, column family design, compression, bloom filters, batch inserts, etc. 83 | 84 | - **Table Creation:** Create the necessary tables in HBase. 85 | 86 | ``` 87 | # HBase connection settings 88 | hbase_host = 'localhost' # Replace with your HBase host 89 | hbase_port = 9090 # Default HBase port 90 | # Connect to HBase 91 | connection = happybase.Connection(host=hbase_host, port=hbase_port) 92 | 93 | tables_list = ["user_table", "croissance_table", "language_table", "toot_with_media_table", "emoji_table", "url_table", "tag_table"] 94 | tables = connection.tables() 95 | # Decode the byte-string table names returned by HBase 96 | tables = [table_name.decode() for table_name in tables] 97 | 98 | # Create the tables if they do not exist 99 | for table_name in tables_list: 100 | if table_name not in tables: 101 | connection.create_table(table_name, {'data': dict()}) 102 | 103 | connection.close() 104 | ``` 105 | 106 | - **Data Insertion:** Populate the output from the reducer into the HBase tables using a Python HBase client or your preferred method (a batched-write sketch is given in the appendix at the end of this README). 107 | 108 | ### Phase 4: Orchestration with Apache Airflow 109 | 110 | - **Workflow Orchestration:** Define a Directed Acyclic Graph (DAG) to orchestrate the entire workflow. 111 | 112 | - **Task Creation:** Create tasks for running the MapReduce job and storing results in HBase. 113 | 114 | - **Monitoring and Error Handling:** Monitor task progress in the Airflow UI and handle errors or failures (the DAG's callbacks send e-mail notifications on success and failure). 115 | 116 | ### Phase 5: Data Analysis 117 | 118 | After successfully completing the previous phases, you can perform data analysis. Example queries against the HBase tables (top followed users, highest engagement rates, user growth over time, most-mentioned websites, most-used languages, media counts, and top tags) are provided in `analysis.ipynb`. 119 | 120 | ### Phase 6: Workflow Execution 121 | 122 | In the Apache Airflow web interface, activate the DAG, monitor DAG execution progress, and check logs for any issues. Once the DAG is complete, review the results in HBase. 123 | 124 | #### Airflow run details 125 | 126 | ![AirflowRun](/images/airflow2.PNG) 127 | 128 | ### Phase 7: Optimization and Monitoring 129 | 130 | Optimize the MapReduce scripts for better performance. Monitor HBase for storage issues and set up alerts in Airflow for task failures. Regularly monitor Hadoop through its web interface. 131 | 132 | ### Phase 8: Data Access Rights Configuration Updates 133 | 134 | Update API tokens if organizational roles change, ensuring they have the necessary permissions for data retrieval. 135 | 136 | ### Phase 9: Scheduling and Compliance 137 | 138 | Ensure that DAGs are scheduled at appropriate intervals for data refresh. Keep the data processing log up to date for GDPR compliance by documenting all personal data collected from Mastodon and how it is processed. 
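### Appendix: Batched HBase writes (sketch)

`insertion.py` writes each row with an individual `put` call. The Phase 3 best practices mention batch inserts; the snippet below is a minimal sketch of how the same rows could be written through happybase's `Batch` API instead. It is not the code used by the pipeline: the connection settings mirror the table-creation snippet above, and the `reducer_rows` sample values are hypothetical.

```
import happybase

# HBase connection settings (same assumptions as the table-creation snippet above)
hbase_host = 'localhost'
hbase_port = 9090
connection = happybase.Connection(host=hbase_host, port=hbase_port)
user_table = connection.table('user_table')

# Hypothetical reducer output for two users: {row_key: {qualifier: value}}
reducer_rows = {
    '1007156': {'engagement_rate': 0.031, 'followers': 1200},
    '10106': {'engagement_rate': 0.0, 'followers': 87},
}

# batch() buffers the puts and flushes them in groups of batch_size,
# instead of issuing one Thrift round trip per row
with user_table.batch(batch_size=500) as batch:
    for row_key, values in reducer_rows.items():
        batch.put(row_key.encode(), {
            b'data:engagement_rate': str(values['engagement_rate']).encode(),
            b'data:followers': str(values['followers']).encode(),
        })
# leaving the `with` block sends any mutations still in the buffer

connection.close()
```

Values are stored as UTF-8 byte strings, exactly as `insertion.py` writes them, so `analysis.ipynb` can read batched and row-by-row inserts the same way.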
139 | -------------------------------------------------------------------------------- /mapReduce/python/reducer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys 3 | import json 4 | # import happybase 5 | 6 | # # Initialize a connection to HBase 7 | # connection = happybase.Connection('localhost', 9090) 8 | 9 | # # Define the HBase tables 10 | # user_table = connection.table('user_data') 11 | # croissance_table = connection.table('croissance_data') 12 | # language_table = connection.table('language_data') 13 | # toot_with_media_table = connection.table('toot_with_media_data') 14 | # emoji_table = connection.table('emoji_data') 15 | # url_table = connection.table('website_data') 16 | 17 | current_user = None 18 | current_croissance = None 19 | current_language = None 20 | current_toot_with_media = None 21 | current_emoji = None 22 | current_url = None 23 | current_tag = None 24 | 25 | earliest_followers = float('inf') 26 | old_time = "00:00:00" 27 | 28 | engagement_rate_sum = 0.0000 29 | croissance_sum = 0 30 | language_sum = 0 31 | toot_with_media_sum = 0 32 | emoji_sum = 0 33 | url_sum = 0 34 | tag_sum = 0 35 | count = 1 36 | 37 | unique_users_roissance = [] 38 | 39 | user_dict = {} 40 | croissance_dict = {} 41 | language_dict = {} 42 | toot_with_media_dict = {} 43 | emoji_dict = {} 44 | url_dict = {} 45 | tag_dict = {} 46 | 47 | for line in sys.stdin: 48 | key = line.strip().split('\t')[0] 49 | key = key.strip() 50 | # if key starts with user, it's a user record 51 | if key.startswith("user"): 52 | user, strdata = line.strip().split('\t') 53 | # string to dict 54 | data = eval(strdata.strip()) 55 | followers = int(data["followers"]) 56 | time = data['date'] 57 | engagement_rate = float(data["engagement_rate"]) 58 | 59 | if current_user == user: 60 | if time > old_time: 61 | old_time = time 62 | earliest_followers = followers 63 | engagement_rate_sum += engagement_rate 64 | count += 1 65 | else: 66 | if current_user is not None: 67 | if count > 0: 68 | engagement_rate_avg = engagement_rate_sum / count 69 | user_dict["engagement_rate"] = engagement_rate_avg 70 | user_dict["followers"] = earliest_followers 71 | print(f"{current_user}\t{user_dict}") 72 | current_user = user 73 | earliest_followers = followers 74 | old_time = time 75 | engagement_rate_sum = engagement_rate 76 | count = 1 77 | 78 | # if key starts with croissance, it's a croissance record 79 | elif key.startswith("croissance"): 80 | croissance, strdata = line.strip().split('\t') 81 | data = eval(strdata.strip()) 82 | user_id = data["user_id"] 83 | if croissance == current_croissance: 84 | if current_croissance is not None: 85 | if user_id not in unique_users_roissance: 86 | unique_users_roissance.append(user_id) 87 | croissance_sum += data["value"] 88 | elif croissance != current_croissance: 89 | if current_croissance is not None: 90 | croissance_dict["count"] = croissance_sum 91 | print(f"{current_croissance}\t{croissance_dict}") 92 | current_croissance = croissance 93 | croissance_sum = 1 94 | unique_users_roissance = [user_id] 95 | 96 | #if key starts with language, it's a language record 97 | elif key.startswith("language"): 98 | language, strdata = line.strip().split('\t') 99 | data = eval(strdata.strip()) 100 | if language == current_language: 101 | if current_language is not None: 102 | language_sum += data["value"] 103 | elif language != current_language: 104 | if current_language is not None: 105 | language_dict["count"] = language_sum 106 | 
print(f"{current_language}\t{language_dict}") 107 | current_language = language 108 | language_sum = 1 109 | 110 | elif key.startswith("toot_with_media"): 111 | toot_with_media, strdata = line.strip().split('\t') 112 | data = eval(strdata.strip()) 113 | if toot_with_media == current_toot_with_media: 114 | if current_toot_with_media is not None: 115 | toot_with_media_sum += data["value"] 116 | elif toot_with_media != current_toot_with_media: 117 | if current_toot_with_media is not None: 118 | toot_with_media_dict["count"] = toot_with_media_sum 119 | print(f"{current_toot_with_media}\t{toot_with_media_dict}") 120 | current_toot_with_media = toot_with_media 121 | toot_with_media_sum = 1 122 | 123 | elif key.startswith("emoji"): 124 | emoji_id , strdata = line.strip().split('\t') 125 | data = eval(strdata.strip()) 126 | if emoji_id == current_emoji: 127 | if current_emoji is not None: 128 | emoji_sum += data["value"] 129 | elif emoji_id != current_emoji: 130 | if current_emoji is not None: 131 | emoji_dict["count"] = emoji_sum 132 | print(f"{current_emoji}\t{emoji_dict}") 133 | current_emoji = emoji_id 134 | emoji_sum = 1 135 | 136 | elif key.startswith("website"): 137 | website_id , strdata = line.strip().split('\t') 138 | data = eval(strdata.strip()) 139 | if website_id == current_url: 140 | if current_url is not None: 141 | url_sum += data["value"] 142 | elif website_id != current_url: 143 | if current_url is not None: 144 | url_dict["count"] = url_sum 145 | print(f"{current_url}\t{url_dict}") 146 | current_url = website_id 147 | url_sum = 1 148 | 149 | elif key.startswith("tag"): 150 | tag_id , strdata = line.strip().split('\t') 151 | data = eval(strdata.strip()) 152 | if tag_id == current_tag: 153 | if current_tag is not None: 154 | tag_sum += data["value"] 155 | elif tag_id != current_tag: 156 | if current_tag is not None: 157 | tag_dict["count"] = tag_sum 158 | print(f"{current_tag}\t{tag_dict}") 159 | current_tag = tag_id 160 | tag_sum = 1 161 | 162 | # Print the last data 163 | if current_croissance is not None: 164 | croissance_dict["count"] = croissance_sum 165 | print(f"{current_croissance}\t{croissance_dict}") 166 | 167 | if current_user is not None: 168 | if count > 0: 169 | engagement_rate_avg = engagement_rate_sum / count 170 | print(f"{current_user}\t{user_dict}") 171 | 172 | if current_language is not None: 173 | language_dict["count"] = language_sum 174 | print(f"{current_language}\t{language_dict}") 175 | 176 | if current_toot_with_media is not None: 177 | toot_with_media_dict["count"] = toot_with_media_sum 178 | print(f"{current_toot_with_media}\t{toot_with_media_dict}") 179 | 180 | if current_emoji is not None: 181 | emoji_dict["count"] = emoji_sum 182 | print(f"{current_emoji}\t{emoji_dict}") 183 | 184 | if current_url is not None: 185 | url_dict["count"] = url_sum 186 | print(f"{current_url}\t{url_dict}") 187 | 188 | if current_tag is not None: 189 | tag_dict["count"] = tag_sum 190 | print(f"{current_tag}\t{tag_dict}") -------------------------------------------------------------------------------- /loadingIntoHbase/insertion.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from hdfs import InsecureClient 3 | import json 4 | import happybase 5 | 6 | 7 | 8 | 9 | # define a function to process the data from the hdf file 10 | def insert_data_into_hbase(data_folder_path): 11 | 12 | # HBase connection settings 13 | hbase_host = 'localhost' # Replace with your HBase host 14 | hbase_port = 9090 # Default HBase port 15 | # 
Connect to HBase 16 | connection = happybase.Connection(host=hbase_host, port=hbase_port) 17 | 18 | tables_list = ["user_table", "croissance_table", "language_table", "toot_with_media_table", "emoji_table", "url_table", "tag_table"] 19 | tables = connection.tables() 20 | # remove the b' and ' from the table names 21 | tables = [table_name.decode() for table_name in tables] 22 | 23 | # Create the tables if they do not exist 24 | for table_name in tables_list: 25 | if table_name not in tables: 26 | connection.create_table(table_name, {'data': dict()}) 27 | # Get the tables 28 | user_table = connection.table('user_table') 29 | croissance_table = connection.table('croissance_table') 30 | language_table = connection.table('language_table') 31 | toot_with_media_table = connection.table('toot_with_media_table') 32 | emoji_table = connection.table('emoji_table') 33 | url_table = connection.table('url_table') 34 | tag_table = connection.table('tag_table') 35 | 36 | # Initialize an HDFS client 37 | hdfs_client = InsecureClient('http://localhost:9870', user='hadoop') 38 | # get files from the data folder that start with part 39 | files = hdfs_client.list(data_folder_path) 40 | # get oly the files that start with part 41 | files = [file for file in files if file.startswith('part')] 42 | # loop through the files 43 | for file in files: 44 | # get the file path 45 | file_path = data_folder_path + '/' + file 46 | # open the file 47 | with hdfs_client.read(file_path, encoding='utf-8') as reader: 48 | # read the file 49 | data = reader.read() 50 | # split the file into lines 51 | lines = data.splitlines() 52 | # loop through the lines 53 | for line in lines: 54 | # split the line into key and value 55 | key, value = line.strip().split('\t') 56 | # split the key into key and id 57 | # convert the value to a dictionary 58 | value = eval(value.strip()) 59 | # if the key is user 60 | if key.startswith('user'): 61 | key, id = key.strip().split(':') 62 | # get the engagement rate 63 | engagement_rate = value['engagement_rate'] 64 | # get the followers 65 | followers = value['followers'] 66 | user_dict = { 67 | "engagement_rate": engagement_rate, 68 | "followers": followers 69 | } 70 | try: 71 | # Store the user data in HBase 72 | user_table.put(str(id).encode(), {b'data:engagement_rate': str(user_dict["engagement_rate"]).encode()}) 73 | user_table.put(str(id).encode(), {b'data:followers': str(user_dict["followers"]).encode()}) 74 | except Exception as e: 75 | # Log exceptions to standard error 76 | print(f"Error: {str(e)}") 77 | elif key.startswith('croissance'): 78 | key, id = key.strip().split(':') 79 | croissance_dict = { 80 | "count": value['count'] 81 | } 82 | try: 83 | # Store the user data in HBase 84 | croissance_table.put(str(id).encode(), {b'data:count': str(croissance_dict["count"]).encode()}) 85 | except Exception as e: 86 | # Log exceptions to standard error 87 | print(f"Error: {str(e)}") 88 | elif key.startswith('language'): 89 | key, id = key.strip().split(':') 90 | language_dict = { 91 | "count": value['count'] 92 | } 93 | try: 94 | # Store the user data in HBase 95 | language_table.put(str(id).encode(), {b'data:count': str(language_dict["count"]).encode()}) 96 | except Exception as e: 97 | # Log exceptions to standard error 98 | print(f"Error: {str(e)}") 99 | elif key.startswith('toot_with_media'): 100 | id = key.strip() 101 | toot_with_media_dict = { 102 | "count": value['count'] 103 | } 104 | try: 105 | # Store the user data in HBase 106 | toot_with_media_table.put(str(id).encode(), 
{b'data:count': str(toot_with_media_dict["count"]).encode()}) 107 | except Exception as e: 108 | # Log exceptions to standard error 109 | print(f"Error: {str(e)}") 110 | elif key.startswith('emoji'): 111 | key, id = key.strip().split(':') 112 | emoji_dict = { 113 | "count": value['count'] 114 | } 115 | try: 116 | # Store the user data in HBase 117 | emoji_table.put(str(id).encode(), {b'data:count': str(emoji_dict["count"]).encode()}) 118 | except Exception as e: 119 | # Log exceptions to standard error 120 | print(f"Error: {str(e)}") 121 | elif key.startswith('website'): 122 | key, id = key.strip().split(':') 123 | url_dict = { 124 | "count": value['count'] 125 | } 126 | try: 127 | # Store the user data in HBase 128 | url_table.put(str(id).encode(), {b'data:count': str(url_dict["count"]).encode()}) 129 | except Exception as e: 130 | # Log exceptions to standard error 131 | print(f"Error: {str(e)}") 132 | elif key.startswith('tag'): 133 | key, id = key.strip().split(':') 134 | tag_dict = { 135 | "count": value['count'] 136 | } 137 | try: 138 | # Store the user data in HBase 139 | tag_table.put(str(id).encode(), {b'data:count': str(tag_dict["count"]).encode()}) 140 | except Exception as e: 141 | # Log exceptions to standard error 142 | print(f"Error: {str(e)}") 143 | 144 | 145 | connection.close() 146 | 147 | 148 | if __name__ == "__main__": 149 | # Get the data path from the command line 150 | data_path = sys.argv[1] 151 | # Process the data 152 | insert_data_into_hbase(data_path) -------------------------------------------------------------------------------- /analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 4, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import happybase" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "# User Analysis\n", 17 | "---" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "## Top 3 Followed users" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 21, 30 | "metadata": {}, 31 | "outputs": [ 32 | { 33 | "name": "stdout", 34 | "output_type": "stream", 35 | "text": [ 36 | "User ID: 109999724950289416, Followers: 3340001\n", 37 | "User ID: 109999725002952105, Followers: 2410000\n", 38 | "User ID: 109718180885037597, Followers: 2330004\n" 39 | ] 40 | } 41 | ], 42 | "source": [ 43 | "# Connect to HBase\n", 44 | "# HBase connection settings\n", 45 | "hbase_host = 'localhost' # Replace with your HBase host\n", 46 | "hbase_port = 9090 # Default HBase port\n", 47 | "# Connect to HBase\n", 48 | "connection = happybase.Connection(host=hbase_host, port=hbase_port)\n", 49 | "\n", 50 | "table = connection.table('user_table')\n", 51 | "\n", 52 | "# Scan the table to retrieve user data\n", 53 | "scan_result = table.scan(columns=['data:followers'])\n", 54 | "\n", 55 | "# Create a dictionary to store user followers\n", 56 | "user_followers = {}\n", 57 | "\n", 58 | "# Process the scan result\n", 59 | "for key, data in scan_result:\n", 60 | " user_id = key.decode('utf-8')\n", 61 | " followers = int(data[b'data:followers'].decode('utf-8'))\n", 62 | " user_followers[user_id] = followers\n", 63 | "\n", 64 | "# Close the HBase connection\n", 65 | "connection.close()\n", 66 | "\n", 67 | "# Sort the users by followers and get the top users\n", 68 | "top_users = sorted(user_followers.items(), key=lambda item: item[1], reverse=True)[:3]\n", 69 
| "\n", 70 | "# Print the top users and their followers\n", 71 | "for user_id, followers in top_users:\n", 72 | " print(f\"User ID: {user_id}, Followers: {followers}\")\n" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "## Users with highest engagement rate" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 27, 85 | "metadata": {}, 86 | "outputs": [ 87 | { 88 | "name": "stdout", 89 | "output_type": "stream", 90 | "text": [ 91 | "User ID: 110643001753379072, Engagement Rate: 19.05 %\n", 92 | "User ID: 111243750378556518, Engagement Rate: 7.14 %\n", 93 | "User ID: 110657506981999016, Engagement Rate: 5.62 %\n" 94 | ] 95 | } 96 | ], 97 | "source": [ 98 | "# Connect to HBase\n", 99 | "# HBase connection settings\n", 100 | "hbase_host = 'localhost' # Replace with your HBase host\n", 101 | "hbase_port = 9090 # Default HBase port\n", 102 | "# Connect to HBase\n", 103 | "connection = happybase.Connection(host=hbase_host, port=hbase_port)\n", 104 | "\n", 105 | "# Select the table\n", 106 | "table = connection.table('user_table')\n", 107 | "\n", 108 | "# Scan the table to retrieve user data\n", 109 | "scan_result = table.scan(columns=['data:engagement_rate'])\n", 110 | "\n", 111 | "# Create a dictionary to store user engagement rates\n", 112 | "user_engagement = {}\n", 113 | "\n", 114 | "# Process the scan result\n", 115 | "for key, data in scan_result:\n", 116 | " user_id = key.decode('utf-8')\n", 117 | " engagement_rate = float(data[b'data:engagement_rate'].decode('utf-8'))\n", 118 | " user_engagement[user_id] = engagement_rate\n", 119 | "\n", 120 | "# Close the HBase connection\n", 121 | "connection.close()\n", 122 | "\n", 123 | "# Sort the users by engagement rate and get the top users\n", 124 | "top_users = sorted(user_engagement.items(), key=lambda item: item[1], reverse=True)[:3]\n", 125 | "\n", 126 | "# Print the top users and their engagement rates\n", 127 | "for user_id, engagement_rate in top_users:\n", 128 | " eng = float(engagement_rate) * 100\n", 129 | " print(f\"User ID: {user_id}, Engagement Rate: {eng:.2f} %\")" 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": {}, 135 | "source": [ 136 | "## User growth over time" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 28, 142 | "metadata": {}, 143 | "outputs": [ 144 | { 145 | "name": "stdout", 146 | "output_type": "stream", 147 | "text": [ 148 | "Month: 2022-11, User Count: 45\n", 149 | "Month: 2022-12, User Count: 14\n", 150 | "Month: 2022-04, User Count: 13\n" 151 | ] 152 | } 153 | ], 154 | "source": [ 155 | "# Connect to HBase\n", 156 | "# HBase connection settings\n", 157 | "hbase_host = 'localhost' # Replace with your HBase host\n", 158 | "hbase_port = 9090 # Default HBase port\n", 159 | "# Connect to HBase\n", 160 | "connection = happybase.Connection(host=hbase_host, port=hbase_port)\n", 161 | "\n", 162 | "# Select the table\n", 163 | "table = connection.table('croissance_table')\n", 164 | "\n", 165 | "# Scan the table to retrieve data\n", 166 | "scan_result = table.scan(columns=['data:count'])\n", 167 | "\n", 168 | "# Create a dictionary to store month-wise user counts\n", 169 | "month_user_counts = {}\n", 170 | "\n", 171 | "# Process the scan result\n", 172 | "for key, data in scan_result:\n", 173 | " month = key.decode('utf-8')\n", 174 | " count = int(data[b'data:count'].decode('utf-8'))\n", 175 | " month_user_counts[month] = count\n", 176 | "\n", 177 | "# Close the HBase connection\n", 178 | 
"connection.close()\n", 179 | "\n", 180 | "# Sort the months by user counts and get the top 3\n", 181 | "top_months = sorted(month_user_counts.items(), key=lambda item: item[1], reverse=True)[:3]\n", 182 | "\n", 183 | "# Print the top 3 months and their user counts\n", 184 | "for month, count in top_months:\n", 185 | " print(f\"Month: {month}, User Count: {count}\")" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": {}, 191 | "source": [ 192 | "# Content analysis\n", 193 | "---" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "metadata": {}, 199 | "source": [ 200 | "## Top 3 websites" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 29, 206 | "metadata": {}, 207 | "outputs": [ 208 | { 209 | "name": "stdout", 210 | "output_type": "stream", 211 | "text": [ 212 | "Website: mastodon.social, Mention Count: 137\n", 213 | "Website: www.telam.com.ar, Mention Count: 70\n", 214 | "Website: twitter.com, Mention Count: 58\n" 215 | ] 216 | } 217 | ], 218 | "source": [ 219 | "# Connect to HBase\n", 220 | "# HBase connection settings\n", 221 | "hbase_host = 'localhost' # Replace with your HBase host\n", 222 | "hbase_port = 9090 # Default HBase port\n", 223 | "# Connect to HBase\n", 224 | "connection = happybase.Connection(host=hbase_host, port=hbase_port)\n", 225 | "\n", 226 | "# Select the table\n", 227 | "table = connection.table('url_table')\n", 228 | "\n", 229 | "# Scan the table to retrieve data\n", 230 | "scan_result = table.scan(columns=['data:count'])\n", 231 | "\n", 232 | "# Create a dictionary to store website mention counts\n", 233 | "website_counts = {}\n", 234 | "\n", 235 | "# Process the scan result\n", 236 | "for key, data in scan_result:\n", 237 | " website = key.decode('utf-8')\n", 238 | " count = int(data[b'data:count'].decode('utf-8'))\n", 239 | " website_counts[website] = count\n", 240 | "\n", 241 | "# Close the HBase connection\n", 242 | "connection.close()\n", 243 | "\n", 244 | "# Sort the websites by mention counts and get the top 3\n", 245 | "top_websites = sorted(website_counts.items(), key=lambda item: item[1], reverse=True)[:3]\n", 246 | "\n", 247 | "# Print the top 3 websites and their mention counts\n", 248 | "for website, count in top_websites:\n", 249 | " print(f\"Website: {website}, Mention Count: {count}\")" 250 | ] 251 | }, 252 | { 253 | "cell_type": "markdown", 254 | "metadata": {}, 255 | "source": [ 256 | "# Language analysis\n", 257 | "---" 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "metadata": {}, 263 | "source": [ 264 | "## Most used languages" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 30, 270 | "metadata": {}, 271 | "outputs": [ 272 | { 273 | "name": "stdout", 274 | "output_type": "stream", 275 | "text": [ 276 | "Language: en, Count: 1677\n", 277 | "Language: de, Count: 313\n", 278 | "Language: es, Count: 115\n" 279 | ] 280 | } 281 | ], 282 | "source": [ 283 | "# Connect to HBase\n", 284 | "# HBase connection settings\n", 285 | "hbase_host = 'localhost' # Replace with your HBase host\n", 286 | "hbase_port = 9090 # Default HBase port\n", 287 | "# Connect to HBase\n", 288 | "connection = happybase.Connection(host=hbase_host, port=hbase_port)\n", 289 | "\n", 290 | "\n", 291 | "# Select the table\n", 292 | "table = connection.table('language_table')\n", 293 | "\n", 294 | "# Scan the table to retrieve language data\n", 295 | "scan_result = table.scan(columns=['data:count'])\n", 296 | "\n", 297 | "# Create a dictionary to store language counts\n", 298 | 
"language_counts = {}\n", 299 | "\n", 300 | "# Process the scan result\n", 301 | "for key, data in scan_result:\n", 302 | " language = key.decode('utf-8')\n", 303 | " count = int(data[b'data:count'].decode('utf-8'))\n", 304 | " language_counts[language] = count\n", 305 | "\n", 306 | "# Close the HBase connection\n", 307 | "connection.close()\n", 308 | "\n", 309 | "# Sort the language counts and get the top 3\n", 310 | "top_languages = sorted(language_counts.items(), key=lambda item: item[1], reverse=True)[:3]\n", 311 | "\n", 312 | "# Print the top 3 languages and their counts\n", 313 | "for language, count in top_languages:\n", 314 | " print(f\"Language: {language}, Count: {count}\")" 315 | ] 316 | }, 317 | { 318 | "cell_type": "markdown", 319 | "metadata": {}, 320 | "source": [ 321 | "# Media engagement\n", 322 | "---" 323 | ] 324 | }, 325 | { 326 | "cell_type": "markdown", 327 | "metadata": {}, 328 | "source": [ 329 | "## Count of posts with media" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": 35, 335 | "metadata": {}, 336 | "outputs": [ 337 | { 338 | "name": "stdout", 339 | "output_type": "stream", 340 | "text": [ 341 | "There is 619 Posts with media\n" 342 | ] 343 | } 344 | ], 345 | "source": [ 346 | "# Connect to HBase\n", 347 | "# HBase connection settings\n", 348 | "hbase_host = 'localhost' # Replace with your HBase host\n", 349 | "hbase_port = 9090 # Default HBase port\n", 350 | "# Connect to HBase\n", 351 | "connection = happybase.Connection(host=hbase_host, port=hbase_port)\n", 352 | "\n", 353 | "# Select the table\n", 354 | "table = connection.table('toot_with_media_table')\n", 355 | "\n", 356 | "# Specify the row key you want to retrieve\n", 357 | "row_key = b'toot_with_media' # Use bytes for the row key\n", 358 | "\n", 359 | "# Use the get method to retrieve the row\n", 360 | "row_data = table.row(row_key)\n", 361 | "\n", 362 | "# Close the HBase connection\n", 363 | "connection.close()\n", 364 | "\n", 365 | "# Print or process the retrieved data\n", 366 | "if row_data:\n", 367 | " for column, cell in row_data.items():\n", 368 | " print(f\"There is {cell.decode()} Posts with media\")\n", 369 | "else:\n", 370 | " print(f\"Row with key '{row_key.decode('utf-8')}' not found in the table.\")" 371 | ] 372 | }, 373 | { 374 | "cell_type": "markdown", 375 | "metadata": {}, 376 | "source": [ 377 | "# Tags and mentions analysis\n", 378 | "---" 379 | ] 380 | }, 381 | { 382 | "cell_type": "markdown", 383 | "metadata": {}, 384 | "source": [ 385 | "## Most used tags" 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": 38, 391 | "metadata": {}, 392 | "outputs": [ 393 | { 394 | "name": "stdout", 395 | "output_type": "stream", 396 | "text": [ 397 | "Tag: michigan, Count: 56\n", 398 | "Tag: press, Count: 46\n", 399 | "Tag: genocide, Count: 42\n" 400 | ] 401 | } 402 | ], 403 | "source": [ 404 | "# Connect to HBase\n", 405 | "# HBase connection settings\n", 406 | "hbase_host = 'localhost' # Replace with your HBase host\n", 407 | "hbase_port = 9090 # Default HBase port\n", 408 | "# Connect to HBase\n", 409 | "connection = happybase.Connection(host=hbase_host, port=hbase_port)\n", 410 | "\n", 411 | "\n", 412 | "# Select the table\n", 413 | "table = connection.table('tag_table')\n", 414 | "\n", 415 | "# Scan the table to retrieve data\n", 416 | "scan_result = table.scan(columns=['data:count'])\n", 417 | "\n", 418 | "# Create a dictionary to store tag counts\n", 419 | "tag_counts = {}\n", 420 | "\n", 421 | "# Process the scan result\n", 422 | 
"for key, data in scan_result:\n", 423 | " tag = key.decode('utf-8')\n", 424 | " count = int(data[b'data:count'].decode('utf-8'))\n", 425 | " tag_counts[tag] = count\n", 426 | "\n", 427 | "# Close the HBase connection\n", 428 | "connection.close()\n", 429 | "\n", 430 | "# Sort the tags by count and get the top 3\n", 431 | "top_tags = sorted(tag_counts.items(), key=lambda item: item[1], reverse=True)[:3]\n", 432 | "\n", 433 | "# Print the top 3 tags and their counts\n", 434 | "for tag, count in top_tags:\n", 435 | " print(f\"Tag: {tag}, Count: {count}\")" 436 | ] 437 | } 438 | ], 439 | "metadata": { 440 | "kernelspec": { 441 | "display_name": "Python 3", 442 | "language": "python", 443 | "name": "python3" 444 | }, 445 | "language_info": { 446 | "codemirror_mode": { 447 | "name": "ipython", 448 | "version": 3 449 | }, 450 | "file_extension": ".py", 451 | "mimetype": "text/x-python", 452 | "name": "python", 453 | "nbconvert_exporter": "python", 454 | "pygments_lexer": "ipython3", 455 | "version": "3.10.12" 456 | } 457 | }, 458 | "nbformat": 4, 459 | "nbformat_minor": 2 460 | } 461 | --------------------------------------------------------------------------------