├── .gitignore ├── dataCollection ├── last_toot_id.txt ├── __pycache__ │ └── loadData.cpython-39.pyc └── loadData.py ├── requirements.txt ├── images ├── gantt.PNG ├── airflow2.PNG └── workflow.png ├── GDPR Compliance.pdf ├── Commands.txt ├── mapReduce └── python │ ├── mapper.py │ └── reducer.py ├── airFlowDAG └── mastadon_dag.py ├── README.md ├── loadingIntoHbase └── insertion.py └── analysis.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | # Environment file 2 | .env -------------------------------------------------------------------------------- /dataCollection/last_toot_id.txt: -------------------------------------------------------------------------------- 1 | 111291292042375273 -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Mastodon.py 2 | python-dotenv 3 | hdfs 4 | pandas 5 | happybase -------------------------------------------------------------------------------- /images/gantt.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SAAD-BEN/Mastadon_data_analysis_Airflow_Hadoop_Hbase/HEAD/images/gantt.PNG -------------------------------------------------------------------------------- /GDPR Compliance.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SAAD-BEN/Mastadon_data_analysis_Airflow_Hadoop_Hbase/HEAD/GDPR Compliance.pdf -------------------------------------------------------------------------------- /images/airflow2.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SAAD-BEN/Mastadon_data_analysis_Airflow_Hadoop_Hbase/HEAD/images/airflow2.PNG -------------------------------------------------------------------------------- /images/workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SAAD-BEN/Mastadon_data_analysis_Airflow_Hadoop_Hbase/HEAD/images/workflow.png -------------------------------------------------------------------------------- /dataCollection/__pycache__/loadData.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SAAD-BEN/Mastadon_data_analysis_Airflow_Hadoop_Hbase/HEAD/dataCollection/__pycache__/loadData.cpython-39.pyc -------------------------------------------------------------------------------- /Commands.txt: -------------------------------------------------------------------------------- 1 | sudo pip install -r /home/project/Mastadon_data_analysis_Airflow_Hadoop_Hbase/requirements.txt 2 | sudo python3 /home/project/Mastadon_data_analysis_Airflow_Hadoop_Hbase/dataCollection/loadData.py 3 | 4 | hadoop fs -rm -r /raw/ 5 | 6 | # Job run 7 | hadoop jar /usr/local/hadoop/share/hadoop/tools/lib/hadoop-streaming-3.3.6.jar -mapper /home/project/Mastadon_data_analysis_Airflow_Hadoop_Hbase/mapReduce/python/mapper.py -reducer /home/project/Mastadon_data_analysis_Airflow_Hadoop_Hbase/mapReduce/python/reducer.py -input /raw/2023-10-21/19-0-posts.json -output /test/anass32/ 8 | hadoop fs -cat /test/anass30/part-00000 9 | 10 | #hbase 11 | /usr/local/Hbase/bin/start-hbase.sh 12 | /usr/local/Hbase/bin/stop-hbase.sh 13 | 14 | /usr/local/Hbase/bin/hbase-daemon.sh start thrift 15 | /usr/local/Hbase/bin/hbase-daemon.sh stop thrift 16 | 17 
| sudo /usr/local/Hbase/bin/hbase shell 18 | 19 | # Airflow 20 | /AirFlow/airflow-environment/airflow scheduler 21 | /AirFlow/airflow-environment/airflow webserver -p 8080 22 | 23 | 24 | mapred streaming -files /home/project/Mastadon_data_analysis_Airflow_Hadoop_Hbase/mapReduce/python/mapper.py,/home/project/Mastadon_data_analysis_Airflow_Hadoop_Hbase/mapReduce/python/reducer.py -mapper mapper.py -reducer reducer.py -input /raw/2023-10-21/19-0-posts.json -------------------------------------------------------------------------------- /mapReduce/python/mapper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import json 3 | import sys 4 | import re 5 | from urllib.parse import urlparse 6 | 7 | def process_data(input_data): 8 | pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+' 9 | 10 | for line in input_data: 11 | try: 12 | data = json.loads(line) 13 | toot_with_media_dict = {} 14 | toot_with_media_id = 'toot_with_media' 15 | 16 | created_at = data["created_at"].split(' ')[1].split('+')[0] 17 | account = data.get("account") 18 | media_attachments = data.get("media_attachments") 19 | emojis = data.get("emojis") if data.get("emojis") else [] 20 | websites = data.get("content") if data.get("content") else [] 21 | tags = data.get("tags") if data.get("tags") else [] 22 | 23 | if account: 24 | user_id = 'user:' + str(account.get('id')) 25 | followers = int(account.get('followers_count', 0)) 26 | reblogs_count = data.get('reblogs_count', 0) 27 | favourites_count = data.get('favourites_count', 0) 28 | 29 | engagement_rate = (reblogs_count + favourites_count) / followers if followers > 0 else 0 30 | 31 | croissance_id = "croissance:" + account.get('created_at').split('-')[0] + '-' + account.get('created_at').split('-')[1] 32 | 33 | user_data = { 34 | "date": created_at, 35 | "followers": followers, 36 | "engagement_rate": engagement_rate 37 | } 38 | 39 | croissance_data = {"value": 1, "user_id": user_id} 40 | 41 | # Emit key-value pairs for the reducer 42 | print(f"{user_id}\t{user_data}") 43 | print(f"{croissance_id}\t{croissance_data}") 44 | 45 | if media_attachments: 46 | toot_with_media_dict["value"] = 1 47 | print(f"{toot_with_media_id}\t{toot_with_media_dict}") 48 | 49 | language_id = "language:" + data.get('language') 50 | language_data = {"value": 1} 51 | print(f"{language_id}\t{language_data}") 52 | 53 | if emojis != []: 54 | for emoji in emojis: 55 | emoji_id = "emoji:" + emoji.get('shortcode') 56 | emoji_data = {"value": 1} 57 | print(f"{emoji_id}\t{emoji_data}") 58 | 59 | if websites != []: 60 | urls = re.search(pattern, websites) 61 | if urls: 62 | website_id = "website:" + urlparse(urls.group(0)).netloc 63 | website_data = {"value": 1} 64 | print(f"{website_id}\t{website_data}") 65 | 66 | if tags != []: 67 | for tag in tags: 68 | tag_id = "tag:" + tag.get('name') 69 | tag_data = {"value": 1} 70 | print(f"{tag_id}\t{tag_data}") 71 | 72 | except Exception as e: 73 | # Log exceptions to standard error 74 | print(f"Error: {str(e)}", file=sys.stderr) 75 | 76 | # Example usage of the process_data function 77 | if __name__ == "__main__": 78 | input_data = sys.stdin 79 | process_data(input_data) 80 | -------------------------------------------------------------------------------- /airFlowDAG/mastadon_dag.py: -------------------------------------------------------------------------------- 1 | from airflow import DAG 2 | from airflow.operators.python_operator import PythonOperator 3 | from 
airflow.models import Variable 4 | from datetime import datetime, timedelta 5 | from airflow.utils.email import send_email # Airflow's e-mail helper (uses the SMTP settings from airflow.cfg) 6 | import subprocess 7 | import sys 8 | sys.path.insert(0, '/home/project/Mastadon_data_analysis_Airflow_Hadoop_Hbase/dataCollection') 9 | sys.path.insert(0, '/home/project/Mastadon_data_analysis_Airflow_Hadoop_Hbase/loadingIntoHbase') 10 | from loadData import retrieve_and_save_mastodon_data 11 | from insertion import insert_data_into_hbase 12 | 13 | def success_callback(context): 14 | send_email( 15 | to=["saad.bentaleb08@gmail.com"], 16 | subject="Mastodon Data Pipeline Succeeded", 17 | html_content="The Mastodon Data Pipeline has succeeded." 18 | ) 19 | 20 | def failure_callback(context): 21 | send_email( 22 | to=["saad.bentaleb08@gmail.com"], 23 | subject="Mastodon Data Pipeline Failed", 24 | html_content="The Mastodon Data Pipeline has failed." 25 | ) 26 | 27 | default_args = { 28 | 'owner': 'admin', 29 | 'start_date': datetime(2023, 10, 23), 30 | 'retries': 0, # Set to 0 to disable retries for the DAG 31 | } 32 | 33 | with DAG('mastodon_data_pipeline1', default_args=default_args, schedule_interval='@daily', on_success_callback=success_callback, on_failure_callback=failure_callback) as dag: 34 | def set_data_path(**kwargs): 35 | data_path = retrieve_and_save_mastodon_data() # Run the data collection function 36 | processed_path = '/processed/' + datetime.now().strftime('%Y-%m-%d/%H-%M') + '/' 37 | Variable.set("data_path", data_path) 38 | Variable.set("processed_path", processed_path) 39 | 40 | # Create PythonOperator tasks 41 | retrieve_and_save_mastodon_data_task = PythonOperator( 42 | task_id='mastodon_data_pipeline1', 43 | provide_context=True, 44 | python_callable=set_data_path, 45 | dag=dag, 46 | ) 47 | 48 | def run_map_reduce(**kwargs): 49 | data_path = Variable.get("data_path") # Retrieve the data path from the variable 50 | output_path = Variable.get("processed_path") # Retrieve the processed path from the variable 51 | # Use subprocess to run Hadoop MapReduce job with the provided data path 52 | hadoop_command = f"hadoop jar /usr/local/hadoop/share/hadoop/tools/lib/hadoop-streaming-3.3.6.jar " \ 53 | f"-mapper /home/project/Mastadon_data_analysis_Airflow_Hadoop_Hbase/mapReduce/python/mapper.py " \ 54 | f"-reducer /home/project/Mastadon_data_analysis_Airflow_Hadoop_Hbase/mapReduce/python/reducer.py " \ 55 | f"-input {data_path} " \ 56 | f"-output {output_path}" 57 | subprocess.run(hadoop_command, shell=True) 58 | 59 | run_map_reduce_task = PythonOperator( 60 | task_id='run_map_reduce', 61 | provide_context=True, 62 | python_callable=run_map_reduce, 63 | dag=dag, 64 | ) 65 | 66 | def run_hbase_insertion(**kwargs): 67 | processed_path = Variable.get("processed_path") 68 | insert_data_into_hbase(processed_path) 69 | 70 | run_hbase_insertion_task = PythonOperator( 71 | task_id='run_hbase_insertion', 72 | provide_context=True, 73 | python_callable=run_hbase_insertion, 74 | dag=dag, 75 | ) 76 | 77 | retrieve_and_save_mastodon_data_task >> run_map_reduce_task >> run_hbase_insertion_task 78 | 79 | if __name__ == "__main__": 80 | dag.cli() 81 | 82 | -------------------------------------------------------------------------------- /dataCollection/loadData.py: -------------------------------------------------------------------------------- 1 | from mastodon import Mastodon 2 | from dotenv import load_dotenv 3 | import os 4 | from hdfs import InsecureClient 5 | import datetime 6 | import time 7 | import json 8 | 9 | load_dotenv() 10 | 11 | def retrieve_and_save_mastodon_data(): 12 | # Connect to the Mastodon API 13 |
mastodon = Mastodon( 14 | client_id=os.getenv('Client_key'), 15 | client_secret=os.getenv('Client_secret'), 16 | access_token=os.getenv('Access_token'), 17 | api_base_url="https://mastodon.social" 18 | ) 19 | 20 | # Initialize an HDFS client 21 | hdfs_client = InsecureClient('http://localhost:9870', user='hadoop') 22 | 23 | # Get the current date and time 24 | now = datetime.datetime.now() 25 | directory_path = '/raw/' + str(now.year) + '-' + str(now.month) + '-' + str(now.day) 26 | 27 | # Check if the directory already exists 28 | if not hdfs_client.status(directory_path, strict=False): 29 | hdfs_client.makedirs(directory_path) 30 | 31 | # Define the HDFS path where you want to save the data 32 | hdfs_path = directory_path + '/' + str(now.hour) + '-' + str(now.minute) 33 | 34 | # Retrieve the last toot ID from a local file or start with None 35 | try: 36 | with open('/home/project/Mastadon_data_analysis_Airflow_Hadoop_Hbase/dataCollection/last_toot_id.txt', 'r', encoding='utf-8') as reader: 37 | last_toot_id = reader.read().strip() 38 | except FileNotFoundError: 39 | last_toot_id = None 40 | 41 | public_posts = [] 42 | 43 | # Specify the duration of collecting data in minutes 44 | duration = 20 # Seconds 45 | 46 | # Get the current time 47 | start_time = time.time() 48 | 49 | while True: 50 | # Check if 10 minutes have passed 51 | if time.time() - start_time >= duration: 52 | break # Retrieve public posts 53 | 54 | new = mastodon.timeline_public(limit=40, since_id=last_toot_id) 55 | 56 | # Append the current run's public posts to the list 57 | public_posts.extend(new) 58 | print(f'Number of posts retrieved: {str(len(public_posts))}', end='\r') 59 | 60 | # Update the last_toot_id 61 | if public_posts: 62 | latest_toot = public_posts[0] 63 | last_toot_id = str(latest_toot['id']) 64 | 65 | class CustomJSONEncoder(json.JSONEncoder): 66 | def default(self, o): 67 | if isinstance(o, datetime.datetime): 68 | # Convert datetime to a string representation 69 | return o.strftime('%Y-%m-%d %H:%M:%S %z') 70 | elif hasattr(o, '__dict__'): 71 | # Handle other objects with __dict__ attribute 72 | return o.__dict__ 73 | return super().default(o) 74 | 75 | formatted_data = [] 76 | for obj in public_posts: 77 | formatted_obj = json.dumps(obj, separators=(',', ':'), default=str, cls=CustomJSONEncoder) 78 | formatted_data.append(formatted_obj) 79 | 80 | # Convert the formatted data to a string 81 | formatted_data_str = '\n'.join(formatted_data) 82 | 83 | # Save the preprocessed data to HDFS 84 | with hdfs_client.write(hdfs_path + '-posts.json', encoding='utf-8') as writer: 85 | writer.write(formatted_data_str) 86 | 87 | print('Data saved successfully to HDFS: ' + hdfs_path + '-posts.json') 88 | 89 | # After retrieving the public posts, you can save the latest toot_id to a local file. 
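# (The saved ID is read back at the start of the next run and passed to
# timeline_public(since_id=...), so each execution only collects toots that
# are newer than the ones already written to HDFS.)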
90 | if public_posts: 91 | latest_toot = public_posts[0] # Assuming the latest toot is at the first position 92 | latest_toot_id = latest_toot['id'] 93 | 94 | # Convert latest_toot_id to a string 95 | latest_toot_id_str = str(latest_toot_id) 96 | 97 | # Define the path to the local file 98 | local_file_path = '/home/project/Mastadon_data_analysis_Airflow_Hadoop_Hbase/dataCollection/last_toot_id.txt' 99 | 100 | # Update or create the local file with the latest_toot_id 101 | with open(local_file_path, 'w', encoding='utf-8') as writer: 102 | writer.write(latest_toot_id_str) 103 | 104 | 105 | return hdfs_path + '-posts.json' 106 | 107 | if __name__ == '__main__': 108 | retrieve_and_save_mastodon_data() -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Mastodon Data Pipeline README 2 | 3 | This README provides an overview of a data pipeline project that consists of four main phases: data extraction from the Mastodon API, data processing with Hadoop MapReduce using Python streaming, data storage in HBase, and orchestration with Apache Airflow for automated daily execution. The project aims to collect and analyze data from the Mastodon platform, a federated social media platform. 4 | 5 | ## Folder structure 6 | ``` 7 | Repository Root 8 | ├── airFlowDAG 9 | │ ├── mastadon_dag.py 10 | │ 11 | ├── dataCollection 12 | │ ├── last_toot_id.txt 13 | │ ├── loadData.py 14 | │ ├── __pycache__ 15 | │ │ └── loadData.cpython-39.pyc 16 | │ 17 | ├── images 18 | │ ├── airflow2.PNG 19 | │ ├── gantt.PNG 20 | │ ├── workflow.png 21 | │ 22 | ├── loadingIntoHbase 23 | │ ├── insertion.py 24 | │ 25 | ├── mapReduce 26 | │ ├── python 27 | │ │ ├── mapper.py 28 | │ │ ├── reducer.py 29 | │ 30 | ├── .gitignore 31 | ├── Commands.txt 32 | ├── GDPR Compliance.pdf 33 | ├── README.md 34 | ├── analysis.ipynb 35 | ├── requirements.txt 36 | 37 | 38 | 39 | 40 | ``` 41 | 42 | ## Project Overview 43 | 44 | 45 | As a Data Developer, your role in this project is to set up an automated pipeline covering the workflow shown below: 46 | --- 47 | ![Workflow](/images/workflow.png) 48 | --- 49 | 50 | ### Phase 1: Data Collection 51 | 52 | - **Data Extraction:** Utilize the Mastodon API with your access tokens to gather raw data from the Mastodon platform. 53 | 54 | - **Raw Data Storage:** Store the raw data in a distributed file system such as HDFS for scalability. 55 | 56 | - **HDFS Data Lake Modeling:** Define the data lake schema for HDFS. 57 | 58 | ### Phase 2: Data Processing with MapReduce 59 | 60 | - **Mapper:** Process the input data and generate key-value pairs for the desired metrics (user followers, engagement rate, URLs, emojis, etc.). 61 | 62 | - **Reducer:** Aggregate the key-value pairs produced by the mapper. 63 | 64 | - **MapReduce Job Execution:** Use the Hadoop Streaming API to execute the MapReduce job, providing the mapper and reducer scripts as inputs. 65 | 66 | - **Monitoring:** Keep track of job progress through the Hadoop web UI. 67 | 68 | ### Phase 3: Data Storage in HBase 69 | 70 | - **HBase Schema Design:** Design the HBase table schemas based on the information you want to extract. 71 | 72 | | **Table Name** | **Description** | **Schema** | 73 | |--------------------------|-----------------------------------------------|-------------------------------------------| 74 | | `language_table` | Stores language counts. | - Row Key: Language code (e.g., 'en', 'es')<br>
- Columns:
- `data:count`: Count of users/data.
- Timestamp: Record timestamp. | 75 | | `user_table` | Stores user information, engagement rates, and followers. | - Row Key: User ID (e.g., '1007156', '10106')
- Columns:
- `data:engagement_rate`: User's engagement rate.
- `data:followers`: Number of followers.
- Timestamp: Record timestamp. | 76 | | `croissance_table` | Records user creation counts by month. | - Row Key: Month (e.g., '2007-07', '2008-01')
- Columns:
- `data:count`: Count of user creations.
- Timestamp: Record timestamp. | 77 | | `url_table` | Tracks mentions of external websites. | - Row Key: Website URL (e.g., 'a.gup.pe', 'abcgazetesi.com')
- Columns:
- `data:count`: Count of mentions.
- Timestamp: Record timestamp. | 78 | | `toot_with_media_table` | Records the count of toots with media content. | - Row Key: Fixed key 'toot_with_media'
- Columns:
- `data:count`: Count of toots with media.
- Timestamp: Record timestamp. | 79 | | `tag_table` | Stores counts of used tags. | - Row Key: Tag name (e.g., '10yrsago', '17thcentury')
- Columns:
- `data:count`: Count of tag usage.
- Timestamp: Record timestamp. | 80 | 81 | 82 | - **Best Practices:** Follow best practices for row key design, column family design, compression, bloom filters, batch inserts, etc. 83 | 84 | - **Table Creation:** Create the necessary tables in HBase. 85 | 86 | ``` 87 | # HBase connection settings 88 | hbase_host = 'localhost' # Replace with your HBase host 89 | hbase_port = 9090 # Default HBase port 90 | # Connect to HBase 91 | connection = happybase.Connection(host=hbase_host, port=hbase_port) 92 | 93 | tables_list = ["user_table", "croissance_table", "language_table", "toot_with_media_table", "emoji_table", "url_table", "tag_table"] 94 | tables = connection.tables() 95 | # Decode the byte-string table names returned by HBase 96 | tables = [table_name.decode() for table_name in tables] 97 | 98 | # Create the tables if they do not exist 99 | for table_name in tables_list: 100 | if table_name not in tables: 101 | connection.create_table(table_name, {'data': dict()}) 102 | 103 | connection.close() 104 | ``` 105 | 106 | - **Data Insertion:** Populate the output from the reducer into the HBase tables using a Python HBase client or your preferred method (a batched-write sketch is given in the appendix at the end of this README). 107 | 108 | ### Phase 4: Orchestration with Apache Airflow 109 | 110 | - **Workflow Orchestration:** Define a Directed Acyclic Graph (DAG) to orchestrate the entire workflow. 111 | 112 | - **Task Creation:** Create tasks for running the MapReduce job and storing results in HBase. 113 | 114 | - **Monitoring and Error Handling:** Monitor task progress in the Airflow UI and handle errors or failures (the DAG's callbacks send e-mail notifications on success and failure). 115 | 116 | ### Phase 5: Data Analysis 117 | 118 | After successfully completing the previous phases, you can perform data analysis. Example queries against the HBase tables (top followed users, highest engagement rates, user growth over time, most-mentioned websites, most-used languages, media counts, and top tags) are provided in `analysis.ipynb`. 119 | 120 | ### Phase 6: Workflow Execution 121 | 122 | In the Apache Airflow web interface, activate the DAG, monitor DAG execution progress, and check logs for any issues. Once the DAG is complete, review the results in HBase. 123 | 124 | #### Airflow run details 125 | 126 | ![AirflowRun](/images/airflow2.PNG) 127 | 128 | ### Phase 7: Optimization and Monitoring 129 | 130 | Optimize the MapReduce scripts for better performance. Monitor HBase for storage issues and set up alerts in Airflow for task failures. Regularly monitor Hadoop through its web interface. 131 | 132 | ### Phase 8: Data Access Rights Configuration Updates 133 | 134 | Update API tokens if organizational roles change, ensuring they have the necessary permissions for data retrieval. 135 | 136 | ### Phase 9: Scheduling and Compliance 137 | 138 | Ensure that DAGs are scheduled at appropriate intervals for data refresh. Keep the data processing log up to date for GDPR compliance by documenting all personal data collected from Mastodon and how it is processed. 
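### Appendix: Batched HBase writes (sketch)

`insertion.py` writes each row with an individual `put` call. The Phase 3 best practices mention batch inserts; the snippet below is a minimal sketch of how the same rows could be written through happybase's `Batch` API instead. It is not the code used by the pipeline: the connection settings mirror the table-creation snippet above, and the `reducer_rows` sample values are hypothetical.

```
import happybase

# HBase connection settings (same assumptions as the table-creation snippet above)
hbase_host = 'localhost'
hbase_port = 9090
connection = happybase.Connection(host=hbase_host, port=hbase_port)
user_table = connection.table('user_table')

# Hypothetical reducer output for two users: {row_key: {qualifier: value}}
reducer_rows = {
    '1007156': {'engagement_rate': 0.031, 'followers': 1200},
    '10106': {'engagement_rate': 0.0, 'followers': 87},
}

# batch() buffers the puts and flushes them in groups of batch_size,
# instead of issuing one Thrift round trip per row
with user_table.batch(batch_size=500) as batch:
    for row_key, values in reducer_rows.items():
        batch.put(row_key.encode(), {
            b'data:engagement_rate': str(values['engagement_rate']).encode(),
            b'data:followers': str(values['followers']).encode(),
        })
# leaving the `with` block sends any mutations still in the buffer

connection.close()
```

Values are stored as UTF-8 byte strings, exactly as `insertion.py` writes them, so `analysis.ipynb` can read batched and row-by-row inserts the same way.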
139 | -------------------------------------------------------------------------------- /mapReduce/python/reducer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys 3 | import json 4 | # import happybase 5 | 6 | # # Initialize a connection to HBase 7 | # connection = happybase.Connection('localhost', 9090) 8 | 9 | # # Define the HBase tables 10 | # user_table = connection.table('user_data') 11 | # croissance_table = connection.table('croissance_data') 12 | # language_table = connection.table('language_data') 13 | # toot_with_media_table = connection.table('toot_with_media_data') 14 | # emoji_table = connection.table('emoji_data') 15 | # url_table = connection.table('website_data') 16 | 17 | current_user = None 18 | current_croissance = None 19 | current_language = None 20 | current_toot_with_media = None 21 | current_emoji = None 22 | current_url = None 23 | current_tag = None 24 | 25 | earliest_followers = float('inf') 26 | old_time = "00:00:00" 27 | 28 | engagement_rate_sum = 0.0000 29 | croissance_sum = 0 30 | language_sum = 0 31 | toot_with_media_sum = 0 32 | emoji_sum = 0 33 | url_sum = 0 34 | tag_sum = 0 35 | count = 1 36 | 37 | unique_users_roissance = [] 38 | 39 | user_dict = {} 40 | croissance_dict = {} 41 | language_dict = {} 42 | toot_with_media_dict = {} 43 | emoji_dict = {} 44 | url_dict = {} 45 | tag_dict = {} 46 | 47 | for line in sys.stdin: 48 | key = line.strip().split('\t')[0] 49 | key = key.strip() 50 | # if key starts with user, it's a user record 51 | if key.startswith("user"): 52 | user, strdata = line.strip().split('\t') 53 | # string to dict 54 | data = eval(strdata.strip()) 55 | followers = int(data["followers"]) 56 | time = data['date'] 57 | engagement_rate = float(data["engagement_rate"]) 58 | 59 | if current_user == user: 60 | if time > old_time: 61 | old_time = time 62 | earliest_followers = followers 63 | engagement_rate_sum += engagement_rate 64 | count += 1 65 | else: 66 | if current_user is not None: 67 | if count > 0: 68 | engagement_rate_avg = engagement_rate_sum / count 69 | user_dict["engagement_rate"] = engagement_rate_avg 70 | user_dict["followers"] = earliest_followers 71 | print(f"{current_user}\t{user_dict}") 72 | current_user = user 73 | earliest_followers = followers 74 | old_time = time 75 | engagement_rate_sum = engagement_rate 76 | count = 1 77 | 78 | # if key starts with croissance, it's a croissance record 79 | elif key.startswith("croissance"): 80 | croissance, strdata = line.strip().split('\t') 81 | data = eval(strdata.strip()) 82 | user_id = data["user_id"] 83 | if croissance == current_croissance: 84 | if current_croissance is not None: 85 | if user_id not in unique_users_roissance: 86 | unique_users_roissance.append(user_id) 87 | croissance_sum += data["value"] 88 | elif croissance != current_croissance: 89 | if current_croissance is not None: 90 | croissance_dict["count"] = croissance_sum 91 | print(f"{current_croissance}\t{croissance_dict}") 92 | current_croissance = croissance 93 | croissance_sum = 1 94 | unique_users_roissance = [user_id] 95 | 96 | #if key starts with language, it's a language record 97 | elif key.startswith("language"): 98 | language, strdata = line.strip().split('\t') 99 | data = eval(strdata.strip()) 100 | if language == current_language: 101 | if current_language is not None: 102 | language_sum += data["value"] 103 | elif language != current_language: 104 | if current_language is not None: 105 | language_dict["count"] = language_sum 106 | 
print(f"{current_language}\t{language_dict}") 107 | current_language = language 108 | language_sum = 1 109 | 110 | elif key.startswith("toot_with_media"): 111 | toot_with_media, strdata = line.strip().split('\t') 112 | data = eval(strdata.strip()) 113 | if toot_with_media == current_toot_with_media: 114 | if current_toot_with_media is not None: 115 | toot_with_media_sum += data["value"] 116 | elif toot_with_media != current_toot_with_media: 117 | if current_toot_with_media is not None: 118 | toot_with_media_dict["count"] = toot_with_media_sum 119 | print(f"{current_toot_with_media}\t{toot_with_media_dict}") 120 | current_toot_with_media = toot_with_media 121 | toot_with_media_sum = 1 122 | 123 | elif key.startswith("emoji"): 124 | emoji_id , strdata = line.strip().split('\t') 125 | data = eval(strdata.strip()) 126 | if emoji_id == current_emoji: 127 | if current_emoji is not None: 128 | emoji_sum += data["value"] 129 | elif emoji_id != current_emoji: 130 | if current_emoji is not None: 131 | emoji_dict["count"] = emoji_sum 132 | print(f"{current_emoji}\t{emoji_dict}") 133 | current_emoji = emoji_id 134 | emoji_sum = 1 135 | 136 | elif key.startswith("website"): 137 | website_id , strdata = line.strip().split('\t') 138 | data = eval(strdata.strip()) 139 | if website_id == current_url: 140 | if current_url is not None: 141 | url_sum += data["value"] 142 | elif website_id != current_url: 143 | if current_url is not None: 144 | url_dict["count"] = url_sum 145 | print(f"{current_url}\t{url_dict}") 146 | current_url = website_id 147 | url_sum = 1 148 | 149 | elif key.startswith("tag"): 150 | tag_id , strdata = line.strip().split('\t') 151 | data = eval(strdata.strip()) 152 | if tag_id == current_tag: 153 | if current_tag is not None: 154 | tag_sum += data["value"] 155 | elif tag_id != current_tag: 156 | if current_tag is not None: 157 | tag_dict["count"] = tag_sum 158 | print(f"{current_tag}\t{tag_dict}") 159 | current_tag = tag_id 160 | tag_sum = 1 161 | 162 | # Print the last data 163 | if current_croissance is not None: 164 | croissance_dict["count"] = croissance_sum 165 | print(f"{current_croissance}\t{croissance_dict}") 166 | 167 | if current_user is not None: 168 | if count > 0: 169 | engagement_rate_avg = engagement_rate_sum / count 170 | print(f"{current_user}\t{user_dict}") 171 | 172 | if current_language is not None: 173 | language_dict["count"] = language_sum 174 | print(f"{current_language}\t{language_dict}") 175 | 176 | if current_toot_with_media is not None: 177 | toot_with_media_dict["count"] = toot_with_media_sum 178 | print(f"{current_toot_with_media}\t{toot_with_media_dict}") 179 | 180 | if current_emoji is not None: 181 | emoji_dict["count"] = emoji_sum 182 | print(f"{current_emoji}\t{emoji_dict}") 183 | 184 | if current_url is not None: 185 | url_dict["count"] = url_sum 186 | print(f"{current_url}\t{url_dict}") 187 | 188 | if current_tag is not None: 189 | tag_dict["count"] = tag_sum 190 | print(f"{current_tag}\t{tag_dict}") -------------------------------------------------------------------------------- /loadingIntoHbase/insertion.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from hdfs import InsecureClient 3 | import json 4 | import happybase 5 | 6 | 7 | 8 | 9 | # define a function to process the data from the hdf file 10 | def insert_data_into_hbase(data_folder_path): 11 | 12 | # HBase connection settings 13 | hbase_host = 'localhost' # Replace with your HBase host 14 | hbase_port = 9090 # Default HBase port 15 | # 
Connect to HBase 16 | connection = happybase.Connection(host=hbase_host, port=hbase_port) 17 | 18 | tables_list = ["user_table", "croissance_table", "language_table", "toot_with_media_table", "emoji_table", "url_table", "tag_table"] 19 | tables = connection.tables() 20 | # remove the b' and ' from the table names 21 | tables = [table_name.decode() for table_name in tables] 22 | 23 | # Create the tables if they do not exist 24 | for table_name in tables_list: 25 | if table_name not in tables: 26 | connection.create_table(table_name, {'data': dict()}) 27 | # Get the tables 28 | user_table = connection.table('user_table') 29 | croissance_table = connection.table('croissance_table') 30 | language_table = connection.table('language_table') 31 | toot_with_media_table = connection.table('toot_with_media_table') 32 | emoji_table = connection.table('emoji_table') 33 | url_table = connection.table('url_table') 34 | tag_table = connection.table('tag_table') 35 | 36 | # Initialize an HDFS client 37 | hdfs_client = InsecureClient('http://localhost:9870', user='hadoop') 38 | # get files from the data folder that start with part 39 | files = hdfs_client.list(data_folder_path) 40 | # get oly the files that start with part 41 | files = [file for file in files if file.startswith('part')] 42 | # loop through the files 43 | for file in files: 44 | # get the file path 45 | file_path = data_folder_path + '/' + file 46 | # open the file 47 | with hdfs_client.read(file_path, encoding='utf-8') as reader: 48 | # read the file 49 | data = reader.read() 50 | # split the file into lines 51 | lines = data.splitlines() 52 | # loop through the lines 53 | for line in lines: 54 | # split the line into key and value 55 | key, value = line.strip().split('\t') 56 | # split the key into key and id 57 | # convert the value to a dictionary 58 | value = eval(value.strip()) 59 | # if the key is user 60 | if key.startswith('user'): 61 | key, id = key.strip().split(':') 62 | # get the engagement rate 63 | engagement_rate = value['engagement_rate'] 64 | # get the followers 65 | followers = value['followers'] 66 | user_dict = { 67 | "engagement_rate": engagement_rate, 68 | "followers": followers 69 | } 70 | try: 71 | # Store the user data in HBase 72 | user_table.put(str(id).encode(), {b'data:engagement_rate': str(user_dict["engagement_rate"]).encode()}) 73 | user_table.put(str(id).encode(), {b'data:followers': str(user_dict["followers"]).encode()}) 74 | except Exception as e: 75 | # Log exceptions to standard error 76 | print(f"Error: {str(e)}") 77 | elif key.startswith('croissance'): 78 | key, id = key.strip().split(':') 79 | croissance_dict = { 80 | "count": value['count'] 81 | } 82 | try: 83 | # Store the user data in HBase 84 | croissance_table.put(str(id).encode(), {b'data:count': str(croissance_dict["count"]).encode()}) 85 | except Exception as e: 86 | # Log exceptions to standard error 87 | print(f"Error: {str(e)}") 88 | elif key.startswith('language'): 89 | key, id = key.strip().split(':') 90 | language_dict = { 91 | "count": value['count'] 92 | } 93 | try: 94 | # Store the user data in HBase 95 | language_table.put(str(id).encode(), {b'data:count': str(language_dict["count"]).encode()}) 96 | except Exception as e: 97 | # Log exceptions to standard error 98 | print(f"Error: {str(e)}") 99 | elif key.startswith('toot_with_media'): 100 | id = key.strip() 101 | toot_with_media_dict = { 102 | "count": value['count'] 103 | } 104 | try: 105 | # Store the user data in HBase 106 | toot_with_media_table.put(str(id).encode(), 
{b'data:count': str(toot_with_media_dict["count"]).encode()}) 107 | except Exception as e: 108 | # Log exceptions to standard error 109 | print(f"Error: {str(e)}") 110 | elif key.startswith('emoji'): 111 | key, id = key.strip().split(':') 112 | emoji_dict = { 113 | "count": value['count'] 114 | } 115 | try: 116 | # Store the user data in HBase 117 | emoji_table.put(str(id).encode(), {b'data:count': str(emoji_dict["count"]).encode()}) 118 | except Exception as e: 119 | # Log exceptions to standard error 120 | print(f"Error: {str(e)}") 121 | elif key.startswith('website'): 122 | key, id = key.strip().split(':') 123 | url_dict = { 124 | "count": value['count'] 125 | } 126 | try: 127 | # Store the user data in HBase 128 | url_table.put(str(id).encode(), {b'data:count': str(url_dict["count"]).encode()}) 129 | except Exception as e: 130 | # Log exceptions to standard error 131 | print(f"Error: {str(e)}") 132 | elif key.startswith('tag'): 133 | key, id = key.strip().split(':') 134 | tag_dict = { 135 | "count": value['count'] 136 | } 137 | try: 138 | # Store the user data in HBase 139 | tag_table.put(str(id).encode(), {b'data:count': str(tag_dict["count"]).encode()}) 140 | except Exception as e: 141 | # Log exceptions to standard error 142 | print(f"Error: {str(e)}") 143 | 144 | 145 | connection.close() 146 | 147 | 148 | if __name__ == "__main__": 149 | # Get the data path from the command line 150 | data_path = sys.argv[1] 151 | # Process the data 152 | insert_data_into_hbase(data_path) -------------------------------------------------------------------------------- /analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 4, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import happybase" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "# User Analysis\n", 17 | "---" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "## Top 3 Followed users" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 21, 30 | "metadata": {}, 31 | "outputs": [ 32 | { 33 | "name": "stdout", 34 | "output_type": "stream", 35 | "text": [ 36 | "User ID: 109999724950289416, Followers: 3340001\n", 37 | "User ID: 109999725002952105, Followers: 2410000\n", 38 | "User ID: 109718180885037597, Followers: 2330004\n" 39 | ] 40 | } 41 | ], 42 | "source": [ 43 | "# Connect to HBase\n", 44 | "# HBase connection settings\n", 45 | "hbase_host = 'localhost' # Replace with your HBase host\n", 46 | "hbase_port = 9090 # Default HBase port\n", 47 | "# Connect to HBase\n", 48 | "connection = happybase.Connection(host=hbase_host, port=hbase_port)\n", 49 | "\n", 50 | "table = connection.table('user_table')\n", 51 | "\n", 52 | "# Scan the table to retrieve user data\n", 53 | "scan_result = table.scan(columns=['data:followers'])\n", 54 | "\n", 55 | "# Create a dictionary to store user followers\n", 56 | "user_followers = {}\n", 57 | "\n", 58 | "# Process the scan result\n", 59 | "for key, data in scan_result:\n", 60 | " user_id = key.decode('utf-8')\n", 61 | " followers = int(data[b'data:followers'].decode('utf-8'))\n", 62 | " user_followers[user_id] = followers\n", 63 | "\n", 64 | "# Close the HBase connection\n", 65 | "connection.close()\n", 66 | "\n", 67 | "# Sort the users by followers and get the top users\n", 68 | "top_users = sorted(user_followers.items(), key=lambda item: item[1], reverse=True)[:3]\n", 69 
| "\n", 70 | "# Print the top users and their followers\n", 71 | "for user_id, followers in top_users:\n", 72 | " print(f\"User ID: {user_id}, Followers: {followers}\")\n" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "## Users with highest engagement rate" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 27, 85 | "metadata": {}, 86 | "outputs": [ 87 | { 88 | "name": "stdout", 89 | "output_type": "stream", 90 | "text": [ 91 | "User ID: 110643001753379072, Engagement Rate: 19.05 %\n", 92 | "User ID: 111243750378556518, Engagement Rate: 7.14 %\n", 93 | "User ID: 110657506981999016, Engagement Rate: 5.62 %\n" 94 | ] 95 | } 96 | ], 97 | "source": [ 98 | "# Connect to HBase\n", 99 | "# HBase connection settings\n", 100 | "hbase_host = 'localhost' # Replace with your HBase host\n", 101 | "hbase_port = 9090 # Default HBase port\n", 102 | "# Connect to HBase\n", 103 | "connection = happybase.Connection(host=hbase_host, port=hbase_port)\n", 104 | "\n", 105 | "# Select the table\n", 106 | "table = connection.table('user_table')\n", 107 | "\n", 108 | "# Scan the table to retrieve user data\n", 109 | "scan_result = table.scan(columns=['data:engagement_rate'])\n", 110 | "\n", 111 | "# Create a dictionary to store user engagement rates\n", 112 | "user_engagement = {}\n", 113 | "\n", 114 | "# Process the scan result\n", 115 | "for key, data in scan_result:\n", 116 | " user_id = key.decode('utf-8')\n", 117 | " engagement_rate = float(data[b'data:engagement_rate'].decode('utf-8'))\n", 118 | " user_engagement[user_id] = engagement_rate\n", 119 | "\n", 120 | "# Close the HBase connection\n", 121 | "connection.close()\n", 122 | "\n", 123 | "# Sort the users by engagement rate and get the top users\n", 124 | "top_users = sorted(user_engagement.items(), key=lambda item: item[1], reverse=True)[:3]\n", 125 | "\n", 126 | "# Print the top users and their engagement rates\n", 127 | "for user_id, engagement_rate in top_users:\n", 128 | " eng = float(engagement_rate) * 100\n", 129 | " print(f\"User ID: {user_id}, Engagement Rate: {eng:.2f} %\")" 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": {}, 135 | "source": [ 136 | "## User growth over time" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 28, 142 | "metadata": {}, 143 | "outputs": [ 144 | { 145 | "name": "stdout", 146 | "output_type": "stream", 147 | "text": [ 148 | "Month: 2022-11, User Count: 45\n", 149 | "Month: 2022-12, User Count: 14\n", 150 | "Month: 2022-04, User Count: 13\n" 151 | ] 152 | } 153 | ], 154 | "source": [ 155 | "# Connect to HBase\n", 156 | "# HBase connection settings\n", 157 | "hbase_host = 'localhost' # Replace with your HBase host\n", 158 | "hbase_port = 9090 # Default HBase port\n", 159 | "# Connect to HBase\n", 160 | "connection = happybase.Connection(host=hbase_host, port=hbase_port)\n", 161 | "\n", 162 | "# Select the table\n", 163 | "table = connection.table('croissance_table')\n", 164 | "\n", 165 | "# Scan the table to retrieve data\n", 166 | "scan_result = table.scan(columns=['data:count'])\n", 167 | "\n", 168 | "# Create a dictionary to store month-wise user counts\n", 169 | "month_user_counts = {}\n", 170 | "\n", 171 | "# Process the scan result\n", 172 | "for key, data in scan_result:\n", 173 | " month = key.decode('utf-8')\n", 174 | " count = int(data[b'data:count'].decode('utf-8'))\n", 175 | " month_user_counts[month] = count\n", 176 | "\n", 177 | "# Close the HBase connection\n", 178 | 
"connection.close()\n", 179 | "\n", 180 | "# Sort the months by user counts and get the top 3\n", 181 | "top_months = sorted(month_user_counts.items(), key=lambda item: item[1], reverse=True)[:3]\n", 182 | "\n", 183 | "# Print the top 3 months and their user counts\n", 184 | "for month, count in top_months:\n", 185 | " print(f\"Month: {month}, User Count: {count}\")" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": {}, 191 | "source": [ 192 | "# Content analysis\n", 193 | "---" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "metadata": {}, 199 | "source": [ 200 | "## Top 3 websites" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 29, 206 | "metadata": {}, 207 | "outputs": [ 208 | { 209 | "name": "stdout", 210 | "output_type": "stream", 211 | "text": [ 212 | "Website: mastodon.social, Mention Count: 137\n", 213 | "Website: www.telam.com.ar, Mention Count: 70\n", 214 | "Website: twitter.com, Mention Count: 58\n" 215 | ] 216 | } 217 | ], 218 | "source": [ 219 | "# Connect to HBase\n", 220 | "# HBase connection settings\n", 221 | "hbase_host = 'localhost' # Replace with your HBase host\n", 222 | "hbase_port = 9090 # Default HBase port\n", 223 | "# Connect to HBase\n", 224 | "connection = happybase.Connection(host=hbase_host, port=hbase_port)\n", 225 | "\n", 226 | "# Select the table\n", 227 | "table = connection.table('url_table')\n", 228 | "\n", 229 | "# Scan the table to retrieve data\n", 230 | "scan_result = table.scan(columns=['data:count'])\n", 231 | "\n", 232 | "# Create a dictionary to store website mention counts\n", 233 | "website_counts = {}\n", 234 | "\n", 235 | "# Process the scan result\n", 236 | "for key, data in scan_result:\n", 237 | " website = key.decode('utf-8')\n", 238 | " count = int(data[b'data:count'].decode('utf-8'))\n", 239 | " website_counts[website] = count\n", 240 | "\n", 241 | "# Close the HBase connection\n", 242 | "connection.close()\n", 243 | "\n", 244 | "# Sort the websites by mention counts and get the top 3\n", 245 | "top_websites = sorted(website_counts.items(), key=lambda item: item[1], reverse=True)[:3]\n", 246 | "\n", 247 | "# Print the top 3 websites and their mention counts\n", 248 | "for website, count in top_websites:\n", 249 | " print(f\"Website: {website}, Mention Count: {count}\")" 250 | ] 251 | }, 252 | { 253 | "cell_type": "markdown", 254 | "metadata": {}, 255 | "source": [ 256 | "# Language analysis\n", 257 | "---" 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "metadata": {}, 263 | "source": [ 264 | "## Most used languages" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 30, 270 | "metadata": {}, 271 | "outputs": [ 272 | { 273 | "name": "stdout", 274 | "output_type": "stream", 275 | "text": [ 276 | "Language: en, Count: 1677\n", 277 | "Language: de, Count: 313\n", 278 | "Language: es, Count: 115\n" 279 | ] 280 | } 281 | ], 282 | "source": [ 283 | "# Connect to HBase\n", 284 | "# HBase connection settings\n", 285 | "hbase_host = 'localhost' # Replace with your HBase host\n", 286 | "hbase_port = 9090 # Default HBase port\n", 287 | "# Connect to HBase\n", 288 | "connection = happybase.Connection(host=hbase_host, port=hbase_port)\n", 289 | "\n", 290 | "\n", 291 | "# Select the table\n", 292 | "table = connection.table('language_table')\n", 293 | "\n", 294 | "# Scan the table to retrieve language data\n", 295 | "scan_result = table.scan(columns=['data:count'])\n", 296 | "\n", 297 | "# Create a dictionary to store language counts\n", 298 | 
"language_counts = {}\n", 299 | "\n", 300 | "# Process the scan result\n", 301 | "for key, data in scan_result:\n", 302 | " language = key.decode('utf-8')\n", 303 | " count = int(data[b'data:count'].decode('utf-8'))\n", 304 | " language_counts[language] = count\n", 305 | "\n", 306 | "# Close the HBase connection\n", 307 | "connection.close()\n", 308 | "\n", 309 | "# Sort the language counts and get the top 3\n", 310 | "top_languages = sorted(language_counts.items(), key=lambda item: item[1], reverse=True)[:3]\n", 311 | "\n", 312 | "# Print the top 3 languages and their counts\n", 313 | "for language, count in top_languages:\n", 314 | " print(f\"Language: {language}, Count: {count}\")" 315 | ] 316 | }, 317 | { 318 | "cell_type": "markdown", 319 | "metadata": {}, 320 | "source": [ 321 | "# Media engagement\n", 322 | "---" 323 | ] 324 | }, 325 | { 326 | "cell_type": "markdown", 327 | "metadata": {}, 328 | "source": [ 329 | "## Count of posts with media" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": 35, 335 | "metadata": {}, 336 | "outputs": [ 337 | { 338 | "name": "stdout", 339 | "output_type": "stream", 340 | "text": [ 341 | "There is 619 Posts with media\n" 342 | ] 343 | } 344 | ], 345 | "source": [ 346 | "# Connect to HBase\n", 347 | "# HBase connection settings\n", 348 | "hbase_host = 'localhost' # Replace with your HBase host\n", 349 | "hbase_port = 9090 # Default HBase port\n", 350 | "# Connect to HBase\n", 351 | "connection = happybase.Connection(host=hbase_host, port=hbase_port)\n", 352 | "\n", 353 | "# Select the table\n", 354 | "table = connection.table('toot_with_media_table')\n", 355 | "\n", 356 | "# Specify the row key you want to retrieve\n", 357 | "row_key = b'toot_with_media' # Use bytes for the row key\n", 358 | "\n", 359 | "# Use the get method to retrieve the row\n", 360 | "row_data = table.row(row_key)\n", 361 | "\n", 362 | "# Close the HBase connection\n", 363 | "connection.close()\n", 364 | "\n", 365 | "# Print or process the retrieved data\n", 366 | "if row_data:\n", 367 | " for column, cell in row_data.items():\n", 368 | " print(f\"There is {cell.decode()} Posts with media\")\n", 369 | "else:\n", 370 | " print(f\"Row with key '{row_key.decode('utf-8')}' not found in the table.\")" 371 | ] 372 | }, 373 | { 374 | "cell_type": "markdown", 375 | "metadata": {}, 376 | "source": [ 377 | "# Tags and mentions analysis\n", 378 | "---" 379 | ] 380 | }, 381 | { 382 | "cell_type": "markdown", 383 | "metadata": {}, 384 | "source": [ 385 | "## Most used tags" 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": 38, 391 | "metadata": {}, 392 | "outputs": [ 393 | { 394 | "name": "stdout", 395 | "output_type": "stream", 396 | "text": [ 397 | "Tag: michigan, Count: 56\n", 398 | "Tag: press, Count: 46\n", 399 | "Tag: genocide, Count: 42\n" 400 | ] 401 | } 402 | ], 403 | "source": [ 404 | "# Connect to HBase\n", 405 | "# HBase connection settings\n", 406 | "hbase_host = 'localhost' # Replace with your HBase host\n", 407 | "hbase_port = 9090 # Default HBase port\n", 408 | "# Connect to HBase\n", 409 | "connection = happybase.Connection(host=hbase_host, port=hbase_port)\n", 410 | "\n", 411 | "\n", 412 | "# Select the table\n", 413 | "table = connection.table('tag_table')\n", 414 | "\n", 415 | "# Scan the table to retrieve data\n", 416 | "scan_result = table.scan(columns=['data:count'])\n", 417 | "\n", 418 | "# Create a dictionary to store tag counts\n", 419 | "tag_counts = {}\n", 420 | "\n", 421 | "# Process the scan result\n", 422 | 
"for key, data in scan_result:\n", 423 | " tag = key.decode('utf-8')\n", 424 | " count = int(data[b'data:count'].decode('utf-8'))\n", 425 | " tag_counts[tag] = count\n", 426 | "\n", 427 | "# Close the HBase connection\n", 428 | "connection.close()\n", 429 | "\n", 430 | "# Sort the tags by count and get the top 3\n", 431 | "top_tags = sorted(tag_counts.items(), key=lambda item: item[1], reverse=True)[:3]\n", 432 | "\n", 433 | "# Print the top 3 tags and their counts\n", 434 | "for tag, count in top_tags:\n", 435 | " print(f\"Tag: {tag}, Count: {count}\")" 436 | ] 437 | } 438 | ], 439 | "metadata": { 440 | "kernelspec": { 441 | "display_name": "Python 3", 442 | "language": "python", 443 | "name": "python3" 444 | }, 445 | "language_info": { 446 | "codemirror_mode": { 447 | "name": "ipython", 448 | "version": 3 449 | }, 450 | "file_extension": ".py", 451 | "mimetype": "text/x-python", 452 | "name": "python", 453 | "nbconvert_exporter": "python", 454 | "pygments_lexer": "ipython3", 455 | "version": "3.10.12" 456 | } 457 | }, 458 | "nbformat": 4, 459 | "nbformat_minor": 2 460 | } 461 | --------------------------------------------------------------------------------