├── imgs
│   ├── gr1.png
│   ├── gr2.png
│   ├── res1.png
│   ├── res2.png
│   ├── teaser.png
│   └── Poster_ICMR23.pdf
├── models
│   ├── Conv1D.py
│   ├── MLP.py
│   ├── Xgboost.py
│   ├── GAT.py
│   └── GCN.py
├── Evaluation.py
├── Training.py
├── utils.py
├── CreateNetwork.py
├── req.txt
├── README.md
└── main.py
/imgs/gr1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OcraM17/EngagementGNN/HEAD/imgs/gr1.png
--------------------------------------------------------------------------------
/imgs/gr2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OcraM17/EngagementGNN/HEAD/imgs/gr2.png
--------------------------------------------------------------------------------
/imgs/res1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OcraM17/EngagementGNN/HEAD/imgs/res1.png
--------------------------------------------------------------------------------
/imgs/res2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OcraM17/EngagementGNN/HEAD/imgs/res2.png
--------------------------------------------------------------------------------
/imgs/teaser.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OcraM17/EngagementGNN/HEAD/imgs/teaser.png
--------------------------------------------------------------------------------
/imgs/Poster_ICMR23.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OcraM17/EngagementGNN/HEAD/imgs/Poster_ICMR23.pdf
--------------------------------------------------------------------------------
/models/Conv1D.py:
--------------------------------------------------------------------------------
1 | from keras.layers import Dense, Conv1D, Flatten, MaxPooling1D
2 | from keras.models import Sequential
3 | 
4 | 
5 | def create_Conv1D(num_classes, hidden, input):
6 |     model = Sequential()
7 |     model.add(Conv1D(hidden, 1, activation="tanh", input_shape=(input, 1)))
8 |     # model.add(Conv1D(128, 1, strides=2, activation="relu"))
9 |     # model.add(Conv1D(256, 1, strides=2, activation="relu"))
10 |     # model.add(Conv1D(512, 1, strides=2, activation="relu"))
11 |     # model.add(Conv1D(1024, 1, strides=2, activation="relu"))
12 |     model.add(MaxPooling1D())
13 |     model.add(Flatten())
14 |     model.add(Dense(num_classes, name='logits'))
15 |     return model
16 | 
--------------------------------------------------------------------------------
/models/MLP.py:
--------------------------------------------------------------------------------
1 | from tensorflow import keras
2 | from tensorflow.keras import layers
3 | from utils import create_ffn
4 | 
5 | 
6 | def MLP(input_shape, hidden_units, num_classes, dropout_rate=0.2):
7 |     inputs = layers.Input(shape=(input_shape,), name="input_features")
8 |     x = create_ffn(hidden_units, dropout_rate, name=f"ffn_block1")(inputs)
9 |     for block_idx in range(4):
10 |         # Create an FFN block.
11 |         x1 = create_ffn(hidden_units, dropout_rate, name=f"ffn_block{block_idx + 2}")(x)
12 |         # Add skip connection.
13 |         x = layers.Add(name=f"skip_connection{block_idx + 2}")([x, x1])
14 |     # Compute logits.
15 |     logits = layers.Dense(num_classes, name="logits")(x)
16 |     # Create the model.
17 |     return keras.Model(inputs=inputs, outputs=logits, name="baseline")
18 | 
19 | 
20 | def create_MLP(input_shape, hidden_units, num_classes, dropout_rate):
21 |     model = MLP(input_shape, hidden_units, num_classes, dropout_rate)
22 |     return model
23 | 
--------------------------------------------------------------------------------
/Evaluation.py:
--------------------------------------------------------------------------------
1 | from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve, auc, roc_auc_score
2 | import numpy as np
3 | from scipy.special import softmax
4 | import xgboost as xgb
5 | 
6 | 
7 | def evaluate(model, X_test, y_test):
8 |     logits = model.predict(X_test)
9 |     probs = softmax(logits, axis=1)
10 |     classes = np.argmax(probs, axis=1)
11 |     print_metrics(classes, y_test, probs[:, 1])
12 | 
13 | 
14 | def evaluate_XGB(obj, X_test, y_test):
15 |     dtest = xgb.DMatrix(data=X_test)
16 |     probs = obj.predict(dtest)
17 |     classes = probs.copy()
18 |     classes[classes > 0.5] = 1
19 |     classes[classes <= 0.5] = 0
20 |     print_metrics(classes, y_test, probs)
21 | 
22 | 
23 | def print_metrics(classes, y_test, probs):
24 |     # sklearn metrics expect the ground truth first and the predictions second.
25 |     print(classification_report(y_test, classes, labels=[0, 1]))
26 |     prec, recall, thr = precision_recall_curve(y_test, probs, pos_label=1)
27 |     prauc = auc(recall, prec)
28 |     print("PR-AUC:", prauc)
29 |     rocauc = roc_auc_score(y_test, probs)
30 |     print("ROC-AUC:", rocauc)
31 |     print(confusion_matrix(y_test, classes))
32 | 
--------------------------------------------------------------------------------
/Training.py:
--------------------------------------------------------------------------------
1 | from tensorflow import keras
2 | import xgboost as xgb
3 | 
4 | 
5 | def run_experiment(model, x_train, y_train, learning_rate, loss, num_epochs, batch_size, optimizer):
6 |     # Compile the model.
7 |     model.compile(
8 |         optimizer=optimizer(learning_rate),
9 |         loss=loss(from_logits=True),
10 |         metrics=['accuracy'],
11 |     )
12 |     # Create an early stopping callback.
13 |     early_stopping = keras.callbacks.EarlyStopping(
14 |         monitor="val_loss", patience=5, restore_best_weights=True
15 |     )
16 |     reduce_lr = keras.callbacks.ReduceLROnPlateau(
17 |         patience=2
18 |     )
19 |     # Fit the model.
20 |     history = model.fit(
21 |         x=x_train,
22 |         y=y_train,
23 |         epochs=num_epochs,
24 |         batch_size=batch_size,
25 |         validation_split=0.15,
26 |         callbacks=[early_stopping, reduce_lr],
27 |     )
28 |     return history
29 | 
30 | 
31 | def run_experiment_XGB(model, x_train, y_train):
32 |     dtrain = xgb.DMatrix(data=x_train, label=y_train)
33 | 
34 |     obj = xgb.train(model.__getparams__(),
35 |                     dtrain=dtrain,
36 |                     num_boost_round=500,
37 |                     )
38 |     return obj
39 | 
--------------------------------------------------------------------------------
/models/Xgboost.py:
--------------------------------------------------------------------------------
1 | def create_XGB(max_depth=8, learning_rate=0.025, subsample=0.85,
2 |                colsample_bytree=0.35, eval_metric='logloss', objective='binary:logistic',
3 |                tree_method='gpu_hist', seed=1):
4 |     # Forward the arguments instead of re-hardcoding the default values.
5 |     return xgBoost(max_depth=max_depth, learning_rate=learning_rate, subsample=subsample,
6 |                    colsample_bytree=colsample_bytree, eval_metric=eval_metric, objective=objective,
7 |                    tree_method=tree_method, seed=seed)
8 | 
9 | 
10 | class xgBoost():
11 |     def __init__(self, max_depth=8, learning_rate=0.025, subsample=0.85,
12 |                  colsample_bytree=0.35, eval_metric='logloss', objective='binary:logistic',
13 |                  tree_method='gpu_hist', seed=1):
14 |         self.max_depth = max_depth
15 |         self.learning_rate = learning_rate
16 |         self.subsample = subsample
17 |         self.colsample_bytree = colsample_bytree
18 |         self.eval_metric = eval_metric
19 |         self.objective = objective
20 |         self.tree_method = tree_method
21 |         self.seed = seed
22 | 
23 |     def __getparams__(self):
24 |         dict_ = {
25 |             'max_depth': self.max_depth,
26 |             'learning_rate': self.learning_rate,
27 |             'subsample': self.subsample,
28 |             'colsample_bytree': self.colsample_bytree,
29 |             'eval_metric': self.eval_metric,
30 |             'objective': self.objective,
31 |             'tree_method': self.tree_method,
32 |             'seed': self.seed,
33 |         }
34 |         return dict_
35 | 
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | from tensorflow.keras import layers
3 | from tensorflow import keras
4 | import numpy as np
5 | import networkx as nx
6 | 
7 | 
8 | def eng_class(x):
9 |     if x <= 0:
10 |         return 0
11 |     else:
12 |         return 1
13 | 
14 | 
15 | def sampling_k_elements(group, k=30000):
16 |     if len(group) < k:
17 |         return group
18 |     return group.sample(k)
19 | 
20 | 
21 | def create_ffn(hidden_units, dropout_rate, name=None):
22 |     fnn_layers = []
23 | 
24 |     for units in hidden_units:
25 |         fnn_layers.append(layers.BatchNormalization())
26 |         fnn_layers.append(layers.Dropout(dropout_rate))
27 |         fnn_layers.append(layers.Dense(units, activation=tf.nn.gelu))
28 | 
29 |     return keras.Sequential(fnn_layers, name=name)
30 | 
31 | 
32 | def normalize(df):
33 |     df["user_followers"] = np.log10(df["user_followers"] + 1e-5)
34 |     df["user_ntweet"] = np.log10(df["user_ntweet"] + 1e-5)
35 |     df = df.drop(["hashtag", "text", "time", "screen_name", "favorite", "engagement", "retweet", "id"], axis=1)
36 |     for col in df.columns:
37 |         if not isinstance(df[col].values[0], str):
38 |             df[col] = (df[col] - df[col].min()) / (df[col].max() - df[col].min())
39 |     return df
40 | 
41 | 
42 | def extract_graph(g, df):
43 |     mapping_graph = {k: v for v, k in enumerate(g.nodes)}
44 |     g = nx.relabel_nodes(g, mapping_graph)
45 |     edges = np.array(list(g.edges)).T
46 |     edges_weight = [x[2]["weight"] for x in g.edges(data=True)]
47 |     features_names = list(set(df.columns) - {"n_emojis", "user_following", "official_source", "class"})  # use a list for column indexing
48 |     node_features = tf.cast(
49 | 
df.sort_index()[features_names].to_numpy(), dtype=tf.dtypes.float32 50 | ) 51 | graph_info = (node_features, edges, edges_weight) 52 | return graph_info 53 | -------------------------------------------------------------------------------- /CreateNetwork.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import ast 3 | import networkx as nx 4 | import datetime 5 | from joblib import Parallel, delayed 6 | from tqdm.auto import tqdm 7 | import numpy as np 8 | 9 | # settings (define the path to the tweets dataset) 10 | delta = 15 11 | path_to_tweets = "./first_week.csv" 12 | n_jobs = 8 13 | 14 | ''' 15 | Function to compute the edges on a slice of the tweets dataframe. 16 | Returns the corresponding list of edges 17 | ''' 18 | 19 | 20 | def edges_subset(split_df, delta=15): 21 | # filter by time 22 | edges = [] 23 | for _, post in split_df.iterrows(): 24 | sub_df = df.loc[((df["time"]) > post["time"] - datetime.timedelta(minutes=delta)) & 25 | (df["time"] < post["time"])].copy(deep=True) 26 | sub_df["connected"] = sub_df["hashtag"].apply(lambda x: len(set(x).intersection(post["hashtag"]))) 27 | sub_df = sub_df.loc[sub_df["connected"] > 0] 28 | edges = edges + [(row["id"], post["id"], row["connected"]) for _, row in sub_df.iterrows()] 29 | return edges 30 | 31 | 32 | # LOAD DATA 33 | df = pd.read_csv(path_to_tweets, lineterminator='\n') 34 | df["hashtag"] = df["hashtag"].apply(lambda x: list(set(ast.literal_eval(x)))) 35 | df["time"] = pd.to_datetime(df["time"]) 36 | 37 | # COMPUTE EDGES using Parallel jobs. It works on dataframe splits 38 | all_edges = Parallel(n_jobs=n_jobs)(delayed(edges_subset)(split_df, delta=delta) for split_df in tqdm(np.array_split(df, 100))) 39 | all_edges = [y for x in all_edges for y in x] 40 | 41 | # CREATE GRAPH 42 | graph = nx.Graph() 43 | # add weighted edges 44 | graph.add_weighted_edges_from(all_edges) 45 | # add isolated nodes 46 | isolated = set(df["id"]).difference(list(graph.nodes)) 47 | graph.add_nodes_from(isolated) 48 | # add node attributes 49 | nx.set_node_attributes(graph, df.loc[df["id"].isin(list(graph.nodes))].set_index("id").to_dict(orient="index")) 50 | print("NODES:", len(graph.nodes)) 51 | print("EDGES:", len(graph.edges)) 52 | print("DENSITY:", nx.density(graph)) 53 | print("NUM CONNECTED COMPONENTS:", len([len(c) for c in sorted(nx.connected_components(graph), key=len, reverse=True)])) 54 | print("MAX CONNECTED COMPONENT:", max([len(c) for c in sorted(nx.connected_components(graph), key=len, reverse=True)])) 55 | 56 | # check 57 | if "/" in path_to_tweets: 58 | filename = path_to_tweets.split("/")[-1].split(".")[0] 59 | else: 60 | filename = path_to_tweets.split("\\")[-1].split(".")[0] 61 | 62 | 63 | # protocol=4 ensures compatibility with older Python versions 64 | nx.write_gpickle(graph, "network_tweets.pickle", protocol=4) 65 | -------------------------------------------------------------------------------- /req.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.7.1 2 | apturl==0.5.2 3 | asn1crypto==0.24.0 4 | astor==0.8.0 5 | attrs==17.4.0 6 | beautifulsoup4==4.11.1 7 | blinker==1.4 8 | bounded-pool-executor==0.0.3 9 | Brlapi==0.6.6 10 | certifi==2018.1.18 11 | chardet==3.0.4 12 | click==6.7 13 | cloudpickle==1.6.0 14 | colorama==0.3.7 15 | command-not-found==0.3 16 | cryptography==2.1.4 17 | cupshelpers==1.0 18 | cycler==0.10.0 19 | dask==1.1.1 20 | dataclasses==0.8 21 | decorator==4.3.2 22 | defer==1.0.6 23 | 
distro-info===0.18ubuntu0.18.04.1 24 | filelock==3.4.1 25 | flake8==3.5.0 26 | Flask==0.12.2 27 | future==0.18.2 28 | gast==0.2.2 29 | gdown==4.5.3 30 | graphviz==0.14.2 31 | grpcio==1.21.1 32 | gym==0.18.0 33 | h5py==2.9.0 34 | httplib2==0.9.2 35 | huggingface-hub==0.4.0 36 | idna==2.6 37 | imageio==2.9.0 38 | importlib-metadata==4.8.3 39 | iopath==0.1.10 40 | iotop==0.6 41 | itsdangerous==0.24 42 | Jinja2==2.10 43 | joblib==0.11 44 | Keras-Applications==1.0.8 45 | Keras-Preprocessing==1.1.0 46 | keyring==10.6.0 47 | keyrings.alt==3.0 48 | kiwisolver==1.3.1 49 | kornia==0.6.1 50 | language-selector==0.1 51 | launchpadlib==1.10.6 52 | lazr.restfulclient==0.13.5 53 | lazr.uri==1.0.3 54 | louis==3.5.0 55 | lpips==0.1.3 56 | macaroonbakery==1.1.3 57 | Mako==1.0.7 58 | Markdown==3.1.1 59 | MarkupSafe==1.0 60 | matplotlib==3.0.2 61 | mccabe==0.6.1 62 | mock==3.0.5 63 | netifaces==0.10.4 64 | networkx==2.2 65 | nltk==3.6.7 66 | nose==1.3.7 67 | numpy==1.19.5 68 | oauth==1.0.1 69 | olefile==0.45.1 70 | packaging==21.2 71 | pandas==1.1.5 72 | patool==1.12 73 | pexpect==4.2.1 74 | Pillow==8.3.1 75 | piq==0.5.5 76 | pluggy==0.6.0 77 | portalocker==2.5.1 78 | protobuf==3.8.0 79 | py==1.5.2 80 | pycairo==1.16.2 81 | pycodestyle==2.3.1 82 | pycrypto==2.6.1 83 | pycups==1.9.73 84 | pyDeprecate==0.3.2 85 | pyflakes==1.6.0 86 | pyglet==1.5.0 87 | PyGObject==3.26.1 88 | pyinotify==0.9.6 89 | pymacaroons==0.13.0 90 | PyNaCl==1.1.2 91 | pyOpenSSL==17.5.0 92 | pyparsing==2.4.7 93 | pyRFC3339==1.0 94 | PySocks==1.7.1 95 | pytest==3.3.2 96 | python-apt==1.6.5+ubuntu0.7 97 | python-dateutil==2.8.2 98 | python-debian==0.1.32 99 | pytorch-ignite==0.2.0 100 | pytorch-msssim==0.2.1 101 | pytz==2018.3 102 | PyWavelets==1.0.1 103 | pyxdg==0.25 104 | PyYAML==6.0 105 | rarfile==4.0 106 | regex==2022.10.31 107 | reportlab==3.4.0 108 | requests==2.18.4 109 | requests-unixsocket==0.1.5 110 | sacremoses==0.0.53 111 | scikit-image==0.14.2 112 | scikit-learn==0.19.1 113 | scipy==1.5.4 114 | screen-resolution-extra==0.0.0 115 | SecretStorage==2.3.1 116 | sentence-transformers==2.2.2 117 | sentencepiece==0.1.97 118 | simplejson==3.13.2 119 | six==1.16.0 120 | soupsieve==2.3.2.post1 121 | ssh-import-id==5.7 122 | system-service==0.3 123 | systemd-python==234 124 | tb-nightly==1.15.0a20190624 125 | tensorboard==1.13.1 126 | tensorboardX==1.5 127 | tensorflow==1.13.1 128 | tensorflow-cpu==0.0.0 129 | tensorflow-estimator==1.13.0 130 | termcolor==1.1.0 131 | tokenizers==0.12.1 132 | toolz==0.9.0 133 | torch==1.9.0 134 | torchaudio==0.7.2 135 | torchmetrics==0.8.2 136 | torchvision==0.10.0 137 | tqdm==4.29.1 138 | transformers==4.18.0 139 | typing-extensions==3.10.0.0 140 | ubuntu-advantage-tools==27.6 141 | ubuntu-drivers-common==0.0.0 142 | ufw==0.36 143 | unattended-upgrades==0.1 144 | urllib3==1.22 145 | usb-creator==0.3.3 146 | virtualenv==15.1.0 147 | wadllib==1.3.2 148 | Werkzeug==0.15.4 149 | xkit==0.0.0 150 | zipp==3.6.0 151 | zope.interface==4.3.2 152 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # A Graph Neural Network Framework For Post Engagement Prediction in Online social Media 2 | Welcome to the "A Graph Neural Network Framework For Post Engagement Prediction in Online Social Media" repository! 3 | 4 | This repository contains two works focused on predicting post engagement in online social media using graph neural networks. 
5 | The goal of this framework is to provide an effective solution for understanding and predicting the engagement of posts on online social media platforms, using the relationships between posts, users, and other network structures. The works presented here showcase the potential of graph neural networks in this field and provide a solid foundation for future research. We hope this repository serves as a valuable resource for the machine learning community.
6 | 
7 | ## Predicting Tweet Engagement with Graph Neural Networks [![Paper](https://img.shields.io/badge/PrePrint-brightgreen)](http://arxiv.org/abs/2305.10103) [![Paper](https://img.shields.io/badge/Paper-red)](https://dl.acm.org/doi/10.1145/3591106.3592294) [![Paper](https://img.shields.io/badge/Poster-blue)](./imgs/Poster_ICMR23.pdf)
8 | Official implementation of the paper: "Predicting Tweet Engagement with Graph Neural Networks"
9 | 
10 | Published in the **ACM International Conference on Multimedia Retrieval 2023 (ICMR2023)**
11 | 
12 | In this paper, we present **TweetGage**, a Graph Neural Network solution for predicting user engagement, based on a novel graph-based model that represents the relationships among posts.
13 | 
14 | 
15 | [Marco Arazzi](https://scholar.google.com/citations?user=8dD5SUkAAAAJ&hl=it&oi=ao),
16 | [Marco Cotogni](https://scholar.google.com/citations?user=8PUz5lAAAAAJ&hl=it),
17 | [Antonino Nocera](https://scholar.google.com/citations?user=YF10PJwAAAAJ&hl=it) and
18 | [Luca Virgili](https://scholar.google.com/citations?hl=it&user=2D771YsAAAAJ)
19 | 
20 | 

21 | 
22 | 
23 | ### Requirements
24 | In order to replicate our results, you can create an environment via Anaconda and install the required packages using pip:
25 | ```
26 | conda create -n TweetGage python=3.9
27 | conda activate TweetGage
28 | pip install -r req.txt
29 | ```
30 | ### Dataset
31 | For our experiments, we considered one week of data from Twitter, from [November 1st to November 7th, 2021](https://archive.org/details/archiveteam-twitter-stream-2021-11), obtained through the Twitter API. The column layout that the rest of the pipeline expects from the resulting tweets CSV is sketched below.
32 | 
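The scripts in this repository read the hydrated tweets from a single CSV file (loaded as `./first_week.csv` by `CreateNetwork.py` and `main.py`). The following is a minimal sketch of the layout they rely on, limited to the columns actually referenced in `CreateNetwork.py` and `utils.py`; your dump may of course contain additional columns.

```
import ast
import pandas as pd

# Columns referenced by CreateNetwork.py and utils.py; the CSV may contain more.
EXPECTED_COLUMNS = [
    "id", "time", "text", "hashtag", "screen_name",
    "favorite", "retweet", "engagement",
    "user_followers", "user_ntweet", "user_following",
    "n_emojis", "official_source",
]

df = pd.read_csv("./first_week.csv", lineterminator="\n")
missing = [c for c in EXPECTED_COLUMNS if c not in df.columns]
assert not missing, f"missing columns: {missing}"

# "hashtag" holds the string form of a Python list and "time" a parsable timestamp,
# which is exactly how CreateNetwork.py consumes them.
df["hashtag"] = df["hashtag"].apply(lambda x: list(set(ast.literal_eval(x))))
df["time"] = pd.to_datetime(df["time"])
print(df[["id", "time", "hashtag", "engagement"]].head())
```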

33 | 34 | 35 |
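Each tweet becomes a node of the network. `CreateNetwork.py` (next section) links two tweets when the newer one is published within `delta` minutes of the older one and the two share at least one hashtag, weighting the edge by the number of shared hashtags; tweets with no such neighbour remain isolated nodes. The toy sketch below, on made-up data, reproduces the rule implemented by `edges_subset`:

```
import datetime
import pandas as pd
import networkx as nx

delta = 15  # minutes, as in CreateNetwork.py

# Three hypothetical tweets: A and B share #ai within 15 minutes, C shares nothing.
df = pd.DataFrame({
    "id": ["A", "B", "C"],
    "time": pd.to_datetime(["2021-11-01 10:00", "2021-11-01 10:10", "2021-11-01 11:00"]),
    "hashtag": [["ai", "nlp"], ["ai"], ["sports"]],
})

edges = []
for _, post in df.iterrows():
    # Tweets published in the delta-minute window before this post...
    window = df[(df["time"] > post["time"] - datetime.timedelta(minutes=delta))
                & (df["time"] < post["time"])]
    for _, prev in window.iterrows():
        shared = len(set(prev["hashtag"]).intersection(post["hashtag"]))
        if shared > 0:  # ...are linked if they share at least one hashtag.
            edges.append((prev["id"], post["id"], shared))

g = nx.Graph()
g.add_weighted_edges_from(edges)
g.add_nodes_from(set(df["id"]).difference(g.nodes))  # keep isolated tweets as nodes
print(list(g.edges(data=True)))  # a single A-B edge with weight 1
print(sorted(g.nodes))           # ['A', 'B', 'C']
```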

36 | 
37 | ### Graph Creation
38 | 
39 | Once the tweets have been downloaded, the graph network can be built and saved as a .pickle file with:
40 | ```
41 | python3 CreateNetwork.py
42 | ```
43 | 
44 | The script will save the graph network as 'network_tweets.pickle'.
45 | 
46 | ### Running the Code
47 | 
48 | Once the graph network has been created, you can replicate the results of our paper by executing the following command in your terminal:
49 | ```
50 | python3 main.py --LOAD_CSV --EXTRACT_BERT --USE_PCA --USER_FEAT --BERT_FEAT --Model_Type 'GCN'
51 | ```
52 | #### Arguments Explanation
53 | 
54 | The following arguments can be passed to the main.py script:
55 | 
56 | - LOAD_CSV: If you have already computed the features in a CSV file, you can load it with this argument. In our code, we load the file "first_week_posts_bert.csv", which contains post features and BERT-extracted text embeddings.
57 | - EXTRACT_BERT: Computes the text embeddings of the posts using BERT (valid only if LOAD_CSV is not provided).
58 | - USE_PCA: If True, applies Principal Component Analysis, keeping 48 projected features that cover more than 80% of the variance of the text features.
59 | - USER_FEAT: If True, includes Post Features in the final feature set.
60 | - BERT_FEAT: If True, includes Text Features in the final feature set.
61 | - Model_Type: Can be one of the following: 'GCN', 'MLP', 'Conv1D', 'GAT', 'XGBOOST'. Default value is 'GCN'.
62 | 
63 | Note: If any of the boolean flags is omitted, its default value is False.
64 | 
65 | ### Results
66 | 
67 | 
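The command in the previous section corresponds to the full TweetGage configuration (GCN over post and BERT features). The baseline models and feature ablations supported by `main.py` can be evaluated with commands of the same form, for example:

```
# GCN on post features only (no BERT embeddings)
python3 main.py --LOAD_CSV --USER_FEAT --Model_Type 'GCN'

# XGBoost baseline on both feature sets
python3 main.py --LOAD_CSV --USER_FEAT --BERT_FEAT --Model_Type 'XGBOOST'

# MLP baseline on the BERT features compressed with PCA
python3 main.py --LOAD_CSV --USE_PCA --BERT_FEAT --Model_Type 'MLP'
```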

68 | 69 | 70 |
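Every configuration is scored by `Evaluation.py`, which prints a classification report, the precision-recall AUC, the ROC AUC, and the confusion matrix for the binary engagement label. A minimal, self-contained sketch of that metric computation on made-up predictions:

```
import numpy as np
from sklearn.metrics import (auc, classification_report, confusion_matrix,
                             precision_recall_curve, roc_auc_score)

# Dummy ground-truth labels and predicted probabilities for the positive class.
y_true = np.array([0, 0, 1, 1, 1, 0])
probs = np.array([0.2, 0.6, 0.8, 0.4, 0.9, 0.1])
y_pred = (probs > 0.5).astype(int)

print(classification_report(y_true, y_pred, labels=[0, 1]))
prec, recall, _ = precision_recall_curve(y_true, probs, pos_label=1)
print("PR-AUC:", auc(recall, prec))
print("ROC-AUC:", roc_auc_score(y_true, probs))
print(confusion_matrix(y_true, y_pred))
```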

71 | 72 | #### References 73 | If this repo is useful to your research or you want to cite our paper please use: 74 | ``` 75 | @inproceedings{ 76 | 10.1145/3591106.3592294, 77 | author = {Arazzi, Marco and Cotogni, Marco and Nocera, Antonino and Virgili, Luca}, 78 | title = {Predicting Tweet Engagement with Graph Neural Networks}, 79 | year = {2023}, 80 | booktitle = {Proceedings of the 2023 ACM International Conference on Multimedia Retrieval}, 81 | pages = {172–180}, 82 | numpages = {9}, 83 | location = {Thessaloniki, Greece}, 84 | series = {ICMR '23} 85 | } 86 | ``` 87 | 88 | ## Available Soon... 89 | -------------------------------------------------------------------------------- /models/GAT.py: -------------------------------------------------------------------------------- 1 | from tensorflow import keras 2 | from tensorflow.keras import layers 3 | import tensorflow as tf 4 | 5 | 6 | class GraphAttention(layers.Layer): 7 | def __init__( 8 | self, 9 | units, 10 | kernel_initializer="glorot_uniform", 11 | kernel_regularizer=None, 12 | **kwargs, 13 | ): 14 | super().__init__(**kwargs) 15 | self.units = units 16 | self.kernel_initializer = keras.initializers.get(kernel_initializer) 17 | self.kernel_regularizer = keras.regularizers.get(kernel_regularizer) 18 | 19 | def build(self, input_shape): 20 | self.kernel = self.add_weight( 21 | shape=(input_shape[0][-1], self.units), 22 | trainable=True, 23 | initializer=self.kernel_initializer, 24 | regularizer=self.kernel_regularizer, 25 | name="kernel", 26 | ) 27 | self.kernel_attention = self.add_weight( 28 | shape=(self.units * 2, 1), 29 | trainable=True, 30 | initializer=self.kernel_initializer, 31 | regularizer=self.kernel_regularizer, 32 | name="kernel_attention", 33 | ) 34 | self.built = True 35 | 36 | def call(self, inputs): 37 | node_states, edges = inputs 38 | 39 | # Linearly transform node states 40 | node_states_transformed = tf.matmul(node_states, self.kernel) 41 | 42 | # (1) Compute pair-wise attention scores 43 | node_states_expanded = tf.gather(node_states_transformed, edges) 44 | node_states_expanded = tf.reshape( 45 | node_states_expanded, (tf.shape(edges)[0], -1) 46 | ) 47 | attention_scores = tf.nn.leaky_relu( 48 | tf.matmul(node_states_expanded, self.kernel_attention) 49 | ) 50 | attention_scores = tf.squeeze(attention_scores, -1) 51 | 52 | # (2) Normalize attention scores 53 | attention_scores = tf.math.exp(tf.clip_by_value(attention_scores, -2, 2)) 54 | attention_scores_sum = tf.math.unsorted_segment_sum( 55 | data=attention_scores, 56 | segment_ids=edges[:, 0], 57 | num_segments=tf.reduce_max(edges[:, 0]) + 1, 58 | ) 59 | attention_scores_sum = tf.repeat( 60 | attention_scores_sum, tf.math.bincount(tf.cast(edges[:, 0], "int32")) 61 | ) 62 | attention_scores_norm = attention_scores / attention_scores_sum 63 | 64 | # (3) Gather node states of neighbors, apply attention scores and aggregate 65 | node_states_neighbors = tf.gather(node_states_transformed, edges[:, 1]) 66 | out = tf.math.unsorted_segment_sum( 67 | data=node_states_neighbors * attention_scores_norm[:, tf.newaxis], 68 | segment_ids=edges[:, 0], 69 | num_segments=tf.shape(node_states)[0], 70 | ) 71 | return out 72 | 73 | 74 | class MultiHeadGraphAttention(layers.Layer): 75 | def __init__(self, units, num_heads=8, merge_type="concat", **kwargs): 76 | super().__init__(**kwargs) 77 | self.num_heads = num_heads 78 | self.merge_type = merge_type 79 | self.attention_layers = [GraphAttention(units) for _ in range(num_heads)] 80 | 81 | def call(self, inputs): 82 | 
atom_features, pair_indices = inputs 83 | 84 | # Obtain outputs from each attention head 85 | outputs = [ 86 | attention_layer([atom_features, pair_indices]) 87 | for attention_layer in self.attention_layers 88 | ] 89 | # Concatenate or average the node states from each head 90 | if self.merge_type == "concat": 91 | outputs = tf.concat(outputs, axis=-1) 92 | else: 93 | outputs = tf.reduce_mean(tf.stack(outputs, axis=-1), axis=-1) 94 | # Activate and return node states 95 | return tf.nn.relu(outputs) 96 | 97 | 98 | class GraphAttentionNetwork(keras.Model): 99 | def __init__( 100 | self, 101 | node_states, 102 | edges, 103 | hidden_units, 104 | num_heads, 105 | num_layers, 106 | output_dim, 107 | **kwargs, 108 | ): 109 | super().__init__(**kwargs) 110 | self.node_states = node_states 111 | self.edges = edges 112 | self.preprocess = layers.Dense(hidden_units * num_heads, activation="relu") 113 | self.attention_layers = [ 114 | MultiHeadGraphAttention(hidden_units, num_heads) for _ in range(num_layers) 115 | ] 116 | self.output_layer = layers.Dense(output_dim) 117 | 118 | def call(self, inputs): 119 | node_states, edges = inputs 120 | x = self.preprocess(node_states) 121 | for attention_layer in self.attention_layers: 122 | x = attention_layer([x, edges]) + x 123 | outputs = self.output_layer(x) 124 | return outputs 125 | 126 | def train_step(self, data): 127 | indices, labels = data 128 | 129 | with tf.GradientTape() as tape: 130 | # Forward pass 131 | outputs = self([self.node_states, self.edges]) 132 | # Compute loss 133 | loss = self.compiled_loss(labels, tf.gather(outputs, indices)) 134 | # Compute gradients 135 | grads = tape.gradient(loss, self.trainable_weights) 136 | # Apply gradients (update weights) 137 | optimizer.apply_gradients(zip(grads, self.trainable_weights)) 138 | # Update metric(s) 139 | self.compiled_metrics.update_state(labels, tf.gather(outputs, indices)) 140 | 141 | return {m.name: m.result() for m in self.metrics} 142 | 143 | def predict_step(self, data): 144 | indices = data 145 | # Forward pass 146 | outputs = self([self.node_states, self.edges]) 147 | # Compute probabilities 148 | return tf.nn.softmax(tf.gather(outputs, indices)) 149 | 150 | def test_step(self, data): 151 | indices, labels = data 152 | # Forward pass 153 | outputs = self([self.node_states, self.edges]) 154 | # Compute loss 155 | loss = self.compiled_loss(labels, tf.gather(outputs, indices)) 156 | # Update metric(s) 157 | self.compiled_metrics.update_state(labels, tf.gather(outputs, indices)) 158 | 159 | return {m.name: m.result() for m in self.metrics} 160 | 161 | 162 | def create_GAT(node_states, edges, hidden_units, num_heads, num_layers, num_classes): 163 | gat_model = GraphAttentionNetwork( 164 | node_states, edges.T, hidden_units, num_heads, num_layers, num_classes 165 | ) 166 | return gat_model 167 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sentence_transformers import SentenceTransformer 3 | import gc 4 | from sklearn.decomposition import PCA 5 | from sklearn.model_selection import train_test_split 6 | from Training import run_experiment, run_experiment_XGB 7 | from Evaluation import evaluate, evaluate_XGB 8 | from utils import normalize, eng_class, sampling_k_elements, extract_graph 9 | import numpy as np 10 | import networkx as nx 11 | from tensorflow import keras 12 | from keras.utils import to_categorical 13 | import random 14 
| from models.Xgboost import create_XGB 15 | from models.Conv1D import create_Conv1D 16 | from models.GAT import create_GAT 17 | from models.GCN import create_GCN 18 | from models.MLP import create_MLP 19 | import argparse 20 | 21 | 22 | def parse_args(): 23 | parser = argparse.ArgumentParser("TweetGage Params") 24 | a = parser.add_argument 25 | a('--LOAD_CSV', action='store_true') 26 | a('--EXTRACT_BERT', action='store_true') 27 | a('--USE_PCA', action='store_true') 28 | a('--USER_FEAT', action='store_true') 29 | a('--BERT_FEAT', action='store_true') 30 | a('--Model_Type', default='GCN', type=str) 31 | return parser.parse_args() 32 | 33 | 34 | def reset_random_seeds(): 35 | os.environ['PYTHONHASHSEED'] = str(2) 36 | tf.random.set_seed(2) 37 | np.random.seed(2) 38 | random.seed(2) 39 | 40 | 41 | def select_params(Model_type, X_train, y_train, X_test, y_test, df, g, num_classes=2, num_epochs=300): 42 | num_classes = num_classes 43 | num_epochs = num_epochs 44 | dropout_rate = None 45 | num_layers = None 46 | num_heads = None 47 | if Model_type == 'GCN': 48 | hidden_units = [16] 49 | dropout_rate = 0.3 50 | learning_rate = 0.1 51 | batch_size = 256 52 | input = np.array(X_train.index) 53 | target = to_categorical(y_train) 54 | loss = keras.losses.CategoricalCrossentropy 55 | optimizer = keras.optimizers.Adam 56 | input_test = np.array(X_test.index) 57 | target_test = y_test 58 | graph_info = extract_graph(g, df) 59 | model = create_GCN(graph_info, num_classes, hidden_units, dropout_rate) 60 | if Model_type == 'MLP': 61 | hidden_units = [32, 32] 62 | learning_rate = 0.01 63 | dropout_rate = 0.5 64 | batch_size = 256 65 | loss = keras.losses.CategoricalCrossentropy 66 | input = X_train 67 | target = to_categorical(y_train) 68 | input_test = X_test 69 | target_test = y_test 70 | optimizer = keras.optimizers.Adam 71 | model = create_MLP(X_train.shape[1], hidden_units, num_classes, dropout_rate) 72 | if Model_type == 'Conv1D': 73 | hidden_units = 64 74 | learning_rate = 0.1 75 | batch_size = 256 76 | model = create_Conv1D(num_classes, hidden_units, X_train.shape[1]) 77 | input = X_train.values.reshape(-1, X_train.shape[1], 1) 78 | loss = keras.losses.CategoricalCrossentropy 79 | target = to_categorical(y_train) 80 | optimizer = keras.optimizers.Adam 81 | input_test = X_test 82 | target_test = y_test 83 | if Model_type == 'GAT': 84 | hidden_units = 100 85 | num_heads = 2 86 | num_layers = 1 87 | batch_size = 64 88 | learning_rate = 1e-2 89 | graph_info = extract_graph(g, df) 90 | input = np.array(X_train.index) 91 | target = to_categorical(y_train) 92 | model = create_GAT(graph_info[0], graph_info[1].T, hidden_units, num_heads, num_layers, num_classes) 93 | loss = keras.losses.CategoricalCrossentropy 94 | optimizer = keras.optimizers.SGD 95 | input_test = np.array(X_test.index) 96 | target_test = y_test 97 | if Model_type == 'XGBOOST': 98 | max_depth = 8 99 | learning_rate = 0.025 100 | subsample = 0.85 101 | colsample_bytree = 0.35 102 | eval_metric = 'logloss' 103 | objective = 'binary:logistic' 104 | tree_method = 'gpu_hist' 105 | seed = 1 106 | model = create_XGB(max_depth, learning_rate, subsample, 107 | colsample_bytree, eval_metric, objective, 108 | tree_method, seed) 109 | return model 110 | return hidden_units, num_classes, learning_rate, num_epochs, dropout_rate, batch_size, num_layers, num_heads, input, target, loss, optimizer, input_test, target_test, model 111 | 112 | 113 | def main(LOAD_CSV=False, EXTRACT_BERT=True, USE_PCA=False, USER_FEAT=True, BERT_FEAT=True, Model_Type='GCN'): 
114 | reset_random_seeds() 115 | g = nx.read_gpickle('./network_tweets.pickle') 116 | print("POST:", len(g.nodes)) 117 | print("ARCS:", len(g.edges)) 118 | print("COMPONENTS:", nx.number_connected_components(g)) 119 | if not LOAD_CSV: 120 | df = pd.read_csv("./first_week.csv", lineterminator="\n") 121 | df["class"] = df["engagement"].apply(lambda x: eng_class(x)) 122 | df = df.groupby('class').apply(sampling_k_elements).reset_index(drop=True) 123 | if EXTRACT_BERT: 124 | model = SentenceTransformer('efederici/sentence-bert-base') 125 | emb = model.encode(df["text"]) 126 | if USE_PCA: 127 | pca = PCA(n_components=48) 128 | pca.fit(emb) 129 | emb = pca.transform(emb) 130 | df = pd.concat([df, pd.DataFrame(emb)], axis=1) 131 | del emb, model 132 | gc.collect() 133 | df = normalize(df) 134 | else: 135 | df = pd.read_csv("./first_week_posts_bert.csv") 136 | if USER_FEAT and not BERT_FEAT: 137 | df = df.iloc[:, 0:11] 138 | if not USER_FEAT and BERT_FEAT: 139 | df = df.iloc[:, 10:] 140 | if USE_PCA: 141 | pca = PCA(n_components=48) 142 | print('PCA 48 Components') 143 | pca.fit(df.drop(["class"], axis=1)) 144 | emb = pca.transform(df.drop(["class"], axis=1)) 145 | df = pd.concat([pd.DataFrame(emb), df[["class"]]], axis=1) 146 | 147 | X_train, X_test, y_train, y_test = train_test_split(df.drop(["class"], axis=1), df["class"], test_size=0.2, 148 | random_state=42, stratify=df["class"]) 149 | if not Model_Type == 'XGBOOST': 150 | hidden_units, num_classes, learning_rate, num_epochs, dropout_rate, batch_size, num_layers, \ 151 | num_heads, input, target, loss, optimizer, input_test, target_test, model = select_params(Model_Type, X_train, 152 | y_train, X_test, 153 | y_test, 154 | df, 155 | g, 156 | num_epochs=300) 157 | run_experiment(model, input, target, learning_rate, loss, num_epochs, batch_size, optimizer) 158 | evaluate(model, input_test, target_test) 159 | else: 160 | model = select_params(Model_Type, X_train, y_train, X_test, y_test, df, g, 161 | num_epochs=300) 162 | obj = run_experiment_XGB(model, X_train, y_train) 163 | evaluate_XGB(obj, X_test, y_test) 164 | 165 | 166 | if __name__ == '__main__': 167 | args = vars(parse_args()) 168 | main(*list(args.values)) 169 | -------------------------------------------------------------------------------- /models/GCN.py: -------------------------------------------------------------------------------- 1 | from tensorflow.keras import layers 2 | from utils import create_ffn 3 | import tensorflow as tf 4 | 5 | 6 | class GraphConvLayer(layers.Layer): 7 | def __init__( 8 | self, 9 | hidden_units, 10 | dropout_rate=0.2, 11 | aggregation_type="mean", 12 | combination_type="concat", 13 | normalize=False, 14 | *args, 15 | **kwargs, 16 | ): 17 | super(GraphConvLayer, self).__init__(*args, **kwargs) 18 | 19 | self.aggregation_type = aggregation_type 20 | self.combination_type = combination_type 21 | self.normalize = normalize 22 | 23 | self.ffn_prepare = create_ffn(hidden_units, dropout_rate) 24 | if self.combination_type == "gated": 25 | self.update_fn = layers.GRU( 26 | units=hidden_units, 27 | activation="tanh", 28 | recurrent_activation="sigmoid", 29 | dropout=dropout_rate, 30 | return_state=True, 31 | recurrent_dropout=dropout_rate, 32 | ) 33 | else: 34 | self.update_fn = create_ffn(hidden_units, dropout_rate) 35 | 36 | def prepare(self, node_repesentations, weights=None): 37 | # node_repesentations shape is [num_edges, embedding_dim]. 
38 | messages = self.ffn_prepare(node_repesentations) 39 | if weights is not None: 40 | messages = messages * tf.expand_dims(weights, -1) 41 | return messages 42 | 43 | def aggregate(self, node_indices, neighbour_messages, node_repesentations): 44 | # node_indices shape is [num_edges]. 45 | # neighbour_messages shape: [num_edges, representation_dim]. 46 | # node_repesentations shape is [num_nodes, representation_dim]. 47 | num_nodes = node_repesentations.shape[0] 48 | if self.aggregation_type == "sum": 49 | aggregated_message = tf.math.unsorted_segment_sum( 50 | neighbour_messages, node_indices, num_segments=num_nodes 51 | ) 52 | elif self.aggregation_type == "mean": 53 | aggregated_message = tf.math.unsorted_segment_mean( 54 | neighbour_messages, node_indices, num_segments=num_nodes 55 | ) 56 | elif self.aggregation_type == "max": 57 | aggregated_message = tf.math.unsorted_segment_max( 58 | neighbour_messages, node_indices, num_segments=num_nodes 59 | ) 60 | else: 61 | raise ValueError(f"Invalid aggregation type: {self.aggregation_type}.") 62 | 63 | return aggregated_message 64 | 65 | def update(self, node_repesentations, aggregated_messages): 66 | # node_repesentations shape is [num_nodes, representation_dim]. 67 | # aggregated_messages shape is [num_nodes, representation_dim]. 68 | if self.combination_type == "gru": 69 | # Create a sequence of two elements for the GRU layer. 70 | h = tf.stack([node_repesentations, aggregated_messages], axis=1) 71 | elif self.combination_type == "concat": 72 | # Concatenate the node_repesentations and aggregated_messages. 73 | h = tf.concat([node_repesentations, aggregated_messages], axis=1) 74 | elif self.combination_type == "add": 75 | # Add node_repesentations and aggregated_messages. 76 | h = node_repesentations + aggregated_messages 77 | else: 78 | raise ValueError(f"Invalid combination type: {self.combination_type}.") 79 | 80 | # Apply the processing function. 81 | node_embeddings = self.update_fn(h) 82 | if self.combination_type == "gru": 83 | node_embeddings = tf.unstack(node_embeddings, axis=1)[-1] 84 | 85 | if self.normalize: 86 | node_embeddings = tf.nn.l2_normalize(node_embeddings, axis=-1) 87 | return node_embeddings 88 | 89 | def call(self, inputs): 90 | """Process the inputs to produce the node_embeddings. 91 | 92 | inputs: a tuple of three elements: node_repesentations, edges, edge_weights. 93 | Returns: node_embeddings of shape [num_nodes, representation_dim]. 94 | """ 95 | 96 | node_repesentations, edges, edge_weights = inputs 97 | # Get node_indices (source) and neighbour_indices (target) from edges. 98 | node_indices, neighbour_indices = edges[0], edges[1] 99 | # neighbour_repesentations shape is [num_edges, representation_dim]. 100 | neighbour_repesentations = tf.gather(node_repesentations, neighbour_indices) 101 | 102 | # Prepare the messages of the neighbours. 103 | neighbour_messages = self.prepare(neighbour_repesentations, edge_weights) 104 | # Aggregate the neighbour messages. 105 | aggregated_messages = self.aggregate( 106 | node_indices, neighbour_messages, node_repesentations 107 | ) 108 | # Update the node embedding with the neighbour messages. 
109 | return self.update(node_repesentations, aggregated_messages) 110 | 111 | 112 | class GNNNodeRegression(tf.keras.Model): 113 | def __init__( 114 | self, 115 | graph_info, 116 | hidden_units, 117 | num_classes, 118 | aggregation_type="sum", 119 | combination_type="concat", 120 | dropout_rate=0.2, 121 | normalize=True, 122 | *args, 123 | **kwargs, 124 | ): 125 | super(GNNNodeRegression, self).__init__(*args, **kwargs) 126 | 127 | # Unpack graph_info to three elements: node_features, edges, and edge_weight. 128 | node_features, edges, edge_weights = graph_info 129 | self.node_features = node_features 130 | self.edges = edges 131 | self.edge_weights = edge_weights 132 | # Set edge_weights to ones if not provided. 133 | if self.edge_weights is None: 134 | self.edge_weights = tf.ones(shape=edges.shape[1]) 135 | # Scale edge_weights to sum to 1. 136 | self.edge_weights = self.edge_weights / tf.math.reduce_sum(self.edge_weights) 137 | 138 | # Create a process layer. 139 | self.preprocess = create_ffn(hidden_units, dropout_rate, name="preprocess") 140 | # Create the first GraphConv layer. 141 | self.conv1 = GraphConvLayer( 142 | hidden_units, 143 | dropout_rate, 144 | aggregation_type, 145 | combination_type, 146 | normalize, 147 | name="graph_conv1", 148 | ) 149 | # Create the second GraphConv layer. 150 | self.conv2 = GraphConvLayer( 151 | hidden_units, 152 | dropout_rate, 153 | aggregation_type, 154 | combination_type, 155 | normalize, 156 | name="graph_conv2", 157 | ) 158 | # Create a postprocess layer. 159 | self.postprocess = create_ffn(hidden_units, dropout_rate, name="postprocess") 160 | # Create a compute logits layer. 161 | self.compute_logits = layers.Dense(units=num_classes, name="logits") 162 | 163 | def call(self, input_node_indices): 164 | # Preprocess the node_features to produce node representations. 165 | x = self.preprocess(self.node_features) 166 | # Apply the first graph conv layer. 167 | x = self.conv1((x, self.edges, self.edge_weights)) 168 | # Skip connection. 169 | # x = x1 + x 170 | # Apply the second graph conv layer. 171 | x = self.conv2((x, self.edges, self.edge_weights)) 172 | # Skip connection. 173 | # x = x2 + x 174 | # Postprocess node embedding. 175 | x = self.postprocess(x) 176 | # Fetch node embeddings for the input node_indices. 177 | node_embeddings = tf.gather(x, input_node_indices) 178 | # Compute logits 179 | return self.compute_logits(node_embeddings) 180 | 181 | 182 | def create_GCN(graph_info, num_classes, hidden_units, dropout_rate): 183 | # SUM, CONCAT DA NON TOCCARE MAI E' LA MIGLIORE 184 | gnn_model = GNNNodeRegression( 185 | num_classes=num_classes, 186 | aggregation_type="sum", 187 | combination_type="concat", 188 | graph_info=graph_info, 189 | hidden_units=hidden_units, 190 | dropout_rate=dropout_rate, 191 | name="gnn_model", 192 | ) 193 | return gnn_model 194 | --------------------------------------------------------------------------------