├── imgs
│   ├── gr1.png
│   ├── gr2.png
│   ├── res1.png
│   ├── res2.png
│   ├── teaser.png
│   └── Poster_ICMR23.pdf
├── models
│   ├── Conv1D.py
│   ├── MLP.py
│   ├── Xgboost.py
│   ├── GAT.py
│   └── GCN.py
├── Evaluation.py
├── Training.py
├── utils.py
├── CreateNetwork.py
├── req.txt
├── README.md
└── main.py
/imgs/gr1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OcraM17/EngagementGNN/HEAD/imgs/gr1.png
--------------------------------------------------------------------------------
/imgs/gr2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OcraM17/EngagementGNN/HEAD/imgs/gr2.png
--------------------------------------------------------------------------------
/imgs/res1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OcraM17/EngagementGNN/HEAD/imgs/res1.png
--------------------------------------------------------------------------------
/imgs/res2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OcraM17/EngagementGNN/HEAD/imgs/res2.png
--------------------------------------------------------------------------------
/imgs/teaser.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OcraM17/EngagementGNN/HEAD/imgs/teaser.png
--------------------------------------------------------------------------------
/imgs/Poster_ICMR23.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OcraM17/EngagementGNN/HEAD/imgs/Poster_ICMR23.pdf
--------------------------------------------------------------------------------
/models/Conv1D.py:
--------------------------------------------------------------------------------
1 | from keras.layers import Dense, Conv1D, Flatten, MaxPooling1D
2 | from keras.models import Sequential
3 |
4 |
5 | def create_Conv1D(num_classes, hidden, input_dim):
6 | model = Sequential()
7 | model.add(Conv1D(hidden, 1, activation="tanh", input_shape=(input_dim, 1)))
8 | # model.add(Conv1D(128, 1, strides=2, activation="relu"))
9 | # model.add(Conv1D(256, 1, strides=2, activation="relu"))
10 | # model.add(Conv1D(512, 1, strides=2, activation="relu"))
11 | # model.add(Conv1D(1024, 1, strides=2, activation="relu"))
12 | model.add(MaxPooling1D())
13 | model.add(Flatten())
14 | model.add(Dense(num_classes, name='logits'))
15 | return model
16 |
--------------------------------------------------------------------------------
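
A minimal usage sketch, mirroring the Conv1D configuration in main.py (the feature dimension 58 is a hypothetical placeholder for X_train.shape[1]):
```
from tensorflow import keras
from models.Conv1D import create_Conv1D

# Hypothetical feature dimension of 58; main.py passes X_train.shape[1]
model = create_Conv1D(2, 64, 58)
model.compile(optimizer=keras.optimizers.Adam(0.1),
              loss=keras.losses.CategoricalCrossentropy(from_logits=True),
              metrics=["accuracy"])
model.summary()
```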
/models/MLP.py:
--------------------------------------------------------------------------------
1 | from tensorflow import keras
2 | from tensorflow.keras import layers
3 | from utils import create_ffn
4 |
5 |
6 | def MLP(input_shape, hidden_units, num_classes, dropout_rate=0.2):
7 | inputs = layers.Input(shape=(input_shape,), name="input_features")
8 | x = create_ffn(hidden_units, dropout_rate, name="ffn_block1")(inputs)
9 | for block_idx in range(4):
10 | # Create an FFN block.
11 | x1 = create_ffn(hidden_units, dropout_rate, name=f"ffn_block{block_idx + 2}")(x)
12 | # Add skip connection.
13 | x = layers.Add(name=f"skip_connection{block_idx + 2}")([x, x1])
14 | # Compute logits.
15 | logits = layers.Dense(num_classes, name="logits")(x)
16 | # Create the model.
17 | return keras.Model(inputs=inputs, outputs=logits, name="baseline")
18 |
19 |
20 | def create_MLP(input_shape, hidden_units, num_classes, dropout_rate):
21 | model = MLP(input_shape, hidden_units, num_classes, dropout_rate)
22 | return model
23 |
--------------------------------------------------------------------------------
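
A minimal usage sketch, mirroring the MLP configuration in main.py (the input dimension 58 is a hypothetical placeholder):
```
from tensorflow import keras
from models.MLP import create_MLP

# Two hidden blocks of 32 units with dropout 0.5, as configured in main.py
model = create_MLP(input_shape=58, hidden_units=[32, 32], num_classes=2, dropout_rate=0.5)
model.compile(optimizer=keras.optimizers.Adam(0.01),
              loss=keras.losses.CategoricalCrossentropy(from_logits=True),
              metrics=["accuracy"])
model.summary()
```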
/Evaluation.py:
--------------------------------------------------------------------------------
1 | from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve, auc, roc_auc_score
2 | import numpy as np
3 | from scipy.special import softmax
4 | import xgboost as xgb
5 |
6 |
7 | def evaluate(model, X_test, y_test):
8 | logits = model.predict(X_test)
9 | probs = softmax(logits, axis=1)
10 | classes = np.argmax(probs, axis=1)
11 | print_metrics(classes, y_test, probs[:,1])
12 |
13 |
14 | def evaluate_XGB(obj, X_test, y_test):
15 | dtest = xgb.DMatrix(data=X_test)
16 | probs = obj.predict(dtest)
17 | classes = probs.copy()
18 | classes[classes > 0.5] = 1
19 | classes[classes <= 0.5] = 0
20 | print_metrics(classes, y_test, probs)
21 |
22 |
23 | def print_metrics(classes, y_test, probs):
24 | print(classification_report(y_test, classes, labels=[0, 1]))
25 | prec, recall, thr = precision_recall_curve(y_test, probs, pos_label=1)
26 | pr_auc = auc(recall, prec)
27 | print("PR-AUC:", pr_auc)
28 | roc_auc = roc_auc_score(y_test, probs)
29 | print("ROC-AUC:", roc_auc)
30 | print(confusion_matrix(y_test, classes))
31 |
--------------------------------------------------------------------------------
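
A quick sanity check of print_metrics on hypothetical toy values (predicted classes, ground-truth labels, and positive-class probabilities):
```
import numpy as np
from Evaluation import print_metrics

# Hypothetical toy data: six samples
y_true = np.array([0, 1, 1, 0, 1, 0])
y_pred = np.array([0, 1, 0, 0, 1, 1])
probs = np.array([0.1, 0.9, 0.4, 0.2, 0.8, 0.6])  # probability of the positive class
print_metrics(y_pred, y_true, probs)  # prints the report, PR-AUC, ROC-AUC and confusion matrix
```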
/Training.py:
--------------------------------------------------------------------------------
1 | from tensorflow import keras
2 | import xgboost as xgb
3 |
4 |
5 | def run_experiment(model, x_train, y_train, learning_rate, loss, num_epochs, batch_size, optimizer):
6 | # Compile the model.
7 | model.compile(
8 | optimizer=optimizer(learning_rate),
9 | loss=loss(from_logits=True),
10 | metrics=['accuracy'],
11 | )
12 | # Create an early stopping callback.
13 | early_stopping = keras.callbacks.EarlyStopping(
14 | monitor="val_loss", patience=5, restore_best_weights=True
15 | )
16 | reduce_lr = keras.callbacks.ReduceLROnPlateau(
17 | patience=2
18 | )
19 | # Fit the model.
20 | history = model.fit(
21 | x=x_train,
22 | y=y_train,
23 | epochs=num_epochs,
24 | batch_size=batch_size,
25 | validation_split=0.15,
26 | callbacks=[early_stopping, reduce_lr],
27 | )
28 | return history
29 |
30 |
31 | def run_experiment_XGB(model, x_train, y_train):
32 | dtrain = xgb.DMatrix(data=x_train, label=y_train)
33 |
34 | obj = xgb.train(model.__getparams__(),
35 | dtrain=dtrain,
36 | num_boost_round=500,
37 | )
38 | return obj
39 |
--------------------------------------------------------------------------------
/models/Xgboost.py:
--------------------------------------------------------------------------------
1 | def create_XGB(max_depth=8, learning_rate=0.025, subsample=0.85,
2 | colsample_bytree=0.35, eval_metric='logloss', objective='binary:logistic',
3 | tree_method='gpu_hist', seed=1):
4 | return xgBoost(max_depth=max_depth, learning_rate=learning_rate, subsample=subsample,
5 | colsample_bytree=colsample_bytree, eval_metric=eval_metric, objective=objective,
6 | tree_method=tree_method, seed=seed)
7 |
8 |
9 | class xgBoost():
10 | def __init__(self, max_depth=8, learning_rate=0.025, subsample=0.85,
11 | colsample_bytree=0.35, eval_metric='logloss', objective='binary:logistic',
12 | tree_method='gpu_hist', seed=1):
13 | self.max_depth = max_depth
14 | self.learning_rate = learning_rate
15 | self.subsample = subsample
16 | self.colsample_bytree = colsample_bytree
17 | self.eval_metric = eval_metric
18 | self.objective = objective
19 | self.tree_method = tree_method
20 | self.seed = seed
21 |
22 | def __getparams__(self):
23 | dict_ = {
24 | 'max_depth': self.max_depth,
25 | 'learning_rate': self.learning_rate,
26 | 'subsample': self.subsample,
27 | 'colsample_bytree': self.colsample_bytree,
28 | 'eval_metric': self.eval_metric,
29 | 'objective': self.objective,
30 | 'tree_method': self.tree_method,
31 | 'seed': self.seed,
32 | }
33 | return dict_
34 |
--------------------------------------------------------------------------------
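
A minimal end-to-end sketch of the XGBoost path on hypothetical random data; the default tree_method='gpu_hist' assumes a CUDA-capable GPU, so the sketch passes 'hist' to stay CPU-friendly:
```
import numpy as np
from models.Xgboost import create_XGB
from Training import run_experiment_XGB
from Evaluation import evaluate_XGB

# Hypothetical random data with 10 features
X_train, y_train = np.random.rand(200, 10), np.random.randint(0, 2, 200)
X_test, y_test = np.random.rand(50, 10), np.random.randint(0, 2, 50)

model = create_XGB(tree_method='hist')  # 'hist' runs on CPU; the code defaults to 'gpu_hist'
booster = run_experiment_XGB(model, X_train, y_train)
evaluate_XGB(booster, X_test, y_test)
```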
/utils.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | from tensorflow.keras import layers
3 | from tensorflow import keras
4 | import numpy as np
5 | import networkx as nx
6 |
7 |
8 | def eng_class(x):
9 | if x <= 0:
10 | return 0
11 | else:
12 | return 1
13 |
14 |
15 | def sampling_k_elements(group, k=30000):
16 | if len(group) < k:
17 | return group
18 | return group.sample(k)
19 |
20 |
21 | def create_ffn(hidden_units, dropout_rate, name=None):
22 | fnn_layers = []
23 |
24 | for units in hidden_units:
25 | fnn_layers.append(layers.BatchNormalization())
26 | fnn_layers.append(layers.Dropout(dropout_rate))
27 | fnn_layers.append(layers.Dense(units, activation=tf.nn.gelu))
28 |
29 | return keras.Sequential(fnn_layers, name=name)
30 |
31 |
32 | def normalize(df):
33 | df["user_followers"] = np.log10(df["user_followers"] + 1e-5)
34 | df["user_ntweet"] = np.log10(df["user_ntweet"] + 1e-5)
35 | df = df.drop(["hashtag", "text", "time", "screen_name", "favorite", "engagement", "retweet", "id"], axis=1)
36 | for col in df.columns:
37 | if not isinstance(df[col].values[0], str):
38 | df[col] = (df[col] - df[col].min()) / (df[col].max() - df[col].min())
39 | return df
40 |
41 |
42 | def extract_graph(g, df):
43 | mapping_graph = {k: v for v, k in enumerate(g.nodes)}
44 | g = nx.relabel_nodes(g, mapping_graph)
45 | edges = np.array(list(g.edges)).T
46 | edges_weight = [x[2]["weight"] for x in g.edges(data=True)]
47 | features_names = list(set(df.columns) - {"n_emojis", "user_following", "official_source", "class"})
48 | node_features = tf.cast(
49 | df.sort_index()[features_names].to_numpy(), dtype=tf.dtypes.float32
50 | )
51 | graph_info = (node_features, edges, edges_weight)
52 | return graph_info
53 |
--------------------------------------------------------------------------------
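
A small sketch of the FFN block returned by create_ffn (each entry of hidden_units adds a BatchNormalization, Dropout, GELU Dense stack); the input dimension 16 is a hypothetical placeholder:
```
from utils import create_ffn

# BatchNormalization -> Dropout -> Dense(GELU) repeated for each entry of hidden_units
block = create_ffn(hidden_units=[32, 32], dropout_rate=0.5, name="ffn_demo")
block.build(input_shape=(None, 16))
block.summary()
```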
/CreateNetwork.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import ast
3 | import networkx as nx
4 | import datetime
5 | from joblib import Parallel, delayed
6 | from tqdm.auto import tqdm
7 | import numpy as np
8 |
9 | # settings (define the path to the tweets dataset)
10 | delta = 15
11 | path_to_tweets = "./first_week.csv"
12 | n_jobs = 8
13 |
14 | '''
15 | Function to compute the edges on a slice of the tweets dataframe.
16 | Returns the corresponding list of edges
17 | '''
18 |
19 |
20 | def edges_subset(split_df, delta=15):
21 | # filter by time
22 | edges = []
23 | for _, post in split_df.iterrows():
24 | sub_df = df.loc[((df["time"]) > post["time"] - datetime.timedelta(minutes=delta)) &
25 | (df["time"] < post["time"])].copy(deep=True)
26 | sub_df["connected"] = sub_df["hashtag"].apply(lambda x: len(set(x).intersection(post["hashtag"])))
27 | sub_df = sub_df.loc[sub_df["connected"] > 0]
28 | edges = edges + [(row["id"], post["id"], row["connected"]) for _, row in sub_df.iterrows()]
29 | return edges
30 |
31 |
32 | # LOAD DATA
33 | df = pd.read_csv(path_to_tweets, lineterminator='\n')
34 | df["hashtag"] = df["hashtag"].apply(lambda x: list(set(ast.literal_eval(x))))
35 | df["time"] = pd.to_datetime(df["time"])
36 |
37 | # COMPUTE EDGES using Parallel jobs. It works on dataframe splits
38 | all_edges = Parallel(n_jobs=n_jobs)(delayed(edges_subset)(split_df, delta=delta) for split_df in tqdm(np.array_split(df, 100)))
39 | all_edges = [y for x in all_edges for y in x]
40 |
41 | # CREATE GRAPH
42 | graph = nx.Graph()
43 | # add weighted edges
44 | graph.add_weighted_edges_from(all_edges)
45 | # add isolated nodes
46 | isolated = set(df["id"]).difference(list(graph.nodes))
47 | graph.add_nodes_from(isolated)
48 | # add node attributes
49 | nx.set_node_attributes(graph, df.loc[df["id"].isin(list(graph.nodes))].set_index("id").to_dict(orient="index"))
50 | print("NODES:", len(graph.nodes))
51 | print("EDGES:", len(graph.edges))
52 | print("DENSITY:", nx.density(graph))
53 | print("NUM CONNECTED COMPONENTS:", len([len(c) for c in sorted(nx.connected_components(graph), key=len, reverse=True)]))
54 | print("MAX CONNECTED COMPONENT:", max([len(c) for c in sorted(nx.connected_components(graph), key=len, reverse=True)]))
55 |
56 | # derive the dataset name from the input path (currently unused; the graph is always saved as network_tweets.pickle)
57 | if "/" in path_to_tweets:
58 | filename = path_to_tweets.split("/")[-1].split(".")[0]
59 | else:
60 | filename = path_to_tweets.split("\\")[-1].split(".")[0]
61 |
62 |
63 | # protocol=4 ensures compatibility with older Python versions
64 | nx.write_gpickle(graph, "network_tweets.pickle", protocol=4)
65 |
--------------------------------------------------------------------------------
/req.txt:
--------------------------------------------------------------------------------
1 | absl-py==0.7.1
2 | apturl==0.5.2
3 | asn1crypto==0.24.0
4 | astor==0.8.0
5 | attrs==17.4.0
6 | beautifulsoup4==4.11.1
7 | blinker==1.4
8 | bounded-pool-executor==0.0.3
9 | Brlapi==0.6.6
10 | certifi==2018.1.18
11 | chardet==3.0.4
12 | click==6.7
13 | cloudpickle==1.6.0
14 | colorama==0.3.7
15 | command-not-found==0.3
16 | cryptography==2.1.4
17 | cupshelpers==1.0
18 | cycler==0.10.0
19 | dask==1.1.1
20 | dataclasses==0.8
21 | decorator==4.3.2
22 | defer==1.0.6
23 | distro-info===0.18ubuntu0.18.04.1
24 | filelock==3.4.1
25 | flake8==3.5.0
26 | Flask==0.12.2
27 | future==0.18.2
28 | gast==0.2.2
29 | gdown==4.5.3
30 | graphviz==0.14.2
31 | grpcio==1.21.1
32 | gym==0.18.0
33 | h5py==2.9.0
34 | httplib2==0.9.2
35 | huggingface-hub==0.4.0
36 | idna==2.6
37 | imageio==2.9.0
38 | importlib-metadata==4.8.3
39 | iopath==0.1.10
40 | iotop==0.6
41 | itsdangerous==0.24
42 | Jinja2==2.10
43 | joblib==0.11
44 | Keras-Applications==1.0.8
45 | Keras-Preprocessing==1.1.0
46 | keyring==10.6.0
47 | keyrings.alt==3.0
48 | kiwisolver==1.3.1
49 | kornia==0.6.1
50 | language-selector==0.1
51 | launchpadlib==1.10.6
52 | lazr.restfulclient==0.13.5
53 | lazr.uri==1.0.3
54 | louis==3.5.0
55 | lpips==0.1.3
56 | macaroonbakery==1.1.3
57 | Mako==1.0.7
58 | Markdown==3.1.1
59 | MarkupSafe==1.0
60 | matplotlib==3.0.2
61 | mccabe==0.6.1
62 | mock==3.0.5
63 | netifaces==0.10.4
64 | networkx==2.2
65 | nltk==3.6.7
66 | nose==1.3.7
67 | numpy==1.19.5
68 | oauth==1.0.1
69 | olefile==0.45.1
70 | packaging==21.2
71 | pandas==1.1.5
72 | patool==1.12
73 | pexpect==4.2.1
74 | Pillow==8.3.1
75 | piq==0.5.5
76 | pluggy==0.6.0
77 | portalocker==2.5.1
78 | protobuf==3.8.0
79 | py==1.5.2
80 | pycairo==1.16.2
81 | pycodestyle==2.3.1
82 | pycrypto==2.6.1
83 | pycups==1.9.73
84 | pyDeprecate==0.3.2
85 | pyflakes==1.6.0
86 | pyglet==1.5.0
87 | PyGObject==3.26.1
88 | pyinotify==0.9.6
89 | pymacaroons==0.13.0
90 | PyNaCl==1.1.2
91 | pyOpenSSL==17.5.0
92 | pyparsing==2.4.7
93 | pyRFC3339==1.0
94 | PySocks==1.7.1
95 | pytest==3.3.2
96 | python-apt==1.6.5+ubuntu0.7
97 | python-dateutil==2.8.2
98 | python-debian==0.1.32
99 | pytorch-ignite==0.2.0
100 | pytorch-msssim==0.2.1
101 | pytz==2018.3
102 | PyWavelets==1.0.1
103 | pyxdg==0.25
104 | PyYAML==6.0
105 | rarfile==4.0
106 | regex==2022.10.31
107 | reportlab==3.4.0
108 | requests==2.18.4
109 | requests-unixsocket==0.1.5
110 | sacremoses==0.0.53
111 | scikit-image==0.14.2
112 | scikit-learn==0.19.1
113 | scipy==1.5.4
114 | screen-resolution-extra==0.0.0
115 | SecretStorage==2.3.1
116 | sentence-transformers==2.2.2
117 | sentencepiece==0.1.97
118 | simplejson==3.13.2
119 | six==1.16.0
120 | soupsieve==2.3.2.post1
121 | ssh-import-id==5.7
122 | system-service==0.3
123 | systemd-python==234
124 | tb-nightly==1.15.0a20190624
125 | tensorboard==1.13.1
126 | tensorboardX==1.5
127 | tensorflow==1.13.1
128 | tensorflow-cpu==0.0.0
129 | tensorflow-estimator==1.13.0
130 | termcolor==1.1.0
131 | tokenizers==0.12.1
132 | toolz==0.9.0
133 | torch==1.9.0
134 | torchaudio==0.7.2
135 | torchmetrics==0.8.2
136 | torchvision==0.10.0
137 | tqdm==4.29.1
138 | transformers==4.18.0
139 | typing-extensions==3.10.0.0
140 | ubuntu-advantage-tools==27.6
141 | ubuntu-drivers-common==0.0.0
142 | ufw==0.36
143 | unattended-upgrades==0.1
144 | urllib3==1.22
145 | usb-creator==0.3.3
146 | virtualenv==15.1.0
147 | wadllib==1.3.2
148 | Werkzeug==0.15.4
149 | xkit==0.0.0
150 | zipp==3.6.0
151 | zope.interface==4.3.2
152 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # A Graph Neural Network Framework For Post Engagement Prediction in Online Social Media
2 | Welcome to the "A Graph Neural Network Framework For Post Engagement Prediction in Online Social Media" repository!
3 |
4 | This repository contains two works focused on predicting post engagement in online social media using graph neural networks.
5 | The goal of this framework is to provide an effective solution for understanding and predicting the engagement of posts in online social media platforms, using the relationships between posts, users, and other network structures. The works presented here showcase the potential of graph neural networks in this field and provide a solid foundation for future research. We hope this repository serves as a valuable resource for the machine learning community.
6 |
7 | ## Predicting Tweet Engagement with Graph Neural Networks [arXiv](http://arxiv.org/abs/2305.10103) [ACM DL](https://dl.acm.org/doi/10.1145/3591106.3592294) [Poster](./imgs/Poster_ICMR23.pdf)
8 | Official implementation of the paper: "Predicting Tweet Engagement with Graph Neural Networks"
9 |
10 | Published in the **ACM International Conference on Multimedia Retrieval 2023 (ICMR2023)**
11 |
12 | In this paper, we present **TweetGage**, a Graph Neural Network solution that predicts user engagement based on a novel graph-based model of the relationships among posts.
13 |
14 |
15 | [Marco Arazzi](https://scholar.google.com/citations?user=8dD5SUkAAAAJ&hl=it&oi=ao),
16 | [Marco Cotogni](https://scholar.google.com/citations?user=8PUz5lAAAAAJ&hl=it),
17 | [Antonino Nocera](https://scholar.google.com/citations?user=YF10PJwAAAAJ&hl=it) and
18 | [Luca Virgili](https://scholar.google.com/citations?hl=it&user=2D771YsAAAAJ)
19 |
20 |
21 |
22 |
23 | ### Requirements
24 | To replicate our results, create an environment via Anaconda and install the required packages with pip:
25 | ```
26 | conda create -n TweetGage python=3.9
27 | conda activate TweetGage
28 | pip install -r req.txt
29 | ```
30 | ### Dataset
31 | For our experiments, we considered one week of Twitter data, from [November 1st to November 7th, 2021](https://archive.org/details/archiveteam-twitter-stream-2021-11), obtained through the Twitter API.
32 |
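After preprocessing, the code expects a CSV with one row per tweet. Below is a minimal sketch of loading it with the column names used throughout the code (inferred from `utils.py` and `CreateNetwork.py`; the exact schema of your own export may differ):
```
import pandas as pd
import ast

df = pd.read_csv("first_week.csv", lineterminator="\n")
# Columns referenced by the code include: id, time, text, hashtag, screen_name,
# engagement, retweet, favorite, user_followers, user_ntweet
df["hashtag"] = df["hashtag"].apply(ast.literal_eval)
df["time"] = pd.to_datetime(df["time"])
print(df.columns.tolist())
```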
33 |
34 |
35 |
36 |
37 | ### Graph Creation
38 |
39 | Once the tweets have been downloaded, the graph can be built and saved as a .pickle file with:
40 | ```
41 | python3 CreateNetwork.py
42 | ```
43 |
44 | The script saves the resulting graph as 'network_tweets.pickle'.
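To verify the output, the pickle can be loaded back and inspected (a minimal sketch; `read_gpickle` is available in networkx 2.x, as pinned in req.txt):
```
import networkx as nx

g = nx.read_gpickle("network_tweets.pickle")
print(len(g.nodes), "nodes,", len(g.edges), "edges")
```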
45 |
46 | ### Running the Code
47 |
48 | Once the graph has been created, you can replicate the results of our paper by executing the following command in your terminal:
49 | ```
50 | python3 main.py --LOAD_CSV --EXTRACT_BERT --USE_PCA --USER_FEAT --BERT_FEAT --Model_Type 'GCN'
51 | ```
52 | #### Arguments Explanation
53 |
54 | The following arguments can be passed to the main.py script:
55 |
56 | - LOAD_CSV: If you have already computed the features in a CSV file, you can load it with this argument. In our code, we load the file "first_week_posts_bert.csv", which contains post features and BERT-extracted text embeddings.
57 | - EXTRACT_BERT: Computes the text embeddings of the posts using BERT (valid only if LOAD_CSV is not provided).
58 | - USE_PCA: If set, applies Principal Component Analysis, keeping 48 components that cover more than 80% of the variance of the text features.
59 | - USER_FEAT: If set, includes post features in the final feature set.
60 | - BERT_FEAT: If set, includes text (BERT) features in the final feature set.
61 | - Model_Type: Can be one of the following: 'GCN', 'MLP', 'Conv1D', 'GAT', 'XGBOOST'. Default value is 'GCN'.
62 |
63 | Note: the flags are disabled by default, so any argument that is omitted stays False. Example invocations are shown below.
64 |
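For instance, to train the MLP on post features only, or XGBoost on the full feature set with PCA (illustrative flag combinations, not necessarily the paper's exact configurations):
```
python3 main.py --LOAD_CSV --USER_FEAT --Model_Type 'MLP'
python3 main.py --LOAD_CSV --USER_FEAT --BERT_FEAT --USE_PCA --Model_Type 'XGBOOST'
```
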
65 | ### Results
66 |
67 |
68 |
69 |
70 |
71 |
72 | #### References
73 | If this repo is useful to your research or you want to cite our paper, please use:
74 | ```
75 | @inproceedings{
76 | 10.1145/3591106.3592294,
77 | author = {Arazzi, Marco and Cotogni, Marco and Nocera, Antonino and Virgili, Luca},
78 | title = {Predicting Tweet Engagement with Graph Neural Networks},
79 | year = {2023},
80 | booktitle = {Proceedings of the 2023 ACM International Conference on Multimedia Retrieval},
81 | pages = {172–180},
82 | numpages = {9},
83 | location = {Thessaloniki, Greece},
84 | series = {ICMR '23}
85 | }
86 | ```
87 |
88 | ## Available Soon...
89 |
--------------------------------------------------------------------------------
/models/GAT.py:
--------------------------------------------------------------------------------
1 | from tensorflow import keras
2 | from tensorflow.keras import layers
3 | import tensorflow as tf
4 |
5 |
6 | class GraphAttention(layers.Layer):
7 | def __init__(
8 | self,
9 | units,
10 | kernel_initializer="glorot_uniform",
11 | kernel_regularizer=None,
12 | **kwargs,
13 | ):
14 | super().__init__(**kwargs)
15 | self.units = units
16 | self.kernel_initializer = keras.initializers.get(kernel_initializer)
17 | self.kernel_regularizer = keras.regularizers.get(kernel_regularizer)
18 |
19 | def build(self, input_shape):
20 | self.kernel = self.add_weight(
21 | shape=(input_shape[0][-1], self.units),
22 | trainable=True,
23 | initializer=self.kernel_initializer,
24 | regularizer=self.kernel_regularizer,
25 | name="kernel",
26 | )
27 | self.kernel_attention = self.add_weight(
28 | shape=(self.units * 2, 1),
29 | trainable=True,
30 | initializer=self.kernel_initializer,
31 | regularizer=self.kernel_regularizer,
32 | name="kernel_attention",
33 | )
34 | self.built = True
35 |
36 | def call(self, inputs):
37 | node_states, edges = inputs
38 |
39 | # Linearly transform node states
40 | node_states_transformed = tf.matmul(node_states, self.kernel)
41 |
42 | # (1) Compute pair-wise attention scores
43 | node_states_expanded = tf.gather(node_states_transformed, edges)
44 | node_states_expanded = tf.reshape(
45 | node_states_expanded, (tf.shape(edges)[0], -1)
46 | )
47 | attention_scores = tf.nn.leaky_relu(
48 | tf.matmul(node_states_expanded, self.kernel_attention)
49 | )
50 | attention_scores = tf.squeeze(attention_scores, -1)
51 |
52 | # (2) Normalize attention scores
53 | attention_scores = tf.math.exp(tf.clip_by_value(attention_scores, -2, 2))
54 | attention_scores_sum = tf.math.unsorted_segment_sum(
55 | data=attention_scores,
56 | segment_ids=edges[:, 0],
57 | num_segments=tf.reduce_max(edges[:, 0]) + 1,
58 | )
59 | attention_scores_sum = tf.repeat(
60 | attention_scores_sum, tf.math.bincount(tf.cast(edges[:, 0], "int32"))
61 | )
62 | attention_scores_norm = attention_scores / attention_scores_sum
63 |
64 | # (3) Gather node states of neighbors, apply attention scores and aggregate
65 | node_states_neighbors = tf.gather(node_states_transformed, edges[:, 1])
66 | out = tf.math.unsorted_segment_sum(
67 | data=node_states_neighbors * attention_scores_norm[:, tf.newaxis],
68 | segment_ids=edges[:, 0],
69 | num_segments=tf.shape(node_states)[0],
70 | )
71 | return out
72 |
73 |
74 | class MultiHeadGraphAttention(layers.Layer):
75 | def __init__(self, units, num_heads=8, merge_type="concat", **kwargs):
76 | super().__init__(**kwargs)
77 | self.num_heads = num_heads
78 | self.merge_type = merge_type
79 | self.attention_layers = [GraphAttention(units) for _ in range(num_heads)]
80 |
81 | def call(self, inputs):
82 | atom_features, pair_indices = inputs
83 |
84 | # Obtain outputs from each attention head
85 | outputs = [
86 | attention_layer([atom_features, pair_indices])
87 | for attention_layer in self.attention_layers
88 | ]
89 | # Concatenate or average the node states from each head
90 | if self.merge_type == "concat":
91 | outputs = tf.concat(outputs, axis=-1)
92 | else:
93 | outputs = tf.reduce_mean(tf.stack(outputs, axis=-1), axis=-1)
94 | # Activate and return node states
95 | return tf.nn.relu(outputs)
96 |
97 |
98 | class GraphAttentionNetwork(keras.Model):
99 | def __init__(
100 | self,
101 | node_states,
102 | edges,
103 | hidden_units,
104 | num_heads,
105 | num_layers,
106 | output_dim,
107 | **kwargs,
108 | ):
109 | super().__init__(**kwargs)
110 | self.node_states = node_states
111 | self.edges = edges
112 | self.preprocess = layers.Dense(hidden_units * num_heads, activation="relu")
113 | self.attention_layers = [
114 | MultiHeadGraphAttention(hidden_units, num_heads) for _ in range(num_layers)
115 | ]
116 | self.output_layer = layers.Dense(output_dim)
117 |
118 | def call(self, inputs):
119 | node_states, edges = inputs
120 | x = self.preprocess(node_states)
121 | for attention_layer in self.attention_layers:
122 | x = attention_layer([x, edges]) + x
123 | outputs = self.output_layer(x)
124 | return outputs
125 |
126 | def train_step(self, data):
127 | indices, labels = data
128 |
129 | with tf.GradientTape() as tape:
130 | # Forward pass
131 | outputs = self([self.node_states, self.edges])
132 | # Compute loss
133 | loss = self.compiled_loss(labels, tf.gather(outputs, indices))
134 | # Compute gradients
135 | grads = tape.gradient(loss, self.trainable_weights)
136 | # Apply gradients (update weights)
137 | self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
138 | # Update metric(s)
139 | self.compiled_metrics.update_state(labels, tf.gather(outputs, indices))
140 |
141 | return {m.name: m.result() for m in self.metrics}
142 |
143 | def predict_step(self, data):
144 | indices = data
145 | # Forward pass
146 | outputs = self([self.node_states, self.edges])
147 | # Compute probabilities
148 | return tf.nn.softmax(tf.gather(outputs, indices))
149 |
150 | def test_step(self, data):
151 | indices, labels = data
152 | # Forward pass
153 | outputs = self([self.node_states, self.edges])
154 | # Compute loss
155 | loss = self.compiled_loss(labels, tf.gather(outputs, indices))
156 | # Update metric(s)
157 | self.compiled_metrics.update_state(labels, tf.gather(outputs, indices))
158 |
159 | return {m.name: m.result() for m in self.metrics}
160 |
161 |
162 | def create_GAT(node_states, edges, hidden_units, num_heads, num_layers, num_classes):
163 | gat_model = GraphAttentionNetwork(
164 | node_states, edges, hidden_units, num_heads, num_layers, num_classes  # edges are already transposed to (num_edges, 2) by the caller
165 | )
166 | return gat_model
167 |
--------------------------------------------------------------------------------
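
A minimal smoke-test sketch on a tiny synthetic graph (hypothetical node features and edges); edges are passed with shape (num_edges, 2), matching how main.py calls create_GAT:
```
import tensorflow as tf
from models.GAT import create_GAT

# Tiny hypothetical graph: 5 nodes with 8 features, 6 directed (source, target) edges sorted by source
node_states = tf.random.normal((5, 8))
edges = tf.constant([[0, 1], [0, 2], [1, 2], [2, 0], [3, 4], [4, 3]], dtype="int32")
model = create_GAT(node_states, edges, hidden_units=16, num_heads=2, num_layers=1, num_classes=2)
logits = model([node_states, edges])
print(logits.shape)  # (5, 2)
```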
/main.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from sentence_transformers import SentenceTransformer
3 | import gc
4 | from sklearn.decomposition import PCA
5 | from sklearn.model_selection import train_test_split
6 | from Training import run_experiment, run_experiment_XGB
7 | from Evaluation import evaluate, evaluate_XGB
8 | from utils import normalize, eng_class, sampling_k_elements, extract_graph
9 | import numpy as np
10 | import networkx as nx
11 | from tensorflow import keras
12 | from keras.utils import to_categorical
13 | import random
14 | from models.Xgboost import create_XGB
15 | from models.Conv1D import create_Conv1D
16 | from models.GAT import create_GAT
17 | from models.GCN import create_GCN
18 | from models.MLP import create_MLP
19 | import argparse
20 | import os
21 | import tensorflow as tf
22 | def parse_args():
23 | parser = argparse.ArgumentParser("TweetGage Params")
24 | a = parser.add_argument
25 | a('--LOAD_CSV', action='store_true')
26 | a('--EXTRACT_BERT', action='store_true')
27 | a('--USE_PCA', action='store_true')
28 | a('--USER_FEAT', action='store_true')
29 | a('--BERT_FEAT', action='store_true')
30 | a('--Model_Type', default='GCN', type=str)
31 | return parser.parse_args()
32 |
33 |
34 | def reset_random_seeds():
35 | os.environ['PYTHONHASHSEED'] = str(2)
36 | tf.random.set_seed(2)
37 | np.random.seed(2)
38 | random.seed(2)
39 |
40 |
41 | def select_params(Model_type, X_train, y_train, X_test, y_test, df, g, num_classes=2, num_epochs=300):
42 | num_classes = num_classes
43 | num_epochs = num_epochs
44 | dropout_rate = None
45 | num_layers = None
46 | num_heads = None
47 | if Model_type == 'GCN':
48 | hidden_units = [16]
49 | dropout_rate = 0.3
50 | learning_rate = 0.1
51 | batch_size = 256
52 | input = np.array(X_train.index)
53 | target = to_categorical(y_train)
54 | loss = keras.losses.CategoricalCrossentropy
55 | optimizer = keras.optimizers.Adam
56 | input_test = np.array(X_test.index)
57 | target_test = y_test
58 | graph_info = extract_graph(g, df)
59 | model = create_GCN(graph_info, num_classes, hidden_units, dropout_rate)
60 | if Model_type == 'MLP':
61 | hidden_units = [32, 32]
62 | learning_rate = 0.01
63 | dropout_rate = 0.5
64 | batch_size = 256
65 | loss = keras.losses.CategoricalCrossentropy
66 | input = X_train
67 | target = to_categorical(y_train)
68 | input_test = X_test
69 | target_test = y_test
70 | optimizer = keras.optimizers.Adam
71 | model = create_MLP(X_train.shape[1], hidden_units, num_classes, dropout_rate)
72 | if Model_type == 'Conv1D':
73 | hidden_units = 64
74 | learning_rate = 0.1
75 | batch_size = 256
76 | model = create_Conv1D(num_classes, hidden_units, X_train.shape[1])
77 | input = X_train.values.reshape(-1, X_train.shape[1], 1)
78 | loss = keras.losses.CategoricalCrossentropy
79 | target = to_categorical(y_train)
80 | optimizer = keras.optimizers.Adam
81 | input_test = X_test.values.reshape(-1, X_test.shape[1], 1)
82 | target_test = y_test
83 | if Model_type == 'GAT':
84 | hidden_units = 100
85 | num_heads = 2
86 | num_layers = 1
87 | batch_size = 64
88 | learning_rate = 1e-2
89 | graph_info = extract_graph(g, df)
90 | input = np.array(X_train.index)
91 | target = to_categorical(y_train)
92 | model = create_GAT(graph_info[0], graph_info[1].T, hidden_units, num_heads, num_layers, num_classes)
93 | loss = keras.losses.CategoricalCrossentropy
94 | optimizer = keras.optimizers.SGD
95 | input_test = np.array(X_test.index)
96 | target_test = y_test
97 | if Model_type == 'XGBOOST':
98 | max_depth = 8
99 | learning_rate = 0.025
100 | subsample = 0.85
101 | colsample_bytree = 0.35
102 | eval_metric = 'logloss'
103 | objective = 'binary:logistic'
104 | tree_method = 'gpu_hist'
105 | seed = 1
106 | model = create_XGB(max_depth, learning_rate, subsample,
107 | colsample_bytree, eval_metric, objective,
108 | tree_method, seed)
109 | return model
110 | return hidden_units, num_classes, learning_rate, num_epochs, dropout_rate, batch_size, num_layers, num_heads, input, target, loss, optimizer, input_test, target_test, model
111 |
112 |
113 | def main(LOAD_CSV=False, EXTRACT_BERT=True, USE_PCA=False, USER_FEAT=True, BERT_FEAT=True, Model_Type='GCN'):
114 | reset_random_seeds()
115 | g = nx.read_gpickle('./network_tweets.pickle')
116 | print("POST:", len(g.nodes))
117 | print("ARCS:", len(g.edges))
118 | print("COMPONENTS:", nx.number_connected_components(g))
119 | if not LOAD_CSV:
120 | df = pd.read_csv("./first_week.csv", lineterminator="\n")
121 | df["class"] = df["engagement"].apply(lambda x: eng_class(x))
122 | df = df.groupby('class').apply(sampling_k_elements).reset_index(drop=True)
123 | if EXTRACT_BERT:
124 | model = SentenceTransformer('efederici/sentence-bert-base')
125 | emb = model.encode(df["text"])
126 | if USE_PCA:
127 | pca = PCA(n_components=48)
128 | pca.fit(emb)
129 | emb = pca.transform(emb)
130 | df = pd.concat([df, pd.DataFrame(emb)], axis=1)
131 | del emb, model
132 | gc.collect()
133 | df = normalize(df)
134 | else:
135 | df = pd.read_csv("./first_week_posts_bert.csv")
136 | if USER_FEAT and not BERT_FEAT:
137 | df = df.iloc[:, 0:11]
138 | if not USER_FEAT and BERT_FEAT:
139 | df = df.iloc[:, 10:]
140 | if USE_PCA:
141 | pca = PCA(n_components=48)
142 | print('PCA 48 Components')
143 | pca.fit(df.drop(["class"], axis=1))
144 | emb = pca.transform(df.drop(["class"], axis=1))
145 | df = pd.concat([pd.DataFrame(emb), df[["class"]]], axis=1)
146 |
147 | X_train, X_test, y_train, y_test = train_test_split(df.drop(["class"], axis=1), df["class"], test_size=0.2,
148 | random_state=42, stratify=df["class"])
149 | if not Model_Type == 'XGBOOST':
150 | hidden_units, num_classes, learning_rate, num_epochs, dropout_rate, batch_size, num_layers, \
151 | num_heads, input, target, loss, optimizer, input_test, target_test, model = select_params(Model_Type, X_train,
152 | y_train, X_test,
153 | y_test,
154 | df,
155 | g,
156 | num_epochs=300)
157 | run_experiment(model, input, target, learning_rate, loss, num_epochs, batch_size, optimizer)
158 | evaluate(model, input_test, target_test)
159 | else:
160 | model = select_params(Model_Type, X_train, y_train, X_test, y_test, df, g,
161 | num_epochs=300)
162 | obj = run_experiment_XGB(model, X_train, y_train)
163 | evaluate_XGB(obj, X_test, y_test)
164 |
165 |
166 | if __name__ == '__main__':
167 | args = vars(parse_args())
168 | main(**args)
169 |
--------------------------------------------------------------------------------
/models/GCN.py:
--------------------------------------------------------------------------------
1 | from tensorflow.keras import layers
2 | from utils import create_ffn
3 | import tensorflow as tf
4 |
5 |
6 | class GraphConvLayer(layers.Layer):
7 | def __init__(
8 | self,
9 | hidden_units,
10 | dropout_rate=0.2,
11 | aggregation_type="mean",
12 | combination_type="concat",
13 | normalize=False,
14 | *args,
15 | **kwargs,
16 | ):
17 | super(GraphConvLayer, self).__init__(*args, **kwargs)
18 |
19 | self.aggregation_type = aggregation_type
20 | self.combination_type = combination_type
21 | self.normalize = normalize
22 |
23 | self.ffn_prepare = create_ffn(hidden_units, dropout_rate)
24 | if self.combination_type == "gru":
25 | self.update_fn = layers.GRU(
26 | units=hidden_units,
27 | activation="tanh",
28 | recurrent_activation="sigmoid",
29 | dropout=dropout_rate,
30 | return_state=True,
31 | recurrent_dropout=dropout_rate,
32 | )
33 | else:
34 | self.update_fn = create_ffn(hidden_units, dropout_rate)
35 |
36 | def prepare(self, node_repesentations, weights=None):
37 | # node_repesentations shape is [num_edges, embedding_dim].
38 | messages = self.ffn_prepare(node_repesentations)
39 | if weights is not None:
40 | messages = messages * tf.expand_dims(weights, -1)
41 | return messages
42 |
43 | def aggregate(self, node_indices, neighbour_messages, node_repesentations):
44 | # node_indices shape is [num_edges].
45 | # neighbour_messages shape: [num_edges, representation_dim].
46 | # node_repesentations shape is [num_nodes, representation_dim].
47 | num_nodes = node_repesentations.shape[0]
48 | if self.aggregation_type == "sum":
49 | aggregated_message = tf.math.unsorted_segment_sum(
50 | neighbour_messages, node_indices, num_segments=num_nodes
51 | )
52 | elif self.aggregation_type == "mean":
53 | aggregated_message = tf.math.unsorted_segment_mean(
54 | neighbour_messages, node_indices, num_segments=num_nodes
55 | )
56 | elif self.aggregation_type == "max":
57 | aggregated_message = tf.math.unsorted_segment_max(
58 | neighbour_messages, node_indices, num_segments=num_nodes
59 | )
60 | else:
61 | raise ValueError(f"Invalid aggregation type: {self.aggregation_type}.")
62 |
63 | return aggregated_message
64 |
65 | def update(self, node_repesentations, aggregated_messages):
66 | # node_repesentations shape is [num_nodes, representation_dim].
67 | # aggregated_messages shape is [num_nodes, representation_dim].
68 | if self.combination_type == "gru":
69 | # Create a sequence of two elements for the GRU layer.
70 | h = tf.stack([node_repesentations, aggregated_messages], axis=1)
71 | elif self.combination_type == "concat":
72 | # Concatenate the node_repesentations and aggregated_messages.
73 | h = tf.concat([node_repesentations, aggregated_messages], axis=1)
74 | elif self.combination_type == "add":
75 | # Add node_repesentations and aggregated_messages.
76 | h = node_repesentations + aggregated_messages
77 | else:
78 | raise ValueError(f"Invalid combination type: {self.combination_type}.")
79 |
80 | # Apply the processing function.
81 | node_embeddings = self.update_fn(h)
82 | if self.combination_type == "gru":
83 | node_embeddings = tf.unstack(node_embeddings, axis=1)[-1]
84 |
85 | if self.normalize:
86 | node_embeddings = tf.nn.l2_normalize(node_embeddings, axis=-1)
87 | return node_embeddings
88 |
89 | def call(self, inputs):
90 | """Process the inputs to produce the node_embeddings.
91 |
92 | inputs: a tuple of three elements: node_repesentations, edges, edge_weights.
93 | Returns: node_embeddings of shape [num_nodes, representation_dim].
94 | """
95 |
96 | node_repesentations, edges, edge_weights = inputs
97 | # Get node_indices (source) and neighbour_indices (target) from edges.
98 | node_indices, neighbour_indices = edges[0], edges[1]
99 | # neighbour_repesentations shape is [num_edges, representation_dim].
100 | neighbour_repesentations = tf.gather(node_repesentations, neighbour_indices)
101 |
102 | # Prepare the messages of the neighbours.
103 | neighbour_messages = self.prepare(neighbour_repesentations, edge_weights)
104 | # Aggregate the neighbour messages.
105 | aggregated_messages = self.aggregate(
106 | node_indices, neighbour_messages, node_repesentations
107 | )
108 | # Update the node embedding with the neighbour messages.
109 | return self.update(node_repesentations, aggregated_messages)
110 |
111 |
112 | class GNNNodeRegression(tf.keras.Model):
113 | def __init__(
114 | self,
115 | graph_info,
116 | hidden_units,
117 | num_classes,
118 | aggregation_type="sum",
119 | combination_type="concat",
120 | dropout_rate=0.2,
121 | normalize=True,
122 | *args,
123 | **kwargs,
124 | ):
125 | super(GNNNodeRegression, self).__init__(*args, **kwargs)
126 |
127 | # Unpack graph_info to three elements: node_features, edges, and edge_weight.
128 | node_features, edges, edge_weights = graph_info
129 | self.node_features = node_features
130 | self.edges = edges
131 | self.edge_weights = edge_weights
132 | # Set edge_weights to ones if not provided.
133 | if self.edge_weights is None:
134 | self.edge_weights = tf.ones(shape=edges.shape[1])
135 | # Scale edge_weights to sum to 1.
136 | self.edge_weights = self.edge_weights / tf.math.reduce_sum(self.edge_weights)
137 |
138 | # Create a process layer.
139 | self.preprocess = create_ffn(hidden_units, dropout_rate, name="preprocess")
140 | # Create the first GraphConv layer.
141 | self.conv1 = GraphConvLayer(
142 | hidden_units,
143 | dropout_rate,
144 | aggregation_type,
145 | combination_type,
146 | normalize,
147 | name="graph_conv1",
148 | )
149 | # Create the second GraphConv layer.
150 | self.conv2 = GraphConvLayer(
151 | hidden_units,
152 | dropout_rate,
153 | aggregation_type,
154 | combination_type,
155 | normalize,
156 | name="graph_conv2",
157 | )
158 | # Create a postprocess layer.
159 | self.postprocess = create_ffn(hidden_units, dropout_rate, name="postprocess")
160 | # Create a compute logits layer.
161 | self.compute_logits = layers.Dense(units=num_classes, name="logits")
162 |
163 | def call(self, input_node_indices):
164 | # Preprocess the node_features to produce node representations.
165 | x = self.preprocess(self.node_features)
166 | # Apply the first graph conv layer.
167 | x = self.conv1((x, self.edges, self.edge_weights))
168 | # Skip connection.
169 | # x = x1 + x
170 | # Apply the second graph conv layer.
171 | x = self.conv2((x, self.edges, self.edge_weights))
172 | # Skip connection.
173 | # x = x2 + x
174 | # Postprocess node embedding.
175 | x = self.postprocess(x)
176 | # Fetch node embeddings for the input node_indices.
177 | node_embeddings = tf.gather(x, input_node_indices)
178 | # Compute logits
179 | return self.compute_logits(node_embeddings)
180 |
181 |
182 | def create_GCN(graph_info, num_classes, hidden_units, dropout_rate):
183 | # sum aggregation + concat combination: never change this, it gives the best results
184 | gnn_model = GNNNodeRegression(
185 | num_classes=num_classes,
186 | aggregation_type="sum",
187 | combination_type="concat",
188 | graph_info=graph_info,
189 | hidden_units=hidden_units,
190 | dropout_rate=dropout_rate,
191 | name="gnn_model",
192 | )
193 | return gnn_model
194 |
--------------------------------------------------------------------------------
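
A minimal smoke-test sketch with a synthetic graph_info tuple (hypothetical features, edges, and weights), mirroring the structure produced by extract_graph in utils.py:
```
import numpy as np
import tensorflow as tf
from models.GCN import create_GCN

# Tiny hypothetical graph_info: 5 nodes with 8 features, edges as a (2, num_edges) array [sources; targets], unit weights
node_features = tf.random.normal((5, 8))
edges = np.array([[0, 1, 2, 3], [1, 2, 3, 4]])
edge_weights = tf.ones((4,))
model = create_GCN((node_features, edges, edge_weights), num_classes=2, hidden_units=[16], dropout_rate=0.3)
logits = model(np.array([0, 1, 2]))  # logits for nodes 0, 1 and 2
print(logits.shape)  # (3, 2)
```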