├── imgs
│   ├── gr1.png
│   ├── gr2.png
│   ├── res1.png
│   ├── res2.png
│   ├── teaser.png
│   └── Poster_ICMR23.pdf
├── models
│   ├── Conv1D.py
│   ├── MLP.py
│   ├── Xgboost.py
│   ├── GAT.py
│   └── GCN.py
├── Evaluation.py
├── Training.py
├── utils.py
├── CreateNetwork.py
├── req.txt
├── README.md
└── main.py
/imgs/gr1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OcraM17/EngagementGNN/HEAD/imgs/gr1.png
--------------------------------------------------------------------------------
/imgs/gr2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OcraM17/EngagementGNN/HEAD/imgs/gr2.png
--------------------------------------------------------------------------------
/imgs/res1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OcraM17/EngagementGNN/HEAD/imgs/res1.png
--------------------------------------------------------------------------------
/imgs/res2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OcraM17/EngagementGNN/HEAD/imgs/res2.png
--------------------------------------------------------------------------------
/imgs/teaser.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OcraM17/EngagementGNN/HEAD/imgs/teaser.png
--------------------------------------------------------------------------------
/imgs/Poster_ICMR23.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OcraM17/EngagementGNN/HEAD/imgs/Poster_ICMR23.pdf
--------------------------------------------------------------------------------
/models/Conv1D.py:
--------------------------------------------------------------------------------
1 | from keras.layers import Dense, Conv1D, Flatten, MaxPooling1D
2 | from keras.models import Sequential
3 |
4 |
5 | def create_Conv1D(num_classes, hidden, input_dim):
6 | model = Sequential()
7 | model.add(Conv1D(hidden, 1, activation="tanh", input_shape=(input_dim, 1)))
8 | # model.add(Conv1D(128, 1, strides=2, activation="relu"))
9 | # model.add(Conv1D(256, 1, strides=2, activation="relu"))
10 | # model.add(Conv1D(512, 1, strides=2, activation="relu"))
11 | # model.add(Conv1D(1024, 1, strides=2, activation="relu"))
12 | model.add(MaxPooling1D())
13 | model.add(Flatten())
14 | model.add(Dense(num_classes, name='logits'))
15 | return model
16 |
--------------------------------------------------------------------------------
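
A minimal usage sketch, mirroring the Conv1D configuration in main.py (the feature dimension 58 is a hypothetical placeholder for X_train.shape[1]):
```
from tensorflow import keras
from models.Conv1D import create_Conv1D

# Hypothetical feature dimension of 58; main.py passes X_train.shape[1]
model = create_Conv1D(2, 64, 58)
model.compile(optimizer=keras.optimizers.Adam(0.1),
              loss=keras.losses.CategoricalCrossentropy(from_logits=True),
              metrics=["accuracy"])
model.summary()
```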
/models/MLP.py:
--------------------------------------------------------------------------------
1 | from tensorflow import keras
2 | from tensorflow.keras import layers
3 | from utils import create_ffn
4 |
5 |
6 | def MLP(input_shape, hidden_units, num_classes, dropout_rate=0.2):
7 | inputs = layers.Input(shape=(input_shape,), name="input_features")
8 | x = create_ffn(hidden_units, dropout_rate, name="ffn_block1")(inputs)
9 | for block_idx in range(4):
10 | # Create an FFN block.
11 | x1 = create_ffn(hidden_units, dropout_rate, name=f"ffn_block{block_idx + 2}")(x)
12 | # Add skip connection.
13 | x = layers.Add(name=f"skip_connection{block_idx + 2}")([x, x1])
14 | # Compute logits.
15 | logits = layers.Dense(num_classes, name="logits")(x)
16 | # Create the model.
17 | return keras.Model(inputs=inputs, outputs=logits, name="baseline")
18 |
19 |
20 | def create_MLP(input_shape, hidden_units, num_classes, dropout_rate):
21 | model = MLP(input_shape, hidden_units, num_classes, dropout_rate)
22 | return model
23 |
--------------------------------------------------------------------------------
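
A minimal usage sketch, mirroring the MLP configuration in main.py (the input dimension 58 is a hypothetical placeholder):
```
from tensorflow import keras
from models.MLP import create_MLP

# Two hidden blocks of 32 units with dropout 0.5, as configured in main.py
model = create_MLP(input_shape=58, hidden_units=[32, 32], num_classes=2, dropout_rate=0.5)
model.compile(optimizer=keras.optimizers.Adam(0.01),
              loss=keras.losses.CategoricalCrossentropy(from_logits=True),
              metrics=["accuracy"])
model.summary()
```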
/Evaluation.py:
--------------------------------------------------------------------------------
1 | from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve, auc, roc_auc_score
2 | import numpy as np
3 | from scipy.special import softmax
4 | import xgboost as xgb
5 |
6 |
7 | def evaluate(model, X_test, y_test):
8 | logits = model.predict(X_test)
9 | probs = softmax(logits, axis=1)
10 | classes = np.argmax(probs, axis=1)
11 | print_metrics(classes, y_test, probs[:,1])
12 |
13 |
14 | def evaluate_XGB(obj, X_test, y_test):
15 | dtest = xgb.DMatrix(data=X_test)
16 | probs = obj.predict(dtest)
17 | classes = probs.copy()
18 | classes[classes > 0.5] = 1
19 | classes[classes <= 0.5] = 0
20 | print_metrics(classes, y_test, probs)
21 |
22 |
23 | def print_metrics(classes, y_test, probs):
24 | print(classification_report(y_test, classes, labels=[0, 1]))
25 | prec, recall, thr = precision_recall_curve(y_test, probs, pos_label=1)
26 | pr_auc = auc(recall, prec)
27 | print("PR-AUC:", pr_auc)
28 | roc_auc = roc_auc_score(y_test, probs)
29 | print("ROC-AUC:", roc_auc)
30 | print(confusion_matrix(y_test, classes))
31 |
--------------------------------------------------------------------------------
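
A quick sanity check of print_metrics on hypothetical toy values (predicted classes, ground-truth labels, and positive-class probabilities):
```
import numpy as np
from Evaluation import print_metrics

# Hypothetical toy data: six samples
y_true = np.array([0, 1, 1, 0, 1, 0])
y_pred = np.array([0, 1, 0, 0, 1, 1])
probs = np.array([0.1, 0.9, 0.4, 0.2, 0.8, 0.6])  # probability of the positive class
print_metrics(y_pred, y_true, probs)  # prints the report, PR-AUC, ROC-AUC and confusion matrix
```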
/Training.py:
--------------------------------------------------------------------------------
1 | from tensorflow import keras
2 | import xgboost as xgb
3 |
4 |
5 | def run_experiment(model, x_train, y_train, learning_rate, loss, num_epochs, batch_size, optimizer):
6 | # Compile the model.
7 | model.compile(
8 | optimizer=optimizer(learning_rate),
9 | loss=loss(from_logits=True),
10 | metrics=['accuracy'],
11 | )
12 | # Create an early stopping callback.
13 | early_stopping = keras.callbacks.EarlyStopping(
14 | monitor="val_loss", patience=5, restore_best_weights=True
15 | )
16 | reduce_lr = keras.callbacks.ReduceLROnPlateau(
17 | patience=2
18 | )
19 | # Fit the model.
20 | history = model.fit(
21 | x=x_train,
22 | y=y_train,
23 | epochs=num_epochs,
24 | batch_size=batch_size,
25 | validation_split=0.15,
26 | callbacks=[early_stopping, reduce_lr],
27 | )
28 | return history
29 |
30 |
31 | def run_experiment_XGB(model, x_train, y_train):
32 | dtrain = xgb.DMatrix(data=x_train, label=y_train)
33 |
34 | obj = xgb.train(model.__getparams__(),
35 | dtrain=dtrain,
36 | num_boost_round=500,
37 | )
38 | return obj
39 |
--------------------------------------------------------------------------------
/models/Xgboost.py:
--------------------------------------------------------------------------------
1 | def create_XGB(max_depth=8, learning_rate=0.025, subsample=0.85,
2 | colsample_bytree=0.35, eval_metric='logloss', objective='binary:logistic',
3 | tree_method='gpu_hist', seed=1):
4 | return xgBoost(max_depth=max_depth, learning_rate=learning_rate, subsample=subsample,
5 | colsample_bytree=colsample_bytree, eval_metric=eval_metric, objective=objective,
6 | tree_method=tree_method, seed=seed)
7 |
8 |
9 | class xgBoost():
10 | def __init__(self, max_depth=8, learning_rate=0.025, subsample=0.85,
11 | colsample_bytree=0.35, eval_metric='logloss', objective='binary:logistic',
12 | tree_method='gpu_hist', seed=1):
13 | self.max_depth = max_depth
14 | self.learning_rate = learning_rate
15 | self.subsample = subsample
16 | self.colsample_bytree = colsample_bytree
17 | self.eval_metric = eval_metric
18 | self.objective = objective
19 | self.tree_method = tree_method
20 | self.seed = seed
21 |
22 | def __getparams__(self):
23 | dict_ = {
24 | 'max_depth': self.max_depth,
25 | 'learning_rate': self.learning_rate,
26 | 'subsample': self.subsample,
27 | 'colsample_bytree': self.colsample_bytree,
28 | 'eval_metric': self.eval_metric,
29 | 'objective': self.objective,
30 | 'tree_method': self.tree_method,
31 | 'seed': self.seed,
32 | }
33 | return dict_
34 |
--------------------------------------------------------------------------------
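
A minimal end-to-end sketch of the XGBoost path on hypothetical random data; the default tree_method='gpu_hist' assumes a CUDA-capable GPU, so the sketch passes 'hist' to stay CPU-friendly:
```
import numpy as np
from models.Xgboost import create_XGB
from Training import run_experiment_XGB
from Evaluation import evaluate_XGB

# Hypothetical random data with 10 features
X_train, y_train = np.random.rand(200, 10), np.random.randint(0, 2, 200)
X_test, y_test = np.random.rand(50, 10), np.random.randint(0, 2, 50)

model = create_XGB(tree_method='hist')  # 'hist' runs on CPU; the code defaults to 'gpu_hist'
booster = run_experiment_XGB(model, X_train, y_train)
evaluate_XGB(booster, X_test, y_test)
```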
/utils.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | from tensorflow.keras import layers
3 | from tensorflow import keras
4 | import numpy as np
5 | import networkx as nx
6 |
7 |
8 | def eng_class(x):
9 | if x <= 0:
10 | return 0
11 | else:
12 | return 1
13 |
14 |
15 | def sampling_k_elements(group, k=30000):
16 | if len(group) < k:
17 | return group
18 | return group.sample(k)
19 |
20 |
21 | def create_ffn(hidden_units, dropout_rate, name=None):
22 | fnn_layers = []
23 |
24 | for units in hidden_units:
25 | fnn_layers.append(layers.BatchNormalization())
26 | fnn_layers.append(layers.Dropout(dropout_rate))
27 | fnn_layers.append(layers.Dense(units, activation=tf.nn.gelu))
28 |
29 | return keras.Sequential(fnn_layers, name=name)
30 |
31 |
32 | def normalize(df):
33 | df["user_followers"] = np.log10(df["user_followers"] + 1e-5)
34 | df["user_ntweet"] = np.log10(df["user_ntweet"] + 1e-5)
35 | df = df.drop(["hashtag", "text", "time", "screen_name", "favorite", "engagement", "retweet", "id"], axis=1)
36 | for col in df.columns:
37 | if not isinstance(df[col].values[0], str):
38 | df[col] = (df[col] - df[col].min()) / (df[col].max() - df[col].min())
39 | return df
40 |
41 |
42 | def extract_graph(g, df):
43 | mapping_graph = {k: v for v, k in enumerate(g.nodes)}
44 | g = nx.relabel_nodes(g, mapping_graph)
45 | edges = np.array(list(g.edges)).T
46 | edges_weight = [x[2]["weight"] for x in g.edges(data=True)]
47 | features_names = list(set(df.columns) - {"n_emojis", "user_following", "official_source", "class"})
48 | node_features = tf.cast(
49 | df.sort_index()[features_names].to_numpy(), dtype=tf.dtypes.float32
50 | )
51 | graph_info = (node_features, edges, edges_weight)
52 | return graph_info
53 |
--------------------------------------------------------------------------------
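
A small sketch of the FFN block returned by create_ffn (each entry of hidden_units adds a BatchNormalization, Dropout, GELU Dense stack); the input dimension 16 is a hypothetical placeholder:
```
from utils import create_ffn

# BatchNormalization -> Dropout -> Dense(GELU) repeated for each entry of hidden_units
block = create_ffn(hidden_units=[32, 32], dropout_rate=0.5, name="ffn_demo")
block.build(input_shape=(None, 16))
block.summary()
```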
/CreateNetwork.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import ast
3 | import networkx as nx
4 | import datetime
5 | from joblib import Parallel, delayed
6 | from tqdm.auto import tqdm
7 | import numpy as np
8 |
9 | # settings (define the path to the tweets dataset)
10 | delta = 15
11 | path_to_tweets = "./first_week.csv"
12 | n_jobs = 8
13 |
14 | '''
15 | Function to compute the edges on a slice of the tweets dataframe.
16 | Returns the corresponding list of edges
17 | '''
18 |
19 |
20 | def edges_subset(split_df, delta=15):
21 | # filter by time
22 | edges = []
23 | for _, post in split_df.iterrows():
24 | sub_df = df.loc[((df["time"]) > post["time"] - datetime.timedelta(minutes=delta)) &
25 | (df["time"] < post["time"])].copy(deep=True)
26 | sub_df["connected"] = sub_df["hashtag"].apply(lambda x: len(set(x).intersection(post["hashtag"])))
27 | sub_df = sub_df.loc[sub_df["connected"] > 0]
28 | edges = edges + [(row["id"], post["id"], row["connected"]) for _, row in sub_df.iterrows()]
29 | return edges
30 |
31 |
32 | # LOAD DATA
33 | df = pd.read_csv(path_to_tweets, lineterminator='\n')
34 | df["hashtag"] = df["hashtag"].apply(lambda x: list(set(ast.literal_eval(x))))
35 | df["time"] = pd.to_datetime(df["time"])
36 |
37 | # COMPUTE EDGES using Parallel jobs. It works on dataframe splits
38 | all_edges = Parallel(n_jobs=n_jobs)(delayed(edges_subset)(split_df, delta=delta) for split_df in tqdm(np.array_split(df, 100)))
39 | all_edges = [y for x in all_edges for y in x]
40 |
41 | # CREATE GRAPH
42 | graph = nx.Graph()
43 | # add weighted edges
44 | graph.add_weighted_edges_from(all_edges)
45 | # add isolated nodes
46 | isolated = set(df["id"]).difference(list(graph.nodes))
47 | graph.add_nodes_from(isolated)
48 | # add node attributes
49 | nx.set_node_attributes(graph, df.loc[df["id"].isin(list(graph.nodes))].set_index("id").to_dict(orient="index"))
50 | print("NODES:", len(graph.nodes))
51 | print("EDGES:", len(graph.edges))
52 | print("DENSITY:", nx.density(graph))
53 | print("NUM CONNECTED COMPONENTS:", len([len(c) for c in sorted(nx.connected_components(graph), key=len, reverse=True)]))
54 | print("MAX CONNECTED COMPONENT:", max([len(c) for c in sorted(nx.connected_components(graph), key=len, reverse=True)]))
55 |
56 | # derive the dataset name from the input path (currently unused; the graph is always saved as network_tweets.pickle)
57 | if "/" in path_to_tweets:
58 | filename = path_to_tweets.split("/")[-1].split(".")[0]
59 | else:
60 | filename = path_to_tweets.split("\\")[-1].split(".")[0]
61 |
62 |
63 | # protocol=4 ensures compatibility with older Python versions
64 | nx.write_gpickle(graph, "network_tweets.pickle", protocol=4)
65 |
--------------------------------------------------------------------------------
/req.txt:
--------------------------------------------------------------------------------
1 | absl-py==0.7.1
2 | apturl==0.5.2
3 | asn1crypto==0.24.0
4 | astor==0.8.0
5 | attrs==17.4.0
6 | beautifulsoup4==4.11.1
7 | blinker==1.4
8 | bounded-pool-executor==0.0.3
9 | Brlapi==0.6.6
10 | certifi==2018.1.18
11 | chardet==3.0.4
12 | click==6.7
13 | cloudpickle==1.6.0
14 | colorama==0.3.7
15 | command-not-found==0.3
16 | cryptography==2.1.4
17 | cupshelpers==1.0
18 | cycler==0.10.0
19 | dask==1.1.1
20 | dataclasses==0.8
21 | decorator==4.3.2
22 | defer==1.0.6
23 | distro-info===0.18ubuntu0.18.04.1
24 | filelock==3.4.1
25 | flake8==3.5.0
26 | Flask==0.12.2
27 | future==0.18.2
28 | gast==0.2.2
29 | gdown==4.5.3
30 | graphviz==0.14.2
31 | grpcio==1.21.1
32 | gym==0.18.0
33 | h5py==2.9.0
34 | httplib2==0.9.2
35 | huggingface-hub==0.4.0
36 | idna==2.6
37 | imageio==2.9.0
38 | importlib-metadata==4.8.3
39 | iopath==0.1.10
40 | iotop==0.6
41 | itsdangerous==0.24
42 | Jinja2==2.10
43 | joblib==0.11
44 | Keras-Applications==1.0.8
45 | Keras-Preprocessing==1.1.0
46 | keyring==10.6.0
47 | keyrings.alt==3.0
48 | kiwisolver==1.3.1
49 | kornia==0.6.1
50 | language-selector==0.1
51 | launchpadlib==1.10.6
52 | lazr.restfulclient==0.13.5
53 | lazr.uri==1.0.3
54 | louis==3.5.0
55 | lpips==0.1.3
56 | macaroonbakery==1.1.3
57 | Mako==1.0.7
58 | Markdown==3.1.1
59 | MarkupSafe==1.0
60 | matplotlib==3.0.2
61 | mccabe==0.6.1
62 | mock==3.0.5
63 | netifaces==0.10.4
64 | networkx==2.2
65 | nltk==3.6.7
66 | nose==1.3.7
67 | numpy==1.19.5
68 | oauth==1.0.1
69 | olefile==0.45.1
70 | packaging==21.2
71 | pandas==1.1.5
72 | patool==1.12
73 | pexpect==4.2.1
74 | Pillow==8.3.1
75 | piq==0.5.5
76 | pluggy==0.6.0
77 | portalocker==2.5.1
78 | protobuf==3.8.0
79 | py==1.5.2
80 | pycairo==1.16.2
81 | pycodestyle==2.3.1
82 | pycrypto==2.6.1
83 | pycups==1.9.73
84 | pyDeprecate==0.3.2
85 | pyflakes==1.6.0
86 | pyglet==1.5.0
87 | PyGObject==3.26.1
88 | pyinotify==0.9.6
89 | pymacaroons==0.13.0
90 | PyNaCl==1.1.2
91 | pyOpenSSL==17.5.0
92 | pyparsing==2.4.7
93 | pyRFC3339==1.0
94 | PySocks==1.7.1
95 | pytest==3.3.2
96 | python-apt==1.6.5+ubuntu0.7
97 | python-dateutil==2.8.2
98 | python-debian==0.1.32
99 | pytorch-ignite==0.2.0
100 | pytorch-msssim==0.2.1
101 | pytz==2018.3
102 | PyWavelets==1.0.1
103 | pyxdg==0.25
104 | PyYAML==6.0
105 | rarfile==4.0
106 | regex==2022.10.31
107 | reportlab==3.4.0
108 | requests==2.18.4
109 | requests-unixsocket==0.1.5
110 | sacremoses==0.0.53
111 | scikit-image==0.14.2
112 | scikit-learn==0.19.1
113 | scipy==1.5.4
114 | screen-resolution-extra==0.0.0
115 | SecretStorage==2.3.1
116 | sentence-transformers==2.2.2
117 | sentencepiece==0.1.97
118 | simplejson==3.13.2
119 | six==1.16.0
120 | soupsieve==2.3.2.post1
121 | ssh-import-id==5.7
122 | system-service==0.3
123 | systemd-python==234
124 | tb-nightly==1.15.0a20190624
125 | tensorboard==1.13.1
126 | tensorboardX==1.5
127 | tensorflow==1.13.1
128 | tensorflow-cpu==0.0.0
129 | tensorflow-estimator==1.13.0
130 | termcolor==1.1.0
131 | tokenizers==0.12.1
132 | toolz==0.9.0
133 | torch==1.9.0
134 | torchaudio==0.7.2
135 | torchmetrics==0.8.2
136 | torchvision==0.10.0
137 | tqdm==4.29.1
138 | transformers==4.18.0
139 | typing-extensions==3.10.0.0
140 | ubuntu-advantage-tools==27.6
141 | ubuntu-drivers-common==0.0.0
142 | ufw==0.36
143 | unattended-upgrades==0.1
144 | urllib3==1.22
145 | usb-creator==0.3.3
146 | virtualenv==15.1.0
147 | wadllib==1.3.2
148 | Werkzeug==0.15.4
149 | xkit==0.0.0
150 | zipp==3.6.0
151 | zope.interface==4.3.2
152 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # A Graph Neural Network Framework For Post Engagement Prediction in Online Social Media
2 | Welcome to the "A Graph Neural Network Framework For Post Engagement Prediction in Online Social Media" repository!
3 |
4 | This repository contains two works focused on predicting post engagement in online social media using graph neural networks.
5 | The goal of this framework is to provide an effective solution for understanding and predicting the engagement of posts in online social media platforms, using the relationships between posts, users, and other network structures. The works presented here showcase the potential of graph neural networks in this field and provide a solid foundation for future research. We hope this repository serves as a valuable resource for the machine learning community.
6 |
7 | ## Predicting Tweet Engagement with Graph Neural Networks [arXiv](http://arxiv.org/abs/2305.10103) [ACM DL](https://dl.acm.org/doi/10.1145/3591106.3592294) [Poster](./imgs/Poster_ICMR23.pdf)
8 | Official implementation of the paper: "Predicting Tweet Engagement with Graph Neural Networks"
9 |
10 | Published in the **ACM International Conference on Multimedia Retrieval 2023 (ICMR2023)**
11 |
12 | In this paper, we present **TweetGage**, a Graph Neural Network solution that predicts user engagement based on a novel graph-based model of the relationships among posts.
13 |
14 |
15 | [Marco Arazzi](https://scholar.google.com/citations?user=8dD5SUkAAAAJ&hl=it&oi=ao),
16 | [Marco Cotogni](https://scholar.google.com/citations?user=8PUz5lAAAAAJ&hl=it),
17 | [Antonino Nocera](https://scholar.google.com/citations?user=YF10PJwAAAAJ&hl=it) and
18 | [Luca Virgili](https://scholar.google.com/citations?hl=it&user=2D771YsAAAAJ)
19 |
20 |
21 |
22 |
23 | ### Requirements
24 | To replicate our results, create an environment via Anaconda and install the required packages with pip:
25 | ```
26 | conda create -n TweetGage python=3.9
27 | conda activate TweetGage
28 | pip install -r req.txt
29 | ```
30 | ### Dataset
31 | For our experiments, we considered one week of Twitter data, from [November 1st to November 7th, 2021](https://archive.org/details/archiveteam-twitter-stream-2021-11), obtained through the Twitter API.
32 |
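After preprocessing, the code expects a CSV with one row per tweet. Below is a minimal sketch of loading it with the column names used throughout the code (inferred from `utils.py` and `CreateNetwork.py`; the exact schema of your own export may differ):
```
import pandas as pd
import ast

df = pd.read_csv("first_week.csv", lineterminator="\n")
# Columns referenced by the code include: id, time, text, hashtag, screen_name,
# engagement, retweet, favorite, user_followers, user_ntweet
df["hashtag"] = df["hashtag"].apply(ast.literal_eval)
df["time"] = pd.to_datetime(df["time"])
print(df.columns.tolist())
```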
33 |
34 |
35 |
36 |
37 | ### Graph Creation
38 |
39 | Once the tweets have been downloaded, the graph can be built and saved as a .pickle file with:
40 | ```
41 | python3 CreateNetwork.py
42 | ```
43 |
44 | The script saves the resulting graph as 'network_tweets.pickle'.
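To verify the output, the pickle can be loaded back and inspected (a minimal sketch; `read_gpickle` is available in networkx 2.x, as pinned in req.txt):
```
import networkx as nx

g = nx.read_gpickle("network_tweets.pickle")
print(len(g.nodes), "nodes,", len(g.edges), "edges")
```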
45 |
46 | ### Running the Code
47 |
48 | Once the graph has been created, you can replicate the results of our paper by executing the following command in your terminal:
49 | ```
50 | python3 main.py --LOAD_CSV --EXTRACT_BERT --USE_PCA --USER_FEAT --BERT_FEAT --Model_Type 'GCN'
51 | ```
52 | #### Arguments Explanation
53 |
54 | The following arguments can be passed to the main.py script:
55 |
56 | - LOAD_CSV: If you have already computed the features in a CSV file, you can load it with this argument. In our code, we load the file "first_week_posts_bert.csv", which contains post features and BERT-extracted text embeddings.
57 | - EXTRACT_BERT: Computes the text embeddings of the posts using BERT (valid only if LOAD_CSV is not provided).
58 | - USE_PCA: If set, applies Principal Component Analysis, keeping 48 components that cover more than 80% of the variance of the text features.
59 | - USER_FEAT: If set, includes post features in the final feature set.
60 | - BERT_FEAT: If set, includes text (BERT) features in the final feature set.
61 | - Model_Type: Can be one of the following: 'GCN', 'MLP', 'Conv1D', 'GAT', 'XGBOOST'. Default value is 'GCN'.
62 |
63 | Note: the flags are disabled by default, so any argument that is omitted stays False. Example invocations are shown below.
64 |
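For instance, to train the MLP on post features only, or XGBoost on the full feature set with PCA (illustrative flag combinations, not necessarily the paper's exact configurations):
```
python3 main.py --LOAD_CSV --USER_FEAT --Model_Type 'MLP'
python3 main.py --LOAD_CSV --USER_FEAT --BERT_FEAT --USE_PCA --Model_Type 'XGBOOST'
```
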
65 | ### Results
66 |
67 |
68 |
69 |
70 |
71 |
72 | #### References
73 | If this repo is useful to your research or you want to cite our paper, please use:
74 | ```
75 | @inproceedings{
76 | 10.1145/3591106.3592294,
77 | author = {Arazzi, Marco and Cotogni, Marco and Nocera, Antonino and Virgili, Luca},
78 | title = {Predicting Tweet Engagement with Graph Neural Networks},
79 | year = {2023},
80 | booktitle = {Proceedings of the 2023 ACM International Conference on Multimedia Retrieval},
81 | pages = {172–180},
82 | numpages = {9},
83 | location = {Thessaloniki, Greece},
84 | series = {ICMR '23}
85 | }
86 | ```
87 |
88 | ## Available Soon...
89 |
--------------------------------------------------------------------------------
/models/GAT.py:
--------------------------------------------------------------------------------
1 | from tensorflow import keras
2 | from tensorflow.keras import layers
3 | import tensorflow as tf
4 |
5 |
6 | class GraphAttention(layers.Layer):
7 | def __init__(
8 | self,
9 | units,
10 | kernel_initializer="glorot_uniform",
11 | kernel_regularizer=None,
12 | **kwargs,
13 | ):
14 | super().__init__(**kwargs)
15 | self.units = units
16 | self.kernel_initializer = keras.initializers.get(kernel_initializer)
17 | self.kernel_regularizer = keras.regularizers.get(kernel_regularizer)
18 |
19 | def build(self, input_shape):
20 | self.kernel = self.add_weight(
21 | shape=(input_shape[0][-1], self.units),
22 | trainable=True,
23 | initializer=self.kernel_initializer,
24 | regularizer=self.kernel_regularizer,
25 | name="kernel",
26 | )
27 | self.kernel_attention = self.add_weight(
28 | shape=(self.units * 2, 1),
29 | trainable=True,
30 | initializer=self.kernel_initializer,
31 | regularizer=self.kernel_regularizer,
32 | name="kernel_attention",
33 | )
34 | self.built = True
35 |
36 | def call(self, inputs):
37 | node_states, edges = inputs
38 |
39 | # Linearly transform node states
40 | node_states_transformed = tf.matmul(node_states, self.kernel)
41 |
42 | # (1) Compute pair-wise attention scores
43 | node_states_expanded = tf.gather(node_states_transformed, edges)
44 | node_states_expanded = tf.reshape(
45 | node_states_expanded, (tf.shape(edges)[0], -1)
46 | )
47 | attention_scores = tf.nn.leaky_relu(
48 | tf.matmul(node_states_expanded, self.kernel_attention)
49 | )
50 | attention_scores = tf.squeeze(attention_scores, -1)
51 |
52 | # (2) Normalize attention scores
53 | attention_scores = tf.math.exp(tf.clip_by_value(attention_scores, -2, 2))
54 | attention_scores_sum = tf.math.unsorted_segment_sum(
55 | data=attention_scores,
56 | segment_ids=edges[:, 0],
57 | num_segments=tf.reduce_max(edges[:, 0]) + 1,
58 | )
59 | attention_scores_sum = tf.repeat(
60 | attention_scores_sum, tf.math.bincount(tf.cast(edges[:, 0], "int32"))
61 | )
62 | attention_scores_norm = attention_scores / attention_scores_sum
63 |
64 | # (3) Gather node states of neighbors, apply attention scores and aggregate
65 | node_states_neighbors = tf.gather(node_states_transformed, edges[:, 1])
66 | out = tf.math.unsorted_segment_sum(
67 | data=node_states_neighbors * attention_scores_norm[:, tf.newaxis],
68 | segment_ids=edges[:, 0],
69 | num_segments=tf.shape(node_states)[0],
70 | )
71 | return out
72 |
73 |
74 | class MultiHeadGraphAttention(layers.Layer):
75 | def __init__(self, units, num_heads=8, merge_type="concat", **kwargs):
76 | super().__init__(**kwargs)
77 | self.num_heads = num_heads
78 | self.merge_type = merge_type
79 | self.attention_layers = [GraphAttention(units) for _ in range(num_heads)]
80 |
81 | def call(self, inputs):
82 | atom_features, pair_indices = inputs
83 |
84 | # Obtain outputs from each attention head
85 | outputs = [
86 | attention_layer([atom_features, pair_indices])
87 | for attention_layer in self.attention_layers
88 | ]
89 | # Concatenate or average the node states from each head
90 | if self.merge_type == "concat":
91 | outputs = tf.concat(outputs, axis=-1)
92 | else:
93 | outputs = tf.reduce_mean(tf.stack(outputs, axis=-1), axis=-1)
94 | # Activate and return node states
95 | return tf.nn.relu(outputs)
96 |
97 |
98 | class GraphAttentionNetwork(keras.Model):
99 | def __init__(
100 | self,
101 | node_states,
102 | edges,
103 | hidden_units,
104 | num_heads,
105 | num_layers,
106 | output_dim,
107 | **kwargs,
108 | ):
109 | super().__init__(**kwargs)
110 | self.node_states = node_states
111 | self.edges = edges
112 | self.preprocess = layers.Dense(hidden_units * num_heads, activation="relu")
113 | self.attention_layers = [
114 | MultiHeadGraphAttention(hidden_units, num_heads) for _ in range(num_layers)
115 | ]
116 | self.output_layer = layers.Dense(output_dim)
117 |
118 | def call(self, inputs):
119 | node_states, edges = inputs
120 | x = self.preprocess(node_states)
121 | for attention_layer in self.attention_layers:
122 | x = attention_layer([x, edges]) + x
123 | outputs = self.output_layer(x)
124 | return outputs
125 |
126 | def train_step(self, data):
127 | indices, labels = data
128 |
129 | with tf.GradientTape() as tape:
130 | # Forward pass
131 | outputs = self([self.node_states, self.edges])
132 | # Compute loss
133 | loss = self.compiled_loss(labels, tf.gather(outputs, indices))
134 | # Compute gradients
135 | grads = tape.gradient(loss, self.trainable_weights)
136 | # Apply gradients (update weights)
137 | self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
138 | # Update metric(s)
139 | self.compiled_metrics.update_state(labels, tf.gather(outputs, indices))
140 |
141 | return {m.name: m.result() for m in self.metrics}
142 |
143 | def predict_step(self, data):
144 | indices = data
145 | # Forward pass
146 | outputs = self([self.node_states, self.edges])
147 | # Compute probabilities
148 | return tf.nn.softmax(tf.gather(outputs, indices))
149 |
150 | def test_step(self, data):
151 | indices, labels = data
152 | # Forward pass
153 | outputs = self([self.node_states, self.edges])
154 | # Compute loss
155 | loss = self.compiled_loss(labels, tf.gather(outputs, indices))
156 | # Update metric(s)
157 | self.compiled_metrics.update_state(labels, tf.gather(outputs, indices))
158 |
159 | return {m.name: m.result() for m in self.metrics}
160 |
161 |
162 | def create_GAT(node_states, edges, hidden_units, num_heads, num_layers, num_classes):
163 | gat_model = GraphAttentionNetwork(
164 | node_states, edges, hidden_units, num_heads, num_layers, num_classes  # edges are already transposed to (num_edges, 2) by the caller
165 | )
166 | return gat_model
167 |
--------------------------------------------------------------------------------
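
A minimal smoke-test sketch on a tiny synthetic graph (hypothetical node features and edges); edges are passed with shape (num_edges, 2), matching how main.py calls create_GAT:
```
import tensorflow as tf
from models.GAT import create_GAT

# Tiny hypothetical graph: 5 nodes with 8 features, 6 directed (source, target) edges sorted by source
node_states = tf.random.normal((5, 8))
edges = tf.constant([[0, 1], [0, 2], [1, 2], [2, 0], [3, 4], [4, 3]], dtype="int32")
model = create_GAT(node_states, edges, hidden_units=16, num_heads=2, num_layers=1, num_classes=2)
logits = model([node_states, edges])
print(logits.shape)  # (5, 2)
```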
/main.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from sentence_transformers import SentenceTransformer
3 | import gc
4 | from sklearn.decomposition import PCA
5 | from sklearn.model_selection import train_test_split
6 | from Training import run_experiment, run_experiment_XGB
7 | from Evaluation import evaluate, evaluate_XGB
8 | from utils import normalize, eng_class, sampling_k_elements, extract_graph
9 | import numpy as np
10 | import networkx as nx
11 | from tensorflow import keras
12 | from keras.utils import to_categorical
13 | import random
14 | from models.Xgboost import create_XGB
15 | from models.Conv1D import create_Conv1D
16 | from models.GAT import create_GAT
17 | from models.GCN import create_GCN
18 | from models.MLP import create_MLP
19 | import argparse
20 | import os
21 | import tensorflow as tf
22 | def parse_args():
23 | parser = argparse.ArgumentParser("TweetGage Params")
24 | a = parser.add_argument
25 | a('--LOAD_CSV', action='store_true')
26 | a('--EXTRACT_BERT', action='store_true')
27 | a('--USE_PCA', action='store_true')
28 | a('--USER_FEAT', action='store_true')
29 | a('--BERT_FEAT', action='store_true')
30 | a('--Model_Type', default='GCN', type=str)
31 | return parser.parse_args()
32 |
33 |
34 | def reset_random_seeds():
35 | os.environ['PYTHONHASHSEED'] = str(2)
36 | tf.random.set_seed(2)
37 | np.random.seed(2)
38 | random.seed(2)
39 |
40 |
41 | def select_params(Model_type, X_train, y_train, X_test, y_test, df, g, num_classes=2, num_epochs=300):
42 | num_classes = num_classes
43 | num_epochs = num_epochs
44 | dropout_rate = None
45 | num_layers = None
46 | num_heads = None
47 | if Model_type == 'GCN':
48 | hidden_units = [16]
49 | dropout_rate = 0.3
50 | learning_rate = 0.1
51 | batch_size = 256
52 | input = np.array(X_train.index)
53 | target = to_categorical(y_train)
54 | loss = keras.losses.CategoricalCrossentropy
55 | optimizer = keras.optimizers.Adam
56 | input_test = np.array(X_test.index)
57 | target_test = y_test
58 | graph_info = extract_graph(g, df)
59 | model = create_GCN(graph_info, num_classes, hidden_units, dropout_rate)
60 | if Model_type == 'MLP':
61 | hidden_units = [32, 32]
62 | learning_rate = 0.01
63 | dropout_rate = 0.5
64 | batch_size = 256
65 | loss = keras.losses.CategoricalCrossentropy
66 | input = X_train
67 | target = to_categorical(y_train)
68 | input_test = X_test
69 | target_test = y_test
70 | optimizer = keras.optimizers.Adam
71 | model = create_MLP(X_train.shape[1], hidden_units, num_classes, dropout_rate)
72 | if Model_type == 'Conv1D':
73 | hidden_units = 64
74 | learning_rate = 0.1
75 | batch_size = 256
76 | model = create_Conv1D(num_classes, hidden_units, X_train.shape[1])
77 | input = X_train.values.reshape(-1, X_train.shape[1], 1)
78 | loss = keras.losses.CategoricalCrossentropy
79 | target = to_categorical(y_train)
80 | optimizer = keras.optimizers.Adam
81 | input_test = X_test.values.reshape(-1, X_test.shape[1], 1)
82 | target_test = y_test
83 | if Model_type == 'GAT':
84 | hidden_units = 100
85 | num_heads = 2
86 | num_layers = 1
87 | batch_size = 64
88 | learning_rate = 1e-2
89 | graph_info = extract_graph(g, df)
90 | input = np.array(X_train.index)
91 | target = to_categorical(y_train)
92 | model = create_GAT(graph_info[0], graph_info[1].T, hidden_units, num_heads, num_layers, num_classes)
93 | loss = keras.losses.CategoricalCrossentropy
94 | optimizer = keras.optimizers.SGD
95 | input_test = np.array(X_test.index)
96 | target_test = y_test
97 | if Model_type == 'XGBOOST':
98 | max_depth = 8
99 | learning_rate = 0.025
100 | subsample = 0.85
101 | colsample_bytree = 0.35
102 | eval_metric = 'logloss'
103 | objective = 'binary:logistic'
104 | tree_method = 'gpu_hist'
105 | seed = 1
106 | model = create_XGB(max_depth, learning_rate, subsample,
107 | colsample_bytree, eval_metric, objective,
108 | tree_method, seed)
109 | return model
110 | return hidden_units, num_classes, learning_rate, num_epochs, dropout_rate, batch_size, num_layers, num_heads, input, target, loss, optimizer, input_test, target_test, model
111 |
112 |
113 | def main(LOAD_CSV=False, EXTRACT_BERT=True, USE_PCA=False, USER_FEAT=True, BERT_FEAT=True, Model_Type='GCN'):
114 | reset_random_seeds()
115 | g = nx.read_gpickle('./network_tweets.pickle')
116 | print("POST:", len(g.nodes))
117 | print("ARCS:", len(g.edges))
118 | print("COMPONENTS:", nx.number_connected_components(g))
119 | if not LOAD_CSV:
120 | df = pd.read_csv("./first_week.csv", lineterminator="\n")
121 | df["class"] = df["engagement"].apply(lambda x: eng_class(x))
122 | df = df.groupby('class').apply(sampling_k_elements).reset_index(drop=True)
123 | if EXTRACT_BERT:
124 | model = SentenceTransformer('efederici/sentence-bert-base')
125 | emb = model.encode(df["text"])
126 | if USE_PCA:
127 | pca = PCA(n_components=48)
128 | pca.fit(emb)
129 | emb = pca.transform(emb)
130 | df = pd.concat([df, pd.DataFrame(emb)], axis=1)
131 | del emb, model
132 | gc.collect()
133 | df = normalize(df)
134 | else:
135 | df = pd.read_csv("./first_week_posts_bert.csv")
136 | if USER_FEAT and not BERT_FEAT:
137 | df = df.iloc[:, 0:11]
138 | if not USER_FEAT and BERT_FEAT:
139 | df = df.iloc[:, 10:]
140 | if USE_PCA:
141 | pca = PCA(n_components=48)
142 | print('PCA 48 Components')
143 | pca.fit(df.drop(["class"], axis=1))
144 | emb = pca.transform(df.drop(["class"], axis=1))
145 | df = pd.concat([pd.DataFrame(emb), df[["class"]]], axis=1)
146 |
147 | X_train, X_test, y_train, y_test = train_test_split(df.drop(["class"], axis=1), df["class"], test_size=0.2,
148 | random_state=42, stratify=df["class"])
149 | if not Model_Type == 'XGBOOST':
150 | hidden_units, num_classes, learning_rate, num_epochs, dropout_rate, batch_size, num_layers, \
151 | num_heads, input, target, loss, optimizer, input_test, target_test, model = select_params(Model_Type, X_train,
152 | y_train, X_test,
153 | y_test,
154 | df,
155 | g,
156 | num_epochs=300)
157 | run_experiment(model, input, target, learning_rate, loss, num_epochs, batch_size, optimizer)
158 | evaluate(model, input_test, target_test)
159 | else:
160 | model = select_params(Model_Type, X_train, y_train, X_test, y_test, df, g,
161 | num_epochs=300)
162 | obj = run_experiment_XGB(model, X_train, y_train)
163 | evaluate_XGB(obj, X_test, y_test)
164 |
165 |
166 | if __name__ == '__main__':
167 | args = vars(parse_args())
168 | main(**args)
169 |
--------------------------------------------------------------------------------
/models/GCN.py:
--------------------------------------------------------------------------------
1 | from tensorflow.keras import layers
2 | from utils import create_ffn
3 | import tensorflow as tf
4 |
5 |
6 | class GraphConvLayer(layers.Layer):
7 | def __init__(
8 | self,
9 | hidden_units,
10 | dropout_rate=0.2,
11 | aggregation_type="mean",
12 | combination_type="concat",
13 | normalize=False,
14 | *args,
15 | **kwargs,
16 | ):
17 | super(GraphConvLayer, self).__init__(*args, **kwargs)
18 |
19 | self.aggregation_type = aggregation_type
20 | self.combination_type = combination_type
21 | self.normalize = normalize
22 |
23 | self.ffn_prepare = create_ffn(hidden_units, dropout_rate)
24 | if self.combination_type == "gru":
25 | self.update_fn = layers.GRU(
26 | units=hidden_units,
27 | activation="tanh",
28 | recurrent_activation="sigmoid",
29 | dropout=dropout_rate,
30 | return_state=True,
31 | recurrent_dropout=dropout_rate,
32 | )
33 | else:
34 | self.update_fn = create_ffn(hidden_units, dropout_rate)
35 |
36 | def prepare(self, node_repesentations, weights=None):
37 | # node_repesentations shape is [num_edges, embedding_dim].
38 | messages = self.ffn_prepare(node_repesentations)
39 | if weights is not None:
40 | messages = messages * tf.expand_dims(weights, -1)
41 | return messages
42 |
43 | def aggregate(self, node_indices, neighbour_messages, node_repesentations):
44 | # node_indices shape is [num_edges].
45 | # neighbour_messages shape: [num_edges, representation_dim].
46 | # node_repesentations shape is [num_nodes, representation_dim].
47 | num_nodes = node_repesentations.shape[0]
48 | if self.aggregation_type == "sum":
49 | aggregated_message = tf.math.unsorted_segment_sum(
50 | neighbour_messages, node_indices, num_segments=num_nodes
51 | )
52 | elif self.aggregation_type == "mean":
53 | aggregated_message = tf.math.unsorted_segment_mean(
54 | neighbour_messages, node_indices, num_segments=num_nodes
55 | )
56 | elif self.aggregation_type == "max":
57 | aggregated_message = tf.math.unsorted_segment_max(
58 | neighbour_messages, node_indices, num_segments=num_nodes
59 | )
60 | else:
61 | raise ValueError(f"Invalid aggregation type: {self.aggregation_type}.")
62 |
63 | return aggregated_message
64 |
65 | def update(self, node_repesentations, aggregated_messages):
66 | # node_repesentations shape is [num_nodes, representation_dim].
67 | # aggregated_messages shape is [num_nodes, representation_dim].
68 | if self.combination_type == "gru":
69 | # Create a sequence of two elements for the GRU layer.
70 | h = tf.stack([node_repesentations, aggregated_messages], axis=1)
71 | elif self.combination_type == "concat":
72 | # Concatenate the node_repesentations and aggregated_messages.
73 | h = tf.concat([node_repesentations, aggregated_messages], axis=1)
74 | elif self.combination_type == "add":
75 | # Add node_repesentations and aggregated_messages.
76 | h = node_repesentations + aggregated_messages
77 | else:
78 | raise ValueError(f"Invalid combination type: {self.combination_type}.")
79 |
80 | # Apply the processing function.
81 | node_embeddings = self.update_fn(h)
82 | if self.combination_type == "gru":
83 | node_embeddings = tf.unstack(node_embeddings, axis=1)[-1]
84 |
85 | if self.normalize:
86 | node_embeddings = tf.nn.l2_normalize(node_embeddings, axis=-1)
87 | return node_embeddings
88 |
89 | def call(self, inputs):
90 | """Process the inputs to produce the node_embeddings.
91 |
92 | inputs: a tuple of three elements: node_repesentations, edges, edge_weights.
93 | Returns: node_embeddings of shape [num_nodes, representation_dim].
94 | """
95 |
96 | node_repesentations, edges, edge_weights = inputs
97 | # Get node_indices (source) and neighbour_indices (target) from edges.
98 | node_indices, neighbour_indices = edges[0], edges[1]
99 | # neighbour_repesentations shape is [num_edges, representation_dim].
100 | neighbour_repesentations = tf.gather(node_repesentations, neighbour_indices)
101 |
102 | # Prepare the messages of the neighbours.
103 | neighbour_messages = self.prepare(neighbour_repesentations, edge_weights)
104 | # Aggregate the neighbour messages.
105 | aggregated_messages = self.aggregate(
106 | node_indices, neighbour_messages, node_repesentations
107 | )
108 | # Update the node embedding with the neighbour messages.
109 | return self.update(node_repesentations, aggregated_messages)
110 |
111 |
112 | class GNNNodeRegression(tf.keras.Model):
113 | def __init__(
114 | self,
115 | graph_info,
116 | hidden_units,
117 | num_classes,
118 | aggregation_type="sum",
119 | combination_type="concat",
120 | dropout_rate=0.2,
121 | normalize=True,
122 | *args,
123 | **kwargs,
124 | ):
125 | super(GNNNodeRegression, self).__init__(*args, **kwargs)
126 |
127 | # Unpack graph_info to three elements: node_features, edges, and edge_weight.
128 | node_features, edges, edge_weights = graph_info
129 | self.node_features = node_features
130 | self.edges = edges
131 | self.edge_weights = edge_weights
132 | # Set edge_weights to ones if not provided.
133 | if self.edge_weights is None:
134 | self.edge_weights = tf.ones(shape=edges.shape[1])
135 | # Scale edge_weights to sum to 1.
136 | self.edge_weights = self.edge_weights / tf.math.reduce_sum(self.edge_weights)
137 |
138 | # Create a process layer.
139 | self.preprocess = create_ffn(hidden_units, dropout_rate, name="preprocess")
140 | # Create the first GraphConv layer.
141 | self.conv1 = GraphConvLayer(
142 | hidden_units,
143 | dropout_rate,
144 | aggregation_type,
145 | combination_type,
146 | normalize,
147 | name="graph_conv1",
148 | )
149 | # Create the second GraphConv layer.
150 | self.conv2 = GraphConvLayer(
151 | hidden_units,
152 | dropout_rate,
153 | aggregation_type,
154 | combination_type,
155 | normalize,
156 | name="graph_conv2",
157 | )
158 | # Create a postprocess layer.
159 | self.postprocess = create_ffn(hidden_units, dropout_rate, name="postprocess")
160 | # Create a compute logits layer.
161 | self.compute_logits = layers.Dense(units=num_classes, name="logits")
162 |
163 | def call(self, input_node_indices):
164 | # Preprocess the node_features to produce node representations.
165 | x = self.preprocess(self.node_features)
166 | # Apply the first graph conv layer.
167 | x = self.conv1((x, self.edges, self.edge_weights))
168 | # Skip connection.
169 | # x = x1 + x
170 | # Apply the second graph conv layer.
171 | x = self.conv2((x, self.edges, self.edge_weights))
172 | # Skip connection.
173 | # x = x2 + x
174 | # Postprocess node embedding.
175 | x = self.postprocess(x)
176 | # Fetch node embeddings for the input node_indices.
177 | node_embeddings = tf.gather(x, input_node_indices)
178 | # Compute logits
179 | return self.compute_logits(node_embeddings)
180 |
181 |
182 | def create_GCN(graph_info, num_classes, hidden_units, dropout_rate):
183 | # sum aggregation + concat combination: never change this, it gives the best results
184 | gnn_model = GNNNodeRegression(
185 | num_classes=num_classes,
186 | aggregation_type="sum",
187 | combination_type="concat",
188 | graph_info=graph_info,
189 | hidden_units=hidden_units,
190 | dropout_rate=dropout_rate,
191 | name="gnn_model",
192 | )
193 | return gnn_model
194 |
--------------------------------------------------------------------------------
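
A minimal smoke-test sketch with a synthetic graph_info tuple (hypothetical features, edges, and weights), mirroring the structure produced by extract_graph in utils.py:
```
import numpy as np
import tensorflow as tf
from models.GCN import create_GCN

# Tiny hypothetical graph_info: 5 nodes with 8 features, edges as a (2, num_edges) array [sources; targets], unit weights
node_features = tf.random.normal((5, 8))
edges = np.array([[0, 1, 2, 3], [1, 2, 3, 4]])
edge_weights = tf.ones((4,))
model = create_GCN((node_features, edges, edge_weights), num_classes=2, hidden_units=[16], dropout_rate=0.3)
logits = model(np.array([0, 1, 2]))  # logits for nodes 0, 1 and 2
print(logits.shape)  # (3, 2)
```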