├── pygda ├── version.py ├── utils │ ├── __init__.py │ ├── svd_transform.py │ ├── utility.py │ └── mmd.py ├── __init__.py ├── metrics │ └── __init__.py ├── datasets │ ├── __init__.py │ ├── tugraph.py │ ├── mag.py │ ├── airport.py │ └── arxiv.py ├── models │ ├── __init__.py │ └── base.py └── nn │ ├── attention.py │ ├── __init__.py │ ├── reverse_layer.py │ ├── deepwalk_pretrain.py │ └── adagcn_base.py ├── docs ├── requirements.in ├── pygda_logo.png ├── models │ ├── ASN.md │ ├── GNN.md │ ├── KBL.md │ ├── SOGA.md │ ├── A2GNN.md │ ├── ACDNE.md │ ├── BaseGDA.md │ ├── CWGCN.md │ ├── DANE.md │ ├── DGDA.md │ ├── DGSDA.md │ ├── DMGNN.md │ ├── GRADE.md │ ├── JHGDA.md │ ├── PairAlign.md │ ├── SAGDA.md │ ├── SEPA.md │ ├── TDSS.md │ ├── AdaGCN.md │ ├── GTrans.md │ ├── GraphATA.md │ ├── GraphCTA.md │ ├── SpecReg.md │ ├── StruRW.md │ ├── UDAGCN.md │ └── Overview.md ├── nn │ ├── A2GNNBase.md │ ├── ACDNEBase.md │ ├── ASNBase.md │ ├── Attention.md │ ├── CWGCNBase.md │ ├── DGDABase.md │ ├── DGSDABase.md │ ├── GNNBase.md │ ├── GRADEBase.md │ ├── JHGDABase.md │ ├── KBLBase.md │ ├── MixUpBase.md │ ├── PPMIConv.md │ ├── SAGDABase.md │ ├── SOGABase.md │ ├── AdaGCNBase.md │ ├── GradReverse.md │ ├── PropGCNConv.md │ ├── ReweightGNN.md │ ├── UDAGCNBase.md │ ├── CacheGCNConv.md │ ├── DWPretrain.md │ ├── GMMClustering.md │ ├── GraphATABase.md │ ├── MixUpGCNConv.md │ └── NodeCentricConv.md ├── utils │ ├── MMD.md │ ├── Perturb.md │ ├── Sampler.md │ ├── Utility.md │ └── SVDTransform.md ├── datasets │ ├── MAG.md │ ├── Arxiv.md │ ├── Blog.md │ ├── Twitch.md │ ├── Airport.md │ ├── Citation.md │ ├── Elliptic.md │ ├── Facebook.md │ ├── Squirrel.md │ ├── TUGraph.md │ ├── Twitter.md │ └── Overview.md ├── metrics │ └── Metrics.md ├── cheatsheet │ ├── Dataset Cheatsheet.md │ └── Model Cheatsheet.md ├── requirements.txt ├── resources │ └── Resources.md ├── benchmark │ └── Overview.md └── assets │ └── css │ └── custom.css ├── .gitignore ├── benchmark ├── node │ ├── README.md │ ├── run.sh │ ├── parser.py │ └── run_blog.sh ├── graph │ ├── README.md │ ├── run_all_M.sh │ ├── parser.py │ ├── run_all_P.sh │ ├── run_all_F.sh │ ├── cwgcn.py │ ├── grade.py │ ├── udagcn.py │ ├── dane.py │ ├── a2gnn.py │ ├── sagda.py │ └── adagcn.py └── llm │ ├── README.md │ ├── parser.py │ ├── run1.sh │ ├── run2.sh │ ├── origin_preprocess.py │ ├── run3.sh │ ├── kbl.py │ ├── udagcn.py │ ├── grade.py │ ├── a2gnn.py │ └── adagcn.py ├── examples ├── README.md └── demo.py ├── .readthedocs.yaml ├── pyproject.toml ├── LICENSE ├── .github └── workflows │ ├── python-publish.yml │ └── codeql.yml ├── data └── README.md └── mkdocs.yml /pygda/version.py: -------------------------------------------------------------------------------- 1 | __version__ = '1.2.1' -------------------------------------------------------------------------------- /docs/requirements.in: -------------------------------------------------------------------------------- 1 | mkdocs 2 | mkdocstrings[python] 3 | markdown-include -------------------------------------------------------------------------------- /docs/pygda_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pygda-team/pygda/HEAD/docs/pygda_logo.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | testg.py 3 | testn.py 4 | data/* 5 | !data/README.md 6 | bench/ 7 | dist/ 8 | site/ 
-------------------------------------------------------------------------------- /pygda/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .utility import * 2 | from .mmd import * 3 | from .svd_transform import svd_transform 4 | from .sampler import * 5 | from .perturb import * 6 | -------------------------------------------------------------------------------- /benchmark/node/README.md: -------------------------------------------------------------------------------- 1 | # Benchmark 2 | Evaluation scripts for 16 methods on the five datasets. Each experiment is repeated 3 times. 3 | 4 | Run the whole suite via 5 | ``` 6 | ./run.sh 7 | ``` -------------------------------------------------------------------------------- /benchmark/node/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | for i in 1 2 3 4 | do 5 | echo $i 6 | ./run_airport.sh 7 | ./run_blog.sh 8 | ./run_citation.sh 9 | ./run_twitch.sh 10 | ./run_mag.sh 11 | done 12 | -------------------------------------------------------------------------------- /docs/models/ASN.md: -------------------------------------------------------------------------------- 1 | ::: pygda.models.asn 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/models/GNN.md: -------------------------------------------------------------------------------- 1 | ::: pygda.models.gnn 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/models/KBL.md: -------------------------------------------------------------------------------- 1 | ::: pygda.models.kbl 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/models/SOGA.md: -------------------------------------------------------------------------------- 1 | ::: pygda.models.soga 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 -------------------------------------------------------------------------------- /docs/nn/A2GNNBase.md: -------------------------------------------------------------------------------- 1 | ::: pygda.nn.a2gnn_base 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 -------------------------------------------------------------------------------- /docs/nn/ACDNEBase.md: -------------------------------------------------------------------------------- 1 | ::: pygda.nn.acdne_base 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 -------------------------------------------------------------------------------- /docs/nn/ASNBase.md: -------------------------------------------------------------------------------- 1 | ::: pygda.nn.asn_base 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | 
heading_level: 6 -------------------------------------------------------------------------------- /docs/nn/Attention.md: -------------------------------------------------------------------------------- 1 | ::: pygda.nn.attention 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 -------------------------------------------------------------------------------- /docs/nn/CWGCNBase.md: -------------------------------------------------------------------------------- 1 | ::: pygda.nn.cwgcn_base 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 -------------------------------------------------------------------------------- /docs/nn/DGDABase.md: -------------------------------------------------------------------------------- 1 | ::: pygda.nn.dgda_base 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 -------------------------------------------------------------------------------- /docs/nn/DGSDABase.md: -------------------------------------------------------------------------------- 1 | ::: pygda.nn.dgsda_base 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 -------------------------------------------------------------------------------- /docs/nn/GNNBase.md: -------------------------------------------------------------------------------- 1 | ::: pygda.nn.gnn_base 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 -------------------------------------------------------------------------------- /docs/nn/GRADEBase.md: -------------------------------------------------------------------------------- 1 | ::: pygda.nn.grade_base 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 -------------------------------------------------------------------------------- /docs/nn/JHGDABase.md: -------------------------------------------------------------------------------- 1 | ::: pygda.nn.jhgda_base 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 -------------------------------------------------------------------------------- /docs/nn/KBLBase.md: -------------------------------------------------------------------------------- 1 | ::: pygda.nn.kbl_base 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 -------------------------------------------------------------------------------- /docs/nn/MixUpBase.md: -------------------------------------------------------------------------------- 1 | ::: pygda.nn.mixup_base 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 -------------------------------------------------------------------------------- /docs/nn/PPMIConv.md: -------------------------------------------------------------------------------- 1 | ::: pygda.nn.ppmi_conv 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | 
ignore_init_summary: true 7 | heading_level: 6 -------------------------------------------------------------------------------- /docs/nn/SAGDABase.md: -------------------------------------------------------------------------------- 1 | ::: pygda.nn.sagda_base 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 -------------------------------------------------------------------------------- /docs/nn/SOGABase.md: -------------------------------------------------------------------------------- 1 | ::: pygda.nn.soga_base 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 -------------------------------------------------------------------------------- /docs/utils/MMD.md: -------------------------------------------------------------------------------- 1 | ::: pygda.utils.mmd 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/datasets/MAG.md: -------------------------------------------------------------------------------- 1 | ::: pygda.datasets.mag 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/models/A2GNN.md: -------------------------------------------------------------------------------- 1 | ::: pygda.models.a2gnn 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/models/ACDNE.md: -------------------------------------------------------------------------------- 1 | ::: pygda.models.acdne 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/models/BaseGDA.md: -------------------------------------------------------------------------------- 1 | ::: pygda.models.base 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/models/CWGCN.md: -------------------------------------------------------------------------------- 1 | ::: pygda.models.cwgcn 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/models/DANE.md: -------------------------------------------------------------------------------- 1 | ::: pygda.models.dane 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/models/DGDA.md: -------------------------------------------------------------------------------- 1 | ::: pygda.models.dgda 2 | options: 3 | docstring_style: numpy 4 | 
show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/models/DGSDA.md: -------------------------------------------------------------------------------- 1 | ::: pygda.models.dgsda 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/models/DMGNN.md: -------------------------------------------------------------------------------- 1 | ::: pygda.models.dmgnn 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/models/GRADE.md: -------------------------------------------------------------------------------- 1 | ::: pygda.models.grade 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/models/JHGDA.md: -------------------------------------------------------------------------------- 1 | ::: pygda.models.jhgda 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/models/PairAlign.md: -------------------------------------------------------------------------------- 1 | ::: pygda.models.pa 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/models/SAGDA.md: -------------------------------------------------------------------------------- 1 | ::: pygda.models.sagda 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/models/SEPA.md: -------------------------------------------------------------------------------- 1 | ::: pygda.models.sepa 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/models/TDSS.md: -------------------------------------------------------------------------------- 1 | ::: pygda.models.tdss 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/nn/AdaGCNBase.md: -------------------------------------------------------------------------------- 1 | ::: pygda.nn.adagcn_base 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 -------------------------------------------------------------------------------- /docs/nn/GradReverse.md: -------------------------------------------------------------------------------- 1 | ::: 
pygda.nn.reverse_layer 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 -------------------------------------------------------------------------------- /docs/nn/PropGCNConv.md: -------------------------------------------------------------------------------- 1 | ::: pygda.nn.prop_gcn_conv 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 -------------------------------------------------------------------------------- /docs/nn/ReweightGNN.md: -------------------------------------------------------------------------------- 1 | ::: pygda.nn.reweight_gnn 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 -------------------------------------------------------------------------------- /docs/nn/UDAGCNBase.md: -------------------------------------------------------------------------------- 1 | ::: pygda.nn.udagcn_base 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 -------------------------------------------------------------------------------- /docs/datasets/Arxiv.md: -------------------------------------------------------------------------------- 1 | ::: pygda.datasets.arxiv 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/datasets/Blog.md: -------------------------------------------------------------------------------- 1 | ::: pygda.datasets.blog 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/datasets/Twitch.md: -------------------------------------------------------------------------------- 1 | ::: pygda.datasets.twitch 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/metrics/Metrics.md: -------------------------------------------------------------------------------- 1 | ::: pygda.metrics.metrics 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/models/AdaGCN.md: -------------------------------------------------------------------------------- 1 | ::: pygda.models.adagcn 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/models/GTrans.md: -------------------------------------------------------------------------------- 1 | ::: pygda.models.gtrans 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- 
/docs/models/GraphATA.md: -------------------------------------------------------------------------------- 1 | ::: pygda.models.graphata 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/models/GraphCTA.md: -------------------------------------------------------------------------------- 1 | ::: pygda.models.graphcta 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/models/SpecReg.md: -------------------------------------------------------------------------------- 1 | ::: pygda.models.specreg 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/models/StruRW.md: -------------------------------------------------------------------------------- 1 | ::: pygda.models.strurw 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/models/UDAGCN.md: -------------------------------------------------------------------------------- 1 | ::: pygda.models.udagcn 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/nn/CacheGCNConv.md: -------------------------------------------------------------------------------- 1 | ::: pygda.nn.cached_gcn_conv 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 -------------------------------------------------------------------------------- /docs/nn/DWPretrain.md: -------------------------------------------------------------------------------- 1 | ::: pygda.nn.deepwalk_pretrain 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 -------------------------------------------------------------------------------- /docs/nn/GMMClustering.md: -------------------------------------------------------------------------------- 1 | ::: pygda.nn.gmm_clustering 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 -------------------------------------------------------------------------------- /docs/nn/GraphATABase.md: -------------------------------------------------------------------------------- 1 | ::: pygda.nn.graphata_base 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 -------------------------------------------------------------------------------- /docs/nn/MixUpGCNConv.md: -------------------------------------------------------------------------------- 1 | ::: pygda.nn.mixup_gcnconv 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: 
true 7 | heading_level: 6 -------------------------------------------------------------------------------- /docs/utils/Perturb.md: -------------------------------------------------------------------------------- 1 | ::: pygda.utils.perturb 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/utils/Sampler.md: -------------------------------------------------------------------------------- 1 | ::: pygda.utils.sampler 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/utils/Utility.md: -------------------------------------------------------------------------------- 1 | ::: pygda.utils.utility 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/datasets/Airport.md: -------------------------------------------------------------------------------- 1 | ::: pygda.datasets.airport 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/datasets/Citation.md: -------------------------------------------------------------------------------- 1 | ::: pygda.datasets.citation 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/datasets/Elliptic.md: -------------------------------------------------------------------------------- 1 | ::: pygda.datasets.elliptic 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/datasets/Facebook.md: -------------------------------------------------------------------------------- 1 | ::: pygda.datasets.facebook 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/datasets/Squirrel.md: -------------------------------------------------------------------------------- 1 | ::: pygda.datasets.squirrel 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/datasets/TUGraph.md: -------------------------------------------------------------------------------- 1 | ::: pygda.datasets.tugraph 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/datasets/Twitter.md: -------------------------------------------------------------------------------- 1 | ::: 
pygda.datasets.twitter 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/nn/NodeCentricConv.md: -------------------------------------------------------------------------------- 1 | ::: pygda.nn.node_centric_conv 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 -------------------------------------------------------------------------------- /docs/utils/SVDTransform.md: -------------------------------------------------------------------------------- 1 | ::: pygda.utils.svd_transform 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /pygda/__init__.py: -------------------------------------------------------------------------------- 1 | from . import datasets 2 | from . import models 3 | from . import metrics 4 | from . import utils 5 | from . import nn 6 | from .version import __version__ 7 | 8 | __all__ = ['datasets', 'models', 'metrics', 'utils', 'nn'] -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | 3 | This folder contains a wide range of examples using different GDA models. This README highlights some key examples. 4 | 5 | * For SpecReg, `svd_transform` is needed for dataset preprocessing, e.g., `source_dataset = CitationDataset(path, args.source, pre_transform=svd_transform)`. -------------------------------------------------------------------------------- /benchmark/graph/README.md: -------------------------------------------------------------------------------- 1 | # Benchmark 2 | Evaluation scripts for 7 methods on the three graph classification datasets. Each experiment is repeated 3 times. 3 | 4 | ## Datasets 5 | All datasets are accessible via the links provided in the data folder.
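For orientation, a minimal loading sketch is shown below. It is a sketch only: the constructor signature is assumed to mirror the other PyGDA dataset loaders (e.g., `CitationDataset(path, name)` from the examples README), and the paths and domain names (taken from the run scripts) are placeholders for wherever you extracted the archives.

```python
from pygda.datasets import GraphTUDataset

# Assumed layout: archives from the data folder extracted under data/;
# domain names follow the run scripts (e.g., Mutagenicity_M1 -> M2).
source_dataset = GraphTUDataset('data/Mutagenicity_M1', 'Mutagenicity_M1')
target_dataset = GraphTUDataset('data/Mutagenicity_M2', 'Mutagenicity_M2')
```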
6 | 7 | ## Run 8 | 9 | Run via 10 | ``` 11 | ./run_all_F.sh 12 | ./run_all_M.sh 13 | ./run_all_P.sh 14 | ``` -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | 3 | # Required 4 | version: 2 5 | 6 | build: 7 | os: ubuntu-22.04 8 | tools: 9 | python: "3.12" 10 | # You can also specify other tool versions: 11 | # nodejs: "19" 12 | # rust: "1.64" 13 | # golang: "1.19" 14 | 15 | mkdocs: 16 | configuration: mkdocs.yml 17 | 18 | python: 19 | install: 20 | - requirements: docs/requirements.txt -------------------------------------------------------------------------------- /pygda/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | from .metrics import eval_average_precision 2 | from .metrics import eval_macro_f1 3 | from .metrics import eval_micro_f1 4 | from .metrics import eval_precision_at_k 5 | from .metrics import eval_recall_at_k 6 | from .metrics import eval_roc_auc 7 | 8 | __all__ = [ 9 | 'eval_average_precision', 10 | 'eval_micro_f1', 11 | 'eval_macro_f1', 12 | 'eval_precision_at_k', 13 | 'eval_recall_at_k', 14 | 'eval_roc_auc' 15 | ] -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires=["flit_core >=3.2,<4"] 3 | build-backend="flit_core.buildapi" 4 | 5 | [project] 6 | name = "pygda" 7 | dynamic = ["version"] 8 | description = "A Python library for Graph Domain Adaptation" 9 | authors=[ 10 | {name="pygda-team"}, 11 | ] 12 | readme = "README.md" 13 | classifiers = [ 14 | "Programming Language :: Python :: 3", 15 | "License :: OSI Approved :: MIT License", 16 | "Operating System :: OS Independent" 17 | ] 18 | 19 | dependencies=[ 20 | "numpy", 21 | "scikit-learn", 22 | "scipy", 23 | "tqdm" 24 | ] 25 | 26 | [tool.setuptools] 27 | package-dir = {"" = "pygda"} 28 | 29 | [project.urls] 30 | Repository = "https://github.com/pygda-team/pygda" 31 | 32 | -------------------------------------------------------------------------------- /pygda/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .airport import AirportDataset 2 | from .arxiv import ArxivDataset 3 | from .blog import BlogDataset 4 | from .citation import CitationDataset 5 | from .elliptic import EllipticDataset 6 | from .facebook import FacebookDataset 7 | from .mag import MAGDataset 8 | from .squirrel import SquirrelDataset 9 | from .twitch import TwitchDataset 10 | from .twitter import TwitterDataset 11 | from .tugraph import GraphTUDataset 12 | from .webkb import WebKBDataset 13 | 14 | 15 | __all__ = [ 16 | "AirportDataset", 17 | "ArxivDataset", 18 | "BlogDataset", 19 | "CitationDataset", 20 | "EllipticDataset", 21 | "FacebookDataset", 22 | "MAGDataset", 23 | "SquirrelDataset", 24 | "TwitchDataset", 25 | "TwitterDataset", 26 | "GraphTUDataset", 27 | "WebKBDataset" 28 | ] -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 pygda-team 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including 
without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /docs/cheatsheet/Dataset Cheatsheet.md: -------------------------------------------------------------------------------- 1 | # Datasets Cheatsheet 2 | 3 | | Datasets | Domains | #Node | #Edge | #Class | 4 | |----------|---------|-------|-------|---------| 5 | | Airport | Brazil | 131 | 1,074 | 4 | 6 | | | Europe | 399 | 5,995 | | 7 | | | USA | 1,190 | 13,599 | | 8 | | Blog | Blog1 | 2,300 | 33,471 | 6 | 9 | | | Blog2 | 2,896 | 53,836 | | 10 | | Citation | ACMv9 | 9,360 | 15,556 | 5 | 11 | | | Citationv1 | 8,935 | 15,098 | | 12 | | | DBLPv7 | 5,484 | 8,117 | | 13 | | MAG | CN | 101,952 | 285,561 | 20 | 14 | | | DE | 43,032 | 126,683 | | 15 | | | FR | 29,262 | 78,222 | | 16 | | | JP | 37,498 | 90,944 | | 17 | | | RU | 32,833 | 67,994 | | 18 | | | US | 132,558 | 697,450 | | 19 | | Twitch | DE | 9,498 | 153,138 | 2 | 20 | | | EN | 7,126 | 35,324 | | 21 | | | ES | 4,648 | 59,382 | | 22 | | | FR | 6,549 | 112,666 | | 23 | | | PT | 1,912 | 31,299 | | 24 | | | RU | 4,385 | 37,304 | | 25 | | ogbn-arxiv | 1950-2016 | 69,499 | 237,163 | 40 | 26 | | | 2016-2018 | 51,241 | 111,754 | | 27 | | | 2018-2020 | 48,603 | 60,403 | | 28 | | TUGraph | Proteins | ~39.06 | ~72.82 | 2 | 29 | | | Mutagenicity | ~30.32 | ~30.77 | | 30 | | | Frankenstein | ~16.90 | ~17.88 | | -------------------------------------------------------------------------------- /pygda/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import BaseGDA 2 | from .udagcn import UDAGCN 3 | from .a2gnn import A2GNN 4 | from .grade import GRADE 5 | from .asn import ASN 6 | from .specreg import SpecReg 7 | from .gnn import GNN 8 | from .strurw import StruRW 9 | from .acdne import ACDNE 10 | from .dane import DANE 11 | from .adagcn import AdaGCN 12 | from .jhgda import JHGDA 13 | from .kbl import KBL 14 | from .dgda import DGDA 15 | from .sagda import SAGDA 16 | from .cwgcn import CWGCN 17 | from .dmgnn import DMGNN 18 | from .pa import PairAlign 19 | from .soga import SOGA 20 | from .gtrans import GTrans 21 | from .graphcta import GraphCTA 22 | from .graphata import GraphATA 23 | from .sepa import SEPA 24 | from .dgsda import DGSDA 25 | from .tdss import TDSS 26 | 27 | __all__ = [ 28 | "BaseGDA", 29 | "UDAGCN", 30 | "A2GNN", 31 | "GRADE", 32 | "ASN", 33 | "SpecReg", 34 | "GNN", 35 | "StruRW", 36 | "ACDNE", 37 | "DANE", 38 | "AdaGCN", 39 | "JHGDA", 40 | "KBL", 41 | "DGDA", 42 | "SAGDA", 43 | "CWGCN", 44 | "DMGNN", 45 | "PairAlign", 46 | "SOGA", 47 | "GTrans", 48 | "GraphCTA", 49 | "GraphATA", 50 | "SEPA", 51 | "DGSDA", 52 | "TDSS" 53
| ] -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | permissions: 16 | contents: read 17 | 18 | jobs: 19 | deploy: 20 | 21 | runs-on: ubuntu-latest 22 | 23 | steps: 24 | - uses: actions/checkout@v4 25 | - name: Set up Python 26 | uses: actions/setup-python@v3 27 | with: 28 | python-version: '3.x' 29 | - name: Install dependencies 30 | run: | 31 | python3 -m pip install --upgrade pip 32 | python3 -m pip install --upgrade build 33 | python3 -m pip install --user --upgrade twine 34 | - name: Build package 35 | run: | 36 | python3 -m build 37 | - name: Publish package 38 | uses: pypa/gh-action-pypi-publish@release/v1 39 | with: 40 | password: ${{ secrets.PYPI_API_TOKEN }} 41 | -------------------------------------------------------------------------------- /docs/cheatsheet/Model Cheatsheet.md: -------------------------------------------------------------------------------- 1 | # Model Cheatsheet 2 | 3 | | Num | Methods | Settings | Supported Tasks | 4 | |-----|----------|----------|-----------------| 5 | | 1 | Vanilla GNN | No-adaptation | Node/Graph Level | 6 | | 2 | DANE | Source-needed | Node/Graph Level | 7 | | 3 | ACDNE | Source-needed | Node Level | 8 | | 4 | UDAGCN | Source-needed | Node/Graph Level | 9 | | 5 | ASN | Source-needed | Node Level | 10 | | 6 | AdaGCN | Source-needed | Node/Graph Level | 11 | | 7 | GRADE | Source-needed | Node/Graph Level | 12 | | 8 | SpecReg | Source-needed | Node Level | 13 | | 9 | StruRW | Source-needed | Node Level | 14 | | 10 | JHGDA | Source-needed | Node Level | 15 | | 11 | KBL | Source-needed | Node Level | 16 | | 12 | WGCNN | Source-needed | Node Level | 17 | | 13 | CWGCN | Source-needed | Node/Graph Level | 18 | | 14 | SAGDA | Source-needed | Node/Graph Level | 19 | | 15 | GTrans | Source-free | Node Level | 20 | | 16 | DGDA | Source-needed | Node Level | 21 | | 17 | A2GNN | Source-needed | Node/Graph Level | 22 | | 18 | PairAlign | Source-needed | Node Level | 23 | | 19 | SEPA | Source-needed | Node Level | 24 | | 20 | SOGA | Source-free | Node Level | 25 | | 21 | GraphCTA | Source-free | Node Level | 26 | | 22 | TDSS | Source-needed| Node Level | 27 | | 23 | GraphATA | Multi-Source-free | Node/Graph Level | 28 | | 24 | DGSDA | Source-needed | Node Level | -------------------------------------------------------------------------------- /pygda/nn/attention.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class Attention(nn.Module): 7 | """ 8 | Attention mechanism for feature aggregation. 9 | 10 | Parameters 11 | ---------- 12 | in_channels : int 13 | Input feature dimension. 
14 | 15 | Notes 16 | ----- 17 | - Implements learnable attention weights 18 | - Uses softmax normalization 19 | - Includes dropout regularization 20 | - Single-head attention mechanism 21 | """ 22 | 23 | def __init__(self, in_channels): 24 | super().__init__() 25 | self.dense_weight = nn.Linear(in_channels, 1) 26 | self.dropout = nn.Dropout(0.1) 27 | 28 | def forward(self, inputs): 29 | """ 30 | Apply attention mechanism to input features. 31 | 32 | Parameters 33 | ---------- 34 | inputs : list[torch.Tensor] 35 | List of input tensors to be attended. 36 | 37 | Returns 38 | ------- 39 | torch.Tensor 40 | Attention-weighted feature aggregation. 41 | 42 | Notes 43 | ----- 44 | Process: 45 | 46 | 1. Stack input tensors 47 | 2. Compute attention weights 48 | 3. Apply softmax normalization 49 | 4. Weighted sum of features 50 | """ 51 | 52 | stacked = torch.stack(inputs, dim=1) 53 | weights = F.softmax(self.dense_weight(stacked), dim=1) 54 | outputs = torch.sum(stacked * weights, dim=1) 55 | return outputs 56 | -------------------------------------------------------------------------------- /pygda/nn/__init__.py: -------------------------------------------------------------------------------- 1 | from .cached_gcn_conv import CachedGCNConv 2 | from .ppmi_conv import PPMIConv 3 | from .attention import Attention 4 | from .udagcn_base import UDAGCNBase 5 | from .reverse_layer import GradReverse 6 | from .prop_gcn_conv import PropGCNConv 7 | from .a2gnn_base import A2GNNBase 8 | from .grade_base import GRADEBase 9 | from .asn_base import ASNBase 10 | from .gnn_base import GNNBase 11 | from .mixup_gcnconv import MixUpGCNConv 12 | from .mixup_base import MixupBase 13 | from .acdne_base import ACDNEBase 14 | from .adagcn_base import AdaGCNBase 15 | from .gmm_clustering import GMMClustering 16 | from .jhgda_base import JHGDABase 17 | from .kbl_base import KBLBase 18 | from .dgda_base import DGDABase 19 | from .deepwalk_pretrain import DWPretrain 20 | from .sagda_base import SAGDABase 21 | from .cwgcn_base import CWGCNBase 22 | from .reweight_gnn import ReweightGNN 23 | from .soga_base import SOGABase 24 | from .node_centric_conv import NodeCentricConv, NodeCentricMLP 25 | from .graphata_base import GraphATABase 26 | from .dgsda_base import DGSDABase 27 | 28 | 29 | __all__ = [ 30 | "CachedGCNConv", 31 | "PPMIConv", 32 | "Attention", 33 | "UDAGCNBase", 34 | "GradReverse", 35 | "PropGCNConv", 36 | "A2GNNBase", 37 | "GRADEBase", 38 | "ASNBase", 39 | "GNNBase", 40 | "MixUpGCNConv", 41 | "MixupBase", 42 | "ACDNEBase", 43 | "AdaGCNBase", 44 | "GMMClustering", 45 | "JHGDABase", 46 | "KBLBase", 47 | "DGDABase", 48 | "DWPretrain", 49 | "SAGDABase", 50 | "CWGCNBase", 51 | "ReweightGNN", 52 | "SOGABase", 53 | "NodeCentricConv", 54 | "NodeCentricMLP", 55 | "GraphATABase", 56 | "DGSDABase" 57 | ] 58 | -------------------------------------------------------------------------------- /pygda/nn/reverse_layer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class GradReverse(torch.autograd.Function): 5 | """ 6 | Gradient Reversal Layer for adversarial training. 7 | 8 | Implements a custom autograd function that: 9 | 10 | - Forward: Identity operation 11 | - Backward: Reverses and scales gradients 12 | 13 | """ 14 | 15 | @staticmethod 16 | def forward(ctx, x, alpha): 17 | """ 18 | Forward pass of gradient reversal. 19 | 20 | Parameters 21 | ---------- 22 | ctx : torch.autograd.function.Context 23 | Context object for storing variables for backward. 
24 | x : torch.Tensor 25 | Input tensor. 26 | alpha : float 27 | Gradient scaling factor. 28 | 29 | Returns 30 | ------- 31 | torch.Tensor 32 | Input tensor without modification. 33 | 34 | Notes 35 | ----- 36 | Identity operation in forward pass, stores alpha for backward. 37 | """ 38 | ctx.alpha = alpha 39 | return x.view_as(x) 40 | 41 | @staticmethod 42 | def backward(ctx, grad_output): 43 | """ 44 | Backward pass of gradient reversal. 45 | 46 | Parameters 47 | ---------- 48 | ctx : torch.autograd.function.Context 49 | Context object containing saved alpha. 50 | grad_output : torch.Tensor 51 | Gradient from subsequent layer. 52 | 53 | Returns 54 | ------- 55 | tuple 56 | Contains: 57 | - torch.Tensor: Reversed and scaled gradient 58 | - None: For alpha parameter (not needed) 59 | 60 | Notes 61 | ----- 62 | Implements gradient reversal: 63 | grad = -alpha * grad_output 64 | """ 65 | grad_output = grad_output.neg() * ctx.alpha 66 | return grad_output, None 67 | -------------------------------------------------------------------------------- /benchmark/llm/README.md: -------------------------------------------------------------------------------- 1 | # Benchmark 2 | Evaluation scripts for 5 methods on the ogbn-arxiv dataset with LLM predictions and explanations. Each experiment is repeated 3 times. 3 | 4 | ## LLM as Feature Encoder 5 | To investigate whether the distribution gap narrows when an LLM is used as the feature encoder, we adopt the prompts from TAPE (ICLR 2024, Explanations as Features: LLM-Based Features for Text-Attributed Graphs), which allows us to assess the impact of LLM-based features on model performance. 6 | 7 | The dataset is chronologically divided into 3 groups according to the publication years of the papers. We construct 3 graphs encompassing papers published before 2016 (Group A), in 2016-2018 (Group B), and in 2018-2020 (Group C). 8 | 9 | ### Dataset Preprocessing 10 | - Original node attributes, obtained by averaging the word2vec embeddings of the words in each paper's title and abstract (a minimal sketch of this averaging step is given below). 11 | ``` 12 | python origin_preprocess.py 13 | ``` 14 | - LLM-enhanced text with word2vec embeddings: the title, abstract, and LLM-generated predictions and explanations are combined into a single input, which is then fed into word2vec. The node features are then obtained by averaging the embeddings of the words in this combined input. 15 | ``` 16 | python llm_w2v_preprocess.py 17 | ``` 18 | - LLM-enhanced text with BERT embeddings: the title, abstract, and LLM-generated predictions and explanations are combined into a single input, which is then fed into a pretrained DeBERTa. The node features are then taken from the sentence embedding. **Note that we did not fine-tune DeBERTa as in the TAPE paper, since we study unsupervised graph domain adaptation**. 19 | ``` 20 | python llm_bert_preprocess.py 21 | ``` 22 | 23 | ### Data Download 24 | - ogbn-arxiv. [OGB](https://ogb.stanford.edu/docs/nodeprop/) provides the mapping from MAG paper IDs to the raw texts of titles and abstracts. Download the title and abstract data [here](https://snap.stanford.edu/ogb/data/misc/ogbn_arxiv/titleabs.tsv.gz). 25 | - LLM responses. Download the LLM response data [here](https://drive.google.com/file/d/1A6mZSFzDIhJU795497R6mAAM2Y9qutI5/view?usp=sharing) from the TAPE paper.
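For reference, here is a minimal sketch of the word2vec averaging step mentioned above. It is illustrative only: the toy corpus, the vector size of 128, and the `paper_embedding` helper are assumptions, not the exact contents of the preprocessing scripts.

```python
import numpy as np
from gensim.models import Word2Vec

# Illustrative corpus: one token list per paper (title + abstract tokens,
# optionally followed by the LLM-generated prediction/explanation tokens).
corpus = [
    ["graph", "domain", "adaptation", "survey"],
    ["node", "classification", "with", "graph", "neural", "networks"],
]

# Train word2vec on the combined texts (vector_size=128 is an assumption).
model = Word2Vec(sentences=corpus, vector_size=128, window=5, min_count=1)

def paper_embedding(tokens):
    # A paper's feature vector is the mean of its word embeddings.
    vecs = [model.wv[t] for t in tokens if t in model.wv]
    return np.mean(vecs, axis=0) if vecs else np.zeros(model.vector_size)

features = np.stack([paper_embedding(doc) for doc in corpus])
```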
26 | 27 | ## Run 28 | 29 | Run via 30 | ``` 31 | ./run1.sh 32 | ./run2.sh 33 | ./run3.sh 34 | ``` -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with Python 3.8 3 | # by the following command: 4 | # 5 | # pip-compile docs/requirements.in 6 | # 7 | astunparse==1.6.3 8 | # via griffe 9 | click==8.1.7 10 | # via 11 | # mkdocs 12 | # mkdocstrings 13 | colorama==0.4.6 14 | # via griffe 15 | ghp-import==2.1.0 16 | # via mkdocs 17 | griffe==0.44.0 18 | # via mkdocstrings-python 19 | importlib-metadata==7.1.0 20 | # via 21 | # markdown 22 | # mkdocs 23 | # mkdocs-get-deps 24 | # mkdocstrings 25 | jinja2>=3.1.5 26 | # via 27 | # mkdocs 28 | # mkdocstrings 29 | markdown==3.6 30 | # via 31 | # markdown-include 32 | # mkdocs 33 | # mkdocs-autorefs 34 | # mkdocstrings 35 | # pymdown-extensions 36 | markdown-include==0.8.1 37 | # via -r docs/requirements.in 38 | markupsafe==2.1.5 39 | # via 40 | # jinja2 41 | # mkdocs 42 | # mkdocs-autorefs 43 | # mkdocstrings 44 | mergedeep==1.3.4 45 | # via 46 | # mkdocs 47 | # mkdocs-get-deps 48 | mkdocs==1.6.0 49 | # via 50 | # -r docs/requirements.in 51 | # mkdocs-autorefs 52 | # mkdocstrings 53 | mkdocs-autorefs==1.0.1 54 | # via mkdocstrings 55 | mkdocs-get-deps==0.2.0 56 | # via mkdocs 57 | mkdocstrings[python]==0.25.1 58 | # via 59 | # -r docs/requirements.in 60 | # mkdocstrings-python 61 | mkdocstrings-python==1.10.0 62 | # via mkdocstrings 63 | packaging==24.0 64 | # via mkdocs 65 | pathspec==0.12.1 66 | # via mkdocs 67 | platformdirs==4.2.1 68 | # via 69 | # mkdocs-get-deps 70 | # mkdocstrings 71 | pymdown-extensions==10.8.1 72 | # via mkdocstrings 73 | python-dateutil==2.9.0.post0 74 | # via ghp-import 75 | pyyaml==6.0.1 76 | # via 77 | # mkdocs 78 | # mkdocs-get-deps 79 | # pymdown-extensions 80 | # pyyaml-env-tag 81 | pyyaml-env-tag==0.1 82 | # via mkdocs 83 | six==1.16.0 84 | # via 85 | # astunparse 86 | # python-dateutil 87 | typing-extensions==4.11.0 88 | # via mkdocstrings 89 | watchdog==4.0.0 90 | # via mkdocs 91 | wheel==0.43.0 92 | # via astunparse 93 | zipp==3.19.1 94 | # via importlib-metadata 95 | -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | ## Datasets 2 | * **Airport**: It has 3 different domains, i.e., Brazil, Europe and USA. They are adopted from [struc2vec](https://arxiv.org/abs/1704.03165) and can be downloaded [here](https://drive.google.com/drive/folders/1zlluWoeukD33ZxwaTRQi3jCdD0qC-I2j?usp=share_link). The graph processing can be found at ``AirportDataset``. We utilize ``OneHotDegree`` to construct the features for each node. 3 | * **Blog**: It has 2 different domains, i.e., Blog1 and Blog2. They are adopted from [ACDNE](https://arxiv.org/abs/2002.07366) and can be downloaded [here](https://drive.google.com/drive/folders/1jKKG0o7rEY-BaVEjBhuGijzwwhU0M-pQ?usp=share_link). The graph processing can be found at ``BlogDataset``. 4 | * **Citation**: It has 3 different domains, i.e., ACMv9, Citationv1 and DBLPv7. They are adopted from [ASN](https://dl.acm.org/doi/abs/10.1145/3459637.3482228) and can be downloaded [here](https://drive.google.com/drive/folders/1ntNt3qHE4p9Us8Re9tZDaB-tdtqwV8AX?usp=share_link). The graph processing can be found at ``CitationDataset``.
5 | * **MAG**: The MAG dataset is originally from the ogbn-mag dataset, and [PairAlign](https://arxiv.org/abs/2403.01092) separates it into 6 countries, including CN, DE, FR, JP, RU, and US. The data can be downloaded [here](https://drive.google.com/drive/folders/1HinhjpNPPivyqoubiYOr8X2jq-rjw3e9?usp=share_link) and the graph processing can be found at ``MAGDataset``. 6 | * **Twitch**: It has 6 different domains, i.e., DE, EN, ES, FR, PT and RU. They are adopted from the [Twitch Social Networks](https://github.com/benedekrozemberczki/datasets?tab=readme-ov-file#twitch-social-networks) and can be downloaded [here](https://drive.google.com/drive/folders/1GWMyyJOZ4CeeqP_H5dCA5voSQHT0WlXG?usp=share_link). The graph processing can be found at ``TwitchDataset``. 7 | * **PROTEINS**, **FRANKENSTEIN** and **Mutagenicity**: Each has 2 domains based on graph density. They are adopted from [TUDataset](https://chrsmrrs.github.io/datasets/docs/datasets/) and can be downloaded [here](https://drive.google.com/drive/folders/1NbPK71Dy0ulH3CdNyfvMwQECj_Oh867I?usp=sharing). The graph processing can be found at ``GraphTUDataset``. 8 | * **Arxiv**: It has 3 domains based on the publication years of the papers. They are adopted from [ogbn-arxiv](https://ogb.stanford.edu/docs/nodeprop/#ogbn-arxiv) and can be preprocessed with the scripts in the benchmark folder. The graph processing can be found at ``ArxivDataset``. -------------------------------------------------------------------------------- /docs/models/Overview.md: -------------------------------------------------------------------------------- 1 | # Models Overview 2 | 3 | This section provides detailed documentation for all supported models in PyGDA. Our framework offers a comprehensive collection of graph domain adaptation models, built on a flexible and extensible architecture. 4 | 5 | ### Core Architecture 6 | 7 | #### [BaseGDA](BaseGDA.md) 8 | The foundation of PyGDA's model architecture, providing: 9 | 10 | - Base class for all graph domain adaptation models 11 | - Core training and inference functionalities 12 | - Standardized interfaces for model customization 13 | - Common utility methods and configurations 14 | 15 | ### Customization Guide 16 | 17 | PyGDA is designed for easy customization and extension. To create your own model: 18 | 19 | ```python 20 | from pygda.models import BaseGDA 21 | 22 | class CustomGDA(BaseGDA): 23 | def __init__(self, **kwargs): 24 | super().__init__(**kwargs) 25 | # Initialize your model components 26 | 27 | def fit(self, data): 28 | # Implement your training logic 29 | pass 30 | 31 | def predict(self, data): 32 | # Implement your inference logic 33 | return predictions 34 | ``` 35 | 36 | ### Key Features 37 | 38 | 1. **Flexible Base Architecture** 39 | 40 | - Inherit from `BaseGDA` for consistent interface 41 | - Access to core functionalities and utilities 42 | - Standardized training and evaluation methods 43 | 44 | 2. **Easy Training Process** 45 | 46 | - Use `fit()` method for model training 47 | - Support for custom hyperparameters 48 | - Flexible dataset input handling 49 | - Built-in optimization utilities 50 | 51 | 3. **Streamlined Evaluation** 52 | 53 | - Simple `predict()` interface 54 | - Standardized performance metrics 55 | - Easy integration with evaluation pipelines 56 | 57 | 4.
**Extensibility** 58 | 59 | - Create custom model architectures 60 | - Add new training strategies 61 | - Implement domain-specific features 62 | - Integrate with existing PyGDA components 63 | 64 | ### Usage Example 65 | 66 | ```python 67 | from pygda.models import A2GNN 68 | 69 | # Initialize model 70 | model = A2GNN(in_dim=100, hidden_dim=64, num_classes=7) 71 | 72 | # Train model 73 | model.fit(train_data) 74 | 75 | # Make predictions 76 | predictions = model.predict(test_data) 77 | ``` 78 | 79 | For detailed information about each model, please visit their respective documentation pages linked above. -------------------------------------------------------------------------------- /benchmark/node/parser.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | filename = 'results.txt' 5 | 6 | f = open(filename, 'r') 7 | lines = f.readlines() 8 | f.close() 9 | 10 | dataDict = dict() 11 | 12 | for line in lines: 13 | elements = line.strip('\n\r').split(',') 14 | name = elements[0] 15 | source = elements[2] 16 | target = elements[4] 17 | micro_f1 = eval(elements[6]) 18 | macro_f1 = eval(elements[8]) 19 | auc = eval(elements[10]) 20 | if name not in dataDict: 21 | dataDict[name] = dict() 22 | if (source, target) not in dataDict[name]: 23 | dataDict[name][(source, target)] = [[micro_f1, macro_f1, auc]] 24 | else: 25 | dataDict[name][(source, target)].append([micro_f1, macro_f1, auc]) 26 | else: 27 | if (source, target) not in dataDict[name]: 28 | dataDict[name][(source, target)] = [[micro_f1, macro_f1, auc]] 29 | else: 30 | dataDict[name][(source, target)].append([micro_f1, macro_f1, auc]) 31 | 32 | print('source target mean std:') 33 | 34 | for k, v in dataDict.items(): 35 | print(k) 36 | for st, value in v.items(): 37 | value = np.array(value) 38 | mean_v = np.mean(value, axis=0) 39 | std_v = np.std(value, axis=0) 40 | print(st, mean_v, std_v) 41 | 42 | 43 | # Create a pandas DataFrame from the nested dictionary 44 | data = [] 45 | 46 | # Collect all (src, tgt) pairs 47 | src_tgt_pairs = set() 48 | for model_results in dataDict.values(): 49 | src_tgt_pairs.update(model_results.keys()) 50 | 51 | # Sort the pairs for consistent ordering 52 | src_tgt_pairs = sorted(src_tgt_pairs) 53 | 54 | # Build the data for the DataFrame 55 | for model, model_results in dataDict.items(): 56 | row = {'Model': model} 57 | for pair in src_tgt_pairs: 58 | if pair in model_results: 59 | metrics = np.array(model_results[pair]) * 100 # Multiply by 100 60 | mean_metrics = np.mean(metrics, axis=0) 61 | std_metrics = np.std(metrics, axis=0) 62 | mean_metrics_rounded = np.round(mean_metrics, 2) 63 | std_metrics_rounded = np.round(std_metrics, 2) 64 | row[pair] = f"{mean_metrics_rounded} +/- {std_metrics_rounded}" 65 | else: 66 | row[pair] = "N/A" # Handle cases where there are no results for this pair 67 | data.append(row) 68 | 69 | # Create the DataFrame 70 | df = pd.DataFrame(data) 71 | 72 | # Set the 'Model' column as the index 73 | df.set_index('Model', inplace=True) 74 | 75 | # Optionally, save the DataFrame to a CSV file 76 | df.to_csv('csv_results.csv') -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: PyGDA 2 | theme: readthedocs 3 | 4 | plugins: 5 | - search 6 | - mkdocstrings 7 | 8 | nav: 9 | - GET STARTED: index.md 10 | - Package Reference: 11 | - pygda.nn: 12 | - nn/A2GNNBase.md 13 | - 
-------------------------------------------------------------------------------- /benchmark/node/parser.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | filename = 'results.txt' 5 | 6 | with open(filename, 'r') as f: 7 | lines = f.readlines() 8 | 9 | 10 | dataDict = dict() 11 | 12 | for line in lines: 13 | elements = line.strip('\n\r').split(',') # one run per line: model name followed by label/value pairs 14 | name = elements[0] 15 | source = elements[2] 16 | target = elements[4] 17 | micro_f1 = float(elements[6]) 18 | macro_f1 = float(elements[8]) 19 | auc = float(elements[10]) 20 | if name not in dataDict: 21 | dataDict[name] = dict() 22 | if (source, target) not in dataDict[name]: 23 | dataDict[name][(source, target)] = [[micro_f1, macro_f1, auc]] 24 | else: 25 | dataDict[name][(source, target)].append([micro_f1, macro_f1, auc]) 26 | else: 27 | if (source, target) not in dataDict[name]: 28 | dataDict[name][(source, target)] = [[micro_f1, macro_f1, auc]] 29 | else: 30 | dataDict[name][(source, target)].append([micro_f1, macro_f1, auc]) 31 | 32 | print('source target mean std:') 33 | 34 | for k, v in dataDict.items(): 35 | print(k) 36 | for st, value in v.items(): 37 | value = np.array(value) 38 | mean_v = np.mean(value, axis=0) 39 | std_v = np.std(value, axis=0) 40 | print(st, mean_v, std_v) 41 | 42 | 43 | # Create a pandas DataFrame from the nested dictionary 44 | data = [] 45 | 46 | # Collect all (src, tgt) pairs 47 | src_tgt_pairs = set() 48 | for model_results in dataDict.values(): 49 | src_tgt_pairs.update(model_results.keys()) 50 | 51 | # Sort the pairs for consistent ordering 52 | src_tgt_pairs = sorted(src_tgt_pairs) 53 | 54 | # Build the data for the DataFrame 55 | for model, model_results in dataDict.items(): 56 | row = {'Model': model} 57 | for pair in src_tgt_pairs: 58 | if pair in model_results: 59 | metrics = np.array(model_results[pair]) * 100 # Multiply by 100 60 | mean_metrics = np.mean(metrics, axis=0) 61 | std_metrics = np.std(metrics, axis=0) 62 | mean_metrics_rounded = np.round(mean_metrics, 2) 63 | std_metrics_rounded = np.round(std_metrics, 2) 64 | row[pair] = f"{mean_metrics_rounded} +/- {std_metrics_rounded}" 65 | else: 66 | row[pair] = "N/A" # Handle cases where there are no results for this pair 67 | data.append(row) 68 | 69 | # Create the DataFrame 70 | df = pd.DataFrame(data) 71 | 72 | # Set the 'Model' column as the index 73 | df.set_index('Model', inplace=True) 74 | 75 | # Optionally, save the DataFrame to a CSV file 76 | df.to_csv('csv_results.csv') -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: PyGDA 2 | theme: readthedocs 3 | 4 | plugins: 5 | - search 6 | - mkdocstrings 7 | 8 | nav: 9 | - GET STARTED: index.md 10 | - Package Reference: 11 | - pygda.nn: 12 | - nn/A2GNNBase.md 13 | - nn/ACDNEBase.md 14 | - nn/AdaGCNBase.md 15 | - nn/ASNBase.md 16 | - nn/Attention.md 17 | - nn/CWGCNBase.md 18 | - nn/CacheGCNConv.md 19 | - nn/DGDABase.md 20 | - nn/DGSDABase.md 21 | - nn/DWPretrain.md 22 | - nn/GMMClustering.md 23 | - nn/GNNBase.md 24 | - nn/GRADEBase.md 25 | - nn/GradReverse.md 26 | - nn/GraphATABase.md 27 | - nn/JHGDABase.md 28 | - nn/KBLBase.md 29 | - nn/MixUpBase.md 30 | - nn/MixUpGCNConv.md 31 | - nn/NodeCentricConv.md 32 | - nn/PPMIConv.md 33 | - nn/PropGCNConv.md 34 | - nn/ReweightGNN.md 35 | - nn/SAGDABase.md 36 | - nn/SOGABase.md 37 | - nn/UDAGCNBase.md 38 | - pygda.models: 39 | - models/Overview.md 40 | - models/BaseGDA.md 41 | - models/GNN.md 42 | - models/DANE.md 43 | - models/ACDNE.md 44 | - models/UDAGCN.md 45 | - models/ASN.md 46 | - models/AdaGCN.md 47 | - models/GRADE.md 48 | - models/SpecReg.md 49 | - models/StruRW.md 50 | - models/JHGDA.md 51 | - models/KBL.md 52 | - models/DMGNN.md 53 | - models/CWGCN.md 54 | - models/SAGDA.md 55 | - models/DGDA.md 56 | - models/A2GNN.md 57 | - models/PairAlign.md 58 | - models/GTrans.md 59 | - models/SOGA.md 60 | - models/SEPA.md 61 | - models/GraphCTA.md 62 | - models/TDSS.md 63 | - models/GraphATA.md 64 | - models/DGSDA.md 65 | - pygda.datasets: 66 | - datasets/Overview.md 67 | - datasets/Airport.md 68 | - datasets/Arxiv.md 69 | - datasets/Blog.md 70 | - datasets/Citation.md 71 | - datasets/Elliptic.md 72 | - datasets/Facebook.md 73 | - datasets/MAG.md 74 | - datasets/Squirrel.md 75 | - datasets/TUGraph.md 76 | - datasets/Twitch.md 77 | - datasets/Twitter.md 78 | - pygda.metrics: 79 | - metrics/Metrics.md 80 | - pygda.utils: 81 | - utils/MMD.md 82 | - utils/Perturb.md 83 | - utils/SVDTransform.md 84 | - utils/Sampler.md 85 | - utils/Utility.md 86 | - Benchmarks: 87 | - benchmark/Overview.md 88 | - Cheatsheets: 89 | - cheatsheet/Dataset Cheatsheet.md 90 | - cheatsheet/Model Cheatsheet.md 91 | - External Resources: 92 | - resources/Resources.md 93 | 94 | extra_css: 95 | - assets/css/custom.css 96 | -------------------------------------------------------------------------------- /benchmark/graph/run_all_M.sh: -------------------------------------------------------------------------------- 1 | python grade.py --source 'Mutagenicity_M1' --target 'Mutagenicity_M2' --nhid 128 --num_layers 5 --lr 0.001 --weight_decay 0.001 --epochs 200 --dropout_ratio 0.5 --weight 0.005 --filename 'results-M.txt' 2 | python adagcn.py --source 'Mutagenicity_M1' --target 'Mutagenicity_M2' --nhid 128 --num_layers 2 --lr 0.01 --weight_decay 0.01 --epochs 400 --dropout_ratio 0.4 --domain_weight 0.1 --filename 'results-M.txt' 3 | python udagcn.py --source 'Mutagenicity_M1' --target 'Mutagenicity_M2' --nhid 128 --num_layers 2 --lr 0.0001 --weight_decay 0.001 --epochs 400 --dropout_ratio 0.4 --filename 'results-M.txt' 4 | python a2gnn.py --source 'Mutagenicity_M1' --target 'Mutagenicity_M2' --nhid 128 --num_layers 2 --lr 0.01 --weight_decay 0.005 --epochs 200 --dropout_ratio 0.5 --s_pnums 0 --t_pnums 10 --weight 10 --filename 'results-M.txt' 5 | python cwgcn.py --source 'Mutagenicity_M1' --target 'Mutagenicity_M2' --nhid 128 --num_layers 2 --lr 0.0001 --weight_decay 0.0001 --epochs 200 --filename 'results-M.txt' 6 | python dane.py --source 'Mutagenicity_M1' --target 'Mutagenicity_M2' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.001 --epochs 200 --filename 'results-M.txt' 7 | python sagda.py --source 'Mutagenicity_M1' --target 'Mutagenicity_M2' --nhid 128 --num_layers 1 --lr 0.001 --weight_decay 0.001 --epochs 200 --adv_dim 40 --filename 
'results-M.txt' 8 | 9 | python grade.py --source 'Mutagenicity_M2' --target 'Mutagenicity_M1' --nhid 128 --num_layers 5 --lr 0.001 --weight_decay 0.001 --epochs 200 --dropout_ratio 0.5 --weight 0.005 --filename 'results-M.txt' 10 | python adagcn.py --source 'Mutagenicity_M2' --target 'Mutagenicity_M1' --nhid 128 --num_layers 2 --lr 0.01 --weight_decay 0.01 --epochs 400 --dropout_ratio 0.4 --domain_weight 0.1 --filename 'results-M.txt' 11 | python udagcn.py --source 'Mutagenicity_M2' --target 'Mutagenicity_M1' --nhid 128 --num_layers 2 --lr 0.0001 --weight_decay 0.001 --epochs 400 --dropout_ratio 0.4 --filename 'results-M.txt' 12 | python a2gnn.py --source 'Mutagenicity_M2' --target 'Mutagenicity_M1' --nhid 128 --num_layers 2 --lr 0.01 --weight_decay 0.005 --epochs 200 --dropout_ratio 0.5 --s_pnums 0 --t_pnums 10 --weight 10 --filename 'results-M.txt' 13 | python cwgcn.py --source 'Mutagenicity_M2' --target 'Mutagenicity_M1' --nhid 128 --num_layers 2 --lr 0.0001 --weight_decay 0.0001 --epochs 200 --filename 'results-M.txt' 14 | python dane.py --source 'Mutagenicity_M2' --target 'Mutagenicity_M1' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.001 --epochs 200 --filename 'results-M.txt' 15 | python sagda.py --source 'Mutagenicity_M2' --target 'Mutagenicity_M1' --nhid 128 --num_layers 1 --lr 0.001 --weight_decay 0.001 --epochs 200 --adv_dim 40 --filename 'results-M.txt' -------------------------------------------------------------------------------- /benchmark/graph/parser.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | filename = 'results-F.txt' 5 | 6 | with open(filename, 'r') as f: 7 | lines = f.readlines() 8 | 9 | 10 | dataDict = dict() 11 | 12 | for line in lines: 13 | elements = line.strip('\n\r').split(',') # one run per line: model name followed by label/value pairs 14 | name = elements[0] 15 | source = elements[2] 16 | target = elements[4] 17 | micro_f1 = float(elements[6]) 18 | macro_f1 = float(elements[8]) 19 | auc = float(elements[10]) 20 | if name not in dataDict: 21 | dataDict[name] = dict() 22 | if (source, target) not in dataDict[name]: 23 | dataDict[name][(source, target)] = [[micro_f1, macro_f1, auc]] 24 | else: 25 | dataDict[name][(source, target)].append([micro_f1, macro_f1, auc]) 26 | else: 27 | if (source, target) not in dataDict[name]: 28 | dataDict[name][(source, target)] = [[micro_f1, macro_f1, auc]] 29 | else: 30 | dataDict[name][(source, target)].append([micro_f1, macro_f1, auc]) 31 | 32 | print('source target mean std:') 33 | 34 | for k, v in dataDict.items(): 35 | print(k) 36 | for st, value in v.items(): 37 | value = np.array(value) 38 | mean_v = np.mean(value, axis=0) 39 | std_v = np.std(value, axis=0) 40 | print(st, mean_v, std_v) 41 | 42 | 43 | # Create a pandas DataFrame from the nested dictionary 44 | data = [] 45 | 46 | # Collect all (src, tgt) pairs 47 | src_tgt_pairs = set() 48 | for model_results in dataDict.values(): 49 | src_tgt_pairs.update(model_results.keys()) 50 | 51 | # Sort the pairs for consistent ordering 52 | src_tgt_pairs = sorted(src_tgt_pairs) 53 | 54 | # Build the data for the DataFrame 55 | for model, model_results in dataDict.items(): 56 | row = {'Model': model} 57 | for pair in src_tgt_pairs: 58 | if pair in model_results: 59 | metrics = np.array(model_results[pair]) * 100 # Multiply by 100 60 | mean_metrics = np.mean(metrics, axis=0) 61 | std_metrics = np.std(metrics, axis=0) 62 | mean_metrics_rounded = np.round(mean_metrics, 2) 63 | std_metrics_rounded = np.round(std_metrics, 2) 64 | row[pair] = 
f"{mean_metrics_rounded} +/- {std_metrics_rounded}" 65 | else: 66 | # row[pair] = "N/A" # Handle cases where there are no results for this pair 67 | row[pair] = f"[0 0 0] +/- [0 0 0]" 68 | data.append(row) 69 | 70 | # Create the DataFrame 71 | df = pd.DataFrame(data) 72 | 73 | # Set the 'Model' column as the index 74 | df.set_index('Model', inplace=True) 75 | 76 | # Display the DataFrame 77 | # print(df) 78 | 79 | # Optionally, save the DataFrame to a CSV file 80 | df.to_csv('results-F.csv') -------------------------------------------------------------------------------- /benchmark/llm/parser.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | filename = 'results-llm-3.txt' 5 | 6 | f = open(filename, 'r') 7 | lines = f.readlines() 8 | f.close() 9 | 10 | dataDict = dict() 11 | 12 | for line in lines: 13 | elements = line.strip('\n\r').split(',') 14 | name = elements[0] 15 | source = elements[2] 16 | target = elements[4] 17 | micro_f1 = eval(elements[6]) 18 | macro_f1 = eval(elements[8]) 19 | auc = eval(elements[10]) 20 | if name not in dataDict: 21 | dataDict[name] = dict() 22 | if (source, target) not in dataDict[name]: 23 | dataDict[name][(source, target)] = [[micro_f1, macro_f1, auc]] 24 | else: 25 | dataDict[name][(source, target)].append([micro_f1, macro_f1, auc]) 26 | else: 27 | if (source, target) not in dataDict[name]: 28 | dataDict[name][(source, target)] = [[micro_f1, macro_f1, auc]] 29 | else: 30 | dataDict[name][(source, target)].append([micro_f1, macro_f1, auc]) 31 | 32 | print('source target mean std:') 33 | 34 | for k, v in dataDict.items(): 35 | print(k) 36 | for st, value in v.items(): 37 | value = np.array(value) 38 | mean_v = np.mean(value, axis=0) 39 | std_v = np.std(value, axis=0) 40 | print(st, mean_v, std_v) 41 | 42 | 43 | # Create a pandas DataFrame from the nested dictionary 44 | data = [] 45 | 46 | # Collect all (src, tgt) pairs 47 | src_tgt_pairs = set() 48 | for model_results in dataDict.values(): 49 | src_tgt_pairs.update(model_results.keys()) 50 | 51 | # Sort the pairs for consistent ordering 52 | src_tgt_pairs = sorted(src_tgt_pairs) 53 | 54 | # Build the data for the DataFrame 55 | for model, model_results in dataDict.items(): 56 | row = {'Model': model} 57 | for pair in src_tgt_pairs: 58 | if pair in model_results: 59 | metrics = np.array(model_results[pair]) * 100 # Multiply by 100 60 | mean_metrics = np.mean(metrics, axis=0) 61 | std_metrics = np.std(metrics, axis=0) 62 | mean_metrics_rounded = np.round(mean_metrics, 2) 63 | std_metrics_rounded = np.round(std_metrics, 2) 64 | row[pair] = f"{mean_metrics_rounded} +/- {std_metrics_rounded}" 65 | else: 66 | # row[pair] = "N/A" # Handle cases where there are no results for this pair 67 | row[pair] = f"[0 0 0] +/- [0 0 0]" 68 | data.append(row) 69 | 70 | # Create the DataFrame 71 | df = pd.DataFrame(data) 72 | 73 | # Set the 'Model' column as the index 74 | df.set_index('Model', inplace=True) 75 | 76 | # Display the DataFrame 77 | # print(df) 78 | 79 | # Optionally, save the DataFrame to a CSV file 80 | df.to_csv('results-llm-bert.csv') -------------------------------------------------------------------------------- /pygda/utils/svd_transform.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | import numpy as np 5 | from torch_geometric.utils import get_laplacian 6 | from sklearn.decomposition import TruncatedSVD 7 | 8 | def svd_transform( 9 | data, 10 | 
-------------------------------------------------------------------------------- /benchmark/graph/run_all_P.sh: -------------------------------------------------------------------------------- 1 | python grade.py --source 'PROTEINS_P1' --target 'PROTEINS_P2' --nhid 128 --num_layers 5 --lr 0.001 --weight_decay 0.001 --epochs 200 --dropout_ratio 0.5 --weight 0.005 --device 'cuda:2' --filename 'results-P.txt' 2 | python adagcn.py --source 'PROTEINS_P1' --target 'PROTEINS_P2' --nhid 128 --num_layers 2 --lr 0.01 --weight_decay 0.01 --epochs 400 --dropout_ratio 0.4 --domain_weight 0.1 --device 'cuda:2' --filename 'results-P.txt' 3 | python udagcn.py --source 'PROTEINS_P1' --target 'PROTEINS_P2' --nhid 128 --num_layers 2 --lr 0.0001 --weight_decay 0.001 --epochs 400 --dropout_ratio 0.4 --device 'cuda:2' --filename 'results-P.txt' 4 | python a2gnn.py --source 'PROTEINS_P1' --target 'PROTEINS_P2' --nhid 128 --num_layers 2 --lr 0.01 --weight_decay 0.005 --epochs 200 --dropout_ratio 0.5 --s_pnums 0 --t_pnums 10 --weight 10 --device 'cuda:2' --filename 'results-P.txt' 5 | python cwgcn.py --source 'PROTEINS_P1' --target 'PROTEINS_P2' --nhid 128 --num_layers 2 --lr 0.0001 --weight_decay 0.0001 --epochs 200 --device 'cuda:2' --filename 'results-P.txt' 6 | python dane.py 
--source 'PROTEINS_P1' --target 'PROTEINS_P2' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.001 --epochs 200 --device 'cuda:2' --filename 'results-P.txt' 7 | python sagda.py --source 'PROTEINS_P1' --target 'PROTEINS_P2' --nhid 128 --num_layers 1 --lr 0.001 --weight_decay 0.001 --epochs 200 --adv_dim 40 --device 'cuda:2' --filename 'results-P.txt' 8 | 9 | python grade.py --source 'PROTEINS_P2' --target 'PROTEINS_P1' --nhid 128 --num_layers 5 --lr 0.001 --weight_decay 0.001 --epochs 200 --dropout_ratio 0.5 --weight 0.005 --device 'cuda:2' --filename 'results-P.txt' 10 | python adagcn.py --source 'PROTEINS_P2' --target 'PROTEINS_P1' --nhid 128 --num_layers 2 --lr 0.01 --weight_decay 0.01 --epochs 400 --dropout_ratio 0.4 --domain_weight 0.1 --device 'cuda:2' --filename 'results-P.txt' 11 | python udagcn.py --source 'PROTEINS_P2' --target 'PROTEINS_P1' --nhid 128 --num_layers 2 --lr 0.0001 --weight_decay 0.001 --epochs 400 --dropout_ratio 0.4 --device 'cuda:2' --filename 'results-P.txt' 12 | python a2gnn.py --source 'PROTEINS_P2' --target 'PROTEINS_P1' --nhid 128 --num_layers 2 --lr 0.01 --weight_decay 0.005 --epochs 200 --dropout_ratio 0.5 --s_pnums 0 --t_pnums 10 --weight 10 --device 'cuda:2' --filename 'results-P.txt' 13 | python cwgcn.py --source 'PROTEINS_P2' --target 'PROTEINS_P1' --nhid 128 --num_layers 2 --lr 0.0001 --weight_decay 0.0001 --epochs 200 --device 'cuda:2' --filename 'results-P.txt' 14 | python dane.py --source 'PROTEINS_P2' --target 'PROTEINS_P1' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.001 --epochs 200 --device 'cuda:2' --filename 'results-P.txt' 15 | python sagda.py --source 'PROTEINS_P2' --target 'PROTEINS_P1' --nhid 128 --num_layers 1 --lr 0.001 --weight_decay 0.001 --epochs 200 --adv_dim 40 --device 'cuda:2' --filename 'results-P.txt' -------------------------------------------------------------------------------- /benchmark/graph/run_all_F.sh: -------------------------------------------------------------------------------- 1 | python grade.py --source 'FRANKENSTEIN_F1' --target 'FRANKENSTEIN_F2' --nhid 128 --num_layers 5 --lr 0.001 --weight_decay 0.001 --epochs 200 --dropout_ratio 0.5 --weight 0.005 --device 'cuda:3' --filename 'results-F.txt' 2 | python adagcn.py --source 'FRANKENSTEIN_F1' --target 'FRANKENSTEIN_F2' --nhid 128 --num_layers 2 --lr 0.01 --weight_decay 0.01 --epochs 400 --dropout_ratio 0.4 --domain_weight 0.1 --device 'cuda:3' --filename 'results-F.txt' 3 | python udagcn.py --source 'FRANKENSTEIN_F1' --target 'FRANKENSTEIN_F2' --nhid 128 --num_layers 2 --lr 0.0001 --weight_decay 0.001 --epochs 400 --dropout_ratio 0.4 --device 'cuda:3' --filename 'results-F.txt' 4 | python a2gnn.py --source 'FRANKENSTEIN_F1' --target 'FRANKENSTEIN_F2' --nhid 128 --num_layers 2 --lr 0.01 --weight_decay 0.005 --epochs 200 --dropout_ratio 0.5 --s_pnums 0 --t_pnums 10 --weight 10 --device 'cuda:3' --filename 'results-F.txt' 5 | python cwgcn.py --source 'FRANKENSTEIN_F1' --target 'FRANKENSTEIN_F2' --nhid 128 --num_layers 2 --lr 0.0001 --weight_decay 0.0001 --epochs 200 --device 'cuda:3' --filename 'results-F.txt' 6 | python dane.py --source 'FRANKENSTEIN_F1' --target 'FRANKENSTEIN_F2' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.001 --epochs 200 --device 'cuda:3' --filename 'results-F.txt' 7 | python sagda.py --source 'FRANKENSTEIN_F1' --target 'FRANKENSTEIN_F2' --nhid 128 --num_layers 1 --lr 0.001 --weight_decay 0.001 --epochs 200 --adv_dim 40 --device 'cuda:3' --filename 'results-F.txt' 8 | 9 | python grade.py --source 'FRANKENSTEIN_F2' 
--target 'FRANKENSTEIN_F1' --nhid 128 --num_layers 5 --lr 0.001 --weight_decay 0.001 --epochs 200 --dropout_ratio 0.5 --weight 0.005 --device 'cuda:3' --filename 'results-F.txt' 10 | python adagcn.py --source 'FRANKENSTEIN_F2' --target 'FRANKENSTEIN_F1' --nhid 128 --num_layers 2 --lr 0.01 --weight_decay 0.01 --epochs 400 --dropout_ratio 0.4 --domain_weight 0.1 --device 'cuda:3' --filename 'results-F.txt' 11 | python udagcn.py --source 'FRANKENSTEIN_F2' --target 'FRANKENSTEIN_F1' --nhid 128 --num_layers 2 --lr 0.0001 --weight_decay 0.001 --epochs 400 --dropout_ratio 0.4 --device 'cuda:3' --filename 'results-F.txt' 12 | python a2gnn.py --source 'FRANKENSTEIN_F2' --target 'FRANKENSTEIN_F1' --nhid 128 --num_layers 2 --lr 0.01 --weight_decay 0.005 --epochs 200 --dropout_ratio 0.5 --s_pnums 0 --t_pnums 10 --weight 10 --device 'cuda:3' --filename 'results-F.txt' 13 | python cwgcn.py --source 'FRANKENSTEIN_F2' --target 'FRANKENSTEIN_F1' --nhid 128 --num_layers 2 --lr 0.0001 --weight_decay 0.0001 --epochs 200 --device 'cuda:3' --filename 'results-F.txt' 14 | python dane.py --source 'FRANKENSTEIN_F2' --target 'FRANKENSTEIN_F1' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.001 --epochs 200 --device 'cuda:3' --filename 'results-F.txt' 15 | python sagda.py --source 'FRANKENSTEIN_F2' --target 'FRANKENSTEIN_F1' --nhid 128 --num_layers 1 --lr 0.001 --weight_decay 0.001 --epochs 200 --adv_dim 40 --device 'cuda:3' --filename 'results-F.txt' -------------------------------------------------------------------------------- /docs/resources/Resources.md: -------------------------------------------------------------------------------- 1 | # External Links 2 | 3 | ### NumPy 4 | - Official Website: [https://numpy.org](https://numpy.org) 5 | - Documentation: [https://numpy.org/doc/stable](https://numpy.org/doc/stable) 6 | - GitHub Repository: [https://github.com/numpy/numpy](https://github.com/numpy/numpy) 7 | 8 | NumPy is a fundamental package for scientific computing in Python. It provides: 9 | 10 | - A powerful N-dimensional array object 11 | - Sophisticated broadcasting functions 12 | - Tools for integrating C/C++ code 13 | - Useful linear algebra, Fourier transform, and random number capabilities 14 | 15 | ### SciPy 16 | - Official Website: [https://scipy.org](https://scipy.org) 17 | - Documentation: [https://docs.scipy.org/doc/scipy](https://docs.scipy.org/doc/scipy) 18 | - GitHub Repository: [https://github.com/scipy/scipy](https://github.com/scipy/scipy) 19 | 20 | SciPy is a scientific computing library that builds on NumPy. It provides: 21 | 22 | - Optimization algorithms 23 | - Linear algebra operations 24 | - Signal and image processing tools 25 | - Statistical functions 26 | 27 | ### NetworkX 28 | - Official Website: [https://networkx.org](https://networkx.org) 29 | - Documentation: [https://networkx.org/documentation/stable](https://networkx.org/documentation/stable) 30 | - GitHub Repository: [https://github.com/networkx/networkx](https://github.com/networkx/networkx) 31 | 32 | NetworkX is a Python package for complex networks. 
It provides: 33 | 34 | - Graph creation and manipulation 35 | - Network structure and analysis algorithms 36 | - Network visualization tools 37 | - Large collection of graph algorithms 38 | 39 | ### Scikit-learn 40 | - Official Website: [https://scikit-learn.org](https://scikit-learn.org) 41 | - Documentation: [https://scikit-learn.org/stable](https://scikit-learn.org/stable) 42 | - GitHub Repository: [https://github.com/scikit-learn/scikit-learn](https://github.com/scikit-learn/scikit-learn) 43 | 44 | Scikit-learn is a machine learning library for Python. It provides: 45 | 46 | - Classification, regression, and clustering algorithms 47 | - Model selection and evaluation tools 48 | - Preprocessing and feature engineering utilities 49 | - Comprehensive documentation and examples 50 | 51 | ### PyTorch 52 | - Official Website: [https://pytorch.org](https://pytorch.org) 53 | - Documentation: [https://pytorch.org/docs/stable/index.html](https://pytorch.org/docs/stable/index.html) 54 | - GitHub Repository: [https://github.com/pytorch/pytorch](https://github.com/pytorch/pytorch) 55 | 56 | PyTorch is an open source machine learning framework. It provides: 57 | 58 | - Dynamic computational graphs 59 | - GPU acceleration 60 | - Deep neural network building blocks 61 | - Rich ecosystem of tools and libraries 62 | 63 | ### PyTorch Geometric 64 | - Official Website: [https://pytorch-geometric.readthedocs.io](https://pytorch-geometric.readthedocs.io) 65 | - Documentation: [https://pytorch-geometric.readthedocs.io/en/latest](https://pytorch-geometric.readthedocs.io/en/latest) 66 | - GitHub Repository: [https://github.com/pyg-team/pytorch_geometric](https://github.com/pyg-team/pytorch_geometric) 67 | 68 | PyTorch Geometric (PyG) is a library for deep learning on irregular input data. It provides: 69 | 70 | - Graph Neural Network implementations 71 | - Various graph datasets 72 | - Common graph operations 73 | - Efficient sparse matrix operations 74 | -------------------------------------------------------------------------------- /docs/benchmark/Overview.md: -------------------------------------------------------------------------------- 1 | # Benchmarks Overview 2 | 3 | PyGDA provides extensive benchmarking capabilities across different types of graph domain adaptation tasks. This document outlines our three main benchmark suites. 
4 | 5 | ## Node Classification Benchmark 6 | 7 | ### Overview 8 | - Evaluates 16 different methods 9 | - Tests on 5 distinct datasets 10 | - Each experiment repeated 3 times for statistical significance 11 | 12 | ### Running the Benchmark 13 | ``` 14 | cd benchmark/node 15 | ./run.sh 16 | ``` 17 | 18 | ## Graph Classification Benchmark 19 | 20 | ### Overview 21 | - Evaluates 7 different methods 22 | - Tests on 3 graph classification datasets: 23 | 24 | * PROTEINS 25 | * FRANKENSTEIN 26 | * Mutagenicity 27 | 28 | - Each experiment repeated 3 times for statistical significance 29 | 30 | ### Running the Benchmark 31 | ``` 32 | cd benchmark/graph 33 | # Run benchmarks for each dataset 34 | ./run_all_F.sh # FRANKENSTEIN 35 | ./run_all_M.sh # Mutagenicity 36 | ./run_all_P.sh # PROTEINS 37 | ``` 38 | 39 | ## LLM-Enhanced Benchmark 40 | 41 | ### Overview 42 | - Evaluates 5 different methods 43 | - Focuses on ogbn-arxiv dataset with LLM predictions and explanations 44 | - Each experiment repeated 3 times for statistical significance 45 | - Tests different feature encoding approaches 46 | 47 | ### Dataset Preprocessing Options 48 | 49 | #### **Original Features** 50 | ``` 51 | python origin_preprocess.py 52 | ``` 53 | 54 | #### **LLM with Word2Vec** 55 | ``` 56 | python llm_w2v_preprocess.py 57 | ``` 58 | - Combines title, abstract, and LLM outputs 59 | - Processes using word2vec embeddings 60 | 61 | #### **LLM with BERT** 62 | ``` 63 | python llm_bert_preprocess.py 64 | ``` 65 | - Combines title, abstract, and LLM outputs 66 | - Uses DeBERTa for sentence embeddings 67 | - Unsupervised approach (no fine-tuning) 68 | 69 | ### Data Requirements 70 | - **ogbn-arxiv**: Download title and abstract data from [OGB](https://snap.stanford.edu/ogb/data/misc/ogbn_arxiv/titleabs.tsv.gz) 71 | - **LLM Responses**: Download from [TAPE paper data](https://drive.google.com/file/d/1A6mZSFzDIhJU795497R6mAAM2Y9qutI5/view?usp=sharing) 72 | 73 | ### Chronological Split 74 | The dataset is divided into 3 groups based on publication years (the preprocessing scripts treat the lower bound as exclusive and the upper bound as inclusive): 75 | 76 | - Group A: Papers up to and including 2016 77 | - Group B: Papers from 2017 to 2018 78 | - Group C: Papers from 2019 to 2020 79 | 80 | ### Running the Benchmark 81 | ``` 82 | cd benchmark/llm 83 | ./run1.sh 84 | ./run2.sh 85 | ./run3.sh 86 | ``` 87 | 88 | ## General Guidelines 89 | 90 | ### Running Benchmarks 91 | - Ensure all required datasets are downloaded 92 | - Install all dependencies 93 | - Run benchmarks from their respective directories 94 | - Results will be saved in the corresponding output directories 95 | 96 | ### Reproducibility 97 | - Fixed random seeds are used 98 | - Multiple runs (3x) for statistical significance 99 | - Standardized evaluation metrics across all experiments 100 | 101 | ### Resource Requirements 102 | - Node classification: Moderate GPU memory 103 | - Graph classification: Lower GPU memory 104 | - LLM benchmark: Higher GPU memory (for BERT embeddings)
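105 | 106 | ### Aggregating Results 107 | 108 | Each benchmark directory also ships a parser.py that averages the raw result files written by the run scripts and exports a CSV. The snippet below is a minimal sketch of the same aggregation, assuming the comma-separated line format those scripts write (the model name first, then label/value pairs, with micro-f1, macro-f1, and AUC values at indices 6, 8, and 10); point the filename at the results file for your suite. 109 | 110 | ```python 111 | import numpy as np 112 | 113 | runs = {} 114 | with open('results.txt') as f: 115 |     for line in f: 116 |         parts = line.strip().split(',') 117 |         key = (parts[0], parts[2], parts[4])  # (model, source, target) 118 |         runs.setdefault(key, []).append([float(parts[6]), float(parts[8]), float(parts[10])]) 119 | 120 | # mean and standard deviation over repeated runs, scaled to percentages 121 | for key, values in runs.items(): 122 |     values = np.array(values) * 100 123 |     print(key, values.mean(axis=0).round(2), values.std(axis=0).round(2)) 124 | ```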
-------------------------------------------------------------------------------- /benchmark/llm/run1.sh: -------------------------------------------------------------------------------- 1 | echo "Task arxiv-1950-2016 -> arxiv-2016-2018" 2 | echo "Original ogbn features" 3 | 4 | python a2gnn.py --source 'arxiv-1950-2016' --target 'arxiv-2016-2018' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --dropout_ratio 0.1 --s_pnums 0 --t_pnums 5 --weight 0.1 --filename 'results-llm-1.txt' --device 'cuda:1' 5 | python udagcn.py --source 'arxiv-1950-2016' --target 'arxiv-2016-2018' --nhid 128 --num_layers 2 --lr 0.0001 --weight_decay 0.001 --epochs 800 --dropout_ratio 0.4 --filename 'results-llm-1.txt' --device 'cuda:1' 6 | python kbl.py --source 'arxiv-1950-2016' --target 'arxiv-2016-2018' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.01 --epochs 800 --k_cross 20 --k_within 10 --filename 'results-llm-1.txt' --device 'cuda:1' 7 | python grade.py --source 'arxiv-1950-2016' --target 'arxiv-2016-2018' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --dropout_ratio 0.5 --weight 0.01 --filename 'results-llm-1.txt' --device 'cuda:1' 8 | python adagcn.py --source 'arxiv-1950-2016' --target 'arxiv-2016-2018' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --dropout_ratio 0.4 --domain_weight 1.0 --filename 'results-llm-1.txt' --device 'cuda:1' 9 | 10 | echo "Task arxiv-1950-2016 -> arxiv-2018-2020" 11 | echo "Original ogbn features" 12 | 13 | python a2gnn.py --source 'arxiv-1950-2016' --target 'arxiv-2018-2020' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --dropout_ratio 0.1 --s_pnums 0 --t_pnums 5 --weight 0.1 --filename 'results-llm-1.txt' --device 'cuda:1' 14 | python udagcn.py --source 'arxiv-1950-2016' --target 'arxiv-2018-2020' --nhid 128 --num_layers 2 --lr 0.0001 --weight_decay 0.001 --epochs 800 --dropout_ratio 0.4 --filename 'results-llm-1.txt' --device 'cuda:1' 15 | python kbl.py --source 'arxiv-1950-2016' --target 'arxiv-2018-2020' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.01 --epochs 800 --k_cross 20 --k_within 10 --filename 'results-llm-1.txt' --device 'cuda:1' 16 | python grade.py --source 'arxiv-1950-2016' --target 'arxiv-2018-2020' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --dropout_ratio 0.5 --weight 0.01 --filename 'results-llm-1.txt' --device 'cuda:1' 17 | python adagcn.py --source 'arxiv-1950-2016' --target 'arxiv-2018-2020' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --dropout_ratio 0.4 --domain_weight 1.0 --filename 'results-llm-1.txt' --device 'cuda:1' 18 | 19 | echo "Task arxiv-2016-2018 -> arxiv-2018-2020" 20 | echo "Original ogbn features" 21 | 22 | python a2gnn.py --source 'arxiv-2016-2018' --target 'arxiv-2018-2020' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --dropout_ratio 0.1 --s_pnums 0 --t_pnums 5 --weight 0.1 --filename 'results-llm-1.txt' --device 'cuda:1' 23 | python udagcn.py --source 'arxiv-2016-2018' --target 'arxiv-2018-2020' --nhid 128 --num_layers 2 --lr 0.0001 --weight_decay 0.001 --epochs 800 --dropout_ratio 0.4 --filename 'results-llm-1.txt' --device 'cuda:1' 24 | python kbl.py --source 'arxiv-2016-2018' --target 'arxiv-2018-2020' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.01 --epochs 800 --k_cross 20 --k_within 10 --filename 'results-llm-1.txt' --device 'cuda:1' 25 | python grade.py --source 'arxiv-2016-2018' --target 'arxiv-2018-2020' 
--nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --dropout_ratio 0.5 --weight 0.01 --filename 'results-llm-1.txt' --device 'cuda:1' 26 | python adagcn.py --source 'arxiv-2016-2018' --target 'arxiv-2018-2020' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --dropout_ratio 0.4 --domain_weight 1.0 --filename 'results-llm-1.txt' --device 'cuda:1' -------------------------------------------------------------------------------- /benchmark/llm/run2.sh: -------------------------------------------------------------------------------- 1 | echo "Task arxiv-1950-2016 -> arxiv-2016-2018" 2 | echo "LLM enhanced text with word2vec embedding" 3 | 4 | python a2gnn.py --source 'llm-arxiv-1950-2016' --target 'llm-arxiv-2016-2018' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --dropout_ratio 0.1 --s_pnums 0 --t_pnums 5 --weight 0.1 --filename 'results-llm-2.txt' --device 'cuda:2' 5 | python udagcn.py --source 'llm-arxiv-1950-2016' --target 'llm-arxiv-2016-2018' --nhid 128 --num_layers 2 --lr 0.0001 --weight_decay 0.001 --epochs 800 --dropout_ratio 0.4 --filename 'results-llm-2.txt' --device 'cuda:2' 6 | python kbl.py --source 'llm-arxiv-1950-2016' --target 'llm-arxiv-2016-2018' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --k_cross 20 --k_within 10 --filename 'results-llm-2.txt' --device 'cuda:2' 7 | python grade.py --source 'llm-arxiv-1950-2016' --target 'llm-arxiv-2016-2018' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --dropout_ratio 0.5 --weight 0.01 --filename 'results-llm-2.txt' --device 'cuda:2' 8 | python adagcn.py --source 'llm-arxiv-1950-2016' --target 'llm-arxiv-2016-2018' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --dropout_ratio 0.4 --domain_weight 1.0 --filename 'results-llm-2.txt' --device 'cuda:2' 9 | 10 | 11 | echo "Task arxiv-1950-2016 -> arxiv-2018-2020" 12 | echo "LLM enhanced text with word2vec embedding" 13 | 14 | python a2gnn.py --source 'llm-arxiv-1950-2016' --target 'llm-arxiv-2018-2020' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --dropout_ratio 0.1 --s_pnums 0 --t_pnums 5 --weight 0.1 --filename 'results-llm-2.txt' --device 'cuda:2' 15 | python udagcn.py --source 'llm-arxiv-1950-2016' --target 'llm-arxiv-2018-2020' --nhid 128 --num_layers 2 --lr 0.0001 --weight_decay 0.001 --epochs 800 --dropout_ratio 0.4 --filename 'results-llm-2.txt' --device 'cuda:2' 16 | python kbl.py --source 'llm-arxiv-1950-2016' --target 'llm-arxiv-2018-2020' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --k_cross 20 --k_within 10 --filename 'results-llm-2.txt' --device 'cuda:2' 17 | python grade.py --source 'llm-arxiv-1950-2016' --target 'llm-arxiv-2018-2020' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --dropout_ratio 0.5 --weight 0.01 --filename 'results-llm-2.txt' --device 'cuda:2' 18 | python adagcn.py --source 'llm-arxiv-1950-2016' --target 'llm-arxiv-2018-2020' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --dropout_ratio 0.4 --domain_weight 1.0 --filename 'results-llm-2.txt' --device 'cuda:2' 19 | 20 | echo "Task arxiv-2016-2018 -> arxiv-2018-2020" 21 | echo "LLM enhanced text with word2vec embedding" 22 | 23 | python a2gnn.py --source 'llm-arxiv-2016-2018' --target 'llm-arxiv-2018-2020' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --dropout_ratio 0.1 --s_pnums 0 --t_pnums 5 --weight 0.1 --filename 'results-llm-2.txt' --device 'cuda:2' 24 | python 
udagcn.py --source 'llm-arxiv-2016-2018' --target 'llm-arxiv-2018-2020' --nhid 128 --num_layers 2 --lr 0.0001 --weight_decay 0.001 --epochs 800 --dropout_ratio 0.4 --filename 'results-llm-2.txt' --device 'cuda:2' 25 | python kbl.py --source 'llm-arxiv-2016-2018' --target 'llm-arxiv-2018-2020' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --k_cross 20 --k_within 10 --filename 'results-llm-2.txt' --device 'cuda:2' 26 | python grade.py --source 'llm-arxiv-2016-2018' --target 'llm-arxiv-2018-2020' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --dropout_ratio 0.5 --weight 0.01 --filename 'results-llm-2.txt' --device 'cuda:2' 27 | python adagcn.py --source 'llm-arxiv-2016-2018' --target 'llm-arxiv-2018-2020' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --dropout_ratio 0.4 --domain_weight 1.0 --filename 'results-llm-2.txt' --device 'cuda:2' 28 | -------------------------------------------------------------------------------- /examples/demo.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os.path as osp 3 | import numpy as np 4 | 5 | from pygda.datasets import CitationDataset 6 | 7 | from pygda.models import UDAGCN, A2GNN, GRADE 8 | from pygda.models import ASN, SpecReg, GNN 9 | from pygda.models import StruRW, ACDNE, DANE 10 | from pygda.models import AdaGCN, JHGDA, KBL 11 | from pygda.models import DGDA, SAGDA, CWGCN 12 | from pygda.models import DMGNN, PairAlign 13 | 14 | from pygda.metrics import eval_micro_f1, eval_macro_f1 15 | from pygda.utils import svd_transform 16 | 17 | parser = argparse.ArgumentParser() 18 | 19 | parser.add_argument('--nhid', type=int, default=64, help='hidden size') 20 | parser.add_argument('--device', type=str, default='cuda:3', help='specify cuda devices') 21 | parser.add_argument('--source', type=str, default='DBLPv7', help='source domain data, DBLPv7/ACMv9/Citationv1') 22 | parser.add_argument('--target', type=str, default='ACMv9', help='target domain data, DBLPv7/ACMv9/Citationv1') 23 | 24 | args = parser.parse_args() 25 | 26 | # load data 27 | if args.source in {'DBLPv7', 'ACMv9', 'Citationv1'}: 28 | path = osp.join(osp.dirname(osp.realpath(__file__)), '.', 'data/Citation', args.source) 29 | source_dataset = CitationDataset(path, args.source) 30 | 31 | if args.target in {'DBLPv7', 'ACMv9', 'Citationv1'}: 32 | path = osp.join(osp.dirname(osp.realpath(__file__)), '.', 'data/Citation', args.target) 33 | target_dataset = CitationDataset(path, args.target) 34 | 35 | source_data = source_dataset[0].to(args.device) 36 | target_data = target_dataset[0].to(args.device) 37 | 38 | num_features = source_data.x.size(1) 39 | num_classes = len(np.unique(source_data.y.cpu().numpy())) 40 | 41 | # choose a graph domain adaptation model 42 | model = A2GNN(in_dim=num_features, hid_dim=args.nhid, num_classes=num_classes, device=args.device) 43 | # model = UDAGCN(in_dim=num_features, hid_dim=args.nhid, num_classes=num_classes, device=args.device) 44 | # model = GRADE(in_dim=num_features, hid_dim=args.nhid, num_classes=num_classes, device=args.device) 45 | # model = ASN(in_dim=num_features, hid_dim=args.nhid, hid_dim_vae=args.nhid, num_classes=num_classes, device=args.device) 46 | # model = SpecReg(in_dim=num_features, hid_dim=args.nhid, num_classes=num_classes, device=args.device, reg_mode=True) 47 | # model = GNN(in_dim=num_features, hid_dim=args.nhid, num_classes=num_classes, device=args.device) 48 | # model = StruRW(in_dim=num_features, 
hid_dim=args.nhid, num_classes=num_classes, device=args.device) 49 | # model = ACDNE(in_dim=num_features, hid_dim=args.nhid, num_classes=num_classes, device=args.device) 50 | # model = DANE(in_dim=num_features, hid_dim=args.nhid, num_classes=num_classes, device=args.device) 51 | # model = AdaGCN(in_dim=num_features, hid_dim=args.nhid, num_classes=num_classes, device=args.device) 52 | # model = JHGDA(in_dim=num_features, hid_dim=args.nhid, num_classes=num_classes, device=args.device) 53 | # model = KBL(in_dim=num_features, hid_dim=args.nhid, num_classes=num_classes, device=args.device) 54 | # model = DGDA(in_dim=num_features, hid_dim=args.nhid, num_classes=num_classes, device=args.device) 55 | # model = SAGDA(in_dim=num_features, hid_dim=args.nhid, num_classes=num_classes, device=args.device) 56 | # model = CWGCN(in_dim=num_features, hid_dim=args.nhid, num_classes=num_classes, device=args.device) 57 | # model = DMGNN(in_dim=num_features, hid_dim=args.nhid, num_classes=num_classes, device=args.device) 58 | # model = PairAlign(in_dim=num_features, hid_dim=args.nhid, num_classes=num_classes, device=args.device) 59 | 60 | # train the model 61 | model.fit(source_data, target_data) 62 | 63 | # evaluate the performance 64 | logits, labels = model.predict(target_data) 65 | 66 | preds = logits.argmax(dim=1) 67 | 68 | mi_f1 = eval_micro_f1(labels, preds) 69 | ma_f1 = eval_macro_f1(labels, preds) 70 | 71 | print('micro-f1: ' + str(mi_f1)) 72 | print('macro-f1: ' + str(ma_f1)) 73 | -------------------------------------------------------------------------------- /benchmark/llm/origin_preprocess.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import random 4 | import json 5 | import pandas as pd 6 | import argparse 7 | 8 | 9 | def take_second(element): 10 | return element[1] 11 | 12 | 13 | def load_ogb_arxiv(data_dir, year_bound = [2018, 2020], proportion = 1.0): 14 | import ogb.nodeproppred 15 | 16 | dataset = ogb.nodeproppred.NodePropPredDataset(name='ogbn-arxiv', root=data_dir) 17 | graph = dataset.graph 18 | 19 | node_years = graph['node_year'] 20 | # print(node_years) year for each node 21 | n = node_years.shape[0] 22 | # print(n) number of nodes 23 | node_years = node_years.reshape(n) 24 | 25 | # gpt_text = load_data_gpt_text(n) # not defined in this script and the result is unused; disabled so the script runs 26 | # raw_text = load_data_raw_text() 27 | 28 | d = np.zeros(len(node_years)) 29 | print(d.shape) 30 | 31 | edges = graph['edge_index'] 32 | for i in range(edges.shape[1]): 33 | if node_years[edges[0][i]] <= year_bound[1] and node_years[edges[1][i]] <= year_bound[1] and node_years[edges[0][i]] > year_bound[0] and node_years[edges[1][i]] > year_bound[0]: 34 | d[edges[0][i]] += 1 35 | d[edges[1][i]] += 1 36 | 37 | nodes = [] 38 | for i, year in enumerate(node_years): 39 | if year <= year_bound[1] and year > year_bound[0]: 40 | nodes.append([i, d[i]]) 41 | 42 | nodes.sort(key = take_second, reverse = True) 43 | 44 | nodes = nodes[: int(proportion * len(nodes))] 45 | 46 | random.shuffle(nodes) 47 | 48 | result_edges = [] 49 | result_features = [] 50 | result_labels = [] 51 | result_text = [] 52 | 53 | for node in nodes: 54 | result_features.append(graph['node_feat'][node[0]]) 55 | result_features = np.array(result_features) 56 | 57 | ids = {} 58 | for i, node in enumerate(nodes): 59 | ids[node[0]] = i 60 | 61 | for i in range(edges.shape[1]): 62 | if edges[0][i] in ids and edges[1][i] in ids: 63 | result_edges.append([ids[edges[0][i]], ids[edges[1][i]]]) 64 | result_edges = 
np.array(result_edges).transpose(1, 0) 65 | 66 | result_labels = dataset.labels[[node[0] for node in nodes]] 67 | 68 | edge_index = torch.tensor(result_edges, dtype=torch.long) 69 | 70 | # result_features: original features 71 | node_feat = torch.tensor(result_features, dtype=torch.float) 72 | 73 | dataset.graph = {'edge_index': edge_index, 74 | 'edge_feat': None, 75 | 'node_feat': node_feat, 76 | 'num_nodes': node_feat.size(0)} 77 | dataset.label = torch.tensor(result_labels) 78 | 79 | return dataset 80 | 81 | def main(args): 82 | data_dir = './data' 83 | 84 | # 3 domains: [1950, 2016], [2016, 2018], [2018, 2020] 85 | 86 | start_year = 1950 87 | end_year = 2016 88 | 89 | dataset = load_ogb_arxiv(data_dir, year_bound=[start_year, end_year]) 90 | 91 | dataset.n = dataset.graph['num_nodes'] 92 | dataset.c = max(dataset.label.max().item() + 1, dataset.label.shape[1]) 93 | dataset.d = dataset.graph['node_feat'].shape[1] 94 | 95 | print(torch.min(dataset.graph['edge_index'])) 96 | print(torch.max(dataset.graph['edge_index'])) 97 | print(len(torch.unique(dataset.graph['edge_index']))) 98 | print(len(dataset.graph['edge_index'][1])) 99 | print(dataset.graph['node_feat'].size()) 100 | print(len(dataset.label)) 101 | 102 | print(f"num nodes {dataset.n}| num classes {dataset.c} | num node feats {dataset.d}") 103 | 104 | import pickle 105 | 106 | filename = 'arxiv-' + str(start_year) + '-' + str(end_year) + '.pkl' 107 | 108 | fw = open(filename, 'wb') 109 | pickle.dump(dataset, fw) 110 | 111 | 112 | if __name__ == "__main__": 113 | parser = argparse.ArgumentParser() 114 | 115 | args = parser.parse_args() 116 | main(args) 117 | -------------------------------------------------------------------------------- /benchmark/llm/run3.sh: -------------------------------------------------------------------------------- 1 | echo "Task arxiv-1950-2016 -> arxiv-2016-2018" 2 | echo "LLM enhanced text with bert embedding" 3 | 4 | python a2gnn.py --source 'llm-bert-arxiv-1950-2016' --target 'llm-bert-arxiv-2016-2018' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --dropout_ratio 0.1 --s_pnums 0 --t_pnums 5 --weight 0.1 --filename 'results-llm-3.txt' --device 'cuda:3' 5 | python udagcn.py --source 'llm-bert-arxiv-1950-2016' --target 'llm-bert-arxiv-2016-2018' --nhid 128 --num_layers 2 --lr 0.001 --weight_decay 0.0 --epochs 800 --dropout_ratio 0.4 --filename 'results-llm-3.txt' --device 'cuda:3' 6 | python kbl.py --source 'llm-bert-arxiv-1950-2016' --target 'llm-bert-arxiv-2016-2018' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --k_cross 20 --k_within 10 --filename 'results-llm-3.txt' --device 'cuda:3' 7 | python grade.py --source 'llm-bert-arxiv-1950-2016' --target 'llm-bert-arxiv-2016-2018' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --dropout_ratio 0.5 --weight 0.01 --filename 'results-llm-3.txt' --device 'cuda:3' 8 | python adagcn.py --source 'llm-bert-arxiv-1950-2016' --target 'llm-bert-arxiv-2016-2018' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --dropout_ratio 0.4 --domain_weight 1.0 --filename 'results-llm-3.txt' --device 'cuda:3' 9 | 10 | 11 | echo "Task arxiv-1950-2016 -> arxiv-2018-2020" 12 | echo "LLM enhanced text with bert embedding" 13 | 14 | python a2gnn.py --source 'llm-bert-arxiv-1950-2016' --target 'llm-bert-arxiv-2018-2020' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --dropout_ratio 0.1 --s_pnums 0 --t_pnums 5 --weight 0.1 --filename 'results-llm-3.txt' --device 'cuda:3' 15 | 
python udagcn.py --source 'llm-bert-arxiv-1950-2016' --target 'llm-bert-arxiv-2018-2020' --nhid 128 --num_layers 2 --lr 0.001 --weight_decay 0.0 --epochs 800 --dropout_ratio 0.4 --filename 'results-llm-3.txt' --device 'cuda:3' 16 | python kbl.py --source 'llm-bert-arxiv-1950-2016' --target 'llm-bert-arxiv-2018-2020' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --k_cross 20 --k_within 10 --filename 'results-llm-3.txt' --device 'cuda:3' 17 | python grade.py --source 'llm-bert-arxiv-1950-2016' --target 'llm-bert-arxiv-2018-2020' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --dropout_ratio 0.5 --weight 0.01 --filename 'results-llm-3.txt' --device 'cuda:3' 18 | python adagcn.py --source 'llm-bert-arxiv-1950-2016' --target 'llm-bert-arxiv-2018-2020' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --dropout_ratio 0.4 --domain_weight 1.0 --filename 'results-llm-3.txt' --device 'cuda:3' 19 | 20 | echo "Task arxiv-2016-2018 -> arxiv-2018-2020" 21 | echo "LLM enhanced text with bert embedding" 22 | 23 | python a2gnn.py --source 'llm-bert-arxiv-2016-2018' --target 'llm-bert-arxiv-2018-2020' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --dropout_ratio 0.1 --s_pnums 0 --t_pnums 5 --weight 0.1 --filename 'results-llm-3.txt' --device 'cuda:3' 24 | python udagcn.py --source 'llm-bert-arxiv-2016-2018' --target 'llm-bert-arxiv-2018-2020' --nhid 128 --num_layers 2 --lr 0.001 --weight_decay 0.0 --epochs 800 --dropout_ratio 0.4 --filename 'results-llm-3.txt' --device 'cuda:3' 25 | python kbl.py --source 'llm-bert-arxiv-2016-2018' --target 'llm-bert-arxiv-2018-2020' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --k_cross 20 --k_within 10 --filename 'results-llm-3.txt' --device 'cuda:3' 26 | python grade.py --source 'llm-bert-arxiv-2016-2018' --target 'llm-bert-arxiv-2018-2020' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --dropout_ratio 0.5 --weight 0.01 --filename 'results-llm-3.txt' --device 'cuda:3' 27 | python adagcn.py --source 'llm-bert-arxiv-2016-2018' --target 'llm-bert-arxiv-2018-2020' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --dropout_ratio 0.4 --domain_weight 1.0 --filename 'results-llm-3.txt' --device 'cuda:3' 28 | -------------------------------------------------------------------------------- /docs/datasets/Overview.md: -------------------------------------------------------------------------------- 1 | # Dataset Overview 2 | 3 | This section provides detailed documentation for all supported datasets in PyGDA, including their domains and sources. 
4 | 5 | ### Citation Networks 6 | 7 | #### [Arxiv](Arxiv.md) 8 | - **Domains**: 3 domains based on publication years 9 | - **Source**: [ogbn-arxiv](https://ogb.stanford.edu/docs/nodeprop/#ogbn-arxiv) 10 | - **Processing**: See [ArxivDataset](https://github.com/pygda-team/pygda/blob/main/pygda/datasets/arxiv.py) 11 | - **Features**: Generated from paper abstracts 12 | - **Note**: Can be preprocessed with scripts in benchmark folder 13 | 14 | #### [Citation](Citation.md) 15 | - **Domains**: ACMv9, Citationv1, DBLPv7 16 | - **Source**: Adopted from [ASN](https://arxiv.org/abs/2103.13355) 17 | - **Processing**: See [CitationDataset](https://github.com/pygda-team/pygda/blob/main/pygda/datasets/citation.py) 18 | - **Download**: [Download Link](https://drive.google.com/drive/folders/1ntNt3qHE4p9Us8Re9tZDaB-tdtqwV8AX?usp=share_link) 19 | 20 | #### [MAG](MAG.md) 21 | - **Domains**: CN, DE, FR, JP, RU, US 22 | - **Source**: Originally from [ogbn-mag](https://ogb.stanford.edu/docs/nodeprop/#ogbn-mag) 23 | - **Processing**: See [MAGDataset](https://github.com/pygda-team/pygda/blob/main/pygda/datasets/mag.py) 24 | - **Download**: [Download Link](https://drive.google.com/drive/folders/1HinhjpNPPivyqoubiYOr8X2jq-rjw3e9?usp=share_link) 25 | - **Note**: Separated into 6 countries by [PairAlign](https://arxiv.org/abs/2403.01092) 26 | 27 | ### Social Networks 28 | 29 | #### [Blog](Blog.md) 30 | - **Domains**: Blog1, Blog2 31 | - **Source**: Adopted from [ACDNE](https://arxiv.org/abs/2002.07366) 32 | - **Processing**: See [BlogDataset](https://github.com/pygda-team/pygda/blob/main/pygda/datasets/blog.py) 33 | - **Download**: [Download Link](https://drive.google.com/drive/folders/1jKKG0o7rEY-BaVEjBhuGijzwwhU0M-pQ?usp=share_link) 34 | 35 | #### [Twitch](Twitch.md) 36 | - **Domains**: DE, EN, ES, FR, PT, RU 37 | - **Source**: [Twitch Social Networks](https://github.com/benedekrozemberczki/datasets#twitch-social-networks) 38 | - **Processing**: See [TwitchDataset](https://github.com/pygda-team/pygda/blob/main/pygda/datasets/twitch.py) 39 | - **Download**: [Download Link](https://drive.google.com/drive/folders/1GWMyyJOZ4CeeqP_H5dCA5voSQHT0WlXG?usp=share_link) 40 | 41 | ### Infrastructure Networks 42 | 43 | #### [Airport](Airport.md) 44 | - **Domains**: Brazil, Europe, USA 45 | - **Source**: Adopted from [struc2vec](https://arxiv.org/abs/1704.03165) 46 | - **Processing**: See [AirportDataset](https://github.com/pygda-team/pygda/blob/main/pygda/datasets/airport.py) 47 | - **Features**: Constructed using OneHotDegree for each node 48 | - **Download**: [Download Link](https://drive.google.com/drive/folders/1zlluWoeukD33ZxwaTRQi3jCdD0qC-I2j?usp=share_link) 49 | 50 | ### Graph Classification Benchmarks 51 | 52 | #### [TUGraph](TUGraph.md) 53 | - **Datasets**: 54 | 55 | * PROTEINS 56 | * FRANKENSTEIN 57 | * Mutagenicity 58 | 59 | - **Domains**: 2 domains based on density for each dataset 60 | - **Source**: Adopted from [TUDataset](https://chrsmrrs.github.io/datasets/docs/datasets/) 61 | - **Processing**: See [GraphTUDataset](https://github.com/pygda-team/pygda/blob/main/pygda/datasets/tugraph.py) 62 | - **Download**: [Download Link](https://drive.google.com/drive/folders/1NbPK71Dy0ulH3CdNyfvMwQECj_Oh867I?usp=sharing) 63 | 64 | ### Usage Example 65 | 66 | ```python 67 | from pygda.datasets import CitationDataset 68 | 69 | # Load the Citation dataset 70 | dataset = CitationDataset(root='data/citation', name='ACMv9') 71 | data = dataset[0] 72 | 73 | # Access the data 74 | x = data.x # Node features 75 | edge_index = 
data.edge_index # Graph connectivity 76 | y = data.y # Labels 77 | ``` 78 | 79 | Each dataset documentation includes: 80 | 81 | - Detailed domain descriptions 82 | - Data sources and references 83 | - Processing instructions 84 | - Download information 85 | - Usage examples 86 | - Implementation details 87 | 88 | For specific details about each dataset, please visit their respective documentation pages linked above. -------------------------------------------------------------------------------- /pygda/utils/utility.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def logger(epoch=0, 4 | loss=0, 5 | source_train_acc=None, 6 | source_val_acc=None, 7 | target=None, 8 | time=None, 9 | verbose=0, 10 | train=True, score=None): 11 | """ 12 | Print formatted training/testing progress information. 13 | 14 | Parameters 15 | ---------- 16 | epoch : int, optional 17 | Current training epoch. Default: 0 18 | loss : float or tuple, optional 19 | Loss value(s) for current epoch. If tuple, contains inner and outer losses. 20 | Default: 0 21 | source_train_acc : float, optional 22 | Source domain training accuracy. Default: None 23 | source_val_acc : float, optional 24 | Source domain validation accuracy. Default: None 25 | target : torch.Tensor, optional 26 | Target domain predictions/labels. Default: None 27 | time : float, optional 28 | Time taken for current epoch. Default: None 29 | verbose : int, optional 30 | Verbosity level controlling output detail: 31 | 32 | - 0: No output 33 | - 1: Basic loss information 34 | - 2: Add accuracy metrics 35 | - 3: Add detailed metrics (recall, precision, etc.; requires per-sample scores via the trailing score argument) 36 | 37 | Default: 0 38 | train : bool, optional 39 | Whether in training or testing mode. Default: True 40 | 41 | Notes 42 | ----- 43 | Output Levels: 44 | 45 | - Basic Output (verbose=1): 46 | 47 | * Epoch number (training) or "Test" (testing) 48 | * Loss values (single or inner/outer) 49 | 50 | - Extended Output (verbose=2): 51 | 52 | * Basic output 53 | * Source domain accuracy 54 | * Target domain accuracy 55 | * Timing information 56 | 57 | - Detailed Output (verbose=3): 58 | 59 | * Extended output 60 | * Recall at k 61 | * Precision at k 62 | * Average precision 63 | * F1 score 64 | * Contamination metrics 65 | 66 | Features: 67 | 68 | - Multi-level verbosity 69 | - Flexible metric display 70 | - Progress tracking 71 | - Performance monitoring 72 | 73 | Format: 74 | 75 | - Epoch XXXX: loss X.XXXX, source acc X.XXXX, target acc X.XXXX, time X.XX
76 | """ 77 | if verbose > 0: 78 | if train: 79 | print("Epoch {:04d}: ".format(epoch), end='') 80 | else: 81 | print("Test: ", end='') 82 | 83 | if isinstance(loss, tuple): 84 | print("Loss I {:.4f} | Loss O {:.4f} | " 85 | .format(loss[0], loss[1]), end='') 86 | else: 87 | print("loss {:.4f}, ".format(loss), end='') 88 | 89 | if verbose > 1: 90 | if source_train_acc is not None: 91 | print("source acc {:.4f}, ".format(source_train_acc), end='') 92 | 93 | if target is not None: 94 | print("target acc {:.4f}, ".format(target), end='') 95 | 96 | if verbose > 2: # detailed metrics need the 'score' argument; the eval_* helpers below are assumed to be provided by pygda's metrics module 97 | if target is not None: 98 | pos_size = target.nonzero().size(0) 99 | rec = eval_recall_at_k(target, score, pos_size) 100 | pre = eval_precision_at_k(target, score, pos_size) 101 | ap = eval_average_precision(target, score) 102 | 103 | contamination = sum(target) / len(target) 104 | threshold = np.percentile(score, 105 | 100 * (1 - contamination)) 106 | pred = (score > threshold).long() 107 | f1 = eval_f1(target, pred) 108 | 109 | print(" | Recall {:.4f} | Precision {:.4f} " 110 | "| AP {:.4f} | F1 {:.4f}" 111 | .format(rec, pre, ap, f1), end='') 112 | 113 | if time is not None: 114 | print("time {:.2f}".format(time), end='') 115 | 116 | print() -------------------------------------------------------------------------------- /pygda/nn/deepwalk_pretrain.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch_geometric.nn import Node2Vec 3 | 4 | class DWPretrain(torch.nn.Module): 5 | """ 6 | DeepWalk pretraining implementation for graph embeddings. 7 | 8 | Parameters 9 | ---------- 10 | data : torch_geometric.data.Data 11 | Input graph data object. 12 | epoch : int, optional 13 | Number of training epochs. Default: 200. 14 | embedding_dim : int, optional 15 | Dimension of node embeddings. Default: 128. 16 | walk_length : int, optional 17 | Length of each random walk. Default: 20. 18 | context_size : int, optional 19 | Size of context window. Default: 10. 20 | walks_per_node : int, optional 21 | Number of walks per node. Default: 10. 22 | num_negative_samples : int, optional 23 | Number of negative samples per positive pair. Default: 1. 24 | 25 | Notes 26 | ----- 27 | Implements DeepWalk algorithm using Node2Vec with p=q=1.0 (equivalent to DeepWalk). 28 | Uses sparse implementation for memory efficiency. 29 | """ 30 | 31 | def __init__( 32 | self, 33 | data, 34 | epoch=200, 35 | embedding_dim=128, 36 | walk_length=20, 37 | context_size=10, 38 | walks_per_node=10, 39 | num_negative_samples=1, 40 | ): 41 | super(DWPretrain, self).__init__() 42 | 43 | self.data = data 44 | self.device = data.edge_index.device 45 | self.epoch = epoch 46 | self.embedding_dim = embedding_dim 47 | self.walk_length = walk_length 48 | self.context_size = context_size 49 | self.walks_per_node = walks_per_node 50 | self.num_negative_samples = num_negative_samples 51 | 52 | self.model = Node2Vec( 53 | data.edge_index, 54 | embedding_dim=self.embedding_dim, 55 | walk_length=self.walk_length, 56 | context_size=self.context_size, 57 | walks_per_node=self.walks_per_node, 58 | num_negative_samples=self.num_negative_samples, 59 | p=1.0, 60 | q=1.0, 61 | sparse=True, 62 | ).to(self.device) 63 | 64 | num_workers = 4 65 | self.loader = self.model.loader(batch_size=128, shuffle=True, num_workers=num_workers) 66 | self.optimizer = torch.optim.SparseAdam(list(self.model.parameters()), lr=0.01) 67 | 68 | def train(self): 69 | """ 70 | Execute one epoch of training. 
71 | 72 | Returns 73 | ------- 74 | float 75 | Average loss value for the epoch. 76 | 77 | Notes 78 | ----- 79 | Training process: 80 | 81 | 1. Generate random walks 82 | 2. Sample positive and negative context pairs 83 | 3. Update embeddings using SparseAdam optimizer 84 | """ 85 | self.model.train() 86 | total_loss = 0 87 | for pos_rw, neg_rw in self.loader: 88 | self.optimizer.zero_grad() 89 | loss = self.model.loss(pos_rw.to(self.device), neg_rw.to(self.device)) 90 | loss.backward() 91 | self.optimizer.step() 92 | total_loss += loss.item() 93 | 94 | return total_loss / len(self.loader) 95 | 96 | def fit(self): 97 | """ 98 | Complete training procedure for all epochs. 99 | 100 | Notes 101 | ----- 102 | Executes training loop for specified number of epochs. 103 | Prints progress including epoch number and loss value. 104 | """ 105 | for epoch in range(self.epoch): 106 | loss = self.train() 107 | print(f'Epoch: {epoch:03d}, pretrain loss: {loss:.4f}') 108 | 109 | def get_embedding(self): 110 | """ 111 | Retrieve learned node embeddings. 112 | 113 | Returns 114 | ------- 115 | torch.Tensor 116 | Node embedding matrix of shape (num_nodes, embedding_dim). 117 | 118 | Notes 119 | ----- 120 | Returns final node embeddings after training or during evaluation. 121 | """ 122 | self.model.eval() 123 | z = self.model() 124 | 125 | return z 126 | -------------------------------------------------------------------------------- /pygda/datasets/tugraph.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | import torch 3 | import numpy as np 4 | from torch_geometric.data import InMemoryDataset, Data 5 | from torch_geometric.io import read_txt_array 6 | import torch.nn.functional as F 7 | import random 8 | 9 | import scipy 10 | import pickle as pkl 11 | from sklearn.preprocessing import label_binarize 12 | import csv 13 | import json 14 | 15 | import warnings 16 | warnings.filterwarnings('ignore', category=DeprecationWarning) 17 | 18 | 19 | class GraphTUDataset(InMemoryDataset): 20 | """ 21 | TUGraph Dataset loader for graph-based analysis. 22 | 23 | Parameters 24 | ---------- 25 | root : str 26 | Root directory where the dataset should be saved 27 | name : str 28 | Name of the TU dataset 29 | transform : callable, optional 30 | Function/transform that takes in a Data object and returns a transformed 31 | version. Default: None 32 | pre_transform : callable, optional 33 | Function/transform to be applied to the data object before saving. 34 | Default: None 35 | pre_filter : callable, optional 36 | Function that takes in a Data object and returns a boolean value, 37 | indicating whether the data object should be included. Default: None 38 | 39 | Notes 40 | ----- 41 | Dataset Structure: 42 | 43 | - Collection of graphs 44 | - Each graph has its own structure and features 45 | - Supports various graph classification tasks 46 | - Random shuffling for better training 47 | """ 48 | def __init__(self, 49 | root, 50 | name, 51 | transform=None, 52 | pre_transform=None, 53 | pre_filter=None): 54 | self.name = name 55 | self.root = root 56 | super(GraphTUDataset, self).__init__(root, transform, pre_transform, pre_filter) 57 | 58 | self.data, self.slices = torch.load(self.processed_paths[0]) 59 | 60 | @property 61 | def raw_file_names(self): 62 | """ 63 | Names of required raw files. 
64 | 65 | Returns 66 | ------- 67 | list[str] 68 | List of required raw file names 69 | 70 | Notes 71 | ----- 72 | Required files: 73 | 74 | - {name}.pkl: Pickle file containing the list of graph data objects 75 | """ 76 | return ['{}.pkl'.format(self.name)] 77 | 78 | @property 79 | def processed_file_names(self): 80 | """ 81 | Names of processed data files. 82 | 83 | Returns 84 | ------- 85 | list[str] 86 | List of processed file names 87 | 88 | Notes 89 | ----- 90 | Processed files: 91 | 92 | - data.pt: Contains processed PyTorch Geometric data objects 93 | """ 94 | return ['data.pt'] 95 | 96 | def download(self): 97 | """ 98 | Download raw data files. 99 | 100 | Notes 101 | ----- 102 | Empty implementation - data should be manually placed in the raw directory 103 | """ 104 | pass 105 | 106 | def process(self): 107 | """ 108 | Process raw data into PyTorch Geometric Data format. 109 | 110 | Notes 111 | ----- 112 | Processing Steps: 113 | 114 | - Load pickle data: 115 | 116 | * List of graph data objects 117 | 118 | - Random shuffling: 119 | 120 | * Shuffle graphs for better training 121 | 122 | - Apply pre-transform: 123 | 124 | * Transform each graph if specified 125 | 126 | - Collate graphs: 127 | 128 | * Combine into single data object 129 | 130 | - Save processed data 131 | 132 | Features: 133 | 134 | - Multiple graph handling 135 | - Random shuffling 136 | - Optional pre-transform support 137 | - Batch processing support 138 | """ 139 | path = osp.join(self.raw_dir, '{}.pkl'.format(self.name)) 140 | data_list = pkl.load(open(path, 'rb')) 141 | random.shuffle(data_list) 142 | 143 | if self.pre_transform is not None: 144 | data_list = [self.pre_transform(data) for data in data_list] 145 | 146 | self.data, self.slices = self.collate(data_list) 147 | 148 | torch.save((self.data, self.slices), self.processed_paths[0]) 149 | -------------------------------------------------------------------------------- /docs/assets/css/custom.css: -------------------------------------------------------------------------------- 1 | /* Global styles */ 2 | .rst-content { 3 | max-width: 1000px; 4 | margin: 0 auto; 5 | line-height: 1.7; 6 | } 7 | 8 | /* Beautiful Typography */ 9 | .rst-content h1 { 10 | font-size: 2.5rem; 11 | color: #2c3e50; 12 | border-bottom: 3px solid #3498db; 13 | padding-bottom: 0.5rem; 14 | margin-bottom: 2rem; 15 | } 16 | 17 | .rst-content h2 { 18 | font-size: 2rem; 19 | color: #34495e; 20 | margin-top: 2.5rem; 21 | margin-bottom: 1.5rem; 22 | } 23 | 24 | .rst-content h3 { 25 | font-size: 1.75rem; 26 | color: #2980b9; 27 | margin-top: 2rem; 28 | } 29 | 30 | /* Function signatures with gradient background */ 31 | .rst-content .function { 32 | background: linear-gradient(to right, #f6f9fc, #ffffff); 33 | border-left: 4px solid #3498db; 34 | padding: 1rem; 35 | margin: 1.5rem 0; 36 | border-radius: 0 6px 6px 0; 37 | font-size: 1.2rem; 38 | font-weight: 600; 39 | color: #2c3e50; 40 | box-shadow: 0 2px 4px rgba(0,0,0,0.1); 41 | } 42 | 43 | /* Beautiful docstring styling */ 44 | .rst-content .docstring { 45 | background: #ffffff; 46 | padding: 1.5rem; 47 | margin: 1rem 0 2rem 0; 48 | border-radius: 8px; 49 | box-shadow: 0 4px 6px rgba(0,0,0,0.07); 50 | } 51 | 52 | .rst-content .docstring .parameter { 53 | font-family: "Segoe UI", system-ui, -apple-system, sans-serif; 54 | color: #2980b9; 55 | background-color: #f8f9fa; 56 | padding: 1rem 1.5rem; 57 | margin: 0.5rem 0; 58 | border-radius: 6px; 59 | border: 1px solid #e1e8ed; 60 | transition: all 0.2s ease; 61 | } 62 | 63 | .rst-content .docstring .parameter:hover { 64 |
transform: translateY(-2px); 65 | box-shadow: 0 4px 8px rgba(0,0,0,0.1); 66 | } 67 | 68 | /* Code blocks with modern styling */ 69 | .rst-content pre { 70 | background: #282c34; 71 | border-radius: 8px; 72 | margin: 1.5rem 0; 73 | box-shadow: 0 4px 6px rgba(0,0,0,0.1); 74 | } 75 | 76 | .rst-content pre code { 77 | color: #abb2bf; /* light text on the dark #282c34 background (was #282c34, which made code invisible) */ 78 | font-family: 'Fira Code', 'Consolas', monospace; 79 | padding: 1.5rem; 80 | font-size: 0.95rem; 81 | line-height: 1.6; 82 | } 83 | 84 | /* Inline code */ 85 | .rst-content code { 86 | color: #e83e8c; 87 | background: #f8f9fa; 88 | padding: 2px 6px; 89 | border-radius: 4px; 90 | font-size: 0.9em; 91 | } 92 | 93 | /* Method signatures */ 94 | .rst-content .method .signature { 95 | color: #6c5ce7; 96 | font-weight: 600; 97 | background: #f8f9fa; 98 | padding: 1rem; 99 | border-radius: 6px; 100 | margin: 1rem 0; 101 | border: 1px solid #e1e8ed; 102 | } 103 | 104 | /* Summary sections */ 105 | .rst-content .docstring .summary { 106 | font-weight: 600; 107 | color: #2c3e50; 108 | font-size: 1.1rem; 109 | margin-bottom: 1rem; 110 | padding-bottom: 0.5rem; 111 | border-bottom: 2px solid #e1e8ed; 112 | } 113 | 114 | /* Arguments styling */ 115 | .rst-content .arguments .argument { 116 | color: #576574; 117 | padding: 0.5rem 0; 118 | border-bottom: 1px solid #f1f1f1; 119 | } 120 | 121 | /* Admonitions (notes, warnings, etc.) */ 122 | .rst-content .admonition { 123 | border-radius: 8px; 124 | border: none; 125 | box-shadow: 0 2px 4px rgba(0,0,0,0.1); 126 | margin: 1.5rem 0; 127 | } 128 | 129 | .rst-content .admonition-title { 130 | border-radius: 8px 8px 0 0; 131 | text-transform: uppercase; 132 | letter-spacing: 0.5px; 133 | font-size: 0.9rem; 134 | } 135 | 136 | /* Links */ 137 | .rst-content a { 138 | color: #3498db; 139 | text-decoration: none; 140 | transition: color 0.2s ease; 141 | } 142 | 143 | .rst-content a:hover { 144 | color: #2980b9; 145 | text-decoration: underline; 146 | } 147 | 148 | /* Tables */ 149 | .rst-content table { 150 | border-radius: 8px; 151 | overflow: hidden; 152 | box-shadow: 0 2px 4px rgba(0,0,0,0.1); 153 | margin: 2rem 0; 154 | } 155 | 156 | .rst-content table thead th { 157 | background: #f8f9fa; 158 | border-bottom: 2px solid #e1e8ed; 159 | color: #2c3e50; 160 | padding: 12px 15px; 161 | } 162 | 163 | .rst-content table td { 164 | padding: 12px 15px; 165 | border-bottom: 1px solid #e1e8ed; 166 | } 167 | 168 | /* Lists */ 169 | .rst-content ul, .rst-content ol { 170 | padding-left: 1.5rem; 171 | margin: 1rem 0; 172 | } 173 | 174 | .rst-content li { 175 | margin: 0.5rem 0; 176 | color: #2c3e50; 177 | } 178 | 179 | /* Smooth scrolling */ 180 | html { 181 | scroll-behavior: smooth; 182 | } 183 | 184 | /* Selection color */ 185 | ::selection { 186 | background: #3498db33; 187 | color: #2c3e50; 188 | } 189 | -------------------------------------------------------------------------------- /.github/workflows/codeql.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages.
11 | # 12 | name: "CodeQL Advanced" 13 | 14 | on: 15 | push: 16 | branches: [ "main" ] 17 | pull_request: 18 | branches: [ "main" ] 19 | schedule: 20 | - cron: '31 12 * * 2' 21 | 22 | jobs: 23 | analyze: 24 | name: Analyze (${{ matrix.language }}) 25 | # Runner size impacts CodeQL analysis time. To learn more, please see: 26 | # - https://gh.io/recommended-hardware-resources-for-running-codeql 27 | # - https://gh.io/supported-runners-and-hardware-resources 28 | # - https://gh.io/using-larger-runners (GitHub.com only) 29 | # Consider using larger runners or machines with greater resources for possible analysis time improvements. 30 | runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }} 31 | permissions: 32 | # required for all workflows 33 | security-events: write 34 | 35 | # required to fetch internal or private CodeQL packs 36 | packages: read 37 | 38 | # only required for workflows in private repositories 39 | actions: read 40 | contents: read 41 | 42 | strategy: 43 | fail-fast: false 44 | matrix: 45 | include: 46 | - language: python 47 | build-mode: none 48 | # CodeQL supports the following values keywords for 'language': 'c-cpp', 'csharp', 'go', 'java-kotlin', 'javascript-typescript', 'python', 'ruby', 'swift' 49 | # Use `c-cpp` to analyze code written in C, C++ or both 50 | # Use 'java-kotlin' to analyze code written in Java, Kotlin or both 51 | # Use 'javascript-typescript' to analyze code written in JavaScript, TypeScript or both 52 | # To learn more about changing the languages that are analyzed or customizing the build mode for your analysis, 53 | # see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/customizing-your-advanced-setup-for-code-scanning. 54 | # If you are analyzing a compiled language, you can modify the 'build-mode' for that language to customize how 55 | # your codebase is analyzed, see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/codeql-code-scanning-for-compiled-languages 56 | steps: 57 | - name: Checkout repository 58 | uses: actions/checkout@v4 59 | 60 | # Add any setup steps before running the `github/codeql-action/init` action. 61 | # This includes steps like installing compilers or runtimes (`actions/setup-node` 62 | # or others). This is typically only required for manual builds. 63 | # - name: Setup runtime (example) 64 | # uses: actions/setup-example@v1 65 | 66 | # Initializes the CodeQL tools for scanning. 67 | - name: Initialize CodeQL 68 | uses: github/codeql-action/init@v3 69 | with: 70 | languages: ${{ matrix.language }} 71 | build-mode: ${{ matrix.build-mode }} 72 | # If you wish to specify custom queries, you can do so here or in a config file. 73 | # By default, queries listed here will override any specified in a config file. 74 | # Prefix the list here with "+" to use these queries and those in the config file. 75 | 76 | # For more details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs 77 | # queries: security-extended,security-and-quality 78 | 79 | # If the analyze step fails for one of the languages you are analyzing with 80 | # "We were unable to automatically build your code", modify the matrix above 81 | # to set the build mode to "manual" for that language. Then modify this step 82 | # to build your code. 
83 | # ℹ️ Command-line programs to run using the OS shell. 84 | # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun 85 | - if: matrix.build-mode == 'manual' 86 | shell: bash 87 | run: | 88 | echo 'If you are using a "manual" build mode for one or more of the' \ 89 | 'languages you are analyzing, replace this with the commands to build' \ 90 | 'your code, for example:' 91 | echo ' make bootstrap' 92 | echo ' make release' 93 | exit 1 94 | 95 | - name: Perform CodeQL Analysis 96 | uses: github/codeql-action/analyze@v3 97 | with: 98 | category: "/language:${{matrix.language}}" 99 | -------------------------------------------------------------------------------- /benchmark/llm/kbl.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('..') 3 | 4 | import torch 5 | import argparse 6 | import time 7 | import os.path as osp 8 | import numpy as np 9 | 10 | from pygda.datasets import ArxivDataset 11 | 12 | from pygda.models import KBL 13 | 14 | from pygda.metrics import eval_micro_f1, eval_macro_f1, eval_roc_auc 15 | 16 | from torch_geometric.loader import NeighborLoader 17 | from torch_geometric.utils import degree, is_undirected, to_undirected 18 | from torch_geometric.transforms import OneHotDegree 19 | 20 | parser = argparse.ArgumentParser() 21 | 22 | # model agnostic params 23 | parser.add_argument('--seed', type=int, default=200, help='random seed') 24 | parser.add_argument('--num_layers', type=int, default=3, help='number of layers') 25 | parser.add_argument('--lr', type=float, default=0.001, help='learning rate') 26 | parser.add_argument('--weight_decay', type=float, default=0.0, help='weight decay') 27 | parser.add_argument('--nhid', type=int, default=128, help='hidden size') 28 | parser.add_argument('--dropout_ratio', type=float, default=0.1, help='dropout ratio') 29 | parser.add_argument('--device', type=str, default='cuda:3', help='specify cuda devices') 30 | parser.add_argument('--source', type=str, default='llm-bert-arxiv-1950-2016', help='source domain data, DBLPv7/ACMv9/Citationv1') 31 | parser.add_argument('--target', type=str, default='llm-bert-arxiv-2016-2018', help='target domain data, DBLPv7/ACMv9/Citationv1') 32 | parser.add_argument('--epochs', type=int, default=800, help='maximum number of epochs') 33 | parser.add_argument('--filename', type=str, default='test.txt', help='store results into file') 34 | 35 | # model specific params 36 | parser.add_argument('--k_cross', type=int, default=10, help='number of edges for cross domains') 37 | parser.add_argument('--k_within', type=int, default=3, help='number of edges for within domains') 38 | 39 | args = parser.parse_args() 40 | 41 | # load data 42 | if args.source in {'arxiv-1950-2016', 'arxiv-2016-2018', 'arxiv-2018-2020'}: 43 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/arxiv', args.source) 44 | source_dataset = ArxivDataset(path, args.source) 45 | elif args.source in {'llm-arxiv-1950-2016', 'llm-arxiv-2016-2018', 'llm-arxiv-2018-2020'}: 46 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/llm-arxiv', args.source) 47 | source_dataset = ArxivDataset(path, args.source) 48 | elif args.source in {'llm-bert-arxiv-1950-2016', 'llm-bert-arxiv-2016-2018', 'llm-bert-arxiv-2018-2020'}: 49 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/bert-arxiv', args.source) 50 | source_dataset = ArxivDataset(path, args.source) 51 | 52 | if args.target in 
{'arxiv-1950-2016', 'arxiv-2016-2018', 'arxiv-2018-2020'}: 53 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/arxiv', args.target) 54 | target_dataset = ArxivDataset(path, args.target) 55 | elif args.target in {'llm-arxiv-1950-2016', 'llm-arxiv-2016-2018', 'llm-arxiv-2018-2020'}: 56 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/llm-arxiv', args.target) 57 | target_dataset = ArxivDataset(path, args.target) 58 | elif args.target in {'llm-bert-arxiv-1950-2016', 'llm-bert-arxiv-2016-2018', 'llm-bert-arxiv-2018-2020'}: 59 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/bert-arxiv', args.target) 60 | target_dataset = ArxivDataset(path, args.target) 61 | 62 | source_data = source_dataset[0].to(args.device) 63 | target_data = target_dataset[0].to(args.device) 64 | 65 | if not is_undirected(source_data.edge_index): 66 | source_data.edge_index = to_undirected(source_data.edge_index) 67 | 68 | if not is_undirected(target_data.edge_index): 69 | target_data.edge_index = to_undirected(target_data.edge_index) 70 | 71 | num_features = source_data.x.size(1) 72 | num_classes = len(np.unique(source_data.y.cpu().numpy())) 73 | 74 | # choose a graph domain adaptation model 75 | model = KBL( 76 | in_dim=num_features, 77 | hid_dim=args.nhid, 78 | num_classes=num_classes, 79 | num_layers=args.num_layers, 80 | weight_decay=args.weight_decay, 81 | lr=args.lr, 82 | dropout=args.dropout_ratio, 83 | epoch=args.epochs, 84 | device=args.device, 85 | k_cross=args.k_cross, 86 | k_within=args.k_within 87 | ) 88 | 89 | # train the model 90 | model.fit(source_data, target_data) 91 | 92 | # evaluate the performance 93 | logits, labels = model.predict(target_data) 94 | 95 | preds = logits.argmax(dim=1) 96 | 97 | mi_f1 = eval_micro_f1(labels, preds) 98 | ma_f1 = eval_macro_f1(labels, preds) 99 | 100 | if args.source in {'DE', 'EN', 'ES', 'FR', 'PT', 'RU'}: 101 | auc = eval_roc_auc(labels, logits[:, 1]) 102 | else: 103 | auc = 0.0 104 | 105 | results = 'kbl,source,' + args.source + ',target,' + args.target + ',micro-f1,' + str(mi_f1) + ',macro-f1,' + str(ma_f1) + ',auc,' + str(auc) 106 | 107 | with open(args.filename, 'a+') as f: 108 | f.write(results + '\n') 109 | 110 | print(results) -------------------------------------------------------------------------------- /benchmark/llm/udagcn.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('..') 3 | 4 | import torch 5 | import argparse 6 | import time 7 | import os.path as osp 8 | import numpy as np 9 | 10 | from pygda.datasets import ArxivDataset 11 | 12 | from pygda.models import UDAGCN 13 | 14 | from pygda.metrics import eval_micro_f1, eval_macro_f1, eval_roc_auc 15 | 16 | from torch_geometric.utils import degree, is_undirected, to_undirected 17 | from torch_geometric.transforms import OneHotDegree 18 | 19 | parser = argparse.ArgumentParser() 20 | 21 | # model agnostic params 22 | parser.add_argument('--seed', type=int, default=200, help='random seed') 23 | parser.add_argument('--num_layers', type=int, default=3, help='number of layers') 24 | parser.add_argument('--lr', type=float, default=0.001, help='learning rate') 25 | parser.add_argument('--weight_decay', type=float, default=0.005, help='weight decay') 26 | parser.add_argument('--nhid', type=int, default=128, help='hidden size') 27 | parser.add_argument('--dropout_ratio', type=float, default=0.1, help='dropout ratio') 28 | parser.add_argument('--device', type=str, default='cuda:1', help='specify 
cuda devices') 29 | parser.add_argument('--source', type=str, default='llm-bert-arxiv-1950-2016', help='source domain data, DBLPv7/ACMv9/Citationv1') 30 | parser.add_argument('--target', type=str, default='llm-bert-arxiv-2018-2020', help='target domain data, DBLPv7/ACMv9/Citationv1') 31 | parser.add_argument('--epochs', type=int, default=200, help='maximum number of epochs') 32 | parser.add_argument('--filename', type=str, default='test.txt', help='store results into file') 33 | 34 | # model specific params 35 | parser.add_argument('--ppmi', type=bool, default=True, help='use PPMI matrix or not') 36 | parser.add_argument('--adv_dim', type=int, default=40, help='hidden dimension of adversarial module') 37 | 38 | args = parser.parse_args() 39 | 40 | # load data 41 | if args.source in {'arxiv-1950-2016', 'arxiv-2016-2018', 'arxiv-2018-2020'}: 42 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/arxiv', args.source) 43 | source_dataset = ArxivDataset(path, args.source) 44 | elif args.source in {'llm-arxiv-1950-2016', 'llm-arxiv-2016-2018', 'llm-arxiv-2018-2020'}: 45 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/llm-arxiv', args.source) 46 | source_dataset = ArxivDataset(path, args.source) 47 | elif args.source in {'llm-bert-arxiv-1950-2016', 'llm-bert-arxiv-2016-2018', 'llm-bert-arxiv-2018-2020'}: 48 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/bert-arxiv', args.source) 49 | source_dataset = ArxivDataset(path, args.source) 50 | 51 | if args.target in {'arxiv-1950-2016', 'arxiv-2016-2018', 'arxiv-2018-2020'}: 52 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/arxiv', args.target) 53 | target_dataset = ArxivDataset(path, args.target) 54 | elif args.target in {'llm-arxiv-1950-2016', 'llm-arxiv-2016-2018', 'llm-arxiv-2018-2020'}: 55 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/llm-arxiv', args.target) 56 | target_dataset = ArxivDataset(path, args.target) 57 | elif args.target in {'llm-bert-arxiv-1950-2016', 'llm-bert-arxiv-2016-2018', 'llm-bert-arxiv-2018-2020'}: 58 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/bert-arxiv', args.target) 59 | target_dataset = ArxivDataset(path, args.target) 60 | 61 | source_data = source_dataset[0].to(args.device) 62 | target_data = target_dataset[0].to(args.device) 63 | 64 | if args.source not in {'DE', 'EN', 'ES', 'FR', 'PT', 'RU'}: 65 | if not is_undirected(source_data.edge_index): 66 | source_data.edge_index = to_undirected(source_data.edge_index) 67 | 68 | if not is_undirected(target_data.edge_index): 69 | target_data.edge_index = to_undirected(target_data.edge_index) 70 | 71 | num_features = source_data.x.size(1) 72 | num_classes = len(np.unique(source_data.y.cpu().numpy())) 73 | 74 | # choose a graph domain adaptation model 75 | model = UDAGCN( 76 | in_dim=num_features, 77 | hid_dim=args.nhid, 78 | num_classes=num_classes, 79 | num_layers=args.num_layers, 80 | weight_decay=args.weight_decay, 81 | lr=args.lr, 82 | dropout=args.dropout_ratio, 83 | epoch=args.epochs, 84 | device=args.device, 85 | ppmi=args.ppmi, 86 | adv_dim=args.adv_dim 87 | ) 88 | 89 | # train the model 90 | model.fit(source_data, target_data) 91 | 92 | # evaluate the performance 93 | logits, labels = model.predict(target_data) 94 | 95 | preds = logits.argmax(dim=1) 96 | 97 | mi_f1 = eval_micro_f1(labels, preds) 98 | ma_f1 = eval_macro_f1(labels, preds) 99 | 100 | if args.source in {'DE', 'EN', 'ES', 'FR', 'PT', 'RU'}: 101 | auc = eval_roc_auc(labels, logits[:, 1]) 102 | 
else: 103 | auc = 0.0 104 | 105 | results = 'udagcn,source,' + args.source + ',target,' + args.target + ',micro-f1,' + str(mi_f1) + ',macro-f1,' + str(ma_f1) + ',auc,' + str(auc) 106 | 107 | with open(args.filename, 'a+') as f: 108 | f.write(results + '\n') 109 | 110 | print(results) -------------------------------------------------------------------------------- /benchmark/llm/grade.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('..') 3 | 4 | import torch 5 | import argparse 6 | import time 7 | import os.path as osp 8 | import numpy as np 9 | 10 | from pygda.datasets import ArxivDataset 11 | 12 | from pygda.models import GRADE 13 | 14 | from pygda.metrics import eval_micro_f1, eval_macro_f1, eval_roc_auc 15 | 16 | from torch_geometric.loader import NeighborLoader 17 | from torch_geometric.utils import degree, is_undirected, to_undirected 18 | from torch_geometric.transforms import OneHotDegree 19 | 20 | parser = argparse.ArgumentParser() 21 | 22 | # model agnostic params 23 | parser.add_argument('--seed', type=int, default=200, help='random seed') 24 | parser.add_argument('--num_layers', type=int, default=3, help='number of layers') 25 | parser.add_argument('--lr', type=float, default=0.001, help='learning rate') 26 | parser.add_argument('--weight_decay', type=float, default=0.0, help='weight decay') 27 | parser.add_argument('--nhid', type=int, default=128, help='hidden size') 28 | parser.add_argument('--dropout_ratio', type=float, default=0.1, help='dropout ratio') 29 | parser.add_argument('--device', type=str, default='cuda:3', help='specify cuda devices') 30 | parser.add_argument('--source', type=str, default='arxiv-1950-2016', help='source domain data, DBLPv7/ACMv9/Citationv1') 31 | parser.add_argument('--target', type=str, default='arxiv-2016-2018', help='target domain data, DBLPv7/ACMv9/Citationv1') 32 | parser.add_argument('--epochs', type=int, default=800, help='maximum number of epochs') 33 | parser.add_argument('--filename', type=str, default='test.txt', help='store results into file') 34 | 35 | # model specific params 36 | parser.add_argument('--disc', type=str, default='JS', help='discriminator') 37 | parser.add_argument('--weight', type=float, default=0.01, help='trade off parameter for loss') 38 | 39 | args = parser.parse_args() 40 | 41 | # load data 42 | if args.source in {'arxiv-1950-2016', 'arxiv-2016-2018', 'arxiv-2018-2020'}: 43 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/arxiv', args.source) 44 | source_dataset = ArxivDataset(path, args.source) 45 | elif args.source in {'llm-arxiv-1950-2016', 'llm-arxiv-2016-2018', 'llm-arxiv-2018-2020'}: 46 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/llm-arxiv', args.source) 47 | source_dataset = ArxivDataset(path, args.source) 48 | elif args.source in {'llm-bert-arxiv-1950-2016', 'llm-bert-arxiv-2016-2018', 'llm-bert-arxiv-2018-2020'}: 49 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/bert-arxiv', args.source) 50 | source_dataset = ArxivDataset(path, args.source) 51 | 52 | if args.target in {'arxiv-1950-2016', 'arxiv-2016-2018', 'arxiv-2018-2020'}: 53 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/arxiv', args.target) 54 | target_dataset = ArxivDataset(path, args.target) 55 | elif args.target in {'llm-arxiv-1950-2016', 'llm-arxiv-2016-2018', 'llm-arxiv-2018-2020'}: 56 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/llm-arxiv', args.target) 57 | target_dataset 
= ArxivDataset(path, args.target) 58 | elif args.target in {'llm-bert-arxiv-1950-2016', 'llm-bert-arxiv-2016-2018', 'llm-bert-arxiv-2018-2020'}: 59 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/bert-arxiv', args.target) 60 | target_dataset = ArxivDataset(path, args.target) 61 | 62 | source_data = source_dataset[0].to(args.device) 63 | target_data = target_dataset[0].to(args.device) 64 | 65 | if args.source not in {'DE', 'EN', 'ES', 'FR', 'PT', 'RU'}: 66 | if not is_undirected(source_data.edge_index): 67 | source_data.edge_index = to_undirected(source_data.edge_index) 68 | 69 | if not is_undirected(target_data.edge_index): 70 | target_data.edge_index = to_undirected(target_data.edge_index) 71 | 72 | num_features = source_data.x.size(1) 73 | num_classes = len(np.unique(source_data.y.cpu().numpy())) 74 | 75 | # choose a graph domain adaptation model 76 | model = GRADE( 77 | in_dim=num_features, 78 | hid_dim=args.nhid, 79 | num_classes=num_classes, 80 | num_layers=args.num_layers, 81 | weight_decay=args.weight_decay, 82 | lr=args.lr, 83 | dropout=args.dropout_ratio, 84 | epoch=args.epochs, 85 | device=args.device, 86 | disc=args.disc, 87 | weight=args.weight 88 | ) 89 | 90 | # train the model 91 | model.fit(source_data, target_data) 92 | 93 | # evaluate the performance 94 | logits, labels = model.predict(target_data) 95 | 96 | preds = logits.argmax(dim=1) 97 | 98 | mi_f1 = eval_micro_f1(labels, preds) 99 | ma_f1 = eval_macro_f1(labels, preds) 100 | 101 | if args.source in {'DE', 'EN', 'ES', 'FR', 'PT', 'RU'}: 102 | auc = eval_roc_auc(labels, logits[:, 1]) 103 | else: 104 | auc = 0.0 105 | 106 | results = 'grade,source,' + args.source + ',target,' + args.target + ',micro-f1,' + str(mi_f1) + ',macro-f1,' + str(ma_f1) + ',auc,' + str(auc) 107 | 108 | with open(args.filename, 'a+') as f: 109 | f.write(results + '\n') 110 | 111 | print(results) -------------------------------------------------------------------------------- /benchmark/graph/cwgcn.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('..') 3 | 4 | import torch 5 | import argparse 6 | import time 7 | import os.path as osp 8 | import numpy as np 9 | 10 | from pygda.datasets import GraphTUDataset 11 | 12 | from pygda.models import CWGCN 13 | 14 | from pygda.metrics import eval_micro_f1, eval_macro_f1, eval_roc_auc 15 | 16 | from torch_geometric.loader import NeighborLoader 17 | from torch_geometric.utils import degree, is_undirected, to_undirected 18 | from torch_geometric.transforms import OneHotDegree 19 | 20 | parser = argparse.ArgumentParser() 21 | 22 | # model agnostic params 23 | parser.add_argument('--seed', type=int, default=200, help='random seed') 24 | parser.add_argument('--num_layers', type=int, default=2, help='number of layers') 25 | parser.add_argument('--lr', type=float, default=0.001, help='learning rate') 26 | parser.add_argument('--weight_decay', type=float, default=0.0, help='weight decay') 27 | parser.add_argument('--nhid', type=int, default=128, help='hidden size') 28 | parser.add_argument('--dropout_ratio', type=float, default=0.1, help='dropout ratio') 29 | parser.add_argument('--device', type=str, default='cuda:1', help='specify cuda devices') 30 | parser.add_argument('--source', type=str, default='FRANKENSTEIN_F1', help='source domain data, DBLPv7/ACMv9/Citationv1') 31 | parser.add_argument('--target', type=str, default='FRANKENSTEIN_F2', help='target domain data, DBLPv7/ACMv9/Citationv1') 32 | 
parser.add_argument('--epochs', type=int, default=200, help='maximum number of epochs') 33 | parser.add_argument('--filename', type=str, default='test.txt', help='store results into file') 34 | 35 | # model specific params 36 | parser.add_argument('--gnn', type=str, default='gcn', help='GNN backbone') 37 | parser.add_argument('--mode', type=str, default='graph', help='node or graph tasks') 38 | 39 | args = parser.parse_args() 40 | 41 | # load data 42 | if args.source in {'FRANKENSTEIN_F1', 'FRANKENSTEIN_F2'}: 43 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/FRANKENSTEIN', args.source) 44 | source_dataset = GraphTUDataset(path, args.source) 45 | elif args.source in {'PROTEINS_P1', 'PROTEINS_P2'}: 46 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/PROTEINS', args.source) 47 | source_dataset = GraphTUDataset(path, args.source) 48 | elif args.source in {'Mutagenicity_M1', 'Mutagenicity_M2'}: 49 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/Mutagenicity', args.source) 50 | source_dataset = GraphTUDataset(path, args.source) 51 | 52 | if args.target in {'FRANKENSTEIN_F1', 'FRANKENSTEIN_F2'}: 53 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/FRANKENSTEIN', args.target) 54 | target_dataset = GraphTUDataset(path, args.target) 55 | elif args.target in {'PROTEINS_P1', 'PROTEINS_P2'}: 56 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/PROTEINS', args.target) 57 | target_dataset = GraphTUDataset(path, args.target) 58 | elif args.target in {'Mutagenicity_M1', 'Mutagenicity_M2'}: 59 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/Mutagenicity', args.target) 60 | target_dataset = GraphTUDataset(path, args.target) 61 | 62 | if args.mode == 'node': 63 | source_data = source_dataset[0].to(args.device) 64 | target_data = target_dataset[0].to(args.device) 65 | 66 | num_features = source_data.x.size(1) 67 | num_classes = len(np.unique(source_data.y.cpu().numpy())) 68 | 69 | if args.source not in {'DE', 'EN', 'ES', 'FR', 'PT', 'RU'}: 70 | if not is_undirected(source_data.edge_index): 71 | source_data.edge_index = to_undirected(source_data.edge_index) 72 | 73 | if not is_undirected(target_data.edge_index): 74 | target_data.edge_index = to_undirected(target_data.edge_index) 75 | elif args.mode == 'graph': 76 | source_data = source_dataset 77 | target_data = target_dataset 78 | 79 | num_features = source_data.num_features 80 | num_classes = source_data.num_classes 81 | 82 | # choose a graph domain adaptation model 83 | model = CWGCN( 84 | in_dim=num_features, 85 | hid_dim=args.nhid, 86 | num_classes=num_classes, 87 | mode=args.mode, 88 | num_layers=args.num_layers, 89 | weight_decay=args.weight_decay, 90 | lr=args.lr, 91 | dropout=args.dropout_ratio, 92 | epoch=args.epochs, 93 | device=args.device, 94 | gnn=args.gnn 95 | ) 96 | 97 | # train the model 98 | model.fit(source_data, target_data) 99 | 100 | # evaluate the performance 101 | logits, labels = model.predict(target_data) 102 | 103 | preds = logits.argmax(dim=1) 104 | 105 | mi_f1 = eval_micro_f1(labels, preds) 106 | ma_f1 = eval_macro_f1(labels, preds) 107 | 108 | if args.source in {'DE', 'EN', 'ES', 'FR', 'PT', 'RU'}: 109 | auc = eval_roc_auc(labels, logits[:, 1]) 110 | else: 111 | auc = 0.0 112 | 113 | results = 'cwgcn,source,' + args.source + ',target,' + args.target + ',micro-f1,' + str(mi_f1) + ',macro-f1,' + str(ma_f1) + ',auc,' + str(auc) 114 | 115 | with open(args.filename, 'a+') as f: 116 | f.write(results + '\n') 117 | 118 | 
print(results) -------------------------------------------------------------------------------- /pygda/models/base.py: -------------------------------------------------------------------------------- 1 | import time 2 | from abc import ABC, abstractmethod 3 | 4 | import torch 5 | import numpy as np 6 | import torch.nn.functional as F 7 | 8 | from torch_geometric.loader import NeighborLoader 9 | 10 | from ..utils import logger 11 | 12 | 13 | class BaseGDA(ABC): 14 | """ 15 | Abstract Class for Graph Domain Adaptation. 16 | 17 | Parameters 18 | ---------- 19 | in_dim : int 20 | Input feature dimension. 21 | hid_dim : int 22 | Hidden dimension of model. 23 | num_classes : int 24 | Total number of classes. 25 | num_layers : int, optional 26 | Total number of layers in model. Default: ``2``. 27 | dropout : float, optional 28 | Dropout rate. Default: ``0.``. 29 | weight_decay : float, optional 30 | Weight decay (L2 penalty). Default: ``0.``. 31 | act : callable activation function or None, optional 32 | Activation function if not None. 33 | Default: ``torch.nn.functional.relu``. 34 | lr : float, optional 35 | Learning rate. Default: ``0.004``. 36 | epoch : int, optional 37 | Maximum number of training epochs. Default: ``100``. 38 | device : str, optional 39 | GPU or CPU. Default: ``cuda:0``. 40 | batch_size : int, optional 41 | Minibatch size, 0 for full batch training. Default: ``0``. 42 | num_neigh : int, optional 43 | Number of neighbors in sampling, -1 for all neighbors. 44 | Default: ``-1``. 45 | verbose : int, optional 46 | Verbosity mode. Range in [0, 3]. Larger value for printing out 47 | more log information. Default: ``2``. 48 | **kwargs 49 | Other parameters for the model. 50 | """ 51 | 52 | def __init__( 53 | self, 54 | in_dim, 55 | hid_dim, 56 | num_classes, 57 | num_layers=2, 58 | dropout=0., 59 | weight_decay=0., 60 | act=F.relu, 61 | lr=4e-3, 62 | epoch=100, 63 | device='cuda:0', 64 | batch_size=0, 65 | num_neigh=-1, 66 | verbose=2, 67 | **kwargs): 68 | 69 | super(BaseGDA, self).__init__() 70 | 71 | self.in_dim = in_dim 72 | self.hid_dim = hid_dim 73 | self.num_classes = num_classes 74 | self.num_layers = num_layers 75 | self.dropout = dropout 76 | self.weight_decay = weight_decay 77 | self.act = act 78 | self.verbose = verbose 79 | self.kwargs = kwargs 80 | 81 | self.lr = lr 82 | self.epoch = epoch 83 | self.device = device 84 | self.batch_size = batch_size 85 | 86 | if type(num_neigh) is int: 87 | self.num_neigh = [num_neigh] * self.num_layers 88 | elif type(num_neigh) is list: 89 | if len(num_neigh) != self.num_layers: 90 | raise ValueError('Number of neighbors should have the ' 91 | 'same length as hidden layers dimension or ' 92 | 'the number of layers.') 93 | self.num_neigh = num_neigh 94 | else: 95 | raise ValueError('Number of neighbors must be int or list of int') 96 | 97 | self.model = None 98 | 99 | def fit(self, data, **kwargs): 100 | """ 101 | Training the graph neural network. 102 | 103 | Parameters 104 | ---------- 105 | data : torch_geometric.data.Data 106 | The input graph. 107 | """ 108 | 109 | 110 | def predict(self, data, **kwargs): 111 | """Prediction on the testing graph using the fitted graph domain adaptation model. 112 | Returns the output logits and ground-truth labels by default. 113 | 114 | Parameters 115 | ---------- 116 | data : torch_geometric.data.Data 117 | The testing graph. 118 | 119 | Returns 120 | ------- 121 | logits : torch.Tensor 122 | The output logits of shape :math:`(N, C)` for :math:`C` classes. 123 | labels : torch.Tensor 124 | The ground-truth labels of shape :math:`N`.
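Examples -------- A minimal usage sketch with a concrete subclass (``UDAGCN`` here; ``source_data``, ``target_data``, ``num_features`` and ``num_classes`` are placeholders prepared as in the benchmark scripts): >>> from pygda.models import UDAGCN >>> model = UDAGCN(in_dim=num_features, hid_dim=128, num_classes=num_classes) >>> model.fit(source_data, target_data) >>> logits, labels = model.predict(target_data) >>> preds = logits.argmax(dim=1)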
125 | """ 126 | 127 | @abstractmethod 128 | def init_model(self, **kwargs): 129 | """ 130 | Initialize the graph neural network. 131 | 132 | Returns 133 | ------- 134 | model : torch.nn.Module 135 | The initialized graph neural network. 136 | """ 137 | 138 | @abstractmethod 139 | def process_graph(self, data, **kwargs): 140 | """ 141 | Data preprocessing for the input graph. 142 | 143 | Parameters 144 | ---------- 145 | data : torch_geometric.data.Data 146 | The input graph. 147 | """ 148 | 149 | @abstractmethod 150 | def forward_model(self, data, **kwargs): 151 | """ 152 | Forward pass of the graph neural network. 153 | 154 | Parameters 155 | ---------- 156 | data : torch_geometric.data.Data 157 | The input graph. 158 | 159 | Returns 160 | ------- 161 | loss : torch.Tensor 162 | The loss of the current batch. 163 | """ -------------------------------------------------------------------------------- /benchmark/graph/grade.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('..') 3 | 4 | import torch 5 | import argparse 6 | import time 7 | import os.path as osp 8 | import numpy as np 9 | 10 | from pygda.datasets import GraphTUDataset 11 | 12 | from pygda.models import GRADE 13 | 14 | from pygda.metrics import eval_micro_f1, eval_macro_f1, eval_roc_auc 15 | 16 | from torch_geometric.loader import NeighborLoader 17 | from torch_geometric.utils import degree, is_undirected, to_undirected 18 | from torch_geometric.transforms import OneHotDegree 19 | 20 | parser = argparse.ArgumentParser() 21 | 22 | # model agnostic params 23 | parser.add_argument('--seed', type=int, default=200, help='random seed') 24 | parser.add_argument('--num_layers', type=int, default=3, help='number of layers') 25 | parser.add_argument('--lr', type=float, default=0.001, help='learning rate') 26 | parser.add_argument('--weight_decay', type=float, default=0.0, help='weight decay') 27 | parser.add_argument('--nhid', type=int, default=128, help='hidden size') 28 | parser.add_argument('--dropout_ratio', type=float, default=0.1, help='dropout ratio') 29 | parser.add_argument('--device', type=str, default='cuda:1', help='specify cuda devices') 30 | parser.add_argument('--source', type=str, default='FRANKENSTEIN_F1', help='source domain data, DBLPv7/ACMv9/Citationv1') 31 | parser.add_argument('--target', type=str, default='FRANKENSTEIN_F2', help='target domain data, DBLPv7/ACMv9/Citationv1') 32 | parser.add_argument('--epochs', type=int, default=200, help='maximum number of epochs') 33 | parser.add_argument('--filename', type=str, default='test.txt', help='store results into file') 34 | 35 | # model specific params 36 | parser.add_argument('--disc', type=str, default='JS', help='discriminator') 37 | parser.add_argument('--weight', type=float, default=0.01, help='trade off parameter for loss') 38 | parser.add_argument('--mode', type=str, default='graph', help='node or graph tasks') 39 | 40 | args = parser.parse_args() 41 | 42 | # load data 43 | if args.source in {'FRANKENSTEIN_F1', 'FRANKENSTEIN_F2'}: 44 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/FRANKENSTEIN', args.source) 45 | source_dataset = GraphTUDataset(path, args.source) 46 | elif args.source in {'PROTEINS_P1', 'PROTEINS_P2'}: 47 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/PROTEINS', args.source) 48 | source_dataset = GraphTUDataset(path, args.source) 49 | elif args.source in {'Mutagenicity_M1', 'Mutagenicity_M2'}: 50 | path = 
osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/Mutagenicity', args.source) 51 | source_dataset = GraphTUDataset(path, args.source) 52 | 53 | if args.target in {'FRANKENSTEIN_F1', 'FRANKENSTEIN_F2'}: 54 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/FRANKENSTEIN', args.target) 55 | target_dataset = GraphTUDataset(path, args.target) 56 | elif args.target in {'PROTEINS_P1', 'PROTEINS_P2'}: 57 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/PROTEINS', args.target) 58 | target_dataset = GraphTUDataset(path, args.target) 59 | elif args.target in {'Mutagenicity_M1', 'Mutagenicity_M2'}: 60 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/Mutagenicity', args.target) 61 | target_dataset = GraphTUDataset(path, args.target) 62 | 63 | if args.mode == 'node': 64 | source_data = source_dataset[0].to(args.device) 65 | target_data = target_dataset[0].to(args.device) 66 | 67 | num_features = source_data.x.size(1) 68 | num_classes = len(np.unique(source_data.y.cpu().numpy())) 69 | 70 | if args.source not in {'DE', 'EN', 'ES', 'FR', 'PT', 'RU'}: 71 | if not is_undirected(source_data.edge_index): 72 | source_data.edge_index = to_undirected(source_data.edge_index) 73 | 74 | if not is_undirected(target_data.edge_index): 75 | target_data.edge_index = to_undirected(target_data.edge_index) 76 | elif args.mode == 'graph': 77 | source_data = source_dataset 78 | target_data = target_dataset 79 | 80 | num_features = source_data.num_features 81 | num_classes = source_data.num_classes 82 | 83 | # choose a graph domain adaptation model 84 | model = GRADE( 85 | in_dim=num_features, 86 | hid_dim=args.nhid, 87 | num_classes=num_classes, 88 | mode=args.mode, 89 | num_layers=args.num_layers, 90 | weight_decay=args.weight_decay, 91 | lr=args.lr, 92 | dropout=args.dropout_ratio, 93 | epoch=args.epochs, 94 | device=args.device, 95 | disc=args.disc, 96 | weight=args.weight 97 | ) 98 | 99 | # train the model 100 | model.fit(source_data, target_data) 101 | 102 | # evaluate the performance 103 | logits, labels = model.predict(target_data) 104 | 105 | preds = logits.argmax(dim=1) 106 | 107 | mi_f1 = eval_micro_f1(labels, preds) 108 | ma_f1 = eval_macro_f1(labels, preds) 109 | 110 | if args.source in {'DE', 'EN', 'ES', 'FR', 'PT', 'RU'}: 111 | auc = eval_roc_auc(labels, logits[:, 1]) 112 | else: 113 | auc = 0.0 114 | 115 | results = 'grade,source,' + args.source + ',target,' + args.target + ',micro-f1,' + str(mi_f1) + ',macro-f1,' + str(ma_f1) + ',auc,' + str(auc) 116 | 117 | with open(args.filename, 'a+') as f: 118 | f.write(results + '\n') 119 | 120 | print(results) -------------------------------------------------------------------------------- /benchmark/llm/a2gnn.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('..') 3 | 4 | import torch 5 | import argparse 6 | import time 7 | import os.path as osp 8 | import numpy as np 9 | 10 | from pygda.datasets import ArxivDataset 11 | 12 | from pygda.models import A2GNN 13 | from pygda.metrics import eval_micro_f1, eval_macro_f1, eval_roc_auc 14 | 15 | from torch_geometric.utils import degree, is_undirected, to_undirected 16 | from torch_geometric.transforms import OneHotDegree 17 | 18 | parser = argparse.ArgumentParser() 19 | 20 | # model agnostic params 21 | parser.add_argument('--seed', type=int, default=200, help='random seed') 22 | parser.add_argument('--num_layers', type=int, default=3, help='number of layers') 23 | parser.add_argument('--lr', 
type=float, default=0.001, help='learning rate') 24 | parser.add_argument('--weight_decay', type=float, default=0.0, help='weight decay') 25 | parser.add_argument('--nhid', type=int, default=128, help='hidden size') 26 | parser.add_argument('--dropout_ratio', type=float, default=0.1, help='dropout ratio') 27 | parser.add_argument('--device', type=str, default='cuda:1', help='specify cuda devices') 28 | parser.add_argument('--source', type=str, default='llm-bert-arxiv-1950-2016', help='source domain data, DBLPv7/ACMv9/Citationv1') 29 | parser.add_argument('--target', type=str, default='llm-bert-arxiv-2016-2018', help='target domain data, DBLPv7/ACMv9/Citationv1') 30 | parser.add_argument('--epochs', type=int, default=200, help='maximum number of epochs') 31 | parser.add_argument('--filename', type=str, default='test.txt', help='store results into file') 32 | 33 | # model specific params 34 | parser.add_argument('--adv', type=bool, default=False, help='adversarial training or not') 35 | parser.add_argument('--weight', type=float, default=0.1, help='trade-off parameter for loss') 36 | parser.add_argument('--s_pnums', type=int, default=0, help='propagation for source models') 37 | parser.add_argument('--t_pnums', type=int, default=20, help='propagation for target models') 38 | 39 | args = parser.parse_args() 40 | 41 | # load data 42 | if args.source in {'arxiv-1950-2016', 'arxiv-2016-2018', 'arxiv-2018-2020'}: 43 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/arxiv', args.source) 44 | source_dataset = ArxivDataset(path, args.source) 45 | elif args.source in {'llm-arxiv-1950-2016', 'llm-arxiv-2016-2018', 'llm-arxiv-2018-2020'}: 46 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/llm-arxiv', args.source) 47 | source_dataset = ArxivDataset(path, args.source) 48 | elif args.source in {'llm-bert-arxiv-1950-2016', 'llm-bert-arxiv-2016-2018', 'llm-bert-arxiv-2018-2020'}: 49 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/bert-arxiv', args.source) 50 | source_dataset = ArxivDataset(path, args.source) 51 | 52 | if args.target in {'arxiv-1950-2016', 'arxiv-2016-2018', 'arxiv-2018-2020'}: 53 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/arxiv', args.target) 54 | target_dataset = ArxivDataset(path, args.target) 55 | elif args.target in {'llm-arxiv-1950-2016', 'llm-arxiv-2016-2018', 'llm-arxiv-2018-2020'}: 56 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/llm-arxiv', args.target) 57 | target_dataset = ArxivDataset(path, args.target) 58 | elif args.target in {'llm-bert-arxiv-1950-2016', 'llm-bert-arxiv-2016-2018', 'llm-bert-arxiv-2018-2020'}: 59 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/bert-arxiv', args.target) 60 | target_dataset = ArxivDataset(path, args.target) 61 | 62 | source_data = source_dataset[0].to(args.device) 63 | target_data = target_dataset[0].to(args.device) 64 | 65 | if args.source not in {'DE', 'EN', 'ES', 'FR', 'PT', 'RU'}: 66 | if not is_undirected(source_data.edge_index): 67 | source_data.edge_index = to_undirected(source_data.edge_index) 68 | 69 | if not is_undirected(target_data.edge_index): 70 | target_data.edge_index = to_undirected(target_data.edge_index) 71 | 72 | num_features = source_data.x.size(1) 73 | num_classes = len(np.unique(source_data.y.cpu().numpy())) 74 | 75 | # choose a graph domain adaptation model 76 | model = A2GNN( 77 | in_dim=num_features, 78 | hid_dim=args.nhid, 79 | num_classes=num_classes, 80 | num_layers=args.num_layers, 81 | 
weight_decay=args.weight_decay, 82 | lr=args.lr, 83 | dropout=args.dropout_ratio, 84 | epoch=args.epochs, 85 | device=args.device, 86 | weight=args.weight, 87 | adv=args.adv, 88 | s_pnums=args.s_pnums, 89 | t_pnums=args.t_pnums 90 | ) 91 | 92 | # train the model 93 | model.fit(source_data, target_data) 94 | 95 | # evaluate the performance 96 | logits, labels = model.predict(target_data) 97 | 98 | preds = logits.argmax(dim=1) 99 | 100 | mi_f1 = eval_micro_f1(labels, preds) 101 | ma_f1 = eval_macro_f1(labels, preds) 102 | 103 | if args.source in {'DE', 'EN', 'ES', 'FR', 'PT', 'RU'}: 104 | auc = eval_roc_auc(labels, logits[:, 1]) 105 | else: 106 | auc = 0.0 107 | 108 | results = 'a2gnn,source,' + args.source + ',target,' + args.target + ',micro-f1,' + str(mi_f1) + ',macro-f1,' + str(ma_f1) + ',auc,' + str(auc) 109 | 110 | with open(args.filename, 'a+') as f: 111 | f.write(results + '\n') 112 | 113 | print(results) 114 | -------------------------------------------------------------------------------- /benchmark/graph/udagcn.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('..') 3 | 4 | import torch 5 | import argparse 6 | import time 7 | import os.path as osp 8 | import numpy as np 9 | 10 | from pygda.datasets import GraphTUDataset 11 | 12 | from pygda.models import UDAGCN 13 | 14 | from pygda.metrics import eval_micro_f1, eval_macro_f1, eval_roc_auc 15 | 16 | from torch_geometric.loader import NeighborLoader 17 | from torch_geometric.utils import degree, is_undirected, to_undirected 18 | from torch_geometric.transforms import OneHotDegree 19 | 20 | parser = argparse.ArgumentParser() 21 | 22 | # model agnostic params 23 | parser.add_argument('--seed', type=int, default=200, help='random seed') 24 | parser.add_argument('--num_layers', type=int, default=3, help='number of layers') 25 | parser.add_argument('--lr', type=float, default=0.001, help='learning rate') 26 | parser.add_argument('--weight_decay', type=float, default=0.005, help='weight decay') 27 | parser.add_argument('--nhid', type=int, default=128, help='hidden size') 28 | parser.add_argument('--dropout_ratio', type=float, default=0.1, help='dropout ratio') 29 | parser.add_argument('--device', type=str, default='cuda:1', help='specify cuda devices') 30 | parser.add_argument('--source', type=str, default='FRANKENSTEIN_F1', help='source domain data, DBLPv7/ACMv9/Citationv1') 31 | parser.add_argument('--target', type=str, default='FRANKENSTEIN_F2', help='target domain data, DBLPv7/ACMv9/Citationv1') 32 | parser.add_argument('--epochs', type=int, default=200, help='maximum number of epochs') 33 | parser.add_argument('--filename', type=str, default='test.txt', help='store results into file') 34 | 35 | # model specific params 36 | parser.add_argument('--ppmi', type=bool, default=True, help='use PPMI matrix or not') 37 | parser.add_argument('--adv_dim', type=int, default=40, help='hidden dimension of adversarial module') 38 | parser.add_argument('--mode', type=str, default='graph', help='node or graph tasks') 39 | 40 | args = parser.parse_args() 41 | 42 | # load data 43 | if args.source in {'FRANKENSTEIN_F1', 'FRANKENSTEIN_F2'}: 44 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/FRANKENSTEIN', args.source) 45 | source_dataset = GraphTUDataset(path, args.source) 46 | elif args.source in {'PROTEINS_P1', 'PROTEINS_P2'}: 47 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/PROTEINS', args.source) 48 | source_dataset = GraphTUDataset(path, 
args.source) 49 | elif args.source in {'Mutagenicity_M1', 'Mutagenicity_M2'}: 50 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/Mutagenicity', args.source) 51 | source_dataset = GraphTUDataset(path, args.source) 52 | 53 | if args.target in {'FRANKENSTEIN_F1', 'FRANKENSTEIN_F2'}: 54 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/FRANKENSTEIN', args.target) 55 | target_dataset = GraphTUDataset(path, args.target) 56 | elif args.target in {'PROTEINS_P1', 'PROTEINS_P2'}: 57 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/PROTEINS', args.target) 58 | target_dataset = GraphTUDataset(path, args.target) 59 | elif args.target in {'Mutagenicity_M1', 'Mutagenicity_M2'}: 60 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/Mutagenicity', args.target) 61 | target_dataset = GraphTUDataset(path, args.target) 62 | 63 | if args.mode == 'node': 64 | source_data = source_dataset[0].to(args.device) 65 | target_data = target_dataset[0].to(args.device) 66 | 67 | num_features = source_data.x.size(1) 68 | num_classes = len(np.unique(source_data.y.cpu().numpy())) 69 | 70 | if args.source not in {'DE', 'EN', 'ES', 'FR', 'PT', 'RU'}: 71 | if not is_undirected(source_data.edge_index): 72 | source_data.edge_index = to_undirected(source_data.edge_index) 73 | 74 | if not is_undirected(target_data.edge_index): 75 | target_data.edge_index = to_undirected(target_data.edge_index) 76 | elif args.mode == 'graph': 77 | source_data = source_dataset 78 | target_data = target_dataset 79 | 80 | num_features = source_data.num_features 81 | num_classes = source_data.num_classes 82 | 83 | # choose a graph domain adaptation model 84 | model = UDAGCN( 85 | in_dim=num_features, 86 | hid_dim=args.nhid, 87 | num_classes=num_classes, 88 | mode=args.mode, 89 | num_layers=args.num_layers, 90 | weight_decay=args.weight_decay, 91 | lr=args.lr, 92 | dropout=args.dropout_ratio, 93 | epoch=args.epochs, 94 | device=args.device, 95 | ppmi=args.ppmi, 96 | adv_dim=args.adv_dim 97 | ) 98 | 99 | # train the model 100 | model.fit(source_data, target_data) 101 | 102 | # evaluate the performance 103 | logits, labels = model.predict(target_data) 104 | 105 | preds = logits.argmax(dim=1) 106 | 107 | mi_f1 = eval_micro_f1(labels, preds) 108 | ma_f1 = eval_macro_f1(labels, preds) 109 | 110 | if args.source in {'DE', 'EN', 'ES', 'FR', 'PT', 'RU'}: 111 | auc = eval_roc_auc(labels, logits[:, 1]) 112 | else: 113 | auc = 0.0 114 | 115 | results = 'udagcn,source,' + args.source + ',target,' + args.target + ',micro-f1,' + str(mi_f1) + ',macro-f1,' + str(ma_f1) + ',auc,' + str(auc) 116 | 117 | with open(args.filename, 'a+') as f: 118 | f.write(results + '\n') 119 | 120 | print(results) -------------------------------------------------------------------------------- /benchmark/llm/adagcn.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('..') 3 | 4 | import torch 5 | import argparse 6 | import time 7 | import os.path as osp 8 | import numpy as np 9 | 10 | from pygda.datasets import ArxivDataset 11 | 12 | from pygda.models import AdaGCN 13 | 14 | from pygda.metrics import eval_micro_f1, eval_macro_f1, eval_roc_auc 15 | 16 | from torch_geometric.loader import NeighborLoader 17 | from torch_geometric.utils import degree, is_undirected, to_undirected 18 | from torch_geometric.transforms import OneHotDegree 19 | 20 | parser = argparse.ArgumentParser() 21 | 22 | # model agnostic params 23 | parser.add_argument('--seed', type=int, 
default=200, help='random seed') 24 | parser.add_argument('--num_layers', type=int, default=3, help='number of layers') 25 | parser.add_argument('--lr', type=float, default=0.001, help='learning rate') 26 | parser.add_argument('--weight_decay', type=float, default=0.0, help='weight decay') 27 | parser.add_argument('--nhid', type=int, default=128, help='hidden size') 28 | parser.add_argument('--dropout_ratio', type=float, default=0.1, help='dropout ratio') 29 | parser.add_argument('--device', type=str, default='cuda:3', help='specify cuda devices') 30 | parser.add_argument('--source', type=str, default='arxiv-1950-2016', help='source domain data, DBLPv7/ACMv9/Citationv1') 31 | parser.add_argument('--target', type=str, default='arxiv-2016-2018', help='target domain data, DBLPv7/ACMv9/Citationv1') 32 | parser.add_argument('--epochs', type=int, default=200, help='maximum number of epochs') 33 | parser.add_argument('--filename', type=str, default='test.txt', help='store results into file') 34 | 35 | # model specific params 36 | parser.add_argument('--gnn_type', type=str, default='gcn', help='use GCN or PPMIConv') 37 | parser.add_argument('--adv_dim', type=int, default=40, help='hidden dimension of adversarial module') 38 | parser.add_argument('--gp_weight', type=float, default=5.0, help='trade off parameter for gradient penalty') 39 | parser.add_argument('--domain_weight', type=float, default=1.0, help='trade off parameter for domain loss') 40 | 41 | 42 | args = parser.parse_args() 43 | 44 | # load data 45 | if args.source in {'arxiv-1950-2016', 'arxiv-2016-2018', 'arxiv-2018-2020'}: 46 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/arxiv', args.source) 47 | source_dataset = ArxivDataset(path, args.source) 48 | elif args.source in {'llm-arxiv-1950-2016', 'llm-arxiv-2016-2018', 'llm-arxiv-2018-2020'}: 49 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/llm-arxiv', args.source) 50 | source_dataset = ArxivDataset(path, args.source) 51 | elif args.source in {'llm-bert-arxiv-1950-2016', 'llm-bert-arxiv-2016-2018', 'llm-bert-arxiv-2018-2020'}: 52 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/bert-arxiv', args.source) 53 | source_dataset = ArxivDataset(path, args.source) 54 | 55 | if args.target in {'arxiv-1950-2016', 'arxiv-2016-2018', 'arxiv-2018-2020'}: 56 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/arxiv', args.target) 57 | target_dataset = ArxivDataset(path, args.target) 58 | elif args.target in {'llm-arxiv-1950-2016', 'llm-arxiv-2016-2018', 'llm-arxiv-2018-2020'}: 59 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/llm-arxiv', args.target) 60 | target_dataset = ArxivDataset(path, args.target) 61 | elif args.target in {'llm-bert-arxiv-1950-2016', 'llm-bert-arxiv-2016-2018', 'llm-bert-arxiv-2018-2020'}: 62 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/bert-arxiv', args.target) 63 | target_dataset = ArxivDataset(path, args.target) 64 | 65 | source_data = source_dataset[0].to(args.device) 66 | target_data = target_dataset[0].to(args.device) 67 | 68 | if args.source not in {'DE', 'EN', 'ES', 'FR', 'PT', 'RU'}: 69 | if not is_undirected(source_data.edge_index): 70 | source_data.edge_index = to_undirected(source_data.edge_index) 71 | 72 | if not is_undirected(target_data.edge_index): 73 | target_data.edge_index = to_undirected(target_data.edge_index) 74 | 75 | num_features = source_data.x.size(1) 76 | num_classes = len(np.unique(source_data.y.cpu().numpy())) 77 | 78 | # choose a 
graph domain adaptation model 79 | model = AdaGCN( 80 | in_dim=num_features, 81 | hid_dim=args.nhid, 82 | num_classes=num_classes, 83 | num_layers=args.num_layers, 84 | weight_decay=args.weight_decay, 85 | lr=args.lr, 86 | dropout=args.dropout_ratio, 87 | epoch=args.epochs, 88 | device=args.device, 89 | gnn_type=args.gnn_type, 90 | adv_dim=args.adv_dim, 91 | gp_weight=args.gp_weight, 92 | domain_weight=args.domain_weight 93 | ) 94 | 95 | # train the model 96 | model.fit(source_data, target_data) 97 | 98 | # evaluate the performance 99 | logits, labels = model.predict(target_data) 100 | 101 | preds = logits.argmax(dim=1) 102 | 103 | mi_f1 = eval_micro_f1(labels, preds) 104 | ma_f1 = eval_macro_f1(labels, preds) 105 | 106 | if args.source in {'DE', 'EN', 'ES', 'FR', 'PT', 'RU'}: 107 | auc = eval_roc_auc(labels, logits[:, 1]) 108 | else: 109 | auc = 0.0 110 | 111 | results = 'adagcn,source,' + args.source + ',target,' + args.target + ',micro-f1,' + str(mi_f1) + ',macro-f1,' + str(ma_f1) + ',auc,' + str(auc) 112 | 113 | with open(args.filename, 'a+') as f: 114 | f.write(results + '\n') 115 | 116 | print(results) -------------------------------------------------------------------------------- /benchmark/graph/dane.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('..') 3 | 4 | import torch 5 | import argparse 6 | import time 7 | import os.path as osp 8 | import numpy as np 9 | 10 | from pygda.datasets import GraphTUDataset 11 | 12 | from pygda.models import DANE 13 | 14 | from pygda.metrics import eval_micro_f1, eval_macro_f1, eval_roc_auc 15 | 16 | from torch_geometric.loader import NeighborLoader 17 | from torch_geometric.utils import degree, is_undirected, to_undirected 18 | from torch_geometric.transforms import OneHotDegree 19 | 20 | parser = argparse.ArgumentParser() 21 | 22 | # model agnostic params 23 | parser.add_argument('--seed', type=int, default=200, help='random seed') 24 | parser.add_argument('--num_layers', type=int, default=2, help='number of layers') 25 | parser.add_argument('--lr', type=float, default=0.001, help='learning rate') 26 | parser.add_argument('--weight_decay', type=float, default=0.0, help='weight decay') 27 | parser.add_argument('--nhid', type=int, default=128, help='hidden size') 28 | parser.add_argument('--dropout_ratio', type=float, default=0.1, help='dropout ratio') 29 | parser.add_argument('--device', type=str, default='cuda:1', help='specify cuda devices') 30 | parser.add_argument('--source', type=str, default='FRANKENSTEIN_F1', help='source domain data, DBLPv7/ACMv9/Citationv1') 31 | parser.add_argument('--target', type=str, default='FRANKENSTEIN_F2', help='target domain data, DBLPv7/ACMv9/Citationv1') 32 | parser.add_argument('--epochs', type=int, default=200, help='maximum number of epochs') 33 | parser.add_argument('--filename', type=str, default='test.txt', help='store results into file') 34 | 35 | # model specific params 36 | parser.add_argument('--gnn', type=str, default='gcn', help='GNN backbone') 37 | parser.add_argument('--train_mode', type=str, default='unsup', help='unsupervised or semi-supervised') 38 | parser.add_argument('--k', type=int, default=5, help='number of negative samples') 39 | parser.add_argument('--tgt_rate', type=float, default=0.05, help='target graph rate of labeled nodes') 40 | parser.add_argument('--mode', type=str, default='graph', help='node or graph tasks') 41 | 42 | args = parser.parse_args() 43 | 44 | # load data 45 | if args.source in 
{'FRANKENSTEIN_F1', 'FRANKENSTEIN_F2'}: 46 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/FRANKENSTEIN', args.source) 47 | source_dataset = GraphTUDataset(path, args.source) 48 | elif args.source in {'PROTEINS_P1', 'PROTEINS_P2'}: 49 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/PROTEINS', args.source) 50 | source_dataset = GraphTUDataset(path, args.source) 51 | elif args.source in {'Mutagenicity_M1', 'Mutagenicity_M2'}: 52 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/Mutagenicity', args.source) 53 | source_dataset = GraphTUDataset(path, args.source) 54 | 55 | if args.target in {'FRANKENSTEIN_F1', 'FRANKENSTEIN_F2'}: 56 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/FRANKENSTEIN', args.target) 57 | target_dataset = GraphTUDataset(path, args.target) 58 | elif args.target in {'PROTEINS_P1', 'PROTEINS_P2'}: 59 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/PROTEINS', args.target) 60 | target_dataset = GraphTUDataset(path, args.target) 61 | elif args.target in {'Mutagenicity_M1', 'Mutagenicity_M2'}: 62 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/Mutagenicity', args.target) 63 | target_dataset = GraphTUDataset(path, args.target) 64 | 65 | if args.mode == 'node': 66 | source_data = source_dataset[0].to(args.device) 67 | target_data = target_dataset[0].to(args.device) 68 | 69 | num_features = source_data.x.size(1) 70 | num_classes = len(np.unique(source_data.y.cpu().numpy())) 71 | 72 | if args.source not in {'DE', 'EN', 'ES', 'FR', 'PT', 'RU'}: 73 | if not is_undirected(source_data.edge_index): 74 | source_data.edge_index = to_undirected(source_data.edge_index) 75 | 76 | if not is_undirected(target_data.edge_index): 77 | target_data.edge_index = to_undirected(target_data.edge_index) 78 | elif args.mode == 'graph': 79 | source_data = source_dataset 80 | target_data = target_dataset 81 | 82 | num_features = source_data.num_features 83 | num_classes = source_data.num_classes 84 | 85 | # choose a graph domain adaptation model 86 | model = DANE( 87 | in_dim=num_features, 88 | hid_dim=args.nhid, 89 | num_classes=num_classes, 90 | mode=args.mode, 91 | num_layers=args.num_layers, 92 | weight_decay=args.weight_decay, 93 | lr=args.lr, 94 | dropout=args.dropout_ratio, 95 | epoch=args.epochs, 96 | device=args.device, 97 | gnn=args.gnn, 98 | train_mode=args.train_mode, 99 | tgt_rate=args.tgt_rate, 100 | k=args.k 101 | ) 102 | 103 | # train the model 104 | model.fit(source_data, target_data) 105 | 106 | # evaluate the performance 107 | logits, labels = model.predict(target_data) 108 | 109 | preds = logits.argmax(dim=1) 110 | 111 | mi_f1 = eval_micro_f1(labels, preds) 112 | ma_f1 = eval_macro_f1(labels, preds) 113 | 114 | if args.source in {'DE', 'EN', 'ES', 'FR', 'PT', 'RU'}: 115 | auc = eval_roc_auc(labels, logits[:, 1]) 116 | else: 117 | auc = 0.0 118 | 119 | results = 'dane,source,' + args.source + ',target,' + args.target + ',micro-f1,' + str(mi_f1) + ',macro-f1,' + str(ma_f1) + ',auc,' + str(auc) 120 | 121 | with open(args.filename, 'a+') as f: 122 | f.write(results + '\n') 123 | 124 | print(results) -------------------------------------------------------------------------------- /benchmark/graph/a2gnn.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('..') 3 | 4 | import torch 5 | import argparse 6 | import time 7 | import os.path as osp 8 | import numpy as np 9 | 10 | from pygda.datasets import GraphTUDataset 
11 | 12 | from pygda.models import A2GNN 13 | from pygda.metrics import eval_micro_f1, eval_macro_f1, eval_roc_auc 14 | 15 | from torch_geometric.loader import NeighborLoader 16 | from torch_geometric.utils import degree, is_undirected, to_undirected 17 | from torch_geometric.transforms import OneHotDegree 18 | 19 | parser = argparse.ArgumentParser() 20 | 21 | # model agnostic params 22 | parser.add_argument('--seed', type=int, default=200, help='random seed') 23 | parser.add_argument('--num_layers', type=int, default=3, help='number of layers') 24 | parser.add_argument('--lr', type=float, default=0.001, help='learning rate') 25 | parser.add_argument('--weight_decay', type=float, default=0.0, help='weight decay') 26 | parser.add_argument('--nhid', type=int, default=128, help='hidden size') 27 | parser.add_argument('--dropout_ratio', type=float, default=0.1, help='dropout ratio') 28 | parser.add_argument('--device', type=str, default='cuda:1', help='specify cuda devices') 29 | parser.add_argument('--source', type=str, default='FRANKENSTEIN_F1', help='source domain data, FRANKENSTEIN_F1/F2, PROTEINS_P1/P2 or Mutagenicity_M1/M2') 30 | parser.add_argument('--target', type=str, default='FRANKENSTEIN_F2', help='target domain data, FRANKENSTEIN_F1/F2, PROTEINS_P1/P2 or Mutagenicity_M1/M2') 31 | parser.add_argument('--epochs', type=int, default=200, help='maximum number of epochs') 32 | parser.add_argument('--filename', type=str, default='test.txt', help='store results into file') 33 | 34 | # model specific params 35 | parser.add_argument('--adv', type=lambda s: s.lower() in {'true', '1', 'yes'}, default=False, help='adversarial training or not') 36 | parser.add_argument('--weight', type=float, default=0.1, help='trade-off parameter for loss') 37 | parser.add_argument('--s_pnums', type=int, default=0, help='propagation for source models') 38 | parser.add_argument('--t_pnums', type=int, default=20, help='propagation for target models') 39 | parser.add_argument('--mode', type=str, default='graph', help='node or graph tasks') 40 | 41 |
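# Note: argparse's type=bool would turn any non-empty string (even 'False') into
# True, so '--adv' above uses an explicit string-to-bool conversion instead, e.g.:
#   python a2gnn.py --adv True --mode graph     # args.adv == True
#   python a2gnn.py --adv False --mode graph    # args.adv == False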
42 | args = parser.parse_args() 43 | 44 | # load data 45 | if args.source in {'FRANKENSTEIN_F1', 'FRANKENSTEIN_F2'}: 46 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/FRANKENSTEIN', args.source) 47 | source_dataset = GraphTUDataset(path, args.source) 48 | elif args.source in {'PROTEINS_P1', 'PROTEINS_P2'}: 49 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/PROTEINS', args.source) 50 | source_dataset = GraphTUDataset(path, args.source) 51 | elif args.source in {'Mutagenicity_M1', 'Mutagenicity_M2'}: 52 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/Mutagenicity', args.source) 53 | source_dataset = GraphTUDataset(path, args.source) 54 | 55 | if args.target in {'FRANKENSTEIN_F1', 'FRANKENSTEIN_F2'}: 56 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/FRANKENSTEIN', args.target) 57 | target_dataset = GraphTUDataset(path, args.target) 58 | elif args.target in {'PROTEINS_P1', 'PROTEINS_P2'}: 59 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/PROTEINS', args.target) 60 | target_dataset = GraphTUDataset(path, args.target) 61 | elif args.target in {'Mutagenicity_M1', 'Mutagenicity_M2'}: 62 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/Mutagenicity', args.target) 63 | target_dataset = GraphTUDataset(path, args.target) 64 | 65 | if args.mode == 'node': 66 | source_data = source_dataset[0].to(args.device) 67 | target_data = target_dataset[0].to(args.device) 68 | 69 | num_features = source_data.x.size(1) 70 | num_classes = len(np.unique(source_data.y.cpu().numpy())) 71 | 72 | if args.source not in {'DE', 'EN', 'ES', 'FR', 'PT', 'RU'}: 73 | if not is_undirected(source_data.edge_index): 74 | source_data.edge_index = to_undirected(source_data.edge_index) 75 | 76 | if not is_undirected(target_data.edge_index): 77 | target_data.edge_index = to_undirected(target_data.edge_index) 78 | elif args.mode == 'graph': 79 | source_data = source_dataset 80 | target_data = target_dataset 81 | 82 | num_features = source_data.num_features 83 | num_classes = source_data.num_classes 84 | 85 | # choose a graph domain adaptation model 86 | model = A2GNN( 87 | in_dim=num_features, 88 | hid_dim=args.nhid, 89 | num_classes=num_classes, 90 | mode=args.mode, 91 | num_layers=args.num_layers, 92 | weight_decay=args.weight_decay, 93 | lr=args.lr, 94 | dropout=args.dropout_ratio, 95 | epoch=args.epochs, 96 | device=args.device, 97 | weight=args.weight, 98 | adv=args.adv, 99 | s_pnums=args.s_pnums, 100 | t_pnums=args.t_pnums 101 | ) 102 | 103 | # train the model 104 | model.fit(source_data, target_data) 105 | 106 | # evaluate the performance 107 | logits, labels = model.predict(target_data) 108 | 109 | preds = logits.argmax(dim=1) 110 | 111 | mi_f1 = eval_micro_f1(labels, preds) 112 | ma_f1 = eval_macro_f1(labels, preds) 113 | 114 | if args.source in {'DE', 'EN', 'ES', 'FR', 'PT', 'RU'}: 115 | auc = eval_roc_auc(labels, logits[:, 1]) 116 | else: 117 | auc = 0.0 118 | 119 | results = 'a2gnn,source,' + args.source + ',target,' + args.target + ',micro-f1,' + str(mi_f1) + ',macro-f1,' + str(ma_f1) + ',auc,' + str(auc) 120 | 121 | with open(args.filename, 'a+') as f: 122 | f.write(results + '\n') 123 | 124 | print(results) 125 | -------------------------------------------------------------------------------- /benchmark/graph/sagda.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('..') 3 | 4 | import torch 5 | import argparse 6 | import time 7 | import os.path as osp 8 | import numpy as np 9 | 10 | from pygda.datasets import GraphTUDataset 11 | 12 | from pygda.models import SAGDA 13 | 14 | from pygda.metrics import eval_micro_f1, eval_macro_f1, eval_roc_auc 15 | 16 | from torch_geometric.loader import NeighborLoader 17 | from torch_geometric.utils import degree, is_undirected, to_undirected 18 | from torch_geometric.transforms import OneHotDegree 19 | 20 | parser = argparse.ArgumentParser() 21 | 22 | # model agnostic params 23 | parser.add_argument('--seed', type=int, default=200, help='random seed') 24 | parser.add_argument('--num_layers', type=int, default=2, help='number of layers') 25 | parser.add_argument('--lr', type=float, default=0.001, help='learning rate') 26 | parser.add_argument('--weight_decay', type=float, default=0.0, help='weight decay') 27 | parser.add_argument('--nhid', type=int, default=128, help='hidden size') 28 | parser.add_argument('--dropout_ratio', type=float, default=0.1, help='dropout ratio') 29 | parser.add_argument('--device', type=str, default='cuda:1', help='specify cuda devices') 30 | parser.add_argument('--source', type=str, default='FRANKENSTEIN_F1', help='source domain data, FRANKENSTEIN_F1/F2, PROTEINS_P1/P2 or Mutagenicity_M1/M2') 31 | parser.add_argument('--target', type=str, default='FRANKENSTEIN_F2', help='target domain data, FRANKENSTEIN_F1/F2, PROTEINS_P1/P2 or Mutagenicity_M1/M2') 32 | parser.add_argument('--epochs', type=int, default=200, help='maximum number of epochs') 33 | parser.add_argument('--filename', type=str, default='test.txt', help='store results into file') 34 | 35 | # model specific params 36 | parser.add_argument('--alpha', type=float, default=1.0, help='trade-off parameter for high pass filter') 37 | parser.add_argument('--beta', type=float, default=1.0, help='trade-off parameter for low pass filter') 38 | parser.add_argument('--ppmi', type=lambda s: s.lower() in {'true', '1', 'yes'}, default=True, help='use PPMI matrix or not') 39 | parser.add_argument('--adv_dim', type=int, default=40, help='hidden dimension of adversarial module') 40 | parser.add_argument('--mode', type=str, default='graph', help='node or graph tasks') 41 |
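# '--alpha' and '--beta' trade off SAGDA's high-pass and low-pass filter branches
# (see the help strings above). Illustrative invocation on the PROTEINS split:
#   python sagda.py --source PROTEINS_P1 --target PROTEINS_P2 --mode graph --filename results-graph.txt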
42 | args = parser.parse_args() 43 | 44 | # load data 45 | if args.source in {'FRANKENSTEIN_F1', 'FRANKENSTEIN_F2'}: 46 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/FRANKENSTEIN', args.source) 47 | source_dataset = GraphTUDataset(path, args.source) 48 | elif args.source in {'PROTEINS_P1', 'PROTEINS_P2'}: 49 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/PROTEINS', args.source) 50 | source_dataset = GraphTUDataset(path, args.source) 51 | elif args.source in {'Mutagenicity_M1', 'Mutagenicity_M2'}: 52 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/Mutagenicity', args.source) 53 | source_dataset = GraphTUDataset(path, args.source) 54 | 55 | if args.target in {'FRANKENSTEIN_F1', 'FRANKENSTEIN_F2'}: 56 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/FRANKENSTEIN', args.target) 57 | target_dataset = GraphTUDataset(path, args.target) 58 | elif args.target in {'PROTEINS_P1', 'PROTEINS_P2'}: 59 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/PROTEINS', args.target) 60 | target_dataset = GraphTUDataset(path, args.target) 61 | elif args.target in {'Mutagenicity_M1', 'Mutagenicity_M2'}: 62 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/Mutagenicity', args.target) 63 | target_dataset = GraphTUDataset(path, args.target) 64 | 65 | if args.mode == 'node': 66 | source_data = source_dataset[0].to(args.device) 67 | target_data = target_dataset[0].to(args.device) 68 | 69 | num_features = source_data.x.size(1) 70 | num_classes = len(np.unique(source_data.y.cpu().numpy())) 71 | 72 | if args.source not in {'DE', 'EN', 'ES', 'FR', 'PT', 'RU'}: 73 | if not is_undirected(source_data.edge_index): 74 | source_data.edge_index = to_undirected(source_data.edge_index) 75 | 76 | if not is_undirected(target_data.edge_index): 77 | target_data.edge_index = to_undirected(target_data.edge_index) 78 | elif args.mode == 'graph': 79 | source_data = source_dataset 80 | target_data = target_dataset 81 | 82 | num_features = source_data.num_features 83 | num_classes = source_data.num_classes 84 | 85 | # choose a graph domain adaptation model 86 | model = SAGDA( 87 | in_dim=num_features, 88 | hid_dim=args.nhid, 89 | num_classes=num_classes, 90 | mode=args.mode, 91 | num_layers=args.num_layers, 92 | weight_decay=args.weight_decay, 93 | lr=args.lr, 94 | dropout=args.dropout_ratio, 95 | epoch=args.epochs, 96 | device=args.device, 97 | alpha=args.alpha, 98 | beta=args.beta, 99 | ppmi=args.ppmi, 100 | adv_dim=args.adv_dim 101 | ) 102 | 103 | # train the model 104 | model.fit(source_data, target_data) 105 | 106 | # evaluate the performance 107 | logits, labels = model.predict(target_data) 108 | 109 | preds = logits.argmax(dim=1) 110 | 111 | mi_f1 = eval_micro_f1(labels, preds) 112 | ma_f1 = eval_macro_f1(labels, preds) 113 | 114 | if args.source in {'DE', 'EN', 'ES', 'FR', 'PT', 'RU'}: 115 | auc = eval_roc_auc(labels, logits[:, 1]) 116 | else: 117 | auc = 0.0
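# AUC is only computed for the binary Twitch domains (DE/EN/ES/FR/PT/RU); for the
# multi-class TU graph datasets a placeholder 0.0 is recorded in the results line.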
118 | 119 | results = 'sagda,source,' + args.source + ',target,' + args.target + ',micro-f1,' + str(mi_f1) + ',macro-f1,' + str(ma_f1) + ',auc,' + str(auc) 120 | 121 | with open(args.filename, 'a+') as f: 122 | f.write(results + '\n') 123 | 124 | print(results) -------------------------------------------------------------------------------- /benchmark/graph/adagcn.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('..') 3 | 4 | import torch 5 | import argparse 6 | import time 7 | import os.path as osp 8 | import numpy as np 9 | 10 | from pygda.datasets import GraphTUDataset 11 | 12 | from pygda.models import AdaGCN 13 | 14 | from pygda.metrics import eval_micro_f1, eval_macro_f1, eval_roc_auc 15 | 16 | from torch_geometric.loader import NeighborLoader 17 | from torch_geometric.utils import degree, is_undirected, to_undirected 18 | from torch_geometric.transforms import OneHotDegree 19 | 20 | parser = argparse.ArgumentParser() 21 | 22 | # model agnostic params 23 | parser.add_argument('--seed', type=int, default=200, help='random seed') 24 | parser.add_argument('--num_layers', type=int, default=3, help='number of layers') 25 | parser.add_argument('--lr', type=float, default=0.001, help='learning rate') 26 | parser.add_argument('--weight_decay', type=float, default=0.0, help='weight decay') 27 | parser.add_argument('--nhid', type=int, default=128, help='hidden size') 28 | parser.add_argument('--dropout_ratio', type=float, default=0.1, help='dropout ratio') 29 | parser.add_argument('--device', type=str, default='cuda:1', help='specify cuda devices') 30 | parser.add_argument('--source', type=str, default='FRANKENSTEIN_F1', help='source domain data, FRANKENSTEIN_F1/F2, PROTEINS_P1/P2 or Mutagenicity_M1/M2') 31 | parser.add_argument('--target', type=str, default='FRANKENSTEIN_F2', help='target domain data, FRANKENSTEIN_F1/F2, PROTEINS_P1/P2 or Mutagenicity_M1/M2') 32 | parser.add_argument('--epochs', type=int, default=200, help='maximum number of epochs') 33 | parser.add_argument('--filename', type=str, default='test.txt', help='store results into file') 34 | 35 | # model specific params 36 | parser.add_argument('--gnn_type', type=str, default='gcn', help='use GCN or PPMIConv') 37 | parser.add_argument('--adv_dim', type=int, default=40, help='hidden dimension of adversarial module') 38 | parser.add_argument('--gp_weight', type=float, default=5.0, help='trade off parameter for gradient penalty') 39 | parser.add_argument('--domain_weight', type=float, default=1.0, help='trade off parameter for domain loss') 40 | parser.add_argument('--mode', type=str, default='graph', help='node or graph tasks') 41 | 42 | args = parser.parse_args() 43 | 44 | # load data 45 | if args.source in {'FRANKENSTEIN_F1', 'FRANKENSTEIN_F2'}: 46 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/FRANKENSTEIN', args.source) 47 | source_dataset = GraphTUDataset(path, args.source) 48 | elif args.source in {'PROTEINS_P1', 'PROTEINS_P2'}: 49 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/PROTEINS', args.source) 50 | source_dataset = GraphTUDataset(path, args.source) 51 | elif args.source in {'Mutagenicity_M1', 'Mutagenicity_M2'}: 52 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/Mutagenicity', args.source) 53 | source_dataset = GraphTUDataset(path, args.source) 54 | 55 | if args.target in {'FRANKENSTEIN_F1', 'FRANKENSTEIN_F2'}: 56 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/FRANKENSTEIN', args.target) 57 | target_dataset = GraphTUDataset(path, args.target) 58 | elif args.target in {'PROTEINS_P1', 'PROTEINS_P2'}: 59 | path =
osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/PROTEINS', args.target) 60 | target_dataset = GraphTUDataset(path, args.target) 61 | elif args.target in {'Mutagenicity_M1', 'Mutagenicity_M2'}: 62 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/Mutagenicity', args.target) 63 | target_dataset = GraphTUDataset(path, args.target) 64 | 65 | if args.mode == 'node': 66 | source_data = source_dataset[0].to(args.device) 67 | target_data = target_dataset[0].to(args.device) 68 | 69 | num_features = source_data.x.size(1) 70 | num_classes = len(np.unique(source_data.y.cpu().numpy())) 71 | 72 | if args.source not in {'DE', 'EN', 'ES', 'FR', 'PT', 'RU'}: 73 | if not is_undirected(source_data.edge_index): 74 | source_data.edge_index = to_undirected(source_data.edge_index) 75 | 76 | if not is_undirected(target_data.edge_index): 77 | target_data.edge_index = to_undirected(target_data.edge_index) 78 | elif args.mode == 'graph': 79 | source_data = source_dataset 80 | target_data = target_dataset 81 | 82 | num_features = source_data.num_features 83 | num_classes = source_data.num_classes 84 | 85 | # choose a graph domain adaptation model 86 | model = AdaGCN( 87 | in_dim=num_features, 88 | hid_dim=args.nhid, 89 | num_classes=num_classes, 90 | mode=args.mode, 91 | num_layers=args.num_layers, 92 | weight_decay=args.weight_decay, 93 | lr=args.lr, 94 | dropout=args.dropout_ratio, 95 | epoch=args.epochs, 96 | device=args.device, 97 | gnn_type=args.gnn_type, 98 | adv_dim=args.adv_dim, 99 | gp_weight=args.gp_weight, 100 | domain_weight=args.domain_weight 101 | ) 102 | 103 | # train the model 104 | model.fit(source_data, target_data) 105 | 106 | # evaluate the performance 107 | logits, labels = model.predict(target_data) 108 | 109 | preds = logits.argmax(dim=1) 110 | 111 | mi_f1 = eval_micro_f1(labels, preds) 112 | ma_f1 = eval_macro_f1(labels, preds) 113 | 114 | if args.source in {'DE', 'EN', 'ES', 'FR', 'PT', 'RU'}: 115 | auc = eval_roc_auc(labels, logits[:, 1]) 116 | else: 117 | auc = 0.0 118 | 119 | results = 'adagcn,source,' + args.source + ',target,' + args.target + ',micro-f1,' + str(mi_f1) + ',macro-f1,' + str(ma_f1) + ',auc,' + str(auc) 120 | 121 | with open(args.filename, 'a+') as f: 122 | f.write(results + '\n') 123 | 124 | print(results) -------------------------------------------------------------------------------- /pygda/utils/mmd.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | def guassian_kernel(source, target, kernel_mul=2.0, kernel_num=5, fix_sigma=None): 5 | """ 6 | Calculate Gaussian kernel matrix between source and target features. 7 | 8 | Parameters 9 | ---------- 10 | source : torch.Tensor 11 | Source domain features in shape of (n_source, feature_dim) 12 | target : torch.Tensor 13 | Target domain features in shape of (n_target, feature_dim) 14 | kernel_mul : float, optional 15 | Multiplication factor for kernel bandwidth. Default: 2.0 16 | kernel_num : int, optional 17 | Number of kernels to use. Default: 5 18 | fix_sigma : float, optional 19 | Fixed bandwidth value. If None, computed from data. 
Default: None 20 | 21 | Returns 22 | ------- 23 | torch.Tensor 24 | Combined kernel matrix from multiple bandwidths 25 | 26 | Notes 27 | ----- 28 | Processing Steps: 29 | 30 | - Combine source and target features 31 | - Compute pairwise L2 distances 32 | - Calculate kernel bandwidth 33 | - Generate multiple kernels 34 | - Sum kernel matrices 35 | 36 | Features: 37 | 38 | - Multiple kernel computation 39 | - Adaptive bandwidth 40 | - Efficient matrix operations 41 | """ 42 | n_samples = int(source.size()[0]) + int(target.size()[0]) 43 | total = torch.cat([source, target], dim=0) 44 | total0 = total.unsqueeze(0).expand(int(total.size(0)), int(total.size(0)), int(total.size(1))) 45 | total1 = total.unsqueeze(1).expand(int(total.size(0)), int(total.size(0)), int(total.size(1))) 46 | L2_distance = ((total0-total1)**2).sum(2) 47 | if fix_sigma: 48 | bandwidth = fix_sigma 49 | else: 50 | bandwidth = (torch.sum(L2_distance.data) + 1e-6) / (n_samples**2-n_samples) 51 | bandwidth /= kernel_mul ** (kernel_num // 2) 52 | bandwidth_list = [bandwidth * (kernel_mul**i) for i in range(kernel_num)] 53 | kernel_val = [torch.exp(-L2_distance / bandwidth_temp) for bandwidth_temp in bandwidth_list] 54 | 55 | return sum(kernel_val) 56 | 57 | def get_MMD(source_feat, target_feat, kernel_mul=2.0, kernel_num=5, fix_sigma=None): 58 | """ 59 | Calculate Maximum Mean Discrepancy (MMD) between source and target features. 60 | 61 | Parameters 62 | ---------- 63 | source_feat : torch.Tensor 64 | Source domain features in shape of (n_source, feature_dim) 65 | target_feat : torch.Tensor 66 | Target domain features in shape of (n_target, feature_dim) 67 | kernel_mul : float, optional 68 | Multiplication factor for kernel bandwidth. Default: 2.0 69 | kernel_num : int, optional 70 | Number of kernels to use. Default: 5 71 | fix_sigma : float, optional 72 | Fixed bandwidth value. If None, computed from data. Default: None 73 | 74 | Returns 75 | ------- 76 | torch.Tensor 77 | MMD loss value between source and target domains 78 | 79 | Notes 80 | ----- 81 | Processing Steps: 82 | 83 | - Compute Gaussian kernel matrix 84 | - Extract within-domain kernels (XX, YY) 85 | - Extract cross-domain kernels (XY, YX) 86 | - Calculate MMD loss 87 | 88 | Features: 89 | 90 | - Batch-wise computation 91 | - Multiple kernel integration 92 | - Biased (V-statistic) estimate over the sampled batch 93 | """ 94 | kernels = guassian_kernel(source_feat, 95 | target_feat, 96 | kernel_mul=kernel_mul, 97 | kernel_num=kernel_num, 98 | fix_sigma=fix_sigma) 99 | 100 | n_source = int(source_feat.size()[0])  # target rows start at n_source in the stacked kernel matrix 101 | batch_size = min(n_source, int(target_feat.size()[0])) 102 | XX = kernels[:batch_size, :batch_size] 103 | YY = kernels[n_source:n_source + batch_size, n_source:n_source + batch_size] 104 | XY = kernels[:batch_size, n_source:n_source + batch_size] 105 | YX = kernels[n_source:n_source + batch_size, :batch_size] 106 | loss = torch.mean(XX + YY - XY - YX) 107 | return loss 108 |
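# A minimal usage sketch for get_MMD (illustrative only; shapes are arbitrary):
#
#   src = torch.randn(64, 128)   # 64 source samples, 128-dim features
#   tgt = torch.randn(64, 128)   # 64 target samples, 128-dim features
#   loss = get_MMD(src, tgt)     # scalar tensor, close to 0 when the two distributions match
#
# For large graphs, prefer MMD() below, which averages this estimate over
# several random subsamples instead of comparing all nodes at once.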
109 | def MMD(source_feat, target_feat, sampling_num=1000, times=5): 110 | """ 111 | Calculate MMD with random sampling for large-scale datasets. 112 | 113 | Parameters 114 | ---------- 115 | source_feat : torch.Tensor 116 | Source domain features in shape of (n_source, feature_dim) 117 | target_feat : torch.Tensor 118 | Target domain features in shape of (n_target, feature_dim) 119 | sampling_num : int, optional 120 | Number of samples per iteration. Default: 1000 121 | times : int, optional 122 | Number of sampling iterations. Default: 5 123 | 124 | Returns 125 | ------- 126 | torch.Tensor 127 | Averaged MMD loss value across sampling iterations 128 | 129 | Notes 130 | ----- 131 | Processing Steps: 132 | 133 | - Generate random sample indices 134 | - Sample features from both domains 135 | - Calculate MMD for each sample 136 | - Average across iterations 137 | 138 | Features: 139 | 140 | - Random sampling 141 | - Multiple iterations 142 | - Memory efficient 143 | - Scalable computation 144 | """ 145 | source_num = source_feat.size(0) 146 | target_num = target_feat.size(0) 147 | 148 | source_sample = torch.randint(source_num, (times, sampling_num)) 149 | target_sample = torch.randint(target_num, (times, sampling_num)) 150 | 151 | mmd = 0 152 | for i in range(times): 153 | source_sample_feat = source_feat[source_sample[i]] 154 | target_sample_feat = target_feat[target_sample[i]] 155 | 156 | mmd = mmd + get_MMD(source_sample_feat, target_sample_feat) 157 | 158 | mmd = mmd / times 159 | return mmd -------------------------------------------------------------------------------- /pygda/datasets/mag.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path as osp 3 | import torch 4 | import numpy as np 5 | import torch.nn.functional as F 6 | from torch_geometric.data import InMemoryDataset, Data 7 | from torch_geometric.io import read_txt_array 8 | 9 | import csv 10 | import json 11 | import pickle as pkl 12 | import scipy 13 | import scipy.io as sio 14 | 15 | import warnings 16 | warnings.filterwarnings('ignore', category=DeprecationWarning) 17 | 18 | 19 | class MAGDataset(InMemoryDataset): 20 | """ 21 | Microsoft Academic Graph (MAG) dataset loader for graph-based analysis. 22 | 23 | Parameters 24 | ---------- 25 | root : str 26 | Root directory where the dataset should be saved 27 | name : str 28 | Name of the MAG dataset 29 | transform : callable, optional 30 | Function/transform that takes in a Data object and returns a transformed 31 | version. Default: None 32 | pre_transform : callable, optional 33 | Function/transform to be applied to the data object before saving. 34 | Default: None 35 | pre_filter : callable, optional 36 | Function that takes in a Data object and returns a boolean value, 37 | indicating whether the data object should be included. Default: None 38 | 39 | Notes 40 | ----- 41 | Dataset Structure: 42 | 43 | - Nodes represent academic papers 44 | - Edges represent citation relationships 45 | - Node features from paper content 46 | - Labels indicate paper fields (top 20) 47 | - Includes train/val/test splits (80/10/10) 48 | """ 49 | 50 | def __init__(self, 51 | root, 52 | name, 53 | transform=None, 54 | pre_transform=None, 55 | pre_filter=None): 56 | self.name = name 57 | self.root = root 58 | super(MAGDataset, self).__init__(root, transform, pre_transform, pre_filter) 59 | 60 | self.data, self.slices = torch.load(self.processed_paths[0]) 61 | 62 | @property 63 | def raw_file_names(self): 64 | """ 65 | Names of required raw files. 66 | 67 | Returns 68 | ------- 69 | list[str] 70 | List of required raw file names 71 | 72 | Notes 73 | ----- 74 | Required files: 75 | 76 | - {name}_labels_20.pt: PyTorch file containing graph data with top 20 fields 77 | """ 78 | return ['{}_labels_20.pt'.format(self.name)] 79 | 80 | @property 81 | def processed_file_names(self): 82 | """ 83 | Names of processed data files.
84 | 85 | Returns 86 | ------- 87 | list[str] 88 | List of processed file names 89 | 90 | Notes 91 | ----- 92 | Processed files: 93 | 94 | - data.pt: Contains processed PyTorch Geometric data object 95 | """ 96 | return ['data.pt'] 97 | 98 | def download(self): 99 | """ 100 | Download raw data files. 101 | 102 | Notes 103 | ----- 104 | Empty implementation - data should be manually placed in raw directory 105 | """ 106 | pass 107 | 108 | def process(self): 109 | """ 110 | Process raw data into PyTorch Geometric Data format. 111 | 112 | Notes 113 | ----- 114 | Processing Steps: 115 | 116 | - Load PyTorch data: 117 | 118 | * Node features (paper content) 119 | * Edge indices (citations) 120 | * Labels (paper fields) 121 | 122 | - Create Data object with: 123 | 124 | * Edge indices 125 | * Node features 126 | * Node labels 127 | * Train/val/test masks 128 | 129 | - Apply pre-transform if specified 130 | - Save processed data 131 | 132 | Data Split: 133 | 134 | - Training: 80% 135 | - Validation: 10% 136 | - Testing: 10% 137 | 138 | Features: 139 | 140 | - Direct tensor loading 141 | - Random split generation 142 | - Optional pre-transform support 143 | - Efficient data storage 144 | """ 145 | path = osp.join(self.raw_dir, '{}_labels_20.pt'.format(self.name)) 146 | graph = torch.load(path) 147 | x, edge_index, y = graph.x, graph.edge_index, graph.y 148 | 149 | data_list = [] 150 | data = Data(edge_index=edge_index, x=x, y=y) 151 | 152 | random_node_indices = np.random.permutation(y.shape[0]) 153 | training_size = int(len(random_node_indices) * 0.8) 154 | val_size = int(len(random_node_indices) * 0.1) 155 | train_node_indices = random_node_indices[:training_size] 156 | val_node_indices = random_node_indices[training_size:training_size + val_size] 157 | test_node_indices = random_node_indices[training_size + val_size:] 158 | 159 | train_masks = torch.zeros([y.shape[0]], dtype=torch.bool) 160 | train_masks[train_node_indices] = 1 161 | val_masks = torch.zeros([y.shape[0]], dtype=torch.bool) 162 | val_masks[val_node_indices] = 1 163 | test_masks = torch.zeros([y.shape[0]], dtype=torch.bool) 164 | test_masks[test_node_indices] = 1 165 | 166 | data.train_mask = train_masks 167 | data.val_mask = val_masks 168 | data.test_mask = test_masks 169 | 170 | if self.pre_transform is not None: 171 | if not os.path.exists(self.processed_paths[0] + 'eival.pt'): 172 | data = self.pre_transform(data, self.processed_paths[0]) 173 | 174 | data_list.append(data) 175 | 176 | data, slices = self.collate([data]) 177 | 178 | torch.save((data, slices), self.processed_paths[0]) 179 | -------------------------------------------------------------------------------- /pygda/nn/adagcn_base.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torch.nn.functional as F 4 | 5 | from torch_geometric.nn import GCNConv 6 | 7 | from .ppmi_conv import PPMIConv 8 | from torch_geometric.nn import global_mean_pool 9 | 10 | 11 | class GNN(torch.nn.Module): 12 | """ 13 | Generic GNN encoder supporting multiple GNN types. 14 | 15 | Parameters 16 | ---------- 17 | in_dim : int 18 | Input feature dimension. 19 | hid_dim : int 20 | Hidden layer dimension. 21 | gnn_type : str, optional 22 | Type of GNN layer ('gcn' or 'ppmi'). Default: 'gcn'. 23 | num_layers : int, optional 24 | Number of GNN layers. Default: 3. 25 | act : callable, optional 26 | Activation function. Default: F.relu. 27 | dropout : float, optional 28 | Dropout rate. Default: 0.1. 
29 | **kwargs 30 | Additional arguments for GNN layers. 31 | 32 | Notes 33 | ----- 34 | - Supports both GCN and PPMI convolution types 35 | - Stacked layers with activation and dropout in between 36 | - Configurable activation and dropout 37 | """ 38 | 39 | def __init__(self, in_dim, hid_dim, gnn_type='gcn', num_layers=3, act=F.relu, dropout=0.1, **kwargs): 40 | super(GNN, self).__init__() 41 | 42 | self.gnn_type = gnn_type 43 | self.act = act 44 | self.num_layers = num_layers 45 | 46 | self.conv_layers = nn.ModuleList() 47 | 48 | if self.gnn_type == 'gcn': 49 | self.conv_layers.append(GCNConv(in_dim, hid_dim)) 50 | 51 | for i in range(1, self.num_layers): 52 | self.conv_layers.append(GCNConv(hid_dim, hid_dim)) 53 | else: 54 | self.conv_layers.append(PPMIConv(in_dim, hid_dim)) 55 | 56 | for i in range(1, self.num_layers): 57 | self.conv_layers.append(PPMIConv(hid_dim, hid_dim)) 58 | 59 | self.dropout = nn.Dropout(dropout) 60 | 61 | def forward(self, x, edge_index, batch, mode='node'): 62 | """ 63 | Forward pass of the GNN. 64 | 65 | Parameters 66 | ---------- 67 | x : torch.Tensor 68 | Node features. 69 | edge_index : torch.Tensor 70 | Edge indices. 71 | batch : torch.Tensor 72 | Batch assignment for graph-level tasks. 73 | mode : str, optional 74 | 'node' or 'graph' level task. Default: 'node'. 75 | 76 | Returns 77 | ------- 78 | torch.Tensor 79 | Node or graph embeddings. 80 | 81 | Notes 82 | ----- 83 | - Applies multiple GNN layers sequentially 84 | - Optional graph pooling for graph-level tasks 85 | - Dropout and activation between layers 86 | """ 87 | for i, conv_layer in enumerate(self.conv_layers): 88 | x = conv_layer(x, edge_index) 89 | if i < len(self.conv_layers) - 1: 90 | x = self.act(x) 91 | x = self.dropout(x) 92 | 93 | if mode == 'graph': 94 | x = global_mean_pool(x, batch) 95 | 96 | return x 97 | 98 | 99 | class AdaGCNBase(nn.Module): 100 | """ 101 | Base class for AdaGCN. 102 | 103 | Parameters 104 | ---------- 105 | in_dim : int 106 | Input feature dimension. 107 | hid_dim : int 108 | Hidden dimension. 109 | num_classes : int 110 | Number of target classes. 111 | num_layers : int, optional 112 | Number of GNN layers. Default: 3. 113 | dropout : float, optional 114 | Dropout rate. Default: 0.1. 115 | act : callable, optional 116 | Activation function. Default: F.relu. 117 | gnn_type : str, optional 118 | Type of GNN ('gcn' or 'ppmi'). Default: 'gcn'. 119 | mode : str, optional 120 | 'node' or 'graph' level task. Default: 'node'. 121 | **kwargs 122 | Additional arguments. 123 | 124 | Notes 125 | ----- 126 | Architecture components: 127 | 128 | 1. GNN encoder for feature extraction 129 | 2. Classification layer 130 | 3. Cross-entropy loss function 131 | """ 132 | 133 | def __init__(self, 134 | in_dim, 135 | hid_dim, 136 | num_classes, 137 | num_layers=3, 138 | dropout=0.1, 139 | act=F.relu, 140 | gnn_type='gcn', 141 | mode='node', 142 | **kwargs): 143 | super(AdaGCNBase, self).__init__() 144 | 145 | self.encoder = GNN(in_dim=in_dim, hid_dim=hid_dim, gnn_type=gnn_type, act=act, num_layers=num_layers, dropout=dropout) 146 | 147 | self.cls_model = nn.Sequential(nn.Linear(hid_dim, num_classes)) 148 | 149 | self.mode = mode 150 | 151 | self.loss_func = nn.CrossEntropyLoss() 152 | 153 | def forward(self, data): 154 | """ 155 | Forward pass of AdaGCN. 156 | 157 | Parameters 158 | ---------- 159 | data : torch_geometric.data.Data 160 | Input graph data. 161 | 162 | Returns 163 | ------- 164 | torch.Tensor 165 | Node/graph embeddings. 166 | 167 | Notes 168 | ----- 169 | Process: 170 | 171 | 1.
Extract features based on mode (node/graph) 172 | 2. Apply GNN encoder 173 | 3. Return embeddings for downstream tasks 174 | """ 175 | if self.mode == 'node': 176 | x, edge_index, batch = data.x, data.edge_index, None 177 | else: 178 | x, edge_index, batch = data.x, data.edge_index, data.batch 179 | x = self.encoder(x, edge_index, batch, mode=self.mode) 180 | 181 | return x 182 | -------------------------------------------------------------------------------- /benchmark/node/run_blog.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | echo "Task Blog2->Blog1" 4 | echo "==========" 5 | python grade.py --source 'Blog2' --target 'Blog1' --nhid 128 --num_layers 2 --lr 0.001 --weight_decay 0.001 --epochs 300 --dropout_ratio 0.2 --weight 0.01 --filename 'results-blog.txt' 6 | python strurw.py --source 'Blog2' --target 'Blog1' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.001 --epochs 200 --dropout_ratio 0.4 --lamb 0.6 --filename 'results-blog.txt' 7 | python asn.py --source 'Blog2' --target 'Blog1' --nhid 128 --hid_dim_vae 128 --lr 0.0003 --weight_decay 0.01 --epochs 400 --dropout_ratio 0.2 --lambda_r 0.01 --lambda_d 0.5 --lambda_f 0.0001 --filename 'results-blog.txt' 8 | python acdne.py --source 'Blog2' --target 'Blog1' --nhid 128 --lr 0.0001 --weight_decay 0.01 --epochs 300 --dropout_ratio 0.1 --pair_weight 0.03 --step 1 --filename 'results-blog.txt' 9 | python adagcn.py --source 'Blog2' --target 'Blog1' --nhid 128 --num_layers 2 --lr 0.01 --weight_decay 0.01 --epochs 400 --dropout_ratio 0.4 --domain_weight 0.1 --filename 'results-blog.txt' 10 | python udagcn.py --source 'Blog2' --target 'Blog1' --nhid 128 --num_layers 2 --lr 0.0001 --weight_decay 0.001 --epochs 400 --dropout_ratio 0.4 --filename 'results-blog.txt' 11 | python specreg.py --source 'Blog2' --target 'Blog1' --nhid 128 --num_layers 4 --lr 0.003 --weight_decay 0.001 --epochs 200 --dropout_ratio 0.1 --gamma_adv 0.1 --gamma_smooth 0.001 --gamma_mfr 0.001 --filename 'results-blog.txt' 12 | python a2gnn.py --source 'Blog2' --target 'Blog1' --nhid 128 --num_layers 2 --lr 0.01 --weight_decay 0.005 --epochs 200 --dropout_ratio 0.5 --s_pnums 0 --t_pnums 10 --weight 10 --filename 'results-blog.txt' 13 | python pairalign.py --source 'Blog2' --target 'Blog1' --nhid 128 --num_layers 2 --lr 0.003 --weight_decay 0.003 --epochs 200 --dropout_ratio 0.0 --rw_lmda 1 --ls_lambda 3.0 --lw_lambda 0.01 --filename 'results-blog.txt' 14 | 15 | python kbl.py --source 'Blog2' --target 'Blog1' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.01 --epochs 200 --k_cross 20 --k_within 10 --filename 'results-blog.txt' 16 | python cwgcn.py --source 'Blog2' --target 'Blog1' --nhid 128 --num_layers 2 --lr 0.0001 --weight_decay 0.0001 --epochs 200 --filename 'results-blog.txt' 17 | python dane.py --source 'Blog2' --target 'Blog1' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.001 --epochs 200 --filename 'results-blog.txt' 18 | python dgda.py --source 'Blog2' --target 'Blog1' --nhid 128 --num_layers 2 --lr 0.001 --weight_decay 0.001 --epochs 200 --m_w 0.5 --beta 0.5 --filename 'results-blog.txt' 19 | python dmgnn.py --source 'Blog2' --target 'Blog1' --nhid 128 --num_layers 2 --lr 0.001 --weight_decay 0.001 --epochs 200 --pair_weight 0.1 --filename 'results-blog.txt' 20 | python jhgda.py --source 'Blog2' --target 'Blog1' --nhid 128 --num_layers 2 --lr 0.001 --weight_decay 0.001 --epochs 200 --pool_ratio 0.2 --filename 'results-blog.txt' 21 | python sagda.py --source 'Blog2' --target 'Blog1' 
--nhid 128 --num_layers 1 --lr 0.001 --weight_decay 0.001 --epochs 200 --adv_dim 40 --filename 'results-blog.txt' 22 | 23 | echo "Task Blog1->Blog2" 24 | echo "==========" 25 | python grade.py --source 'Blog1' --target 'Blog2' --nhid 128 --num_layers 2 --lr 0.001 --weight_decay 0.001 --epochs 200 --dropout_ratio 0.5 --weight 0.01 --filename 'results-blog.txt' 26 | python strurw.py --source 'Blog1' --target 'Blog2' --nhid 128 --num_layers 1 --lr 0.001 --weight_decay 0.001 --epochs 200 --dropout_ratio 0.4 --lamb 0.6 --filename 'results-blog.txt' 27 | python asn.py --source 'Blog1' --target 'Blog2' --nhid 128 --hid_dim_vae 128 --lr 0.0003 --weight_decay 0.01 --epochs 300 --dropout_ratio 0.2 --lambda_r 0.01 --lambda_d 0.5 --lambda_f 0.0001 --filename 'results-blog.txt' 28 | python acdne.py --source 'Blog1' --target 'Blog2' --nhid 128 --lr 0.0003 --weight_decay 0.01 --epochs 300 --dropout_ratio 0.0 --pair_weight 0.01 --step 1 --filename 'results-blog.txt' 29 | python adagcn.py --source 'Blog1' --target 'Blog2' --nhid 128 --num_layers 2 --lr 0.01 --weight_decay 0.01 --epochs 400 --dropout_ratio 0.4 --domain_weight 0.1 --filename 'results-blog.txt' 30 | python udagcn.py --source 'Blog1' --target 'Blog2' --nhid 128 --num_layers 2 --lr 0.0001 --weight_decay 0.001 --epochs 400 --dropout_ratio 0.4 --filename 'results-blog.txt' 31 | python specreg.py --source 'Blog1' --target 'Blog2' --nhid 128 --num_layers 4 --lr 0.003 --weight_decay 0.001 --epochs 200 --dropout_ratio 0.1 --gamma_adv 0.1 --gamma_smooth 0.001 --gamma_mfr 0.001 --filename 'results-blog.txt' 32 | python a2gnn.py --source 'Blog1' --target 'Blog2' --nhid 128 --num_layers 2 --lr 0.01 --weight_decay 0.005 --epochs 200 --dropout_ratio 0.5 --s_pnums 0 --t_pnums 10 --weight 10 --filename 'results-blog.txt' 33 | python pairalign.py --source 'Blog1' --target 'Blog2' --nhid 128 --num_layers 2 --lr 0.003 --weight_decay 0.003 --epochs 200 --dropout_ratio 0.0 --rw_lmda 1 --ls_lambda 3.0 --lw_lambda 0.01 --filename 'results-blog.txt' 34 | 35 | python kbl.py --source 'Blog1' --target 'Blog2' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.01 --epochs 200 --k_cross 20 --k_within 10 --filename 'results-blog.txt' 36 | python cwgcn.py --source 'Blog1' --target 'Blog2' --nhid 128 --num_layers 2 --lr 0.0001 --weight_decay 0.0001 --epochs 200 --filename 'results-blog.txt' 37 | python dane.py --source 'Blog1' --target 'Blog2' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.001 --epochs 200 --filename 'results-blog.txt' 38 | python dgda.py --source 'Blog1' --target 'Blog2' --nhid 128 --num_layers 2 --lr 0.001 --weight_decay 0.001 --epochs 200 --m_w 0.5 --beta 0.5 --filename 'results-blog.txt' 39 | python dmgnn.py --source 'Blog1' --target 'Blog2' --nhid 128 --num_layers 2 --lr 0.001 --weight_decay 0.001 --epochs 200 --pair_weight 0.1 --filename 'results-blog.txt' 40 | python jhgda.py --source 'Blog1' --target 'Blog2' --nhid 128 --num_layers 2 --lr 0.001 --weight_decay 0.001 --epochs 200 --pool_ratio 0.2 --filename 'results-blog.txt' 41 | python sagda.py --source 'Blog1' --target 'Blog2' --nhid 128 --num_layers 1 --lr 0.001 --weight_decay 0.001 --epochs 200 --adv_dim 40 --filename 'results-blog.txt' 42 | -------------------------------------------------------------------------------- /pygda/datasets/airport.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path as osp 3 | import torch 4 | import numpy as np 5 | import torch.nn.functional as F 6 | from torch_geometric.data import 
InMemoryDataset, Data 7 | from torch_geometric.io import read_txt_array 8 | 9 | import csv 10 | import json 11 | import pickle as pkl 12 | import scipy 13 | import scipy.io as sio 14 | 15 | import warnings 16 | warnings.filterwarnings('ignore', category=DeprecationWarning) 17 | 18 | 19 | class AirportDataset(InMemoryDataset): 20 | """ 21 | Airport network dataset loader for graph-based analysis. 22 | 23 | Parameters 24 | ---------- 25 | root : str 26 | Root directory where the dataset should be saved 27 | name : str 28 | Name of the airport dataset 29 | transform : callable, optional 30 | Function/transform that takes in a Data object and returns a transformed 31 | version. Default: None 32 | pre_transform : callable, optional 33 | Function/transform to be applied to the data object before saving. 34 | Default: None 35 | pre_filter : callable, optional 36 | Function that takes in a Data object and returns a boolean value, 37 | indicating whether the data object should be included. Default: None 38 | 39 | Notes 40 | ----- 41 | - Nodes represent airports 42 | - Edges represent routes between airports 43 | - Labels indicate airport categories 44 | - Includes train/val/test splits (80/10/10) 45 | """ 46 | 47 | def __init__(self, 48 | root, 49 | name, 50 | transform=None, 51 | pre_transform=None, 52 | pre_filter=None): 53 | self.name = name 54 | self.root = root 55 | super(AirportDataset, self).__init__(root, transform, pre_transform, pre_filter) 56 | 57 | self.data, self.slices = torch.load(self.processed_paths[0]) 58 | 59 | @property 60 | def raw_file_names(self): 61 | """ 62 | Names of required raw files. 63 | 64 | Returns 65 | ------- 66 | list[str] 67 | List of required raw file names 68 | 69 | Notes 70 | ----- 71 | Required files: 72 | 73 | - {name}_edgelist.txt: Contains edge connectivity 74 | - {name}_labels.txt: Contains node labels 75 | """ 76 | return ['{}_edgelist.txt'.format(self.name), '{}_labels.txt'.format(self.name)] 77 | 78 | @property 79 | def processed_file_names(self): 80 | """ 81 | Names of processed data files. 82 | 83 | Returns 84 | ------- 85 | list[str] 86 | List of processed file names 87 | 88 | Notes 89 | ----- 90 | Processed files: 91 | 92 | - data.pt: Contains processed PyTorch Geometric data object 93 | """ 94 | return ['data.pt'] 95 | 96 | def download(self): 97 | """ 98 | Download raw data files. 99 | 100 | Notes 101 | ----- 102 | Empty implementation - data should be manually placed in raw directory 103 | """ 104 | pass 105 | 106 | def process(self): 107 | """ 108 | Process raw data into PyTorch Geometric Data format.
109 | 110 | Notes 111 | ----- 112 | - Load edge list from text file 113 | - Load node labels from text file 114 | - Create Data object with: 115 | 116 | * Edge indices 117 | * Node labels 118 | * Train/val/test masks 119 | 120 | - Apply pre-transform if specified 121 | - Save processed data 122 | 123 | Data Split: 124 | 125 | - Training: 80% 126 | - Validation: 10% 127 | - Testing: 10% 128 | 129 | Features: 130 | 131 | - Random split generation 132 | - Optional pre-transform support 133 | - Efficient data storage 134 | """ 135 | edge_path = osp.join(self.raw_dir, '{}_edgelist.txt'.format(self.name)) 136 | edge_index = read_txt_array(edge_path, sep=',', dtype=torch.long).t() 137 | 138 | label_path = osp.join(self.raw_dir, '{}_labels.txt'.format(self.name)) 139 | f = open(label_path, 'rb') 140 | content_list = [] 141 | for line in f.readlines(): 142 | line = str(line, encoding="utf-8") 143 | line = line.replace("\r", "").replace("\n", "") 144 | content_list.append(line) 145 | y = np.array(content_list, dtype=int) 146 | y = torch.from_numpy(y).to(torch.int64) 147 | 148 | data_list = [] 149 | data = Data(edge_index=edge_index, x=None, y=y, num_nodes=y.size(0)) 150 | 151 | random_node_indices = np.random.permutation(y.shape[0]) 152 | training_size = int(len(random_node_indices) * 0.8) 153 | val_size = int(len(random_node_indices) * 0.1) 154 | train_node_indices = random_node_indices[:training_size] 155 | val_node_indices = random_node_indices[training_size:training_size + val_size] 156 | test_node_indices = random_node_indices[training_size + val_size:] 157 | 158 | train_masks = torch.zeros([y.shape[0]], dtype=torch.bool) 159 | train_masks[train_node_indices] = 1 160 | val_masks = torch.zeros([y.shape[0]], dtype=torch.bool) 161 | val_masks[val_node_indices] = 1 162 | test_masks = torch.zeros([y.shape[0]], dtype=torch.bool) 163 | test_masks[test_node_indices] = 1 164 | 165 | data.train_mask = train_masks 166 | data.val_mask = val_masks 167 | data.test_mask = test_masks 168 | 169 | if self.pre_transform is not None: 170 | if not os.path.exists(self.processed_paths[0] + 'eival.pt'): 171 | data = self.pre_transform(data, self.processed_paths[0]) 172 | 173 | data_list.append(data) 174 | 175 | data, slices = self.collate([data]) 176 | 177 | torch.save((data, slices), self.processed_paths[0]) -------------------------------------------------------------------------------- /pygda/datasets/arxiv.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path as osp 3 | import torch 4 | import numpy as np 5 | import torch.nn.functional as F 6 | from torch_geometric.data import InMemoryDataset, Data 7 | from torch_geometric.io import read_txt_array 8 | 9 | import csv 10 | import json 11 | import pickle as pkl 12 | import scipy 13 | import scipy.io as sio 14 | 15 | import warnings 16 | warnings.filterwarnings('ignore', category=DeprecationWarning) 17 | 18 | 19 | class ArxivDataset(InMemoryDataset): 20 | """ 21 | ArXiv citation network dataset loader for graph-based analysis. 22 | 23 | Parameters 24 | ---------- 25 | root : str 26 | Root directory where the dataset should be saved 27 | name : str 28 | Name of the arXiv dataset 29 | transform : callable, optional 30 | Function/transform that takes in a Data object and returns a transformed 31 | version. Default: None 32 | pre_transform : callable, optional 33 | Function/transform to be applied to the data object before saving. 
34 | Default: None 35 | pre_filter : callable, optional 36 | Function that takes in a Data object and returns a boolean value, 37 | indicating whether the data object should be included. Default: None 38 | 39 | Notes 40 | ----- 41 | Dataset Structure: 42 | 43 | - Nodes represent arXiv papers 44 | - Edges represent citations between papers 45 | - Node features from paper content 46 | - Labels indicate paper categories 47 | - Includes train/val/test splits (80/10/10) 48 | """ 49 | 50 | def __init__(self, 51 | root, 52 | name, 53 | transform=None, 54 | pre_transform=None, 55 | pre_filter=None): 56 | self.name = name 57 | self.root = root 58 | super(ArxivDataset, self).__init__(root, transform, pre_transform, pre_filter) 59 | 60 | self.data, self.slices = torch.load(self.processed_paths[0]) 61 | 62 | @property 63 | def raw_file_names(self): 64 | """ 65 | Names of required raw files. 66 | 67 | Returns 68 | ------- 69 | list[str] 70 | List of required raw file names 71 | 72 | Notes 73 | ----- 74 | Required files: 75 | 76 | - {name}.pkl: Pickle file containing graph data, features, and labels 77 | """ 78 | return ['{}.pkl'.format(self.name)] 79 | 80 | @property 81 | def processed_file_names(self): 82 | """ 83 | Names of processed data files. 84 | 85 | Returns 86 | ------- 87 | list[str] 88 | List of processed file names 89 | 90 | Notes 91 | ----- 92 | Processed files: 93 | 94 | - data.pt: Contains processed PyTorch Geometric data object 95 | """ 96 | return ['data.pt'] 97 | 98 | def download(self): 99 | """ 100 | Download raw data files. 101 | 102 | Notes 103 | ----- 104 | Empty implementation - data should be manually placed in raw directory 105 | """ 106 | pass 107 | 108 | def process(self): 109 | """ 110 | Process raw data into PyTorch Geometric Data format. 111 | 112 | Notes 113 | ----- 114 | Processing Steps: 115 | 116 | - Load pickle file containing: 117 | 118 | * Edge indices (citations) 119 | * Node features (paper content) 120 | * Labels (paper categories) 121 | 122 | - Convert to PyTorch tensors 123 | - Create Data object with: 124 | 125 | * Edge indices 126 | * Node features 127 | * Node labels 128 | * Train/val/test masks 129 | 130 | - Apply pre-transform if specified 131 | - Save processed data 132 | 133 | Data Split: 134 | 135 | - Training: 80% 136 | - Validation: 10% 137 | - Testing: 10% 138 | 139 | Features: 140 | 141 | - Random split generation 142 | - Feature type conversion 143 | - Optional pre-transform support 144 | - Efficient data storage 145 | """ 146 | path = osp.join(self.raw_dir, '{}.pkl'.format(self.name)) 147 | dataset = pkl.load(open(path, 'rb')) 148 | 149 | edge_index = dataset.graph['edge_index'] 150 | features = dataset.graph['node_feat'] 151 | label = dataset.label 152 | 153 | x = features.to(torch.float) 154 | y = label.squeeze().to(torch.int64) 155 | 156 | data_list = [] 157 | data = Data(edge_index=edge_index, x=x, y=y) 158 | 159 | random_node_indices = np.random.permutation(y.shape[0]) 160 | training_size = int(len(random_node_indices) * 0.8) 161 | val_size = int(len(random_node_indices) * 0.1) 162 | train_node_indices = random_node_indices[:training_size] 163 | val_node_indices = random_node_indices[training_size:training_size + val_size] 164 | test_node_indices = random_node_indices[training_size + val_size:] 165 | 166 | train_masks = torch.zeros([y.shape[0]], dtype=torch.bool) 167 | train_masks[train_node_indices] = 1 168 | val_masks = torch.zeros([y.shape[0]], dtype=torch.bool) 169 | val_masks[val_node_indices] = 1 170 | test_masks = torch.zeros([y.shape[0]], dtype=torch.bool)
171 | test_masks[test_node_indices] = 1 172 | 173 | data.train_mask = train_masks 174 | data.val_mask = val_masks 175 | data.test_mask = test_masks 176 | 177 | if self.pre_transform is not None: 178 | if not os.path.exists(self.processed_paths[0] + 'eival.pt'): 179 | data = self.pre_transform(data, self.processed_paths[0]) 180 | 181 | data_list.append(data) 182 | 183 | data, slices = self.collate([data]) 184 | 185 | torch.save((data, slices), self.processed_paths[0]) --------------------------------------------------------------------------------