├── pygda ├── version.py ├── utils │ ├── __init__.py │ ├── svd_transform.py │ ├── utility.py │ └── mmd.py ├── __init__.py ├── metrics │ └── __init__.py ├── datasets │ ├── __init__.py │ ├── tugraph.py │ ├── mag.py │ ├── airport.py │ └── arxiv.py ├── models │ ├── __init__.py │ └── base.py └── nn │ ├── attention.py │ ├── __init__.py │ ├── reverse_layer.py │ ├── deepwalk_pretrain.py │ └── adagcn_base.py ├── docs ├── requirements.in ├── pygda_logo.png ├── models │ ├── ASN.md │ ├── GNN.md │ ├── KBL.md │ ├── SOGA.md │ ├── A2GNN.md │ ├── ACDNE.md │ ├── BaseGDA.md │ ├── CWGCN.md │ ├── DANE.md │ ├── DGDA.md │ ├── DGSDA.md │ ├── DMGNN.md │ ├── GRADE.md │ ├── JHGDA.md │ ├── PairAlign.md │ ├── SAGDA.md │ ├── SEPA.md │ ├── TDSS.md │ ├── AdaGCN.md │ ├── GTrans.md │ ├── GraphATA.md │ ├── GraphCTA.md │ ├── SpecReg.md │ ├── StruRW.md │ ├── UDAGCN.md │ └── Overview.md ├── nn │ ├── A2GNNBase.md │ ├── ACDNEBase.md │ ├── ASNBase.md │ ├── Attention.md │ ├── CWGCNBase.md │ ├── DGDABase.md │ ├── DGSDABase.md │ ├── GNNBase.md │ ├── GRADEBase.md │ ├── JHGDABase.md │ ├── KBLBase.md │ ├── MixUpBase.md │ ├── PPMIConv.md │ ├── SAGDABase.md │ ├── SOGABase.md │ ├── AdaGCNBase.md │ ├── GradReverse.md │ ├── PropGCNConv.md │ ├── ReweightGNN.md │ ├── UDAGCNBase.md │ ├── CacheGCNConv.md │ ├── DWPretrain.md │ ├── GMMClustering.md │ ├── GraphATABase.md │ ├── MixUpGCNConv.md │ └── NodeCentricConv.md ├── utils │ ├── MMD.md │ ├── Perturb.md │ ├── Sampler.md │ ├── Utility.md │ └── SVDTransform.md ├── datasets │ ├── MAG.md │ ├── Arxiv.md │ ├── Blog.md │ ├── Twitch.md │ ├── Airport.md │ ├── Citation.md │ ├── Elliptic.md │ ├── Facebook.md │ ├── Squirrel.md │ ├── TUGraph.md │ ├── Twitter.md │ └── Overview.md ├── metrics │ └── Metrics.md ├── cheatsheet │ ├── Dataset Cheatsheet.md │ └── Model Cheatsheet.md ├── requirements.txt ├── resources │ └── Resources.md ├── benchmark │ └── Overview.md └── assets │ └── css │ └── custom.css ├── .gitignore ├── benchmark ├── node │ ├── README.md │ ├── run.sh │ ├── parser.py │ └── run_blog.sh ├── graph │ ├── README.md │ ├── run_all_M.sh │ ├── parser.py │ ├── run_all_P.sh │ ├── run_all_F.sh │ ├── cwgcn.py │ ├── grade.py │ ├── udagcn.py │ ├── dane.py │ ├── a2gnn.py │ ├── sagda.py │ └── adagcn.py └── llm │ ├── README.md │ ├── parser.py │ ├── run1.sh │ ├── run2.sh │ ├── origin_preprocess.py │ ├── run3.sh │ ├── kbl.py │ ├── udagcn.py │ ├── grade.py │ ├── a2gnn.py │ └── adagcn.py ├── examples ├── README.md └── demo.py ├── .readthedocs.yaml ├── pyproject.toml ├── LICENSE ├── .github └── workflows │ ├── python-publish.yml │ └── codeql.yml ├── data └── README.md └── mkdocs.yml /pygda/version.py: -------------------------------------------------------------------------------- 1 | __version__ = '1.2.1' -------------------------------------------------------------------------------- /docs/requirements.in: -------------------------------------------------------------------------------- 1 | mkdocs 2 | mkdocstrings[python] 3 | markdown-include -------------------------------------------------------------------------------- /docs/pygda_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pygda-team/pygda/HEAD/docs/pygda_logo.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | testg.py 3 | testn.py 4 | data/* 5 | !data/README.md 6 | bench/ 7 | dist/ 8 | site/ 
-------------------------------------------------------------------------------- /pygda/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .utility import * 2 | from .mmd import * 3 | from .svd_transform import svd_transform 4 | from .sampler import * 5 | from .perturb import * 6 | -------------------------------------------------------------------------------- /benchmark/node/README.md: -------------------------------------------------------------------------------- 1 | # Benchmark 2 | Evaluation scripts for 16 methods on the five datasets. Each experiment is repeated 3 times. 3 | 4 | Run the whole suite via 5 | ``` 6 | ./run.sh 7 | ``` -------------------------------------------------------------------------------- /benchmark/node/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | for i in 1 2 3 4 | do 5 | echo $i 6 | ./run_airport.sh 7 | ./run_blog.sh 8 | ./run_citation.sh 9 | ./run_twitch.sh 10 | ./run_mag.sh 11 | done 12 | -------------------------------------------------------------------------------- /docs/models/ASN.md: -------------------------------------------------------------------------------- 1 | ::: pygda.models.asn 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/models/GNN.md: -------------------------------------------------------------------------------- 1 | ::: pygda.models.gnn 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/models/KBL.md: -------------------------------------------------------------------------------- 1 | ::: pygda.models.kbl 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/models/SOGA.md: -------------------------------------------------------------------------------- 1 | ::: pygda.models.soga 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 -------------------------------------------------------------------------------- /docs/nn/A2GNNBase.md: -------------------------------------------------------------------------------- 1 | ::: pygda.nn.a2gnn_base 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 -------------------------------------------------------------------------------- /docs/nn/ACDNEBase.md: -------------------------------------------------------------------------------- 1 | ::: pygda.nn.acdne_base 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 -------------------------------------------------------------------------------- /docs/nn/ASNBase.md: -------------------------------------------------------------------------------- 1 | ::: pygda.nn.asn_base 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | 
heading_level: 6 -------------------------------------------------------------------------------- /docs/nn/Attention.md: -------------------------------------------------------------------------------- 1 | ::: pygda.nn.attention 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 -------------------------------------------------------------------------------- /docs/nn/CWGCNBase.md: -------------------------------------------------------------------------------- 1 | ::: pygda.nn.cwgcn_base 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 -------------------------------------------------------------------------------- /docs/nn/DGDABase.md: -------------------------------------------------------------------------------- 1 | ::: pygda.nn.dgda_base 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 -------------------------------------------------------------------------------- /docs/nn/DGSDABase.md: -------------------------------------------------------------------------------- 1 | ::: pygda.nn.dgsda_base 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 -------------------------------------------------------------------------------- /docs/nn/GNNBase.md: -------------------------------------------------------------------------------- 1 | ::: pygda.nn.gnn_base 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 -------------------------------------------------------------------------------- /docs/nn/GRADEBase.md: -------------------------------------------------------------------------------- 1 | ::: pygda.nn.grade_base 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 -------------------------------------------------------------------------------- /docs/nn/JHGDABase.md: -------------------------------------------------------------------------------- 1 | ::: pygda.nn.jhgda_base 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 -------------------------------------------------------------------------------- /docs/nn/KBLBase.md: -------------------------------------------------------------------------------- 1 | ::: pygda.nn.kbl_base 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 -------------------------------------------------------------------------------- /docs/nn/MixUpBase.md: -------------------------------------------------------------------------------- 1 | ::: pygda.nn.mixup_base 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 -------------------------------------------------------------------------------- /docs/nn/PPMIConv.md: -------------------------------------------------------------------------------- 1 | ::: pygda.nn.ppmi_conv 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | 
ignore_init_summary: true 7 | heading_level: 6 -------------------------------------------------------------------------------- /docs/nn/SAGDABase.md: -------------------------------------------------------------------------------- 1 | ::: pygda.nn.sagda_base 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 -------------------------------------------------------------------------------- /docs/nn/SOGABase.md: -------------------------------------------------------------------------------- 1 | ::: pygda.nn.soga_base 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 -------------------------------------------------------------------------------- /docs/utils/MMD.md: -------------------------------------------------------------------------------- 1 | ::: pygda.utils.mmd 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/datasets/MAG.md: -------------------------------------------------------------------------------- 1 | ::: pygda.datasets.mag 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/models/A2GNN.md: -------------------------------------------------------------------------------- 1 | ::: pygda.models.a2gnn 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/models/ACDNE.md: -------------------------------------------------------------------------------- 1 | ::: pygda.models.acdne 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/models/BaseGDA.md: -------------------------------------------------------------------------------- 1 | ::: pygda.models.base 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/models/CWGCN.md: -------------------------------------------------------------------------------- 1 | ::: pygda.models.cwgcn 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/models/DANE.md: -------------------------------------------------------------------------------- 1 | ::: pygda.models.dane 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/models/DGDA.md: -------------------------------------------------------------------------------- 1 | ::: pygda.models.dgda 2 | options: 3 | docstring_style: numpy 4 | 
show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/models/DGSDA.md: -------------------------------------------------------------------------------- 1 | ::: pygda.models.dgsda 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/models/DMGNN.md: -------------------------------------------------------------------------------- 1 | ::: pygda.models.dmgnn 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/models/GRADE.md: -------------------------------------------------------------------------------- 1 | ::: pygda.models.grade 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/models/JHGDA.md: -------------------------------------------------------------------------------- 1 | ::: pygda.models.jhgda 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/models/PairAlign.md: -------------------------------------------------------------------------------- 1 | ::: pygda.models.pa 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/models/SAGDA.md: -------------------------------------------------------------------------------- 1 | ::: pygda.models.sagda 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/models/SEPA.md: -------------------------------------------------------------------------------- 1 | ::: pygda.models.sepa 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/models/TDSS.md: -------------------------------------------------------------------------------- 1 | ::: pygda.models.tdss 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/nn/AdaGCNBase.md: -------------------------------------------------------------------------------- 1 | ::: pygda.nn.adagcn_base 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 -------------------------------------------------------------------------------- /docs/nn/GradReverse.md: -------------------------------------------------------------------------------- 1 | ::: 
pygda.nn.reverse_layer 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 -------------------------------------------------------------------------------- /docs/nn/PropGCNConv.md: -------------------------------------------------------------------------------- 1 | ::: pygda.nn.prop_gcn_conv 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 -------------------------------------------------------------------------------- /docs/nn/ReweightGNN.md: -------------------------------------------------------------------------------- 1 | ::: pygda.nn.reweight_gnn 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 -------------------------------------------------------------------------------- /docs/nn/UDAGCNBase.md: -------------------------------------------------------------------------------- 1 | ::: pygda.nn.udagcn_base 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 -------------------------------------------------------------------------------- /docs/datasets/Arxiv.md: -------------------------------------------------------------------------------- 1 | ::: pygda.datasets.arxiv 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/datasets/Blog.md: -------------------------------------------------------------------------------- 1 | ::: pygda.datasets.blog 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/datasets/Twitch.md: -------------------------------------------------------------------------------- 1 | ::: pygda.datasets.twitch 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/metrics/Metrics.md: -------------------------------------------------------------------------------- 1 | ::: pygda.metrics.metrics 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/models/AdaGCN.md: -------------------------------------------------------------------------------- 1 | ::: pygda.models.adagcn 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/models/GTrans.md: -------------------------------------------------------------------------------- 1 | ::: pygda.models.gtrans 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- 
/docs/models/GraphATA.md: -------------------------------------------------------------------------------- 1 | ::: pygda.models.graphata 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/models/GraphCTA.md: -------------------------------------------------------------------------------- 1 | ::: pygda.models.graphcta 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/models/SpecReg.md: -------------------------------------------------------------------------------- 1 | ::: pygda.models.specreg 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/models/StruRW.md: -------------------------------------------------------------------------------- 1 | ::: pygda.models.strurw 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/models/UDAGCN.md: -------------------------------------------------------------------------------- 1 | ::: pygda.models.udagcn 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/nn/CacheGCNConv.md: -------------------------------------------------------------------------------- 1 | ::: pygda.nn.cached_gcn_conv 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 -------------------------------------------------------------------------------- /docs/nn/DWPretrain.md: -------------------------------------------------------------------------------- 1 | ::: pygda.nn.deepwalk_pretrain 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 -------------------------------------------------------------------------------- /docs/nn/GMMClustering.md: -------------------------------------------------------------------------------- 1 | ::: pygda.nn.gmm_clustering 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 -------------------------------------------------------------------------------- /docs/nn/GraphATABase.md: -------------------------------------------------------------------------------- 1 | ::: pygda.nn.graphata_base 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 -------------------------------------------------------------------------------- /docs/nn/MixUpGCNConv.md: -------------------------------------------------------------------------------- 1 | ::: pygda.nn.mixup_gcnconv 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: 
true 7 | heading_level: 6 -------------------------------------------------------------------------------- /docs/utils/Perturb.md: -------------------------------------------------------------------------------- 1 | ::: pygda.utils.perturb 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/utils/Sampler.md: -------------------------------------------------------------------------------- 1 | ::: pygda.utils.sampler 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/utils/Utility.md: -------------------------------------------------------------------------------- 1 | ::: pygda.utils.utility 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/datasets/Airport.md: -------------------------------------------------------------------------------- 1 | ::: pygda.datasets.airport 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/datasets/Citation.md: -------------------------------------------------------------------------------- 1 | ::: pygda.datasets.citation 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/datasets/Elliptic.md: -------------------------------------------------------------------------------- 1 | ::: pygda.datasets.elliptic 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/datasets/Facebook.md: -------------------------------------------------------------------------------- 1 | ::: pygda.datasets.facebook 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/datasets/Squirrel.md: -------------------------------------------------------------------------------- 1 | ::: pygda.datasets.squirrel 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/datasets/TUGraph.md: -------------------------------------------------------------------------------- 1 | ::: pygda.datasets.tugraph 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/datasets/Twitter.md: -------------------------------------------------------------------------------- 1 | ::: 
pygda.datasets.twitter 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /docs/nn/NodeCentricConv.md: -------------------------------------------------------------------------------- 1 | ::: pygda.nn.node_centric_conv 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 -------------------------------------------------------------------------------- /docs/utils/SVDTransform.md: -------------------------------------------------------------------------------- 1 | ::: pygda.utils.svd_transform 2 | options: 3 | docstring_style: numpy 4 | show_source: false 5 | merge_init_into_class: true 6 | ignore_init_summary: true 7 | heading_level: 6 8 | -------------------------------------------------------------------------------- /pygda/__init__.py: -------------------------------------------------------------------------------- 1 | from . import datasets 2 | from . import models 3 | from . import metrics 4 | from . import utils 5 | from . import nn 6 | from .version import __version__ 7 | 8 | __all__ = ['datasets', 'models', 'metrics', 'utils', 'nn'] -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | 3 | This folder contains a wide range of examples using different GDA models. This README highlights some key examples. 4 | 5 | * For SpecReg, `svd_transform` is needed for dataset preprocessing, e.g., `source_dataset = CitationDataset(path, args.source, pre_transform=svd_transform)`. -------------------------------------------------------------------------------- /benchmark/graph/README.md: -------------------------------------------------------------------------------- 1 | # Benchmark 2 | Evaluation scripts for 7 methods on the three graph classification datasets. Each experiment is repeated 3 times. 3 | 4 | ## Datasets 5 | All datasets are accessible via the links provided in the data folder.
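For orientation, a minimal loading sketch is shown below. It is a sketch only: the constructor signature is assumed to mirror the other PyGDA dataset loaders (e.g., `CitationDataset(path, name)` from the examples README), and the paths and domain names (taken from the run scripts) are placeholders for wherever you extracted the archives.

```python
from pygda.datasets import GraphTUDataset

# Assumed layout: archives from the data folder extracted under data/;
# domain names follow the run scripts (e.g., Mutagenicity_M1 -> M2).
source_dataset = GraphTUDataset('data/Mutagenicity_M1', 'Mutagenicity_M1')
target_dataset = GraphTUDataset('data/Mutagenicity_M2', 'Mutagenicity_M2')
```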
6 | 7 | ## Run 8 | 9 | Run via 10 | ``` 11 | ./run_all_F.sh 12 | ./run_all_M.sh 13 | ./run_all_P.sh 14 | ``` -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | 3 | # Required 4 | version: 2 5 | 6 | build: 7 | os: ubuntu-22.04 8 | tools: 9 | python: "3.12" 10 | # You can also specify other tool versions: 11 | # nodejs: "19" 12 | # rust: "1.64" 13 | # golang: "1.19" 14 | 15 | mkdocs: 16 | configuration: mkdocs.yml 17 | 18 | python: 19 | install: 20 | - requirements: docs/requirements.txt -------------------------------------------------------------------------------- /pygda/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | from .metrics import eval_average_precision 2 | from .metrics import eval_macro_f1 3 | from .metrics import eval_micro_f1 4 | from .metrics import eval_precision_at_k 5 | from .metrics import eval_recall_at_k 6 | from .metrics import eval_roc_auc 7 | 8 | __all__ = [ 9 | 'eval_average_precision', 10 | 'eval_micro_f1', 11 | 'eval_macro_f1', 12 | 'eval_precision_at_k', 13 | 'eval_recall_at_k', 14 | 'eval_roc_auc' 15 | ] -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires=["flit_core >=3.2,<4"] 3 | build-backend="flit_core.buildapi" 4 | 5 | [project] 6 | name = "pygda" 7 | dynamic = ["version"] 8 | description = "A Python library for Graph Domain Adaptation" 9 | authors=[ 10 | {name="pygda-team"}, 11 | ] 12 | readme = "README.md" 13 | classifiers = [ 14 | "Programming Language :: Python :: 3", 15 | "License :: OSI Approved :: MIT License", 16 | "Operating System :: OS Independent" 17 | ] 18 | 19 | dependencies=[ 20 | "numpy", 21 | "scikit-learn", 22 | "scipy", 23 | "tqdm" 24 | ] 25 | 26 | [tool.setuptools] 27 | package-dir = {"" = "pygda"} 28 | 29 | [project.urls] 30 | Repository = "https://github.com/pygda-team/pygda" 31 | 32 | -------------------------------------------------------------------------------- /pygda/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .airport import AirportDataset 2 | from .arxiv import ArxivDataset 3 | from .blog import BlogDataset 4 | from .citation import CitationDataset 5 | from .elliptic import EllipticDataset 6 | from .facebook import FacebookDataset 7 | from .mag import MAGDataset 8 | from .squirrel import SquirrelDataset 9 | from .twitch import TwitchDataset 10 | from .twitter import TwitterDataset 11 | from .tugraph import GraphTUDataset 12 | from .webkb import WebKBDataset 13 | 14 | 15 | __all__ = [ 16 | "AirportDataset", 17 | "ArxivDataset", 18 | "BlogDataset", 19 | "CitationDataset", 20 | "EllipticDataset", 21 | "FacebookDataset", 22 | "MAGDataset", 23 | "SquirrelDataset", 24 | "TwitchDataset", 25 | "TwitterDataset", 26 | "GraphTUDataset", 27 | "WebKBDataset" 28 | ] -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 pygda-team 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including 
without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /docs/cheatsheet/Dataset Cheatsheet.md: -------------------------------------------------------------------------------- 1 | # Datasets Cheatsheet 2 | 3 | | Datasets | Domains | #Node | #Edge | #Class | 4 | |----------|---------|-------|-------|---------| 5 | | Airport | Brazil | 131 | 1,074 | 4 | 6 | | | Europe | 399 | 5,995 | | 7 | | | USA | 1,190 | 13,599 | | 8 | | Blog | Blog1 | 2,300 | 33,471 | 6 | 9 | | | Blog2 | 2,896 | 53,836 | | 10 | | Citation | ACMv9 | 9,360 | 15,556 | 5 | 11 | | | Citationv1 | 8,935 | 15,098 | | 12 | | | DBLPv7 | 5,484 | 8,117 | | 13 | | MAG | CN | 101,952 | 285,561 | 20 | 14 | | | DE | 43,032 | 126,683 | | 15 | | | FR | 29,262 | 78,222 | | 16 | | | JP | 37,498 | 90,944 | | 17 | | | RU | 32,833 | 67,994 | | 18 | | | US | 132,558 | 697,450 | | 19 | | Twitch | DE | 9,498 | 153,138 | 2 | 20 | | | EN | 7,126 | 35,324 | | 21 | | | ES | 4,648 | 59,382 | | 22 | | | FR | 6,549 | 112,666 | | 23 | | | PT | 1,912 | 31,299 | | 24 | | | RU | 4,385 | 37,304 | | 25 | | ogbn-arxiv | 1950-2016 | 69,499 | 237,163 | 40 | 26 | | | 2016-2018 | 51,241 | 111,754 | | 27 | | | 2018-2020 | 48,603 | 60,403 | | 28 | | TUGraph | Proteins | ~39.06 | ~72.82 | 2 | 29 | | | Mutagenicity | ~30.32 | ~30.77 | | 30 | | | Frankenstein | ~16.90 | ~17.88 | | -------------------------------------------------------------------------------- /pygda/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import BaseGDA 2 | from .udagcn import UDAGCN 3 | from .a2gnn import A2GNN 4 | from .grade import GRADE 5 | from .asn import ASN 6 | from .specreg import SpecReg 7 | from .gnn import GNN 8 | from .strurw import StruRW 9 | from .acdne import ACDNE 10 | from .dane import DANE 11 | from .adagcn import AdaGCN 12 | from .jhgda import JHGDA 13 | from .kbl import KBL 14 | from .dgda import DGDA 15 | from .sagda import SAGDA 16 | from .cwgcn import CWGCN 17 | from .dmgnn import DMGNN 18 | from .pa import PairAlign 19 | from .soga import SOGA 20 | from .gtrans import GTrans 21 | from .graphcta import GraphCTA 22 | from .graphata import GraphATA 23 | from .sepa import SEPA 24 | from .dgsda import DGSDA 25 | from .tdss import TDSS 26 | 27 | __all__ = [ 28 | "BaseGDA", 29 | "UDAGCN", 30 | "A2GNN", 31 | "GRADE", 32 | "ASN", 33 | "SpecReg", 34 | "GNN", 35 | "StruRW", 36 | "ACDNE", 37 | "DANE", 38 | "AdaGCN", 39 | "JHGDA", 40 | "KBL", 41 | "DGDA", 42 | "SAGDA", 43 | "CWGCN", 44 | "DMGNN", 45 | "PairAlign", 46 | "SOGA", 47 | "GTrans", 48 | "GraphCTA", 49 | "GraphATA", 50 | "SEPA", 51 | "DGSDA", 52 | "TDSS" 53
| ] -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | permissions: 16 | contents: read 17 | 18 | jobs: 19 | deploy: 20 | 21 | runs-on: ubuntu-latest 22 | 23 | steps: 24 | - uses: actions/checkout@v4 25 | - name: Set up Python 26 | uses: actions/setup-python@v3 27 | with: 28 | python-version: '3.x' 29 | - name: Install dependencies 30 | run: | 31 | python3 -m pip install --upgrade pip 32 | python3 -m pip install --upgrade build 33 | python3 -m pip install --user --upgrade twine 34 | - name: Build package 35 | run: | 36 | python3 -m build 37 | - name: Publish package 38 | uses: pypa/gh-action-pypi-publish@release/v1 39 | with: 40 | password: ${{ secrets.PYPI_API_TOKEN }} 41 | -------------------------------------------------------------------------------- /docs/cheatsheet/Model Cheatsheet.md: -------------------------------------------------------------------------------- 1 | # Model Cheatsheet 2 | 3 | | Num | Methods | Settings | Supported Tasks | 4 | |-----|----------|----------|-----------------| 5 | | 1 | Vanilla GNN | No-adaptation | Node/Graph Level | 6 | | 2 | DANE | Source-needed | Node/Graph Level | 7 | | 3 | ACDNE | Source-needed | Node Level | 8 | | 4 | UDAGCN | Source-needed | Node/Graph Level | 9 | | 5 | ASN | Source-needed | Node Level | 10 | | 6 | AdaGCN | Source-needed | Node/Graph Level | 11 | | 7 | GRADE | Source-needed | Node/Graph Level | 12 | | 8 | SpecReg | Source-needed | Node Level | 13 | | 9 | StruRW | Source-needed | Node Level | 14 | | 10 | JHGDA | Source-needed | Node Level | 15 | | 11 | KBL | Source-needed | Node Level | 16 | | 12 | WGCNN | Source-needed | Node Level | 17 | | 13 | CWGCN | Source-needed | Node/Graph Level | 18 | | 14 | SAGDA | Source-needed | Node/Graph Level | 19 | | 15 | GTrans | Source-free | Node Level | 20 | | 16 | DGDA | Source-needed | Node Level | 21 | | 17 | A2GNN | Source-needed | Node/Graph Level | 22 | | 18 | PairAlign | Source-needed | Node Level | 23 | | 19 | SEPA | Source-needed | Node Level | 24 | | 20 | SOGA | Source-free | Node Level | 25 | | 21 | GraphCTA | Source-free | Node Level | 26 | | 22 | TDSS | Source-needed| Node Level | 27 | | 23 | GraphATA | Multi-Source-free | Node/Graph Level | 28 | | 24 | DGSDA | Source-needed | Node Level | -------------------------------------------------------------------------------- /pygda/nn/attention.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class Attention(nn.Module): 7 | """ 8 | Attention mechanism for feature aggregation. 9 | 10 | Parameters 11 | ---------- 12 | in_channels : int 13 | Input feature dimension. 
14 | 15 | Notes 16 | ----- 17 | - Implements learnable attention weights 18 | - Uses softmax normalization 19 | - Includes dropout regularization 20 | - Single-head attention mechanism 21 | """ 22 | 23 | def __init__(self, in_channels): 24 | super().__init__() 25 | self.dense_weight = nn.Linear(in_channels, 1) 26 | self.dropout = nn.Dropout(0.1) 27 | 28 | def forward(self, inputs): 29 | """ 30 | Apply attention mechanism to input features. 31 | 32 | Parameters 33 | ---------- 34 | inputs : list[torch.Tensor] 35 | List of input tensors to be attended. 36 | 37 | Returns 38 | ------- 39 | torch.Tensor 40 | Attention-weighted feature aggregation. 41 | 42 | Notes 43 | ----- 44 | Process: 45 | 46 | 1. Stack input tensors 47 | 2. Compute attention weights 48 | 3. Apply softmax normalization 49 | 4. Weighted sum of features 50 | """ 51 | 52 | stacked = torch.stack(inputs, dim=1) 53 | weights = F.softmax(self.dense_weight(stacked), dim=1) 54 | outputs = torch.sum(stacked * weights, dim=1) 55 | return outputs 56 | -------------------------------------------------------------------------------- /pygda/nn/__init__.py: -------------------------------------------------------------------------------- 1 | from .cached_gcn_conv import CachedGCNConv 2 | from .ppmi_conv import PPMIConv 3 | from .attention import Attention 4 | from .udagcn_base import UDAGCNBase 5 | from .reverse_layer import GradReverse 6 | from .prop_gcn_conv import PropGCNConv 7 | from .a2gnn_base import A2GNNBase 8 | from .grade_base import GRADEBase 9 | from .asn_base import ASNBase 10 | from .gnn_base import GNNBase 11 | from .mixup_gcnconv import MixUpGCNConv 12 | from .mixup_base import MixupBase 13 | from .acdne_base import ACDNEBase 14 | from .adagcn_base import AdaGCNBase 15 | from .gmm_clustering import GMMClustering 16 | from .jhgda_base import JHGDABase 17 | from .kbl_base import KBLBase 18 | from .dgda_base import DGDABase 19 | from .deepwalk_pretrain import DWPretrain 20 | from .sagda_base import SAGDABase 21 | from .cwgcn_base import CWGCNBase 22 | from .reweight_gnn import ReweightGNN 23 | from .soga_base import SOGABase 24 | from .node_centric_conv import NodeCentricConv, NodeCentricMLP 25 | from .graphata_base import GraphATABase 26 | from .dgsda_base import DGSDABase 27 | 28 | 29 | __all__ = [ 30 | "CachedGCNConv", 31 | "PPMIConv", 32 | "Attention", 33 | "UDAGCNBase", 34 | "GradReverse", 35 | "PropGCNConv", 36 | "A2GNNBase", 37 | "GRADEBase", 38 | "ASNBase", 39 | "GNNBase", 40 | "MixUpGCNConv", 41 | "MixupBase", 42 | "ACDNEBase", 43 | "AdaGCNBase", 44 | "GMMClustering", 45 | "JHGDABase", 46 | "KBLBase", 47 | "DGDABase", 48 | "DWPretrain", 49 | "SAGDABase", 50 | "CWGCNBase", 51 | "ReweightGNN", 52 | "SOGABase", 53 | "NodeCentricConv", 54 | "NodeCentricMLP", 55 | "GraphATABase", 56 | "DGSDABase" 57 | ] 58 | -------------------------------------------------------------------------------- /pygda/nn/reverse_layer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class GradReverse(torch.autograd.Function): 5 | """ 6 | Gradient Reversal Layer for adversarial training. 7 | 8 | Implements a custom autograd function that: 9 | 10 | - Forward: Identity operation 11 | - Backward: Reverses and scales gradients 12 | 13 | """ 14 | 15 | @staticmethod 16 | def forward(ctx, x, alpha): 17 | """ 18 | Forward pass of gradient reversal. 19 | 20 | Parameters 21 | ---------- 22 | ctx : torch.autograd.function.Context 23 | Context object for storing variables for backward. 
24 | x : torch.Tensor 25 | Input tensor. 26 | alpha : float 27 | Gradient scaling factor. 28 | 29 | Returns 30 | ------- 31 | torch.Tensor 32 | Input tensor without modification. 33 | 34 | Notes 35 | ----- 36 | Identity operation in forward pass, stores alpha for backward. 37 | """ 38 | ctx.alpha = alpha 39 | return x.view_as(x) 40 | 41 | @staticmethod 42 | def backward(ctx, grad_output): 43 | """ 44 | Backward pass of gradient reversal. 45 | 46 | Parameters 47 | ---------- 48 | ctx : torch.autograd.function.Context 49 | Context object containing saved alpha. 50 | grad_output : torch.Tensor 51 | Gradient from subsequent layer. 52 | 53 | Returns 54 | ------- 55 | tuple 56 | Contains: 57 | - torch.Tensor: Reversed and scaled gradient 58 | - None: For alpha parameter (not needed) 59 | 60 | Notes 61 | ----- 62 | Implements gradient reversal: 63 | grad = -alpha * grad_output 64 | """ 65 | grad_output = grad_output.neg() * ctx.alpha 66 | return grad_output, None 67 | -------------------------------------------------------------------------------- /benchmark/llm/README.md: -------------------------------------------------------------------------------- 1 | # Benchmark 2 | Evaluation scripts for 5 methods on the ogbn-arxiv dataset with LLM predictions and explanations. Each experiment is repeated 3 times. 3 | 4 | ## LLM as Feature Encoder 5 | To investigate whether the distribution gap narrows when an LLM is used as the feature encoder, we adopt the prompts from TAPE (ICLR 2024, Explanations as Features: LLM-Based Features for Text-Attributed Graphs), which allows us to assess the impact of LLM-based features on model performance. 6 | 7 | The dataset is chronologically divided into 3 groups according to the publication years of the papers. We construct 3 graphs encompassing papers published before 2016 (Group A), in 2016-2018 (Group B), and in 2018-2020 (Group C). 8 | 9 | ### Dataset Preprocessing 10 | - Original node attributes, obtained by averaging the word2vec embeddings of the words in each paper's title and abstract (a minimal sketch of this averaging step is given below). 11 | ``` 12 | python origin_preprocess.py 13 | ``` 14 | - LLM-enhanced text with word2vec embeddings: the title, abstract, and LLM-generated predictions and explanations are combined into a single input, which is then fed into word2vec. The node features are then obtained by averaging the embeddings of the words in this combined input. 15 | ``` 16 | python llm_w2v_preprocess.py 17 | ``` 18 | - LLM-enhanced text with BERT embeddings: the title, abstract, and LLM-generated predictions and explanations are combined into a single input, which is then fed into a pretrained DeBERTa. The node features are then taken from the sentence embedding. **Note that we did not fine-tune DeBERTa as in the TAPE paper, since we study unsupervised graph domain adaptation**. 19 | ``` 20 | python llm_bert_preprocess.py 21 | ``` 22 | 23 | ### Data Download 24 | - ogbn-arxiv. [OGB](https://ogb.stanford.edu/docs/nodeprop/) provides the mapping from MAG paper IDs to the raw texts of titles and abstracts. Download the title and abstract data [here](https://snap.stanford.edu/ogb/data/misc/ogbn_arxiv/titleabs.tsv.gz). 25 | - LLM responses. Download the LLM response data [here](https://drive.google.com/file/d/1A6mZSFzDIhJU795497R6mAAM2Y9qutI5/view?usp=sharing) from the TAPE paper.
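For reference, here is a minimal sketch of the word2vec averaging step mentioned above. It is illustrative only: the toy corpus, the vector size of 128, and the `paper_embedding` helper are assumptions, not the exact contents of the preprocessing scripts.

```python
import numpy as np
from gensim.models import Word2Vec

# Illustrative corpus: one token list per paper (title + abstract tokens,
# optionally followed by the LLM-generated prediction/explanation tokens).
corpus = [
    ["graph", "domain", "adaptation", "survey"],
    ["node", "classification", "with", "graph", "neural", "networks"],
]

# Train word2vec on the combined texts (vector_size=128 is an assumption).
model = Word2Vec(sentences=corpus, vector_size=128, window=5, min_count=1)

def paper_embedding(tokens):
    # A paper's feature vector is the mean of its word embeddings.
    vecs = [model.wv[t] for t in tokens if t in model.wv]
    return np.mean(vecs, axis=0) if vecs else np.zeros(model.vector_size)

features = np.stack([paper_embedding(doc) for doc in corpus])
```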
26 | 27 | ## Run 28 | 29 | Run via 30 | ``` 31 | ./run1.sh 32 | ./run2.sh 33 | ./run3.sh 34 | ``` -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with Python 3.8 3 | # by the following command: 4 | # 5 | # pip-compile docs/requirements.in 6 | # 7 | astunparse==1.6.3 8 | # via griffe 9 | click==8.1.7 10 | # via 11 | # mkdocs 12 | # mkdocstrings 13 | colorama==0.4.6 14 | # via griffe 15 | ghp-import==2.1.0 16 | # via mkdocs 17 | griffe==0.44.0 18 | # via mkdocstrings-python 19 | importlib-metadata==7.1.0 20 | # via 21 | # markdown 22 | # mkdocs 23 | # mkdocs-get-deps 24 | # mkdocstrings 25 | jinja2>=3.1.5 26 | # via 27 | # mkdocs 28 | # mkdocstrings 29 | markdown==3.6 30 | # via 31 | # markdown-include 32 | # mkdocs 33 | # mkdocs-autorefs 34 | # mkdocstrings 35 | # pymdown-extensions 36 | markdown-include==0.8.1 37 | # via -r docs/requirements.in 38 | markupsafe==2.1.5 39 | # via 40 | # jinja2 41 | # mkdocs 42 | # mkdocs-autorefs 43 | # mkdocstrings 44 | mergedeep==1.3.4 45 | # via 46 | # mkdocs 47 | # mkdocs-get-deps 48 | mkdocs==1.6.0 49 | # via 50 | # -r docs/requirements.in 51 | # mkdocs-autorefs 52 | # mkdocstrings 53 | mkdocs-autorefs==1.0.1 54 | # via mkdocstrings 55 | mkdocs-get-deps==0.2.0 56 | # via mkdocs 57 | mkdocstrings[python]==0.25.1 58 | # via 59 | # -r docs/requirements.in 60 | # mkdocstrings-python 61 | mkdocstrings-python==1.10.0 62 | # via mkdocstrings 63 | packaging==24.0 64 | # via mkdocs 65 | pathspec==0.12.1 66 | # via mkdocs 67 | platformdirs==4.2.1 68 | # via 69 | # mkdocs-get-deps 70 | # mkdocstrings 71 | pymdown-extensions==10.8.1 72 | # via mkdocstrings 73 | python-dateutil==2.9.0.post0 74 | # via ghp-import 75 | pyyaml==6.0.1 76 | # via 77 | # mkdocs 78 | # mkdocs-get-deps 79 | # pymdown-extensions 80 | # pyyaml-env-tag 81 | pyyaml-env-tag==0.1 82 | # via mkdocs 83 | six==1.16.0 84 | # via 85 | # astunparse 86 | # python-dateutil 87 | typing-extensions==4.11.0 88 | # via mkdocstrings 89 | watchdog==4.0.0 90 | # via mkdocs 91 | wheel==0.43.0 92 | # via astunparse 93 | zipp==3.19.1 94 | # via importlib-metadata 95 | -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | ## Datasets 2 | * **Airport**: It has 3 different domains, i.e., Brazil, Europe and USA. They are adopted from [struc2vec](https://arxiv.org/abs/1704.03165) and can be downloaded [here](https://drive.google.com/drive/folders/1zlluWoeukD33ZxwaTRQi3jCdD0qC-I2j?usp=share_link). The graph processing can be found at ``AirportDataset``. We utilize ``OneHotDegree`` to construct the features for each node. 3 | * **Blog**: It has 2 different domains, i.e., Blog1 and Blog2. They are adopted from [ACDNE](https://arxiv.org/abs/2002.07366) and can be downloaded [here](https://drive.google.com/drive/folders/1jKKG0o7rEY-BaVEjBhuGijzwwhU0M-pQ?usp=share_link). The graph processing can be found at ``BlogDataset``. 4 | * **Citation**: It has 3 different domains, i.e., ACMv9, Citationv1 and DBLPv7. They are adopted from [ASN](https://dl.acm.org/doi/abs/10.1145/3459637.3482228) and can be downloaded [here](https://drive.google.com/drive/folders/1ntNt3qHE4p9Us8Re9tZDaB-tdtqwV8AX?usp=share_link). The graph processing can be found at ``CitationDataset``.
5 | * **MAG**: The MAG dataset is originally from the ogbn-mag dataset, and [PairAlign](https://arxiv.org/abs/2403.01092) separates it into 6 countries, including CN, DE, FR, JP, RU, and US. The data can be downloaded [here](https://drive.google.com/drive/folders/1HinhjpNPPivyqoubiYOr8X2jq-rjw3e9?usp=share_link) and the graph processing can be found at ``MAGDataset``. 6 | * **Twitch**: It has 6 different domains, i.e., DE, EN, ES, FR, PT and RU. They are adopted from the [Twitch Social Networks](https://github.com/benedekrozemberczki/datasets?tab=readme-ov-file#twitch-social-networks) and can be downloaded [here](https://drive.google.com/drive/folders/1GWMyyJOZ4CeeqP_H5dCA5voSQHT0WlXG?usp=share_link). The graph processing can be found at ``TwitchDataset``. 7 | * **PROTEINS**, **FRANKENSTEIN** and **Mutagenicity**: Each has 2 domains based on graph density. They are adopted from [TUDataset](https://chrsmrrs.github.io/datasets/docs/datasets/) and can be downloaded [here](https://drive.google.com/drive/folders/1NbPK71Dy0ulH3CdNyfvMwQECj_Oh867I?usp=sharing). The graph processing can be found at ``GraphTUDataset``. 8 | * **Arxiv**: It has 3 domains based on the publication years of the papers. They are adopted from [ogbn-arxiv](https://ogb.stanford.edu/docs/nodeprop/#ogbn-arxiv) and can be preprocessed with the scripts in the benchmark folder. The graph processing can be found at ``ArxivDataset``. -------------------------------------------------------------------------------- /docs/models/Overview.md: -------------------------------------------------------------------------------- 1 | # Models Overview 2 | 3 | This section provides detailed documentation for all supported models in PyGDA. Our framework offers a comprehensive collection of graph domain adaptation models, built on a flexible and extensible architecture. 4 | 5 | ### Core Architecture 6 | 7 | #### [BaseGDA](BaseGDA.md) 8 | The foundation of PyGDA's model architecture, providing: 9 | 10 | - Base class for all graph domain adaptation models 11 | - Core training and inference functionalities 12 | - Standardized interfaces for model customization 13 | - Common utility methods and configurations 14 | 15 | ### Customization Guide 16 | 17 | PyGDA is designed for easy customization and extension. To create your own model: 18 | 19 | ```python 20 | from pygda.models import BaseGDA 21 | 22 | class CustomGDA(BaseGDA): 23 | def __init__(self, **kwargs): 24 | super().__init__(**kwargs) 25 | # Initialize your model components 26 | 27 | def fit(self, data): 28 | # Implement your training logic 29 | pass 30 | 31 | def predict(self, data): 32 | # Implement your inference logic 33 | return predictions 34 | ``` 35 | 36 | ### Key Features 37 | 38 | 1. **Flexible Base Architecture** 39 | 40 | - Inherit from `BaseGDA` for consistent interface 41 | - Access to core functionalities and utilities 42 | - Standardized training and evaluation methods 43 | 44 | 2. **Easy Training Process** 45 | 46 | - Use `fit()` method for model training 47 | - Support for custom hyperparameters 48 | - Flexible dataset input handling 49 | - Built-in optimization utilities 50 | 51 | 3. **Streamlined Evaluation** 52 | 53 | - Simple `predict()` interface 54 | - Standardized performance metrics 55 | - Easy integration with evaluation pipelines 56 | 57 | 4.
**Extensibility** 58 | 59 | - Create custom model architectures 60 | - Add new training strategies 61 | - Implement domain-specific features 62 | - Integrate with existing PyGDA components 63 | 64 | ### Usage Example 65 | 66 | ```python 67 | from pygda.models import A2GNN 68 | 69 | # Initialize model 70 | model = A2GNN(in_dim=100, hidden_dim=64, num_classes=7) 71 | 72 | # Train model 73 | model.fit(train_data) 74 | 75 | # Make predictions 76 | predictions = model.predict(test_data) 77 | ``` 78 | 79 | For detailed information about each model, please visit their respective documentation pages linked above. -------------------------------------------------------------------------------- /benchmark/node/parser.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | filename = 'results.txt' 5 | 6 | f = open(filename, 'r') 7 | lines = f.readlines() 8 | f.close() 9 | 10 | dataDict = dict() 11 | 12 | for line in lines: 13 | elements = line.strip('\n\r').split(',') 14 | name = elements[0] 15 | source = elements[2] 16 | target = elements[4] 17 | micro_f1 = eval(elements[6]) 18 | macro_f1 = eval(elements[8]) 19 | auc = eval(elements[10]) 20 | if name not in dataDict: 21 | dataDict[name] = dict() 22 | if (source, target) not in dataDict[name]: 23 | dataDict[name][(source, target)] = [[micro_f1, macro_f1, auc]] 24 | else: 25 | dataDict[name][(source, target)].append([micro_f1, macro_f1, auc]) 26 | else: 27 | if (source, target) not in dataDict[name]: 28 | dataDict[name][(source, target)] = [[micro_f1, macro_f1, auc]] 29 | else: 30 | dataDict[name][(source, target)].append([micro_f1, macro_f1, auc]) 31 | 32 | print('source target mean std:') 33 | 34 | for k, v in dataDict.items(): 35 | print(k) 36 | for st, value in v.items(): 37 | value = np.array(value) 38 | mean_v = np.mean(value, axis=0) 39 | std_v = np.std(value, axis=0) 40 | print(st, mean_v, std_v) 41 | 42 | 43 | # Create a pandas DataFrame from the nested dictionary 44 | data = [] 45 | 46 | # Collect all (src, tgt) pairs 47 | src_tgt_pairs = set() 48 | for model_results in dataDict.values(): 49 | src_tgt_pairs.update(model_results.keys()) 50 | 51 | # Sort the pairs for consistent ordering 52 | src_tgt_pairs = sorted(src_tgt_pairs) 53 | 54 | # Build the data for the DataFrame 55 | for model, model_results in dataDict.items(): 56 | row = {'Model': model} 57 | for pair in src_tgt_pairs: 58 | if pair in model_results: 59 | metrics = np.array(model_results[pair]) * 100 # Multiply by 100 60 | mean_metrics = np.mean(metrics, axis=0) 61 | std_metrics = np.std(metrics, axis=0) 62 | mean_metrics_rounded = np.round(mean_metrics, 2) 63 | std_metrics_rounded = np.round(std_metrics, 2) 64 | row[pair] = f"{mean_metrics_rounded} +/- {std_metrics_rounded}" 65 | else: 66 | row[pair] = "N/A" # Handle cases where there are no results for this pair 67 | data.append(row) 68 | 69 | # Create the DataFrame 70 | df = pd.DataFrame(data) 71 | 72 | # Set the 'Model' column as the index 73 | df.set_index('Model', inplace=True) 74 | 75 | # Optionally, save the DataFrame to a CSV file 76 | df.to_csv('csv_results.csv') -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: PyGDA 2 | theme: readthedocs 3 | 4 | plugins: 5 | - search 6 | - mkdocstrings 7 | 8 | nav: 9 | - GET STARTED: index.md 10 | - Package Reference: 11 | - pygda.nn: 12 | - nn/A2GNNBase.md 13 | - 
-------------------------------------------------------------------------------- /benchmark/node/parser.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | filename = 'results.txt' 5 | 6 | with open(filename, 'r') as f: 7 | lines = f.readlines() 8 | 9 | 10 | dataDict = dict() 11 | 12 | for line in lines: 13 | elements = line.strip('\n\r').split(',') # one run per line: model name followed by label/value pairs 14 | name = elements[0] 15 | source = elements[2] 16 | target = elements[4] 17 | micro_f1 = float(elements[6]) 18 | macro_f1 = float(elements[8]) 19 | auc = float(elements[10]) 20 | if name not in dataDict: 21 | dataDict[name] = dict() 22 | if (source, target) not in dataDict[name]: 23 | dataDict[name][(source, target)] = [[micro_f1, macro_f1, auc]] 24 | else: 25 | dataDict[name][(source, target)].append([micro_f1, macro_f1, auc]) 26 | else: 27 | if (source, target) not in dataDict[name]: 28 | dataDict[name][(source, target)] = [[micro_f1, macro_f1, auc]] 29 | else: 30 | dataDict[name][(source, target)].append([micro_f1, macro_f1, auc]) 31 | 32 | print('source target mean std:') 33 | 34 | for k, v in dataDict.items(): 35 | print(k) 36 | for st, value in v.items(): 37 | value = np.array(value) 38 | mean_v = np.mean(value, axis=0) 39 | std_v = np.std(value, axis=0) 40 | print(st, mean_v, std_v) 41 | 42 | 43 | # Create a pandas DataFrame from the nested dictionary 44 | data = [] 45 | 46 | # Collect all (src, tgt) pairs 47 | src_tgt_pairs = set() 48 | for model_results in dataDict.values(): 49 | src_tgt_pairs.update(model_results.keys()) 50 | 51 | # Sort the pairs for consistent ordering 52 | src_tgt_pairs = sorted(src_tgt_pairs) 53 | 54 | # Build the data for the DataFrame 55 | for model, model_results in dataDict.items(): 56 | row = {'Model': model} 57 | for pair in src_tgt_pairs: 58 | if pair in model_results: 59 | metrics = np.array(model_results[pair]) * 100 # Multiply by 100 60 | mean_metrics = np.mean(metrics, axis=0) 61 | std_metrics = np.std(metrics, axis=0) 62 | mean_metrics_rounded = np.round(mean_metrics, 2) 63 | std_metrics_rounded = np.round(std_metrics, 2) 64 | row[pair] = f"{mean_metrics_rounded} +/- {std_metrics_rounded}" 65 | else: 66 | row[pair] = "N/A" # Handle cases where there are no results for this pair 67 | data.append(row) 68 | 69 | # Create the DataFrame 70 | df = pd.DataFrame(data) 71 | 72 | # Set the 'Model' column as the index 73 | df.set_index('Model', inplace=True) 74 | 75 | # Optionally, save the DataFrame to a CSV file 76 | df.to_csv('csv_results.csv') -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: PyGDA 2 | theme: readthedocs 3 | 4 | plugins: 5 | - search 6 | - mkdocstrings 7 | 8 | nav: 9 | - GET STARTED: index.md 10 | - Package Reference: 11 | - pygda.nn: 12 | - nn/A2GNNBase.md 13 | - nn/ACDNEBase.md 14 | - nn/AdaGCNBase.md 15 | - nn/ASNBase.md 16 | - nn/Attention.md 17 | - nn/CWGCNBase.md 18 | - nn/CacheGCNConv.md 19 | - nn/DGDABase.md 20 | - nn/DGSDABase.md 21 | - nn/DWPretrain.md 22 | - nn/GMMClustering.md 23 | - nn/GNNBase.md 24 | - nn/GRADEBase.md 25 | - nn/GradReverse.md 26 | - nn/GraphATABase.md 27 | - nn/JHGDABase.md 28 | - nn/KBLBase.md 29 | - nn/MixUpBase.md 30 | - nn/MixUpGCNConv.md 31 | - nn/NodeCentricConv.md 32 | - nn/PPMIConv.md 33 | - nn/PropGCNConv.md 34 | - nn/ReweightGNN.md 35 | - nn/SAGDABase.md 36 | - nn/SOGABase.md 37 | - nn/UDAGCNBase.md 38 | - pygda.models: 39 | - models/Overview.md 40 | - models/BaseGDA.md 41 | - models/GNN.md 42 | - models/DANE.md 43 | - models/ACDNE.md 44 | - models/UDAGCN.md 45 | - models/ASN.md 46 | - models/AdaGCN.md 47 | - models/GRADE.md 48 | - models/SpecReg.md 49 | - models/StruRW.md 50 | - models/JHGDA.md 51 | - models/KBL.md 52 | - models/DMGNN.md 53 | - models/CWGCN.md 54 | - models/SAGDA.md 55 | - models/DGDA.md 56 | - models/A2GNN.md 57 | - models/PairAlign.md 58 | - models/GTrans.md 59 | - models/SOGA.md 60 | - models/SEPA.md 61 | - models/GraphCTA.md 62 | - models/TDSS.md 63 | - models/GraphATA.md 64 | - models/DGSDA.md 65 | - pygda.datasets: 66 | - datasets/Overview.md 67 | - datasets/Airport.md 68 | - datasets/Arxiv.md 69 | - datasets/Blog.md 70 | - datasets/Citation.md 71 | - datasets/Elliptic.md 72 | - datasets/Facebook.md 73 | - datasets/MAG.md 74 | - datasets/Squirrel.md 75 | - datasets/TUGraph.md 76 | - datasets/Twitch.md 77 | - datasets/Twitter.md 78 | - pygda.metrics: 79 | - metrics/Metrics.md 80 | - pygda.utils: 81 | - utils/MMD.md 82 | - utils/Perturb.md 83 | - utils/SVDTransform.md 84 | - utils/Sampler.md 85 | - utils/Utility.md 86 | - Benchmarks: 87 | - benchmark/Overview.md 88 | - Cheatsheets: 89 | - cheatsheet/Dataset Cheatsheet.md 90 | - cheatsheet/Model Cheatsheet.md 91 | - External Resources: 92 | - resources/Resources.md 93 | 94 | extra_css: 95 | - assets/css/custom.css 96 | -------------------------------------------------------------------------------- /benchmark/graph/run_all_M.sh: -------------------------------------------------------------------------------- 1 | python grade.py --source 'Mutagenicity_M1' --target 'Mutagenicity_M2' --nhid 128 --num_layers 5 --lr 0.001 --weight_decay 0.001 --epochs 200 --dropout_ratio 0.5 --weight 0.005 --filename 'results-M.txt' 2 | python adagcn.py --source 'Mutagenicity_M1' --target 'Mutagenicity_M2' --nhid 128 --num_layers 2 --lr 0.01 --weight_decay 0.01 --epochs 400 --dropout_ratio 0.4 --domain_weight 0.1 --filename 'results-M.txt' 3 | python udagcn.py --source 'Mutagenicity_M1' --target 'Mutagenicity_M2' --nhid 128 --num_layers 2 --lr 0.0001 --weight_decay 0.001 --epochs 400 --dropout_ratio 0.4 --filename 'results-M.txt' 4 | python a2gnn.py --source 'Mutagenicity_M1' --target 'Mutagenicity_M2' --nhid 128 --num_layers 2 --lr 0.01 --weight_decay 0.005 --epochs 200 --dropout_ratio 0.5 --s_pnums 0 --t_pnums 10 --weight 10 --filename 'results-M.txt' 5 | python cwgcn.py --source 'Mutagenicity_M1' --target 'Mutagenicity_M2' --nhid 128 --num_layers 2 --lr 0.0001 --weight_decay 0.0001 --epochs 200 --filename 'results-M.txt' 6 | python dane.py --source 'Mutagenicity_M1' --target 'Mutagenicity_M2' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.001 --epochs 200 --filename 'results-M.txt' 7 | python sagda.py --source 'Mutagenicity_M1' --target 'Mutagenicity_M2' --nhid 128 --num_layers 1 --lr 0.001 --weight_decay 0.001 --epochs 200 --adv_dim 40 --filename 
'results-M.txt' 8 | 9 | python grade.py --source 'Mutagenicity_M2' --target 'Mutagenicity_M1' --nhid 128 --num_layers 5 --lr 0.001 --weight_decay 0.001 --epochs 200 --dropout_ratio 0.5 --weight 0.005 --filename 'results-M.txt' 10 | python adagcn.py --source 'Mutagenicity_M2' --target 'Mutagenicity_M1' --nhid 128 --num_layers 2 --lr 0.01 --weight_decay 0.01 --epochs 400 --dropout_ratio 0.4 --domain_weight 0.1 --filename 'results-M.txt' 11 | python udagcn.py --source 'Mutagenicity_M2' --target 'Mutagenicity_M1' --nhid 128 --num_layers 2 --lr 0.0001 --weight_decay 0.001 --epochs 400 --dropout_ratio 0.4 --filename 'results-M.txt' 12 | python a2gnn.py --source 'Mutagenicity_M2' --target 'Mutagenicity_M1' --nhid 128 --num_layers 2 --lr 0.01 --weight_decay 0.005 --epochs 200 --dropout_ratio 0.5 --s_pnums 0 --t_pnums 10 --weight 10 --filename 'results-M.txt' 13 | python cwgcn.py --source 'Mutagenicity_M2' --target 'Mutagenicity_M1' --nhid 128 --num_layers 2 --lr 0.0001 --weight_decay 0.0001 --epochs 200 --filename 'results-M.txt' 14 | python dane.py --source 'Mutagenicity_M2' --target 'Mutagenicity_M1' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.001 --epochs 200 --filename 'results-M.txt' 15 | python sagda.py --source 'Mutagenicity_M2' --target 'Mutagenicity_M1' --nhid 128 --num_layers 1 --lr 0.001 --weight_decay 0.001 --epochs 200 --adv_dim 40 --filename 'results-M.txt' -------------------------------------------------------------------------------- /benchmark/graph/parser.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | filename = 'results-F.txt' 5 | 6 | with open(filename, 'r') as f: 7 | lines = f.readlines() 8 | 9 | 10 | dataDict = dict() 11 | 12 | for line in lines: 13 | elements = line.strip('\n\r').split(',') # one run per line: model name followed by label/value pairs 14 | name = elements[0] 15 | source = elements[2] 16 | target = elements[4] 17 | micro_f1 = float(elements[6]) 18 | macro_f1 = float(elements[8]) 19 | auc = float(elements[10]) 20 | if name not in dataDict: 21 | dataDict[name] = dict() 22 | if (source, target) not in dataDict[name]: 23 | dataDict[name][(source, target)] = [[micro_f1, macro_f1, auc]] 24 | else: 25 | dataDict[name][(source, target)].append([micro_f1, macro_f1, auc]) 26 | else: 27 | if (source, target) not in dataDict[name]: 28 | dataDict[name][(source, target)] = [[micro_f1, macro_f1, auc]] 29 | else: 30 | dataDict[name][(source, target)].append([micro_f1, macro_f1, auc]) 31 | 32 | print('source target mean std:') 33 | 34 | for k, v in dataDict.items(): 35 | print(k) 36 | for st, value in v.items(): 37 | value = np.array(value) 38 | mean_v = np.mean(value, axis=0) 39 | std_v = np.std(value, axis=0) 40 | print(st, mean_v, std_v) 41 | 42 | 43 | # Create a pandas DataFrame from the nested dictionary 44 | data = [] 45 | 46 | # Collect all (src, tgt) pairs 47 | src_tgt_pairs = set() 48 | for model_results in dataDict.values(): 49 | src_tgt_pairs.update(model_results.keys()) 50 | 51 | # Sort the pairs for consistent ordering 52 | src_tgt_pairs = sorted(src_tgt_pairs) 53 | 54 | # Build the data for the DataFrame 55 | for model, model_results in dataDict.items(): 56 | row = {'Model': model} 57 | for pair in src_tgt_pairs: 58 | if pair in model_results: 59 | metrics = np.array(model_results[pair]) * 100 # Multiply by 100 60 | mean_metrics = np.mean(metrics, axis=0) 61 | std_metrics = np.std(metrics, axis=0) 62 | mean_metrics_rounded = np.round(mean_metrics, 2) 63 | std_metrics_rounded = np.round(std_metrics, 2) 64 | row[pair] = 
f"{mean_metrics_rounded} +/- {std_metrics_rounded}" 65 | else: 66 | # row[pair] = "N/A" # Handle cases where there are no results for this pair 67 | row[pair] = f"[0 0 0] +/- [0 0 0]" 68 | data.append(row) 69 | 70 | # Create the DataFrame 71 | df = pd.DataFrame(data) 72 | 73 | # Set the 'Model' column as the index 74 | df.set_index('Model', inplace=True) 75 | 76 | # Display the DataFrame 77 | # print(df) 78 | 79 | # Optionally, save the DataFrame to a CSV file 80 | df.to_csv('results-F.csv') -------------------------------------------------------------------------------- /benchmark/llm/parser.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | filename = 'results-llm-3.txt' 5 | 6 | f = open(filename, 'r') 7 | lines = f.readlines() 8 | f.close() 9 | 10 | dataDict = dict() 11 | 12 | for line in lines: 13 | elements = line.strip('\n\r').split(',') 14 | name = elements[0] 15 | source = elements[2] 16 | target = elements[4] 17 | micro_f1 = eval(elements[6]) 18 | macro_f1 = eval(elements[8]) 19 | auc = eval(elements[10]) 20 | if name not in dataDict: 21 | dataDict[name] = dict() 22 | if (source, target) not in dataDict[name]: 23 | dataDict[name][(source, target)] = [[micro_f1, macro_f1, auc]] 24 | else: 25 | dataDict[name][(source, target)].append([micro_f1, macro_f1, auc]) 26 | else: 27 | if (source, target) not in dataDict[name]: 28 | dataDict[name][(source, target)] = [[micro_f1, macro_f1, auc]] 29 | else: 30 | dataDict[name][(source, target)].append([micro_f1, macro_f1, auc]) 31 | 32 | print('source target mean std:') 33 | 34 | for k, v in dataDict.items(): 35 | print(k) 36 | for st, value in v.items(): 37 | value = np.array(value) 38 | mean_v = np.mean(value, axis=0) 39 | std_v = np.std(value, axis=0) 40 | print(st, mean_v, std_v) 41 | 42 | 43 | # Create a pandas DataFrame from the nested dictionary 44 | data = [] 45 | 46 | # Collect all (src, tgt) pairs 47 | src_tgt_pairs = set() 48 | for model_results in dataDict.values(): 49 | src_tgt_pairs.update(model_results.keys()) 50 | 51 | # Sort the pairs for consistent ordering 52 | src_tgt_pairs = sorted(src_tgt_pairs) 53 | 54 | # Build the data for the DataFrame 55 | for model, model_results in dataDict.items(): 56 | row = {'Model': model} 57 | for pair in src_tgt_pairs: 58 | if pair in model_results: 59 | metrics = np.array(model_results[pair]) * 100 # Multiply by 100 60 | mean_metrics = np.mean(metrics, axis=0) 61 | std_metrics = np.std(metrics, axis=0) 62 | mean_metrics_rounded = np.round(mean_metrics, 2) 63 | std_metrics_rounded = np.round(std_metrics, 2) 64 | row[pair] = f"{mean_metrics_rounded} +/- {std_metrics_rounded}" 65 | else: 66 | # row[pair] = "N/A" # Handle cases where there are no results for this pair 67 | row[pair] = f"[0 0 0] +/- [0 0 0]" 68 | data.append(row) 69 | 70 | # Create the DataFrame 71 | df = pd.DataFrame(data) 72 | 73 | # Set the 'Model' column as the index 74 | df.set_index('Model', inplace=True) 75 | 76 | # Display the DataFrame 77 | # print(df) 78 | 79 | # Optionally, save the DataFrame to a CSV file 80 | df.to_csv('results-llm-bert.csv') -------------------------------------------------------------------------------- /pygda/utils/svd_transform.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | import numpy as np 5 | from torch_geometric.utils import get_laplacian 6 | from sklearn.decomposition import TruncatedSVD 7 | 8 | def svd_transform( 9 | data, 10 | 
-------------------------------------------------------------------------------- /benchmark/graph/run_all_P.sh: -------------------------------------------------------------------------------- 1 | python grade.py --source 'PROTEINS_P1' --target 'PROTEINS_P2' --nhid 128 --num_layers 5 --lr 0.001 --weight_decay 0.001 --epochs 200 --dropout_ratio 0.5 --weight 0.005 --device 'cuda:2' --filename 'results-P.txt' 2 | python adagcn.py --source 'PROTEINS_P1' --target 'PROTEINS_P2' --nhid 128 --num_layers 2 --lr 0.01 --weight_decay 0.01 --epochs 400 --dropout_ratio 0.4 --domain_weight 0.1 --device 'cuda:2' --filename 'results-P.txt' 3 | python udagcn.py --source 'PROTEINS_P1' --target 'PROTEINS_P2' --nhid 128 --num_layers 2 --lr 0.0001 --weight_decay 0.001 --epochs 400 --dropout_ratio 0.4 --device 'cuda:2' --filename 'results-P.txt' 4 | python a2gnn.py --source 'PROTEINS_P1' --target 'PROTEINS_P2' --nhid 128 --num_layers 2 --lr 0.01 --weight_decay 0.005 --epochs 200 --dropout_ratio 0.5 --s_pnums 0 --t_pnums 10 --weight 10 --device 'cuda:2' --filename 'results-P.txt' 5 | python cwgcn.py --source 'PROTEINS_P1' --target 'PROTEINS_P2' --nhid 128 --num_layers 2 --lr 0.0001 --weight_decay 0.0001 --epochs 200 --device 'cuda:2' --filename 'results-P.txt' 6 | python dane.py 
--source 'PROTEINS_P1' --target 'PROTEINS_P2' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.001 --epochs 200 --device 'cuda:2' --filename 'results-P.txt' 7 | python sagda.py --source 'PROTEINS_P1' --target 'PROTEINS_P2' --nhid 128 --num_layers 1 --lr 0.001 --weight_decay 0.001 --epochs 200 --adv_dim 40 --device 'cuda:2' --filename 'results-P.txt' 8 | 9 | python grade.py --source 'PROTEINS_P2' --target 'PROTEINS_P1' --nhid 128 --num_layers 5 --lr 0.001 --weight_decay 0.001 --epochs 200 --dropout_ratio 0.5 --weight 0.005 --device 'cuda:2' --filename 'results-P.txt' 10 | python adagcn.py --source 'PROTEINS_P2' --target 'PROTEINS_P1' --nhid 128 --num_layers 2 --lr 0.01 --weight_decay 0.01 --epochs 400 --dropout_ratio 0.4 --domain_weight 0.1 --device 'cuda:2' --filename 'results-P.txt' 11 | python udagcn.py --source 'PROTEINS_P2' --target 'PROTEINS_P1' --nhid 128 --num_layers 2 --lr 0.0001 --weight_decay 0.001 --epochs 400 --dropout_ratio 0.4 --device 'cuda:2' --filename 'results-P.txt' 12 | python a2gnn.py --source 'PROTEINS_P2' --target 'PROTEINS_P1' --nhid 128 --num_layers 2 --lr 0.01 --weight_decay 0.005 --epochs 200 --dropout_ratio 0.5 --s_pnums 0 --t_pnums 10 --weight 10 --device 'cuda:2' --filename 'results-P.txt' 13 | python cwgcn.py --source 'PROTEINS_P2' --target 'PROTEINS_P1' --nhid 128 --num_layers 2 --lr 0.0001 --weight_decay 0.0001 --epochs 200 --device 'cuda:2' --filename 'results-P.txt' 14 | python dane.py --source 'PROTEINS_P2' --target 'PROTEINS_P1' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.001 --epochs 200 --device 'cuda:2' --filename 'results-P.txt' 15 | python sagda.py --source 'PROTEINS_P2' --target 'PROTEINS_P1' --nhid 128 --num_layers 1 --lr 0.001 --weight_decay 0.001 --epochs 200 --adv_dim 40 --device 'cuda:2' --filename 'results-P.txt' -------------------------------------------------------------------------------- /benchmark/graph/run_all_F.sh: -------------------------------------------------------------------------------- 1 | python grade.py --source 'FRANKENSTEIN_F1' --target 'FRANKENSTEIN_F2' --nhid 128 --num_layers 5 --lr 0.001 --weight_decay 0.001 --epochs 200 --dropout_ratio 0.5 --weight 0.005 --device 'cuda:3' --filename 'results-F.txt' 2 | python adagcn.py --source 'FRANKENSTEIN_F1' --target 'FRANKENSTEIN_F2' --nhid 128 --num_layers 2 --lr 0.01 --weight_decay 0.01 --epochs 400 --dropout_ratio 0.4 --domain_weight 0.1 --device 'cuda:3' --filename 'results-F.txt' 3 | python udagcn.py --source 'FRANKENSTEIN_F1' --target 'FRANKENSTEIN_F2' --nhid 128 --num_layers 2 --lr 0.0001 --weight_decay 0.001 --epochs 400 --dropout_ratio 0.4 --device 'cuda:3' --filename 'results-F.txt' 4 | python a2gnn.py --source 'FRANKENSTEIN_F1' --target 'FRANKENSTEIN_F2' --nhid 128 --num_layers 2 --lr 0.01 --weight_decay 0.005 --epochs 200 --dropout_ratio 0.5 --s_pnums 0 --t_pnums 10 --weight 10 --device 'cuda:3' --filename 'results-F.txt' 5 | python cwgcn.py --source 'FRANKENSTEIN_F1' --target 'FRANKENSTEIN_F2' --nhid 128 --num_layers 2 --lr 0.0001 --weight_decay 0.0001 --epochs 200 --device 'cuda:3' --filename 'results-F.txt' 6 | python dane.py --source 'FRANKENSTEIN_F1' --target 'FRANKENSTEIN_F2' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.001 --epochs 200 --device 'cuda:3' --filename 'results-F.txt' 7 | python sagda.py --source 'FRANKENSTEIN_F1' --target 'FRANKENSTEIN_F2' --nhid 128 --num_layers 1 --lr 0.001 --weight_decay 0.001 --epochs 200 --adv_dim 40 --device 'cuda:3' --filename 'results-F.txt' 8 | 9 | python grade.py --source 'FRANKENSTEIN_F2' 
--target 'FRANKENSTEIN_F1' --nhid 128 --num_layers 5 --lr 0.001 --weight_decay 0.001 --epochs 200 --dropout_ratio 0.5 --weight 0.005 --device 'cuda:3' --filename 'results-F.txt' 10 | python adagcn.py --source 'FRANKENSTEIN_F2' --target 'FRANKENSTEIN_F1' --nhid 128 --num_layers 2 --lr 0.01 --weight_decay 0.01 --epochs 400 --dropout_ratio 0.4 --domain_weight 0.1 --device 'cuda:3' --filename 'results-F.txt' 11 | python udagcn.py --source 'FRANKENSTEIN_F2' --target 'FRANKENSTEIN_F1' --nhid 128 --num_layers 2 --lr 0.0001 --weight_decay 0.001 --epochs 400 --dropout_ratio 0.4 --device 'cuda:3' --filename 'results-F.txt' 12 | python a2gnn.py --source 'FRANKENSTEIN_F2' --target 'FRANKENSTEIN_F1' --nhid 128 --num_layers 2 --lr 0.01 --weight_decay 0.005 --epochs 200 --dropout_ratio 0.5 --s_pnums 0 --t_pnums 10 --weight 10 --device 'cuda:3' --filename 'results-F.txt' 13 | python cwgcn.py --source 'FRANKENSTEIN_F2' --target 'FRANKENSTEIN_F1' --nhid 128 --num_layers 2 --lr 0.0001 --weight_decay 0.0001 --epochs 200 --device 'cuda:3' --filename 'results-F.txt' 14 | python dane.py --source 'FRANKENSTEIN_F2' --target 'FRANKENSTEIN_F1' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.001 --epochs 200 --device 'cuda:3' --filename 'results-F.txt' 15 | python sagda.py --source 'FRANKENSTEIN_F2' --target 'FRANKENSTEIN_F1' --nhid 128 --num_layers 1 --lr 0.001 --weight_decay 0.001 --epochs 200 --adv_dim 40 --device 'cuda:3' --filename 'results-F.txt' -------------------------------------------------------------------------------- /docs/resources/Resources.md: -------------------------------------------------------------------------------- 1 | # External Links 2 | 3 | ### NumPy 4 | - Official Website: [https://numpy.org](https://numpy.org) 5 | - Documentation: [https://numpy.org/doc/stable](https://numpy.org/doc/stable) 6 | - GitHub Repository: [https://github.com/numpy/numpy](https://github.com/numpy/numpy) 7 | 8 | NumPy is a fundamental package for scientific computing in Python. It provides: 9 | 10 | - A powerful N-dimensional array object 11 | - Sophisticated broadcasting functions 12 | - Tools for integrating C/C++ code 13 | - Useful linear algebra, Fourier transform, and random number capabilities 14 | 15 | ### SciPy 16 | - Official Website: [https://scipy.org](https://scipy.org) 17 | - Documentation: [https://docs.scipy.org/doc/scipy](https://docs.scipy.org/doc/scipy) 18 | - GitHub Repository: [https://github.com/scipy/scipy](https://github.com/scipy/scipy) 19 | 20 | SciPy is a scientific computing library that builds on NumPy. It provides: 21 | 22 | - Optimization algorithms 23 | - Linear algebra operations 24 | - Signal and image processing tools 25 | - Statistical functions 26 | 27 | ### NetworkX 28 | - Official Website: [https://networkx.org](https://networkx.org) 29 | - Documentation: [https://networkx.org/documentation/stable](https://networkx.org/documentation/stable) 30 | - GitHub Repository: [https://github.com/networkx/networkx](https://github.com/networkx/networkx) 31 | 32 | NetworkX is a Python package for complex networks. 
It provides: 33 | 34 | - Graph creation and manipulation 35 | - Network structure and analysis algorithms 36 | - Network visualization tools 37 | - Large collection of graph algorithms 38 | 39 | ### Scikit-learn 40 | - Official Website: [https://scikit-learn.org](https://scikit-learn.org) 41 | - Documentation: [https://scikit-learn.org/stable](https://scikit-learn.org/stable) 42 | - GitHub Repository: [https://github.com/scikit-learn/scikit-learn](https://github.com/scikit-learn/scikit-learn) 43 | 44 | Scikit-learn is a machine learning library for Python. It provides: 45 | 46 | - Classification, regression, and clustering algorithms 47 | - Model selection and evaluation tools 48 | - Preprocessing and feature engineering utilities 49 | - Comprehensive documentation and examples 50 | 51 | ### PyTorch 52 | - Official Website: [https://pytorch.org](https://pytorch.org) 53 | - Documentation: [https://pytorch.org/docs/stable/index.html](https://pytorch.org/docs/stable/index.html) 54 | - GitHub Repository: [https://github.com/pytorch/pytorch](https://github.com/pytorch/pytorch) 55 | 56 | PyTorch is an open source machine learning framework. It provides: 57 | 58 | - Dynamic computational graphs 59 | - GPU acceleration 60 | - Deep neural network building blocks 61 | - Rich ecosystem of tools and libraries 62 | 63 | ### PyTorch Geometric 64 | - Official Website: [https://pytorch-geometric.readthedocs.io](https://pytorch-geometric.readthedocs.io) 65 | - Documentation: [https://pytorch-geometric.readthedocs.io/en/latest](https://pytorch-geometric.readthedocs.io/en/latest) 66 | - GitHub Repository: [https://github.com/pyg-team/pytorch_geometric](https://github.com/pyg-team/pytorch_geometric) 67 | 68 | PyTorch Geometric (PyG) is a library for deep learning on irregular input data. It provides: 69 | 70 | - Graph Neural Network implementations 71 | - Various graph datasets 72 | - Common graph operations 73 | - Efficient sparse matrix operations 74 | -------------------------------------------------------------------------------- /docs/benchmark/Overview.md: -------------------------------------------------------------------------------- 1 | # Benchmarks Overview 2 | 3 | PyGDA provides extensive benchmarking capabilities across different types of graph domain adaptation tasks. This document outlines our three main benchmark suites. 
4 | 5 | ## Node Classification Benchmark 6 | 7 | ### Overview 8 | - Evaluates 16 different methods 9 | - Tests on 5 distinct datasets 10 | - Each experiment repeated 3 times for statistical significance 11 | 12 | ### Running the Benchmark 13 | ``` 14 | cd benchmark/node 15 | ./run.sh 16 | ``` 17 | 18 | ## Graph Classification Benchmark 19 | 20 | ### Overview 21 | - Evaluates 7 different methods 22 | - Tests on 3 graph classification datasets: 23 | 24 | * PROTEINS 25 | * FRANKENSTEIN 26 | * Mutagenicity 27 | 28 | - Each experiment repeated 3 times for statistical significance 29 | 30 | ### Running the Benchmark 31 | ``` 32 | cd benchmark/graph 33 | # Run benchmarks for each dataset 34 | ./run_all_F.sh # FRANKENSTEIN 35 | ./run_all_M.sh # Mutagenicity 36 | ./run_all_P.sh # PROTEINS 37 | ``` 38 | 39 | ## LLM-Enhanced Benchmark 40 | 41 | ### Overview 42 | - Evaluates 5 different methods 43 | - Focuses on ogbn-arxiv dataset with LLM predictions and explanations 44 | - Each experiment repeated 3 times for statistical significance 45 | - Tests different feature encoding approaches 46 | 47 | ### Dataset Preprocessing Options 48 | 49 | #### **Original Features** 50 | ``` 51 | python origin_preprocess.py 52 | ``` 53 | 54 | #### **LLM with Word2Vec** 55 | ``` 56 | python llm_w2v_preprocess.py 57 | ``` 58 | - Combines title, abstract, and LLM outputs 59 | - Processes using word2vec embeddings 60 | 61 | #### **LLM with BERT** 62 | ``` 63 | python llm_bert_preprocess.py 64 | ``` 65 | - Combines title, abstract, and LLM outputs 66 | - Uses DeBERTa for sentence embeddings 67 | - Unsupervised approach (no fine-tuning) 68 | 69 | ### Data Requirements 70 | - **ogbn-arxiv**: Download title and abstract data from [OGB](https://snap.stanford.edu/ogb/data/misc/ogbn_arxiv/titleabs.tsv.gz) 71 | - **LLM Responses**: Download from [TAPE paper data](https://drive.google.com/file/d/1A6mZSFzDIhJU795497R6mAAM2Y9qutI5/view?usp=sharing) 72 | 73 | ### Chronological Split 74 | The dataset is divided into 3 groups based on publication years (the preprocessing scripts treat the lower bound as exclusive and the upper bound as inclusive): 75 | 76 | - Group A: Papers up to and including 2016 77 | - Group B: Papers from 2017 to 2018 78 | - Group C: Papers from 2019 to 2020 79 | 80 | ### Running the Benchmark 81 | ``` 82 | cd benchmark/llm 83 | ./run1.sh 84 | ./run2.sh 85 | ./run3.sh 86 | ``` 87 | 88 | ## General Guidelines 89 | 90 | ### Running Benchmarks 91 | - Ensure all required datasets are downloaded 92 | - Install all dependencies 93 | - Run benchmarks from their respective directories 94 | - Results will be saved in the corresponding output directories 95 | 96 | ### Reproducibility 97 | - Fixed random seeds are used 98 | - Multiple runs (3x) for statistical significance 99 | - Standardized evaluation metrics across all experiments 100 | 101 | ### Resource Requirements 102 | - Node classification: Moderate GPU memory 103 | - Graph classification: Lower GPU memory 104 | - LLM benchmark: Higher GPU memory (for BERT embeddings)
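105 | 106 | ### Aggregating Results 107 | 108 | Each benchmark directory also ships a parser.py that averages the raw result files written by the run scripts and exports a CSV. The snippet below is a minimal sketch of the same aggregation, assuming the comma-separated line format those scripts write (the model name first, then label/value pairs, with micro-f1, macro-f1, and AUC values at indices 6, 8, and 10); point the filename at the results file for your suite. 109 | 110 | ```python 111 | import numpy as np 112 | 113 | runs = {} 114 | with open('results.txt') as f: 115 |     for line in f: 116 |         parts = line.strip().split(',') 117 |         key = (parts[0], parts[2], parts[4])  # (model, source, target) 118 |         runs.setdefault(key, []).append([float(parts[6]), float(parts[8]), float(parts[10])]) 119 | 120 | # mean and standard deviation over repeated runs, scaled to percentages 121 | for key, values in runs.items(): 122 |     values = np.array(values) * 100 123 |     print(key, values.mean(axis=0).round(2), values.std(axis=0).round(2)) 124 | ```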
-------------------------------------------------------------------------------- /benchmark/llm/run1.sh: -------------------------------------------------------------------------------- 1 | echo "Task arxiv-1950-2016 -> arxiv-2016-2018" 2 | echo "Original ogbn features" 3 | 4 | python a2gnn.py --source 'arxiv-1950-2016' --target 'arxiv-2016-2018' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --dropout_ratio 0.1 --s_pnums 0 --t_pnums 5 --weight 0.1 --filename 'results-llm-1.txt' --device 'cuda:1' 5 | python udagcn.py --source 'arxiv-1950-2016' --target 'arxiv-2016-2018' --nhid 128 --num_layers 2 --lr 0.0001 --weight_decay 0.001 --epochs 800 --dropout_ratio 0.4 --filename 'results-llm-1.txt' --device 'cuda:1' 6 | python kbl.py --source 'arxiv-1950-2016' --target 'arxiv-2016-2018' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.01 --epochs 800 --k_cross 20 --k_within 10 --filename 'results-llm-1.txt' --device 'cuda:1' 7 | python grade.py --source 'arxiv-1950-2016' --target 'arxiv-2016-2018' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --dropout_ratio 0.5 --weight 0.01 --filename 'results-llm-1.txt' --device 'cuda:1' 8 | python adagcn.py --source 'arxiv-1950-2016' --target 'arxiv-2016-2018' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --dropout_ratio 0.4 --domain_weight 1.0 --filename 'results-llm-1.txt' --device 'cuda:1' 9 | 10 | echo "Task arxiv-1950-2016 -> arxiv-2018-2020" 11 | echo "Original ogbn features" 12 | 13 | python a2gnn.py --source 'arxiv-1950-2016' --target 'arxiv-2018-2020' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --dropout_ratio 0.1 --s_pnums 0 --t_pnums 5 --weight 0.1 --filename 'results-llm-1.txt' --device 'cuda:1' 14 | python udagcn.py --source 'arxiv-1950-2016' --target 'arxiv-2018-2020' --nhid 128 --num_layers 2 --lr 0.0001 --weight_decay 0.001 --epochs 800 --dropout_ratio 0.4 --filename 'results-llm-1.txt' --device 'cuda:1' 15 | python kbl.py --source 'arxiv-1950-2016' --target 'arxiv-2018-2020' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.01 --epochs 800 --k_cross 20 --k_within 10 --filename 'results-llm-1.txt' --device 'cuda:1' 16 | python grade.py --source 'arxiv-1950-2016' --target 'arxiv-2018-2020' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --dropout_ratio 0.5 --weight 0.01 --filename 'results-llm-1.txt' --device 'cuda:1' 17 | python adagcn.py --source 'arxiv-1950-2016' --target 'arxiv-2018-2020' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --dropout_ratio 0.4 --domain_weight 1.0 --filename 'results-llm-1.txt' --device 'cuda:1' 18 | 19 | echo "Task arxiv-2016-2018 -> arxiv-2018-2020" 20 | echo "Original ogbn features" 21 | 22 | python a2gnn.py --source 'arxiv-2016-2018' --target 'arxiv-2018-2020' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --dropout_ratio 0.1 --s_pnums 0 --t_pnums 5 --weight 0.1 --filename 'results-llm-1.txt' --device 'cuda:1' 23 | python udagcn.py --source 'arxiv-2016-2018' --target 'arxiv-2018-2020' --nhid 128 --num_layers 2 --lr 0.0001 --weight_decay 0.001 --epochs 800 --dropout_ratio 0.4 --filename 'results-llm-1.txt' --device 'cuda:1' 24 | python kbl.py --source 'arxiv-2016-2018' --target 'arxiv-2018-2020' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.01 --epochs 800 --k_cross 20 --k_within 10 --filename 'results-llm-1.txt' --device 'cuda:1' 25 | python grade.py --source 'arxiv-2016-2018' --target 'arxiv-2018-2020' 
--nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --dropout_ratio 0.5 --weight 0.01 --filename 'results-llm-1.txt' --device 'cuda:1' 26 | python adagcn.py --source 'arxiv-2016-2018' --target 'arxiv-2018-2020' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --dropout_ratio 0.4 --domain_weight 1.0 --filename 'results-llm-1.txt' --device 'cuda:1' -------------------------------------------------------------------------------- /benchmark/llm/run2.sh: -------------------------------------------------------------------------------- 1 | echo "Task arxiv-1950-2016 -> arxiv-2016-2018" 2 | echo "LLM enhanced text with word2vec embedding" 3 | 4 | python a2gnn.py --source 'llm-arxiv-1950-2016' --target 'llm-arxiv-2016-2018' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --dropout_ratio 0.1 --s_pnums 0 --t_pnums 5 --weight 0.1 --filename 'results-llm-2.txt' --device 'cuda:2' 5 | python udagcn.py --source 'llm-arxiv-1950-2016' --target 'llm-arxiv-2016-2018' --nhid 128 --num_layers 2 --lr 0.0001 --weight_decay 0.001 --epochs 800 --dropout_ratio 0.4 --filename 'results-llm-2.txt' --device 'cuda:2' 6 | python kbl.py --source 'llm-arxiv-1950-2016' --target 'llm-arxiv-2016-2018' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --k_cross 20 --k_within 10 --filename 'results-llm-2.txt' --device 'cuda:2' 7 | python grade.py --source 'llm-arxiv-1950-2016' --target 'llm-arxiv-2016-2018' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --dropout_ratio 0.5 --weight 0.01 --filename 'results-llm-2.txt' --device 'cuda:2' 8 | python adagcn.py --source 'llm-arxiv-1950-2016' --target 'llm-arxiv-2016-2018' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --dropout_ratio 0.4 --domain_weight 1.0 --filename 'results-llm-2.txt' --device 'cuda:2' 9 | 10 | 11 | echo "Task arxiv-1950-2016 -> arxiv-2018-2020" 12 | echo "LLM enhanced text with word2vec embedding" 13 | 14 | python a2gnn.py --source 'llm-arxiv-1950-2016' --target 'llm-arxiv-2018-2020' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --dropout_ratio 0.1 --s_pnums 0 --t_pnums 5 --weight 0.1 --filename 'results-llm-2.txt' --device 'cuda:2' 15 | python udagcn.py --source 'llm-arxiv-1950-2016' --target 'llm-arxiv-2018-2020' --nhid 128 --num_layers 2 --lr 0.0001 --weight_decay 0.001 --epochs 800 --dropout_ratio 0.4 --filename 'results-llm-2.txt' --device 'cuda:2' 16 | python kbl.py --source 'llm-arxiv-1950-2016' --target 'llm-arxiv-2018-2020' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --k_cross 20 --k_within 10 --filename 'results-llm-2.txt' --device 'cuda:2' 17 | python grade.py --source 'llm-arxiv-1950-2016' --target 'llm-arxiv-2018-2020' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --dropout_ratio 0.5 --weight 0.01 --filename 'results-llm-2.txt' --device 'cuda:2' 18 | python adagcn.py --source 'llm-arxiv-1950-2016' --target 'llm-arxiv-2018-2020' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --dropout_ratio 0.4 --domain_weight 1.0 --filename 'results-llm-2.txt' --device 'cuda:2' 19 | 20 | echo "Task arxiv-2016-2018 -> arxiv-2018-2020" 21 | echo "LLM enhanced text with word2vec embedding" 22 | 23 | python a2gnn.py --source 'llm-arxiv-2016-2018' --target 'llm-arxiv-2018-2020' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --dropout_ratio 0.1 --s_pnums 0 --t_pnums 5 --weight 0.1 --filename 'results-llm-2.txt' --device 'cuda:2' 24 | python 
udagcn.py --source 'llm-arxiv-2016-2018' --target 'llm-arxiv-2018-2020' --nhid 128 --num_layers 2 --lr 0.0001 --weight_decay 0.001 --epochs 800 --dropout_ratio 0.4 --filename 'results-llm-2.txt' --device 'cuda:2' 25 | python kbl.py --source 'llm-arxiv-2016-2018' --target 'llm-arxiv-2018-2020' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --k_cross 20 --k_within 10 --filename 'results-llm-2.txt' --device 'cuda:2' 26 | python grade.py --source 'llm-arxiv-2016-2018' --target 'llm-arxiv-2018-2020' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --dropout_ratio 0.5 --weight 0.01 --filename 'results-llm-2.txt' --device 'cuda:2' 27 | python adagcn.py --source 'llm-arxiv-2016-2018' --target 'llm-arxiv-2018-2020' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --dropout_ratio 0.4 --domain_weight 1.0 --filename 'results-llm-2.txt' --device 'cuda:2' 28 | -------------------------------------------------------------------------------- /examples/demo.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os.path as osp 3 | import numpy as np 4 | 5 | from pygda.datasets import CitationDataset 6 | 7 | from pygda.models import UDAGCN, A2GNN, GRADE 8 | from pygda.models import ASN, SpecReg, GNN 9 | from pygda.models import StruRW, ACDNE, DANE 10 | from pygda.models import AdaGCN, JHGDA, KBL 11 | from pygda.models import DGDA, SAGDA, CWGCN 12 | from pygda.models import DMGNN, PairAlign 13 | 14 | from pygda.metrics import eval_micro_f1, eval_macro_f1 15 | from pygda.utils import svd_transform 16 | 17 | parser = argparse.ArgumentParser() 18 | 19 | parser.add_argument('--nhid', type=int, default=64, help='hidden size') 20 | parser.add_argument('--device', type=str, default='cuda:3', help='specify cuda devices') 21 | parser.add_argument('--source', type=str, default='DBLPv7', help='source domain data, DBLPv7/ACMv9/Citationv1') 22 | parser.add_argument('--target', type=str, default='ACMv9', help='target domain data, DBLPv7/ACMv9/Citationv1') 23 | 24 | args = parser.parse_args() 25 | 26 | # load data 27 | if args.source in {'DBLPv7', 'ACMv9', 'Citationv1'}: 28 | path = osp.join(osp.dirname(osp.realpath(__file__)), '.', 'data/Citation', args.source) 29 | source_dataset = CitationDataset(path, args.source) 30 | 31 | if args.target in {'DBLPv7', 'ACMv9', 'Citationv1'}: 32 | path = osp.join(osp.dirname(osp.realpath(__file__)), '.', 'data/Citation', args.target) 33 | target_dataset = CitationDataset(path, args.target) 34 | 35 | source_data = source_dataset[0].to(args.device) 36 | target_data = target_dataset[0].to(args.device) 37 | 38 | num_features = source_data.x.size(1) 39 | num_classes = len(np.unique(source_data.y.cpu().numpy())) 40 | 41 | # choose a graph domain adaptation model 42 | model = A2GNN(in_dim=num_features, hid_dim=args.nhid, num_classes=num_classes, device=args.device) 43 | # model = UDAGCN(in_dim=num_features, hid_dim=args.nhid, num_classes=num_classes, device=args.device) 44 | # model = GRADE(in_dim=num_features, hid_dim=args.nhid, num_classes=num_classes, device=args.device) 45 | # model = ASN(in_dim=num_features, hid_dim=args.nhid, hid_dim_vae=args.nhid, num_classes=num_classes, device=args.device) 46 | # model = SpecReg(in_dim=num_features, hid_dim=args.nhid, num_classes=num_classes, device=args.device, reg_mode=True) 47 | # model = GNN(in_dim=num_features, hid_dim=args.nhid, num_classes=num_classes, device=args.device) 48 | # model = StruRW(in_dim=num_features, 
hid_dim=args.nhid, num_classes=num_classes, device=args.device) 49 | # model = ACDNE(in_dim=num_features, hid_dim=args.nhid, num_classes=num_classes, device=args.device) 50 | # model = DANE(in_dim=num_features, hid_dim=args.nhid, num_classes=num_classes, device=args.device) 51 | # model = AdaGCN(in_dim=num_features, hid_dim=args.nhid, num_classes=num_classes, device=args.device) 52 | # model = JHGDA(in_dim=num_features, hid_dim=args.nhid, num_classes=num_classes, device=args.device) 53 | # model = KBL(in_dim=num_features, hid_dim=args.nhid, num_classes=num_classes, device=args.device) 54 | # model = DGDA(in_dim=num_features, hid_dim=args.nhid, num_classes=num_classes, device=args.device) 55 | # model = SAGDA(in_dim=num_features, hid_dim=args.nhid, num_classes=num_classes, device=args.device) 56 | # model = CWGCN(in_dim=num_features, hid_dim=args.nhid, num_classes=num_classes, device=args.device) 57 | # model = DMGNN(in_dim=num_features, hid_dim=args.nhid, num_classes=num_classes, device=args.device) 58 | # model = PairAlign(in_dim=num_features, hid_dim=args.nhid, num_classes=num_classes, device=args.device) 59 | 60 | # train the model 61 | model.fit(source_data, target_data) 62 | 63 | # evaluate the performance 64 | logits, labels = model.predict(target_data) 65 | 66 | preds = logits.argmax(dim=1) 67 | 68 | mi_f1 = eval_micro_f1(labels, preds) 69 | ma_f1 = eval_macro_f1(labels, preds) 70 | 71 | print('micro-f1: ' + str(mi_f1)) 72 | print('macro-f1: ' + str(ma_f1)) 73 | -------------------------------------------------------------------------------- /benchmark/llm/origin_preprocess.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import random 4 | import json 5 | import pandas as pd 6 | import argparse 7 | 8 | 9 | def take_second(element): 10 | return element[1] 11 | 12 | 13 | def load_ogb_arxiv(data_dir, year_bound = [2018, 2020], proportion = 1.0): 14 | import ogb.nodeproppred 15 | 16 | dataset = ogb.nodeproppred.NodePropPredDataset(name='ogbn-arxiv', root=data_dir) 17 | graph = dataset.graph 18 | 19 | node_years = graph['node_year'] 20 | # print(node_years) year for each node 21 | n = node_years.shape[0] 22 | # print(n) number of nodes 23 | node_years = node_years.reshape(n) 24 | 25 | # gpt_text = load_data_gpt_text(n) # not defined in this script and the result is unused; disabled so the script runs 26 | # raw_text = load_data_raw_text() 27 | 28 | d = np.zeros(len(node_years)) 29 | print(d.shape) 30 | 31 | edges = graph['edge_index'] 32 | for i in range(edges.shape[1]): 33 | if node_years[edges[0][i]] <= year_bound[1] and node_years[edges[1][i]] <= year_bound[1] and node_years[edges[0][i]] > year_bound[0] and node_years[edges[1][i]] > year_bound[0]: 34 | d[edges[0][i]] += 1 35 | d[edges[1][i]] += 1 36 | 37 | nodes = [] 38 | for i, year in enumerate(node_years): 39 | if year <= year_bound[1] and year > year_bound[0]: 40 | nodes.append([i, d[i]]) 41 | 42 | nodes.sort(key = take_second, reverse = True) 43 | 44 | nodes = nodes[: int(proportion * len(nodes))] 45 | 46 | random.shuffle(nodes) 47 | 48 | result_edges = [] 49 | result_features = [] 50 | result_labels = [] 51 | result_text = [] 52 | 53 | for node in nodes: 54 | result_features.append(graph['node_feat'][node[0]]) 55 | result_features = np.array(result_features) 56 | 57 | ids = {} 58 | for i, node in enumerate(nodes): 59 | ids[node[0]] = i 60 | 61 | for i in range(edges.shape[1]): 62 | if edges[0][i] in ids and edges[1][i] in ids: 63 | result_edges.append([ids[edges[0][i]], ids[edges[1][i]]]) 64 | result_edges = 
np.array(result_edges).transpose(1, 0) 65 | 66 | result_labels = dataset.labels[[node[0] for node in nodes]] 67 | 68 | edge_index = torch.tensor(result_edges, dtype=torch.long) 69 | 70 | # result_features: original features 71 | node_feat = torch.tensor(result_features, dtype=torch.float) 72 | 73 | dataset.graph = {'edge_index': edge_index, 74 | 'edge_feat': None, 75 | 'node_feat': node_feat, 76 | 'num_nodes': node_feat.size(0)} 77 | dataset.label = torch.tensor(result_labels) 78 | 79 | return dataset 80 | 81 | def main(args): 82 | data_dir = './data' 83 | 84 | # 3 domains: [1950, 2016], [2016, 2018], [2018, 2020] 85 | 86 | start_year = 1950 87 | end_year = 2016 88 | 89 | dataset = load_ogb_arxiv(data_dir, year_bound=[start_year, end_year]) 90 | 91 | dataset.n = dataset.graph['num_nodes'] 92 | dataset.c = max(dataset.label.max().item() + 1, dataset.label.shape[1]) 93 | dataset.d = dataset.graph['node_feat'].shape[1] 94 | 95 | print(torch.min(dataset.graph['edge_index'])) 96 | print(torch.max(dataset.graph['edge_index'])) 97 | print(len(torch.unique(dataset.graph['edge_index']))) 98 | print(len(dataset.graph['edge_index'][1])) 99 | print(dataset.graph['node_feat'].size()) 100 | print(len(dataset.label)) 101 | 102 | print(f"num nodes {dataset.n}| num classes {dataset.c} | num node feats {dataset.d}") 103 | 104 | import pickle 105 | 106 | filename = 'arxiv-' + str(start_year) + '-' + str(end_year) + '.pkl' 107 | 108 | fw = open(filename, 'wb') 109 | pickle.dump(dataset, fw) 110 | 111 | 112 | if __name__ == "__main__": 113 | parser = argparse.ArgumentParser() 114 | 115 | args = parser.parse_args() 116 | main(args) 117 | -------------------------------------------------------------------------------- /benchmark/llm/run3.sh: -------------------------------------------------------------------------------- 1 | echo "Task arxiv-1950-2016 -> arxiv-2016-2018" 2 | echo "LLM enhanced text with bert embedding" 3 | 4 | python a2gnn.py --source 'llm-bert-arxiv-1950-2016' --target 'llm-bert-arxiv-2016-2018' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --dropout_ratio 0.1 --s_pnums 0 --t_pnums 5 --weight 0.1 --filename 'results-llm-3.txt' --device 'cuda:3' 5 | python udagcn.py --source 'llm-bert-arxiv-1950-2016' --target 'llm-bert-arxiv-2016-2018' --nhid 128 --num_layers 2 --lr 0.001 --weight_decay 0.0 --epochs 800 --dropout_ratio 0.4 --filename 'results-llm-3.txt' --device 'cuda:3' 6 | python kbl.py --source 'llm-bert-arxiv-1950-2016' --target 'llm-bert-arxiv-2016-2018' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --k_cross 20 --k_within 10 --filename 'results-llm-3.txt' --device 'cuda:3' 7 | python grade.py --source 'llm-bert-arxiv-1950-2016' --target 'llm-bert-arxiv-2016-2018' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --dropout_ratio 0.5 --weight 0.01 --filename 'results-llm-3.txt' --device 'cuda:3' 8 | python adagcn.py --source 'llm-bert-arxiv-1950-2016' --target 'llm-bert-arxiv-2016-2018' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --dropout_ratio 0.4 --domain_weight 1.0 --filename 'results-llm-3.txt' --device 'cuda:3' 9 | 10 | 11 | echo "Task arxiv-1950-2016 -> arxiv-2018-2020" 12 | echo "LLM enhanced text with bert embedding" 13 | 14 | python a2gnn.py --source 'llm-bert-arxiv-1950-2016' --target 'llm-bert-arxiv-2018-2020' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --dropout_ratio 0.1 --s_pnums 0 --t_pnums 5 --weight 0.1 --filename 'results-llm-3.txt' --device 'cuda:3' 15 | 
python udagcn.py --source 'llm-bert-arxiv-1950-2016' --target 'llm-bert-arxiv-2018-2020' --nhid 128 --num_layers 2 --lr 0.001 --weight_decay 0.0 --epochs 800 --dropout_ratio 0.4 --filename 'results-llm-3.txt' --device 'cuda:3' 16 | python kbl.py --source 'llm-bert-arxiv-1950-2016' --target 'llm-bert-arxiv-2018-2020' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --k_cross 20 --k_within 10 --filename 'results-llm-3.txt' --device 'cuda:3' 17 | python grade.py --source 'llm-bert-arxiv-1950-2016' --target 'llm-bert-arxiv-2018-2020' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --dropout_ratio 0.5 --weight 0.01 --filename 'results-llm-3.txt' --device 'cuda:3' 18 | python adagcn.py --source 'llm-bert-arxiv-1950-2016' --target 'llm-bert-arxiv-2018-2020' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --dropout_ratio 0.4 --domain_weight 1.0 --filename 'results-llm-3.txt' --device 'cuda:3' 19 | 20 | echo "Task arxiv-2016-2018 -> arxiv-2018-2020" 21 | echo "LLM enhanced text with bert embedding" 22 | 23 | python a2gnn.py --source 'llm-bert-arxiv-2016-2018' --target 'llm-bert-arxiv-2018-2020' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --dropout_ratio 0.1 --s_pnums 0 --t_pnums 5 --weight 0.1 --filename 'results-llm-3.txt' --device 'cuda:3' 24 | python udagcn.py --source 'llm-bert-arxiv-2016-2018' --target 'llm-bert-arxiv-2018-2020' --nhid 128 --num_layers 2 --lr 0.001 --weight_decay 0.0 --epochs 800 --dropout_ratio 0.4 --filename 'results-llm-3.txt' --device 'cuda:3' 25 | python kbl.py --source 'llm-bert-arxiv-2016-2018' --target 'llm-bert-arxiv-2018-2020' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --k_cross 20 --k_within 10 --filename 'results-llm-3.txt' --device 'cuda:3' 26 | python grade.py --source 'llm-bert-arxiv-2016-2018' --target 'llm-bert-arxiv-2018-2020' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --dropout_ratio 0.5 --weight 0.01 --filename 'results-llm-3.txt' --device 'cuda:3' 27 | python adagcn.py --source 'llm-bert-arxiv-2016-2018' --target 'llm-bert-arxiv-2018-2020' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.0 --epochs 800 --dropout_ratio 0.4 --domain_weight 1.0 --filename 'results-llm-3.txt' --device 'cuda:3' 28 | -------------------------------------------------------------------------------- /docs/datasets/Overview.md: -------------------------------------------------------------------------------- 1 | # Dataset Overview 2 | 3 | This section provides detailed documentation for all supported datasets in PyGDA, including their domains and sources. 
4 | 5 | ### Citation Networks 6 | 7 | #### [Arxiv](Arxiv.md) 8 | - **Domains**: 3 domains based on publication years 9 | - **Source**: [ogbn-arxiv](https://ogb.stanford.edu/docs/nodeprop/#ogbn-arxiv) 10 | - **Processing**: See [ArxivDataset](https://github.com/pygda-team/pygda/blob/main/pygda/datasets/arxiv.py) 11 | - **Features**: Generated from paper abstracts 12 | - **Note**: Can be preprocessed with scripts in benchmark folder 13 | 14 | #### [Citation](Citation.md) 15 | - **Domains**: ACMv9, Citationv1, DBLPv7 16 | - **Source**: Adopted from [ASN](https://arxiv.org/abs/2103.13355) 17 | - **Processing**: See [CitationDataset](https://github.com/pygda-team/pygda/blob/main/pygda/datasets/citation.py) 18 | - **Download**: [Download Link](https://drive.google.com/drive/folders/1ntNt3qHE4p9Us8Re9tZDaB-tdtqwV8AX?usp=share_link) 19 | 20 | #### [MAG](MAG.md) 21 | - **Domains**: CN, DE, FR, JP, RU, US 22 | - **Source**: Originally from [ogbn-mag](https://ogb.stanford.edu/docs/nodeprop/#ogbn-mag) 23 | - **Processing**: See [MAGDataset](https://github.com/pygda-team/pygda/blob/main/pygda/datasets/mag.py) 24 | - **Download**: [Download Link](https://drive.google.com/drive/folders/1HinhjpNPPivyqoubiYOr8X2jq-rjw3e9?usp=share_link) 25 | - **Note**: Separated into 6 countries by [PairAlign](https://arxiv.org/abs/2403.01092) 26 | 27 | ### Social Networks 28 | 29 | #### [Blog](Blog.md) 30 | - **Domains**: Blog1, Blog2 31 | - **Source**: Adopted from [ACDNE](https://arxiv.org/abs/2002.07366) 32 | - **Processing**: See [BlogDataset](https://github.com/pygda-team/pygda/blob/main/pygda/datasets/blog.py) 33 | - **Download**: [Download Link](https://drive.google.com/drive/folders/1jKKG0o7rEY-BaVEjBhuGijzwwhU0M-pQ?usp=share_link) 34 | 35 | #### [Twitch](Twitch.md) 36 | - **Domains**: DE, EN, ES, FR, PT, RU 37 | - **Source**: [Twitch Social Networks](https://github.com/benedekrozemberczki/datasets#twitch-social-networks) 38 | - **Processing**: See [TwitchDataset](https://github.com/pygda-team/pygda/blob/main/pygda/datasets/twitch.py) 39 | - **Download**: [Download Link](https://drive.google.com/drive/folders/1GWMyyJOZ4CeeqP_H5dCA5voSQHT0WlXG?usp=share_link) 40 | 41 | ### Infrastructure Networks 42 | 43 | #### [Airport](Airport.md) 44 | - **Domains**: Brazil, Europe, USA 45 | - **Source**: Adopted from [struc2vec](https://arxiv.org/abs/1704.03165) 46 | - **Processing**: See [AirportDataset](https://github.com/pygda-team/pygda/blob/main/pygda/datasets/airport.py) 47 | - **Features**: Constructed using OneHotDegree for each node 48 | - **Download**: [Download Link](https://drive.google.com/drive/folders/1zlluWoeukD33ZxwaTRQi3jCdD0qC-I2j?usp=share_link) 49 | 50 | ### Graph Classification Benchmarks 51 | 52 | #### [TUGraph](TUGraph.md) 53 | - **Datasets**: 54 | 55 | * PROTEINS 56 | * FRANKENSTEIN 57 | * Mutagenicity 58 | 59 | - **Domains**: 2 domains based on density for each dataset 60 | - **Source**: Adopted from [TUDataset](https://chrsmrrs.github.io/datasets/docs/datasets/) 61 | - **Processing**: See [GraphTUDataset](https://github.com/pygda-team/pygda/blob/main/pygda/datasets/tugraph.py) 62 | - **Download**: [Download Link](https://drive.google.com/drive/folders/1NbPK71Dy0ulH3CdNyfvMwQECj_Oh867I?usp=sharing) 63 | 64 | ### Usage Example 65 | 66 | ```python 67 | from pygda.datasets import CitationDataset 68 | 69 | # Load the Citation dataset 70 | dataset = CitationDataset(root='data/citation', name='ACMv9') 71 | data = dataset[0] 72 | 73 | # Access the data 74 | x = data.x # Node features 75 | edge_index = 
data.edge_index # Graph connectivity 76 | y = data.y # Labels 77 | ``` 78 | 79 | Each dataset documentation includes: 80 | 81 | - Detailed domain descriptions 82 | - Data sources and references 83 | - Processing instructions 84 | - Download information 85 | - Usage examples 86 | - Implementation details 87 | 88 | For specific details about each dataset, please visit their respective documentation pages linked above. -------------------------------------------------------------------------------- /pygda/utils/utility.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def logger(epoch=0, 4 | loss=0, 5 | source_train_acc=None, 6 | source_val_acc=None, 7 | target=None, 8 | time=None, 9 | verbose=0, 10 | train=True, score=None): 11 | """ 12 | Print formatted training/testing progress information. 13 | 14 | Parameters 15 | ---------- 16 | epoch : int, optional 17 | Current training epoch. Default: 0 18 | loss : float or tuple, optional 19 | Loss value(s) for current epoch. If tuple, contains inner and outer losses. 20 | Default: 0 21 | source_train_acc : float, optional 22 | Source domain training accuracy. Default: None 23 | source_val_acc : float, optional 24 | Source domain validation accuracy. Default: None 25 | target : torch.Tensor, optional 26 | Target domain predictions/labels. Default: None 27 | time : float, optional 28 | Time taken for current epoch. Default: None 29 | verbose : int, optional 30 | Verbosity level controlling output detail: 31 | 32 | - 0: No output 33 | - 1: Basic loss information 34 | - 2: Add accuracy metrics 35 | - 3: Add detailed metrics (recall, precision, etc.; requires per-sample scores via the trailing score argument) 36 | 37 | Default: 0 38 | train : bool, optional 39 | Whether in training or testing mode. Default: True 40 | 41 | Notes 42 | ----- 43 | Output Levels: 44 | 45 | - Basic Output (verbose=1): 46 | 47 | * Epoch number (training) or "Test" (testing) 48 | * Loss values (single or inner/outer) 49 | 50 | - Extended Output (verbose=2): 51 | 52 | * Basic output 53 | * Source domain accuracy 54 | * Target domain accuracy 55 | * Timing information 56 | 57 | - Detailed Output (verbose=3): 58 | 59 | * Extended output 60 | * Recall at k 61 | * Precision at k 62 | * Average precision 63 | * F1 score 64 | * Contamination metrics 65 | 66 | Features: 67 | 68 | - Multi-level verbosity 69 | - Flexible metric display 70 | - Progress tracking 71 | - Performance monitoring 72 | 73 | Format: 74 | 75 | - Epoch XXXX: loss X.XXXX, source acc X.XXXX, target acc X.XXXX, time X.XX
76 | """ 77 | if verbose > 0: 78 | if train: 79 | print("Epoch {:04d}: ".format(epoch), end='') 80 | else: 81 | print("Test: ", end='') 82 | 83 | if isinstance(loss, tuple): 84 | print("Loss I {:.4f} | Loss O {:.4f} | " 85 | .format(loss[0], loss[1]), end='') 86 | else: 87 | print("loss {:.4f}, ".format(loss), end='') 88 | 89 | if verbose > 1: 90 | if source_train_acc is not None: 91 | print("source acc {:.4f}, ".format(source_train_acc), end='') 92 | 93 | if target is not None: 94 | print("target acc {:.4f}, ".format(target), end='') 95 | 96 | if verbose > 2: # detailed metrics need the 'score' argument; the eval_* helpers below are assumed to be provided by pygda's metrics module 97 | if target is not None: 98 | pos_size = target.nonzero().size(0) 99 | rec = eval_recall_at_k(target, score, pos_size) 100 | pre = eval_precision_at_k(target, score, pos_size) 101 | ap = eval_average_precision(target, score) 102 | 103 | contamination = sum(target) / len(target) 104 | threshold = np.percentile(score, 105 | 100 * (1 - contamination)) 106 | pred = (score > threshold).long() 107 | f1 = eval_f1(target, pred) 108 | 109 | print(" | Recall {:.4f} | Precision {:.4f} " 110 | "| AP {:.4f} | F1 {:.4f}" 111 | .format(rec, pre, ap, f1), end='') 112 | 113 | if time is not None: 114 | print("time {:.2f}".format(time), end='') 115 | 116 | print() -------------------------------------------------------------------------------- /pygda/nn/deepwalk_pretrain.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch_geometric.nn import Node2Vec 3 | 4 | class DWPretrain(torch.nn.Module): 5 | """ 6 | DeepWalk pretraining implementation for graph embeddings. 7 | 8 | Parameters 9 | ---------- 10 | data : torch_geometric.data.Data 11 | Input graph data object. 12 | epoch : int, optional 13 | Number of training epochs. Default: 200. 14 | embedding_dim : int, optional 15 | Dimension of node embeddings. Default: 128. 16 | walk_length : int, optional 17 | Length of each random walk. Default: 20. 18 | context_size : int, optional 19 | Size of context window. Default: 10. 20 | walks_per_node : int, optional 21 | Number of walks per node. Default: 10. 22 | num_negative_samples : int, optional 23 | Number of negative samples per positive pair. Default: 1. 24 | 25 | Notes 26 | ----- 27 | Implements DeepWalk algorithm using Node2Vec with p=q=1.0 (equivalent to DeepWalk). 28 | Uses sparse implementation for memory efficiency. 29 | """ 30 | 31 | def __init__( 32 | self, 33 | data, 34 | epoch=200, 35 | embedding_dim=128, 36 | walk_length=20, 37 | context_size=10, 38 | walks_per_node=10, 39 | num_negative_samples=1, 40 | ): 41 | super(DWPretrain, self).__init__() 42 | 43 | self.data = data 44 | self.device = data.edge_index.device 45 | self.epoch = epoch 46 | self.embedding_dim = embedding_dim 47 | self.walk_length = walk_length 48 | self.context_size = context_size 49 | self.walks_per_node = walks_per_node 50 | self.num_negative_samples = num_negative_samples 51 | 52 | self.model = Node2Vec( 53 | data.edge_index, 54 | embedding_dim=self.embedding_dim, 55 | walk_length=self.walk_length, 56 | context_size=self.context_size, 57 | walks_per_node=self.walks_per_node, 58 | num_negative_samples=self.num_negative_samples, 59 | p=1.0, 60 | q=1.0, 61 | sparse=True, 62 | ).to(self.device) 63 | 64 | num_workers = 4 65 | self.loader = self.model.loader(batch_size=128, shuffle=True, num_workers=num_workers) 66 | self.optimizer = torch.optim.SparseAdam(list(self.model.parameters()), lr=0.01) 67 | 68 | def train(self): 69 | """ 70 | Execute one epoch of training. 
71 | 72 | Returns 73 | ------- 74 | float 75 | Average loss value for the epoch. 76 | 77 | Notes 78 | ----- 79 | Training process: 80 | 81 | 1. Generate random walks 82 | 2. Sample positive and negative context pairs 83 | 3. Update embeddings using SparseAdam optimizer 84 | """ 85 | self.model.train() 86 | total_loss = 0 87 | for pos_rw, neg_rw in self.loader: 88 | self.optimizer.zero_grad() 89 | loss = self.model.loss(pos_rw.to(self.device), neg_rw.to(self.device)) 90 | loss.backward() 91 | self.optimizer.step() 92 | total_loss += loss.item() 93 | 94 | return total_loss / len(self.loader) 95 | 96 | def fit(self): 97 | """ 98 | Complete training procedure for all epochs. 99 | 100 | Notes 101 | ----- 102 | Executes training loop for specified number of epochs. 103 | Prints progress including epoch number and loss value. 104 | """ 105 | for epoch in range(self.epoch): 106 | loss = self.train() 107 | print(f'Epoch: {epoch:03d}, pretrain loss: {loss:.4f}') 108 | 109 | def get_embedding(self): 110 | """ 111 | Retrieve learned node embeddings. 112 | 113 | Returns 114 | ------- 115 | torch.Tensor 116 | Node embedding matrix of shape (num_nodes, embedding_dim). 117 | 118 | Notes 119 | ----- 120 | Returns final node embeddings after training or during evaluation. 121 | """ 122 | self.model.eval() 123 | z = self.model() 124 | 125 | return z 126 | -------------------------------------------------------------------------------- /pygda/datasets/tugraph.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | import torch 3 | import numpy as np 4 | from torch_geometric.data import InMemoryDataset, Data 5 | from torch_geometric.io import read_txt_array 6 | import torch.nn.functional as F 7 | import random 8 | 9 | import scipy 10 | import pickle as pkl 11 | from sklearn.preprocessing import label_binarize 12 | import csv 13 | import json 14 | 15 | import warnings 16 | warnings.filterwarnings('ignore', category=DeprecationWarning) 17 | 18 | 19 | class GraphTUDataset(InMemoryDataset): 20 | """ 21 | TUGraph Dataset loader for graph-based analysis. 22 | 23 | Parameters 24 | ---------- 25 | root : str 26 | Root directory where the dataset should be saved 27 | name : str 28 | Name of the TU dataset 29 | transform : callable, optional 30 | Function/transform that takes in a Data object and returns a transformed 31 | version. Default: None 32 | pre_transform : callable, optional 33 | Function/transform to be applied to the data object before saving. 34 | Default: None 35 | pre_filter : callable, optional 36 | Function that takes in a Data object and returns a boolean value, 37 | indicating whether the data object should be included. Default: None 38 | 39 | Notes 40 | ----- 41 | Dataset Structure: 42 | 43 | - Collection of graphs 44 | - Each graph has its own structure and features 45 | - Supports various graph classification tasks 46 | - Random shuffling for better training 47 | """ 48 | def __init__(self, 49 | root, 50 | name, 51 | transform=None, 52 | pre_transform=None, 53 | pre_filter=None): 54 | self.name = name 55 | self.root = root 56 | super(GraphTUDataset, self).__init__(root, transform, pre_transform, pre_filter) 57 | 58 | self.data, self.slices = torch.load(self.processed_paths[0]) 59 | 60 | @property 61 | def raw_file_names(self): 62 | """ 63 | Names of required raw files. 
64 | 65 | Returns 66 | ------- 67 | list[str] 68 | List of required raw file names 69 | 70 | Notes 71 | ----- 72 | Required files: 73 | 74 | - {name}.pkl: Pickle file containing the list of graph data objects 75 | """ 76 | return ['{}.pkl'.format(self.name)] 77 | 78 | @property 79 | def processed_file_names(self): 80 | """ 81 | Names of processed data files. 82 | 83 | Returns 84 | ------- 85 | list[str] 86 | List of processed file names 87 | 88 | Notes 89 | ----- 90 | Processed files: 91 | 92 | - data.pt: Contains processed PyTorch Geometric data objects 93 | """ 94 | return ['data.pt'] 95 | 96 | def download(self): 97 | """ 98 | Download raw data files. 99 | 100 | Notes 101 | ----- 102 | Empty implementation - data should be manually placed in the raw directory 103 | """ 104 | pass 105 | 106 | def process(self): 107 | """ 108 | Process raw data into PyTorch Geometric Data format. 109 | 110 | Notes 111 | ----- 112 | Processing Steps: 113 | 114 | - Load pickle data: 115 | 116 | * List of graph data objects 117 | 118 | - Random shuffling: 119 | 120 | * Shuffle graphs for better training 121 | 122 | - Apply pre-transform: 123 | 124 | * Transform each graph if specified 125 | 126 | - Collate graphs: 127 | 128 | * Combine into single data object 129 | 130 | - Save processed data 131 | 132 | Features: 133 | 134 | - Multiple graph handling 135 | - Random shuffling 136 | - Optional pre-transform support 137 | - Batch processing support 138 | """ 139 | path = osp.join(self.raw_dir, '{}.pkl'.format(self.name)) 140 | data_list = pkl.load(open(path, 'rb')) 141 | random.shuffle(data_list) 142 | 143 | if self.pre_transform is not None: 144 | data_list = [self.pre_transform(data) for data in data_list] 145 | 146 | self.data, self.slices = self.collate(data_list) 147 | 148 | torch.save((self.data, self.slices), self.processed_paths[0]) 149 | -------------------------------------------------------------------------------- /docs/assets/css/custom.css: -------------------------------------------------------------------------------- 1 | /* Global styles */ 2 | .rst-content { 3 | max-width: 1000px; 4 | margin: 0 auto; 5 | line-height: 1.7; 6 | } 7 | 8 | /* Beautiful Typography */ 9 | .rst-content h1 { 10 | font-size: 2.5rem; 11 | color: #2c3e50; 12 | border-bottom: 3px solid #3498db; 13 | padding-bottom: 0.5rem; 14 | margin-bottom: 2rem; 15 | } 16 | 17 | .rst-content h2 { 18 | font-size: 2rem; 19 | color: #34495e; 20 | margin-top: 2.5rem; 21 | margin-bottom: 1.5rem; 22 | } 23 | 24 | .rst-content h3 { 25 | font-size: 1.75rem; 26 | color: #2980b9; 27 | margin-top: 2rem; 28 | } 29 | 30 | /* Function signatures with gradient background */ 31 | .rst-content .function { 32 | background: linear-gradient(to right, #f6f9fc, #ffffff); 33 | border-left: 4px solid #3498db; 34 | padding: 1rem; 35 | margin: 1.5rem 0; 36 | border-radius: 0 6px 6px 0; 37 | font-size: 1.2rem; 38 | font-weight: 600; 39 | color: #2c3e50; 40 | box-shadow: 0 2px 4px rgba(0,0,0,0.1); 41 | } 42 | 43 | /* Beautiful docstring styling */ 44 | .rst-content .docstring { 45 | background: #ffffff; 46 | padding: 1.5rem; 47 | margin: 1rem 0 2rem 0; 48 | border-radius: 8px; 49 | box-shadow: 0 4px 6px rgba(0,0,0,0.07); 50 | } 51 | 52 | .rst-content .docstring .parameter { 53 | font-family: "Segoe UI", system-ui, -apple-system, sans-serif; 54 | color: #2980b9; 55 | background-color: #f8f9fa; 56 | padding: 1rem 1.5rem; 57 | margin: 0.5rem 0; 58 | border-radius: 6px; 59 | border: 1px solid #e1e8ed; 60 | transition: all 0.2s ease; 61 | } 62 | 63 | .rst-content .docstring .parameter:hover { 64 |
transform: translateY(-2px); 65 | box-shadow: 0 4px 8px rgba(0,0,0,0.1); 66 | } 67 | 68 | /* Code blocks with modern styling */ 69 | .rst-content pre { 70 | background: #282c34; 71 | border-radius: 8px; 72 | margin: 1.5rem 0; 73 | box-shadow: 0 4px 6px rgba(0,0,0,0.1); 74 | } 75 | 76 | .rst-content pre code { 77 | color: #abb2bf; /* light text on the dark #282c34 background (was #282c34, which made code invisible) */ 78 | font-family: 'Fira Code', 'Consolas', monospace; 79 | padding: 1.5rem; 80 | font-size: 0.95rem; 81 | line-height: 1.6; 82 | } 83 | 84 | /* Inline code */ 85 | .rst-content code { 86 | color: #e83e8c; 87 | background: #f8f9fa; 88 | padding: 2px 6px; 89 | border-radius: 4px; 90 | font-size: 0.9em; 91 | } 92 | 93 | /* Method signatures */ 94 | .rst-content .method .signature { 95 | color: #6c5ce7; 96 | font-weight: 600; 97 | background: #f8f9fa; 98 | padding: 1rem; 99 | border-radius: 6px; 100 | margin: 1rem 0; 101 | border: 1px solid #e1e8ed; 102 | } 103 | 104 | /* Summary sections */ 105 | .rst-content .docstring .summary { 106 | font-weight: 600; 107 | color: #2c3e50; 108 | font-size: 1.1rem; 109 | margin-bottom: 1rem; 110 | padding-bottom: 0.5rem; 111 | border-bottom: 2px solid #e1e8ed; 112 | } 113 | 114 | /* Arguments styling */ 115 | .rst-content .arguments .argument { 116 | color: #576574; 117 | padding: 0.5rem 0; 118 | border-bottom: 1px solid #f1f1f1; 119 | } 120 | 121 | /* Admonitions (notes, warnings, etc.) */ 122 | .rst-content .admonition { 123 | border-radius: 8px; 124 | border: none; 125 | box-shadow: 0 2px 4px rgba(0,0,0,0.1); 126 | margin: 1.5rem 0; 127 | } 128 | 129 | .rst-content .admonition-title { 130 | border-radius: 8px 8px 0 0; 131 | text-transform: uppercase; 132 | letter-spacing: 0.5px; 133 | font-size: 0.9rem; 134 | } 135 | 136 | /* Links */ 137 | .rst-content a { 138 | color: #3498db; 139 | text-decoration: none; 140 | transition: color 0.2s ease; 141 | } 142 | 143 | .rst-content a:hover { 144 | color: #2980b9; 145 | text-decoration: underline; 146 | } 147 | 148 | /* Tables */ 149 | .rst-content table { 150 | border-radius: 8px; 151 | overflow: hidden; 152 | box-shadow: 0 2px 4px rgba(0,0,0,0.1); 153 | margin: 2rem 0; 154 | } 155 | 156 | .rst-content table thead th { 157 | background: #f8f9fa; 158 | border-bottom: 2px solid #e1e8ed; 159 | color: #2c3e50; 160 | padding: 12px 15px; 161 | } 162 | 163 | .rst-content table td { 164 | padding: 12px 15px; 165 | border-bottom: 1px solid #e1e8ed; 166 | } 167 | 168 | /* Lists */ 169 | .rst-content ul, .rst-content ol { 170 | padding-left: 1.5rem; 171 | margin: 1rem 0; 172 | } 173 | 174 | .rst-content li { 175 | margin: 0.5rem 0; 176 | color: #2c3e50; 177 | } 178 | 179 | /* Smooth scrolling */ 180 | html { 181 | scroll-behavior: smooth; 182 | } 183 | 184 | /* Selection color */ 185 | ::selection { 186 | background: #3498db33; 187 | color: #2c3e50; 188 | } 189 | -------------------------------------------------------------------------------- /.github/workflows/codeql.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages.
11 | # 12 | name: "CodeQL Advanced" 13 | 14 | on: 15 | push: 16 | branches: [ "main" ] 17 | pull_request: 18 | branches: [ "main" ] 19 | schedule: 20 | - cron: '31 12 * * 2' 21 | 22 | jobs: 23 | analyze: 24 | name: Analyze (${{ matrix.language }}) 25 | # Runner size impacts CodeQL analysis time. To learn more, please see: 26 | # - https://gh.io/recommended-hardware-resources-for-running-codeql 27 | # - https://gh.io/supported-runners-and-hardware-resources 28 | # - https://gh.io/using-larger-runners (GitHub.com only) 29 | # Consider using larger runners or machines with greater resources for possible analysis time improvements. 30 | runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }} 31 | permissions: 32 | # required for all workflows 33 | security-events: write 34 | 35 | # required to fetch internal or private CodeQL packs 36 | packages: read 37 | 38 | # only required for workflows in private repositories 39 | actions: read 40 | contents: read 41 | 42 | strategy: 43 | fail-fast: false 44 | matrix: 45 | include: 46 | - language: python 47 | build-mode: none 48 | # CodeQL supports the following values keywords for 'language': 'c-cpp', 'csharp', 'go', 'java-kotlin', 'javascript-typescript', 'python', 'ruby', 'swift' 49 | # Use `c-cpp` to analyze code written in C, C++ or both 50 | # Use 'java-kotlin' to analyze code written in Java, Kotlin or both 51 | # Use 'javascript-typescript' to analyze code written in JavaScript, TypeScript or both 52 | # To learn more about changing the languages that are analyzed or customizing the build mode for your analysis, 53 | # see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/customizing-your-advanced-setup-for-code-scanning. 54 | # If you are analyzing a compiled language, you can modify the 'build-mode' for that language to customize how 55 | # your codebase is analyzed, see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/codeql-code-scanning-for-compiled-languages 56 | steps: 57 | - name: Checkout repository 58 | uses: actions/checkout@v4 59 | 60 | # Add any setup steps before running the `github/codeql-action/init` action. 61 | # This includes steps like installing compilers or runtimes (`actions/setup-node` 62 | # or others). This is typically only required for manual builds. 63 | # - name: Setup runtime (example) 64 | # uses: actions/setup-example@v1 65 | 66 | # Initializes the CodeQL tools for scanning. 67 | - name: Initialize CodeQL 68 | uses: github/codeql-action/init@v3 69 | with: 70 | languages: ${{ matrix.language }} 71 | build-mode: ${{ matrix.build-mode }} 72 | # If you wish to specify custom queries, you can do so here or in a config file. 73 | # By default, queries listed here will override any specified in a config file. 74 | # Prefix the list here with "+" to use these queries and those in the config file. 75 | 76 | # For more details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs 77 | # queries: security-extended,security-and-quality 78 | 79 | # If the analyze step fails for one of the languages you are analyzing with 80 | # "We were unable to automatically build your code", modify the matrix above 81 | # to set the build mode to "manual" for that language. Then modify this step 82 | # to build your code. 
83 | # ℹ️ Command-line programs to run using the OS shell. 84 | # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun 85 | - if: matrix.build-mode == 'manual' 86 | shell: bash 87 | run: | 88 | echo 'If you are using a "manual" build mode for one or more of the' \ 89 | 'languages you are analyzing, replace this with the commands to build' \ 90 | 'your code, for example:' 91 | echo ' make bootstrap' 92 | echo ' make release' 93 | exit 1 94 | 95 | - name: Perform CodeQL Analysis 96 | uses: github/codeql-action/analyze@v3 97 | with: 98 | category: "/language:${{matrix.language}}" 99 | -------------------------------------------------------------------------------- /benchmark/llm/kbl.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('..') 3 | 4 | import torch 5 | import argparse 6 | import time 7 | import os.path as osp 8 | import numpy as np 9 | 10 | from pygda.datasets import ArxivDataset 11 | 12 | from pygda.models import KBL 13 | 14 | from pygda.metrics import eval_micro_f1, eval_macro_f1, eval_roc_auc 15 | 16 | from torch_geometric.loader import NeighborLoader 17 | from torch_geometric.utils import degree, is_undirected, to_undirected 18 | from torch_geometric.transforms import OneHotDegree 19 | 20 | parser = argparse.ArgumentParser() 21 | 22 | # model agnostic params 23 | parser.add_argument('--seed', type=int, default=200, help='random seed') 24 | parser.add_argument('--num_layers', type=int, default=3, help='number of layers') 25 | parser.add_argument('--lr', type=float, default=0.001, help='learning rate') 26 | parser.add_argument('--weight_decay', type=float, default=0.0, help='weight decay') 27 | parser.add_argument('--nhid', type=int, default=128, help='hidden size') 28 | parser.add_argument('--dropout_ratio', type=float, default=0.1, help='dropout ratio') 29 | parser.add_argument('--device', type=str, default='cuda:3', help='specify cuda devices') 30 | parser.add_argument('--source', type=str, default='llm-bert-arxiv-1950-2016', help='source domain data, DBLPv7/ACMv9/Citationv1') 31 | parser.add_argument('--target', type=str, default='llm-bert-arxiv-2016-2018', help='target domain data, DBLPv7/ACMv9/Citationv1') 32 | parser.add_argument('--epochs', type=int, default=800, help='maximum number of epochs') 33 | parser.add_argument('--filename', type=str, default='test.txt', help='store results into file') 34 | 35 | # model specific params 36 | parser.add_argument('--k_cross', type=int, default=10, help='number of edges for cross domains') 37 | parser.add_argument('--k_within', type=int, default=3, help='number of edges for within domains') 38 | 39 | args = parser.parse_args() 40 | 41 | # load data 42 | if args.source in {'arxiv-1950-2016', 'arxiv-2016-2018', 'arxiv-2018-2020'}: 43 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/arxiv', args.source) 44 | source_dataset = ArxivDataset(path, args.source) 45 | elif args.source in {'llm-arxiv-1950-2016', 'llm-arxiv-2016-2018', 'llm-arxiv-2018-2020'}: 46 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/llm-arxiv', args.source) 47 | source_dataset = ArxivDataset(path, args.source) 48 | elif args.source in {'llm-bert-arxiv-1950-2016', 'llm-bert-arxiv-2016-2018', 'llm-bert-arxiv-2018-2020'}: 49 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/bert-arxiv', args.source) 50 | source_dataset = ArxivDataset(path, args.source) 51 | 52 | if args.target in 
{'arxiv-1950-2016', 'arxiv-2016-2018', 'arxiv-2018-2020'}: 53 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/arxiv', args.target) 54 | target_dataset = ArxivDataset(path, args.target) 55 | elif args.target in {'llm-arxiv-1950-2016', 'llm-arxiv-2016-2018', 'llm-arxiv-2018-2020'}: 56 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/llm-arxiv', args.target) 57 | target_dataset = ArxivDataset(path, args.target) 58 | elif args.target in {'llm-bert-arxiv-1950-2016', 'llm-bert-arxiv-2016-2018', 'llm-bert-arxiv-2018-2020'}: 59 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/bert-arxiv', args.target) 60 | target_dataset = ArxivDataset(path, args.target) 61 | 62 | source_data = source_dataset[0].to(args.device) 63 | target_data = target_dataset[0].to(args.device) 64 | 65 | if not is_undirected(source_data.edge_index): 66 | source_data.edge_index = to_undirected(source_data.edge_index) 67 | 68 | if not is_undirected(target_data.edge_index): 69 | target_data.edge_index = to_undirected(target_data.edge_index) 70 | 71 | num_features = source_data.x.size(1) 72 | num_classes = len(np.unique(source_data.y.cpu().numpy())) 73 | 74 | # choose a graph domain adaptation model 75 | model = KBL( 76 | in_dim=num_features, 77 | hid_dim=args.nhid, 78 | num_classes=num_classes, 79 | num_layers=args.num_layers, 80 | weight_decay=args.weight_decay, 81 | lr=args.lr, 82 | dropout=args.dropout_ratio, 83 | epoch=args.epochs, 84 | device=args.device, 85 | k_cross=args.k_cross, 86 | k_within=args.k_within 87 | ) 88 | 89 | # train the model 90 | model.fit(source_data, target_data) 91 | 92 | # evaluate the performance 93 | logits, labels = model.predict(target_data) 94 | 95 | preds = logits.argmax(dim=1) 96 | 97 | mi_f1 = eval_micro_f1(labels, preds) 98 | ma_f1 = eval_macro_f1(labels, preds) 99 | 100 | if args.source in {'DE', 'EN', 'ES', 'FR', 'PT', 'RU'}: 101 | auc = eval_roc_auc(labels, logits[:, 1]) 102 | else: 103 | auc = 0.0 104 | 105 | results = 'kbl,source,' + args.source + ',target,' + args.target + ',micro-f1,' + str(mi_f1) + ',macro-f1,' + str(ma_f1) + ',auc,' + str(auc) 106 | 107 | with open(args.filename, 'a+') as f: 108 | f.write(results + '\n') 109 | 110 | print(results) -------------------------------------------------------------------------------- /benchmark/llm/udagcn.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('..') 3 | 4 | import torch 5 | import argparse 6 | import time 7 | import os.path as osp 8 | import numpy as np 9 | 10 | from pygda.datasets import ArxivDataset 11 | 12 | from pygda.models import UDAGCN 13 | 14 | from pygda.metrics import eval_micro_f1, eval_macro_f1, eval_roc_auc 15 | 16 | from torch_geometric.utils import degree, is_undirected, to_undirected 17 | from torch_geometric.transforms import OneHotDegree 18 | 19 | parser = argparse.ArgumentParser() 20 | 21 | # model agnostic params 22 | parser.add_argument('--seed', type=int, default=200, help='random seed') 23 | parser.add_argument('--num_layers', type=int, default=3, help='number of layers') 24 | parser.add_argument('--lr', type=float, default=0.001, help='learning rate') 25 | parser.add_argument('--weight_decay', type=float, default=0.005, help='weight decay') 26 | parser.add_argument('--nhid', type=int, default=128, help='hidden size') 27 | parser.add_argument('--dropout_ratio', type=float, default=0.1, help='dropout ratio') 28 | parser.add_argument('--device', type=str, default='cuda:1', help='specify 
cuda devices') 29 | parser.add_argument('--source', type=str, default='llm-bert-arxiv-1950-2016', help='source domain data, DBLPv7/ACMv9/Citationv1') 30 | parser.add_argument('--target', type=str, default='llm-bert-arxiv-2018-2020', help='target domain data, DBLPv7/ACMv9/Citationv1') 31 | parser.add_argument('--epochs', type=int, default=200, help='maximum number of epochs') 32 | parser.add_argument('--filename', type=str, default='test.txt', help='store results into file') 33 | 34 | # model specific params 35 | parser.add_argument('--ppmi', type=bool, default=True, help='use PPMI matrix or not') 36 | parser.add_argument('--adv_dim', type=int, default=40, help='hidden dimension of adversarial module') 37 | 38 | args = parser.parse_args() 39 | 40 | # load data 41 | if args.source in {'arxiv-1950-2016', 'arxiv-2016-2018', 'arxiv-2018-2020'}: 42 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/arxiv', args.source) 43 | source_dataset = ArxivDataset(path, args.source) 44 | elif args.source in {'llm-arxiv-1950-2016', 'llm-arxiv-2016-2018', 'llm-arxiv-2018-2020'}: 45 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/llm-arxiv', args.source) 46 | source_dataset = ArxivDataset(path, args.source) 47 | elif args.source in {'llm-bert-arxiv-1950-2016', 'llm-bert-arxiv-2016-2018', 'llm-bert-arxiv-2018-2020'}: 48 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/bert-arxiv', args.source) 49 | source_dataset = ArxivDataset(path, args.source) 50 | 51 | if args.target in {'arxiv-1950-2016', 'arxiv-2016-2018', 'arxiv-2018-2020'}: 52 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/arxiv', args.target) 53 | target_dataset = ArxivDataset(path, args.target) 54 | elif args.target in {'llm-arxiv-1950-2016', 'llm-arxiv-2016-2018', 'llm-arxiv-2018-2020'}: 55 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/llm-arxiv', args.target) 56 | target_dataset = ArxivDataset(path, args.target) 57 | elif args.target in {'llm-bert-arxiv-1950-2016', 'llm-bert-arxiv-2016-2018', 'llm-bert-arxiv-2018-2020'}: 58 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/bert-arxiv', args.target) 59 | target_dataset = ArxivDataset(path, args.target) 60 | 61 | source_data = source_dataset[0].to(args.device) 62 | target_data = target_dataset[0].to(args.device) 63 | 64 | if args.source not in {'DE', 'EN', 'ES', 'FR', 'PT', 'RU'}: 65 | if not is_undirected(source_data.edge_index): 66 | source_data.edge_index = to_undirected(source_data.edge_index) 67 | 68 | if not is_undirected(target_data.edge_index): 69 | target_data.edge_index = to_undirected(target_data.edge_index) 70 | 71 | num_features = source_data.x.size(1) 72 | num_classes = len(np.unique(source_data.y.cpu().numpy())) 73 | 74 | # choose a graph domain adaptation model 75 | model = UDAGCN( 76 | in_dim=num_features, 77 | hid_dim=args.nhid, 78 | num_classes=num_classes, 79 | num_layers=args.num_layers, 80 | weight_decay=args.weight_decay, 81 | lr=args.lr, 82 | dropout=args.dropout_ratio, 83 | epoch=args.epochs, 84 | device=args.device, 85 | ppmi=args.ppmi, 86 | adv_dim=args.adv_dim 87 | ) 88 | 89 | # train the model 90 | model.fit(source_data, target_data) 91 | 92 | # evaluate the performance 93 | logits, labels = model.predict(target_data) 94 | 95 | preds = logits.argmax(dim=1) 96 | 97 | mi_f1 = eval_micro_f1(labels, preds) 98 | ma_f1 = eval_macro_f1(labels, preds) 99 | 100 | if args.source in {'DE', 'EN', 'ES', 'FR', 'PT', 'RU'}: 101 | auc = eval_roc_auc(labels, logits[:, 1]) 102 | 
else: 103 | auc = 0.0 104 | 105 | results = 'udagcn,source,' + args.source + ',target,' + args.target + ',micro-f1,' + str(mi_f1) + ',macro-f1,' + str(ma_f1) + ',auc,' + str(auc) 106 | 107 | with open(args.filename, 'a+') as f: 108 | f.write(results + '\n') 109 | 110 | print(results) -------------------------------------------------------------------------------- /benchmark/llm/grade.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('..') 3 | 4 | import torch 5 | import argparse 6 | import time 7 | import os.path as osp 8 | import numpy as np 9 | 10 | from pygda.datasets import ArxivDataset 11 | 12 | from pygda.models import GRADE 13 | 14 | from pygda.metrics import eval_micro_f1, eval_macro_f1, eval_roc_auc 15 | 16 | from torch_geometric.loader import NeighborLoader 17 | from torch_geometric.utils import degree, is_undirected, to_undirected 18 | from torch_geometric.transforms import OneHotDegree 19 | 20 | parser = argparse.ArgumentParser() 21 | 22 | # model agnostic params 23 | parser.add_argument('--seed', type=int, default=200, help='random seed') 24 | parser.add_argument('--num_layers', type=int, default=3, help='number of layers') 25 | parser.add_argument('--lr', type=float, default=0.001, help='learning rate') 26 | parser.add_argument('--weight_decay', type=float, default=0.0, help='weight decay') 27 | parser.add_argument('--nhid', type=int, default=128, help='hidden size') 28 | parser.add_argument('--dropout_ratio', type=float, default=0.1, help='dropout ratio') 29 | parser.add_argument('--device', type=str, default='cuda:3', help='specify cuda devices') 30 | parser.add_argument('--source', type=str, default='arxiv-1950-2016', help='source domain data, DBLPv7/ACMv9/Citationv1') 31 | parser.add_argument('--target', type=str, default='arxiv-2016-2018', help='target domain data, DBLPv7/ACMv9/Citationv1') 32 | parser.add_argument('--epochs', type=int, default=800, help='maximum number of epochs') 33 | parser.add_argument('--filename', type=str, default='test.txt', help='store results into file') 34 | 35 | # model specific params 36 | parser.add_argument('--disc', type=str, default='JS', help='discriminator') 37 | parser.add_argument('--weight', type=float, default=0.01, help='trade off parameter for loss') 38 | 39 | args = parser.parse_args() 40 | 41 | # load data 42 | if args.source in {'arxiv-1950-2016', 'arxiv-2016-2018', 'arxiv-2018-2020'}: 43 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/arxiv', args.source) 44 | source_dataset = ArxivDataset(path, args.source) 45 | elif args.source in {'llm-arxiv-1950-2016', 'llm-arxiv-2016-2018', 'llm-arxiv-2018-2020'}: 46 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/llm-arxiv', args.source) 47 | source_dataset = ArxivDataset(path, args.source) 48 | elif args.source in {'llm-bert-arxiv-1950-2016', 'llm-bert-arxiv-2016-2018', 'llm-bert-arxiv-2018-2020'}: 49 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/bert-arxiv', args.source) 50 | source_dataset = ArxivDataset(path, args.source) 51 | 52 | if args.target in {'arxiv-1950-2016', 'arxiv-2016-2018', 'arxiv-2018-2020'}: 53 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/arxiv', args.target) 54 | target_dataset = ArxivDataset(path, args.target) 55 | elif args.target in {'llm-arxiv-1950-2016', 'llm-arxiv-2016-2018', 'llm-arxiv-2018-2020'}: 56 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/llm-arxiv', args.target) 57 | target_dataset 
= ArxivDataset(path, args.target) 58 | elif args.target in {'llm-bert-arxiv-1950-2016', 'llm-bert-arxiv-2016-2018', 'llm-bert-arxiv-2018-2020'}: 59 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/bert-arxiv', args.target) 60 | target_dataset = ArxivDataset(path, args.target) 61 | 62 | source_data = source_dataset[0].to(args.device) 63 | target_data = target_dataset[0].to(args.device) 64 | 65 | if args.source not in {'DE', 'EN', 'ES', 'FR', 'PT', 'RU'}: 66 | if not is_undirected(source_data.edge_index): 67 | source_data.edge_index = to_undirected(source_data.edge_index) 68 | 69 | if not is_undirected(target_data.edge_index): 70 | target_data.edge_index = to_undirected(target_data.edge_index) 71 | 72 | num_features = source_data.x.size(1) 73 | num_classes = len(np.unique(source_data.y.cpu().numpy())) 74 | 75 | # choose a graph domain adaptation model 76 | model = GRADE( 77 | in_dim=num_features, 78 | hid_dim=args.nhid, 79 | num_classes=num_classes, 80 | num_layers=args.num_layers, 81 | weight_decay=args.weight_decay, 82 | lr=args.lr, 83 | dropout=args.dropout_ratio, 84 | epoch=args.epochs, 85 | device=args.device, 86 | disc=args.disc, 87 | weight=args.weight 88 | ) 89 | 90 | # train the model 91 | model.fit(source_data, target_data) 92 | 93 | # evaluate the performance 94 | logits, labels = model.predict(target_data) 95 | 96 | preds = logits.argmax(dim=1) 97 | 98 | mi_f1 = eval_micro_f1(labels, preds) 99 | ma_f1 = eval_macro_f1(labels, preds) 100 | 101 | if args.source in {'DE', 'EN', 'ES', 'FR', 'PT', 'RU'}: 102 | auc = eval_roc_auc(labels, logits[:, 1]) 103 | else: 104 | auc = 0.0 105 | 106 | results = 'grade,source,' + args.source + ',target,' + args.target + ',micro-f1,' + str(mi_f1) + ',macro-f1,' + str(ma_f1) + ',auc,' + str(auc) 107 | 108 | with open(args.filename, 'a+') as f: 109 | f.write(results + '\n') 110 | 111 | print(results) -------------------------------------------------------------------------------- /benchmark/graph/cwgcn.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('..') 3 | 4 | import torch 5 | import argparse 6 | import time 7 | import os.path as osp 8 | import numpy as np 9 | 10 | from pygda.datasets import GraphTUDataset 11 | 12 | from pygda.models import CWGCN 13 | 14 | from pygda.metrics import eval_micro_f1, eval_macro_f1, eval_roc_auc 15 | 16 | from torch_geometric.loader import NeighborLoader 17 | from torch_geometric.utils import degree, is_undirected, to_undirected 18 | from torch_geometric.transforms import OneHotDegree 19 | 20 | parser = argparse.ArgumentParser() 21 | 22 | # model agnostic params 23 | parser.add_argument('--seed', type=int, default=200, help='random seed') 24 | parser.add_argument('--num_layers', type=int, default=2, help='number of layers') 25 | parser.add_argument('--lr', type=float, default=0.001, help='learning rate') 26 | parser.add_argument('--weight_decay', type=float, default=0.0, help='weight decay') 27 | parser.add_argument('--nhid', type=int, default=128, help='hidden size') 28 | parser.add_argument('--dropout_ratio', type=float, default=0.1, help='dropout ratio') 29 | parser.add_argument('--device', type=str, default='cuda:1', help='specify cuda devices') 30 | parser.add_argument('--source', type=str, default='FRANKENSTEIN_F1', help='source domain data, DBLPv7/ACMv9/Citationv1') 31 | parser.add_argument('--target', type=str, default='FRANKENSTEIN_F2', help='target domain data, DBLPv7/ACMv9/Citationv1') 32 | 
parser.add_argument('--epochs', type=int, default=200, help='maximum number of epochs') 33 | parser.add_argument('--filename', type=str, default='test.txt', help='store results into file') 34 | 35 | # model specific params 36 | parser.add_argument('--gnn', type=str, default='gcn', help='GNN backbone') 37 | parser.add_argument('--mode', type=str, default='graph', help='node or graph tasks') 38 | 39 | args = parser.parse_args() 40 | 41 | # load data 42 | if args.source in {'FRANKENSTEIN_F1', 'FRANKENSTEIN_F2'}: 43 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/FRANKENSTEIN', args.source) 44 | source_dataset = GraphTUDataset(path, args.source) 45 | elif args.source in {'PROTEINS_P1', 'PROTEINS_P2'}: 46 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/PROTEINS', args.source) 47 | source_dataset = GraphTUDataset(path, args.source) 48 | elif args.source in {'Mutagenicity_M1', 'Mutagenicity_M2'}: 49 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/Mutagenicity', args.source) 50 | source_dataset = GraphTUDataset(path, args.source) 51 | 52 | if args.target in {'FRANKENSTEIN_F1', 'FRANKENSTEIN_F2'}: 53 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/FRANKENSTEIN', args.target) 54 | target_dataset = GraphTUDataset(path, args.target) 55 | elif args.target in {'PROTEINS_P1', 'PROTEINS_P2'}: 56 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/PROTEINS', args.target) 57 | target_dataset = GraphTUDataset(path, args.target) 58 | elif args.target in {'Mutagenicity_M1', 'Mutagenicity_M2'}: 59 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/Mutagenicity', args.target) 60 | target_dataset = GraphTUDataset(path, args.target) 61 | 62 | if args.mode == 'node': 63 | source_data = source_dataset[0].to(args.device) 64 | target_data = target_dataset[0].to(args.device) 65 | 66 | num_features = source_data.x.size(1) 67 | num_classes = len(np.unique(source_data.y.cpu().numpy())) 68 | 69 | if args.source not in {'DE', 'EN', 'ES', 'FR', 'PT', 'RU'}: 70 | if not is_undirected(source_data.edge_index): 71 | source_data.edge_index = to_undirected(source_data.edge_index) 72 | 73 | if not is_undirected(target_data.edge_index): 74 | target_data.edge_index = to_undirected(target_data.edge_index) 75 | elif args.mode == 'graph': 76 | source_data = source_dataset 77 | target_data = target_dataset 78 | 79 | num_features = source_data.num_features 80 | num_classes = source_data.num_classes 81 | 82 | # choose a graph domain adaptation model 83 | model = CWGCN( 84 | in_dim=num_features, 85 | hid_dim=args.nhid, 86 | num_classes=num_classes, 87 | mode=args.mode, 88 | num_layers=args.num_layers, 89 | weight_decay=args.weight_decay, 90 | lr=args.lr, 91 | dropout=args.dropout_ratio, 92 | epoch=args.epochs, 93 | device=args.device, 94 | gnn=args.gnn 95 | ) 96 | 97 | # train the model 98 | model.fit(source_data, target_data) 99 | 100 | # evaluate the performance 101 | logits, labels = model.predict(target_data) 102 | 103 | preds = logits.argmax(dim=1) 104 | 105 | mi_f1 = eval_micro_f1(labels, preds) 106 | ma_f1 = eval_macro_f1(labels, preds) 107 | 108 | if args.source in {'DE', 'EN', 'ES', 'FR', 'PT', 'RU'}: 109 | auc = eval_roc_auc(labels, logits[:, 1]) 110 | else: 111 | auc = 0.0 112 | 113 | results = 'cwgcn,source,' + args.source + ',target,' + args.target + ',micro-f1,' + str(mi_f1) + ',macro-f1,' + str(ma_f1) + ',auc,' + str(auc) 114 | 115 | with open(args.filename, 'a+') as f: 116 | f.write(results + '\n') 117 | 118 | 
print(results) -------------------------------------------------------------------------------- /pygda/models/base.py: -------------------------------------------------------------------------------- 1 | import time 2 | from abc import ABC, abstractmethod 3 | 4 | import torch 5 | import numpy as np 6 | import torch.nn.functional as F 7 | 8 | from torch_geometric.loader import NeighborLoader 9 | 10 | from ..utils import logger 11 | 12 | 13 | class BaseGDA(ABC): 14 | """ 15 | Abstract Class for Graph Domain Adaptation. 16 | 17 | Parameters 18 | ---------- 19 | in_dim : int 20 | Input feature dimension. 21 | hid_dim : int 22 | Hidden dimension of model. 23 | num_classes : int 24 | Total number of classes. 25 | num_layers : int, optional 26 | Total number of layers in model. Default: ``2``. 27 | dropout : float, optional 28 | Dropout rate. Default: ``0.``. 29 | weight_decay : float, optional 30 | Weight decay (L2 penalty). Default: ``0.``. 31 | act : callable activation function or None, optional 32 | Activation function if not None. 33 | Default: ``torch.nn.functional.relu``. 34 | lr : float, optional 35 | Learning rate. Default: ``0.004``. 36 | epoch : int, optional 37 | Maximum number of training epochs. Default: ``100``. 38 | device : str, optional 39 | GPU or CPU. Default: ``cuda:0``. 40 | batch_size : int, optional 41 | Minibatch size, 0 for full batch training. Default: ``0``. 42 | num_neigh : int, optional 43 | Number of neighbors in sampling, -1 for all neighbors. 44 | Default: ``-1``. 45 | verbose : int, optional 46 | Verbosity mode. Range in [0, 3]. Larger value for printing out 47 | more log information. Default: ``2``. 48 | **kwargs 49 | Other parameters for the model. 50 | """ 51 | 52 | def __init__( 53 | self, 54 | in_dim, 55 | hid_dim, 56 | num_classes, 57 | num_layers=2, 58 | dropout=0., 59 | weight_decay=0., 60 | act=F.relu, 61 | lr=4e-3, 62 | epoch=100, 63 | device='cuda:0', 64 | batch_size=0, 65 | num_neigh=-1, 66 | verbose=2, 67 | **kwargs): 68 | 69 | super(BaseGDA, self).__init__() 70 | 71 | self.in_dim = in_dim 72 | self.hid_dim = hid_dim 73 | self.num_classes = num_classes 74 | self.num_layers = num_layers 75 | self.dropout = dropout 76 | self.weight_decay = weight_decay 77 | self.act = act 78 | self.verbose = verbose 79 | self.kwargs = kwargs 80 | 81 | self.lr = lr 82 | self.epoch = epoch 83 | self.device = device 84 | self.batch_size = batch_size 85 | 86 | if type(num_neigh) is int: 87 | self.num_neigh = [num_neigh] * self.num_layers 88 | elif type(num_neigh) is list: 89 | if len(num_neigh) != self.num_layers: 90 | raise ValueError('Number of neighbors should have the ' 91 | 'same length as hidden layers dimension or ' 92 | 'the number of layers.') 93 | self.num_neigh = num_neigh 94 | else: 95 | raise ValueError('Number of neighbors must be int or list of int') 96 | 97 | self.model = None 98 | 99 | def fit(self, data, **kwargs): 100 | """ 101 | Training the graph neural network. 102 | 103 | Parameters 104 | ---------- 105 | data : torch_geometric.data.Data 106 | The input graph. 107 | """ 108 | 109 | 110 | def predict(self, data, **kwargs): 111 | """Prediction on the testing graph using the fitted graph domain adaptation model. 112 | Returns the output logits and ground-truth labels by default. 113 | 114 | Parameters 115 | ---------- 116 | data : torch_geometric.data.Data 117 | The testing graph. 118 | 119 | Returns 120 | ------- 121 | logits : torch.Tensor 122 | The output logits of shape :math:`(N, C)` for :math:`C` classes. 123 | labels : torch.Tensor 124 | The ground-truth labels of shape :math:`N`.
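Examples -------- A minimal usage sketch with a concrete subclass (``UDAGCN`` here; ``source_data``, ``target_data``, ``num_features`` and ``num_classes`` are placeholders prepared as in the benchmark scripts): >>> from pygda.models import UDAGCN >>> model = UDAGCN(in_dim=num_features, hid_dim=128, num_classes=num_classes) >>> model.fit(source_data, target_data) >>> logits, labels = model.predict(target_data) >>> preds = logits.argmax(dim=1)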
125 | """ 126 | 127 | @abstractmethod 128 | def init_model(self, **kwargs): 129 | """ 130 | Initialize the graph neural network. 131 | 132 | Returns 133 | ------- 134 | model : torch.nn.Module 135 | The initialized graph neural network. 136 | """ 137 | 138 | @abstractmethod 139 | def process_graph(self, data, **kwargs): 140 | """ 141 | Data preprocessing for the input graph. 142 | 143 | Parameters 144 | ---------- 145 | data : torch_geometric.data.Data 146 | The input graph. 147 | """ 148 | 149 | @abstractmethod 150 | def forward_model(self, data, **kwargs): 151 | """ 152 | Forward pass of the graph neural network. 153 | 154 | Parameters 155 | ---------- 156 | data : torch_geometric.data.Data 157 | The input graph. 158 | 159 | Returns 160 | ------- 161 | loss : torch.Tensor 162 | The loss of the current batch. 163 | """ -------------------------------------------------------------------------------- /benchmark/graph/grade.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('..') 3 | 4 | import torch 5 | import argparse 6 | import time 7 | import os.path as osp 8 | import numpy as np 9 | 10 | from pygda.datasets import GraphTUDataset 11 | 12 | from pygda.models import GRADE 13 | 14 | from pygda.metrics import eval_micro_f1, eval_macro_f1, eval_roc_auc 15 | 16 | from torch_geometric.loader import NeighborLoader 17 | from torch_geometric.utils import degree, is_undirected, to_undirected 18 | from torch_geometric.transforms import OneHotDegree 19 | 20 | parser = argparse.ArgumentParser() 21 | 22 | # model agnostic params 23 | parser.add_argument('--seed', type=int, default=200, help='random seed') 24 | parser.add_argument('--num_layers', type=int, default=3, help='number of layers') 25 | parser.add_argument('--lr', type=float, default=0.001, help='learning rate') 26 | parser.add_argument('--weight_decay', type=float, default=0.0, help='weight decay') 27 | parser.add_argument('--nhid', type=int, default=128, help='hidden size') 28 | parser.add_argument('--dropout_ratio', type=float, default=0.1, help='dropout ratio') 29 | parser.add_argument('--device', type=str, default='cuda:1', help='specify cuda devices') 30 | parser.add_argument('--source', type=str, default='FRANKENSTEIN_F1', help='source domain data, DBLPv7/ACMv9/Citationv1') 31 | parser.add_argument('--target', type=str, default='FRANKENSTEIN_F2', help='target domain data, DBLPv7/ACMv9/Citationv1') 32 | parser.add_argument('--epochs', type=int, default=200, help='maximum number of epochs') 33 | parser.add_argument('--filename', type=str, default='test.txt', help='store results into file') 34 | 35 | # model specific params 36 | parser.add_argument('--disc', type=str, default='JS', help='discriminator') 37 | parser.add_argument('--weight', type=float, default=0.01, help='trade off parameter for loss') 38 | parser.add_argument('--mode', type=str, default='graph', help='node or graph tasks') 39 | 40 | args = parser.parse_args() 41 | 42 | # load data 43 | if args.source in {'FRANKENSTEIN_F1', 'FRANKENSTEIN_F2'}: 44 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/FRANKENSTEIN', args.source) 45 | source_dataset = GraphTUDataset(path, args.source) 46 | elif args.source in {'PROTEINS_P1', 'PROTEINS_P2'}: 47 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/PROTEINS', args.source) 48 | source_dataset = GraphTUDataset(path, args.source) 49 | elif args.source in {'Mutagenicity_M1', 'Mutagenicity_M2'}: 50 | path = 
osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/Mutagenicity', args.source) 51 | source_dataset = GraphTUDataset(path, args.source) 52 | 53 | if args.target in {'FRANKENSTEIN_F1', 'FRANKENSTEIN_F2'}: 54 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/FRANKENSTEIN', args.target) 55 | target_dataset = GraphTUDataset(path, args.target) 56 | elif args.target in {'PROTEINS_P1', 'PROTEINS_P2'}: 57 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/PROTEINS', args.target) 58 | target_dataset = GraphTUDataset(path, args.target) 59 | elif args.target in {'Mutagenicity_M1', 'Mutagenicity_M2'}: 60 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/Mutagenicity', args.target) 61 | target_dataset = GraphTUDataset(path, args.target) 62 | 63 | if args.mode == 'node': 64 | source_data = source_dataset[0].to(args.device) 65 | target_data = target_dataset[0].to(args.device) 66 | 67 | num_features = source_data.x.size(1) 68 | num_classes = len(np.unique(source_data.y.cpu().numpy())) 69 | 70 | if args.source not in {'DE', 'EN', 'ES', 'FR', 'PT', 'RU'}: 71 | if not is_undirected(source_data.edge_index): 72 | source_data.edge_index = to_undirected(source_data.edge_index) 73 | 74 | if not is_undirected(target_data.edge_index): 75 | target_data.edge_index = to_undirected(target_data.edge_index) 76 | elif args.mode == 'graph': 77 | source_data = source_dataset 78 | target_data = target_dataset 79 | 80 | num_features = source_data.num_features 81 | num_classes = source_data.num_classes 82 | 83 | # choose a graph domain adaptation model 84 | model = GRADE( 85 | in_dim=num_features, 86 | hid_dim=args.nhid, 87 | num_classes=num_classes, 88 | mode=args.mode, 89 | num_layers=args.num_layers, 90 | weight_decay=args.weight_decay, 91 | lr=args.lr, 92 | dropout=args.dropout_ratio, 93 | epoch=args.epochs, 94 | device=args.device, 95 | disc=args.disc, 96 | weight=args.weight 97 | ) 98 | 99 | # train the model 100 | model.fit(source_data, target_data) 101 | 102 | # evaluate the performance 103 | logits, labels = model.predict(target_data) 104 | 105 | preds = logits.argmax(dim=1) 106 | 107 | mi_f1 = eval_micro_f1(labels, preds) 108 | ma_f1 = eval_macro_f1(labels, preds) 109 | 110 | if args.source in {'DE', 'EN', 'ES', 'FR', 'PT', 'RU'}: 111 | auc = eval_roc_auc(labels, logits[:, 1]) 112 | else: 113 | auc = 0.0 114 | 115 | results = 'grade,source,' + args.source + ',target,' + args.target + ',micro-f1,' + str(mi_f1) + ',macro-f1,' + str(ma_f1) + ',auc,' + str(auc) 116 | 117 | with open(args.filename, 'a+') as f: 118 | f.write(results + '\n') 119 | 120 | print(results) -------------------------------------------------------------------------------- /benchmark/llm/a2gnn.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('..') 3 | 4 | import torch 5 | import argparse 6 | import time 7 | import os.path as osp 8 | import numpy as np 9 | 10 | from pygda.datasets import ArxivDataset 11 | 12 | from pygda.models import A2GNN 13 | from pygda.metrics import eval_micro_f1, eval_macro_f1, eval_roc_auc 14 | 15 | from torch_geometric.utils import degree, is_undirected, to_undirected 16 | from torch_geometric.transforms import OneHotDegree 17 | 18 | parser = argparse.ArgumentParser() 19 | 20 | # model agnostic params 21 | parser.add_argument('--seed', type=int, default=200, help='random seed') 22 | parser.add_argument('--num_layers', type=int, default=3, help='number of layers') 23 | parser.add_argument('--lr', 
type=float, default=0.001, help='learning rate') 24 | parser.add_argument('--weight_decay', type=float, default=0.0, help='weight decay') 25 | parser.add_argument('--nhid', type=int, default=128, help='hidden size') 26 | parser.add_argument('--dropout_ratio', type=float, default=0.1, help='dropout ratio') 27 | parser.add_argument('--device', type=str, default='cuda:1', help='specify cuda devices') 28 | parser.add_argument('--source', type=str, default='llm-bert-arxiv-1950-2016', help='source domain data, DBLPv7/ACMv9/Citationv1') 29 | parser.add_argument('--target', type=str, default='llm-bert-arxiv-2016-2018', help='target domain data, DBLPv7/ACMv9/Citationv1') 30 | parser.add_argument('--epochs', type=int, default=200, help='maximum number of epochs') 31 | parser.add_argument('--filename', type=str, default='test.txt', help='store results into file') 32 | 33 | # model specific params 34 | parser.add_argument('--adv', type=bool, default=False, help='adversarial training or not') 35 | parser.add_argument('--weight', type=float, default=0.1, help='trade-off parameter for loss') 36 | parser.add_argument('--s_pnums', type=int, default=0, help='propagation for source models') 37 | parser.add_argument('--t_pnums', type=int, default=20, help='propagation for target models') 38 | 39 | args = parser.parse_args() 40 | 41 | # load data 42 | if args.source in {'arxiv-1950-2016', 'arxiv-2016-2018', 'arxiv-2018-2020'}: 43 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/arxiv', args.source) 44 | source_dataset = ArxivDataset(path, args.source) 45 | elif args.source in {'llm-arxiv-1950-2016', 'llm-arxiv-2016-2018', 'llm-arxiv-2018-2020'}: 46 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/llm-arxiv', args.source) 47 | source_dataset = ArxivDataset(path, args.source) 48 | elif args.source in {'llm-bert-arxiv-1950-2016', 'llm-bert-arxiv-2016-2018', 'llm-bert-arxiv-2018-2020'}: 49 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/bert-arxiv', args.source) 50 | source_dataset = ArxivDataset(path, args.source) 51 | 52 | if args.target in {'arxiv-1950-2016', 'arxiv-2016-2018', 'arxiv-2018-2020'}: 53 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/arxiv', args.target) 54 | target_dataset = ArxivDataset(path, args.target) 55 | elif args.target in {'llm-arxiv-1950-2016', 'llm-arxiv-2016-2018', 'llm-arxiv-2018-2020'}: 56 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/llm-arxiv', args.target) 57 | target_dataset = ArxivDataset(path, args.target) 58 | elif args.target in {'llm-bert-arxiv-1950-2016', 'llm-bert-arxiv-2016-2018', 'llm-bert-arxiv-2018-2020'}: 59 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/bert-arxiv', args.target) 60 | target_dataset = ArxivDataset(path, args.target) 61 | 62 | source_data = source_dataset[0].to(args.device) 63 | target_data = target_dataset[0].to(args.device) 64 | 65 | if args.source not in {'DE', 'EN', 'ES', 'FR', 'PT', 'RU'}: 66 | if not is_undirected(source_data.edge_index): 67 | source_data.edge_index = to_undirected(source_data.edge_index) 68 | 69 | if not is_undirected(target_data.edge_index): 70 | target_data.edge_index = to_undirected(target_data.edge_index) 71 | 72 | num_features = source_data.x.size(1) 73 | num_classes = len(np.unique(source_data.y.cpu().numpy())) 74 | 75 | # choose a graph domain adaptation model 76 | model = A2GNN( 77 | in_dim=num_features, 78 | hid_dim=args.nhid, 79 | num_classes=num_classes, 80 | num_layers=args.num_layers, 81 | 
weight_decay=args.weight_decay, 82 | lr=args.lr, 83 | dropout=args.dropout_ratio, 84 | epoch=args.epochs, 85 | device=args.device, 86 | weight=args.weight, 87 | adv=args.adv, 88 | s_pnums=args.s_pnums, 89 | t_pnums=args.t_pnums 90 | ) 91 | 92 | # train the model 93 | model.fit(source_data, target_data) 94 | 95 | # evaluate the performance 96 | logits, labels = model.predict(target_data) 97 | 98 | preds = logits.argmax(dim=1) 99 | 100 | mi_f1 = eval_micro_f1(labels, preds) 101 | ma_f1 = eval_macro_f1(labels, preds) 102 | 103 | if args.source in {'DE', 'EN', 'ES', 'FR', 'PT', 'RU'}: 104 | auc = eval_roc_auc(labels, logits[:, 1]) 105 | else: 106 | auc = 0.0 107 | 108 | results = 'a2gnn,source,' + args.source + ',target,' + args.target + ',micro-f1,' + str(mi_f1) + ',macro-f1,' + str(ma_f1) + ',auc,' + str(auc) 109 | 110 | with open(args.filename, 'a+') as f: 111 | f.write(results + '\n') 112 | 113 | print(results) 114 | -------------------------------------------------------------------------------- /benchmark/graph/udagcn.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('..') 3 | 4 | import torch 5 | import argparse 6 | import time 7 | import os.path as osp 8 | import numpy as np 9 | 10 | from pygda.datasets import GraphTUDataset 11 | 12 | from pygda.models import UDAGCN 13 | 14 | from pygda.metrics import eval_micro_f1, eval_macro_f1, eval_roc_auc 15 | 16 | from torch_geometric.loader import NeighborLoader 17 | from torch_geometric.utils import degree, is_undirected, to_undirected 18 | from torch_geometric.transforms import OneHotDegree 19 | 20 | parser = argparse.ArgumentParser() 21 | 22 | # model agnostic params 23 | parser.add_argument('--seed', type=int, default=200, help='random seed') 24 | parser.add_argument('--num_layers', type=int, default=3, help='number of layers') 25 | parser.add_argument('--lr', type=float, default=0.001, help='learning rate') 26 | parser.add_argument('--weight_decay', type=float, default=0.005, help='weight decay') 27 | parser.add_argument('--nhid', type=int, default=128, help='hidden size') 28 | parser.add_argument('--dropout_ratio', type=float, default=0.1, help='dropout ratio') 29 | parser.add_argument('--device', type=str, default='cuda:1', help='specify cuda devices') 30 | parser.add_argument('--source', type=str, default='FRANKENSTEIN_F1', help='source domain data, DBLPv7/ACMv9/Citationv1') 31 | parser.add_argument('--target', type=str, default='FRANKENSTEIN_F2', help='target domain data, DBLPv7/ACMv9/Citationv1') 32 | parser.add_argument('--epochs', type=int, default=200, help='maximum number of epochs') 33 | parser.add_argument('--filename', type=str, default='test.txt', help='store results into file') 34 | 35 | # model specific params 36 | parser.add_argument('--ppmi', type=bool, default=True, help='use PPMI matrix or not') 37 | parser.add_argument('--adv_dim', type=int, default=40, help='hidden dimension of adversarial module') 38 | parser.add_argument('--mode', type=str, default='graph', help='node or graph tasks') 39 | 40 | args = parser.parse_args() 41 | 42 | # load data 43 | if args.source in {'FRANKENSTEIN_F1', 'FRANKENSTEIN_F2'}: 44 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/FRANKENSTEIN', args.source) 45 | source_dataset = GraphTUDataset(path, args.source) 46 | elif args.source in {'PROTEINS_P1', 'PROTEINS_P2'}: 47 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/PROTEINS', args.source) 48 | source_dataset = GraphTUDataset(path, 
args.source) 49 | elif args.source in {'Mutagenicity_M1', 'Mutagenicity_M2'}: 50 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/Mutagenicity', args.source) 51 | source_dataset = GraphTUDataset(path, args.source) 52 | 53 | if args.target in {'FRANKENSTEIN_F1', 'FRANKENSTEIN_F2'}: 54 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/FRANKENSTEIN', args.target) 55 | target_dataset = GraphTUDataset(path, args.target) 56 | elif args.target in {'PROTEINS_P1', 'PROTEINS_P2'}: 57 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/PROTEINS', args.target) 58 | target_dataset = GraphTUDataset(path, args.target) 59 | elif args.target in {'Mutagenicity_M1', 'Mutagenicity_M2'}: 60 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/Mutagenicity', args.target) 61 | target_dataset = GraphTUDataset(path, args.target) 62 | 63 | if args.mode == 'node': 64 | source_data = source_dataset[0].to(args.device) 65 | target_data = target_dataset[0].to(args.device) 66 | 67 | num_features = source_data.x.size(1) 68 | num_classes = len(np.unique(source_data.y.cpu().numpy())) 69 | 70 | if args.source not in {'DE', 'EN', 'ES', 'FR', 'PT', 'RU'}: 71 | if not is_undirected(source_data.edge_index): 72 | source_data.edge_index = to_undirected(source_data.edge_index) 73 | 74 | if not is_undirected(target_data.edge_index): 75 | target_data.edge_index = to_undirected(target_data.edge_index) 76 | elif args.mode == 'graph': 77 | source_data = source_dataset 78 | target_data = target_dataset 79 | 80 | num_features = source_data.num_features 81 | num_classes = source_data.num_classes 82 | 83 | # choose a graph domain adaptation model 84 | model = UDAGCN( 85 | in_dim=num_features, 86 | hid_dim=args.nhid, 87 | num_classes=num_classes, 88 | mode=args.mode, 89 | num_layers=args.num_layers, 90 | weight_decay=args.weight_decay, 91 | lr=args.lr, 92 | dropout=args.dropout_ratio, 93 | epoch=args.epochs, 94 | device=args.device, 95 | ppmi=args.ppmi, 96 | adv_dim=args.adv_dim 97 | ) 98 | 99 | # train the model 100 | model.fit(source_data, target_data) 101 | 102 | # evaluate the performance 103 | logits, labels = model.predict(target_data) 104 | 105 | preds = logits.argmax(dim=1) 106 | 107 | mi_f1 = eval_micro_f1(labels, preds) 108 | ma_f1 = eval_macro_f1(labels, preds) 109 | 110 | if args.source in {'DE', 'EN', 'ES', 'FR', 'PT', 'RU'}: 111 | auc = eval_roc_auc(labels, logits[:, 1]) 112 | else: 113 | auc = 0.0 114 | 115 | results = 'udagcn,source,' + args.source + ',target,' + args.target + ',micro-f1,' + str(mi_f1) + ',macro-f1,' + str(ma_f1) + ',auc,' + str(auc) 116 | 117 | with open(args.filename, 'a+') as f: 118 | f.write(results + '\n') 119 | 120 | print(results) -------------------------------------------------------------------------------- /benchmark/llm/adagcn.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('..') 3 | 4 | import torch 5 | import argparse 6 | import time 7 | import os.path as osp 8 | import numpy as np 9 | 10 | from pygda.datasets import ArxivDataset 11 | 12 | from pygda.models import AdaGCN 13 | 14 | from pygda.metrics import eval_micro_f1, eval_macro_f1, eval_roc_auc 15 | 16 | from torch_geometric.loader import NeighborLoader 17 | from torch_geometric.utils import degree, is_undirected, to_undirected 18 | from torch_geometric.transforms import OneHotDegree 19 | 20 | parser = argparse.ArgumentParser() 21 | 22 | # model agnostic params 23 | parser.add_argument('--seed', type=int, 
default=200, help='random seed') 24 | parser.add_argument('--num_layers', type=int, default=3, help='number of layers') 25 | parser.add_argument('--lr', type=float, default=0.001, help='learning rate') 26 | parser.add_argument('--weight_decay', type=float, default=0.0, help='weight decay') 27 | parser.add_argument('--nhid', type=int, default=128, help='hidden size') 28 | parser.add_argument('--dropout_ratio', type=float, default=0.1, help='dropout ratio') 29 | parser.add_argument('--device', type=str, default='cuda:3', help='specify cuda devices') 30 | parser.add_argument('--source', type=str, default='arxiv-1950-2016', help='source domain data, DBLPv7/ACMv9/Citationv1') 31 | parser.add_argument('--target', type=str, default='arxiv-2016-2018', help='target domain data, DBLPv7/ACMv9/Citationv1') 32 | parser.add_argument('--epochs', type=int, default=200, help='maximum number of epochs') 33 | parser.add_argument('--filename', type=str, default='test.txt', help='store results into file') 34 | 35 | # model specific params 36 | parser.add_argument('--gnn_type', type=str, default='gcn', help='use GCN or PPMIConv') 37 | parser.add_argument('--adv_dim', type=int, default=40, help='hidden dimension of adversarial module') 38 | parser.add_argument('--gp_weight', type=float, default=5.0, help='trade off parameter for gradient penalty') 39 | parser.add_argument('--domain_weight', type=float, default=1.0, help='trade off parameter for domain loss') 40 | 41 | 42 | args = parser.parse_args() 43 | 44 | # load data 45 | if args.source in {'arxiv-1950-2016', 'arxiv-2016-2018', 'arxiv-2018-2020'}: 46 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/arxiv', args.source) 47 | source_dataset = ArxivDataset(path, args.source) 48 | elif args.source in {'llm-arxiv-1950-2016', 'llm-arxiv-2016-2018', 'llm-arxiv-2018-2020'}: 49 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/llm-arxiv', args.source) 50 | source_dataset = ArxivDataset(path, args.source) 51 | elif args.source in {'llm-bert-arxiv-1950-2016', 'llm-bert-arxiv-2016-2018', 'llm-bert-arxiv-2018-2020'}: 52 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/bert-arxiv', args.source) 53 | source_dataset = ArxivDataset(path, args.source) 54 | 55 | if args.target in {'arxiv-1950-2016', 'arxiv-2016-2018', 'arxiv-2018-2020'}: 56 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/arxiv', args.target) 57 | target_dataset = ArxivDataset(path, args.target) 58 | elif args.target in {'llm-arxiv-1950-2016', 'llm-arxiv-2016-2018', 'llm-arxiv-2018-2020'}: 59 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/llm-arxiv', args.target) 60 | target_dataset = ArxivDataset(path, args.target) 61 | elif args.target in {'llm-bert-arxiv-1950-2016', 'llm-bert-arxiv-2016-2018', 'llm-bert-arxiv-2018-2020'}: 62 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/bert-arxiv', args.target) 63 | target_dataset = ArxivDataset(path, args.target) 64 | 65 | source_data = source_dataset[0].to(args.device) 66 | target_data = target_dataset[0].to(args.device) 67 | 68 | if args.source not in {'DE', 'EN', 'ES', 'FR', 'PT', 'RU'}: 69 | if not is_undirected(source_data.edge_index): 70 | source_data.edge_index = to_undirected(source_data.edge_index) 71 | 72 | if not is_undirected(target_data.edge_index): 73 | target_data.edge_index = to_undirected(target_data.edge_index) 74 | 75 | num_features = source_data.x.size(1) 76 | num_classes = len(np.unique(source_data.y.cpu().numpy())) 77 | 78 | # choose a 
graph domain adaptation model 79 | model = AdaGCN( 80 | in_dim=num_features, 81 | hid_dim=args.nhid, 82 | num_classes=num_classes, 83 | num_layers=args.num_layers, 84 | weight_decay=args.weight_decay, 85 | lr=args.lr, 86 | dropout=args.dropout_ratio, 87 | epoch=args.epochs, 88 | device=args.device, 89 | gnn_type=args.gnn_type, 90 | adv_dim=args.adv_dim, 91 | gp_weight=args.gp_weight, 92 | domain_weight=args.domain_weight 93 | ) 94 | 95 | # train the model 96 | model.fit(source_data, target_data) 97 | 98 | # evaluate the performance 99 | logits, labels = model.predict(target_data) 100 | 101 | preds = logits.argmax(dim=1) 102 | 103 | mi_f1 = eval_micro_f1(labels, preds) 104 | ma_f1 = eval_macro_f1(labels, preds) 105 | 106 | if args.source in {'DE', 'EN', 'ES', 'FR', 'PT', 'RU'}: 107 | auc = eval_roc_auc(labels, logits[:, 1]) 108 | else: 109 | auc = 0.0 110 | 111 | results = 'adagcn,source,' + args.source + ',target,' + args.target + ',micro-f1,' + str(mi_f1) + ',macro-f1,' + str(ma_f1) + ',auc,' + str(auc) 112 | 113 | with open(args.filename, 'a+') as f: 114 | f.write(results + '\n') 115 | 116 | print(results) -------------------------------------------------------------------------------- /benchmark/graph/dane.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('..') 3 | 4 | import torch 5 | import argparse 6 | import time 7 | import os.path as osp 8 | import numpy as np 9 | 10 | from pygda.datasets import GraphTUDataset 11 | 12 | from pygda.models import DANE 13 | 14 | from pygda.metrics import eval_micro_f1, eval_macro_f1, eval_roc_auc 15 | 16 | from torch_geometric.loader import NeighborLoader 17 | from torch_geometric.utils import degree, is_undirected, to_undirected 18 | from torch_geometric.transforms import OneHotDegree 19 | 20 | parser = argparse.ArgumentParser() 21 | 22 | # model agnostic params 23 | parser.add_argument('--seed', type=int, default=200, help='random seed') 24 | parser.add_argument('--num_layers', type=int, default=2, help='number of layers') 25 | parser.add_argument('--lr', type=float, default=0.001, help='learning rate') 26 | parser.add_argument('--weight_decay', type=float, default=0.0, help='weight decay') 27 | parser.add_argument('--nhid', type=int, default=128, help='hidden size') 28 | parser.add_argument('--dropout_ratio', type=float, default=0.1, help='dropout ratio') 29 | parser.add_argument('--device', type=str, default='cuda:1', help='specify cuda devices') 30 | parser.add_argument('--source', type=str, default='FRANKENSTEIN_F1', help='source domain data, DBLPv7/ACMv9/Citationv1') 31 | parser.add_argument('--target', type=str, default='FRANKENSTEIN_F2', help='target domain data, DBLPv7/ACMv9/Citationv1') 32 | parser.add_argument('--epochs', type=int, default=200, help='maximum number of epochs') 33 | parser.add_argument('--filename', type=str, default='test.txt', help='store results into file') 34 | 35 | # model specific params 36 | parser.add_argument('--gnn', type=str, default='gcn', help='GNN backbone') 37 | parser.add_argument('--train_mode', type=str, default='unsup', help='unsupervised or semi-supervised') 38 | parser.add_argument('--k', type=int, default=5, help='number of negative samples') 39 | parser.add_argument('--tgt_rate', type=float, default=0.05, help='target graph rate of labeled nodes') 40 | parser.add_argument('--mode', type=str, default='graph', help='node or graph tasks') 41 | 42 | args = parser.parse_args() 43 | 44 | # load data 45 | if args.source in 
{'FRANKENSTEIN_F1', 'FRANKENSTEIN_F2'}: 46 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/FRANKENSTEIN', args.source) 47 | source_dataset = GraphTUDataset(path, args.source) 48 | elif args.source in {'PROTEINS_P1', 'PROTEINS_P2'}: 49 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/PROTEINS', args.source) 50 | source_dataset = GraphTUDataset(path, args.source) 51 | elif args.source in {'Mutagenicity_M1', 'Mutagenicity_M2'}: 52 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/Mutagenicity', args.source) 53 | source_dataset = GraphTUDataset(path, args.source) 54 | 55 | if args.target in {'FRANKENSTEIN_F1', 'FRANKENSTEIN_F2'}: 56 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/FRANKENSTEIN', args.target) 57 | target_dataset = GraphTUDataset(path, args.target) 58 | elif args.target in {'PROTEINS_P1', 'PROTEINS_P2'}: 59 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/PROTEINS', args.target) 60 | target_dataset = GraphTUDataset(path, args.target) 61 | elif args.target in {'Mutagenicity_M1', 'Mutagenicity_M2'}: 62 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/Mutagenicity', args.target) 63 | target_dataset = GraphTUDataset(path, args.target) 64 | 65 | if args.mode == 'node': 66 | source_data = source_dataset[0].to(args.device) 67 | target_data = target_dataset[0].to(args.device) 68 | 69 | num_features = source_data.x.size(1) 70 | num_classes = len(np.unique(source_data.y.cpu().numpy())) 71 | 72 | if args.source not in {'DE', 'EN', 'ES', 'FR', 'PT', 'RU'}: 73 | if not is_undirected(source_data.edge_index): 74 | source_data.edge_index = to_undirected(source_data.edge_index) 75 | 76 | if not is_undirected(target_data.edge_index): 77 | target_data.edge_index = to_undirected(target_data.edge_index) 78 | elif args.mode == 'graph': 79 | source_data = source_dataset 80 | target_data = target_dataset 81 | 82 | num_features = source_data.num_features 83 | num_classes = source_data.num_classes 84 | 85 | # choose a graph domain adaptation model 86 | model = DANE( 87 | in_dim=num_features, 88 | hid_dim=args.nhid, 89 | num_classes=num_classes, 90 | mode=args.mode, 91 | num_layers=args.num_layers, 92 | weight_decay=args.weight_decay, 93 | lr=args.lr, 94 | dropout=args.dropout_ratio, 95 | epoch=args.epochs, 96 | device=args.device, 97 | gnn=args.gnn, 98 | train_mode=args.train_mode, 99 | tgt_rate=args.tgt_rate, 100 | k=args.k 101 | ) 102 | 103 | # train the model 104 | model.fit(source_data, target_data) 105 | 106 | # evaluate the performance 107 | logits, labels = model.predict(target_data) 108 | 109 | preds = logits.argmax(dim=1) 110 | 111 | mi_f1 = eval_micro_f1(labels, preds) 112 | ma_f1 = eval_macro_f1(labels, preds) 113 | 114 | if args.source in {'DE', 'EN', 'ES', 'FR', 'PT', 'RU'}: 115 | auc = eval_roc_auc(labels, logits[:, 1]) 116 | else: 117 | auc = 0.0 118 | 119 | results = 'dane,source,' + args.source + ',target,' + args.target + ',micro-f1,' + str(mi_f1) + ',macro-f1,' + str(ma_f1) + ',auc,' + str(auc) 120 | 121 | with open(args.filename, 'a+') as f: 122 | f.write(results + '\n') 123 | 124 | print(results) -------------------------------------------------------------------------------- /benchmark/graph/a2gnn.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('..') 3 | 4 | import torch 5 | import argparse 6 | import time 7 | import os.path as osp 8 | import numpy as np 9 | 10 | from pygda.datasets import GraphTUDataset 
11 | 12 | from pygda.models import A2GNN 13 | from pygda.metrics import eval_micro_f1, eval_macro_f1, eval_roc_auc 14 | 15 | from torch_geometric.loader import NeighborLoader 16 | from torch_geometric.utils import degree, is_undirected, to_undirected 17 | from torch_geometric.transforms import OneHotDegree 18 | 19 | parser = argparse.ArgumentParser() 20 | 21 | # model agnostic params 22 | parser.add_argument('--seed', type=int, default=200, help='random seed') 23 | parser.add_argument('--num_layers', type=int, default=3, help='number of layers') 24 | parser.add_argument('--lr', type=float, default=0.001, help='learning rate') 25 | parser.add_argument('--weight_decay', type=float, default=0.0, help='weight decay') 26 | parser.add_argument('--nhid', type=int, default=128, help='hidden size') 27 | parser.add_argument('--dropout_ratio', type=float, default=0.1, help='dropout ratio') 28 | parser.add_argument('--device', type=str, default='cuda:1', help='specify cuda devices') 29 | parser.add_argument('--source', type=str, default='FRANKENSTEIN_F1', help='source domain data, FRANKENSTEIN_F1/F2, PROTEINS_P1/P2 or Mutagenicity_M1/M2') 30 | parser.add_argument('--target', type=str, default='FRANKENSTEIN_F2', help='target domain data, FRANKENSTEIN_F1/F2, PROTEINS_P1/P2 or Mutagenicity_M1/M2') 31 | parser.add_argument('--epochs', type=int, default=200, help='maximum number of epochs') 32 | parser.add_argument('--filename', type=str, default='test.txt', help='store results into file') 33 | 34 | # model specific params 35 | parser.add_argument('--adv', type=lambda s: s.lower() in {'true', '1', 'yes'}, default=False, help='adversarial training or not') 36 | parser.add_argument('--weight', type=float, default=0.1, help='trade-off parameter for loss') 37 | parser.add_argument('--s_pnums', type=int, default=0, help='propagation for source models') 38 | parser.add_argument('--t_pnums', type=int, default=20, help='propagation for target models') 39 | parser.add_argument('--mode', type=str, default='graph', help='node or graph tasks') 40 | 41 |
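# Note: argparse's type=bool would turn any non-empty string (even 'False') into
# True, so '--adv' above uses an explicit string-to-bool conversion instead, e.g.:
#   python a2gnn.py --adv True --mode graph     # args.adv == True
#   python a2gnn.py --adv False --mode graph    # args.adv == False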
42 | args = parser.parse_args() 43 | 44 | # load data 45 | if args.source in {'FRANKENSTEIN_F1', 'FRANKENSTEIN_F2'}: 46 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/FRANKENSTEIN', args.source) 47 | source_dataset = GraphTUDataset(path, args.source) 48 | elif args.source in {'PROTEINS_P1', 'PROTEINS_P2'}: 49 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/PROTEINS', args.source) 50 | source_dataset = GraphTUDataset(path, args.source) 51 | elif args.source in {'Mutagenicity_M1', 'Mutagenicity_M2'}: 52 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/Mutagenicity', args.source) 53 | source_dataset = GraphTUDataset(path, args.source) 54 | 55 | if args.target in {'FRANKENSTEIN_F1', 'FRANKENSTEIN_F2'}: 56 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/FRANKENSTEIN', args.target) 57 | target_dataset = GraphTUDataset(path, args.target) 58 | elif args.target in {'PROTEINS_P1', 'PROTEINS_P2'}: 59 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/PROTEINS', args.target) 60 | target_dataset = GraphTUDataset(path, args.target) 61 | elif args.target in {'Mutagenicity_M1', 'Mutagenicity_M2'}: 62 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/Mutagenicity', args.target) 63 | target_dataset = GraphTUDataset(path, args.target) 64 | 65 | if args.mode == 'node': 66 | source_data = source_dataset[0].to(args.device) 67 | target_data = target_dataset[0].to(args.device) 68 | 69 | num_features = source_data.x.size(1) 70 | num_classes = len(np.unique(source_data.y.cpu().numpy())) 71 | 72 | if args.source not in {'DE', 'EN', 'ES', 'FR', 'PT', 'RU'}: 73 | if not is_undirected(source_data.edge_index): 74 | source_data.edge_index = to_undirected(source_data.edge_index) 75 | 76 | if not is_undirected(target_data.edge_index): 77 | target_data.edge_index = to_undirected(target_data.edge_index) 78 | elif args.mode == 'graph': 79 | source_data = source_dataset 80 | target_data = target_dataset 81 | 82 | num_features = source_data.num_features 83 | num_classes = source_data.num_classes 84 | 85 | # choose a graph domain adaptation model 86 | model = A2GNN( 87 | in_dim=num_features, 88 | hid_dim=args.nhid, 89 | num_classes=num_classes, 90 | mode=args.mode, 91 | num_layers=args.num_layers, 92 | weight_decay=args.weight_decay, 93 | lr=args.lr, 94 | dropout=args.dropout_ratio, 95 | epoch=args.epochs, 96 | device=args.device, 97 | weight=args.weight, 98 | adv=args.adv, 99 | s_pnums=args.s_pnums, 100 | t_pnums=args.t_pnums 101 | ) 102 | 103 | # train the model 104 | model.fit(source_data, target_data) 105 | 106 | # evaluate the performance 107 | logits, labels = model.predict(target_data) 108 | 109 | preds = logits.argmax(dim=1) 110 | 111 | mi_f1 = eval_micro_f1(labels, preds) 112 | ma_f1 = eval_macro_f1(labels, preds) 113 | 114 | if args.source in {'DE', 'EN', 'ES', 'FR', 'PT', 'RU'}: 115 | auc = eval_roc_auc(labels, logits[:, 1]) 116 | else: 117 | auc = 0.0 118 | 119 | results = 'a2gnn,source,' + args.source + ',target,' + args.target + ',micro-f1,' + str(mi_f1) + ',macro-f1,' + str(ma_f1) + ',auc,' + str(auc) 120 | 121 | with open(args.filename, 'a+') as f: 122 | f.write(results + '\n') 123 | 124 | print(results) 125 | -------------------------------------------------------------------------------- /benchmark/graph/sagda.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('..') 3 | 4 | import torch 5 | import argparse 6 | import time 7 | import os.path as osp 8 | import numpy as np 9 | 10 | from pygda.datasets import GraphTUDataset 11 | 12 | from pygda.models import SAGDA 13 | 14 | from pygda.metrics import eval_micro_f1, eval_macro_f1, eval_roc_auc 15 | 16 | from torch_geometric.loader import NeighborLoader 17 | from torch_geometric.utils import degree, is_undirected, to_undirected 18 | from torch_geometric.transforms import OneHotDegree 19 | 20 | parser = argparse.ArgumentParser() 21 | 22 | # model agnostic params 23 | parser.add_argument('--seed', type=int, default=200, help='random seed') 24 | parser.add_argument('--num_layers', type=int, default=2, help='number of layers') 25 | parser.add_argument('--lr', type=float, default=0.001, help='learning rate') 26 | parser.add_argument('--weight_decay', type=float, default=0.0, help='weight decay') 27 | parser.add_argument('--nhid', type=int, default=128, help='hidden size') 28 | parser.add_argument('--dropout_ratio', type=float, default=0.1, help='dropout ratio') 29 | parser.add_argument('--device', type=str, default='cuda:1', help='specify cuda devices') 30 | parser.add_argument('--source', type=str, default='FRANKENSTEIN_F1', help='source domain data, FRANKENSTEIN_F1/F2, PROTEINS_P1/P2 or Mutagenicity_M1/M2') 31 | parser.add_argument('--target', type=str, default='FRANKENSTEIN_F2', help='target domain data, FRANKENSTEIN_F1/F2, PROTEINS_P1/P2 or Mutagenicity_M1/M2') 32 | parser.add_argument('--epochs', type=int, default=200, help='maximum number of epochs') 33 | parser.add_argument('--filename', type=str, default='test.txt', help='store results into file') 34 | 35 | # model specific params 36 | parser.add_argument('--alpha', type=float, default=1.0, help='trade-off parameter for high pass filter') 37 | parser.add_argument('--beta', type=float, default=1.0, help='trade-off parameter for low pass filter') 38 | parser.add_argument('--ppmi', type=lambda s: s.lower() in {'true', '1', 'yes'}, default=True, help='use PPMI matrix or not') 39 | parser.add_argument('--adv_dim', type=int, default=40, help='hidden dimension of adversarial module') 40 | parser.add_argument('--mode', type=str, default='graph', help='node or graph tasks') 41 |
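# '--alpha' and '--beta' trade off SAGDA's high-pass and low-pass filter branches
# (see the help strings above). Illustrative invocation on the PROTEINS split:
#   python sagda.py --source PROTEINS_P1 --target PROTEINS_P2 --mode graph --filename results-graph.txt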
42 | args = parser.parse_args() 43 | 44 | # load data 45 | if args.source in {'FRANKENSTEIN_F1', 'FRANKENSTEIN_F2'}: 46 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/FRANKENSTEIN', args.source) 47 | source_dataset = GraphTUDataset(path, args.source) 48 | elif args.source in {'PROTEINS_P1', 'PROTEINS_P2'}: 49 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/PROTEINS', args.source) 50 | source_dataset = GraphTUDataset(path, args.source) 51 | elif args.source in {'Mutagenicity_M1', 'Mutagenicity_M2'}: 52 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/Mutagenicity', args.source) 53 | source_dataset = GraphTUDataset(path, args.source) 54 | 55 | if args.target in {'FRANKENSTEIN_F1', 'FRANKENSTEIN_F2'}: 56 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/FRANKENSTEIN', args.target) 57 | target_dataset = GraphTUDataset(path, args.target) 58 | elif args.target in {'PROTEINS_P1', 'PROTEINS_P2'}: 59 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/PROTEINS', args.target) 60 | target_dataset = GraphTUDataset(path, args.target) 61 | elif args.target in {'Mutagenicity_M1', 'Mutagenicity_M2'}: 62 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/Mutagenicity', args.target) 63 | target_dataset = GraphTUDataset(path, args.target) 64 | 65 | if args.mode == 'node': 66 | source_data = source_dataset[0].to(args.device) 67 | target_data = target_dataset[0].to(args.device) 68 | 69 | num_features = source_data.x.size(1) 70 | num_classes = len(np.unique(source_data.y.cpu().numpy())) 71 | 72 | if args.source not in {'DE', 'EN', 'ES', 'FR', 'PT', 'RU'}: 73 | if not is_undirected(source_data.edge_index): 74 | source_data.edge_index = to_undirected(source_data.edge_index) 75 | 76 | if not is_undirected(target_data.edge_index): 77 | target_data.edge_index = to_undirected(target_data.edge_index) 78 | elif args.mode == 'graph': 79 | source_data = source_dataset 80 | target_data = target_dataset 81 | 82 | num_features = source_data.num_features 83 | num_classes = source_data.num_classes 84 | 85 | # choose a graph domain adaptation model 86 | model = SAGDA( 87 | in_dim=num_features, 88 | hid_dim=args.nhid, 89 | num_classes=num_classes, 90 | mode=args.mode, 91 | num_layers=args.num_layers, 92 | weight_decay=args.weight_decay, 93 | lr=args.lr, 94 | dropout=args.dropout_ratio, 95 | epoch=args.epochs, 96 | device=args.device, 97 | alpha=args.alpha, 98 | beta=args.beta, 99 | ppmi=args.ppmi, 100 | adv_dim=args.adv_dim 101 | ) 102 | 103 | # train the model 104 | model.fit(source_data, target_data) 105 | 106 | # evaluate the performance 107 | logits, labels = model.predict(target_data) 108 | 109 | preds = logits.argmax(dim=1) 110 | 111 | mi_f1 = eval_micro_f1(labels, preds) 112 | ma_f1 = eval_macro_f1(labels, preds) 113 | 114 | if args.source in {'DE', 'EN', 'ES', 'FR', 'PT', 'RU'}: 115 | auc = eval_roc_auc(labels, logits[:, 1]) 116 | else: 117 | auc = 0.0
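# AUC is only computed for the binary Twitch domains (DE/EN/ES/FR/PT/RU); for the
# multi-class TU graph datasets a placeholder 0.0 is recorded in the results line.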
118 | 119 | results = 'sagda,source,' + args.source + ',target,' + args.target + ',micro-f1,' + str(mi_f1) + ',macro-f1,' + str(ma_f1) + ',auc,' + str(auc) 120 | 121 | with open(args.filename, 'a+') as f: 122 | f.write(results + '\n') 123 | 124 | print(results) -------------------------------------------------------------------------------- /benchmark/graph/adagcn.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('..') 3 | 4 | import torch 5 | import argparse 6 | import time 7 | import os.path as osp 8 | import numpy as np 9 | 10 | from pygda.datasets import GraphTUDataset 11 | 12 | from pygda.models import AdaGCN 13 | 14 | from pygda.metrics import eval_micro_f1, eval_macro_f1, eval_roc_auc 15 | 16 | from torch_geometric.loader import NeighborLoader 17 | from torch_geometric.utils import degree, is_undirected, to_undirected 18 | from torch_geometric.transforms import OneHotDegree 19 | 20 | parser = argparse.ArgumentParser() 21 | 22 | # model agnostic params 23 | parser.add_argument('--seed', type=int, default=200, help='random seed') 24 | parser.add_argument('--num_layers', type=int, default=3, help='number of layers') 25 | parser.add_argument('--lr', type=float, default=0.001, help='learning rate') 26 | parser.add_argument('--weight_decay', type=float, default=0.0, help='weight decay') 27 | parser.add_argument('--nhid', type=int, default=128, help='hidden size') 28 | parser.add_argument('--dropout_ratio', type=float, default=0.1, help='dropout ratio') 29 | parser.add_argument('--device', type=str, default='cuda:1', help='specify cuda devices') 30 | parser.add_argument('--source', type=str, default='FRANKENSTEIN_F1', help='source domain data, FRANKENSTEIN_F1/F2, PROTEINS_P1/P2 or Mutagenicity_M1/M2') 31 | parser.add_argument('--target', type=str, default='FRANKENSTEIN_F2', help='target domain data, FRANKENSTEIN_F1/F2, PROTEINS_P1/P2 or Mutagenicity_M1/M2') 32 | parser.add_argument('--epochs', type=int, default=200, help='maximum number of epochs') 33 | parser.add_argument('--filename', type=str, default='test.txt', help='store results into file') 34 | 35 | # model specific params 36 | parser.add_argument('--gnn_type', type=str, default='gcn', help='use GCN or PPMIConv') 37 | parser.add_argument('--adv_dim', type=int, default=40, help='hidden dimension of adversarial module') 38 | parser.add_argument('--gp_weight', type=float, default=5.0, help='trade off parameter for gradient penalty') 39 | parser.add_argument('--domain_weight', type=float, default=1.0, help='trade off parameter for domain loss') 40 | parser.add_argument('--mode', type=str, default='graph', help='node or graph tasks') 41 | 42 | args = parser.parse_args() 43 | 44 | # load data 45 | if args.source in {'FRANKENSTEIN_F1', 'FRANKENSTEIN_F2'}: 46 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/FRANKENSTEIN', args.source) 47 | source_dataset = GraphTUDataset(path, args.source) 48 | elif args.source in {'PROTEINS_P1', 'PROTEINS_P2'}: 49 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/PROTEINS', args.source) 50 | source_dataset = GraphTUDataset(path, args.source) 51 | elif args.source in {'Mutagenicity_M1', 'Mutagenicity_M2'}: 52 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/Mutagenicity', args.source) 53 | source_dataset = GraphTUDataset(path, args.source) 54 | 55 | if args.target in {'FRANKENSTEIN_F1', 'FRANKENSTEIN_F2'}: 56 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/FRANKENSTEIN', args.target) 57 | target_dataset = GraphTUDataset(path, args.target) 58 | elif args.target in {'PROTEINS_P1', 'PROTEINS_P2'}: 59 | path =
osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/PROTEINS', args.target) 60 | target_dataset = GraphTUDataset(path, args.target) 61 | elif args.target in {'Mutagenicity_M1', 'Mutagenicity_M2'}: 62 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data/Mutagenicity', args.target) 63 | target_dataset = GraphTUDataset(path, args.target) 64 | 65 | if args.mode == 'node': 66 | source_data = source_dataset[0].to(args.device) 67 | target_data = target_dataset[0].to(args.device) 68 | 69 | num_features = source_data.x.size(1) 70 | num_classes = len(np.unique(source_data.y.cpu().numpy())) 71 | 72 | if args.source not in {'DE', 'EN', 'ES', 'FR', 'PT', 'RU'}: 73 | if not is_undirected(source_data.edge_index): 74 | source_data.edge_index = to_undirected(source_data.edge_index) 75 | 76 | if not is_undirected(target_data.edge_index): 77 | target_data.edge_index = to_undirected(target_data.edge_index) 78 | elif args.mode == 'graph': 79 | source_data = source_dataset 80 | target_data = target_dataset 81 | 82 | num_features = source_data.num_features 83 | num_classes = source_data.num_classes 84 | 85 | # choose a graph domain adaptation model 86 | model = AdaGCN( 87 | in_dim=num_features, 88 | hid_dim=args.nhid, 89 | num_classes=num_classes, 90 | mode=args.mode, 91 | num_layers=args.num_layers, 92 | weight_decay=args.weight_decay, 93 | lr=args.lr, 94 | dropout=args.dropout_ratio, 95 | epoch=args.epochs, 96 | device=args.device, 97 | gnn_type=args.gnn_type, 98 | adv_dim=args.adv_dim, 99 | gp_weight=args.gp_weight, 100 | domain_weight=args.domain_weight 101 | ) 102 | 103 | # train the model 104 | model.fit(source_data, target_data) 105 | 106 | # evaluate the performance 107 | logits, labels = model.predict(target_data) 108 | 109 | preds = logits.argmax(dim=1) 110 | 111 | mi_f1 = eval_micro_f1(labels, preds) 112 | ma_f1 = eval_macro_f1(labels, preds) 113 | 114 | if args.source in {'DE', 'EN', 'ES', 'FR', 'PT', 'RU'}: 115 | auc = eval_roc_auc(labels, logits[:, 1]) 116 | else: 117 | auc = 0.0 118 | 119 | results = 'adagcn,source,' + args.source + ',target,' + args.target + ',micro-f1,' + str(mi_f1) + ',macro-f1,' + str(ma_f1) + ',auc,' + str(auc) 120 | 121 | with open(args.filename, 'a+') as f: 122 | f.write(results + '\n') 123 | 124 | print(results) -------------------------------------------------------------------------------- /pygda/utils/mmd.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | def guassian_kernel(source, target, kernel_mul=2.0, kernel_num=5, fix_sigma=None): 5 | """ 6 | Calculate Gaussian kernel matrix between source and target features. 7 | 8 | Parameters 9 | ---------- 10 | source : torch.Tensor 11 | Source domain features in shape of (n_source, feature_dim) 12 | target : torch.Tensor 13 | Target domain features in shape of (n_target, feature_dim) 14 | kernel_mul : float, optional 15 | Multiplication factor for kernel bandwidth. Default: 2.0 16 | kernel_num : int, optional 17 | Number of kernels to use. Default: 5 18 | fix_sigma : float, optional 19 | Fixed bandwidth value. If None, computed from data. 
Default: None 20 | 21 | Returns 22 | ------- 23 | torch.Tensor 24 | Combined kernel matrix from multiple bandwidths 25 | 26 | Notes 27 | ----- 28 | Processing Steps: 29 | 30 | - Combine source and target features 31 | - Compute pairwise L2 distances 32 | - Calculate kernel bandwidth 33 | - Generate multiple kernels 34 | - Sum kernel matrices 35 | 36 | Features: 37 | 38 | - Multiple kernel computation 39 | - Adaptive bandwidth 40 | - Efficient matrix operations 41 | """ 42 | n_samples = int(source.size()[0]) + int(target.size()[0]) 43 | total = torch.cat([source, target], dim=0) 44 | total0 = total.unsqueeze(0).expand(int(total.size(0)), int(total.size(0)), int(total.size(1))) 45 | total1 = total.unsqueeze(1).expand(int(total.size(0)), int(total.size(0)), int(total.size(1))) 46 | L2_distance = ((total0-total1)**2).sum(2) 47 | if fix_sigma: 48 | bandwidth = fix_sigma 49 | else: 50 | bandwidth = (torch.sum(L2_distance.data) + 1e-6) / (n_samples**2-n_samples) 51 | bandwidth /= kernel_mul ** (kernel_num // 2) 52 | bandwidth_list = [bandwidth * (kernel_mul**i) for i in range(kernel_num)] 53 | kernel_val = [torch.exp(-L2_distance / bandwidth_temp) for bandwidth_temp in bandwidth_list] 54 | 55 | return sum(kernel_val) 56 | 57 | def get_MMD(source_feat, target_feat, kernel_mul=2.0, kernel_num=5, fix_sigma=None): 58 | """ 59 | Calculate Maximum Mean Discrepancy (MMD) between source and target features. 60 | 61 | Parameters 62 | ---------- 63 | source_feat : torch.Tensor 64 | Source domain features in shape of (n_source, feature_dim) 65 | target_feat : torch.Tensor 66 | Target domain features in shape of (n_target, feature_dim) 67 | kernel_mul : float, optional 68 | Multiplication factor for kernel bandwidth. Default: 2.0 69 | kernel_num : int, optional 70 | Number of kernels to use. Default: 5 71 | fix_sigma : float, optional 72 | Fixed bandwidth value. If None, computed from data. Default: None 73 | 74 | Returns 75 | ------- 76 | torch.Tensor 77 | MMD loss value between source and target domains 78 | 79 | Notes 80 | ----- 81 | Processing Steps: 82 | 83 | - Compute Gaussian kernel matrix 84 | - Extract within-domain kernels (XX, YY) 85 | - Extract cross-domain kernels (XY, YX) 86 | - Calculate MMD loss 87 | 88 | Features: 89 | 90 | - Batch-wise computation 91 | - Multiple kernel integration 92 | - Biased (V-statistic) estimate over the sampled batch 93 | """ 94 | kernels = guassian_kernel(source_feat, 95 | target_feat, 96 | kernel_mul=kernel_mul, 97 | kernel_num=kernel_num, 98 | fix_sigma=fix_sigma) 99 | 100 | n_source = int(source_feat.size()[0])  # target rows start at n_source in the stacked kernel matrix 101 | batch_size = min(n_source, int(target_feat.size()[0])) 102 | XX = kernels[:batch_size, :batch_size] 103 | YY = kernels[n_source:n_source + batch_size, n_source:n_source + batch_size] 104 | XY = kernels[:batch_size, n_source:n_source + batch_size] 105 | YX = kernels[n_source:n_source + batch_size, :batch_size] 106 | loss = torch.mean(XX + YY - XY - YX) 107 | return loss 108 |
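# A minimal usage sketch for get_MMD (illustrative only; shapes are arbitrary):
#
#   src = torch.randn(64, 128)   # 64 source samples, 128-dim features
#   tgt = torch.randn(64, 128)   # 64 target samples, 128-dim features
#   loss = get_MMD(src, tgt)     # scalar tensor, close to 0 when the two distributions match
#
# For large graphs, prefer MMD() below, which averages this estimate over
# several random subsamples instead of comparing all nodes at once.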
109 | def MMD(source_feat, target_feat, sampling_num=1000, times=5): 110 | """ 111 | Calculate MMD with random sampling for large-scale datasets. 112 | 113 | Parameters 114 | ---------- 115 | source_feat : torch.Tensor 116 | Source domain features in shape of (n_source, feature_dim) 117 | target_feat : torch.Tensor 118 | Target domain features in shape of (n_target, feature_dim) 119 | sampling_num : int, optional 120 | Number of samples per iteration. Default: 1000 121 | times : int, optional 122 | Number of sampling iterations. Default: 5 123 | 124 | Returns 125 | ------- 126 | torch.Tensor 127 | Averaged MMD loss value across sampling iterations 128 | 129 | Notes 130 | ----- 131 | Processing Steps: 132 | 133 | - Generate random sample indices 134 | - Sample features from both domains 135 | - Calculate MMD for each sample 136 | - Average across iterations 137 | 138 | Features: 139 | 140 | - Random sampling 141 | - Multiple iterations 142 | - Memory efficient 143 | - Scalable computation 144 | """ 145 | source_num = source_feat.size(0) 146 | target_num = target_feat.size(0) 147 | 148 | source_sample = torch.randint(source_num, (times, sampling_num)) 149 | target_sample = torch.randint(target_num, (times, sampling_num)) 150 | 151 | mmd = 0 152 | for i in range(times): 153 | source_sample_feat = source_feat[source_sample[i]] 154 | target_sample_feat = target_feat[target_sample[i]] 155 | 156 | mmd = mmd + get_MMD(source_sample_feat, target_sample_feat) 157 | 158 | mmd = mmd / times 159 | return mmd -------------------------------------------------------------------------------- /pygda/datasets/mag.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path as osp 3 | import torch 4 | import numpy as np 5 | import torch.nn.functional as F 6 | from torch_geometric.data import InMemoryDataset, Data 7 | from torch_geometric.io import read_txt_array 8 | 9 | import csv 10 | import json 11 | import pickle as pkl 12 | import scipy 13 | import scipy.io as sio 14 | 15 | import warnings 16 | warnings.filterwarnings('ignore', category=DeprecationWarning) 17 | 18 | 19 | class MAGDataset(InMemoryDataset): 20 | """ 21 | Microsoft Academic Graph (MAG) dataset loader for graph-based analysis. 22 | 23 | Parameters 24 | ---------- 25 | root : str 26 | Root directory where the dataset should be saved 27 | name : str 28 | Name of the MAG dataset 29 | transform : callable, optional 30 | Function/transform that takes in a Data object and returns a transformed 31 | version. Default: None 32 | pre_transform : callable, optional 33 | Function/transform to be applied to the data object before saving. 34 | Default: None 35 | pre_filter : callable, optional 36 | Function that takes in a Data object and returns a boolean value, 37 | indicating whether the data object should be included. Default: None 38 | 39 | Notes 40 | ----- 41 | Dataset Structure: 42 | 43 | - Nodes represent academic papers 44 | - Edges represent citation relationships 45 | - Node features from paper content 46 | - Labels indicate paper fields (top 20) 47 | - Includes train/val/test splits (80/10/10) 48 | """ 49 | 50 | def __init__(self, 51 | root, 52 | name, 53 | transform=None, 54 | pre_transform=None, 55 | pre_filter=None): 56 | self.name = name 57 | self.root = root 58 | super(MAGDataset, self).__init__(root, transform, pre_transform, pre_filter) 59 | 60 | self.data, self.slices = torch.load(self.processed_paths[0]) 61 | 62 | @property 63 | def raw_file_names(self): 64 | """ 65 | Names of required raw files. 66 | 67 | Returns 68 | ------- 69 | list[str] 70 | List of required raw file names 71 | 72 | Notes 73 | ----- 74 | Required files: 75 | 76 | - {name}_labels_20.pt: PyTorch file containing graph data with top 20 fields 77 | """ 78 | return ['{}_labels_20.pt'.format(self.name)] 79 | 80 | @property 81 | def processed_file_names(self): 82 | """ 83 | Names of processed data files.
84 | 85 | Returns 86 | ------- 87 | list[str] 88 | List of processed file names 89 | 90 | Notes 91 | ----- 92 | Processed files: 93 | 94 | - data.pt: Contains processed PyTorch Geometric data object 95 | """ 96 | return ['data.pt'] 97 | 98 | def download(self): 99 | """ 100 | Download raw data files. 101 | 102 | Notes 103 | ----- 104 | Empty implementation - data should be manually placed in raw directory 105 | """ 106 | pass 107 | 108 | def process(self): 109 | """ 110 | Process raw data into PyTorch Geometric Data format. 111 | 112 | Notes 113 | ----- 114 | Processing Steps: 115 | 116 | - Load PyTorch data: 117 | 118 | * Node features (paper content) 119 | * Edge indices (citations) 120 | * Labels (paper fields) 121 | 122 | - Create Data object with: 123 | 124 | * Edge indices 125 | * Node features 126 | * Node labels 127 | * Train/val/test masks 128 | 129 | - Apply pre-transform if specified 130 | - Save processed data 131 | 132 | Data Split: 133 | 134 | - Training: 80% 135 | - Validation: 10% 136 | - Testing: 10% 137 | 138 | Features: 139 | 140 | - Direct tensor loading 141 | - Random split generation 142 | - Optional pre-transform support 143 | - Efficient data storage 144 | """ 145 | path = osp.join(self.raw_dir, '{}_labels_20.pt'.format(self.name)) 146 | graph = torch.load(path) 147 | x, edge_index, y = graph.x, graph.edge_index, graph.y 148 | 149 | data_list = [] 150 | data = Data(edge_index=edge_index, x=x, y=y) 151 | 152 | random_node_indices = np.random.permutation(y.shape[0]) 153 | training_size = int(len(random_node_indices) * 0.8) 154 | val_size = int(len(random_node_indices) * 0.1) 155 | train_node_indices = random_node_indices[:training_size] 156 | val_node_indices = random_node_indices[training_size:training_size + val_size] 157 | test_node_indices = random_node_indices[training_size + val_size:] 158 | 159 | train_masks = torch.zeros([y.shape[0]], dtype=torch.bool) 160 | train_masks[train_node_indices] = 1 161 | val_masks = torch.zeros([y.shape[0]], dtype=torch.bool) 162 | val_masks[val_node_indices] = 1 163 | test_masks = torch.zeros([y.shape[0]], dtype=torch.bool) 164 | test_masks[test_node_indices] = 1 165 | 166 | data.train_mask = train_masks 167 | data.val_mask = val_masks 168 | data.test_mask = test_masks 169 | 170 | if self.pre_transform is not None: 171 | if not os.path.exists(self.processed_paths[0] + 'eival.pt'): 172 | data = self.pre_transform(data, self.processed_paths[0]) 173 | 174 | data_list.append(data) 175 | 176 | data, slices = self.collate([data]) 177 | 178 | torch.save((data, slices), self.processed_paths[0]) 179 | -------------------------------------------------------------------------------- /pygda/nn/adagcn_base.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torch.nn.functional as F 4 | 5 | from torch_geometric.nn import GCNConv 6 | 7 | from .ppmi_conv import PPMIConv 8 | from torch_geometric.nn import global_mean_pool 9 | 10 | 11 | class GNN(torch.nn.Module): 12 | """ 13 | Generic GNN encoder supporting multiple GNN types. 14 | 15 | Parameters 16 | ---------- 17 | in_dim : int 18 | Input feature dimension. 19 | hid_dim : int 20 | Hidden layer dimension. 21 | gnn_type : str, optional 22 | Type of GNN layer ('gcn' or 'ppmi'). Default: 'gcn'. 23 | num_layers : int, optional 24 | Number of GNN layers. Default: 3. 25 | act : callable, optional 26 | Activation function. Default: F.relu. 27 | dropout : float, optional 28 | Dropout rate. Default: 0.1. 
29 | **kwargs 30 | Additional arguments for GNN layers. 31 | 32 | Notes 33 | ----- 34 | - Supports both GCN and PPMI convolution types 35 | - Stacked layers with activation and dropout in between 36 | - Configurable activation and dropout 37 | """ 38 | 39 | def __init__(self, in_dim, hid_dim, gnn_type='gcn', num_layers=3, act=F.relu, dropout=0.1, **kwargs): 40 | super(GNN, self).__init__() 41 | 42 | self.gnn_type = gnn_type 43 | self.act = act 44 | self.num_layers = num_layers 45 | 46 | self.conv_layers = nn.ModuleList() 47 | 48 | if self.gnn_type == 'gcn': 49 | self.conv_layers.append(GCNConv(in_dim, hid_dim)) 50 | 51 | for i in range(1, self.num_layers): 52 | self.conv_layers.append(GCNConv(hid_dim, hid_dim)) 53 | else: 54 | self.conv_layers.append(PPMIConv(in_dim, hid_dim)) 55 | 56 | for i in range(1, self.num_layers): 57 | self.conv_layers.append(PPMIConv(hid_dim, hid_dim)) 58 | 59 | self.dropout = nn.Dropout(dropout) 60 | 61 | def forward(self, x, edge_index, batch, mode='node'): 62 | """ 63 | Forward pass of the GNN. 64 | 65 | Parameters 66 | ---------- 67 | x : torch.Tensor 68 | Node features. 69 | edge_index : torch.Tensor 70 | Edge indices. 71 | batch : torch.Tensor 72 | Batch assignment for graph-level tasks. 73 | mode : str, optional 74 | 'node' or 'graph' level task. Default: 'node'. 75 | 76 | Returns 77 | ------- 78 | torch.Tensor 79 | Node or graph embeddings. 80 | 81 | Notes 82 | ----- 83 | - Applies multiple GNN layers sequentially 84 | - Optional graph pooling for graph-level tasks 85 | - Dropout and activation between layers 86 | """ 87 | for i, conv_layer in enumerate(self.conv_layers): 88 | x = conv_layer(x, edge_index) 89 | if i < len(self.conv_layers) - 1: 90 | x = self.act(x) 91 | x = self.dropout(x) 92 | 93 | if mode == 'graph': 94 | x = global_mean_pool(x, batch) 95 | 96 | return x 97 | 98 | 99 | class AdaGCNBase(nn.Module): 100 | """ 101 | Base class for AdaGCN. 102 | 103 | Parameters 104 | ---------- 105 | in_dim : int 106 | Input feature dimension. 107 | hid_dim : int 108 | Hidden dimension. 109 | num_classes : int 110 | Number of target classes. 111 | num_layers : int, optional 112 | Number of GNN layers. Default: 3. 113 | dropout : float, optional 114 | Dropout rate. Default: 0.1. 115 | act : callable, optional 116 | Activation function. Default: F.relu. 117 | gnn_type : str, optional 118 | Type of GNN ('gcn' or 'ppmi'). Default: 'gcn'. 119 | mode : str, optional 120 | 'node' or 'graph' level task. Default: 'node'. 121 | **kwargs 122 | Additional arguments. 123 | 124 | Notes 125 | ----- 126 | Architecture components: 127 | 128 | 1. GNN encoder for feature extraction 129 | 2. Classification layer 130 | 3. Cross-entropy loss function 131 | """ 132 | 133 | def __init__(self, 134 | in_dim, 135 | hid_dim, 136 | num_classes, 137 | num_layers=3, 138 | dropout=0.1, 139 | act=F.relu, 140 | gnn_type='gcn', 141 | mode='node', 142 | **kwargs): 143 | super(AdaGCNBase, self).__init__() 144 | 145 | self.encoder = GNN(in_dim=in_dim, hid_dim=hid_dim, gnn_type=gnn_type, act=act, num_layers=num_layers, dropout=dropout) 146 | 147 | self.cls_model = nn.Sequential(nn.Linear(hid_dim, num_classes)) 148 | 149 | self.mode = mode 150 | 151 | self.loss_func = nn.CrossEntropyLoss() 152 | 153 | def forward(self, data): 154 | """ 155 | Forward pass of AdaGCN. 156 | 157 | Parameters 158 | ---------- 159 | data : torch_geometric.data.Data 160 | Input graph data. 161 | 162 | Returns 163 | ------- 164 | torch.Tensor 165 | Node/graph embeddings. 166 | 167 | Notes 168 | ----- 169 | Process: 170 | 171 | 1.
Extract features based on mode (node/graph) 172 | 2. Apply GNN encoder 173 | 3. Return embeddings for downstream tasks 174 | """ 175 | if self.mode == 'node': 176 | x, edge_index, batch = data.x, data.edge_index, None 177 | else: 178 | x, edge_index, batch = data.x, data.edge_index, data.batch 179 | x = self.encoder(x, edge_index, batch, mode=self.mode) 180 | 181 | return x 182 | -------------------------------------------------------------------------------- /benchmark/node/run_blog.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | echo "Task Blog2->Blog1" 4 | echo "==========" 5 | python grade.py --source 'Blog2' --target 'Blog1' --nhid 128 --num_layers 2 --lr 0.001 --weight_decay 0.001 --epochs 300 --dropout_ratio 0.2 --weight 0.01 --filename 'results-blog.txt' 6 | python strurw.py --source 'Blog2' --target 'Blog1' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.001 --epochs 200 --dropout_ratio 0.4 --lamb 0.6 --filename 'results-blog.txt' 7 | python asn.py --source 'Blog2' --target 'Blog1' --nhid 128 --hid_dim_vae 128 --lr 0.0003 --weight_decay 0.01 --epochs 400 --dropout_ratio 0.2 --lambda_r 0.01 --lambda_d 0.5 --lambda_f 0.0001 --filename 'results-blog.txt' 8 | python acdne.py --source 'Blog2' --target 'Blog1' --nhid 128 --lr 0.0001 --weight_decay 0.01 --epochs 300 --dropout_ratio 0.1 --pair_weight 0.03 --step 1 --filename 'results-blog.txt' 9 | python adagcn.py --source 'Blog2' --target 'Blog1' --nhid 128 --num_layers 2 --lr 0.01 --weight_decay 0.01 --epochs 400 --dropout_ratio 0.4 --domain_weight 0.1 --filename 'results-blog.txt' 10 | python udagcn.py --source 'Blog2' --target 'Blog1' --nhid 128 --num_layers 2 --lr 0.0001 --weight_decay 0.001 --epochs 400 --dropout_ratio 0.4 --filename 'results-blog.txt' 11 | python specreg.py --source 'Blog2' --target 'Blog1' --nhid 128 --num_layers 4 --lr 0.003 --weight_decay 0.001 --epochs 200 --dropout_ratio 0.1 --gamma_adv 0.1 --gamma_smooth 0.001 --gamma_mfr 0.001 --filename 'results-blog.txt' 12 | python a2gnn.py --source 'Blog2' --target 'Blog1' --nhid 128 --num_layers 2 --lr 0.01 --weight_decay 0.005 --epochs 200 --dropout_ratio 0.5 --s_pnums 0 --t_pnums 10 --weight 10 --filename 'results-blog.txt' 13 | python pairalign.py --source 'Blog2' --target 'Blog1' --nhid 128 --num_layers 2 --lr 0.003 --weight_decay 0.003 --epochs 200 --dropout_ratio 0.0 --rw_lmda 1 --ls_lambda 3.0 --lw_lambda 0.01 --filename 'results-blog.txt' 14 | 15 | python kbl.py --source 'Blog2' --target 'Blog1' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.01 --epochs 200 --k_cross 20 --k_within 10 --filename 'results-blog.txt' 16 | python cwgcn.py --source 'Blog2' --target 'Blog1' --nhid 128 --num_layers 2 --lr 0.0001 --weight_decay 0.0001 --epochs 200 --filename 'results-blog.txt' 17 | python dane.py --source 'Blog2' --target 'Blog1' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.001 --epochs 200 --filename 'results-blog.txt' 18 | python dgda.py --source 'Blog2' --target 'Blog1' --nhid 128 --num_layers 2 --lr 0.001 --weight_decay 0.001 --epochs 200 --m_w 0.5 --beta 0.5 --filename 'results-blog.txt' 19 | python dmgnn.py --source 'Blog2' --target 'Blog1' --nhid 128 --num_layers 2 --lr 0.001 --weight_decay 0.001 --epochs 200 --pair_weight 0.1 --filename 'results-blog.txt' 20 | python jhgda.py --source 'Blog2' --target 'Blog1' --nhid 128 --num_layers 2 --lr 0.001 --weight_decay 0.001 --epochs 200 --pool_ratio 0.2 --filename 'results-blog.txt' 21 | python sagda.py --source 'Blog2' --target 'Blog1' 
--nhid 128 --num_layers 1 --lr 0.001 --weight_decay 0.001 --epochs 200 --adv_dim 40 --filename 'results-blog.txt' 22 | 23 | echo "Task Blog1->Blog2" 24 | echo "==========" 25 | python grade.py --source 'Blog1' --target 'Blog2' --nhid 128 --num_layers 2 --lr 0.001 --weight_decay 0.001 --epochs 200 --dropout_ratio 0.5 --weight 0.01 --filename 'results-blog.txt' 26 | python strurw.py --source 'Blog1' --target 'Blog2' --nhid 128 --num_layers 1 --lr 0.001 --weight_decay 0.001 --epochs 200 --dropout_ratio 0.4 --lamb 0.6 --filename 'results-blog.txt' 27 | python asn.py --source 'Blog1' --target 'Blog2' --nhid 128 --hid_dim_vae 128 --lr 0.0003 --weight_decay 0.01 --epochs 300 --dropout_ratio 0.2 --lambda_r 0.01 --lambda_d 0.5 --lambda_f 0.0001 --filename 'results-blog.txt' 28 | python acdne.py --source 'Blog1' --target 'Blog2' --nhid 128 --lr 0.0003 --weight_decay 0.01 --epochs 300 --dropout_ratio 0.0 --pair_weight 0.01 --step 1 --filename 'results-blog.txt' 29 | python adagcn.py --source 'Blog1' --target 'Blog2' --nhid 128 --num_layers 2 --lr 0.01 --weight_decay 0.01 --epochs 400 --dropout_ratio 0.4 --domain_weight 0.1 --filename 'results-blog.txt' 30 | python udagcn.py --source 'Blog1' --target 'Blog2' --nhid 128 --num_layers 2 --lr 0.0001 --weight_decay 0.001 --epochs 400 --dropout_ratio 0.4 --filename 'results-blog.txt' 31 | python specreg.py --source 'Blog1' --target 'Blog2' --nhid 128 --num_layers 4 --lr 0.003 --weight_decay 0.001 --epochs 200 --dropout_ratio 0.1 --gamma_adv 0.1 --gamma_smooth 0.001 --gamma_mfr 0.001 --filename 'results-blog.txt' 32 | python a2gnn.py --source 'Blog1' --target 'Blog2' --nhid 128 --num_layers 2 --lr 0.01 --weight_decay 0.005 --epochs 200 --dropout_ratio 0.5 --s_pnums 0 --t_pnums 10 --weight 10 --filename 'results-blog.txt' 33 | python pairalign.py --source 'Blog1' --target 'Blog2' --nhid 128 --num_layers 2 --lr 0.003 --weight_decay 0.003 --epochs 200 --dropout_ratio 0.0 --rw_lmda 1 --ls_lambda 3.0 --lw_lambda 0.01 --filename 'results-blog.txt' 34 | 35 | python kbl.py --source 'Blog1' --target 'Blog2' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.01 --epochs 200 --k_cross 20 --k_within 10 --filename 'results-blog.txt' 36 | python cwgcn.py --source 'Blog1' --target 'Blog2' --nhid 128 --num_layers 2 --lr 0.0001 --weight_decay 0.0001 --epochs 200 --filename 'results-blog.txt' 37 | python dane.py --source 'Blog1' --target 'Blog2' --nhid 128 --num_layers 3 --lr 0.001 --weight_decay 0.001 --epochs 200 --filename 'results-blog.txt' 38 | python dgda.py --source 'Blog1' --target 'Blog2' --nhid 128 --num_layers 2 --lr 0.001 --weight_decay 0.001 --epochs 200 --m_w 0.5 --beta 0.5 --filename 'results-blog.txt' 39 | python dmgnn.py --source 'Blog1' --target 'Blog2' --nhid 128 --num_layers 2 --lr 0.001 --weight_decay 0.001 --epochs 200 --pair_weight 0.1 --filename 'results-blog.txt' 40 | python jhgda.py --source 'Blog1' --target 'Blog2' --nhid 128 --num_layers 2 --lr 0.001 --weight_decay 0.001 --epochs 200 --pool_ratio 0.2 --filename 'results-blog.txt' 41 | python sagda.py --source 'Blog1' --target 'Blog2' --nhid 128 --num_layers 1 --lr 0.001 --weight_decay 0.001 --epochs 200 --adv_dim 40 --filename 'results-blog.txt' 42 | -------------------------------------------------------------------------------- /pygda/datasets/airport.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path as osp 3 | import torch 4 | import numpy as np 5 | import torch.nn.functional as F 6 | from torch_geometric.data import 
InMemoryDataset, Data 7 | from torch_geometric.io import read_txt_array 8 | 9 | import csv 10 | import json 11 | import pickle as pkl 12 | import scipy 13 | import scipy.io as sio 14 | 15 | import warnings 16 | warnings.filterwarnings('ignore', category=DeprecationWarning) 17 | 18 | 19 | class AirportDataset(InMemoryDataset): 20 | """ 21 | Airport network dataset loader for graph-based analysis. 22 | 23 | Parameters 24 | ---------- 25 | root : str 26 | Root directory where the dataset should be saved 27 | name : str 28 | Name of the airport dataset 29 | transform : callable, optional 30 | Function/transform that takes in a Data object and returns a transformed 31 | version. Default: None 32 | pre_transform : callable, optional 33 | Function/transform to be applied to the data object before saving. 34 | Default: None 35 | pre_filter : callable, optional 36 | Function that takes in a Data object and returns a boolean value, 37 | indicating whether the data object should be included. Default: None 38 | 39 | Notes 40 | ----- 41 | - Nodes represent airports 42 | - Edges represent routes between airports 43 | - Labels indicate airport categories 44 | - Includes train/val/test splits (80/10/10) 45 | """ 46 | 47 | def __init__(self, 48 | root, 49 | name, 50 | transform=None, 51 | pre_transform=None, 52 | pre_filter=None): 53 | self.name = name 54 | self.root = root 55 | super(AirportDataset, self).__init__(root, transform, pre_transform, pre_filter) 56 | 57 | self.data, self.slices = torch.load(self.processed_paths[0]) 58 | 59 | @property 60 | def raw_file_names(self): 61 | """ 62 | Names of required raw files. 63 | 64 | Returns 65 | ------- 66 | list[str] 67 | List of required raw file names 68 | 69 | Notes 70 | ----- 71 | Required files: 72 | 73 | - {name}_edgelist.txt: Contains edge connectivity 74 | - {name}_labels.txt: Contains node labels 75 | """ 76 | return ['{}_edgelist.txt'.format(self.name), '{}_labels.txt'.format(self.name)] 77 | 78 | @property 79 | def processed_file_names(self): 80 | """ 81 | Names of processed data files. 82 | 83 | Returns 84 | ------- 85 | list[str] 86 | List of processed file names 87 | 88 | Notes 89 | ----- 90 | Processed files: 91 | 92 | - data.pt: Contains processed PyTorch Geometric data object 93 | """ 94 | return ['data.pt'] 95 | 96 | def download(self): 97 | """ 98 | Download raw data files. 99 | 100 | Notes 101 | ----- 102 | Empty implementation - data should be manually placed in raw directory 103 | """ 104 | pass 105 | 106 | def process(self): 107 | """ 108 | Process raw data into PyTorch Geometric Data format.
109 | 110 | Notes 111 | ----- 112 | - Load edge list from text file 113 | - Load node labels from text file 114 | - Create Data object with: 115 | 116 | * Edge indices 117 | * Node labels 118 | * Train/val/test masks 119 | 120 | - Apply pre-transform if specified 121 | - Save processed data 122 | 123 | Data Split: 124 | 125 | - Training: 80% 126 | - Validation: 10% 127 | - Testing: 10% 128 | 129 | Features: 130 | 131 | - Random split generation 132 | - Optional pre-transform support 133 | - Efficient data storage 134 | """ 135 | edge_path = osp.join(self.raw_dir, '{}_edgelist.txt'.format(self.name)) 136 | edge_index = read_txt_array(edge_path, sep=',', dtype=torch.long).t() 137 | 138 | label_path = osp.join(self.raw_dir, '{}_labels.txt'.format(self.name)) 139 | f = open(label_path, 'rb') 140 | content_list = [] 141 | for line in f.readlines(): 142 | line = str(line, encoding="utf-8") 143 | line = line.replace("\r", "").replace("\n", "") 144 | content_list.append(line) 145 | y = np.array(content_list, dtype=int) 146 | y = torch.from_numpy(y).to(torch.int64) 147 | 148 | data_list = [] 149 | data = Data(edge_index=edge_index, x=None, y=y, num_nodes=y.size(0)) 150 | 151 | random_node_indices = np.random.permutation(y.shape[0]) 152 | training_size = int(len(random_node_indices) * 0.8) 153 | val_size = int(len(random_node_indices) * 0.1) 154 | train_node_indices = random_node_indices[:training_size] 155 | val_node_indices = random_node_indices[training_size:training_size + val_size] 156 | test_node_indices = random_node_indices[training_size + val_size:] 157 | 158 | train_masks = torch.zeros([y.shape[0]], dtype=torch.bool) 159 | train_masks[train_node_indices] = 1 160 | val_masks = torch.zeros([y.shape[0]], dtype=torch.bool) 161 | val_masks[val_node_indices] = 1 162 | test_masks = torch.zeros([y.shape[0]], dtype=torch.bool) 163 | test_masks[test_node_indices] = 1 164 | 165 | data.train_mask = train_masks 166 | data.val_mask = val_masks 167 | data.test_mask = test_masks 168 | 169 | if self.pre_transform is not None: 170 | if not os.path.exists(self.processed_paths[0] + 'eival.pt'): 171 | data = self.pre_transform(data, self.processed_paths[0]) 172 | 173 | data_list.append(data) 174 | 175 | data, slices = self.collate([data]) 176 | 177 | torch.save((data, slices), self.processed_paths[0]) -------------------------------------------------------------------------------- /pygda/datasets/arxiv.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path as osp 3 | import torch 4 | import numpy as np 5 | import torch.nn.functional as F 6 | from torch_geometric.data import InMemoryDataset, Data 7 | from torch_geometric.io import read_txt_array 8 | 9 | import csv 10 | import json 11 | import pickle as pkl 12 | import scipy 13 | import scipy.io as sio 14 | 15 | import warnings 16 | warnings.filterwarnings('ignore', category=DeprecationWarning) 17 | 18 | 19 | class ArxivDataset(InMemoryDataset): 20 | """ 21 | ArXiv citation network dataset loader for graph-based analysis. 22 | 23 | Parameters 24 | ---------- 25 | root : str 26 | Root directory where the dataset should be saved 27 | name : str 28 | Name of the arXiv dataset 29 | transform : callable, optional 30 | Function/transform that takes in a Data object and returns a transformed 31 | version. Default: None 32 | pre_transform : callable, optional 33 | Function/transform to be applied to the data object before saving. 
34 | Default: None 35 | pre_filter : callable, optional 36 | Function that takes in a Data object and returns a boolean value, 37 | indicating whether the data object should be included. Default: None 38 | 39 | Notes 40 | ----- 41 | Dataset Structure: 42 | 43 | - Nodes represent arXiv papers 44 | - Edges represent citations between papers 45 | - Node features from paper content 46 | - Labels indicate paper categories 47 | - Includes train/val/test splits (80/10/10) 48 | """ 49 | 50 | def __init__(self, 51 | root, 52 | name, 53 | transform=None, 54 | pre_transform=None, 55 | pre_filter=None): 56 | self.name = name 57 | self.root = root 58 | super(ArxivDataset, self).__init__(root, transform, pre_transform, pre_filter) 59 | 60 | self.data, self.slices = torch.load(self.processed_paths[0]) 61 | 62 | @property 63 | def raw_file_names(self): 64 | """ 65 | Names of required raw files. 66 | 67 | Returns 68 | ------- 69 | list[str] 70 | List of required raw file names 71 | 72 | Notes 73 | ----- 74 | Required files: 75 | 76 | - {name}.pkl: Pickle file containing graph data, features, and labels 77 | """ 78 | return ['{}.pkl'.format(self.name)] 79 | 80 | @property 81 | def processed_file_names(self): 82 | """ 83 | Names of processed data files. 84 | 85 | Returns 86 | ------- 87 | list[str] 88 | List of processed file names 89 | 90 | Notes 91 | ----- 92 | Processed files: 93 | 94 | - data.pt: Contains processed PyTorch Geometric data object 95 | """ 96 | return ['data.pt'] 97 | 98 | def download(self): 99 | """ 100 | Download raw data files. 101 | 102 | Notes 103 | ----- 104 | Empty implementation - data should be manually placed in raw directory 105 | """ 106 | pass 107 | 108 | def process(self): 109 | """ 110 | Process raw data into PyTorch Geometric Data format. 111 | 112 | Notes 113 | ----- 114 | Processing Steps: 115 | 116 | - Load pickle file containing: 117 | 118 | * Edge indices (citations) 119 | * Node features (paper content) 120 | * Labels (paper categories) 121 | 122 | - Convert to PyTorch tensors 123 | - Create Data object with: 124 | 125 | * Edge indices 126 | * Node features 127 | * Node labels 128 | * Train/val/test masks 129 | 130 | - Apply pre-transform if specified 131 | - Save processed data 132 | 133 | Data Split: 134 | 135 | - Training: 80% 136 | - Validation: 10% 137 | - Testing: 10% 138 | 139 | Features: 140 | 141 | - Random split generation 142 | - Feature type conversion 143 | - Optional pre-transform support 144 | - Efficient data storage 145 | """ 146 | path = osp.join(self.raw_dir, '{}.pkl'.format(self.name)) 147 | dataset = pkl.load(open(path, 'rb')) 148 | 149 | edge_index = dataset.graph['edge_index'] 150 | features = dataset.graph['node_feat'] 151 | label = dataset.label 152 | 153 | x = features.to(torch.float) 154 | y = label.squeeze().to(torch.int64) 155 | 156 | data_list = [] 157 | data = Data(edge_index=edge_index, x=x, y=y) 158 | 159 | random_node_indices = np.random.permutation(y.shape[0]) 160 | training_size = int(len(random_node_indices) * 0.8) 161 | val_size = int(len(random_node_indices) * 0.1) 162 | train_node_indices = random_node_indices[:training_size] 163 | val_node_indices = random_node_indices[training_size:training_size + val_size] 164 | test_node_indices = random_node_indices[training_size + val_size:] 165 | 166 | train_masks = torch.zeros([y.shape[0]], dtype=torch.bool) 167 | train_masks[train_node_indices] = 1 168 | val_masks = torch.zeros([y.shape[0]], dtype=torch.bool) 169 | val_masks[val_node_indices] = 1 170 | test_masks = torch.zeros([y.shape[0]], dtype=torch.bool)
171 | test_masks[test_node_indices] = 1 172 | 173 | data.train_mask = train_masks 174 | data.val_mask = val_masks 175 | data.test_mask = test_masks 176 | 177 | if self.pre_transform is not None: 178 | if not os.path.exists(self.processed_paths[0] + 'eival.pt'): 179 | data = self.pre_transform(data, self.processed_paths[0]) 180 | 181 | data_list.append(data) 182 | 183 | data, slices = self.collate([data]) 184 | 185 | torch.save((data, slices), self.processed_paths[0]) --------------------------------------------------------------------------------