├── .gitignore ├── README.md ├── cfgs ├── mlm │ └── cfg1.yaml ├── single-gpu │ ├── cfg0.yaml │ └── cfg1.yaml └── training │ ├── cfg0.yaml │ └── cfg1.yaml ├── docs ├── PII Data Detection.pdf ├── pii-data-detection-eda.ipynb ├── tlal-pii-data-detection-eda-learn-with-me.ipynb └── view-results.ipynb ├── gen-data ├── ai-gen-llama3.py ├── cfgs │ ├── cfg-auto-llama3-v0.yaml │ └── cfg-auto-llama3-v1.yaml ├── finalize-placeholder-data-llama3.py ├── pii-syn-data.py ├── prompt-templates │ ├── direct-pii │ │ ├── temp1.txt │ │ ├── temp1_author.txt │ │ ├── temp2.txt │ │ ├── temp2_author.txt │ │ ├── temp3.txt │ │ ├── temp3_author.txt │ │ ├── temp4.txt │ │ └── temp4_author.txt │ ├── majors.txt │ ├── placeholder │ │ ├── mixed-llama3 │ │ │ └── temp1.txt │ │ └── mixed │ │ │ ├── temp1.txt │ │ │ ├── temp2.txt │ │ │ ├── temp3.txt │ │ │ └── temp4.txt │ └── topics-list.txt ├── synthetic-data.ipynb ├── top-domains.txt └── view-gen-data.ipynb ├── imgs └── pii-bio-example.png ├── requirements.txt ├── scripts ├── dual-gpu-train.sh ├── generate-data.sh ├── single-gpu-train.sh └── train_multiple.sh ├── src ├── .gitkeep ├── create_datasets.py ├── cxmetrics.py ├── gendata.py ├── gendata_placeholder_mistral.py ├── load_data.py ├── preprocessing.py └── utils.py └── training ├── mlm-training.py ├── train_chunks_cib.py ├── train_dual_gpu.py └── train_single_large.py /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mddunlap924/PII-Detection/main/.gitignore -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mddunlap924/PII-Detection/main/README.md -------------------------------------------------------------------------------- /cfgs/mlm/cfg1.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mddunlap924/PII-Detection/main/cfgs/mlm/cfg1.yaml -------------------------------------------------------------------------------- /cfgs/single-gpu/cfg0.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mddunlap924/PII-Detection/main/cfgs/single-gpu/cfg0.yaml -------------------------------------------------------------------------------- /cfgs/single-gpu/cfg1.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mddunlap924/PII-Detection/main/cfgs/single-gpu/cfg1.yaml -------------------------------------------------------------------------------- /cfgs/training/cfg0.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mddunlap924/PII-Detection/main/cfgs/training/cfg0.yaml -------------------------------------------------------------------------------- /cfgs/training/cfg1.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mddunlap924/PII-Detection/main/cfgs/training/cfg1.yaml -------------------------------------------------------------------------------- /docs/PII Data Detection.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mddunlap924/PII-Detection/main/docs/PII Data Detection.pdf -------------------------------------------------------------------------------- /docs/pii-data-detection-eda.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mddunlap924/PII-Detection/main/docs/pii-data-detection-eda.ipynb -------------------------------------------------------------------------------- /docs/tlal-pii-data-detection-eda-learn-with-me.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mddunlap924/PII-Detection/main/docs/tlal-pii-data-detection-eda-learn-with-me.ipynb -------------------------------------------------------------------------------- /docs/view-results.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mddunlap924/PII-Detection/main/docs/view-results.ipynb -------------------------------------------------------------------------------- /gen-data/ai-gen-llama3.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mddunlap924/PII-Detection/main/gen-data/ai-gen-llama3.py -------------------------------------------------------------------------------- /gen-data/cfgs/cfg-auto-llama3-v0.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mddunlap924/PII-Detection/main/gen-data/cfgs/cfg-auto-llama3-v0.yaml -------------------------------------------------------------------------------- /gen-data/cfgs/cfg-auto-llama3-v1.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mddunlap924/PII-Detection/main/gen-data/cfgs/cfg-auto-llama3-v1.yaml -------------------------------------------------------------------------------- /gen-data/finalize-placeholder-data-llama3.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mddunlap924/PII-Detection/main/gen-data/finalize-placeholder-data-llama3.py -------------------------------------------------------------------------------- /gen-data/pii-syn-data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mddunlap924/PII-Detection/main/gen-data/pii-syn-data.py -------------------------------------------------------------------------------- /gen-data/prompt-templates/direct-pii/temp1.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mddunlap924/PII-Detection/main/gen-data/prompt-templates/direct-pii/temp1.txt -------------------------------------------------------------------------------- /gen-data/prompt-templates/direct-pii/temp1_author.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mddunlap924/PII-Detection/main/gen-data/prompt-templates/direct-pii/temp1_author.txt -------------------------------------------------------------------------------- /gen-data/prompt-templates/direct-pii/temp2.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mddunlap924/PII-Detection/main/gen-data/prompt-templates/direct-pii/temp2.txt -------------------------------------------------------------------------------- /gen-data/prompt-templates/direct-pii/temp2_author.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mddunlap924/PII-Detection/main/gen-data/prompt-templates/direct-pii/temp2_author.txt -------------------------------------------------------------------------------- /gen-data/prompt-templates/direct-pii/temp3.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mddunlap924/PII-Detection/main/gen-data/prompt-templates/direct-pii/temp3.txt -------------------------------------------------------------------------------- /gen-data/prompt-templates/direct-pii/temp3_author.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mddunlap924/PII-Detection/main/gen-data/prompt-templates/direct-pii/temp3_author.txt -------------------------------------------------------------------------------- /gen-data/prompt-templates/direct-pii/temp4.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mddunlap924/PII-Detection/main/gen-data/prompt-templates/direct-pii/temp4.txt -------------------------------------------------------------------------------- /gen-data/prompt-templates/direct-pii/temp4_author.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mddunlap924/PII-Detection/main/gen-data/prompt-templates/direct-pii/temp4_author.txt -------------------------------------------------------------------------------- /gen-data/prompt-templates/majors.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mddunlap924/PII-Detection/main/gen-data/prompt-templates/majors.txt -------------------------------------------------------------------------------- /gen-data/prompt-templates/placeholder/mixed-llama3/temp1.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mddunlap924/PII-Detection/main/gen-data/prompt-templates/placeholder/mixed-llama3/temp1.txt -------------------------------------------------------------------------------- /gen-data/prompt-templates/placeholder/mixed/temp1.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mddunlap924/PII-Detection/main/gen-data/prompt-templates/placeholder/mixed/temp1.txt -------------------------------------------------------------------------------- /gen-data/prompt-templates/placeholder/mixed/temp2.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mddunlap924/PII-Detection/main/gen-data/prompt-templates/placeholder/mixed/temp2.txt -------------------------------------------------------------------------------- /gen-data/prompt-templates/placeholder/mixed/temp3.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mddunlap924/PII-Detection/main/gen-data/prompt-templates/placeholder/mixed/temp3.txt -------------------------------------------------------------------------------- /gen-data/prompt-templates/placeholder/mixed/temp4.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mddunlap924/PII-Detection/main/gen-data/prompt-templates/placeholder/mixed/temp4.txt -------------------------------------------------------------------------------- /gen-data/prompt-templates/topics-list.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mddunlap924/PII-Detection/main/gen-data/prompt-templates/topics-list.txt -------------------------------------------------------------------------------- /gen-data/synthetic-data.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mddunlap924/PII-Detection/main/gen-data/synthetic-data.ipynb -------------------------------------------------------------------------------- /gen-data/top-domains.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mddunlap924/PII-Detection/main/gen-data/top-domains.txt -------------------------------------------------------------------------------- /gen-data/view-gen-data.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mddunlap924/PII-Detection/main/gen-data/view-gen-data.ipynb -------------------------------------------------------------------------------- /imgs/pii-bio-example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mddunlap924/PII-Detection/main/imgs/pii-bio-example.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mddunlap924/PII-Detection/main/requirements.txt -------------------------------------------------------------------------------- /scripts/dual-gpu-train.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mddunlap924/PII-Detection/main/scripts/dual-gpu-train.sh -------------------------------------------------------------------------------- /scripts/generate-data.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mddunlap924/PII-Detection/main/scripts/generate-data.sh -------------------------------------------------------------------------------- /scripts/single-gpu-train.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mddunlap924/PII-Detection/main/scripts/single-gpu-train.sh -------------------------------------------------------------------------------- /scripts/train_multiple.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mddunlap924/PII-Detection/main/scripts/train_multiple.sh -------------------------------------------------------------------------------- /src/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/create_datasets.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mddunlap924/PII-Detection/main/src/create_datasets.py -------------------------------------------------------------------------------- /src/cxmetrics.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mddunlap924/PII-Detection/main/src/cxmetrics.py -------------------------------------------------------------------------------- /src/gendata.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mddunlap924/PII-Detection/main/src/gendata.py -------------------------------------------------------------------------------- /src/gendata_placeholder_mistral.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mddunlap924/PII-Detection/main/src/gendata_placeholder_mistral.py -------------------------------------------------------------------------------- /src/load_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mddunlap924/PII-Detection/main/src/load_data.py -------------------------------------------------------------------------------- /src/preprocessing.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mddunlap924/PII-Detection/main/src/preprocessing.py -------------------------------------------------------------------------------- /src/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mddunlap924/PII-Detection/main/src/utils.py -------------------------------------------------------------------------------- /training/mlm-training.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mddunlap924/PII-Detection/main/training/mlm-training.py -------------------------------------------------------------------------------- /training/train_chunks_cib.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mddunlap924/PII-Detection/main/training/train_chunks_cib.py -------------------------------------------------------------------------------- /training/train_dual_gpu.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mddunlap924/PII-Detection/main/training/train_dual_gpu.py -------------------------------------------------------------------------------- /training/train_single_large.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mddunlap924/PII-Detection/main/training/train_single_large.py --------------------------------------------------------------------------------