├── .gitignore
├── .gitmodules
├── README.md
├── Usage.md
├── back.ps1
├── base_config.yaml
├── convert.ps1
├── gen_config.py
├── get_alt_script.ps1
├── models
│   └── put_ckpt_and_vae_here.txt
├── old_REAME.md
├── train.ps1
└── utils
    ├── README.md
    ├── handle_annotation.py
    └── rename.py

/.gitignore:
--------------------------------------------------------------------------------
concept.json
deepbooru/models/*
!deepbooru/models/dd_models_should_be_here.txt
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
[submodule "repos/diffusers"]
    path = repos/diffusers
    url = https://github.com/CCRcmcpe/diffusers
[submodule "repos/xformers"]
    path = repos/xformers
    url = https://github.com/facebookresearch/xformers
[submodule "blip_helper"]
    path = blip_helper
    url = https://github.com/crosstyan/blip_helper
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# AutoDL DreamBooth Config Cheat Sheet

**This repository is deprecated.**

[CCRcmcpe/scal-sdt](https://github.com/CCRcmcpe/scal-sdt) and
[Mikubill/naifu-diffusion](https://github.com/Mikubill/naifu-diffusion)
are easy enough to use that I don't think another wrapper would be
beneficial. Just use them directly.

Check out [this gist](https://gist.github.com/crosstyan/bc1de3f74ceac1e43b491af58a05c69b)
for the Docker configuration.

The [SCAL-SDT Colab](https://colab.research.google.com/drive/1dCwUGjgi3IralIB9jIfFPkEVUVcXw-G_)
is also worth reading.

If you insist, you can still read the [original README](old_REAME.md).
--------------------------------------------------------------------------------
/Usage.md:
--------------------------------------------------------------------------------
# Usage

Check the source code. These are just simple wrappers around the original command line interface.

- `convert.ps1` converts a `ckpt` model to the diffusers format.
- `train.ps1` trains the model. Edit this file to change parameters. See the [DreamBooth training example](https://github.com/ShivamShrirao/diffusers/tree/main/examples/dreambooth) for details.
- `back.ps1` converts the diffusers format back to a `ckpt`. The resulting `ckpt` is half precision and only takes *2.4 GB*.

Also check the [SOURCE CODE](https://github.com/CCRcmcpe/diffusers/blob/main/examples/dreambooth/modules/args.py) of `train_dreambooth.py` for details.
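
Putting the three wrappers together, a typical end-to-end run might look like this (a sketch; `<run-id>` and `<step>` are placeholders for whatever `train.ps1` writes under the output directory):

```bash
# one-time setup: fetch submodules, then convert the base model to diffusers format
git submodule update --init --recursive
pwsh ./convert.ps1

# train (edit train.ps1 and the YAML config first)
pwsh ./train.ps1

# convert a finished run back to a half-precision ckpt
pwsh ./back.ps1 -id <run-id> -step <step>
```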

For now I have copied the descriptions from the original colab.

## Features

See [`base_config.yaml`](base_config.yaml)
and [`dreambooth.yaml`](https://github.com/CCRcmcpe/diffusers/blob/yaml-config/examples/dreambooth/configs/dreambooth.yaml)
for the full list of options.

### YAML Config

See [`base_config.yaml`](base_config.yaml). I'm using native training by default. I assume you know the difference between native training and DreamBooth; if not, read [the FAQ](https://gist.github.com/crosstyan/f912612f4c26e298feec4a2924c41d99).

Use `--config` in [`train.ps1`](train.ps1) to specify the config file, and use [`gen_config.py`](gen_config.py) to generate it. Modify it if you like, though the defaults suit my needs.
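
For example, a run might look like this (a sketch; the dataset path is an assumption — `gen_config.py` reads `base_config.yaml` from, and globs the concept folders in, the current working directory):

```bash
# hypothetical layout: one sub-directory per concept, per-image tags in .txt files
cp ~/dreambooth/base_config.yaml /root/autodl-tmp/dataset/
cd /root/autodl-tmp/dataset
python ~/dreambooth/gen_config.py  # writes config.yaml next to the concept folders
```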

### WandB

You can use [WandB](https://wandb.ai/) (Weights & Biases) to monitor your training process.

```bash
pip install wandb
wandb login
# paste your WandB API token at the prompt
```

You can then view sample images from WandB.

### Multiple Class/Concept

See [`gen_config.py`](gen_config.py) and the example above; it generates one concept entry per image directory.

### Aspect Ratio Bucket

See also [NovelAI/novelai-aspect-ratio-bucketing](https://github.com/NovelAI/novelai-aspect-ratio-bucketing/).

> BucketManager impls NovelAI Aspect Ratio Bucketing, which may greatly improve the quality of outputs according to [Novelai's blog](https://blog.novelai.net/novelai-improvements-on-stable-diffusion-e10d38db82ac)

### Train Text Encoder

`train_text_encoder` is weird. Check [the FAQ](https://gist.github.com/crosstyan/f912612f4c26e298feec4a2924c41d99) for details.

### With Prior Preservation

> DB without prior preservation loss and enable variable instance prompt (Read Prompt TXT) is fine tuning directly.
--------------------------------------------------------------------------------
/back.ps1:
--------------------------------------------------------------------------------
# Usage: pwsh ./back.ps1 -id <run_id> -step <step> [-outputDir output]
param(
    # the original use_checkpoint
    [Parameter(Mandatory = $true)]
    [string]$id,
    [Parameter(Mandatory = $true)]
    [string]$step,
    [Parameter()]
    [string]$outputDir = "output"
)

$BackConverter = "repos/diffusers/scripts/convert_diffusers_to_original_stable_diffusion.py"

$exist = Test-Path $BackConverter -PathType Leaf
if (!$exist) {
    Write-Host -ForegroundColor red "Converter not found. Have you run 'git submodule update --init --recursive'?"
    exit 1
}

$AutoDLTmp = "/root/autodl-tmp"
$OutPath = Join-Path $AutoDLTmp $outputDir
# input
# change this directly if you did not use the default output dir
$ModelPath = Join-Path $OutPath $id $step
# output
$CheckpointPath = Join-Path $ModelPath "model.ckpt"

# unet_half reduces the size of the model
python $BackConverter --model_path $ModelPath `
    --checkpoint_path $CheckpointPath `
    --unet_half

Write-Host "Back conversion done"
--------------------------------------------------------------------------------
/base_config.yaml:
--------------------------------------------------------------------------------
data:
  resolution: 512
  center_crop: false
  concepts:
    # You can add more concepts
    - instance_set:
        path: 'example/data/instance'
        prompt: 'sks 1girl'
        combine_prompt_from_txt: false
        prompt_combine_template: '{PROMPT}, {TXT_PROMPT}'
      class_set:
        path: 'example/data/class'
        prompt: '1girl'
        combine_prompt_from_txt: false
        prompt_combine_template: '{PROMPT}, {TXT_PROMPT}'
        auto_generate:
          enabled: true
          negative_prompt: 'lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry'
          steps: 28
          cfg_scale: 11
          num_target: 100
          batch_size: 1

sampling:
  interval_steps: 50
  batch_size: 1
  concepts:
    - prompt: 'sks 1girl, sitting'
      negative_prompt: 'lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry'
      steps: 28
      cfg_scale: 11
      num_samples: 8
      seed: 114514

batch_size: 2
seed: 114514

prior_preservation:
  enabled: true
  prior_loss_weight: 1.0

train_text_encoder: false

aspect_ratio_bucket:
  enabled: false
  debug: false

gradient_checkpointing: true
gradient_accumulation_steps: 1
gradient_clipping:
  enabled: false
  max_grad_norm: 1.0

mixed_precision: 'fp16'
cache_latents: true
clip_skip: 2
pad_tokens: true

saving:
  #interval_steps: 1000
  interval_epochs: 10
  min_steps: 100
  unet_half: false

monitoring:
  monitors:
    - 'wandb'
  wandb:
    sample: true
    artifact: false
    remove_ckpt_after_upload: false

optimizer:
  name: bitsandbytes.optim.AdamW8bit
  params:
    lr: 5e-6
    beta1: 0.9
    beta2: 0.999
    weight_decay: 1e-2
    eps: 1e-8
  lr_scale:
    enabled: true
    method: 'sqrt'

lr_scheduler:
  name: torch.optim.lr_scheduler.CosineAnnealingWarmRestarts
  params:
    T_0: 10
    T_mult: 1
    eta_min: 7e-8
    last_epoch: -1
  warmup:
    enabled: true
    init_lr: 7e-8
    steps: 100
    strategy: 'linear'
--------------------------------------------------------------------------------
/convert.ps1:
--------------------------------------------------------------------------------
$Converter = "repos/diffusers/scripts/convert_original_stable_diffusion_to_diffusers.py"

$exist = Test-Path $Converter -PathType Leaf
if (!$exist) {
    Write-Host -ForegroundColor red "Converter not found. Have you run 'git submodule update --init --recursive'?"
    exit 1
}

# Download the model and put the files under `models/`
# (see models/put_ckpt_and_vae_here.txt)
# https://pub-2fdef7a2969f43289c42ac5ae3412fd4.r2.dev/animefull-pruned.tar
# https://pub-2fdef7a2969f43289c42ac5ae3412fd4.r2.dev/animevae.pt
# tar -xf animefull-pruned.tar

$ModelDir = "models"
$CheckpointPath = Join-Path $ModelDir "model.ckpt"
$VaePath = Join-Path $ModelDir "animevae.pt"
$ConfPath = Join-Path $ModelDir "config.yaml"
$ModelName = "animefull-pruned"
# dump to autodl-tmp to save space on the system disk
$AutoDLTmp = "/root/autodl-tmp"
$DumpPath = Join-Path $AutoDLTmp $ModelName

python $Converter --checkpoint_path $CheckpointPath `
    --original_config_file $ConfPath `
    --vae_path $VaePath `
    --dump_path $DumpPath `
    --scheduler_type ddim

Write-Host "Conversion done"
--------------------------------------------------------------------------------
/gen_config.py:
--------------------------------------------------------------------------------
# Generate config.yaml from base_config.yaml, with one concept entry
# per sub-directory of the current working directory.
from yaml import load, dump
from pathlib import Path
import glob
import os

try:
    from yaml import CLoader as Loader, CDumper as Dumper
except ImportError:
    from yaml import Loader, Dumper


base_config = None
with open("base_config.yaml", "r") as f:
    base_config = load(f, Loader=Loader)

# list all the directories but not the files
dirs = [x for x in glob.glob("*") if os.path.isdir(x)]

# The expected shape of a concept entry:
#
# - instance_set:
#     path: 'example/data/instance'
#     prompt: 'sks 1girl'
#     combine_prompt_from_txt: false
#     prompt_combine_template: '{PROMPT}, {TXT_PROMPT}'
#   class_set:
#     path: 'example/data/class'
#     prompt: '1girl'
#     combine_prompt_from_txt: false
#     prompt_combine_template: '{PROMPT}, {TXT_PROMPT}'
#     auto_generate:
#       enabled: true
#       negative_prompt: 'lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry'
#       steps: 28
#       cfg_scale: 11
#       num_target: 100
#       batch_size: 1
def get_concept(path: str):
    # don't care about the class set since I don't use DreamBooth
    abs_path = Path(path).absolute()
    concept = {
        "instance_set": {
            "path": str(abs_path),
            # no additional prompt here; all we need is the txt
            "prompt": "",
            "combine_prompt_from_txt": True,
            "prompt_combine_template": "{TXT_PROMPT}",
        },
        "class_set": {
            "path": "",
            "prompt": "",
            "combine_prompt_from_txt": False,
            "auto_generate": {
                "enabled": False,
            }
        }
    }
    return concept


# `concepts` should be a list of concept entries, one per directory
concepts = map(get_concept, dirs)

base_config["data"]["concepts"] = list(concepts)
with open("config.yaml", "w") as f:
    dump(base_config, f, Dumper=Dumper)
--------------------------------------------------------------------------------
/get_alt_script.ps1:
--------------------------------------------------------------------------------
# download the alternative script from
# https://colab.research.google.com/drive/17yM4mlPVOFdJE_81oWBz5mXH9cxvhmz8#scrollTo=aLWXPZqjsZVV
Invoke-WebRequest -OutFile repos/diffusers/examples/dreambooth/train_dreambooth_alt.py https://pub-2fdef7a2969f43289c42ac5ae3412fd4.r2.dev/dreambooth.py
--------------------------------------------------------------------------------
/models/put_ckpt_and_vae_here.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crosstyan/dreambooth-scripts-for-autodl/0a16287d6330798d6d25f639dfc8464943d57d2b/models/put_ckpt_and_vae_here.txt
--------------------------------------------------------------------------------
/old_REAME.md:
--------------------------------------------------------------------------------
## Original README

Code is adapted from [this colab notebook](https://colab.research.google.com/drive/1C1vVZ59S4kWfL7jIsczyLpmxbD4cOA-k). Thanks to the contributions from the community.

An alternative `train_dreambooth.py` from [this colab notebook](https://colab.research.google.com/drive/17yM4mlPVOFdJE_81oWBz5mXH9cxvhmz8#scrollTo=aLWXPZqjsZVV) is also supported; use `get_alt_script.ps1` to download it.

Choose PyTorch 1.11.0 / Python 3.8 (Ubuntu 20.04) as the base image.

```bash
cd ~
git clone https://github.com/crosstyan/dreambooth-scripts-for-autodl dreambooth
cd dreambooth
git submodule update --init --recursive
# TODO write an init script to help configure the environment
# for now you have to do it manually
```

## Diffusers

### Conda Environment Configuration

Using conda/mamba with Python 3.10.6.

First of all, you have to install conda and mamba; I assume you have done that (if not, see the sketch just below).
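
A minimal sketch of getting both, assuming Linux x86_64 and the Miniforge installer (recent Miniforge releases bundle mamba; use whatever installer you prefer):

```bash
wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-x86_64.sh
bash Miniforge3-Linux-x86_64.sh -b
source ~/miniforge3/bin/activate
```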
Then create the environment:

```bash
# See https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-python.html
mamba create -n diffusers python=3.10.6
```

If you're NOT using my scripts, you can configure bash instead:

```bash
conda init bash
conda activate diffusers
# or use your favourite text editor
# or sed, whatever
# see also https://stackoverflow.com/questions/17701989/how-do-i-append-text-to-a-file
vim ~/.bashrc
# echo "conda activate diffusers" >> ~/.bashrc
# add `conda activate diffusers` to the end of the file
```

#### PowerShell

If you're using my scripts you have to install PowerShell.
See [Installing PowerShell on Ubuntu](https://learn.microsoft.com/en-us/powershell/scripting/install/install-ubuntu?view=powershell-7.2).

```bash
# after installing PowerShell
conda init powershell
conda activate diffusers
vim ~/.config/powershell/profile.ps1
# add `conda activate diffusers` to the end of the file
```

> Why do I use PowerShell? Because I can't write correct bash scripts (help wanted!) and I'm too lazy to use Python.

### Clone the Repo

I'm using [the diffusers fork of CCRcmcpe](https://github.com/CCRcmcpe/diffusers), which adds [wandb](https://wandb.ai/site) support and a few improvements.

```bash
git clone https://github.com/CCRcmcpe/diffusers repos/diffusers
```

LLVM 10 is required as well:

```bash
# https://packages.ubuntu.com/focal/llvm-10-dev
apt install llvm-10-dev
```

```bash
# install these packages
# switch to the TUNA PyPI mirror if you get any network error
# https://mirrors.tuna.tsinghua.edu.cn/help/pypi/
pip install -U pip
pip install wandb
pip install -U --pre triton
pip install accelerate==0.12.0 transformers ftfy bitsandbytes gradio
pip install omegaconf einops pytorch_lightning
cd repos/diffusers
pip install .
```
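
A quick sanity check that the install worked (a sketch; any `ImportError` here means one of the packages above is missing or broken):

```bash
python -c "import diffusers, transformers, bitsandbytes, wandb; print(diffusers.__version__)"
```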
## xformers

[xformers](https://github.com/facebookresearch/xformers) is totally OPTIONAL. You can skip this part if you feel like it. It only speeds up the training process, which matters little if you have a beefy machine like an A5000. If you mess up anything about xformers, just uninstall it:

```bash
pip uninstall xformers
```

A [prebuilt wheel](https://github.com/crosstyan/dreambooth-scripts-for-autodl/releases/tag/v0.0.14) is available, built with:

```txt
Cuda compilation tools, release 11.3, V11.3.109
RTX A5000
Python: 3.10.6
OS: Ubuntu 20.04.4 LTS x86_64
Kernel: 5.4.0-100-generic
```

```bash
wget https://github.com/crosstyan/dreambooth-scripts-for-autodl/releases/download/v0.0.14/xformers-0.0.14.dev0-cp310-cp310-linux_x86_64.whl
pip install xformers-0.0.14.dev0-cp310-cp310-linux_x86_64.whl
```

### Compile from source

Here's how you build it from source.

```bash
# https://github.com/C43H66N12O12S2/stable-diffusion-webui/releases/download/linux/xformers-0.0.14.dev0-cp310-cp310-linux_x86_64.whl
# won't work since the AutoDL-provided Ubuntu version is too old
# (GLIBC_2.32 is required)
git clone https://github.com/facebookresearch/xformers repos/xformers
cd repos/xformers
# install ninja to speed up building
pip install ninja
# or maybe
# apt install ninja-build build-essential
pip install -r requirements.txt
# use `pip wheel .` to create a whl file
pip install .
```

~~I'm not sure if `MAKEFLAGS` is effective since it still takes a long time to compile and still only one core is used. I mean about an hour or less, not sure.~~

Using ninja can speed the building process up ([source](https://github.com/facebookresearch/xformers/issues/481)).

## Tagging

See also [blip_helper](https://github.com/crosstyan/blip_helper). It applies the results of [BLIP](https://github.com/salesforce/BLIP) and [DeepDanbooru](https://github.com/KichangKim/DeepDanbooru) at the same time.

```bash
python blip_helper/run.py --path /path/to/your/image
```

## Usage

See [Usage](Usage.md)

## TODOs

- [ ] Provide a Jupyter interface directly
- [ ] Integrate with the AUTOMATIC1111 WebUI (I'm afraid there's not enough space)
--------------------------------------------------------------------------------
/train.ps1:
--------------------------------------------------------------------------------
# Usage: pwsh ./train.ps1 [-alt]
param(
    [switch] $alt=$false
)

$Trainer = "repos/diffusers/examples/dreambooth/train_dreambooth.py"

# https://stackoverflow.com/questions/31879814/check-if-a-file-exists-or-not-in-windows-powershell
if ($alt) {
    Write-Host "Trying to use the alternative script"
    $Trainer = "repos/diffusers/examples/dreambooth/train_dreambooth_alt.py"
    $exist = Test-Path $Trainer -PathType Leaf
    if (!$exist) {
        Write-Host -ForegroundColor red "Alternative trainer not found. Run 'get_alt_script.ps1' to get it."
        exit 1
    }
} else {
    $exist = Test-Path $Trainer -PathType Leaf
    if (!$exist) {
        Write-Host -ForegroundColor red "Trainer not found. Have you run 'git submodule update --init --recursive'?"
        exit 1
    }
}

$AutoDLTmp = "/root/autodl-tmp"

# NOTE: the variables below are currently unused by the command at the bottom;
# everything is configured via the YAML config. Kept for reference.
# maybe use 512?
$Resolution = 768

$ConfigPath = Join-Path (Get-Location) "dreambooth.yaml"

# Previewing
# Prompt for saving samples.
$SaveSamplePrompt = "sks 1girl standing looking at viewer, cowboy shot"
$SaveSampleNegative = "lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry"

# TODO: detect if the model exists
# Remind the user to execute `convert.ps1` if the model does not exist.

# Model path
$ModelName = "animefull-pruned"
# dump to autodl-tmp to save space on the system disk
$ModelPath = Join-Path $AutoDLTmp $ModelName
$VaePath = Join-Path $ModelPath "vae"
$OutPath = Join-Path $AutoDLTmp "output"
mkdir -p $OutPath

# use this setting if you are using an A5000 like me

# See https://github.com/CCRcmcpe/diffusers/blob/main/examples/dreambooth/modules/args.py
# for the full parameter list
accelerate launch $Trainer `
    --pretrained_model_name_or_path $ModelPath `
    --pretrained_vae_name_or_path $VaePath `
    --output_dir $OutPath `
    <# Everything should be configured here #> `
    --config $ConfigPath `
    <# WandB project name #> `
    <# --project "test" #> `
    <# ID of this run, randomly generated by default #> `
    <# --run_id "test" #> `
    <# would override epochs if provided #> `
    <# --train_n_steps=2000 #> `
    <# target epochs #> `
    --train_to_epochs=50

# `gradient accumulation` will save VRAM but slow training down
# `train_text_encoder` would also train the text encoder
# increase the batch size if you still have spare VRAM

# TODO: write a script to inference images with parameters like json?
# don't care about inference. I would do it somewhere else.
# see `back.ps1`
--------------------------------------------------------------------------------
/utils/README.md:
--------------------------------------------------------------------------------
# Utils

Some random scripts for me to handle the data.
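
Both scripts are edit-and-run: change the hard-coded paths at the top first, then (assuming pandas is installed for `handle_annotation.py`):

```bash
python utils/handle_annotation.py  # strip embedded image extensions from tag filenames
python utils/rename.py             # rename pictures + matching .txt to the picture's md5
```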
--------------------------------------------------------------------------------
/utils/handle_annotation.py:
--------------------------------------------------------------------------------
from pathlib import Path
import glob
import os
import pandas as pd

# base_dir = "C:/Users/cross/Desktop/Grabber"
# base_dir = "C:\\Users\\cross\\Desktop\\Shion"
base_dir = "C:\\Users\\cross\\Desktop\\gyokai"
txts = os.path.join(base_dir, "*.txt")
texts = glob.glob(txts)

delete_suffix = [".jpg", ".png", ".gif", ".jpeg", ".bmp", ".webp"]


# strip embedded image extensions from tag file names,
# e.g. `foo.jpg.txt` becomes `foo.txt`
def rename_filename(files: list[str]):
    for file in files:
        p = Path(file)
        s = p.stem
        for suffix in delete_suffix:
            s = s.replace(suffix, "")
        p.rename(p.with_name(s).with_suffix(p.suffix))


def correct_artists_name(txt: str) -> str:
    strange_artists = {
        "aaaa+aaaa (quad-a)": "aaaa",
        "hagi (ame hagi)": "hagi",
        "muk (monsieur)": "muk",
        "asou (asabu202)": "asou",
        "chou (meteorite3)": "chou",
        "shion (mirudakemann)": "shion",
    }
    # I assume you only get one artist
    for k, v in strange_artists.items():
        if k in txt:
            return txt.replace(k, v)
    # can't find any strange artist
    return txt


# count how many images each author has, assuming the first line
# of each txt ends with "by <author>"
def read_authors(files: list[str]) -> dict[str, int]:
    result = {}
    for file in files:
        with open(file, "r") as f:
            l = f.readline()
            by = l.split("by")[-1]
            by = by.strip()
            result[by] = result.get(by, 0) + 1
    return result


def print_ranking(authors: dict[str, int]):
    df = pd.DataFrame(authors.items(), columns=["Author", "Count"])
    # only show authors with more than 5 images
    d = df[df["Count"] > 5].sort_values("Count", ascending=False)
    print(d)


def do_correct(files: list[str]):
    for file in files:
        with open(file, "r") as f:
            old = f.readline()
        with open(file, "w") as f:
            f.write(correct_artists_name(old))


rename_filename(texts)
# do_correct(texts)
# authors = read_authors(texts)
# print_ranking(authors)
--------------------------------------------------------------------------------
/utils/rename.py:
--------------------------------------------------------------------------------
# rename pictures (and their matching tag txt) to the md5 of the picture
import os
import glob
from pathlib import Path
import hashlib


base_dir = "C:\\Users\\cross\\Desktop\\dataset\\mt\\akima_sketch"
picture_suffix = ("jpg", "png", "gif", "jpeg", "bmp", "webp")
txt_suffix = "txt"

files_grabbed = glob.glob(os.path.join(base_dir, "*"), recursive=True)
print("found {} files".format(len(files_grabbed)))
files_with_ext = [f for f in files_grabbed if f.endswith(picture_suffix)]
print("found {} files with picture extensions".format(len(files_with_ext)))
file_with_txt = [f for f in files_grabbed if f.endswith(txt_suffix)]

for file in files_with_ext:
    p = Path(file)
    old_name = p.stem
    with open(file, "rb") as fh:
        h = hashlib.md5(fh.read()).hexdigest()
    p.rename(p.with_name(h).with_suffix(p.suffix))
    # rename the matching tag txt (if any) to the same md5
    for txt in file_with_txt:
        t = Path(txt)
        if t.stem == old_name:
            t.rename(t.with_name(h).with_suffix(t.suffix))
            break
--------------------------------------------------------------------------------