├── src └── __init__.py ├── generation_idea_template ├── probiou │ ├── seed_ideas.json │ ├── prompt.json │ └── experiment.py ├── plane_voronoi │ ├── seed_ideas.json │ ├── code │ │ ├── img │ │ │ ├── voronoi.jpg │ │ │ └── voronoi_adaptive.jpg │ │ └── graph_based_voronoi.py │ └── prompt.json ├── coordattention │ ├── seed_ideas.json │ ├── prompt.json │ ├── experiment.py │ └── code │ │ ├── xxx-cross_dim_coordatt.py │ │ ├── xxx-hierarchical_coordatt.py │ │ ├── xxa-depthwise_coordatt.py │ │ ├── xaa-global_context_coordatt.py │ │ ├── xaa-temporal_coordatt.py │ │ ├── xaa-channel_mix_coordatt.py │ │ ├── xxx-geo_transform_coordatt.py │ │ ├── xxx-dynamic_complexity_coordatt.py │ │ ├── xxx-dual_domain_coordatt.py │ │ ├── xxx-se_coordatt.py │ │ ├── xxx-shared_params_coordatt.py │ │ ├── xxx-probabilistic_coordatt.py │ │ ├── xxa-edge_aware_coordatt.py │ │ ├── aaa-freq_domain_coordatt.py │ │ ├── xaa-adaptive_coordatt.py │ │ ├── xxx-content_adaptive_coordatt.py │ │ ├── xxx-multi_scale_coordatt.py │ │ ├── xxx-sparse_coordatt.py │ │ ├── xxa-nonlocal_coordatt.py │ │ └── xxa-deformable_pooling_coordatt.py ├── coordattention-gemini │ ├── seed_ideas.json │ ├── prompt.json │ ├── experiment.py │ └── code │ │ ├── pre_pool_depthwise_spatial.py │ │ ├── separate_learnable_fusion.py │ │ ├── modulated_post_sigmoid_attention.py │ │ ├── sigmoid_weighted_interaction_group_conv.py │ │ ├── pre_pool_spatial_adaptive_channel_attention.py │ │ ├── pre_pool_conv_spatial_attention.py │ │ ├── refined_modulated_cross_spatial_interaction.py │ │ ├── early_fusion_addition_projected.py │ │ ├── bottleneck_attention_modulation.py │ │ └── sigmoid_dynamic_weighted_fusion.py └── small_object_attention │ ├── seed_ideas.json │ ├── prompt.json │ ├── experiment.py │ └── code │ ├── scale_normalization_attention.py │ ├── hierarchical_attention_scaling.py │ ├── semantic_attention.py │ ├── meta_attention.py │ ├── spatial_channel_attention.py │ ├── simulated_temporal_attention.py │ ├── cross_channel_attention.py │ ├── dynamic_data_transformation.py │ ├── dual_attention_integration.py │ ├── multi_resolution_attention.py │ ├── contrastive_attention_enhancement.py │ ├── quality_adaptive_attention.py │ ├── internal_attention_bootstrapping.py │ ├── sparsity_attention.py │ ├── contextual_attention.py │ ├── task_adaptive_attention.py │ ├── pyramid_attention.py │ ├── adaptive_complexity_attention.py │ ├── adaptive_gating_attention.py │ ├── denoising_attention.py │ ├── temporal_attention_fusion.py │ ├── simplified_graph_attention.py │ ├── geometric_transformation_attention.py │ ├── dynamic_attention_selection.py │ ├── color_channel_fusion_attention.py │ ├── uncertainty_guided_attention.py │ └── contrastive_learning_integration.py ├── paper_review_example ├── attention.pdf ├── gan_diffusion.pdf └── layerwise_lr_grokking.pdf ├── .env.example ├── requirements.txt ├── paper_review_example.py ├── generation_idea_example.py ├── review.txt ├── README.md └── .gitignore /src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /generation_idea_template/probiou/seed_ideas.json: -------------------------------------------------------------------------------- 1 | [ 2 | 3 | ] -------------------------------------------------------------------------------- /generation_idea_template/plane_voronoi/seed_ideas.json: -------------------------------------------------------------------------------- 1 | [ 2 | 3 | ] 
--------------------------------------------------------------------------------
/generation_idea_template/coordattention/seed_ideas.json:
--------------------------------------------------------------------------------
1 | [
2 | 
3 | ]
--------------------------------------------------------------------------------
/generation_idea_template/coordattention-gemini/seed_ideas.json:
--------------------------------------------------------------------------------
1 | [
2 | 
3 | ]
--------------------------------------------------------------------------------
/generation_idea_template/small_object_attention/seed_ideas.json:
--------------------------------------------------------------------------------
1 | [
2 | 
3 | ]
--------------------------------------------------------------------------------
/paper_review_example/attention.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KMnO4-zx/paper-agent/HEAD/paper_review_example/attention.pdf
--------------------------------------------------------------------------------
/paper_review_example/gan_diffusion.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KMnO4-zx/paper-agent/HEAD/paper_review_example/gan_diffusion.pdf
--------------------------------------------------------------------------------
/paper_review_example/layerwise_lr_grokking.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KMnO4-zx/paper-agent/HEAD/paper_review_example/layerwise_lr_grokking.pdf
--------------------------------------------------------------------------------
/.env.example:
--------------------------------------------------------------------------------
1 | OPENAI_API_KEY=''
2 | # OPENAI_BASE_URL=''
3 | 
4 | DEEPSEEK_API_KEY=''
5 | DEEPSEEK_BASE_URL='https://api.deepseek.com'
6 | 
7 | # API key for Semantic Scholar (S2)
8 | S2_API_KEY=''
--------------------------------------------------------------------------------
/generation_idea_template/plane_voronoi/code/img/voronoi.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KMnO4-zx/paper-agent/HEAD/generation_idea_template/plane_voronoi/code/img/voronoi.jpg
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | # LLM APIs
2 | backoff
3 | openai
4 | # PDF parsing
5 | pypdf
6 | pymupdf4llm
7 | # Common Requirements
8 | numpy
9 | tqdm
10 | # env
11 | python-dotenv
12 | 
--------------------------------------------------------------------------------
/generation_idea_template/plane_voronoi/code/img/voronoi_adaptive.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KMnO4-zx/paper-agent/HEAD/generation_idea_template/plane_voronoi/code/img/voronoi_adaptive.jpg
--------------------------------------------------------------------------------
/generation_idea_template/small_object_attention/prompt.json:
--------------------------------------------------------------------------------
1 | {
2 |     "system": "You are an ambitious AI PhD student who is looking to publish a paper that will contribute significantly to the field.",
3 |     "task_description": "You are given the following file to work with; improve it to better detect small targets in object detection."
4 | }
--------------------------------------------------------------------------------
/generation_idea_template/plane_voronoi/prompt.json:
--------------------------------------------------------------------------------
1 | {
2 |     "system": "You are an ambitious AI PhD student who is looking to publish a paper that will contribute significantly to the field.",
3 |     "task_description": "You are given the following file to work with; improve the algorithm so that it generates Voronoi diagrams more efficiently."
4 | }
--------------------------------------------------------------------------------
/generation_idea_template/coordattention/prompt.json:
--------------------------------------------------------------------------------
1 | {
2 |     "system": "You are an ambitious AI PhD student who is looking to publish a paper that will contribute significantly to the field.",
3 |     "task_description": "You are given the following file to work with; you can improve this attention module by adapting its feature extraction and lightweight attention mechanisms."
4 | }
--------------------------------------------------------------------------------
/generation_idea_template/coordattention-gemini/prompt.json:
--------------------------------------------------------------------------------
1 | {
2 |     "system": "You are an ambitious AI PhD student who is looking to publish a paper that will contribute significantly to the field.",
3 |     "task_description": "You are given the following file to work with; you can improve this attention module by adapting its feature extraction and lightweight attention mechanisms."
4 | }
--------------------------------------------------------------------------------
/generation_idea_template/probiou/prompt.json:
--------------------------------------------------------------------------------
1 | {
2 |     "system": "You are an ambitious AI PhD student who is looking to publish a paper that will contribute significantly to the field.",
3 |     "task_description": "You are provided with the following file to work with. Without modifying the core ProbIoU algorithm, analyze the shortcomings of CIoU and propose a new, optimized IoU algorithm."
4 | }
--------------------------------------------------------------------------------
/paper_review_example.py:
--------------------------------------------------------------------------------
1 | from src.perform_review import load_paper, perform_review
2 | from src.prompt import reviewer_system_prompt_neg, reviewer_system_prompt_base, reviewer_system_prompt_pos
3 | from openai import OpenAI
4 | import json
5 | import os
6 | 
7 | import pprint
8 | 
9 | # gpt-4o
10 | openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"), base_url=os.getenv("OPENAI_BASE_URL"))
11 | 
12 | # deepseek-chat
13 | deepseek_client = OpenAI(api_key=os.getenv('DEEPSEEK_API_KEY'), base_url=os.getenv('DEEPSEEK_BASE_URL'))
14 | 
15 | # load paper
16 | text = load_paper("./paper_review_example/attention.pdf")
17 | 
18 | """
19 | reviewer_system_prompt: there are three strictness levels
20 | reviewer_system_prompt_neg : strict mode
21 | reviewer_system_prompt_base : moderate mode
22 | reviewer_system_prompt_pos : lenient mode
23 | """
24 | # Using gpt-4o for the review is recommended; deepseek-chat may turn up some unexpected surprises~
25 | review = perform_review(text, 'gpt-4o-2024-08-06', openai_client, num_reviews_ensemble=1, num_reflections=2, reviewer_system_prompt=reviewer_system_prompt_neg)
26 | 
27 | with open('review.txt', 'a') as f:
28 |     json.dump(review, f, indent=4)
29 |     f.write('\n\n\n')  # add blank lines so that multiple JSON objects stay separated
30 | 
31 | pprint.pp(review)
--------------------------------------------------------------------------------
/generation_idea_example.py:
--------------------------------------------------------------------------------
1 | from src.generate_idea import generate_ideas, check_idea_novelty, generation_idea_code
2 | from openai import OpenAI
3 | import json
4 | import os
5 | import pprint
6 | 
7 | from dotenv import load_dotenv, find_dotenv
8 | _ = load_dotenv(find_dotenv())
9 | 
10 | # gpt-4o
11 | openai_model = "gpt-4o-2024-08-06"
12 | openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"), base_url=os.getenv("OPENAI_BASE_URL"))
13 | 
14 | # deepseek-chat
15 | deepseek_model = "deepseek-chat"
16 | deepseek_client = OpenAI(api_key=os.getenv('DEEPSEEK_API_KEY'), base_url=os.getenv('DEEPSEEK_BASE_URL'))
17 | 
18 | base_dir = './generation_idea_template/probiou/'
19 | 
20 | # generate ideas
21 | # The generated ideas are saved to a file automatically; subsequent runs load them from that file directly
22 | # ideas = generate_ideas(
23 | #     base_dir=base_dir,
24 | #     client=openai_client,
25 | #     model=openai_model,
26 | #     skip_generation=False,
27 | #     max_num_generations=20,
28 | #     num_reflections=5,
29 | # )
30 | 
31 | with open(os.path.join(base_dir, 'ideas.json'), 'r') as f:
32 |     ideas = json.load(f)
33 | 
34 | # check novelty
35 | # The novelty results are written back to ideas.json automatically; subsequent runs load them from that file directly
36 | # novelty_ideas = check_idea_novelty(
37 | #     ideas=ideas,
38 | #     base_dir=base_dir,
39 | #     client=openai_client,
40 | #     model=openai_model,
41 | # )
42 | 
43 | # generate code for the ideas; the code is saved to the base_dir/code directory automatically
44 | generation_idea_code(base_dir=base_dir, client=openai_client, model=openai_model, num_reflections=5)
45 | 
46 | pprint.pp(ideas)
47 | 
--------------------------------------------------------------------------------
/generation_idea_template/small_object_attention/experiment.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | from torch import flatten, nn
4 | from torch.nn import init
5 | from torch.nn.modules.activation import ReLU
6 | from torch.nn.modules.batchnorm import BatchNorm2d
7 | from torch.nn import functional as F
8 | 
9 | 
10 | class SEAttention(nn.Module):
11 | 
12 |     def
__init__(self, channel=512,reduction=16): 13 | super().__init__() 14 | self.avg_pool = nn.AdaptiveAvgPool2d(1) 15 | self.fc = nn.Sequential( 16 | nn.Linear(channel, channel // reduction, bias=False), 17 | nn.ReLU(inplace=True), 18 | nn.Linear(channel // reduction, channel, bias=False), 19 | nn.Sigmoid() 20 | ) 21 | 22 | def init_weights(self): 23 | for m in self.modules(): 24 | if isinstance(m, nn.Conv2d): 25 | init.kaiming_normal_(m.weight, mode='fan_out') 26 | if m.bias is not None: 27 | init.constant_(m.bias, 0) 28 | elif isinstance(m, nn.BatchNorm2d): 29 | init.constant_(m.weight, 1) 30 | init.constant_(m.bias, 0) 31 | elif isinstance(m, nn.Linear): 32 | init.normal_(m.weight, std=0.001) 33 | if m.bias is not None: 34 | init.constant_(m.bias, 0) 35 | 36 | def forward(self, x): 37 | b, c, _, _ = x.size() 38 | y = self.avg_pool(x).view(b, c) 39 | y = self.fc(y).view(b, c, 1, 1) 40 | return x * y.expand_as(x) 41 | 42 | if __name__ == '__main__': 43 | model = SEAttention() 44 | model.init_weights() 45 | input = torch.randn(1, 512, 7, 7) 46 | output = model(input) 47 | print(output.shape) -------------------------------------------------------------------------------- /review.txt: -------------------------------------------------------------------------------- 1 | { 2 | "Summary": "The paper presents the Transformer, a novel neural network architecture that utilizes attention mechanisms exclusively, eliminating the need for recurrence or convolution. It demonstrates superior performance on machine translation tasks, with enhanced parallelization and reduced training time compared to existing models.", 3 | "Strengths": [ 4 | "Introduces a novel architecture based entirely on attention mechanisms.", 5 | "Achieves state-of-the-art results on machine translation tasks.", 6 | "Improves computational efficiency and parallelization.", 7 | "Comprehensive experimental evaluation and analysis.", 8 | "Clear and well-organized writing." 9 | ], 10 | "Weaknesses": [ 11 | "Limited exploration of tasks beyond machine translation.", 12 | "Potential limitations in handling very long sequences not extensively discussed.", 13 | "Lacks discussion of potential societal impacts." 14 | ], 15 | "Originality": 4, 16 | "Quality": 4, 17 | "Clarity": 4, 18 | "Significance": 4, 19 | "Questions": [ 20 | "How does the model perform on tasks beyond machine translation?", 21 | "Are there any specific challenges anticipated for applying the Transformer to very long sequences?", 22 | "Could the authors elaborate on any potential ethical concerns or societal impacts?" 23 | ], 24 | "Limitations": [ 25 | "The model may have limitations with very long sequences due to fixed positional encodings.", 26 | "Lack of exploration in diverse application domains could imply limited generalization evidence.", 27 | "Does not address the potential negative societal impacts or ethical considerations of deploying such models." 
28 | ], 29 | "Ethical Concerns": false, 30 | "Soundness": 4, 31 | "Presentation": 4, 32 | "Contribution": 4, 33 | "Overall": 9, 34 | "Confidence": 5, 35 | "Decision": "Accept" 36 | } 37 | 38 | 39 | -------------------------------------------------------------------------------- /generation_idea_template/coordattention-gemini/experiment.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class h_sigmoid(nn.Module): 7 | def __init__(self, inplace=True): 8 | super(h_sigmoid, self).__init__() 9 | self.relu = nn.ReLU6(inplace=inplace) 10 | 11 | def forward(self, x): 12 | return self.relu(x + 3) / 6 13 | 14 | 15 | class h_swish(nn.Module): 16 | def __init__(self, inplace=True): 17 | super(h_swish, self).__init__() 18 | self.sigmoid = h_sigmoid(inplace=inplace) 19 | 20 | def forward(self, x): 21 | return x * self.sigmoid(x) 22 | 23 | 24 | class CoordAtt(nn.Module): 25 | def __init__(self, inp, reduction=32): 26 | super(CoordAtt, self).__init__() 27 | self.pool_h = nn.AdaptiveAvgPool2d((None, 1)) 28 | self.pool_w = nn.AdaptiveAvgPool2d((1, None)) 29 | 30 | mip = max(8, inp // reduction) 31 | 32 | self.conv1 = nn.Conv2d(inp, mip, kernel_size=1, stride=1, padding=0) 33 | self.bn1 = nn.BatchNorm2d(mip) 34 | self.act = h_swish() 35 | 36 | self.conv_h = nn.Conv2d(mip, inp, kernel_size=1, stride=1, padding=0) 37 | self.conv_w = nn.Conv2d(mip, inp, kernel_size=1, stride=1, padding=0) 38 | 39 | def forward(self, x): 40 | identity = x 41 | 42 | n, c, h, w = x.size() 43 | x_h = self.pool_h(x) 44 | x_w = self.pool_w(x).permute(0, 1, 3, 2) 45 | 46 | y = torch.cat([x_h, x_w], dim=2) 47 | y = self.conv1(y) 48 | y = self.bn1(y) 49 | y = self.act(y) 50 | 51 | x_h, x_w = torch.split(y, [h, w], dim=2) 52 | x_w = x_w.permute(0, 1, 3, 2) 53 | 54 | a_h = self.conv_h(x_h).sigmoid() 55 | a_w = self.conv_w(x_w).sigmoid() 56 | 57 | out = identity * a_w * a_h 58 | 59 | return out 60 | 61 | if __name__ == '__main__': 62 | x = torch.randn(2, 64, 32, 32) 63 | att = CoordAtt(inp=64, reduction=32) 64 | out = att(x) 65 | print("输入尺寸:", x.shape) 66 | print("输出尺寸:", out.shape) -------------------------------------------------------------------------------- /generation_idea_template/coordattention/experiment.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class h_sigmoid(nn.Module): 7 | def __init__(self, inplace=True): 8 | super(h_sigmoid, self).__init__() 9 | self.relu = nn.ReLU6(inplace=inplace) 10 | 11 | def forward(self, x): 12 | return self.relu(x + 3) / 6 13 | 14 | 15 | class h_swish(nn.Module): 16 | def __init__(self, inplace=True): 17 | super(h_swish, self).__init__() 18 | self.sigmoid = h_sigmoid(inplace=inplace) 19 | 20 | def forward(self, x): 21 | return x * self.sigmoid(x) 22 | 23 | 24 | class CoordAtt(nn.Module): 25 | def __init__(self, inp, reduction=32): 26 | super(CoordAtt, self).__init__() 27 | self.pool_h = nn.AdaptiveAvgPool2d((None, 1)) 28 | self.pool_w = nn.AdaptiveAvgPool2d((1, None)) 29 | 30 | mip = max(8, inp // reduction) 31 | 32 | self.conv1 = nn.Conv2d(inp, mip, kernel_size=1, stride=1, padding=0) 33 | self.bn1 = nn.BatchNorm2d(mip) 34 | self.act = h_swish() 35 | 36 | self.conv_h = nn.Conv2d(mip, inp, kernel_size=1, stride=1, padding=0) 37 | self.conv_w = nn.Conv2d(mip, inp, kernel_size=1, stride=1, padding=0) 38 | 39 | def forward(self, x): # x (2, 64, 32, 
32)
40 |         identity = x  # residual connection
41 | 
42 |         n, c, h, w = x.size()  # get the h and w dimensions
43 |         x_h = self.pool_h(x)  # (2, 64, 32, 1)
44 |         x_w = self.pool_w(x).permute(0, 1, 3, 2)  # (2, 64, 1, 32) --> (2, 64, 32, 1)
45 | 
46 |         y = torch.cat([x_h, x_w], dim=2)  # (2, 64, 64, 1)
47 |         y = self.conv1(y)  # (2, 8, 64, 1)
48 |         y = self.bn1(y)  # (2, 8, 64, 1)
49 |         y = self.act(y)  # (2, 8, 64, 1)
50 | 
51 |         x_h, x_w = torch.split(y, [h, w], dim=2)  # (2, 8, 32, 1), (2, 8, 32, 1)
52 |         x_w = x_w.permute(0, 1, 3, 2)  # (2, 8, 1, 32)
53 | 
54 |         a_h = self.conv_h(x_h).sigmoid()  # (2, 64, 32, 1)
55 |         a_w = self.conv_w(x_w).sigmoid()  # (2, 64, 1, 32)
56 |         out = identity * a_w * a_h  # (2, 64, 32, 32)
57 |         return out
58 | 
59 | if __name__ == '__main__':
60 |     x = torch.randn(2, 64, 32, 32)
61 |     att = CoordAtt(inp=64, reduction=32)
62 |     out = att(x)
63 |     print("输入尺寸:", x.shape)
64 |     print("输出尺寸:", out.shape)
--------------------------------------------------------------------------------
/generation_idea_template/small_object_attention/code/scale_normalization_attention.py:
--------------------------------------------------------------------------------
1 | """
2 | Enhance SEAttention by introducing a scale normalization layer that preprocesses feature maps to emphasize smaller targets
3 | Implement this using a learned scaling factor that dynamically adjusts feature intensities based on size relevance before applying SEAttention
4 | Modify the forward function to include this normalization step, ensuring minimal computational overhead
5 | Evaluate the model's effectiveness by comparing precision and recall metrics, alongside visualizations of attention focus on small targets, using synthetic datasets
6 | 
7 | """
8 | 
9 | # Modified code
10 | 
11 | import numpy as np
12 | import torch
13 | from torch import flatten, nn
14 | from torch.nn import init
15 | from torch.nn.modules.activation import ReLU
16 | from torch.nn.modules.batchnorm import BatchNorm2d
17 | from torch.nn import functional as F
18 | 
19 | 
20 | class SEAttention(nn.Module):
21 | 
22 |     def __init__(self, channel=512, reduction=16):
23 |         super().__init__()
24 |         self.avg_pool = nn.AdaptiveAvgPool2d(1)
25 |         self.fc = nn.Sequential(
26 |             nn.Linear(channel, channel // reduction, bias=False),
27 |             nn.ReLU(inplace=True),
28 |             nn.Linear(channel // reduction, channel, bias=False),
29 |             nn.Sigmoid()
30 |         )
31 |         self.scale_norm = nn.Parameter(torch.ones(1, channel, 1, 1))
32 | 
33 |     def init_weights(self):
34 |         for m in self.modules():
35 |             if isinstance(m, nn.Conv2d):
36 |                 init.kaiming_normal_(m.weight, mode='fan_out')
37 |                 if m.bias is not None:
38 |                     init.constant_(m.bias, 0)
39 |             elif isinstance(m, nn.BatchNorm2d):
40 |                 init.constant_(m.weight, 1)
41 |                 init.constant_(m.bias, 0)
42 |             elif isinstance(m, nn.Linear):
43 |                 init.normal_(m.weight, std=0.001)
44 |                 if m.bias is not None:
45 |                     init.constant_(m.bias, 0)
46 | 
47 |     def forward(self, x):
48 |         # Scale normalization step
49 |         x = x * self.scale_norm
50 | 
51 |         # SEAttention mechanism
52 |         b, c, _, _ = x.size()
53 |         y = self.avg_pool(x).view(b, c)
54 |         y = self.fc(y).view(b, c, 1, 1)
55 |         return x * y.expand_as(x)
56 | 
57 | if __name__ == '__main__':
58 |     model = SEAttention()
59 |     model.init_weights()
60 |     input = torch.randn(1, 512, 7, 7)
61 |     output = model(input)
62 |     print(output.shape)
--------------------------------------------------------------------------------
/generation_idea_template/small_object_attention/code/hierarchical_attention_scaling.py:
--------------------------------------------------------------------------------
1 | """
2 | Modify the SEAttention class to incorporate a dynamic scaling factor for attention weights
3 | Implement a new function that computes scaling factors based on the spatial dimensions of feature maps
4 | Integrate this function into the forward pass of SEAttention to adjust attention weights dynamically
5 | Evaluate performance using precision, recall, and F1-score on small target detection tasks, comparing against the baseline SEAttention model and other enhanced models to demonstrate improvements in detecting small targets
6 | 
7 | """
8 | 
9 | import numpy as np
10 | import torch
11 | from torch import flatten, nn
12 | from torch.nn import init
13 | from torch.nn.modules.activation import ReLU
14 | from torch.nn.modules.batchnorm import BatchNorm2d
15 | from torch.nn import functional as F
16 | 
17 | class SEAttention(nn.Module):
18 | 
19 |     def __init__(self, channel=512, reduction=16):
20 |         super().__init__()
21 |         self.avg_pool = nn.AdaptiveAvgPool2d(1)
22 |         self.fc = nn.Sequential(
23 |             nn.Linear(channel, channel // reduction, bias=False),
24 |             nn.ReLU(inplace=True),
25 |             nn.Linear(channel // reduction, channel, bias=False),
26 |             nn.Sigmoid()
27 |         )
28 | 
29 |     def init_weights(self):
30 |         for m in self.modules():
31 |             if isinstance(m, nn.Conv2d):
32 |                 init.kaiming_normal_(m.weight, mode='fan_out')
33 |                 if m.bias is not None:
34 |                     init.constant_(m.bias, 0)
35 |             elif isinstance(m, nn.BatchNorm2d):
36 |                 init.constant_(m.weight, 1)
37 |                 init.constant_(m.bias, 0)
38 |             elif isinstance(m, nn.Linear):
39 |                 init.normal_(m.weight, std=0.001)
40 |                 if m.bias is not None:
41 |                     init.constant_(m.bias, 0)
42 | 
43 |     def compute_scaling_factor(self, x):
44 |         _, _, h, w = x.size()
45 |         # Example scaling factor: inverse of the sum of spatial dimensions
46 |         return 1.0 / (h + w)
47 | 
48 |     def forward(self, x):
49 |         b, c, _, _ = x.size()
50 |         scaling_factor = self.compute_scaling_factor(x)
51 |         y = self.avg_pool(x).view(b, c)
52 |         y = self.fc(y).view(b, c, 1, 1)
53 |         y = y * scaling_factor
54 |         return x * y.expand_as(x)
55 | 
56 | if __name__ == '__main__':
57 |     model = SEAttention()
58 |     model.init_weights()
59 |     input = torch.randn(1, 512, 7, 7)
60 |     output = model(input)
61 |     print(output.shape)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Paper-Agent
2 | 
3 | > *This project is based on the [AI-Scientist](https://github.com/SakanaAI/AI-Scientist) project. Paper: [The AI Scientist: Towards Fully Automated Open-Ended Scientific Discovery](https://arxiv.org/abs/2408.06292)*
4 | 
5 | AI-Scientist is an excellent project with a cleverly designed experimental pipeline that is well worth studying. Current models, however, still fall short when it comes to writing papers and modifying code to run experiments. So, after carefully reading the AI-Scientist paper and codebase, the `generation ideas` and `paper review` parts of the code were extracted, lightly modified, and annotated with detailed Chinese comments to make them easier to learn from.
6 | 
7 | Every paper is born out of a long process of thinking and experimentation, and today's LLMs are probably not yet up to the task of running experiments. They can, however, offer us some bolder ideas, as well as reviews of papers. A review can surface unexpected strengths and weaknesses in a paper (strengths: tell a better story; weaknesses: play down what hurts, play up what helps, and leave out what you must! Haha). Hopefully, LLM idea generation and paper review can turn you into a ***paper printing machine!***
8 | 
9 | > *All prompts used by the project can be found in `src/prompt.py`.*
10 | 
11 | ## Usage
12 | 
13 | ### .env
14 | 
15 | First, configure a few environment variables; see the `.env.example` file for reference.
16 | 
17 | ```
18 | OPENAI_API_KEY=''
19 | # OPENAI_BASE_URL=''
20 | 
21 | DEEPSEEK_API_KEY=''
22 | DEEPSEEK_BASE_URL='https://api.deepseek.com'
23 | 
24 | # API key for Semantic Scholar (S2)
25 | S2_API_KEY=''
26 | ```
27 | 
28 | > *An S2 API key can be requested from [Semantic Scholar](https://www.semanticscholar.org/).*
29 | 
30 | ### Generation Ideas
31 | 
32 | First, prepare a few files: `experiment.py`, `prompt.json`, and `seed_ideas.json`.
33 | 
34 | - `experiment.py` is your experiment/module code; required.
35 | - `prompt.json` describes the task; required (a minimal example follows this list).
36 | - `seed_ideas.json` holds seed ideas for your experiment code; it may be empty, but the file must exist.
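37 | 
38 | For example, the `prompt.json` below is taken from the `probiou` template; adapt the `task_description` to your own module. `seed_ideas.json` can start out as just an empty list `[]`.
39 | 
40 | ```json
41 | {
42 |     "system": "You are an ambitious AI PhD student who is looking to publish a paper that will contribute significantly to the field.",
43 |     "task_description": "You are provided with the following file to work with. Without modifying the core ProbIoU algorithm, analyze the shortcomings of CIoU and propose a new, optimized IoU algorithm."
44 | }
45 | ```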
46 | 
47 | > See the examples in the `generation_idea_template` folder.
48 | 
49 | To run the code, see the `generation_idea_example.py` file; you can also run that file directly.
50 | 
51 | ```bash
52 | python generation_idea_example.py
53 | ```
54 | 
55 | > note: the results are also saved to the `ideas.json` file under `base_dir`.
56 | 
57 | #### Code walkthrough
58 | 
59 | The code in `generation_idea_example.py` is roughly as follows:
60 | 
61 | - `generate_ideas`: generates ideas.
62 | - `check_idea_novelty`: checks the novelty of the generated ideas; this requires a Semantic Scholar API key.
63 | - `generation_idea_code`: generates code from the ideas produced by the two functions above. If novelty has already been checked, it only generates code for ideas marked `novel=True`; otherwise it generates code for all ideas.
64 | 
65 | ```python
66 | # gpt-4o
67 | # generate ideas
68 | # The generated ideas are saved to a file automatically; subsequent runs load them from that file directly
69 | ideas = generate_ideas(
70 |     base_dir=base_dir,
71 |     client=openai_client,
72 |     model=openai_model,
73 |     skip_generation=False,
74 |     max_num_generations=20,
75 |     num_reflections=5,
76 | )
77 | 
78 | # check novelty
79 | # The novelty results are written back to ideas.json automatically; subsequent runs load them from that file directly
80 | novelty_ideas = check_idea_novelty(
81 |     ideas=ideas,
82 |     base_dir=base_dir,
83 |     client=openai_client,
84 |     model=openai_model,
85 | )
86 | 
87 | # generate code for the ideas; the code is saved to the base_dir/code directory automatically
88 | generation_idea_code(base_dir=base_dir, client=openai_client, model=openai_model, num_reflections=5)
89 | 
90 | pprint.pp(novelty_ideas)
91 | ```
92 | 
93 | ### Paper Review
94 | 
95 | To run the code, see the `paper_review_example.py` file; you can also run that file directly.
96 | 
97 | ```bash
98 | python paper_review_example.py
99 | ```
100 | 
101 | > note: the results are also saved to the `review.txt` file in the repository root.
102 | 
103 | ## Blog
104 | 
105 | *A blog post about AI-Scientist may follow later (time permitting).*
106 | 
--------------------------------------------------------------------------------
/generation_idea_template/coordattention-gemini/code/pre_pool_depthwise_spatial.py:
--------------------------------------------------------------------------------
1 | """
2 | Modify the `CoordAtt` module
3 | Before pooling, apply a depthwise convolution to mix the spatial information
4 | Then, perform the standard height and width pooling
5 | Modify the `__init__` to include the depthwise convolution
6 | Modify the `forward` to implement the depthwise convolution, and then the standard pooling
7 | The rest of the forward pass remains unchanged
8 | Compare the output with the baseline using the same test input and observe changes
9 | This involves adding a depthwise conv, and modifying the forward pass
10 | 
11 | """
12 | 
13 | import torch
14 | import torch.nn as nn
15 | import torch.nn.functional as F
16 | 
17 | 
18 | class h_sigmoid(nn.Module):
19 |     def __init__(self, inplace=True):
20 |         super(h_sigmoid, self).__init__()
21 |         self.relu = nn.ReLU6(inplace=inplace)
22 | 
23 |     def forward(self, x):
24 |         return self.relu(x + 3) / 6
25 | 
26 | 
27 | class h_swish(nn.Module):
28 |     def __init__(self, inplace=True):
29 |         super(h_swish, self).__init__()
30 |         self.sigmoid = h_sigmoid(inplace=inplace)
31 | 
32 |     def forward(self, x):
33 |         return x * self.sigmoid(x)
34 | 
35 | 
36 | class CoordAtt(nn.Module):
37 |     def __init__(self, inp, reduction=32):
38 |         super(CoordAtt, self).__init__()
39 |         # Depthwise convolution before pooling
40 |         self.depthwise = nn.Conv2d(inp, inp, kernel_size=3, stride=1, padding=1, groups=inp)
41 |         self.pool_h = nn.AdaptiveAvgPool2d((None, 1))
42 |         self.pool_w = nn.AdaptiveAvgPool2d((1, None))
43 | 
44 |         mip = max(8, inp // reduction)
45 | 
46 |         self.conv1 = 
nn.Conv2d(inp, mip, kernel_size=1, stride=1, padding=0) 47 | self.bn1 = nn.BatchNorm2d(mip) 48 | self.act = h_swish() 49 | 50 | self.conv_h = nn.Conv2d(mip, inp, kernel_size=1, stride=1, padding=0) 51 | self.conv_w = nn.Conv2d(mip, inp, kernel_size=1, stride=1, padding=0) 52 | 53 | def forward(self, x): 54 | identity = x 55 | 56 | # Apply depthwise conv 57 | x = self.depthwise(x) 58 | 59 | n, c, h, w = x.size() 60 | x_h = self.pool_h(x) 61 | x_w = self.pool_w(x).permute(0, 1, 3, 2) 62 | 63 | y = torch.cat([x_h, x_w], dim=2) 64 | y = self.conv1(y) 65 | y = self.bn1(y) 66 | y = self.act(y) 67 | 68 | x_h, x_w = torch.split(y, [h, w], dim=2) 69 | x_w = x_w.permute(0, 1, 3, 2) 70 | 71 | a_h = self.conv_h(x_h).sigmoid() 72 | a_w = self.conv_w(x_w).sigmoid() 73 | 74 | out = identity * a_w * a_h 75 | 76 | return out 77 | 78 | if __name__ == '__main__': 79 | x = torch.randn(2, 64, 32, 32) 80 | att = CoordAtt(inp=64, reduction=32) 81 | out = att(x) 82 | print("输入尺寸:", x.shape) 83 | print("输出尺寸:", out.shape) -------------------------------------------------------------------------------- /generation_idea_template/small_object_attention/code/semantic_attention.py: -------------------------------------------------------------------------------- 1 | """ 2 | Extend SEAttention by integrating a semantic attention module that utilizes a simple pooling strategy to identify salient semantic features 3 | Implement this by adding a global max pooling layer to extract prominent features, followed by a learnable attention layer that assigns weights based on semantic relevance 4 | Modify the forward function to incorporate this semantic attention after the channel attention 5 | Evaluate the model's effectiveness by comparing detection performance on synthetic datasets, focusing on improvements in semantic understanding and detection accuracy 6 | 7 | """ 8 | 9 | # Modified code 10 | 11 | import numpy as np 12 | import torch 13 | from torch import flatten, nn 14 | from torch.nn import init 15 | from torch.nn.modules.activation import ReLU 16 | from torch.nn.modules.batchnorm import BatchNorm2d 17 | from torch.nn import functional as F 18 | 19 | class SEAttention(nn.Module): 20 | 21 | def __init__(self, channel=512, reduction=16): 22 | super().__init__() 23 | self.avg_pool = nn.AdaptiveAvgPool2d(1) 24 | self.fc = nn.Sequential( 25 | nn.Linear(channel, channel // reduction, bias=False), 26 | nn.ReLU(inplace=True), 27 | nn.Linear(channel // reduction, channel, bias=False), 28 | nn.Sigmoid() 29 | ) 30 | # Semantic attention components 31 | self.global_max_pool = nn.AdaptiveMaxPool2d(1) 32 | self.semantic_fc = nn.Sequential( 33 | nn.Linear(channel, channel, bias=False), 34 | nn.Sigmoid() 35 | ) 36 | 37 | def init_weights(self): 38 | for m in self.modules(): 39 | if isinstance(m, nn.Conv2d): 40 | init.kaiming_normal_(m.weight, mode='fan_out') 41 | if m.bias is not None: 42 | init.constant_(m.bias, 0) 43 | elif isinstance(m, nn.BatchNorm2d): 44 | init.constant_(m.weight, 1) 45 | init.constant_(m.bias, 0) 46 | elif isinstance(m, nn.Linear): 47 | init.normal_(m.weight, std=0.001) 48 | if m.bias is not None: 49 | init.constant_(m.bias, 0) 50 | 51 | def forward(self, x): 52 | b, c, _, _ = x.size() 53 | 54 | # Channel attention 55 | y = self.avg_pool(x).view(b, c) 56 | y = self.fc(y).view(b, c, 1, 1) 57 | x = x * y.expand_as(x) 58 | 59 | # Semantic attention 60 | z = self.global_max_pool(x).view(b, c) 61 | z = self.semantic_fc(z).view(b, c, 1, 1) 62 | x = x * z.expand_as(x) 63 | 64 | return x 65 | 66 | if __name__ == 
'__main__': 67 | model = SEAttention() 68 | model.init_weights() 69 | input = torch.randn(1, 512, 7, 7) 70 | output = model(input) 71 | print(output.shape) -------------------------------------------------------------------------------- /generation_idea_template/small_object_attention/code/meta_attention.py: -------------------------------------------------------------------------------- 1 | """ 2 | Extend SEAttention by incorporating a meta-attention mechanism 3 | Implement this by introducing a secondary attention module, such as an additional SE block or a simple linear transformation, to refine the channel weights produced by the original SEAttention 4 | Modify the forward function to apply this meta-attention after the original attention recalibration while maintaining computational efficiency 5 | Evaluate the model's performance by comparing feature maps and conducting quantitative assessments on synthetic datasets, focusing on improvements in attention focus and detection performance on small targets 6 | 7 | """ 8 | 9 | # Modified code 10 | import numpy as np 11 | import torch 12 | from torch import flatten, nn 13 | from torch.nn import init 14 | from torch.nn.modules.activation import ReLU 15 | from torch.nn.modules.batchnorm import BatchNorm2d 16 | from torch.nn import functional as F 17 | 18 | 19 | class SEAttention(nn.Module): 20 | 21 | def __init__(self, channel=512, reduction=16): 22 | super().__init__() 23 | self.avg_pool = nn.AdaptiveAvgPool2d(1) 24 | self.fc = nn.Sequential( 25 | nn.Linear(channel, channel // reduction, bias=False), 26 | nn.ReLU(inplace=True), 27 | nn.Linear(channel // reduction, channel, bias=False), 28 | nn.Sigmoid() 29 | ) 30 | # Meta-attention module: an additional SE block to refine channel weights 31 | self.meta_fc = nn.Sequential( 32 | nn.Linear(channel, channel // reduction, bias=False), 33 | nn.ReLU(inplace=True), 34 | nn.Linear(channel // reduction, channel, bias=False), 35 | nn.Sigmoid() 36 | ) 37 | 38 | def init_weights(self): 39 | for m in self.modules(): 40 | if isinstance(m, nn.Conv2d): 41 | init.kaiming_normal_(m.weight, mode='fan_out') 42 | if m.bias is not None: 43 | init.constant_(m.bias, 0) 44 | elif isinstance(m, nn.BatchNorm2d): 45 | init.constant_(m.weight, 1) 46 | init.constant_(m.bias, 0) 47 | elif isinstance(m, nn.Linear): 48 | init.normal_(m.weight, std=0.001) 49 | if m.bias is not None: 50 | init.constant_(m.bias, 0) 51 | 52 | def forward(self, x): 53 | b, c, _, _ = x.size() 54 | y = self.avg_pool(x).view(b, c) 55 | y = self.fc(y) # Original SE block 56 | # Apply meta-attention to refine channel weights 57 | y_meta = self.meta_fc(y).view(b, c, 1, 1) 58 | return x * y_meta.expand_as(x) 59 | 60 | if __name__ == '__main__': 61 | model = SEAttention() 62 | model.init_weights() 63 | input = torch.randn(1, 512, 7, 7) 64 | output = model(input) 65 | print(output.shape) -------------------------------------------------------------------------------- /generation_idea_template/small_object_attention/code/spatial_channel_attention.py: -------------------------------------------------------------------------------- 1 | """ 2 | Extend SEAttention by adding a spatial attention layer 3 | Implement this by introducing a convolutional layer that outputs a spatial attention map with the same height and width as the input feature map 4 | In the forward function, apply spatial attention by element-wise multiplying the spatial attention map with the input feature map, followed by the existing channel attention 5 | Evaluate the model's effectiveness 
by comparing the output feature maps against those from the original SEAttention, using input tensors of varying scales and complexities 6 | Performance can be assessed by visual inspection of feature maps and quantitative analysis using synthetic datasets if available 7 | 8 | """ 9 | 10 | # Modified code 11 | import numpy as np 12 | import torch 13 | from torch import flatten, nn 14 | from torch.nn import init 15 | from torch.nn.modules.activation import ReLU 16 | from torch.nn.modules.batchnorm import BatchNorm2d 17 | from torch.nn import functional as F 18 | 19 | class SEAttention(nn.Module): 20 | 21 | def __init__(self, channel=512, reduction=16): 22 | super().__init__() 23 | self.avg_pool = nn.AdaptiveAvgPool2d(1) 24 | self.fc = nn.Sequential( 25 | nn.Linear(channel, channel // reduction, bias=False), 26 | nn.ReLU(inplace=True), 27 | nn.Linear(channel // reduction, channel, bias=False), 28 | nn.Sigmoid() 29 | ) 30 | 31 | # Spatial attention layer 32 | self.spatial_conv = nn.Conv2d(channel, 1, kernel_size=7, padding=3, bias=False) 33 | self.spatial_sigmoid = nn.Sigmoid() 34 | 35 | def init_weights(self): 36 | for m in self.modules(): 37 | if isinstance(m, nn.Conv2d): 38 | init.kaiming_normal_(m.weight, mode='fan_out') 39 | if m.bias is not None: 40 | init.constant_(m.bias, 0) 41 | elif isinstance(m, nn.BatchNorm2d): 42 | init.constant_(m.weight, 1) 43 | init.constant_(m.bias, 0) 44 | elif isinstance(m, nn.Linear): 45 | init.normal_(m.weight, std=0.001) 46 | if m.bias is not None: 47 | init.constant_(m.bias, 0) 48 | 49 | def forward(self, x): 50 | # Spatial attention 51 | spatial_att = self.spatial_conv(x) 52 | spatial_att = self.spatial_sigmoid(spatial_att) 53 | x = x * spatial_att 54 | 55 | # Channel attention 56 | b, c, _, _ = x.size() 57 | y = self.avg_pool(x).view(b, c) 58 | y = self.fc(y).view(b, c, 1, 1) 59 | return x * y.expand_as(x) 60 | 61 | if __name__ == '__main__': 62 | model = SEAttention() 63 | model.init_weights() 64 | input = torch.randn(1, 512, 7, 7) 65 | output = model(input) 66 | print(output.shape) -------------------------------------------------------------------------------- /generation_idea_template/small_object_attention/code/simulated_temporal_attention.py: -------------------------------------------------------------------------------- 1 | """ 2 | Extend SEAttention by simulating temporal attention using a sliding window approach on spatial feature maps 3 | Implement this by adding a mechanism that divides feature maps into non-overlapping subregions, treating each as a pseudo-temporal step, and applies attention across these regions using a shared attention mechanism 4 | This should be integrated into the forward function following the channel attention 5 | Evaluate by testing the model on datasets where small targets are embedded in varying spatial contexts within a single image, with performance assessed through quantitative metrics and visualization of feature map focus areas to compare against the original model 6 | 7 | """ 8 | 9 | # Modified code 10 | 11 | import numpy as np 12 | import torch 13 | from torch import flatten, nn 14 | from torch.nn import init 15 | from torch.nn.modules.activation import ReLU 16 | from torch.nn.modules.batchnorm import BatchNorm2d 17 | from torch.nn import functional as F 18 | 19 | 20 | class SEAttention(nn.Module): 21 | 22 | def __init__(self, channel=512, reduction=16, window_size=2): 23 | super().__init__() 24 | self.avg_pool = nn.AdaptiveAvgPool2d(1) 25 | self.fc = nn.Sequential( 26 | nn.Linear(channel, channel // 
reduction, bias=False), 27 | nn.ReLU(inplace=True), 28 | nn.Linear(channel // reduction, channel, bias=False), 29 | nn.Sigmoid() 30 | ) 31 | self.window_size = window_size 32 | 33 | def init_weights(self): 34 | for m in self.modules(): 35 | if isinstance(m, nn.Conv2d): 36 | init.kaiming_normal_(m.weight, mode='fan_out') 37 | if m.bias is not None: 38 | init.constant_(m.bias, 0) 39 | elif isinstance(m, nn.BatchNorm2d): 40 | init.constant_(m.weight, 1) 41 | init.constant_(m.bias, 0) 42 | elif isinstance(m, nn.Linear): 43 | init.normal_(m.weight, std=0.001) 44 | if m.bias is not None: 45 | init.constant_(m.bias, 0) 46 | 47 | def forward(self, x): 48 | b, c, h, w = x.size() 49 | y = self.avg_pool(x).view(b, c) 50 | y = self.fc(y).view(b, c, 1, 1) 51 | x = x * y.expand_as(x) 52 | 53 | # Simulating temporal attention via sliding window 54 | sw = self.window_size 55 | for i in range(0, h, sw): 56 | for j in range(0, w, sw): 57 | subregion = x[:, :, i:i+sw, j:j+sw] 58 | pooled = subregion.mean(dim=(2, 3), keepdim=True) 59 | x[:, :, i:i+sw, j:j+sw] = subregion * pooled 60 | 61 | return x 62 | 63 | if __name__ == '__main__': 64 | model = SEAttention() 65 | model.init_weights() 66 | input = torch.randn(1, 512, 7, 7) 67 | output = model(input) 68 | print(output.shape) -------------------------------------------------------------------------------- /generation_idea_template/coordattention-gemini/code/separate_learnable_fusion.py: -------------------------------------------------------------------------------- 1 | """ 2 | Modify the `CoordAtt` module 3 | After pooling height and width features, apply a separate 1x1 convolution to each of them 4 | Then, perform an element-wise addition of the transformed height and width features 5 | Feed the result to the shared `conv1` 6 | In the `__init__` function, add two 1x1 convolution layers, one for height and one for width 7 | In the `forward` function, implement the separate convolutions and element-wise addition before the shared `conv1` 8 | The rest of the forward pass remains the same 9 | Compare output with the baseline using same test input, observe changes 10 | 11 | """ 12 | 13 | import torch 14 | import torch.nn as nn 15 | import torch.nn.functional as F 16 | 17 | 18 | class h_sigmoid(nn.Module): 19 | def __init__(self, inplace=True): 20 | super(h_sigmoid, self).__init__() 21 | self.relu = nn.ReLU6(inplace=inplace) 22 | 23 | def forward(self, x): 24 | return self.relu(x + 3) / 6 25 | 26 | 27 | class h_swish(nn.Module): 28 | def __init__(self, inplace=True): 29 | super(h_swish, self).__init__() 30 | self.sigmoid = h_sigmoid(inplace=inplace) 31 | 32 | def forward(self, x): 33 | return x * self.sigmoid(x) 34 | 35 | 36 | class CoordAtt(nn.Module): 37 | def __init__(self, inp, reduction=32): 38 | super(CoordAtt, self).__init__() 39 | self.pool_h = nn.AdaptiveAvgPool2d((None, 1)) 40 | self.pool_w = nn.AdaptiveAvgPool2d((1, None)) 41 | 42 | mip = max(8, inp // reduction) 43 | 44 | self.conv_h_proj = nn.Conv2d(inp, mip, kernel_size=1, stride=1, padding=0) 45 | self.conv_w_proj = nn.Conv2d(inp, mip, kernel_size=1, stride=1, padding=0) 46 | 47 | 48 | self.conv1 = nn.Conv2d(mip, mip, kernel_size=1, stride=1, padding=0) 49 | self.bn1 = nn.BatchNorm2d(mip) 50 | self.act = h_swish() 51 | 52 | self.conv_h = nn.Conv2d(mip, inp, kernel_size=1, stride=1, padding=0) 53 | self.conv_w = nn.Conv2d(mip, inp, kernel_size=1, stride=1, padding=0) 54 | 55 | def forward(self, x): 56 | identity = x 57 | 58 | n, c, h, w = x.size() 59 | x_h = self.pool_h(x) 60 | x_w = 
self.pool_w(x).permute(0, 1, 3, 2)
61 | 
62 |         x_h_proj = self.conv_h_proj(x_h)
63 |         x_w_proj = self.conv_w_proj(x_w)
64 | 
65 |         y = x_h_proj + x_w_proj.permute(0, 1, 3, 2)  # broadcast addition yields a fused (n, mip, h, w) map
66 | 
67 |         y = self.conv1(y)
68 |         y = self.bn1(y)
69 |         y = self.act(y)
70 | 
71 |         x_h = y.mean(dim=3, keepdim=True)  # aggregate the fused map into a per-height descriptor (n, mip, h, 1)
72 |         x_w = y.mean(dim=2, keepdim=True)  # aggregate the fused map into a per-width descriptor (n, mip, 1, w)
73 | 
74 |         a_h = self.conv_h(x_h).sigmoid()
75 |         a_w = self.conv_w(x_w).sigmoid()
76 | 
77 |         out = identity * a_w * a_h
78 | 
79 |         return out
80 | 
81 | if __name__ == '__main__':
82 |     x = torch.randn(2, 64, 32, 32)
83 |     att = CoordAtt(inp=64, reduction=32)
84 |     out = att(x)
85 |     print("输入尺寸:", x.shape)
86 |     print("输出尺寸:", out.shape)
--------------------------------------------------------------------------------
/generation_idea_template/coordattention/code/xxx-cross_dim_coordatt.py:
--------------------------------------------------------------------------------
1 | """
2 | Modify the CoordAtt class to include a cross-dimensional attention mechanism
3 | Implement an attention mechanism that captures interactions across both height and width dimensions simultaneously using a form of matrix multiplication
4 | Adjust the forward method to compute these attention weights and integrate them with the existing coordinate attention features before applying them to the input
5 | Evaluate the impact on feature representation by testing on a small benchmark dataset, comparing performance and computational efficiency against the original CoordAtt and other variants
6 | 
7 | """
8 | 
9 | # Modified code
10 | 
11 | import torch
12 | import torch.nn as nn
13 | import torch.nn.functional as F
14 | 
15 | class h_sigmoid(nn.Module):
16 |     def __init__(self, inplace=True):
17 |         super(h_sigmoid, self).__init__()
18 |         self.relu = nn.ReLU6(inplace=inplace)
19 | 
20 |     def forward(self, x):
21 |         return self.relu(x + 3) / 6
22 | 
23 | class h_swish(nn.Module):
24 |     def __init__(self, inplace=True):
25 |         super(h_swish, self).__init__()
26 |         self.sigmoid = h_sigmoid(inplace=inplace)
27 | 
28 |     def forward(self, x):
29 |         return x * self.sigmoid(x)
30 | 
31 | class CoordAtt(nn.Module):
32 |     def __init__(self, inp, reduction=32):
33 |         super(CoordAtt, self).__init__()
34 |         self.pool_h = nn.AdaptiveAvgPool2d((None, 1))
35 |         self.pool_w = nn.AdaptiveAvgPool2d((1, None))
36 | 
37 |         mip = max(8, inp // reduction)
38 | 
39 |         self.conv1 = nn.Conv2d(inp, mip, kernel_size=1, stride=1, padding=0)
40 |         self.bn1 = nn.BatchNorm2d(mip)
41 |         self.act = h_swish()
42 | 
43 |         self.conv_h = nn.Conv2d(mip, inp, kernel_size=1, stride=1, padding=0)
44 |         self.conv_w = nn.Conv2d(mip, inp, kernel_size=1, stride=1, padding=0)
45 | 
46 |         # Improved Cross-Dimensional Attention
47 |         self.attention_conv = nn.Conv2d(inp, inp, kernel_size=1, stride=1, padding=0)
48 | 
49 |     def forward(self, x):
50 |         identity = x
51 | 
52 |         n, c, h, w = x.size()
53 |         x_h = self.pool_h(x)
54 |         x_w = self.pool_w(x).permute(0, 1, 3, 2)
55 | 
56 |         y = torch.cat([x_h, x_w], dim=2)
57 |         y = self.conv1(y)
58 |         y = self.bn1(y)
59 |         y = self.act(y)
60 | 
61 |         x_h, x_w = torch.split(y, [h, w], dim=2)
62 |         x_w = x_w.permute(0, 1, 3, 2)
63 | 
64 |         a_h = self.conv_h(x_h).sigmoid()
65 |         a_w = self.conv_w(x_w).sigmoid()
66 | 
67 |         # Compute cross-dimensional attention via convolution
68 |         attention_weights = self.attention_conv(identity).sigmoid()
69 | 
70 |         out = identity * a_w * a_h * attention_weights
71 | 
72 |         return out
73 | 
74 | if __name__ == '__main__':
75 |     x = torch.randn(2, 64, 32, 32)
76 |     att = CoordAtt(inp=64, reduction=32)
77 |     out = att(x)
78 |     print("输入尺寸:", x.shape)
79 | print("输出尺寸:", out.shape) -------------------------------------------------------------------------------- /generation_idea_template/small_object_attention/code/cross_channel_attention.py: -------------------------------------------------------------------------------- 1 | """ 2 | Extend SEAttention by integrating a cross-channel attention mechanism using a multi-head attention layer 3 | This layer computes interactions between channels to create a comprehensive attention map that enhances feature recalibration 4 | Modify the forward function to apply this cross-channel attention before the existing channel attention 5 | Evaluate the model by comparing outputs with those from SEAttention and other modifications, using attention map visualizations and performance on synthetic datasets designed to test inter-channel dependencies 6 | 7 | """ 8 | 9 | # Modified code 10 | import numpy as np 11 | import torch 12 | from torch import flatten, nn 13 | from torch.nn import init 14 | from torch.nn.modules.activation import ReLU 15 | from torch.nn.modules.batchnorm import BatchNorm2d 16 | from torch.nn import functional as F 17 | 18 | class SEAttention(nn.Module): 19 | def __init__(self, channel=512, reduction=16, num_heads=8): 20 | super().__init__() 21 | self.avg_pool = nn.AdaptiveAvgPool2d(1) 22 | self.fc = nn.Sequential( 23 | nn.Linear(channel, channel // reduction, bias=False), 24 | nn.ReLU(inplace=True), 25 | nn.Linear(channel // reduction, channel, bias=False), 26 | nn.Sigmoid() 27 | ) 28 | # Cross-channel multi-head attention 29 | self.multihead_attn = nn.MultiheadAttention(embed_dim=channel, num_heads=num_heads, batch_first=True) 30 | 31 | def init_weights(self): 32 | for m in self.modules(): 33 | if isinstance(m, nn.Conv2d): 34 | init.kaiming_normal_(m.weight, mode='fan_out') 35 | if m.bias is not None: 36 | init.constant_(m.bias, 0) 37 | elif isinstance(m, nn.BatchNorm2d): 38 | init.constant_(m.weight, 1) 39 | init.constant_(m.bias, 0) 40 | elif isinstance(m, nn.Linear): 41 | init.normal_(m.weight, std=0.001) 42 | if m.bias is not None: 43 | init.constant_(m.bias, 0) 44 | 45 | def forward(self, x): 46 | b, c, h, w = x.size() 47 | 48 | # Reshape and transpose for multi-head attention 49 | x_flat = x.view(b, c, h * w).transpose(1, 2) # shape: (b, hw, c) 50 | 51 | # Apply multi-head attention 52 | attn_output, _ = self.multihead_attn(x_flat, x_flat, x_flat) 53 | attn_output = attn_output.transpose(1, 2).view(b, c, h, w) # reshape back to original input shape 54 | 55 | # Existing SEAttention mechanism 56 | y = self.avg_pool(attn_output).view(b, c) 57 | y = self.fc(y).view(b, c, 1, 1) 58 | 59 | return attn_output * y.expand_as(attn_output) 60 | 61 | if __name__ == '__main__': 62 | model = SEAttention() 63 | model.init_weights() 64 | input = torch.randn(1, 512, 7, 7) 65 | output = model(input) 66 | print(output.shape) -------------------------------------------------------------------------------- /generation_idea_template/small_object_attention/code/dynamic_data_transformation.py: -------------------------------------------------------------------------------- 1 | """ 2 | Develop a dynamic data transformation module that learns to apply optimal transformations to input data to improve small target visibility 3 | Integrate this module into the SEAttention model by preprocessing input data before the attention mechanism 4 | Evaluate the model's performance by comparing precision, recall, and F1-score on small target detection tasks with and without the transformation module 5 | Analyze the 
effectiveness of different transformations in enhancing small target detection 6 | 7 | """ 8 | 9 | # Modified code 10 | import numpy as np 11 | import torch 12 | from torch import flatten, nn 13 | from torch.nn import init 14 | from torch.nn.modules.activation import ReLU 15 | from torch.nn.modules.batchnorm import BatchNorm2d 16 | from torch.nn import functional as F 17 | 18 | 19 | class DynamicDataTransformation(nn.Module): 20 | def __init__(self, channel): 21 | super().__init__() 22 | self.transform = nn.Sequential( 23 | nn.Conv2d(channel, channel, kernel_size=3, stride=1, padding=1, bias=False), 24 | nn.BatchNorm2d(channel), 25 | nn.ReLU(inplace=True), 26 | nn.Conv2d(channel, channel, kernel_size=3, stride=1, padding=1, bias=False), 27 | nn.BatchNorm2d(channel), 28 | nn.Sigmoid() 29 | ) 30 | 31 | def forward(self, x): 32 | return x * self.transform(x) 33 | 34 | 35 | class SEAttention(nn.Module): 36 | 37 | def __init__(self, channel=512, reduction=16): 38 | super().__init__() 39 | self.data_transform = DynamicDataTransformation(channel) 40 | self.avg_pool = nn.AdaptiveAvgPool2d(1) 41 | self.fc = nn.Sequential( 42 | nn.Linear(channel, channel // reduction, bias=False), 43 | nn.ReLU(inplace=True), 44 | nn.Linear(channel // reduction, channel, bias=False), 45 | nn.Sigmoid() 46 | ) 47 | 48 | def init_weights(self): 49 | for m in self.modules(): 50 | if isinstance(m, nn.Conv2d): 51 | init.kaiming_normal_(m.weight, mode='fan_out') 52 | if m.bias is not None: 53 | init.constant_(m.bias, 0) 54 | elif isinstance(m, nn.BatchNorm2d): 55 | init.constant_(m.weight, 1) 56 | init.constant_(m.bias, 0) 57 | elif isinstance(m, nn.Linear): 58 | init.normal_(m.weight, std=0.001) 59 | if m.bias is not None: 60 | init.constant_(m.bias, 0) 61 | 62 | def forward(self, x): 63 | x = self.data_transform(x) 64 | b, c, _, _ = x.size() 65 | y = self.avg_pool(x).view(b, c) 66 | y = self.fc(y).view(b, c, 1, 1) 67 | return x * y.expand_as(x) 68 | 69 | if __name__ == '__main__': 70 | model = SEAttention() 71 | model.init_weights() 72 | input = torch.randn(1, 512, 7, 7) 73 | output = model(input) 74 | print(output.shape) -------------------------------------------------------------------------------- /generation_idea_template/small_object_attention/code/dual_attention_integration.py: -------------------------------------------------------------------------------- 1 | """ 2 | Expand SEAttention to include a spatial attention mechanism 3 | Implement spatial attention by adding two convolutional layers, followed by a softmax activation to produce a spatial attention map 4 | Combine the spatial attention map with the SE channel attention output through element-wise multiplication 5 | Evaluate performance improvements using metrics such as precision, recall, and F1-score on small target detection tasks 6 | Compare these results to the baseline SEAttention model's performance 7 | 8 | """ 9 | 10 | import numpy as np 11 | import torch 12 | from torch import flatten, nn 13 | from torch.nn import init 14 | from torch.nn.modules.activation import ReLU 15 | from torch.nn.modules.batchnorm import BatchNorm2d 16 | from torch.nn import functional as F 17 | 18 | 19 | class SEAttentionWithSpatial(nn.Module): 20 | 21 | def __init__(self, channel=512, reduction=16): 22 | super().__init__() 23 | # Channel Attention 24 | self.avg_pool = nn.AdaptiveAvgPool2d(1) 25 | self.fc = nn.Sequential( 26 | nn.Linear(channel, channel // reduction, bias=False), 27 | nn.ReLU(inplace=True), 28 | nn.Linear(channel // reduction, channel, bias=False), 29 | 
nn.Sigmoid() 30 | ) 31 | # Spatial Attention 32 | self.spatial_conv1 = nn.Conv2d(channel, channel // 8, kernel_size=7, padding=3, bias=False) 33 | self.spatial_conv2 = nn.Conv2d(channel // 8, 1, kernel_size=7, padding=3, bias=False) 34 | self.softmax = nn.Softmax(dim=2) 35 | 36 | def init_weights(self): 37 | for m in self.modules(): 38 | if isinstance(m, nn.Conv2d): 39 | init.kaiming_normal_(m.weight, mode='fan_out') 40 | if m.bias is not None: 41 | init.constant_(m.bias, 0) 42 | elif isinstance(m, nn.BatchNorm2d): 43 | init.constant_(m.weight, 1) 44 | init.constant_(m.bias, 0) 45 | elif isinstance(m, nn.Linear): 46 | init.normal_(m.weight, std=0.001) 47 | if m.bias is not None: 48 | init.constant_(m.bias, 0) 49 | 50 | def forward(self, x): 51 | b, c, h, w = x.size() 52 | 53 | # Channel Attention 54 | y_c = self.avg_pool(x).view(b, c) 55 | y_c = self.fc(y_c).view(b, c, 1, 1) 56 | channel_attention = x * y_c.expand_as(x) 57 | 58 | # Spatial Attention 59 | y_s = self.spatial_conv1(channel_attention) 60 | y_s = F.relu(y_s) 61 | y_s = self.spatial_conv2(y_s) 62 | y_s = self.softmax(y_s.view(b, 1, h * w)).view(b, 1, h, w) 63 | 64 | # Combined Attention 65 | combined_attention = channel_attention * y_s 66 | 67 | return combined_attention 68 | 69 | if __name__ == '__main__': 70 | model = SEAttentionWithSpatial() 71 | model.init_weights() 72 | input = torch.randn(1, 512, 7, 7) 73 | output = model(input) 74 | print(output.shape) -------------------------------------------------------------------------------- /generation_idea_template/coordattention/code/xxx-hierarchical_coordatt.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implement a hierarchical structure within the CoordAtt module, using two sequential layers 3 | The first layer applies channel-wise attention with 1x1 convolutions, focusing on channel dependencies 4 | The second layer implements spatial attention with adaptive pooling and convolutions to capture spatial dependencies 5 | Adjust the forward method to process these layers in sequence 6 | Evaluate the enhanced attention mechanism by testing on a small benchmark dataset, comparing improvements in feature representation and performance to the original CoordAtt while monitoring computational efficiency 7 | 8 | """ 9 | 10 | # Modified code 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | 15 | class h_sigmoid(nn.Module): 16 | def __init__(self, inplace=True): 17 | super(h_sigmoid, self).__init__() 18 | self.relu = nn.ReLU6(inplace=inplace) 19 | 20 | def forward(self, x): 21 | return self.relu(x + 3) / 6 22 | 23 | class h_swish(nn.Module): 24 | def __init__(self, inplace=True): 25 | super(h_swish, self).__init__() 26 | self.sigmoid = h_sigmoid(inplace=inplace) 27 | 28 | def forward(self, x): 29 | return x * self.sigmoid(x) 30 | 31 | class CoordAtt(nn.Module): 32 | def __init__(self, inp, reduction=32): 33 | super(CoordAtt, self).__init__() 34 | # First layer: channel-wise attention 35 | mip = max(8, inp // reduction) 36 | self.conv1 = nn.Conv2d(inp, mip, kernel_size=1, stride=1, padding=0) 37 | self.bn1 = nn.BatchNorm2d(mip) 38 | self.act1 = h_swish() 39 | 40 | # Second layer: spatial attention 41 | self.pool_h = nn.AdaptiveAvgPool2d((None, 1)) 42 | self.pool_w = nn.AdaptiveAvgPool2d((1, None)) 43 | self.conv_h = nn.Conv2d(mip, mip, kernel_size=1, stride=1, padding=0) 44 | self.conv_w = nn.Conv2d(mip, mip, kernel_size=1, stride=1, padding=0) 45 | self.bn2 = nn.BatchNorm2d(mip) 46 | self.act2 = 
h_swish()
47 | 
48 |         self.conv_out = nn.Conv2d(mip, inp, kernel_size=1, stride=1, padding=0)
49 | 
50 |     def forward(self, x):
51 |         identity = x
52 | 
53 |         # Channel-wise attention
54 |         y = self.conv1(x)
55 |         y = self.bn1(y)
56 |         y = self.act1(y)
57 | 
58 |         # Spatial attention
59 |         n, c, h, w = y.size()
60 |         x_h = self.pool_h(y)
61 |         x_w = self.pool_w(y).permute(0, 1, 3, 2)
62 |         y = torch.cat([x_h, x_w], dim=2)
63 |         y = self.bn2(y)
64 |         y = self.act2(y)
65 | 
66 |         x_h, x_w = torch.split(y, [h, w], dim=2)
67 |         x_w = x_w.permute(0, 1, 3, 2)
68 | 
69 |         a_h = self.conv_h(x_h)
70 |         a_w = self.conv_w(x_w)
71 | 
72 |         # Fuse the axis-wise maps in the reduced (mip-channel) space, project
73 |         # back to the input channel count with conv_out, then gate the identity
74 |         att = self.conv_out(a_h * a_w).sigmoid()
75 |         out = identity * att
76 | 
77 |         return out
78 | 
79 | if __name__ == '__main__':
80 |     x = torch.randn(2, 64, 32, 32)
81 |     att = CoordAtt(inp=64, reduction=32)
82 |     out = att(x)
83 |     print("输入尺寸:", x.shape)
84 |     print("输出尺寸:", out.shape)
--------------------------------------------------------------------------------
/generation_idea_template/small_object_attention/code/multi_resolution_attention.py:
--------------------------------------------------------------------------------
1 | """
2 | Extend SEAttention by implementing a multi-resolution attention mechanism
3 | Create two versions of the input feature map: the original and a single downsampled version
4 | Apply the SEAttention block to each version, and then upsample the downsampled attention-weighted feature map back to the original resolution
5 | Combine these maps to form a final attention map
6 | Modify the forward function to include these steps while optimizing for computational efficiency
7 | Evaluate the effectiveness by comparing detection performance on small targets using synthetic datasets, assessing both qualitative and quantitative improvements over the baseline SEAttention
8 | 
9 | """
10 | 
11 | # Modified code
12 | 
13 | import numpy as np
14 | import torch
15 | from torch import nn
16 | from torch.nn import init
17 | from torch.nn import functional as F
18 | 
19 | class SEAttention(nn.Module):
20 | 
21 |     def __init__(self, channel=512, reduction=16):
22 |         super().__init__()
23 |         self.avg_pool = nn.AdaptiveAvgPool2d(1)
24 |         self.fc = nn.Sequential(
25 |             nn.Linear(channel, channel // reduction, bias=False),
26 |             nn.ReLU(inplace=True),
27 |             nn.Linear(channel // reduction, channel, bias=False),
28 |             nn.Sigmoid()
29 |         )
30 | 
31 |     def init_weights(self):
32 |         for m in self.modules():
33 |             if isinstance(m, nn.Conv2d):
34 |                 init.kaiming_normal_(m.weight, mode='fan_out')
35 |                 if m.bias is not None:
36 |                     init.constant_(m.bias, 0)
37 |             elif isinstance(m, nn.BatchNorm2d):
38 |                 init.constant_(m.weight, 1)
39 |                 init.constant_(m.bias, 0)
40 |             elif isinstance(m, nn.Linear):
41 |                 init.normal_(m.weight, std=0.001)
42 |                 if m.bias is not None:
43 |                     init.constant_(m.bias, 0)
44 | 
45 |     def forward(self, x):
46 |         b, c, h, w = x.size()
47 | 
48 |         # Original SEAttention on the original feature map
49 |         y1 = self.avg_pool(x).view(b, c)
50 |         y1 = self.fc(y1).view(b, c, 1, 1)
51 |         out1 = x * y1.expand_as(x)
52 | 
53 |         # Downsample the feature map using interpolation
54 |         x_down = F.interpolate(x, scale_factor=0.5, mode='bilinear', align_corners=False)
55 | 
56 |         # SEAttention on the downsampled feature map
57 |         y2 = self.avg_pool(x_down).view(b, c)
58 |         y2 = self.fc(y2).view(b, c, 1, 1)
59 |         out2 = x_down * y2.expand_as(x_down)
60 | 
61 |         # Upsample back to the original resolution
62 |         out2_upsampled = F.interpolate(out2, size=(h, w), mode='bilinear', align_corners=False)
63 | 
64 |         # Combine attention
maps 65 | out_combined = out1 + out2_upsampled 66 | 67 | return out_combined 68 | 69 | if __name__ == '__main__': 70 | model = SEAttention() 71 | model.init_weights() 72 | input = torch.randn(1, 512, 7, 7) 73 | output = model(input) 74 | print(output.shape) 75 | 76 | # I am done -------------------------------------------------------------------------------- /generation_idea_template/small_object_attention/code/contrastive_attention_enhancement.py: -------------------------------------------------------------------------------- 1 | """ 2 | Enhance SEAttention by integrating contrastive learning to improve spatial awareness 3 | Implement this by creating pairs of feature maps: one with SEAttention applied and one without 4 | Use a contrastive loss function to train the model to differentiate between these maps, emphasizing small target detection 5 | Modify the forward function to support this training regime 6 | Evaluate the model by comparing the contrastive loss and visualizing the attention focus on small targets, demonstrating improved spatial discrimination over the baseline model 7 | 8 | """ 9 | 10 | # Modified code 11 | import numpy as np 12 | import torch 13 | from torch import flatten, nn 14 | from torch.nn import init 15 | from torch.nn.modules.activation import ReLU 16 | from torch.nn.modules.batchnorm import BatchNorm2d 17 | from torch.nn import functional as F 18 | from torch.nn import CosineSimilarity 19 | 20 | class SEAttention(nn.Module): 21 | 22 | def __init__(self, channel=512, reduction=16): 23 | super().__init__() 24 | self.avg_pool = nn.AdaptiveAvgPool2d(1) 25 | self.fc = nn.Sequential( 26 | nn.Linear(channel, channel // reduction, bias=False), 27 | nn.ReLU(inplace=True), 28 | nn.Linear(channel // reduction, channel, bias=False), 29 | nn.Sigmoid() 30 | ) 31 | # Initialize cosine similarity for contrastive learning 32 | self.cos_sim = CosineSimilarity(dim=1) 33 | 34 | def init_weights(self): 35 | for m in self.modules(): 36 | if isinstance(m, nn.Conv2d): 37 | init.kaiming_normal_(m.weight, mode='fan_out') 38 | if m.bias is not None: 39 | init.constant_(m.bias, 0) 40 | elif isinstance(m, nn.BatchNorm2d): 41 | init.constant_(m.weight, 1) 42 | init.constant_(m.bias, 0) 43 | elif isinstance(m, nn.Linear): 44 | init.normal_(m.weight, std=0.001) 45 | if m.bias is not None: 46 | init.constant_(m.bias, 0) 47 | 48 | def forward(self, x): 49 | b, c, _, _ = x.size() 50 | 51 | # SEAttention applied map 52 | y = self.avg_pool(x).view(b, c) 53 | y = self.fc(y).view(b, c, 1, 1) 54 | se_attention_map = x * y.expand_as(x) 55 | 56 | # Original map without SEAttention 57 | original_map = x 58 | 59 | # Calculate cosine similarity between the two maps 60 | similarity = self.cos_sim(se_attention_map, original_map) 61 | 62 | # Contrastive loss: encourage high similarity 63 | contrastive_loss = 1 - similarity.mean() # Using 1 - cosine similarity as a simple contrastive loss 64 | 65 | return se_attention_map, contrastive_loss 66 | 67 | if __name__ == '__main__': 68 | model = SEAttention() 69 | model.init_weights() 70 | input = torch.randn(1, 512, 7, 7) 71 | output, contrastive_loss = model(input) 72 | print("Output shape:", output.shape) 73 | print("Contrastive Loss:", contrastive_loss.item()) 74 | 75 | # I am done -------------------------------------------------------------------------------- /generation_idea_template/coordattention/code/xxa-depthwise_coordatt.py: -------------------------------------------------------------------------------- 1 | """ 2 | Modify the `CoordAtt` class to replace 
standard convolutions with depthwise separable convolutions 3 | Implement a new class `DepthwiseSeparableConv` and use it to replace `conv1`, `conv_h`, and `conv_w` 4 | Each depthwise separable convolution consists of a depthwise convolution followed by a pointwise convolution 5 | Evaluate the efficiency improvements by measuring parameter count and computational time, and compare the accuracy on a small benchmark dataset against the original implementation 6 | 7 | """ 8 | 9 | # 创新不足,但可以用来降低计算量 10 | 11 | # Modified code 12 | import torch 13 | import torch.nn as nn 14 | import torch.nn.functional as F 15 | 16 | 17 | class h_sigmoid(nn.Module): 18 | def __init__(self, inplace=True): 19 | super(h_sigmoid, self).__init__() 20 | self.relu = nn.ReLU6(inplace=inplace) 21 | 22 | def forward(self, x): 23 | return self.relu(x + 3) / 6 24 | 25 | 26 | class h_swish(nn.Module): 27 | def __init__(self, inplace=True): 28 | super(h_swish, self).__init__() 29 | self.sigmoid = h_sigmoid(inplace=inplace) 30 | 31 | def forward(self, x): 32 | return x * self.sigmoid(x) 33 | 34 | 35 | class DepthwiseSeparableConv(nn.Module): 36 | def __init__(self, inp, oup, kernel_size=1, stride=1, padding=0): 37 | super(DepthwiseSeparableConv, self).__init__() 38 | self.depthwise = nn.Conv2d(inp, inp, kernel_size=kernel_size, stride=stride, padding=padding, groups=inp) 39 | self.pointwise = nn.Conv2d(inp, oup, kernel_size=1, stride=1, padding=0) 40 | 41 | def forward(self, x): 42 | x = self.depthwise(x) 43 | x = self.pointwise(x) 44 | return x 45 | 46 | 47 | class CoordAtt(nn.Module): 48 | def __init__(self, inp, reduction=32): 49 | super(CoordAtt, self).__init__() 50 | self.pool_h = nn.AdaptiveAvgPool2d((None, 1)) 51 | self.pool_w = nn.AdaptiveAvgPool2d((1, None)) 52 | 53 | mip = max(8, inp // reduction) 54 | 55 | self.conv1 = DepthwiseSeparableConv(inp, mip, kernel_size=1, stride=1, padding=0) 56 | self.bn1 = nn.BatchNorm2d(mip) 57 | self.act = h_swish() 58 | 59 | self.conv_h = DepthwiseSeparableConv(mip, inp, kernel_size=1, stride=1, padding=0) 60 | self.conv_w = DepthwiseSeparableConv(mip, inp, kernel_size=1, stride=1, padding=0) 61 | 62 | def forward(self, x): 63 | identity = x 64 | 65 | n, c, h, w = x.size() 66 | x_h = self.pool_h(x) 67 | x_w = self.pool_w(x).permute(0, 1, 3, 2) 68 | 69 | y = torch.cat([x_h, x_w], dim=2) 70 | y = self.conv1(y) 71 | y = self.bn1(y) 72 | y = self.act(y) 73 | 74 | x_h, x_w = torch.split(y, [h, w], dim=2) 75 | x_w = x_w.permute(0, 1, 3, 2) 76 | 77 | a_h = self.conv_h(x_h).sigmoid() 78 | a_w = self.conv_w(x_w).sigmoid() 79 | 80 | out = identity * a_w * a_h 81 | 82 | return out 83 | 84 | if __name__ == '__main__': 85 | x = torch.randn(2, 64, 32, 32) 86 | att = CoordAtt(inp=64, reduction=32) 87 | out = att(x) 88 | print("输入尺寸:", x.shape) 89 | print("输出尺寸:", out.shape) -------------------------------------------------------------------------------- /generation_idea_template/small_object_attention/code/quality_adaptive_attention.py: -------------------------------------------------------------------------------- 1 | """ 2 | Develop a quality assessment module that computes a quality score for each input feature map using metrics like noise level or sharpness 3 | Integrate this module into the SEAttention class, modifying the attention weights based on quality scores 4 | Implement this by adding a quality assessment function and updating the forward pass of SEAttention to apply adaptive modulation of attention weights 5 | Evaluate performance improvements using precision, recall, and F1-score on 
small target detection tasks, and compare results with the baseline SEAttention model and other enhanced models
6 | 
7 | """
8 | 
9 | # Refined code
10 | import numpy as np
11 | import torch
12 | from torch import nn
13 | from torch.nn import init
14 | from torch.nn import functional as F
15 | 
16 | 
17 | class SEAttention(nn.Module):
18 | 
19 |     def __init__(self, channel=512, reduction=16):
20 |         super().__init__()
21 |         self.avg_pool = nn.AdaptiveAvgPool2d(1)
22 |         self.fc = nn.Sequential(
23 |             nn.Linear(channel, channel // reduction, bias=False),
24 |             nn.ReLU(inplace=True),
25 |             nn.Linear(channel // reduction, channel, bias=False),
26 |             nn.Sigmoid()
27 |         )
28 |         # Laplacian kernel for sharpness, registered as a buffer so it follows the module's device
29 |         self.register_buffer('laplacian_kernel', torch.tensor([[[[-1, -1, -1],
30 |                                                                  [-1, 8, -1],
31 |                                                                  [-1, -1, -1]]]], dtype=torch.float32))
32 | 
33 |     def compute_quality_score(self, x):
34 |         # Apply the kernel depthwise (one copy per channel) so conv2d accepts any channel count
35 |         kernel = self.laplacian_kernel.repeat(x.size(1), 1, 1, 1)
36 |         laplacian = F.conv2d(x, kernel, padding=1, groups=x.size(1))
37 |         quality_score = torch.sigmoid(laplacian.var(dim=[2, 3], keepdim=True))  # Normalize to [0, 1]
38 |         return quality_score
39 | 
40 |     def init_weights(self):
41 |         for m in self.modules():
42 |             if isinstance(m, nn.Conv2d):
43 |                 init.kaiming_normal_(m.weight, mode='fan_out')
44 |                 if m.bias is not None:
45 |                     init.constant_(m.bias, 0)
46 |             elif isinstance(m, nn.BatchNorm2d):
47 |                 init.constant_(m.weight, 1)
48 |                 init.constant_(m.bias, 0)
49 |             elif isinstance(m, nn.Linear):
50 |                 init.normal_(m.weight, std=0.001)
51 |                 if m.bias is not None:
52 |                     init.constant_(m.bias, 0)
53 | 
54 |     def forward(self, x):
55 |         b, c, _, _ = x.size()
56 |         y = self.avg_pool(x).view(b, c)
57 |         y = self.fc(y).view(b, c, 1, 1)
58 | 
59 |         # Compute quality score and adjust attention weights
60 |         quality_score = self.compute_quality_score(x)
61 |         adjusted_y = y * quality_score
62 | 
63 |         return x * adjusted_y.expand_as(x)
64 | 
65 | 
66 | if __name__ == '__main__':
67 |     # Initialize the model and weights
68 |     model = SEAttention()
69 |     model.init_weights()
70 | 
71 |     # Test the model with a random input
72 |     input = torch.randn(1, 512, 7, 7)
73 |     output = model(input)
74 |     print(output.shape)
--------------------------------------------------------------------------------
/generation_idea_template/coordattention/code/xaa-global_context_coordatt.py:
--------------------------------------------------------------------------------
1 | """
2 | Introduce a Global Context Block within the CoordAtt module
3 | Implement global average pooling on the input feature map to extract global context vectors
4 | Use these vectors to modulate the attention weights by integrating them with the existing coordinate attention features
5 | Modify the forward method to include this global context before applying the spatial attention mechanism
6 | Evaluate the impact on feature representation and performance using a small benchmark dataset, comparing the results with the original CoordAtt and other variants
7 | 
8 | """
9 | 
10 | # Modified code
11 | 
12 | import torch
13 | import torch.nn as nn
14 | import torch.nn.functional as F
15 | 
16 | class h_sigmoid(nn.Module):
17 |     def __init__(self, inplace=True):
18 |         super(h_sigmoid, self).__init__()
19 |         self.relu = nn.ReLU6(inplace=inplace)
20 | 
21 |     def forward(self, x):
22 |         return self.relu(x + 3) / 6
23 | 
24 | class h_swish(nn.Module):
25 |     def __init__(self, inplace=True):
26 |         super(h_swish, self).__init__()
27 |         self.sigmoid = h_sigmoid(inplace=inplace)
28 | 
29 |     def forward(self, x):
30 | 
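        # h-swish (as in MobileNetV3): x * h_sigmoid(x), a cheap swish approximation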
return x * self.sigmoid(x) 31 | 32 | class CoordAtt(nn.Module): 33 | def __init__(self, inp, reduction=32): 34 | super(CoordAtt, self).__init__() 35 | self.pool_h = nn.AdaptiveAvgPool2d((None, 1)) 36 | self.pool_w = nn.AdaptiveAvgPool2d((1, None)) 37 | 38 | mip = max(8, inp // reduction) 39 | 40 | self.conv1 = nn.Conv2d(inp, mip, kernel_size=1, stride=1, padding=0) 41 | self.bn1 = nn.BatchNorm2d(mip) 42 | self.act = h_swish() 43 | 44 | self.conv_h = nn.Conv2d(mip, inp, kernel_size=1, stride=1, padding=0) 45 | self.conv_w = nn.Conv2d(mip, inp, kernel_size=1, stride=1, padding=0) 46 | 47 | # Global context block 48 | self.global_pool = nn.AdaptiveAvgPool2d(1) 49 | self.global_fc = nn.Sequential( 50 | nn.Conv2d(inp, mip, kernel_size=1, padding=0), 51 | nn.BatchNorm2d(mip), 52 | nn.ReLU(inplace=True), 53 | nn.Conv2d(mip, inp, kernel_size=1, padding=0), 54 | nn.Sigmoid() 55 | ) 56 | 57 | def forward(self, x): 58 | identity = x 59 | 60 | n, c, h, w = x.size() 61 | x_h = self.pool_h(x) 62 | x_w = self.pool_w(x).permute(0, 1, 3, 2) 63 | 64 | y = torch.cat([x_h, x_w], dim=2) 65 | y = self.conv1(y) 66 | y = self.bn1(y) 67 | y = self.act(y) 68 | 69 | x_h, x_w = torch.split(y, [h, w], dim=2) 70 | x_w = x_w.permute(0, 1, 3, 2) 71 | 72 | a_h = self.conv_h(x_h).sigmoid() 73 | a_w = self.conv_w(x_w).sigmoid() 74 | 75 | # Apply global context 76 | global_context = self.global_pool(identity) 77 | global_context = self.global_fc(global_context) 78 | 79 | out = identity * a_w * a_h * global_context 80 | 81 | return out 82 | 83 | if __name__ == '__main__': 84 | x = torch.randn(2, 64, 32, 32) 85 | att = CoordAtt(inp=64, reduction=32) 86 | out = att(x) 87 | print("输入尺寸:", x.shape) 88 | print("输出尺寸:", out.shape) -------------------------------------------------------------------------------- /generation_idea_template/small_object_attention/code/internal_attention_bootstrapping.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implement an internal attention bootstrapping mechanism where SEAttention periodically saves and analyzes its attention distribution at various training stages 3 | Modify the training routine to adjust current attention maps to better align with or improve upon these previously saved distributions, focusing on enhancing small target detection capabilities 4 | Evaluate attention map alignment and detection performance improvements over baseline SEAttention using synthetic datasets 5 | 6 | """ 7 | 8 | # Modified code 9 | import numpy as np 10 | import torch 11 | from torch import flatten, nn 12 | from torch.nn import init 13 | from torch.nn.modules.activation import ReLU 14 | from torch.nn.modules.batchnorm import BatchNorm2d 15 | from torch.nn import functional as F 16 | 17 | 18 | class SEAttention(nn.Module): 19 | 20 | def __init__(self, channel=512, reduction=16, save_interval=10): 21 | super().__init__() 22 | self.avg_pool = nn.AdaptiveAvgPool2d(1) 23 | self.fc = nn.Sequential( 24 | nn.Linear(channel, channel // reduction, bias=False), 25 | nn.ReLU(inplace=True), 26 | nn.Linear(channel // reduction, channel, bias=False), 27 | nn.Sigmoid() 28 | ) 29 | self.attention_history = [] 30 | self.save_interval = save_interval 31 | self.training_step = 0 32 | 33 | def init_weights(self): 34 | for m in self.modules(): 35 | if isinstance(m, nn.Conv2d): 36 | init.kaiming_normal_(m.weight, mode='fan_out') 37 | if m.bias is not None: 38 | init.constant_(m.bias, 0) 39 | elif isinstance(m, nn.BatchNorm2d): 40 | init.constant_(m.weight, 1) 41 | 
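                # BatchNorm starts as the identity mapping (scale 1, shift 0)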
init.constant_(m.bias, 0) 42 | elif isinstance(m, nn.Linear): 43 | init.normal_(m.weight, std=0.001) 44 | if m.bias is not None: 45 | init.constant_(m.bias, 0) 46 | 47 | def forward(self, x): 48 | b, c, _, _ = x.size() 49 | y = self.avg_pool(x).view(b, c) 50 | y = self.fc(y).view(b, c, 1, 1) 51 | 52 | # Save attention distribution periodically 53 | if self.training and self.training_step % self.save_interval == 0: 54 | self.attention_history.append(y.detach().clone()) 55 | 56 | # Adjust attention maps based on saved distributions 57 | if self.attention_history: 58 | historical_attention = self.attention_history[-1] 59 | y = self._adjust_attention(y, historical_attention) 60 | 61 | self.training_step += 1 62 | return x * y.expand_as(x) 63 | 64 | def _adjust_attention(self, current_attention, historical_attention): 65 | # Simple example of adjustment: interpolate between current and historical attention 66 | adjusted_attention = (current_attention + historical_attention) / 2 67 | return adjusted_attention 68 | 69 | if __name__ == '__main__': 70 | model = SEAttention() 71 | model.init_weights() 72 | input = torch.randn(1, 512, 7, 7) 73 | output = model(input) 74 | print(output.shape) -------------------------------------------------------------------------------- /generation_idea_template/small_object_attention/code/sparsity_attention.py: -------------------------------------------------------------------------------- 1 | """ 2 | Extend SEAttention by incorporating a sparsity-promoting transformation within the attention mechanism 3 | Implement a sparse encoding step using a learned thresholding layer, applied to the input feature maps before the existing attention recalibration 4 | This thresholding layer will dynamically adjust based on the input characteristics to promote sparsity efficiently 5 | Modify the forward function to include this sparsity transformation and evaluate its impact by comparing detection performance on synthetic datasets with baseline SEAttention 6 | Use visualization of attention maps to assess enhanced focus on critical features and improved noise suppression 7 | 8 | """ 9 | 10 | # Modified code 11 | import numpy as np 12 | import torch 13 | from torch import flatten, nn 14 | from torch.nn import init 15 | from torch.nn.modules.activation import ReLU 16 | from torch.nn.modules.batchnorm import BatchNorm2d 17 | from torch.nn import functional as F 18 | 19 | 20 | class SEAttention(nn.Module): 21 | 22 | def __init__(self, channel=512, reduction=16): 23 | super().__init__() 24 | self.avg_pool = nn.AdaptiveAvgPool2d(1) 25 | self.sparsity_layer = nn.Sequential( 26 | nn.Linear(channel, channel // reduction, bias=False), 27 | nn.ReLU(inplace=True), 28 | nn.Linear(channel // reduction, channel, bias=False), 29 | nn.Sigmoid() 30 | ) 31 | self.attention_layer = nn.Sequential( 32 | nn.Linear(channel, channel // reduction, bias=False), 33 | nn.ReLU(inplace=True), 34 | nn.Linear(channel // reduction, channel, bias=False), 35 | nn.Sigmoid() 36 | ) 37 | 38 | def init_weights(self): 39 | for m in self.modules(): 40 | if isinstance(m, nn.Conv2d): 41 | init.kaiming_normal_(m.weight, mode='fan_out') 42 | if m.bias is not None: 43 | init.constant_(m.bias, 0) 44 | elif isinstance(m, nn.BatchNorm2d): 45 | init.constant_(m.weight, 1) 46 | init.constant_(m.bias, 0) 47 | elif isinstance(m, nn.Linear): 48 | init.normal_(m.weight, std=0.001) 49 | if m.bias is not None: 50 | init.constant_(m.bias, 0) 51 | 52 | def apply_sparsity(self, x, threshold): 53 | """Apply sparsity mask based on the 
dynamic threshold.""" 54 | return x * (x > threshold).float() 55 | 56 | def forward(self, x): 57 | b, c, _, _ = x.size() 58 | 59 | # Sparsity-promoting transformation 60 | sparsity_threshold = self.avg_pool(x).view(b, c) 61 | sparsity_threshold = self.sparsity_layer(sparsity_threshold).view(b, c, 1, 1) 62 | x = self.apply_sparsity(x, sparsity_threshold) 63 | 64 | # SEAttention mechanism 65 | y = self.avg_pool(x).view(b, c) 66 | y = self.attention_layer(y).view(b, c, 1, 1) 67 | return x * y.expand_as(x) 68 | 69 | if __name__ == '__main__': 70 | model = SEAttention() 71 | model.init_weights() 72 | input = torch.randn(1, 512, 7, 7) 73 | output = model(input) 74 | print(output.shape) -------------------------------------------------------------------------------- /generation_idea_template/coordattention/code/xaa-temporal_coordatt.py: -------------------------------------------------------------------------------- 1 | """ 2 | Add a lightweight temporal attention mechanism to the CoordAtt module 3 | Introduce 1D convolutions that operate on temporal sequences derived from input feature maps 4 | Modify the forward method to compute temporal attention weights and integrate them with spatial attention 5 | Evaluate the effectiveness on synthetic sequential data to assess improvements in temporal feature representations, while monitoring any additional computational cost incurred 6 | 7 | """ 8 | 9 | # 可以一试 10 | 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | 15 | class h_sigmoid(nn.Module): 16 | def __init__(self, inplace=True): 17 | super(h_sigmoid, self).__init__() 18 | self.relu = nn.ReLU6(inplace=inplace) 19 | 20 | def forward(self, x): 21 | return self.relu(x + 3) / 6 22 | 23 | class h_swish(nn.Module): 24 | def __init__(self, inplace=True): 25 | super(h_swish, self).__init__() 26 | self.sigmoid = h_sigmoid(inplace=inplace) 27 | 28 | def forward(self, x): 29 | return x * self.sigmoid(x) 30 | 31 | class CoordAtt(nn.Module): 32 | def __init__(self, inp, reduction=32, temporal_reduction=4): 33 | super(CoordAtt, self).__init__() 34 | self.pool_h = nn.AdaptiveAvgPool2d((None, 1)) 35 | self.pool_w = nn.AdaptiveAvgPool2d((1, None)) 36 | 37 | mip = max(8, inp // reduction) 38 | 39 | self.conv1 = nn.Conv2d(inp, mip, kernel_size=1, stride=1, padding=0) 40 | self.bn1 = nn.BatchNorm2d(mip) 41 | self.act = h_swish() 42 | 43 | self.conv_h = nn.Conv2d(mip, inp, kernel_size=1, stride=1, padding=0) 44 | self.conv_w = nn.Conv2d(mip, inp, kernel_size=1, stride=1, padding=0) 45 | 46 | # Temporal Attention Module 47 | self.temporal_conv1 = nn.Conv1d(inp, inp // temporal_reduction, kernel_size=3, padding=1) 48 | self.temporal_bn1 = nn.BatchNorm1d(inp // temporal_reduction) 49 | self.temporal_conv2 = nn.Conv1d(inp // temporal_reduction, inp, kernel_size=3, padding=1) 50 | 51 | def forward(self, x): 52 | identity = x 53 | 54 | n, c, h, w = x.size() 55 | x_h = self.pool_h(x) 56 | x_w = self.pool_w(x).permute(0, 1, 3, 2) 57 | 58 | y = torch.cat([x_h, x_w], dim=2) 59 | y = self.conv1(y) 60 | y = self.bn1(y) 61 | y = self.act(y) 62 | 63 | x_h, x_w = torch.split(y, [h, w], dim=2) 64 | x_w = x_w.permute(0, 1, 3, 2) 65 | 66 | a_h = self.conv_h(x_h).sigmoid() 67 | a_w = self.conv_w(x_w).sigmoid() 68 | 69 | # Temporal attention computation 70 | temporal_x = x.view(n, c, -1) # Reshape to (batch_size, channels, temporal_dim) 71 | t = self.temporal_conv1(temporal_x) 72 | t = self.temporal_bn1(t) 73 | t = F.relu(t) 74 | t = self.temporal_conv2(t).sigmoid() 75 | t = t.view(n, c, h, w) # Reshape 
back to original dimensions 76 | 77 | out = identity * a_w * a_h * t 78 | 79 | return out 80 | 81 | if __name__ == '__main__': 82 | x = torch.randn(2, 64, 32, 32) 83 | att = CoordAtt(inp=64, reduction=32) 84 | out = att(x) 85 | print("输入尺寸:", x.shape) 86 | print("输出尺寸:", out.shape) -------------------------------------------------------------------------------- /generation_idea_template/small_object_attention/code/contextual_attention.py: -------------------------------------------------------------------------------- 1 | """ 2 | Extend SEAttention by adding a global context block that pools the entire feature map into a context vector 3 | Use this vector to inform a spatial attention recalibration mechanism, which is applied after channel attention 4 | Implement this by adding a global context pooling layer and a recalibration module in the forward function 5 | Evaluate the model's effectiveness by comparing feature maps and performance metrics on synthetic datasets designed to mimic scenarios with small target detection challenges 6 | 7 | """ 8 | 9 | # Modified code 10 | import numpy as np 11 | import torch 12 | from torch import flatten, nn 13 | from torch.nn import init 14 | from torch.nn.modules.activation import ReLU 15 | from torch.nn.modules.batchnorm import BatchNorm2d 16 | from torch.nn import functional as F 17 | 18 | class GlobalContextBlock(nn.Module): 19 | def __init__(self, channel): 20 | super(GlobalContextBlock, self).__init__() 21 | self.pool = nn.AdaptiveAvgPool2d(1) 22 | self.fc = nn.Sequential( 23 | nn.Linear(channel, channel // 16, bias=False), 24 | nn.ReLU(inplace=True), 25 | nn.Linear(channel // 16, channel, bias=False), 26 | nn.Sigmoid() 27 | ) 28 | 29 | def forward(self, x): 30 | b, c, _, _ = x.size() 31 | y = self.pool(x).view(b, c) 32 | y = self.fc(y).view(b, c, 1, 1) 33 | return y.expand_as(x) 34 | 35 | class SEAttention(nn.Module): 36 | 37 | def __init__(self, channel=512, reduction=16): 38 | super().__init__() 39 | self.avg_pool = nn.AdaptiveAvgPool2d(1) 40 | self.channel_fc = nn.Sequential( 41 | nn.Linear(channel, channel // reduction, bias=False), 42 | nn.ReLU(inplace=True), 43 | nn.Linear(channel // reduction, channel, bias=False), 44 | nn.Sigmoid() 45 | ) 46 | self.global_context = GlobalContextBlock(channel) 47 | 48 | def init_weights(self): 49 | for m in self.modules(): 50 | if isinstance(m, nn.Conv2d): 51 | init.kaiming_normal_(m.weight, mode='fan_out') 52 | if m.bias is not None: 53 | init.constant_(m.bias, 0) 54 | elif isinstance(m, nn.BatchNorm2d): 55 | init.constant_(m.weight, 1) 56 | init.constant_(m.bias, 0) 57 | elif isinstance(m, nn.Linear): 58 | init.normal_(m.weight, std=0.001) 59 | if m.bias is not None: 60 | init.constant_(m.bias, 0) 61 | 62 | def forward(self, x): 63 | b, c, _, _ = x.size() 64 | # Channel attention 65 | channel_attention = self.avg_pool(x).view(b, c) 66 | channel_attention = self.channel_fc(channel_attention).view(b, c, 1, 1) 67 | x = x * channel_attention.expand_as(x) 68 | 69 | # Spatial attention using global context 70 | spatial_attention = self.global_context(x) 71 | x = x * spatial_attention 72 | 73 | return x 74 | 75 | if __name__ == '__main__': 76 | model = SEAttention() 77 | model.init_weights() 78 | input = torch.randn(1, 512, 7, 7) 79 | output = model(input) 80 | print(output.shape) -------------------------------------------------------------------------------- /generation_idea_template/coordattention/code/xaa-channel_mix_coordatt.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | Introduce a Channel Mixing Block (CMB) within the CoordAtt module 3 | Implement grouped convolutions in the CMB to capture channel-wise dependencies efficiently 4 | Modify the CoordAtt class to include a new CMB after the initial convolutional layers 5 | Evaluate the impact of channel mixing on feature representation by testing on a small benchmark dataset, comparing the performance and computational efficiency against the original CoordAtt and other variants 6 | 7 | """ 8 | 9 | # Modified code 10 | import torch 11 | import torch.nn as nn 12 | import torch.nn.functional as F 13 | 14 | class h_sigmoid(nn.Module): 15 | def __init__(self, inplace=True): 16 | super(h_sigmoid, self).__init__() 17 | self.relu = nn.ReLU6(inplace=inplace) 18 | 19 | def forward(self, x): 20 | return self.relu(x + 3) / 6 21 | 22 | class h_swish(nn.Module): 23 | def __init__(self, inplace=True): 24 | super(h_swish, self).__init__() 25 | self.sigmoid = h_sigmoid(inplace=inplace) 26 | 27 | def forward(self, x): 28 | return x * self.sigmoid(x) 29 | 30 | class ChannelMixingBlock(nn.Module): 31 | def __init__(self, channels, groups=4): 32 | super(ChannelMixingBlock, self).__init__() 33 | self.groups = groups 34 | self.grouped_conv = nn.Conv2d(channels, channels, kernel_size=1, groups=groups, bias=False) 35 | self.bn = nn.BatchNorm2d(channels) 36 | self.act = h_swish() 37 | 38 | def forward(self, x): 39 | x = self.grouped_conv(x) 40 | x = self.bn(x) 41 | x = self.act(x) 42 | return x 43 | 44 | class CoordAtt(nn.Module): 45 | def __init__(self, inp, reduction=32, groups=4): 46 | super(CoordAtt, self).__init__() 47 | self.pool_h = nn.AdaptiveAvgPool2d((None, 1)) 48 | self.pool_w = nn.AdaptiveAvgPool2d((1, None)) 49 | 50 | mip = max(8, inp // reduction) 51 | 52 | self.conv1 = nn.Conv2d(inp, mip, kernel_size=1, stride=1, padding=0) 53 | self.bn1 = nn.BatchNorm2d(mip) 54 | self.act = h_swish() 55 | 56 | # Introduce Channel Mixing Block 57 | self.cmb = ChannelMixingBlock(mip, groups=groups) 58 | 59 | self.conv_h = nn.Conv2d(mip, inp, kernel_size=1, stride=1, padding=0) 60 | self.conv_w = nn.Conv2d(mip, inp, kernel_size=1, stride=1, padding=0) 61 | 62 | def forward(self, x): 63 | identity = x 64 | 65 | n, c, h, w = x.size() 66 | x_h = self.pool_h(x) 67 | x_w = self.pool_w(x).permute(0, 1, 3, 2) 68 | 69 | y = torch.cat([x_h, x_w], dim=2) 70 | y = self.conv1(y) 71 | y = self.bn1(y) 72 | y = self.act(y) 73 | 74 | # Pass through Channel Mixing Block 75 | y = self.cmb(y) 76 | 77 | x_h, x_w = torch.split(y, [h, w], dim=2) 78 | x_w = x_w.permute(0, 1, 3, 2) 79 | 80 | a_h = self.conv_h(x_h).sigmoid() 81 | a_w = self.conv_w(x_w).sigmoid() 82 | 83 | out = identity * a_w * a_h 84 | 85 | return out 86 | 87 | if __name__ == '__main__': 88 | x = torch.randn(2, 64, 32, 32) 89 | att = CoordAtt(inp=64, reduction=32) 90 | out = att(x) 91 | print("输入尺寸:", x.shape) 92 | print("输出尺寸:", out.shape) -------------------------------------------------------------------------------- /generation_idea_template/plane_voronoi/code/graph_based_voronoi.py: -------------------------------------------------------------------------------- 1 | """ 2 | Use a priority queue to dynamically expand the influence of seed points across the grid, analogous to breadth-first search without explicitly constructing a graph 3 | Modify the 'attribution' function to utilize this queue-based propagation, ensuring efficient handling of neighboring grid cells 4 | Evaluate the execution time 
and scalability improvements compared to the baseline approach
5 | 
6 | """
7 | 
8 | # Modified code
9 | #!/usr/bin/env python
10 | # -*- coding: utf-8 -*-
11 | '''
12 | @File : pane_voronoi.py
13 | @Time : 2023/09/21 17:03:58
14 | @Author : 不要葱姜蒜
15 | @Version : 1.0
16 | @Desc : None
17 | '''
18 | 
19 | import pprint
20 | import copy
21 | import random
22 | from PIL import Image
23 | from tqdm import tqdm  # progress bar
24 | from collections import defaultdict, deque
25 | 
26 | class PaneVoronoi:
27 |     def __init__(self, seed, seed_list, n):
28 |         self.n = n  # side length; the grid is assumed to be square
29 |         self.seed = seed  # number of seed points
30 |         self.hash_map = [i * i for i in range(self.n)]
31 |         self.seed_list = seed_list  # the seed points themselves
32 |         self.table = [[0] * self.n for _ in range(self.n)]
33 |         self.visited = [[False] * self.n for _ in range(self.n)]
34 |         self.colors = self.colors()  # randomized colors; index 0 (the seed color) is black
35 |         self.count = n * 4 - 4
36 | 
37 |     def creat_seed(self):
38 |         res = []
39 |         for _ in range(self.seed):
40 |             res.append([random.randrange(self.n), random.randrange(self.n)])
41 |         return res
42 | 
43 |     def colors(self):
44 |         res = [[0, 0, 0]]
45 |         for _ in range(len(self.seed_list)):  # one color per seed region, so lookups stay in bounds
46 |             res.append([random.randrange(99, 206) for _ in range(3)])
47 |         return res
48 | 
49 |     def deal(self):
50 |         # Initialize the queue with seed points
51 |         queue = deque()
52 |         for idx, (x, y) in enumerate(self.seed_list):
53 |             queue.append((x, y, idx + 1))  # (x, y, seed_index)
54 |             self.table[x][y] = idx + 1
55 |             self.visited[x][y] = True
56 | 
57 |         directions = [(-1, 0), (1, 0), (0, -1), (0, 1)]
58 |         while queue:
59 |             x, y, seed_index = queue.popleft()
60 |             for dx, dy in directions:
61 |                 nx, ny = x + dx, y + dy
62 |                 if 0 <= nx < self.n and 0 <= ny < self.n and not self.visited[nx][ny]:
63 |                     self.table[nx][ny] = seed_index
64 |                     self.visited[nx][ny] = True
65 |                     queue.append((nx, ny, seed_index))
66 | 
67 |     def positive_reverse(self):
68 |         return self.table
69 | 
70 |     @classmethod
71 |     def paint(cls, data, name, colors):
72 |         image = Image.new('RGB', (len(data), len(data[0])))
73 |         put_pixel = image.putpixel
74 |         for i in tqdm(range(len(data))):
75 |             for j in range(len(data[0])):
76 |                 color = colors[data[i][j]]
77 |                 put_pixel((i, j), (color[0], color[1], color[2]))
78 |         image.save(f'img/{name}.jpg')
79 | 
80 | if __name__ == '__main__':
81 |     seed_list = [[random.randrange(32), random.randrange(32)] for _ in range(500)]
82 |     v = PaneVoronoi(500, seed_list, 32)
83 |     v.deal()
84 |     da = v.positive_reverse()
85 |     v.paint(da, 'voronoi', v.colors)
--------------------------------------------------------------------------------
/generation_idea_template/coordattention/code/xxx-geo_transform_coordatt.py:
--------------------------------------------------------------------------------
1 | """
2 | Introduce a lightweight transformation network within the CoordAtt module, using a small convolutional layer followed by a fully connected layer to predict affine transformation parameters
3 | Use these parameters to warp the input feature maps via spatial transformations before applying coordinate attention
4 | Modify the forward method accordingly, and evaluate effectiveness on datasets with geometric variations, comparing feature representation, performance, and computational efficiency against the original CoordAtt and other variants
5 | Ensure the transformation network is efficient to maintain minimal overhead
6 | 
7 | """
8 | 
9 | # Modified code
10 | import torch
11 | import torch.nn as nn
12 | import torch.nn.functional as F
13 | 
14 | class h_sigmoid(nn.Module):
15 |     def __init__(self, inplace=True):
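        # hard sigmoid: ReLU6(x + 3) / 6, a piecewise-linear stand-in for sigmoid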
16 |         super(h_sigmoid, self).__init__()
17 |         self.relu = nn.ReLU6(inplace=inplace)
18 | 
19 |     def forward(self, x):
20 |         return self.relu(x + 3) / 6
21 | 
22 | class h_swish(nn.Module):
23 |     def __init__(self, inplace=True):
24 |         super(h_swish, self).__init__()
25 |         self.sigmoid = h_sigmoid(inplace=inplace)
26 | 
27 |     def forward(self, x):
28 |         return x * self.sigmoid(x)
29 | 
30 | class CoordAtt(nn.Module):
31 |     def __init__(self, inp, reduction=32):
32 |         super(CoordAtt, self).__init__()
33 |         self.pool_h = nn.AdaptiveAvgPool2d((None, 1))
34 |         self.pool_w = nn.AdaptiveAvgPool2d((1, None))
35 | 
36 |         mip = max(8, inp // reduction)
37 | 
38 |         self.conv1 = nn.Conv2d(inp, mip, kernel_size=1, stride=1, padding=0)
39 |         self.bn1 = nn.BatchNorm2d(mip)
40 |         self.act = h_swish()
41 | 
42 |         self.conv_h = nn.Conv2d(mip, inp, kernel_size=1, stride=1, padding=0)
43 |         self.conv_w = nn.Conv2d(mip, inp, kernel_size=1, stride=1, padding=0)
44 | 
45 |         # Transformation network
46 |         self.trans_conv = nn.Conv2d(inp, 6, kernel_size=1, stride=1, padding=0)
47 |         self.trans_fc = nn.Linear(6, 6)  # Predict affine parameters from the pooled 6-dim descriptor
48 | 
49 |     def forward(self, x):
50 |         identity = x
51 | 
52 |         # Predict affine parameters
53 |         n, c, h, w = x.size()
54 |         theta = self.trans_conv(x)
55 |         theta = F.adaptive_avg_pool2d(theta, 1).view(n, 6)  # pool to one 6-vector per sample
56 |         theta = self.trans_fc(theta)
57 |         theta = theta.view(-1, 2, 3)
58 | 
59 |         # Create affine grid and apply transformation
60 |         grid = F.affine_grid(theta, x.size(), align_corners=False)
61 |         x = F.grid_sample(x, grid, align_corners=False)
62 | 
63 |         x_h = self.pool_h(x)
64 |         x_w = self.pool_w(x).permute(0, 1, 3, 2)
65 | 
66 |         y = torch.cat([x_h, x_w], dim=2)
67 |         y = self.conv1(y)
68 |         y = self.bn1(y)
69 |         y = self.act(y)
70 | 
71 |         x_h, x_w = torch.split(y, [h, w], dim=2)
72 |         x_w = x_w.permute(0, 1, 3, 2)
73 | 
74 |         a_h = self.conv_h(x_h).sigmoid()
75 |         a_w = self.conv_w(x_w).sigmoid()
76 | 
77 |         out = identity * a_w * a_h
78 | 
79 |         return out
80 | 
81 | if __name__ == '__main__':
82 |     x = torch.randn(2, 64, 32, 32)
83 |     att = CoordAtt(inp=64, reduction=32)
84 |     out = att(x)
85 |     print("输入尺寸:", x.shape)
86 |     print("输出尺寸:", out.shape)
--------------------------------------------------------------------------------
/generation_idea_template/small_object_attention/code/task_adaptive_attention.py:
--------------------------------------------------------------------------------
1 | """
2 | Develop a meta-attention module that infers a task descriptor from input data characteristics
3 | Integrate this module within the SEAttention framework to modulate its parameters dynamically
4 | Implement functions to extract task descriptors and modify SEAttention weights based on these descriptors
5 | Evaluate the model's adaptability and performance across diverse small target detection tasks using precision, recall, and F1-score, comparing its performance against the baseline SEAttention model and other enhanced models
6 | 
7 | """
8 | 
9 | # Improved Code
10 | import numpy as np
11 | import torch
12 | from torch import flatten, nn
13 | from torch.nn import init
14 | from torch.nn.modules.activation import ReLU
15 | from torch.nn.modules.batchnorm import BatchNorm2d
16 | from torch.nn import functional as F
17 | 
18 | class TaskAdaptiveAttention(nn.Module):
19 |     """A module to infer task descriptors from input data characteristics."""
20 | 
21 |     def __init__(self, channel=512):
22 |         super(TaskAdaptiveAttention, self).__init__()
23 |         self.task_descriptor = nn.Sequential(
24 |             nn.AdaptiveAvgPool2d(1),
25 |             nn.Conv2d(channel, channel // 4,
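                      # a 4x channel bottleneck keeps the task descriptor cheap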
kernel_size=1), 26 | nn.ReLU(inplace=True), 27 | nn.Conv2d(channel // 4, channel, kernel_size=1), 28 | nn.Sigmoid() 29 | ) 30 | 31 | def forward(self, x): 32 | """Extract task descriptor from input data.""" 33 | return self.task_descriptor(x) 34 | 35 | 36 | class SEAttention(nn.Module): 37 | 38 | def __init__(self, channel=512, reduction=16): 39 | super(SEAttention, self).__init__() 40 | self.avg_pool = nn.AdaptiveAvgPool2d(1) 41 | self.fc = nn.Sequential( 42 | nn.Linear(channel, channel // reduction, bias=False), 43 | nn.ReLU(inplace=True), 44 | nn.Linear(channel // reduction, channel, bias=False), 45 | nn.Sigmoid() 46 | ) 47 | self.task_adaptive_attention = TaskAdaptiveAttention(channel) 48 | 49 | def init_weights(self): 50 | for m in self.modules(): 51 | if isinstance(m, nn.Conv2d): 52 | init.kaiming_normal_(m.weight, mode='fan_out') 53 | if m.bias is not None: 54 | init.constant_(m.bias, 0) 55 | elif isinstance(m, nn.BatchNorm2d): 56 | init.constant_(m.weight, 1) 57 | init.constant_(m.bias, 0) 58 | elif isinstance(m, nn.Linear): 59 | init.normal_(m.weight, std=0.001) 60 | if m.bias is not None: 61 | init.constant_(m.bias, 0) 62 | 63 | def forward(self, x): 64 | b, c, _, _ = x.size() 65 | # Task descriptor influences SEAttention weights 66 | task_descriptor = self.task_adaptive_attention(x) 67 | pooled = self.avg_pool(x).view(b, c) 68 | se_weights = self.fc(pooled).view(b, c, 1, 1) 69 | adaptive_weights = task_descriptor.view(b, c, 1, 1) 70 | return x * se_weights.expand_as(x) * adaptive_weights.expand_as(x) 71 | 72 | if __name__ == '__main__': 73 | model = SEAttention() 74 | model.init_weights() 75 | input = torch.randn(1, 512, 7, 7) 76 | output = model(input) 77 | print(output.shape) -------------------------------------------------------------------------------- /generation_idea_template/small_object_attention/code/pyramid_attention.py: -------------------------------------------------------------------------------- 1 | """ 2 | Extend SEAttention by incorporating a pyramid pooling layer to generate multi-scale context features 3 | Implement this by adding a pyramid pooling module that extracts pooled features at different scales 4 | Apply a unified attention mechanism across these pooled features to recalibrate the feature map 5 | Modify the forward function to include pyramid pooling and attention application 6 | Evaluate the model's effectiveness by comparing detection performance on small and distributed targets, using visualization techniques and quantitative analysis on synthetic datasets 7 | 8 | """ 9 | 10 | # Modified code 11 | import numpy as np 12 | import torch 13 | from torch import flatten, nn 14 | from torch.nn import init 15 | from torch.nn.modules.activation import ReLU 16 | from torch.nn.modules.batchnorm import BatchNorm2d 17 | from torch.nn import functional as F 18 | 19 | class PyramidPooling(nn.Module): 20 | def __init__(self, in_channels, pool_sizes): 21 | super(PyramidPooling, self).__init__() 22 | self.stages = nn.ModuleList([self._make_stage(in_channels, size) for size in pool_sizes]) 23 | 24 | def _make_stage(self, in_channels, size): 25 | prior = nn.AdaptiveAvgPool2d(output_size=size) 26 | conv = nn.Conv2d(in_channels, in_channels, kernel_size=1, bias=False) 27 | return nn.Sequential(prior, conv) 28 | 29 | def forward(self, x): 30 | h, w = x.shape[2], x.shape[3] 31 | pyramids = [F.interpolate(stage(x), size=(h, w), mode='bilinear', align_corners=True) for stage in self.stages] 32 | return torch.cat(pyramids, dim=1) 33 | 34 | class SEAttention(nn.Module): 35 
| 36 | def __init__(self, channel=512, reduction=16, pool_sizes=[1, 2, 3, 6]): 37 | super().__init__() 38 | self.pyramid_pooling = PyramidPooling(channel, pool_sizes) 39 | self.attention_conv = nn.Conv2d(channel * len(pool_sizes), channel, kernel_size=1, bias=False) 40 | self.avg_pool = nn.AdaptiveAvgPool2d(1) 41 | self.fc = nn.Sequential( 42 | nn.Linear(channel, channel // reduction, bias=False), 43 | nn.ReLU(inplace=True), 44 | nn.Linear(channel // reduction, channel, bias=False), 45 | nn.Sigmoid() 46 | ) 47 | 48 | def init_weights(self): 49 | for m in self.modules(): 50 | if isinstance(m, nn.Conv2d): 51 | init.kaiming_normal_(m.weight, mode='fan_out') 52 | if m.bias is not None: 53 | init.constant_(m.bias, 0) 54 | elif isinstance(m, nn.BatchNorm2d): 55 | init.constant_(m.weight, 1) 56 | init.constant_(m.bias, 0) 57 | elif isinstance(m, nn.Linear): 58 | init.normal_(m.weight, std=0.001) 59 | if m.bias is not None: 60 | init.constant_(m.bias, 0) 61 | 62 | def forward(self, x): 63 | b, c, _, _ = x.size() 64 | x = self.pyramid_pooling(x) 65 | x = self.attention_conv(x) 66 | y = self.avg_pool(x).view(b, c) 67 | y = self.fc(y).view(b, c, 1, 1) 68 | return x * y.expand_as(x) 69 | 70 | if __name__ == '__main__': 71 | model = SEAttention() 72 | model.init_weights() 73 | input = torch.randn(1, 512, 7, 7) 74 | output = model(input) 75 | print(output.shape) -------------------------------------------------------------------------------- /generation_idea_template/coordattention/code/xxx-dynamic_complexity_coordatt.py: -------------------------------------------------------------------------------- 1 | """ 2 | Enhance the `CoordAtt` module by integrating a dynamic complexity adjustment mechanism 3 | Use a simple heuristic based on feature map variance to determine complexity 4 | Route the input through either a lightweight or complex processing path: a basic path for low variance features and an enhanced path for high variance features 5 | Modify the forward method to include this decision mechanism and dynamically adjust processing 6 | Evaluate the module's adaptability on a small benchmark dataset, assessing improvements in feature discrimination and computational efficiency over the original and other variants 7 | 8 | """ 9 | 10 | import torch 11 | import torch.nn as nn 12 | import torch.nn.functional as F 13 | 14 | class h_sigmoid(nn.Module): 15 | def __init__(self, inplace=True): 16 | super(h_sigmoid, self).__init__() 17 | self.relu = nn.ReLU6(inplace=inplace) 18 | 19 | def forward(self, x): 20 | return self.relu(x + 3) / 6 21 | 22 | class h_swish(nn.Module): 23 | def __init__(self, inplace=True): 24 | super(h_swish, self).__init__() 25 | self.sigmoid = h_sigmoid(inplace=inplace) 26 | 27 | def forward(self, x): 28 | return x * self.sigmoid(x) 29 | 30 | class CoordAtt(nn.Module): 31 | def __init__(self, inp, reduction=32): 32 | super(CoordAtt, self).__init__() 33 | self.pool_h = nn.AdaptiveAvgPool2d((None, 1)) 34 | self.pool_w = nn.AdaptiveAvgPool2d((1, None)) 35 | 36 | mip = max(8, inp // reduction) 37 | 38 | # Lightweight path 39 | self.light_conv = nn.Sequential( 40 | nn.Conv2d(inp, mip, kernel_size=1, stride=1, padding=0), 41 | nn.BatchNorm2d(mip), 42 | h_swish() 43 | ) 44 | 45 | # Complex path 46 | self.complex_conv = nn.Sequential( 47 | nn.Conv2d(inp, mip, kernel_size=3, stride=1, padding=1), 48 | nn.BatchNorm2d(mip), 49 | h_swish(), 50 | nn.Conv2d(mip, mip, kernel_size=3, stride=1, padding=1), 51 | nn.BatchNorm2d(mip), 52 | h_swish() 53 | ) 54 | 55 | self.conv_h = nn.Conv2d(mip, inp, 
kernel_size=1, stride=1, padding=0)
56 |         self.conv_w = nn.Conv2d(mip, inp, kernel_size=1, stride=1, padding=0)
57 | 
58 |     def forward(self, x):
59 |         identity = x
60 | 
61 |         # Calculate feature map variance as a cheap input-complexity heuristic
62 |         variance = x.var(dim=(2, 3), keepdim=True).mean()
63 | 
64 |         n, c, h, w = x.size()
65 |         x_h = self.pool_h(x)
66 |         x_w = self.pool_w(x).permute(0, 1, 3, 2)
67 |         y = torch.cat([x_h, x_w], dim=2)
68 | 
69 |         # Route the pooled descriptor through the path chosen by variance;
70 |         # both paths map inp -> mip channels, keeping shapes consistent
71 |         if variance < 0.5:  # Threshold can be tuned
72 |             y = self.light_conv(y)
73 |         else:
74 |             y = self.complex_conv(y)
75 | 
76 |         x_h, x_w = torch.split(y, [h, w], dim=2)
77 |         x_w = x_w.permute(0, 1, 3, 2)
78 | 
79 |         a_h = self.conv_h(x_h).sigmoid()
80 |         a_w = self.conv_w(x_w).sigmoid()
81 | 
82 |         out = identity * a_w * a_h
83 | 
84 |         return out
85 | 
86 | if __name__ == '__main__':
87 |     x = torch.randn(2, 64, 32, 32)
88 |     att = CoordAtt(inp=64, reduction=32)
89 |     out = att(x)
90 |     print("输入尺寸:", x.shape)
91 |     print("输出尺寸:", out.shape)
--------------------------------------------------------------------------------
/generation_idea_template/small_object_attention/code/adaptive_complexity_attention.py:
--------------------------------------------------------------------------------
1 | """
2 | Incorporate a complexity assessment module within the SEAttention framework
3 | Implement a function that calculates a complexity score using simple features like pixel intensity variance or entropy from input data
4 | Modify the forward function of SEAttention to adjust the attention weights using this complexity score
5 | Evaluate the model's performance across small target detection tasks with varying image complexities, using metrics such as precision, recall, and F1-score
6 | Compare the results against the baseline SEAttention model to demonstrate improvements in robustness and adaptability
7 | 
8 | """
9 | 
10 | # Modified code
11 | import numpy as np
12 | import torch
13 | from torch import flatten, nn
14 | from torch.nn import init
15 | from torch.nn.modules.activation import ReLU
16 | from torch.nn.modules.batchnorm import BatchNorm2d
17 | from torch.nn import functional as F
18 | import torchvision.transforms as transforms
19 | 
20 | 
21 | class SEAttention(nn.Module):
22 | 
23 |     def __init__(self, channel=512, reduction=16):
24 |         super().__init__()
25 |         self.avg_pool = nn.AdaptiveAvgPool2d(1)
26 |         self.fc = nn.Sequential(
27 |             nn.Linear(channel, channel // reduction, bias=False),
28 |             nn.ReLU(inplace=True),
29 |             nn.Linear(channel // reduction, channel, bias=False),
30 |             nn.Sigmoid()
31 |         )
32 | 
33 |     def complexity_score(self, x):
34 |         # Average across channels for a single intensity map; torchvision's
35 |         # Grayscale only accepts 1- or 3-channel images, not deep feature maps
36 |         x_gray = x.mean(dim=1, keepdim=True)
37 | 
38 |         # Compute pixel intensity variance as complexity score
39 |         variance = torch.var(x_gray, dim=(2, 3), keepdim=True)
40 | 
41 |         # Normalize the variance to be between 0 and 1
42 |         max_variance = torch.max(variance)
43 |         min_variance = torch.min(variance)
44 |         complexity_score = (variance - min_variance) / (max_variance - min_variance + 1e-5)
45 | 
46 |         return complexity_score
47 | 
48 |     def init_weights(self):
49 |         for m in self.modules():
50 |             if isinstance(m, nn.Conv2d):
51 |                 init.kaiming_normal_(m.weight, mode='fan_out')
52 |                 if m.bias is not None:
53 |                     init.constant_(m.bias, 0)
54 |             elif isinstance(m, nn.BatchNorm2d):
55 |                 init.constant_(m.weight, 1)
56 |                 init.constant_(m.bias, 0)
57 |             elif isinstance(m, nn.Linear):
58 |                 init.normal_(m.weight, std=0.001)
59 |                 if m.bias is not None:
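                    # zero bias keeps freshly initialized layers from adding a channel offset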
60 |                     init.constant_(m.bias, 0)
61 | 
62 |     def forward(self, x):
63 |         b, c, _, _ = x.size()
64 |         # Calculate complexity score
65 |         complexity = self.complexity_score(x)
66 | 
67 |         # Original SEAttention operations
68 |         y = self.avg_pool(x).view(b, c)
69 |         y = self.fc(y).view(b, c, 1, 1)
70 | 
71 |         # Adjust attention weights using complexity score
72 |         adjusted_y = y * complexity
73 | 
74 |         return x * adjusted_y.expand_as(x)
75 | 
76 | if __name__ == '__main__':
77 |     model = SEAttention()
78 |     model.init_weights()
79 |     input = torch.randn(1, 512, 7, 7)
80 |     output = model(input)
81 |     print(output.shape)
--------------------------------------------------------------------------------
/generation_idea_template/small_object_attention/code/adaptive_gating_attention.py:
--------------------------------------------------------------------------------
1 | """
2 | Extend SEAttention by adding a learnable gating mechanism that dynamically adjusts attention weights based on input complexity
3 | Implement this by introducing a gating layer that takes input feature statistics (e.g., variance, mean) to modulate the balance between the original feature and the recalibrated attention feature
4 | Modify the forward function to integrate this gating mechanism after the channel attention
5 | Evaluate the model's performance by comparing it with the baseline SEAttention and other modifications, using synthetic datasets for small target detection and analyzing adaptive behavior through feature map visualizations
6 | 
7 | """
8 | 
9 | # Modified code
10 | import numpy as np
11 | import torch
12 | from torch import flatten, nn
13 | from torch.nn import init
14 | from torch.nn.modules.activation import ReLU
15 | from torch.nn.modules.batchnorm import BatchNorm2d
16 | from torch.nn import functional as F
17 | 
18 | 
19 | class SEAttention(nn.Module):
20 | 
21 |     def __init__(self, channel=512, reduction=16):
22 |         super().__init__()
23 |         self.avg_pool = nn.AdaptiveAvgPool2d(1)
24 |         self.fc = nn.Sequential(
25 |             nn.Linear(channel, channel // reduction, bias=False),
26 |             nn.ReLU(inplace=True),
27 |             nn.Linear(channel // reduction, channel, bias=False),
28 |             nn.Sigmoid()
29 |         )
30 |         # Gating mechanism
31 |         self.gate_fc = nn.Sequential(
32 |             nn.Linear(2, channel // reduction, bias=False),  # two scalar input statistics
33 |             nn.ReLU(inplace=True),
34 |             nn.Linear(channel // reduction, 1, bias=False),
35 |             nn.Sigmoid()
36 |         )
37 | 
38 |     def init_weights(self):
39 |         for m in self.modules():
40 |             if isinstance(m, nn.Conv2d):
41 |                 init.kaiming_normal_(m.weight, mode='fan_out')
42 |                 if m.bias is not None:
43 |                     init.constant_(m.bias, 0)
44 |             elif isinstance(m, nn.BatchNorm2d):
45 |                 init.constant_(m.weight, 1)
46 |                 init.constant_(m.bias, 0)
47 |             elif isinstance(m, nn.Linear):
48 |                 init.normal_(m.weight, std=0.001)
49 |                 if m.bias is not None:
50 |                     init.constant_(m.bias, 0)
51 | 
52 |     def forward(self, x):
53 |         b, c, _, _ = x.size()
54 |         y = self.avg_pool(x).view(b, c)
55 |         attention_weights = self.fc(y).view(b, c, 1, 1)
56 | 
57 |         # Compute global input statistics: one scalar mean and variance per sample
58 |         mean = x.mean(dim=[1, 2, 3])
59 |         variance = x.var(dim=[1, 2, 3])
60 |         feature_stats = torch.stack((mean, variance), dim=1)  # shape (b, 2)
61 | 
62 |         # Gating mechanism
63 |         gate_value = self.gate_fc(feature_stats).view(b, 1, 1, 1)
64 | 
65 |         # Modulate attention feature with gating mechanism
66 |         modulated_feature = gate_value * x + (1 - gate_value) * (x * attention_weights.expand_as(x))
67 | 
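        # The gate forms a convex combination: gate_value near 1 keeps the raw
        # features, gate_value near 0 trusts the SE-recalibrated features
        # (e.g. gate_value = 0.8 gives 0.8 * x + 0.2 * attended(x))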
68 |         return modulated_feature
69 | 
70 | if __name__ == '__main__':
71 |     model = SEAttention()
72 |     model.init_weights()
73 |     input = torch.randn(1, 512, 7, 7)
74 |     output = model(input)
75 |     print(output.shape)
--------------------------------------------------------------------------------
/generation_idea_template/coordattention/code/xxx-dual_domain_coordatt.py:
--------------------------------------------------------------------------------
1 | """
2 | Modify the CoordAtt module to incorporate both spatial and frequency domain attention mechanisms
3 | Perform a Fast Fourier Transform (FFT) on the input feature maps to capture frequency domain information
4 | Compute attention weights separately for spatial and frequency domains, then merge them using a simple weighted sum or concatenation followed by a linear transformation
5 | Adjust the forward method to include these steps, and evaluate the module's performance on a small benchmark dataset, comparing improvements in feature representation, accuracy, and computational efficiency against the original CoordAtt and other variants
6 | 
7 | """
8 | 
9 | # Modified code
10 | import torch
11 | import torch.nn as nn
12 | import torch.nn.functional as F
13 | 
14 | class h_sigmoid(nn.Module):
15 |     def __init__(self, inplace=True):
16 |         super(h_sigmoid, self).__init__()
17 |         self.relu = nn.ReLU6(inplace=inplace)
18 | 
19 |     def forward(self, x):
20 |         return self.relu(x + 3) / 6
21 | 
22 | class h_swish(nn.Module):
23 |     def __init__(self, inplace=True):
24 |         super(h_swish, self).__init__()
25 |         self.sigmoid = h_sigmoid(inplace=inplace)
26 | 
27 |     def forward(self, x):
28 |         return x * self.sigmoid(x)
29 | 
30 | class CoordAtt(nn.Module):
31 |     def __init__(self, inp, reduction=32, freq_weight=0.5):
32 |         super(CoordAtt, self).__init__()
33 |         self.pool_h = nn.AdaptiveAvgPool2d((None, 1))
34 |         self.pool_w = nn.AdaptiveAvgPool2d((1, None))
35 | 
36 |         mip = max(8, inp // reduction)
37 | 
38 |         self.conv1 = nn.Conv2d(inp, mip, kernel_size=1, stride=1, padding=0)
39 |         self.bn1 = nn.BatchNorm2d(mip)
40 |         self.act = h_swish()
41 | 
42 |         self.conv_h = nn.Conv2d(mip, inp, kernel_size=1, stride=1, padding=0)
43 |         self.conv_w = nn.Conv2d(mip, inp, kernel_size=1, stride=1, padding=0)
44 | 
45 |         # Frequency domain attention: keep inp channels so this branch can be
46 |         # summed with the spatially attended branch
47 |         self.freq_weight = freq_weight
48 |         self.conv_freq = nn.Conv2d(inp, inp, kernel_size=1, stride=1, padding=0)
49 |         self.bn_freq = nn.BatchNorm2d(inp)
50 | 
51 |     def forward(self, x):
52 |         identity = x
53 | 
54 |         # Spatial attention
55 |         n, c, h, w = x.size()
56 |         x_h = self.pool_h(x)
57 |         x_w = self.pool_w(x).permute(0, 1, 3, 2)
58 | 
59 |         y_spatial = torch.cat([x_h, x_w], dim=2)
60 |         y_spatial = self.conv1(y_spatial)
61 |         y_spatial = self.bn1(y_spatial)
62 |         y_spatial = self.act(y_spatial)
63 | 
64 |         x_h, x_w = torch.split(y_spatial, [h, w], dim=2)
65 |         x_w = x_w.permute(0, 1, 3, 2)
66 | 
67 |         a_h = self.conv_h(x_h).sigmoid()
68 |         a_w = self.conv_w(x_w).sigmoid()
69 | 
70 |         # Frequency domain attention
71 |         x_freq = torch.fft.fft2(x)
72 |         x_freq = torch.abs(x_freq)  # Use magnitude spectrum
73 |         x_freq = self.conv_freq(x_freq)
74 |         x_freq = self.bn_freq(x_freq)
75 |         x_freq = self.act(x_freq)
76 | 
77 |         # Weighted sum of spatial and frequency domain attention
78 |         out = (1 - self.freq_weight) * identity * a_w * a_h + self.freq_weight * x_freq
79 | 
80 |         return out
81 | 
82 | if __name__ == '__main__':
83 |     x = torch.randn(2, 64, 32, 32)
84 |     att = CoordAtt(inp=64, reduction=32)
85 |     out = att(x)
86 |     print("输入尺寸:", x.shape)
87 |     print("输出尺寸:", out.shape)
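Several of the idea files above end with the same evaluation request: compare parameter count and runtime against the baseline CoordAtt. A minimal benchmarking sketch is shown below; `benchmark` is a hypothetical helper that is not part of this repository, and it assumes `CoordAtt` has been imported from whichever variant file is being measured.

import time
import torch

def benchmark(module, x, warmup=5, iters=50):
    # Parameter count plus rough CPU wall-clock time per forward pass
    params = sum(p.numel() for p in module.parameters())
    module.eval()
    with torch.no_grad():
        for _ in range(warmup):
            module(x)                      # warm-up runs are not timed
        start = time.perf_counter()
        for _ in range(iters):
            module(x)
        avg_sec = (time.perf_counter() - start) / iters
    return params, avg_sec

if __name__ == '__main__':
    x = torch.randn(2, 64, 32, 32)
    att = CoordAtt(inp=64, reduction=32)   # any variant defined above
    params, sec = benchmark(att, x)
    print(f"params: {params}, avg forward: {sec * 1e3:.2f} ms")

The same harness can be pointed at two variants in turn to get a like-for-like comparison on identical inputs.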
-------------------------------------------------------------------------------- /generation_idea_template/coordattention/code/xxx-se_coordatt.py: -------------------------------------------------------------------------------- 1 | """ 2 | Integrate a Squeeze-and-Excitation (SE) block within the CoordAtt module 3 | Before applying the coordinate attention, add an SE block that squeezes the spatial dimensions and excites channels based on global average pooling 4 | Modify the forward method to include this SE block before the existing coordinate attention operations 5 | Evaluate the effectiveness by testing on a small benchmark dataset, comparing feature representation and attention quality to the original CoordAtt, and analyzing the computational overhead and parameter count 6 | 7 | """ 8 | 9 | # 创新不足 10 | 11 | # Modified code 12 | import torch 13 | import torch.nn as nn 14 | import torch.nn.functional as F 15 | 16 | 17 | class h_sigmoid(nn.Module): 18 | def __init__(self, inplace=True): 19 | super(h_sigmoid, self).__init__() 20 | self.relu = nn.ReLU6(inplace=inplace) 21 | 22 | def forward(self, x): 23 | return self.relu(x + 3) / 6 24 | 25 | 26 | class h_swish(nn.Module): 27 | def __init__(self, inplace=True): 28 | super(h_swish, self).__init__() 29 | self.sigmoid = h_sigmoid(inplace=inplace) 30 | 31 | def forward(self, x): 32 | return x * self.sigmoid(x) 33 | 34 | 35 | class SEBlock(nn.Module): 36 | def __init__(self, inp, reduction=16): 37 | super(SEBlock, self).__init__() 38 | self.global_avgpool = nn.AdaptiveAvgPool2d(1) 39 | self.fc1 = nn.Conv2d(inp, inp // reduction, kernel_size=1, padding=0) 40 | self.relu = nn.ReLU(inplace=True) 41 | self.fc2 = nn.Conv2d(inp // reduction, inp, kernel_size=1, padding=0) 42 | self.sigmoid = nn.Sigmoid() 43 | 44 | def forward(self, x): 45 | se = self.global_avgpool(x) 46 | se = self.fc1(se) 47 | se = self.relu(se) 48 | se = self.fc2(se) 49 | se = self.sigmoid(se) 50 | return x * se 51 | 52 | 53 | class CoordAtt(nn.Module): 54 | def __init__(self, inp, reduction=32): 55 | super(CoordAtt, self).__init__() 56 | self.se_block = SEBlock(inp, reduction=reduction) 57 | self.pool_h = nn.AdaptiveAvgPool2d((None, 1)) 58 | self.pool_w = nn.AdaptiveAvgPool2d((1, None)) 59 | 60 | mip = max(8, inp // reduction) 61 | 62 | self.conv1 = nn.Conv2d(inp, mip, kernel_size=1, stride=1, padding=0) 63 | self.bn1 = nn.BatchNorm2d(mip) 64 | self.act = h_swish() 65 | 66 | self.conv_h = nn.Conv2d(mip, inp, kernel_size=1, stride=1, padding=0) 67 | self.conv_w = nn.Conv2d(mip, inp, kernel_size=1, stride=1, padding=0) 68 | 69 | def forward(self, x): 70 | # Apply SE block 71 | x = self.se_block(x) 72 | identity = x 73 | 74 | n, c, h, w = x.size() 75 | x_h = self.pool_h(x) 76 | x_w = self.pool_w(x).permute(0, 1, 3, 2) 77 | 78 | y = torch.cat([x_h, x_w], dim=2) 79 | y = self.conv1(y) 80 | y = self.bn1(y) 81 | y = self.act(y) 82 | 83 | x_h, x_w = torch.split(y, [h, w], dim=2) 84 | x_w = x_w.permute(0, 1, 3, 2) 85 | 86 | a_h = self.conv_h(x_h).sigmoid() 87 | a_w = self.conv_w(x_w).sigmoid() 88 | 89 | out = identity * a_w * a_h 90 | 91 | return out 92 | 93 | if __name__ == '__main__': 94 | x = torch.randn(2, 64, 32, 32) 95 | att = CoordAtt(inp=64, reduction=32) 96 | out = att(x) 97 | print("输入尺寸:", x.shape) 98 | print("输出尺寸:", out.shape) -------------------------------------------------------------------------------- /generation_idea_template/coordattention/code/xxx-shared_params_coordatt.py: -------------------------------------------------------------------------------- 1 | """ 2 | 
2 | Modify the CoordAtt module to introduce a shared parameter block accessible to multiple network components, such as convolutional layers and activation functions
3 | Implement this shared parameter block as a separate module with learnable parameters
4 | Integrate it into the CoordAtt module by adjusting the conv1, conv_h, and conv_w layers to utilize these shared parameters
5 | Evaluate the parameter count reduction and computational efficiency by testing on a small benchmark dataset, comparing performance and representation quality with the original CoordAtt and other variants
6 | 
7 | """
8 | 
9 | # Modified code
10 | 
11 | import torch
12 | import torch.nn as nn
13 | import torch.nn.functional as F
14 | 
15 | class h_sigmoid(nn.Module):
16 |     def __init__(self, inplace=True):
17 |         super(h_sigmoid, self).__init__()
18 |         self.relu = nn.ReLU6(inplace=inplace)
19 | 
20 |     def forward(self, x):
21 |         return self.relu(x + 3) / 6
22 | 
23 | class h_swish(nn.Module):
24 |     def __init__(self, inplace=True):
25 |         super(h_swish, self).__init__()
26 |         self.sigmoid = h_sigmoid(inplace=inplace)
27 | 
28 |     def forward(self, x):
29 |         return x * self.sigmoid(x)
30 | 
31 | # Shared parameter block to be used in multiple layers
32 | class SharedParameterBlock(nn.Module):
33 |     def __init__(self, out_channels):
34 |         super(SharedParameterBlock, self).__init__()
35 |         self.weight = nn.Parameter(torch.randn(1, out_channels, 1, 1))  # shaped to broadcast over batch and spatial dims
36 |         self.bias = nn.Parameter(torch.zeros(1, out_channels, 1, 1))
37 | 
38 |     def forward(self, x):
39 |         return x * self.weight + self.bias
40 | 
41 | class CoordAtt(nn.Module):
42 |     def __init__(self, inp, reduction=32):
43 |         super(CoordAtt, self).__init__()
44 |         self.pool_h = nn.AdaptiveAvgPool2d((None, 1))
45 |         self.pool_w = nn.AdaptiveAvgPool2d((1, None))
46 | 
47 |         mip = max(8, inp // reduction)
48 | 
49 |         # Shared parameter blocks: one at the bottleneck width, one shared by conv_h and conv_w
50 |         self.shared_params = SharedParameterBlock(mip)
51 |         self.shared_params_out = SharedParameterBlock(inp)
52 |         self.conv1 = nn.Conv2d(inp, mip, kernel_size=1, stride=1, padding=0)
53 |         self.bn1 = nn.BatchNorm2d(mip)
54 |         self.act = h_swish()
55 | 
56 |         self.conv_h = nn.Conv2d(mip, inp, kernel_size=1, stride=1, padding=0)
57 |         self.conv_w = nn.Conv2d(mip, inp, kernel_size=1, stride=1, padding=0)
58 | 
59 |     def forward(self, x):
60 |         identity = x
61 | 
62 |         n, c, h, w = x.size()
63 |         x_h = self.pool_h(x)
64 |         x_w = self.pool_w(x).permute(0, 1, 3, 2)
65 | 
66 |         y = torch.cat([x_h, x_w], dim=2)
67 |         y = self.conv1(y)
68 |         y = self.bn1(y)
69 |         y = self.act(y)
70 | 
71 |         # Apply shared parameters
72 |         y = self.shared_params(y)
73 | 
74 |         x_h, x_w = torch.split(y, [h, w], dim=2)
75 |         x_w = x_w.permute(0, 1, 3, 2)
76 | 
77 |         # The same block modulates both the conv_h and conv_w outputs (inp channels)
78 |         a_h = self.shared_params_out(self.conv_h(x_h)).sigmoid()
79 |         a_w = self.shared_params_out(self.conv_w(x_w)).sigmoid()
80 | 
81 |         out = identity * a_w * a_h
82 | 
83 |         return out
84 | 
85 | if __name__ == '__main__':
86 |     x = torch.randn(2, 64, 32, 32)
87 |     att = CoordAtt(inp=64, reduction=32)
88 |     out = att(x)
89 |     print("Input size:", x.shape)
90 |     print("Output size:", out.shape)
--------------------------------------------------------------------------------
/generation_idea_template/coordattention/code/xxx-probabilistic_coordatt.py:
--------------------------------------------------------------------------------
1 | """
2 | Modify the CoordAtt module to introduce a probabilistic attention mechanism
3 | Implement a stochastic gating mechanism where attention weights are sampled from a Gaussian distribution with learnable parameters (mean and variance)
4 | Adjust the forward method to compute these probabilistic attention weights and integrate them into the feature modulation process
5 | Evaluate the impact on feature representation and robustness by testing on a small benchmark dataset, comparing the performance to the original CoordAtt and other variants, and analyzing the uncertainty estimates
6 | 
7 | """
8 | 
9 | import torch
10 | import torch.nn as nn
11 | import torch.nn.functional as F
12 | 
13 | class h_sigmoid(nn.Module):
14 |     def __init__(self, inplace=True):
15 |         super(h_sigmoid, self).__init__()
16 |         self.relu = nn.ReLU6(inplace=inplace)
17 | 
18 |     def forward(self, x):
19 |         return self.relu(x + 3) / 6
20 | 
21 | class h_swish(nn.Module):
22 |     def __init__(self, inplace=True):
23 |         super(h_swish, self).__init__()
24 |         self.sigmoid = h_sigmoid(inplace=inplace)
25 | 
26 |     def forward(self, x):
27 |         return x * self.sigmoid(x)
28 | 
29 | class ProbabilisticAttention(nn.Module):
30 |     def __init__(self, channels):
31 |         super(ProbabilisticAttention, self).__init__()
32 |         self.mean = nn.Parameter(torch.zeros(1, channels, 1, 1))
33 |         self.log_var = nn.Parameter(torch.zeros(1, channels, 1, 1))
34 | 
35 |     def forward(self, x):
36 |         std = torch.exp(0.5 * self.log_var)
37 |         epsilon = torch.randn_like(std)
38 |         attention_weights = self.mean + std * epsilon  # reparameterized Gaussian sample
39 |         return torch.sigmoid(attention_weights)
40 | 
41 | class CoordAtt(nn.Module):
42 |     def __init__(self, inp, reduction=32):
43 |         super(CoordAtt, self).__init__()
44 |         self.pool_h = nn.AdaptiveAvgPool2d((None, 1))
45 |         self.pool_w = nn.AdaptiveAvgPool2d((1, None))
46 | 
47 |         mip = max(8, inp // reduction)
48 | 
49 |         self.conv1 = nn.Conv2d(inp, mip, kernel_size=1, stride=1, padding=0)
50 |         self.bn1 = nn.BatchNorm2d(mip)
51 |         self.act = h_swish()
52 | 
53 |         self.conv_h = nn.Conv2d(mip, inp, kernel_size=1, stride=1, padding=0)
54 |         self.conv_w = nn.Conv2d(mip, inp, kernel_size=1, stride=1, padding=0)
55 | 
56 |         # Introduce probabilistic attention
57 |         self.prob_att_h = ProbabilisticAttention(inp)
58 |         self.prob_att_w = ProbabilisticAttention(inp)
59 | 
60 |     def forward(self, x):
61 |         identity = x
62 | 
63 |         n, c, h, w = x.size()
64 |         x_h = self.pool_h(x)
65 |         x_w = self.pool_w(x).permute(0, 1, 3, 2)
66 | 
67 |         y = torch.cat([x_h, x_w], dim=2)
68 |         y = self.conv1(y)
69 |         y = self.bn1(y)
70 |         y = self.act(y)
71 | 
72 |         x_h, x_w = torch.split(y, [h, w], dim=2)
73 |         x_w = x_w.permute(0, 1, 3, 2)
74 | 
75 |         a_h = self.conv_h(x_h)
76 |         a_w = self.conv_w(x_w)
77 | 
78 |         # Apply stochastic gates on top of the deterministic attention maps
79 |         pa_h = a_h.sigmoid() * self.prob_att_h(a_h)
80 |         pa_w = a_w.sigmoid() * self.prob_att_w(a_w)
81 | 
82 |         out = identity * pa_w * pa_h
83 | 
84 |         return out
85 | 
86 | if __name__ == '__main__':
87 |     x = torch.randn(2, 64, 32, 32)
88 |     att = CoordAtt(inp=64, reduction=32)
89 |     out = att(x)
90 |     print("Input size:", x.shape)
91 |     print("Output size:", out.shape)
--------------------------------------------------------------------------------
/generation_idea_template/small_object_attention/code/denoising_attention.py:
--------------------------------------------------------------------------------
1 | """
2 | Integrate a lightweight denoising autoencoder within the SEAttention framework
3 | Implement an encoder-decoder structure focused on feature compression and noise reduction
4 | Modify the forward function to pass input through the autoencoder before applying channel attention
5 | Optimize the autoencoder's parameters using a transfer learning approach, ensuring it is tailored for small target detection
6 | Evaluate performance by comparing detection accuracy and attention map clarity on small targets with and without the denoising mechanism, using synthetic datasets
7 | 
8 | """
9 | 
10 | # Modified code
11 | 
12 | import numpy as np
13 | import torch
14 | from torch import flatten, nn
15 | from torch.nn import init
16 | from torch.nn.modules.activation import ReLU
17 | from torch.nn.modules.batchnorm import BatchNorm2d
18 | from torch.nn import functional as F
19 | 
20 | class DenoisingAutoencoder(nn.Module):
21 |     def __init__(self, channel=512, latent_dim=128):
22 |         super().__init__()
23 |         # Encoder
24 |         self.encoder = nn.Sequential(
25 |             nn.Conv2d(channel, channel // 2, kernel_size=3, padding=1),
26 |             nn.ReLU(inplace=True),
27 |             nn.Conv2d(channel // 2, latent_dim, kernel_size=3, padding=1),
28 |             nn.ReLU(inplace=True)
29 |         )
30 |         # Decoder
31 |         self.decoder = nn.Sequential(
32 |             nn.Conv2d(latent_dim, channel // 2, kernel_size=3, padding=1),
33 |             nn.ReLU(inplace=True),
34 |             nn.Conv2d(channel // 2, channel, kernel_size=3, padding=1),
35 |             nn.Sigmoid()
36 |         )
37 | 
38 |     def forward(self, x):
39 |         encoded = self.encoder(x)
40 |         decoded = self.decoder(encoded)
41 |         return decoded
42 | 
43 | class SEAttention(nn.Module):
44 |     def __init__(self, channel=512, reduction=16):
45 |         super().__init__()
46 |         self.denoising_autoencoder = DenoisingAutoencoder(channel)
47 |         self.avg_pool = nn.AdaptiveAvgPool2d(1)
48 |         self.fc = nn.Sequential(
49 |             nn.Linear(channel, channel // reduction, bias=False),
50 |             nn.ReLU(inplace=True),
51 |             nn.Linear(channel // reduction, channel, bias=False),
52 |             nn.Sigmoid()
53 |         )
54 | 
55 |     def init_weights(self):
56 |         for m in self.modules():
57 |             if isinstance(m, nn.Conv2d):
58 |                 init.kaiming_normal_(m.weight, mode='fan_out')
59 |                 if m.bias is not None:
60 |                     init.constant_(m.bias, 0)
61 |             elif isinstance(m, nn.BatchNorm2d):
62 |                 init.constant_(m.weight, 1)
63 |                 init.constant_(m.bias, 0)
64 |             elif isinstance(m, nn.Linear):
65 |                 init.normal_(m.weight, std=0.001)
66 |                 if m.bias is not None:
67 |                     init.constant_(m.bias, 0)
68 | 
69 |     def forward(self, x):
70 |         # Pass input through the denoising autoencoder
71 |         denoised = self.denoising_autoencoder(x)
72 | 
73 |         # SEAttention mechanism
74 |         b, c, _, _ = denoised.size()
75 |         y = self.avg_pool(denoised).view(b, c)
76 |         y = self.fc(y).view(b, c, 1, 1)
77 |         return denoised * y.expand_as(denoised)
78 | 
79 | if __name__ == '__main__':
80 |     model = SEAttention()
81 |     model.init_weights()
82 |     input = torch.randn(1, 512, 7, 7)
83 |     output = model(input)
84 |     print(output.shape)
--------------------------------------------------------------------------------
/generation_idea_template/coordattention-gemini/code/modulated_post_sigmoid_attention.py:
--------------------------------------------------------------------------------
1 | """
2 | Modify the `CoordAtt` module
3 | After pooling the height and width features, apply a *single* 1x1 convolution to each of them
4 | This 1x1 conv will project the pooled features to a lower dimension and also transform them for fusion
5 | Apply the sigmoid activation to the transformed features
6 | Introduce a learnable parameter for each of the sigmoid-activated features
7 | Multiply the sigmoid-activated feature with the learnable parameter
8 | Concatenate the modulated height and width attention maps along the channel dimension
9 | Apply a ReLU activation to the concatenated feature map
10 | Then, apply a final 1x1 convolution to the ReLU activated feature map to create the combined attention map
11 | Use this combined attention map to modulate the input feature map
12 | Modify the `__init__` to include the initial 1x1 convolutions, the final 1x1 convolution for fusion, and the learnable parameters
13 | Modify the `forward` to implement the initial 1x1 convolutions, sigmoid activation, modulation with learnable parameters, concatenation, ReLU activation, the final 1x1 convolution for fusion and modulate the input feature map
14 | The output can be compared to the baseline using the same test input and observing the changes in output
15 | 
16 | """
17 | 
18 | # Modified code
19 | import torch
20 | import torch.nn as nn
21 | import torch.nn.functional as F
22 | 
23 | 
24 | class h_sigmoid(nn.Module):
25 |     def __init__(self, inplace=True):
26 |         super(h_sigmoid, self).__init__()
27 |         self.relu = nn.ReLU6(inplace=inplace)
28 | 
29 |     def forward(self, x):
30 |         return self.relu(x + 3) / 6
31 | 
32 | 
33 | class h_swish(nn.Module):
34 |     def __init__(self, inplace=True):
35 |         super(h_swish, self).__init__()
36 |         self.sigmoid = h_sigmoid(inplace=inplace)
37 | 
38 |     def forward(self, x):
39 |         return x * self.sigmoid(x)
40 | 
41 | 
42 | class CoordAtt(nn.Module):
43 |     def __init__(self, inp, reduction=32):
44 |         super(CoordAtt, self).__init__()
45 |         self.pool_h = nn.AdaptiveAvgPool2d((None, 1))
46 |         self.pool_w = nn.AdaptiveAvgPool2d((1, None))
47 | 
48 |         mip = max(8, inp // reduction)
49 | 
50 |         self.conv1_h = nn.Conv2d(inp, mip, kernel_size=1, stride=1, padding=0)
51 |         self.conv1_w = nn.Conv2d(inp, mip, kernel_size=1, stride=1, padding=0)
52 | 
53 |         self.param_h = nn.Parameter(torch.randn(1, mip, 1, 1) * 0.02)
54 |         self.param_w = nn.Parameter(torch.randn(1, mip, 1, 1) * 0.02)
55 | 
56 |         self.conv_fusion = nn.Conv2d(mip*2, inp, kernel_size=1, stride=1, padding=0)
57 | 
58 |         self.relu = nn.ReLU()
59 | 
60 | 
61 |     def forward(self, x):
62 |         identity = x
63 | 
64 |         n, c, h, w = x.size()
65 |         x_h = self.pool_h(x)
66 |         x_w = self.pool_w(x)
67 | 
68 |         x_w = x_w.permute(0, 1, 3, 2)
69 | 
70 |         x_h = self.conv1_h(x_h)
71 |         x_w = self.conv1_w(x_w)
72 | 
73 |         a_h = torch.sigmoid(x_h) * self.param_h  # (n, mip, h, 1)
74 |         a_w = torch.sigmoid(x_w) * self.param_w  # (n, mip, w, 1)
75 | 
76 |         y = torch.cat([a_h.expand(-1, -1, h, w), a_w.permute(0, 1, 3, 2).expand(-1, -1, h, w)], dim=1)  # broadcast both maps to (h, w) so the concat works for non-square inputs
77 | 
78 |         y = self.relu(y)
79 |         y = self.conv_fusion(y)
80 | 
81 | 
82 |         out = identity * y
83 | 
84 |         return out
85 | 
86 | if __name__ == '__main__':
87 |     x = torch.randn(2, 64, 32, 32)
88 |     att = CoordAtt(inp=64, reduction=32)
89 |     out = att(x)
90 |     print("Input size:", x.shape)
91 |     print("Output size:", out.shape)
--------------------------------------------------------------------------------
/generation_idea_template/coordattention/code/xxa-edge_aware_coordatt.py:
--------------------------------------------------------------------------------
1 | """
2 | Integrate a lightweight edge detection layer, such as a Sobel filter or small trainable convolutional layer, within the CoordAtt module
3 | Modify the forward method to compute edge maps and integrate them by modulating the attention weights
4 | Evaluate the impact on feature representation by testing on a small benchmark dataset, comparing performance to the original CoordAtt, and monitoring computational efficiency
5 | 
6 | """
7 | 
8 | # xxa
9 | 
10 | import torch
11 | import torch.nn as nn
12 | import torch.nn.functional as F
13 | import torchvision.transforms as transforms
14 | 
15 | class h_sigmoid(nn.Module):
16 |     def __init__(self, inplace=True):
17 |         super(h_sigmoid, self).__init__()
18 |         self.relu = nn.ReLU6(inplace=inplace)
19 | 
20 |     def forward(self, x):
21 |         return self.relu(x + 3) / 6
22 | 
23 | 
24 | class h_swish(nn.Module):
25 |     def __init__(self, inplace=True):
26 |         super(h_swish, self).__init__()
27 |         self.sigmoid = h_sigmoid(inplace=inplace)
28 | 
29 |     def forward(self, x):
30 |         return x * self.sigmoid(x)
31 | 
32 | 
33 | class CoordAtt(nn.Module):
34 |     def __init__(self, inp, reduction=32):
35 |         super(CoordAtt, self).__init__()
36 |         self.pool_h = nn.AdaptiveAvgPool2d((None, 1))
37 |         self.pool_w = nn.AdaptiveAvgPool2d((1, None))
38 | 
39 |         mip = max(8, inp // reduction)
40 | 
41 |         self.conv1 = nn.Conv2d(inp, mip, kernel_size=1, stride=1, padding=0)
42 |         self.bn1 = nn.BatchNorm2d(mip)
43 |         self.act = h_swish()
44 | 
45 |         self.conv_h = nn.Conv2d(mip, inp, kernel_size=1, stride=1, padding=0)
46 |         self.conv_w = nn.Conv2d(mip, inp, kernel_size=1, stride=1, padding=0)
47 | 
48 |         # Edge detection using Sobel filters (fixed, non-trainable weights)
49 |         self.sobel_x = nn.Conv2d(inp, 1, kernel_size=3, stride=1, padding=1, bias=False)
50 |         self.sobel_y = nn.Conv2d(inp, 1, kernel_size=3, stride=1, padding=1, bias=False)
51 |         sobel_kernel_x = torch.tensor([[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]], dtype=torch.float32).expand(1, inp, 3, 3)
52 |         sobel_kernel_y = torch.tensor([[-1, -2, -1], [0, 0, 0], [1, 2, 1]], dtype=torch.float32).expand(1, inp, 3, 3)
53 |         self.sobel_x.weight = nn.Parameter(sobel_kernel_x, requires_grad=False)
54 |         self.sobel_y.weight = nn.Parameter(sobel_kernel_y, requires_grad=False)
55 | 
56 |     def forward(self, x):
57 |         identity = x
58 | 
59 |         # Compute edge maps
60 |         edge_x = self.sobel_x(x)
61 |         edge_y = self.sobel_y(x)
62 |         edge_map = torch.sqrt(edge_x ** 2 + edge_y ** 2)
63 |         edge_map = edge_map.sigmoid()  # Normalize edge map
64 | 
65 |         n, c, h, w = x.size()
66 |         x_h = self.pool_h(x)
67 |         x_w = self.pool_w(x).permute(0, 1, 3, 2)
68 | 
69 |         y = torch.cat([x_h, x_w], dim=2)
70 |         y = self.conv1(y)
71 |         y = self.bn1(y)
72 |         y = self.act(y)
73 | 
74 |         x_h, x_w = torch.split(y, [h, w], dim=2)
75 |         x_w = x_w.permute(0, 1, 3, 2)
76 | 
77 |         a_h = self.conv_h(x_h).sigmoid()
78 |         a_w = self.conv_w(x_w).sigmoid()
79 | 
80 |         # Integrate edge map by modulating attention
81 |         out = identity * a_w * a_h * edge_map
82 | 
83 |         return out
84 | 
85 | if __name__ == '__main__':
86 |     x = torch.randn(2, 64, 32, 32)
87 |     att = CoordAtt(inp=64, reduction=32)
88 |     out = att(x)
89 |     print("Input size:", x.shape)
90 |     print("Output size:", out.shape)
--------------------------------------------------------------------------------
/generation_idea_template/probiou/experiment.py:
--------------------------------------------------------------------------------
1 | import math
2 | import torch
3 | 
4 | 
5 | def _get_covariance_matrix(boxes):
6 |     """
7 |     Generate covariance matrices from OBBs.
8 | 
9 |     Args:
10 |         boxes (torch.Tensor): A tensor of shape (N, 5) representing rotated bounding boxes, with xywhr format.
11 | 
12 |     Returns:
13 |         (torch.Tensor): Covariance matrices corresponding to original rotated bounding boxes.
14 |     """
15 |     # Gaussian bounding boxes, ignore the center points (the first two columns) because they are not needed here.
16 |     gbbs = torch.cat((boxes[:, 2:4].pow(2) / 12, boxes[:, 4:]), dim=-1)
17 |     a, b, c = gbbs.split(1, dim=-1)
18 |     cos = c.cos()
19 |     sin = c.sin()
20 |     cos2 = cos.pow(2)
21 |     sin2 = sin.pow(2)
22 |     return a * cos2 + b * sin2, a * sin2 + b * cos2, (a - b) * cos * sin
23 | 
24 | 
25 | def probiou(obb1, obb2, CIoU=False, eps=1e-7):
26 |     """
27 |     Calculate probabilistic IoU between oriented bounding boxes.
28 | 
29 |     Implements the algorithm from https://arxiv.org/pdf/2106.06072v1.pdf.
30 | 
31 |     Args:
32 |         obb1 (torch.Tensor): Ground truth OBBs, shape (N, 5), format xywhr.
33 |         obb2 (torch.Tensor): Predicted OBBs, shape (N, 5), format xywhr.
34 |         CIoU (bool, optional): If True, calculate CIoU. Defaults to False.
35 |         eps (float, optional): Small value to avoid division by zero. Defaults to 1e-7.
36 | 
37 |     Returns:
38 |         (torch.Tensor): OBB similarities, shape (N,).
39 | 
40 |     Note:
41 |         OBB format: [center_x, center_y, width, height, rotation_angle].
42 |         If CIoU is True, returns CIoU instead of IoU.
43 |     """
44 |     x1, y1 = obb1[..., :2].split(1, dim=-1)
45 |     x2, y2 = obb2[..., :2].split(1, dim=-1)
46 |     a1, b1, c1 = _get_covariance_matrix(obb1)
47 |     a2, b2, c2 = _get_covariance_matrix(obb2)
48 | 
49 |     t1 = (
50 |         ((a1 + a2) * (y1 - y2).pow(2) + (b1 + b2) * (x1 - x2).pow(2)) / ((a1 + a2) * (b1 + b2) - (c1 + c2).pow(2) + eps)
51 |     ) * 0.25
52 |     t2 = (((c1 + c2) * (x2 - x1) * (y1 - y2)) / ((a1 + a2) * (b1 + b2) - (c1 + c2).pow(2) + eps)) * 0.5
53 |     t3 = (
54 |         ((a1 + a2) * (b1 + b2) - (c1 + c2).pow(2))
55 |         / (4 * ((a1 * b1 - c1.pow(2)).clamp_(0) * (a2 * b2 - c2.pow(2)).clamp_(0)).sqrt() + eps)
56 |         + eps
57 |     ).log() * 0.5
58 |     bd = (t1 + t2 + t3).clamp(eps, 100.0)
59 |     hd = (1.0 - (-bd).exp() + eps).sqrt()
60 |     iou = 1 - hd
61 | 
62 |     if CIoU:  # only include the wh aspect ratio part
63 |         w1, h1 = obb1[..., 2:4].split(1, dim=-1)
64 |         w2, h2 = obb2[..., 2:4].split(1, dim=-1)
65 |         v = (4 / math.pi**2) * ((w2 / h2).atan() - (w1 / h1).atan()).pow(2)
66 |         with torch.no_grad():
67 |             alpha = v / (v - iou + (1 + eps))
68 |         return iou - v * alpha  # CIoU
69 |     return iou
70 | 
71 | if __name__ == "__main__":
72 |     # Define two OBBs
73 |     obb1 = torch.tensor([[0.0, 0.0, 2.0, 4.0, 0.0]])  # [x, y, w, h, r=0°]
74 |     obb2 = torch.tensor([[5.0, 5.0, 6.0, 2.0, math.radians(45)]])  # [x, y, w, h, r=45°]
75 |     print("OBB1:", obb1)
76 |     print("OBB2:", obb2)
77 | 
78 |     # Call _get_covariance_matrix
79 |     a1, b1, c1 = _get_covariance_matrix(obb1)
80 |     a2, b2, c2 = _get_covariance_matrix(obb2)
81 |     print("Covariance components of OBB1:", a1, b1, c1)
82 |     print("Covariance components of OBB2:", a2, b2, c2)
83 | 
84 |     # Compute probabilistic IoU
85 |     iou = probiou(obb1, obb2, CIoU=False)
86 |     print("ProbIoU:", iou)
87 | 
88 |     # Compute CIoU
89 |     ciou = probiou(obb1, obb2, CIoU=True)
90 |     print("CIoU:", ciou)
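91 | 
92 |     # Hedged usage sketch (an illustrative addition): probiou is differentiable
93 |     # end-to-end, so 1 - probiou can serve directly as a regression loss for
94 |     # oriented boxes.
95 |     pred = torch.tensor([[0.5, 0.5, 2.0, 4.0, 0.1]], requires_grad=True)
96 |     loss = (1.0 - probiou(pred, obb1)).mean()
97 |     loss.backward()
98 |     print("Gradient w.r.t. the predicted box:", pred.grad)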
--------------------------------------------------------------------------------
/generation_idea_template/coordattention/code/aaa-freq_domain_coordatt.py:
--------------------------------------------------------------------------------
1 | """
2 | Integrate a frequency domain analysis step within the CoordAtt module
3 | Implement a function to perform Fast Fourier Transform (FFT) on the input feature maps, focusing on extracting significant frequency components
4 | Use these components to modulate the attention weights in the CoordAtt module
5 | Modify the forward method to incorporate this frequency domain information before applying the spatial attention mechanism
6 | Evaluate the impact on feature representation using metrics such as accuracy, feature representation quality, and computational efficiency by testing on a small benchmark dataset
7 | Compare the performance to the original CoordAtt and other variants
8 | 
9 | """
10 | ### aaa
11 | # Modified code
12 | import torch
13 | import torch.nn as nn
14 | import torch.nn.functional as F
15 | import torch.fft
16 | 
17 | class h_sigmoid(nn.Module):
18 |     def __init__(self, inplace=True):
19 |         super(h_sigmoid, self).__init__()
20 |         self.relu = nn.ReLU6(inplace=inplace)
21 | 
22 |     def forward(self, x):
23 |         return self.relu(x + 3) / 6
24 | 
25 | class h_swish(nn.Module):
26 |     def __init__(self, inplace=True):
27 |         super(h_swish, self).__init__()
28 |         self.sigmoid = h_sigmoid(inplace=inplace)
29 | 
30 |     def forward(self, x):
31 |         return x * self.sigmoid(x)
32 | 
33 | class CoordAtt(nn.Module):
34 |     def __init__(self, inp, reduction=32):
35 |         super(CoordAtt, self).__init__()
36 |         self.pool_h = nn.AdaptiveAvgPool2d((None, 1))
37 |         self.pool_w = nn.AdaptiveAvgPool2d((1, None))
38 | 
39 |         mip = max(8, inp // reduction)
40 | 
41 |         self.conv1 = nn.Conv2d(inp, mip, kernel_size=1, stride=1, padding=0)
42 |         self.bn1 = nn.BatchNorm2d(mip)
43 |         self.act = h_swish()
44 | 
45 |         self.conv_h = nn.Conv2d(mip, inp, kernel_size=1, stride=1, padding=0)
46 |         self.conv_w = nn.Conv2d(mip, inp, kernel_size=1, stride=1, padding=0)
47 | 
48 |     def forward(self, x):
49 |         identity = x
50 | 
51 |         # Compute FFT on a downsampled version of input feature maps
52 |         pool_x = F.adaptive_avg_pool2d(x, (x.size(2) // 2, x.size(3) // 2))
53 |         fft_x = torch.fft.fft2(pool_x)
54 |         fft_x = torch.fft.fftshift(fft_x)
55 | 
56 |         # Extract significant frequency components
57 |         freq_magnitude = torch.abs(fft_x)
58 | 
59 |         # Normalize frequency components to modulate attention
60 |         freq_magnitude = (freq_magnitude - freq_magnitude.min()) / (freq_magnitude.max() - freq_magnitude.min())
61 | 
62 |         n, c, h, w = x.size()
63 |         x_h = self.pool_h(x)
64 |         x_w = self.pool_w(x).permute(0, 1, 3, 2)
65 | 
66 |         y = torch.cat([x_h, x_w], dim=2)
67 |         y = self.conv1(y)
68 |         y = self.bn1(y)
69 |         y = self.act(y)
70 | 
71 |         x_h, x_w = torch.split(y, [h, w], dim=2)
72 |         x_w = x_w.permute(0, 1, 3, 2)
73 | 
74 |         a_h = self.conv_h(x_h).sigmoid()
75 |         a_w = self.conv_w(x_w).sigmoid()
76 | 
77 |         # Modulate attention weights with frequency magnitude
78 |         a_h = a_h * F.interpolate(freq_magnitude[:, :, :h, :], size=(h, 1))
79 |         a_w = a_w * F.interpolate(freq_magnitude[:, :, :, :w], size=(1, w))
80 | 
81 |         out = identity * a_w * a_h
82 | 
83 |         return out
84 | 
85 | if __name__ == '__main__':
86 |     x = torch.randn(2, 64, 32, 32)
87 |     att = CoordAtt(inp=64, reduction=32)
88 |     out = att(x)
89 |     print("Input size:", x.shape)
90 |     print("Output size:", out.shape)
--------------------------------------------------------------------------------
/generation_idea_template/coordattention/code/xaa-adaptive_coordatt.py:
--------------------------------------------------------------------------------
1 | """
2 | Implement a simple heuristic-based mechanism within the CoordAtt module to dynamically adjust attention parameters based on input feature map statistics such as variance or entropy
3 | Modify the forward method to incorporate this mechanism, allowing it to adaptively configure the attention strategy
4 | Evaluate the adaptability and performance on a small benchmark dataset, comparing it with the original CoordAtt and other versions, focusing on accuracy, feature representation quality, and computational efficiency
5 | The heuristic could be a rule-based system or a lightweight decision tree
6 | 
7 | """
8 | 
9 | # xaa: worth a try; xxx: tested and found not to work -- it learns nothing
10 | 
11 | # Modified code
12 | 
13 | import torch
14 | import torch.nn as nn
15 | import torch.nn.functional as F
16 | import numpy as np
17 | 
18 | class h_sigmoid(nn.Module):
19 |     def __init__(self, inplace=True):
20 |         super(h_sigmoid, self).__init__()
21 |         self.relu = nn.ReLU6(inplace=inplace)
22 | 
23 |     def forward(self, x):
24 |         return self.relu(x + 3) / 6
25 | 
26 | class h_swish(nn.Module):
27 |     def __init__(self, inplace=True):
28 |         super(h_swish, self).__init__()
29 |         self.sigmoid = h_sigmoid(inplace=inplace)
30 | 
31 |     def forward(self, x):
32 |         return x * self.sigmoid(x)
33 | 
34 | class ADACoordAtt(nn.Module):
35 |     def __init__(self, inp, reduction=32):
36 |         super(ADACoordAtt, self).__init__()
37 |         self.pool_h = nn.AdaptiveAvgPool2d((None, 1))
38 |         self.pool_w = nn.AdaptiveAvgPool2d((1, None))
39 | 
40 |         mip = max(8, inp // reduction)
41 | 
42 |         self.conv1 = nn.Conv2d(inp, mip, kernel_size=1, stride=1, padding=0)
43 |         self.bn1 = nn.BatchNorm2d(mip)
44 |         self.act = h_swish()
45 | 
46 |         self.conv_h = nn.Conv2d(mip, inp, kernel_size=1, stride=1, padding=0)
47 |         self.conv_w = nn.Conv2d(mip, inp, kernel_size=1, stride=1, padding=0)
48 | 
49 |     def compute_variance(self, x):
50 |         # Calculate the variance of the feature map
51 |         return torch.var(x, dim=(2, 3), keepdim=True)
52 | 
53 |     def forward(self, x):
54 |         identity = x
55 | 
56 |         n, c, h, w = x.size()
57 | 
58 |         # Compute variance and use it to adjust the attention
59 |         variance = self.compute_variance(x)
60 | 
61 |         # Heuristic rule: if variance is high, reduce the impact of attention
62 |         # Scale factor ranges from 0.5 to 1.0 based on variance
63 |         scale_factor = torch.clamp(1.0 - 0.5 * (variance / variance.max()), min=0.5, max=1.0)
64 | 
65 |         x_h = self.pool_h(x)
66 |         x_w = self.pool_w(x).permute(0, 1, 3, 2)
67 | 
68 |         y = torch.cat([x_h, x_w], dim=2)
69 |         y = self.conv1(y)
70 |         y = self.bn1(y)
71 |         y = self.act(y)
72 | 
73 |         x_h, x_w = torch.split(y, [h, w], dim=2)
74 |         x_w = x_w.permute(0, 1, 3, 2)
75 | 
76 |         a_h = self.conv_h(x_h).sigmoid()
77 |         a_w = self.conv_w(x_w).sigmoid()
78 | 
79 |         # Apply the scale factor to the attention maps
80 |         a_h = a_h * scale_factor
81 |         a_w = a_w * scale_factor
82 | 
83 |         out = identity * a_w * a_h
84 | 
85 |         return out
86 | 
87 | if __name__ == '__main__':
88 |     x = torch.randn(2, 64, 32, 32)
89 |     att = ADACoordAtt(inp=64, reduction=32)
90 |     out = att(x)
91 |     print("Input size:", x.shape)
92 |     print("Output size:", out.shape)
--------------------------------------------------------------------------------
/generation_idea_template/coordattention/code/xxx-content_adaptive_coordatt.py:
--------------------------------------------------------------------------------
1 | """
2 | Modify the `CoordAtt` class to incorporate a content-adaptive attention mechanism
3 | Implement a self-attention-like operation where attention weights are computed based on the cosine similarity between features
4 | Integrate this mechanism before the existing coordinate attention operations, allowing attention weights to be modulated based on input content
5 | Evaluate the benefits of this approach by assessing feature representation quality and comparing performance on a small benchmark dataset against the original CoordAtt
6 | Monitor computational efficiency and parameter count to ensure the approach remains lightweight
7 | 
8 | """
9 | 
10 | # Limited novelty
11 | # Modified code
12 | 
13 | import torch
14 | import torch.nn as nn
15 | import torch.nn.functional as F
16 | 
17 | 
18 | class h_sigmoid(nn.Module):
19 |     def __init__(self, inplace=True):
20 |         super(h_sigmoid, self).__init__()
21 |         self.relu = nn.ReLU6(inplace=inplace)
22 | 
23 |     def forward(self, x):
24 |         return self.relu(x + 3) / 6
25 | 
26 | 
27 | class h_swish(nn.Module):
28 |     def __init__(self, inplace=True):
29 |         super(h_swish, self).__init__()
30 |         self.sigmoid = h_sigmoid(inplace=inplace)
31 | 
32 |     def forward(self, x):
33 |         return x * self.sigmoid(x)
34 | 
35 | 
36 | class CoordAtt(nn.Module):
37 |     def __init__(self, inp, reduction=32):
38 |         super(CoordAtt, self).__init__()
39 |         self.pool_h = nn.AdaptiveAvgPool2d((None, 1))
40 |         self.pool_w = nn.AdaptiveAvgPool2d((1, None))
41 | 
42 |         mip = max(8, inp // reduction)
43 | 
44 |         self.conv1 = nn.Conv2d(inp, mip, kernel_size=1, stride=1, padding=0)
45 |         self.bn1 = nn.BatchNorm2d(mip)
46 |         self.act = h_swish()
47 | 
48 |         self.conv_h = nn.Conv2d(mip, inp, kernel_size=1, stride=1, padding=0)
49 |         self.conv_w = nn.Conv2d(mip, inp, kernel_size=1, stride=1, padding=0)
50 | 
51 |         # Self-attention-like mechanism for content-adaptive weighting
52 |         self.query_conv = nn.Conv2d(inp, inp // reduction, kernel_size=1)
53 |         self.key_conv = nn.Conv2d(inp, inp // reduction, kernel_size=1)
54 |         self.value_conv = nn.Conv2d(inp, inp, kernel_size=1)
55 |         self.softmax = nn.Softmax(dim=-1)
56 | 
57 |     def forward(self, x):
58 |         identity = x
59 | 
60 |         # Compute scaled dot-product similarity attention
61 |         n, c, h, w = x.size()
62 |         query = self.query_conv(x)
63 |         key = self.key_conv(x)
64 |         value = self.value_conv(x)
65 | 
66 |         query = query.view(n, -1, h * w)
67 |         key = key.view(n, -1, h * w)
68 |         value = value.view(n, -1, h * w)
69 | 
70 |         attention = torch.bmm(query.permute(0, 2, 1), key)
71 |         attention = self.softmax(attention / (c ** 0.5))
72 |         out_attention = torch.bmm(value, attention).view(n, c, h, w)
73 | 
74 |         # Existing coordinate attention operations
75 |         x_h = self.pool_h(out_attention)
76 |         x_w = self.pool_w(out_attention).permute(0, 1, 3, 2)
77 | 
78 |         y = torch.cat([x_h, x_w], dim=2)
79 |         y = self.conv1(y)
80 |         y = self.bn1(y)
81 |         y = self.act(y)
82 | 
83 |         x_h, x_w = torch.split(y, [h, w], dim=2)
84 |         x_w = x_w.permute(0, 1, 3, 2)
85 | 
86 |         a_h = self.conv_h(x_h).sigmoid()
87 |         a_w = self.conv_w(x_w).sigmoid()
88 | 
89 |         out = identity * a_w * a_h
90 | 
91 |         return out
92 | 
93 | if __name__ == '__main__':
94 |     x = torch.randn(2, 64, 32, 32)
95 |     att = CoordAtt(inp=64, reduction=32)
96 |     out = att(x)
97 |     print("Input size:", x.shape)
98 |     print("Output size:", out.shape)
--------------------------------------------------------------------------------
/generation_idea_template/small_object_attention/code/temporal_attention_fusion.py:
--------------------------------------------------------------------------------
1 | """
2 | Integrate a temporal attention mechanism using LSTM or GRU layers into the SEAttention framework
3 | This involves processing sequences of input frames to generate a temporal attention map
4 | Combine the temporal map with the SEAttention channel output through element-wise multiplication
5 | Evaluate improvements in detection metrics such as precision, recall, and F1-score on small target detection tasks, comparing against the baseline SEAttention model
6 | 
7 | """
8 | 
9 | # Modified code
10 | import numpy as np
11 | import torch
12 | from torch import flatten, nn
13 | from torch.nn import init
14 | from torch.nn.modules.activation import ReLU
15 | from torch.nn.modules.batchnorm import BatchNorm2d
16 | from torch.nn import functional as F
17 | 
18 | class SEAttention(nn.Module):
19 | 
20 |     def __init__(self, channel=512, reduction=16):
21 |         super().__init__()
22 |         self.avg_pool = nn.AdaptiveAvgPool2d(1)
23 |         self.fc = nn.Sequential(
24 |             nn.Linear(channel, channel // reduction, bias=False),
25 |             nn.ReLU(inplace=True),
26 |             nn.Linear(channel // reduction, channel, bias=False),
27 |             nn.Sigmoid()
28 |         )
29 | 
30 |     def init_weights(self):
31 |         for m in self.modules():
32 |             if isinstance(m, nn.Conv2d):
33 |                 init.kaiming_normal_(m.weight, mode='fan_out')
34 |                 if m.bias is not None:
35 |                     init.constant_(m.bias, 0)
36 |             elif isinstance(m, nn.BatchNorm2d):
37 |                 init.constant_(m.weight, 1)
38 |                 init.constant_(m.bias, 0)
39 |             elif isinstance(m, nn.Linear):
40 |                 init.normal_(m.weight, std=0.001)
41 |                 if m.bias is not None:
42 |                     init.constant_(m.bias, 0)
43 | 
44 |     def forward(self, x):
45 |         b, c, _, _ = x.size()
46 |         y = self.avg_pool(x).view(b, c)
47 |         y = self.fc(y).view(b, c, 1, 1)
48 |         return x * y.expand_as(x)
49 | 
50 | class TemporalAttention(nn.Module):
51 | 
52 |     def __init__(self, channel=512, hidden_size=256, num_layers=1):
53 |         super().__init__()
54 |         self.gru = nn.GRU(input_size=channel, hidden_size=hidden_size,
55 |                           num_layers=num_layers, batch_first=True)
56 |         self.fc = nn.Linear(hidden_size, channel)
57 |         self.sigmoid = nn.Sigmoid()
58 | 
59 |     def forward(self, x):
60 |         b, t, c, h, w = x.size()  # input shape is (batch, time, channel, height, width)
61 |         x = x.mean(dim=(3, 4))  # pool spatial dims so each step matches the GRU input_size
62 |         _, h_n = self.gru(x)  # h_n is the last hidden state
63 |         y = self.fc(h_n[-1])  # take the last layer's hidden state
64 |         y = self.sigmoid(y).view(b, c, 1, 1)
65 |         return y
66 | 
67 | class SEAttentionWithTemporal(nn.Module):
68 | 
69 |     def __init__(self, channel=512, reduction=16, hidden_size=256, num_layers=1):
70 |         super().__init__()
71 |         self.se_attention = SEAttention(channel, reduction)
72 |         self.temporal_attention = TemporalAttention(channel, hidden_size, num_layers)
73 | 
74 |     def forward(self, x):
75 |         se_output = self.se_attention(x[:, -1])  # apply SEAttention on the last frame
76 |         temporal_map = self.temporal_attention(x)
77 |         return se_output * temporal_map.expand_as(se_output)
78 | 
79 | if __name__ == '__main__':
80 |     model = SEAttentionWithTemporal()
81 |     model.se_attention.init_weights()
82 |     input = torch.randn(1, 5, 512, 7, 7)  # example with 5-frame sequence
83 |     output = model(input)
84 |     print(output.shape)
--------------------------------------------------------------------------------
/generation_idea_template/small_object_attention/code/simplified_graph_attention.py:
--------------------------------------------------------------------------------
1 | """
2 | Extend SEAttention by incorporating a simplified Graph Neural Network (GNN) layer
3 | Treat feature maps as graphs with nodes representing spatial locations and edges encoding basic spatial relationships or proximity
4 | Implement a lightweight graph convolution technique to process these graphs, focusing on essential spatial dependencies
5 | Integrate this GNN layer after the channel attention stage
6 | Modify the forward function to include basic graph construction and processing
7 | Evaluate the model's performance on synthetic datasets by comparing detection accuracy and attention focus against baseline SEAttention, with emphasis on capturing spatial dependencies efficiently
8 | 
9 | """
10 | 
11 | # Modified code
12 | import numpy as np
13 | import torch
14 | from torch import flatten, nn
15 | from torch.nn import init
16 | from torch.nn.modules.activation import ReLU
17 | from torch.nn.modules.batchnorm import BatchNorm2d
18 | from torch.nn import functional as F
19 | from torch_geometric.nn import GCNConv  # Importing graph convolutional layer
20 | 
21 | class SEAttentionGNN(nn.Module):
22 | 
23 |     def __init__(self, channel=512, reduction=16):
24 |         super().__init__()
25 |         self.avg_pool = nn.AdaptiveAvgPool2d(1)
26 |         self.fc = nn.Sequential(
27 |             nn.Linear(channel, channel // reduction, bias=False),
28 |             nn.ReLU(inplace=True),
29 |             nn.Linear(channel // reduction, channel, bias=False),
30 |             nn.Sigmoid()
31 |         )
32 |         # Define a GCN layer for processing the graph
33 |         self.gcn = GCNConv(channel, channel)
34 | 
35 |     def init_weights(self):
36 |         for m in self.modules():
37 |             if isinstance(m, nn.Conv2d):
38 |                 init.kaiming_normal_(m.weight, mode='fan_out')
39 |                 if m.bias is not None:
40 |                     init.constant_(m.bias, 0)
41 |             elif isinstance(m, nn.BatchNorm2d):
42 |                 init.constant_(m.weight, 1)
43 |                 init.constant_(m.bias, 0)
44 |             elif isinstance(m, nn.Linear):
45 |                 init.normal_(m.weight, std=0.001)
46 |                 if m.bias is not None:
47 |                     init.constant_(m.bias, 0)
48 | 
49 |     def forward(self, x):
50 |         b, c, h, w = x.size()
51 | 
52 |         # Channel attention
53 |         y = self.avg_pool(x).view(b, c)
54 |         y = self.fc(y).view(b, c, 1, 1)
55 |         x = x * y.expand_as(x)
56 | 
57 |         # Convert feature maps to graph
58 |         x_flat = x.view(b, c, -1).permute(0, 2, 1)  # Reshape to (b, h*w, c)
59 | 
60 |         # Create adjacency structure using spatial proximity (4-connectivity)
61 |         edge_index = []
62 |         for i in range(h):
63 |             for j in range(w):
64 |                 index = i * w + j
65 |                 if i + 1 < h:  # Down
66 |                     edge_index.append([index, (i + 1) * w + j])
67 |                 if j + 1 < w:  # Right
68 |                     edge_index.append([index, i * w + (j + 1)])
69 | 
70 |         edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
71 | 
72 |         # Apply GCN layer
73 |         x_graph = []
74 |         for i in range(b):
75 |             x_graph.append(self.gcn(x_flat[i], edge_index))
76 | 
77 |         x_graph = torch.stack(x_graph).permute(0, 2, 1).reshape(b, c, h, w)  # reshape, since permute makes the tensor non-contiguous
78 | 
79 |         return x_graph
80 | 
81 | if __name__ == '__main__':
82 |     model = SEAttentionGNN()
83 |     model.init_weights()
84 |     input = torch.randn(1, 512, 7, 7)
85 |     output = model(input)
86 |     print(output.shape)
--------------------------------------------------------------------------------
/generation_idea_template/coordattention-gemini/code/sigmoid_weighted_interaction_group_conv.py:
--------------------------------------------------------------------------------
1 | """
2 | Modify the `CoordAtt` module
3 | After pooling height and width features, perform an element-wise multiplication of the pooled height and width features
4 | Concatenate the pooled height and width features
5 | Apply a learnable parameter followed by a sigmoid activation to the element-wise multiplied feature
6 | Perform a weighted sum of the sigmoid-activated element-wise multiplied feature and the concatenated features
7 | Apply a group convolution instead of 1x1 convolution in the `conv1` layer
8 | Use a small number of groups (e.g. 4 or 8)
9 | Modify the `__init__` function to include the group convolution layer and a learnable parameter
10 | Modify the `forward` function to implement the element-wise multiplication, concatenation, sigmoid activation of the learnable parameter, weighted sum, and the group convolution before the shared `conv1` layer
11 | The rest of the forward pass remains the same
12 | Compare the output with the baseline using the same test input and observe changes
13 | This involves element-wise multiplication, concatenation, learnable parameter with sigmoid, weighted sum, group conv, and modifying the forward pass
14 | 
15 | """
16 | 
17 | import torch
18 | import torch.nn as nn
19 | import torch.nn.functional as F
20 | 
21 | 
22 | class h_sigmoid(nn.Module):
23 |     def __init__(self, inplace=True):
24 |         super(h_sigmoid, self).__init__()
25 |         self.relu = nn.ReLU6(inplace=inplace)
26 | 
27 |     def forward(self, x):
28 |         return self.relu(x + 3) / 6
29 | 
30 | 
31 | class h_swish(nn.Module):
32 |     def __init__(self, inplace=True):
33 |         super(h_swish, self).__init__()
34 |         self.sigmoid = h_sigmoid(inplace=inplace)
35 | 
36 |     def forward(self, x):
37 |         return x * self.sigmoid(x)
38 | 
39 | 
40 | class CoordAtt(nn.Module):
41 |     def __init__(self, inp, reduction=32, groups=4):
42 |         super(CoordAtt, self).__init__()
43 |         self.pool_h = nn.AdaptiveAvgPool2d((None, 1))
44 |         self.pool_w = nn.AdaptiveAvgPool2d((1, None))
45 | 
46 |         mip = max(8, inp // reduction)
47 | 
48 |         self.conv1 = nn.Conv2d(inp, mip, kernel_size=1, stride=1, padding=0, groups=groups, bias=False)
49 |         self.bn1 = nn.BatchNorm2d(mip)
50 |         self.act = h_swish()
51 | 
52 |         self.conv_h = nn.Conv2d(mip, inp, kernel_size=1, stride=1, padding=0)
53 |         self.conv_w = nn.Conv2d(mip, inp, kernel_size=1, stride=1, padding=0)
54 | 
55 |         self.weight = nn.Parameter(torch.zeros(1, inp, 1, 1))
56 |         self.sigmoid = nn.Sigmoid()
57 | 
58 | 
59 |     def forward(self, x):
60 |         identity = x
61 | 
62 |         n, c, h, w = x.size()
63 |         x_h = self.pool_h(x)
64 |         x_w = self.pool_w(x).permute(0, 1, 3, 2)
65 | 
66 |         # Element-wise multiplication realized as a broadcast interaction: (n, c, h, 1) x (n, c, 1, w)
67 |         inter = x_h * x_w.permute(0, 1, 3, 2)
68 |         x_hw = torch.cat([inter.mean(dim=3, keepdim=True), inter.mean(dim=2, keepdim=True).permute(0, 1, 3, 2)], dim=2)  # pooled back to the (h + w) stripe layout
69 | 
70 |         # Concatenation
71 |         y = torch.cat([x_h, x_w], dim=2)
72 | 
73 |         # Sigmoid-weighted interaction
74 |         weight = self.sigmoid(self.weight)
75 |         x_hw = x_hw * weight
76 |         y = y + x_hw
77 | 
78 |         y = self.conv1(y)
79 |         y = self.bn1(y)
80 |         y = self.act(y)
81 | 
82 |         x_h, x_w = torch.split(y, [h, w], dim=2)
83 |         x_w = x_w.permute(0, 1, 3, 2)
84 | 
85 |         a_h = self.conv_h(x_h).sigmoid()
86 |         a_w = self.conv_w(x_w).sigmoid()
87 | 
88 |         out = identity * a_w * a_h
89 | 
90 |         return out
91 | 
92 | if __name__ == '__main__':
93 |     x = torch.randn(2, 64, 32, 32)
94 |     att = CoordAtt(inp=64, reduction=32)
95 |     out = att(x)
96 |     print("Input size:", x.shape)
97 |     print("Output size:", out.shape)
--------------------------------------------------------------------------------
/generation_idea_template/small_object_attention/code/geometric_transformation_attention.py:
--------------------------------------------------------------------------------
1 | """
2 | Extend SEAttention by adding a lightweight geometric transformation layer that applies controlled transformations (e.g., small rotations, translations) to the input feature map
3 | Integrate a transformation-aware attention mechanism that recalibrates feature maps based on invariant patterns across these transformations
4 | Modify the forward function to include these geometric transformations and subsequent attention recalibration
5 | Evaluate the model's effectiveness by comparing detection accuracy and visual focus of attention maps on synthetic datasets, particularly observing improvements in small target detection
6 | 
7 | """
8 | 
9 | # Modified code
10 | import numpy as np
11 | import torch
12 | from torch import flatten, nn
13 | from torch.nn import init
14 | from torch.nn.modules.activation import ReLU
15 | from torch.nn.modules.batchnorm import BatchNorm2d
16 | from torch.nn import functional as F
17 | import torchvision.transforms as T
18 | 
19 | class GeometricTransformLayer(nn.Module):
20 |     def __init__(self):
21 |         super().__init__()
22 |         self.transforms = T.Compose([
23 |             T.RandomAffine(degrees=5, translate=(0.05, 0.05))
24 |         ])
25 | 
26 |     def forward(self, x):
27 |         # Apply geometric transformation
28 |         return self.transforms(x)
29 | 
30 | class TransformationAwareAttention(nn.Module):
31 |     def __init__(self, channel):
32 |         super().__init__()
33 |         self.channel = channel
34 |         self.weight = nn.Parameter(torch.ones(channel, 1, 1))
35 | 
36 |     def forward(self, x, transformed_x):
37 |         # Recalibrate feature maps based on invariant patterns
38 |         attention_map = torch.sigmoid(self.weight)
39 |         return x * attention_map + transformed_x * (1 - attention_map)
40 | 
41 | class SEAttention(nn.Module):
42 |     def __init__(self, channel=512, reduction=16):
43 |         super().__init__()
44 |         self.avg_pool = nn.AdaptiveAvgPool2d(1)
45 |         self.fc = nn.Sequential(
46 |             nn.Linear(channel, channel // reduction, bias=False),
47 |             nn.ReLU(inplace=True),
48 |             nn.Linear(channel // reduction, channel, bias=False),
49 |             nn.Sigmoid()
50 |         )
51 |         self.geo_transform = GeometricTransformLayer()
52 |         self.trans_attention = TransformationAwareAttention(channel)
53 | 
54 |     def init_weights(self):
55 |         for m in self.modules():
56 |             if isinstance(m, nn.Conv2d):
57 |                 init.kaiming_normal_(m.weight, mode='fan_out')
58 |                 if m.bias is not None:
59 |                     init.constant_(m.bias, 0)
60 |             elif isinstance(m, nn.BatchNorm2d):
61 |                 init.constant_(m.weight, 1)
62 |                 init.constant_(m.bias, 0)
63 |             elif isinstance(m, nn.Linear):
64 |                 init.normal_(m.weight, std=0.001)
65 |                 if m.bias is not None:
66 |                     init.constant_(m.bias, 0)
67 | 
68 |     def forward(self, x):
69 |         b, c, _, _ = x.size()
70 |         # Apply geometric transformation
71 |         transformed_x = self.geo_transform(x)
72 |         # SE attention
73 |         y = self.avg_pool(x).view(b, c)
74 |         y = self.fc(y).view(b, c, 1, 1)
75 |         se_attention = x * y.expand_as(x)
76 |         # Transformation-aware attention
77 |         attention_output = self.trans_attention(se_attention, transformed_x)
78 |         return attention_output
79 | 
80 | if __name__ == '__main__':
81 |     model = SEAttention()
82 |     model.init_weights()
83 |     input = torch.randn(1, 512, 7, 7)
84 |     output = model(input)
85 |     print(output.shape)
--------------------------------------------------------------------------------
/generation_idea_template/coordattention/code/xxx-multi_scale_coordatt.py:
--------------------------------------------------------------------------------
1 | """
2 | Add parallel convolutional branches with kernel sizes 1x1, 3x3, and 5x5 to `CoordAtt` to capture multi-scale features
3 | Each branch should have its own convolutional layer followed by batch normalization and activation
4 | Concatenate the outputs of these branches before combining with the original coordinate attention features
5 | Modify the `forward` method to integrate these multi-scale features before applying attention weights
6 | Evaluate the enhancement in feature representation by testing on a small benchmark dataset and comparing the modified module's performance to the original, while also monitoring computational overhead
7 | 
8 | """
9 | 
10 | # Modified code
11 | import torch
12 | import torch.nn as nn
13 | import torch.nn.functional as F
14 | 
15 | 
16 | class h_sigmoid(nn.Module):
17 |     def __init__(self, inplace=True):
18 |         super(h_sigmoid, self).__init__()
19 |         self.relu = nn.ReLU6(inplace=inplace)
20 | 
21 |     def forward(self, x):
22 |         return self.relu(x + 3) / 6
23 | 
24 | 
25 | class h_swish(nn.Module):
26 |     def __init__(self, inplace=True):
27 |         super(h_swish, self).__init__()
28 |         self.sigmoid = h_sigmoid(inplace=inplace)
29 | 
30 |     def forward(self, x):
31 |         return x * self.sigmoid(x)
32 | 
33 | 
34 | class CoordAtt(nn.Module):
35 |     def __init__(self, inp, reduction=32):
36 |         super(CoordAtt, self).__init__()
37 |         self.pool_h = nn.AdaptiveAvgPool2d((None, 1))
38 |         self.pool_w = nn.AdaptiveAvgPool2d((1, None))
39 | 
40 |         mip = max(8, inp // reduction)
41 | 
42 |         # Original coordinate attention components
43 |         self.conv1 = nn.Conv2d(inp, mip, kernel_size=1, stride=1, padding=0)
44 |         self.bn1 = nn.BatchNorm2d(mip)
45 |         self.act = h_swish()
46 | 
47 |         self.conv_h = nn.Conv2d(mip, inp, kernel_size=1, stride=1, padding=0)
48 |         self.conv_w = nn.Conv2d(mip, inp, kernel_size=1, stride=1, padding=0)
49 | 
50 |         # Multi-scale feature branches
51 |         self.conv_1x1 = nn.Conv2d(inp, mip, kernel_size=1, stride=1, padding=0)
52 |         self.bn_1x1 = nn.BatchNorm2d(mip)
53 | 
54 |         self.conv_3x3 = nn.Conv2d(inp, mip, kernel_size=3, stride=1, padding=1)
55 |         self.bn_3x3 = nn.BatchNorm2d(mip)
56 | 
57 |         self.conv_5x5 = nn.Conv2d(inp, mip, kernel_size=5, stride=1, padding=2)
58 |         self.bn_5x5 = nn.BatchNorm2d(mip)
59 |         self.conv_fuse = nn.Conv2d(3 * mip, inp, kernel_size=1, stride=1, padding=0)  # fuse the branches back to inp channels
60 |     def forward(self, x):
61 |         identity = x
62 | 
63 |         n, c, h, w = x.size()
64 |         x_h = self.pool_h(x)
65 |         x_w = self.pool_w(x).permute(0, 1, 3, 2)
66 | 
67 |         # Coordinate attention path
68 |         y = torch.cat([x_h, x_w], dim=2)
69 |         y = self.conv1(y)
70 |         y = self.bn1(y)
71 |         y = self.act(y)
72 | 
73 |         x_h, x_w = torch.split(y, [h, w], dim=2)
74 |         x_w = x_w.permute(0, 1, 3, 2)
75 | 
76 |         a_h = self.conv_h(x_h).sigmoid()
77 |         a_w = self.conv_w(x_w).sigmoid()
78 | 
79 |         # Multi-scale feature branches
80 |         y_1x1 = self.bn_1x1(self.conv_1x1(x))
81 |         y_3x3 = self.bn_3x3(self.conv_3x3(x))
82 |         y_5x5 = self.bn_5x5(self.conv_5x5(x))
83 | 
84 |         multi_scale_features = torch.cat([y_1x1, y_3x3, y_5x5], dim=1)
85 |         multi_scale_features = self.act(multi_scale_features)
86 |         multi_scale_features = self.conv_fuse(multi_scale_features)  # project to inp channels so the residual sum is valid
87 |         # Combine multi-scale features with coordinate attention
88 |         out = identity * a_w * a_h + multi_scale_features
89 | 
90 |         return out
91 | 
92 | if __name__ == '__main__':
93 |     x = torch.randn(2, 64, 32, 32)
94 |     att = CoordAtt(inp=64, reduction=32)
95 |     out = att(x)
96 |     print("Input size:", x.shape)
97 |     print("Output size:", out.shape)
--------------------------------------------------------------------------------
/generation_idea_template/small_object_attention/code/dynamic_attention_selection.py:
--------------------------------------------------------------------------------
1 | """
2 | Implement a dynamic attention mechanism that selects between spatial and channel attentions based on input characteristics
3 | Develop a decision layer that analyzes input features and outputs a preference score for each attention type
4 | Modify the SEAttention class to incorporate spatial attention
5 | Use the decision layer to dynamically apply spatial or channel attention
6 | Evaluate the model's performance on small target detection tasks by analyzing precision, recall, and F1-score, comparing against the baseline SEAttention model and other enhanced models
7 | 
8 | """
9 | 
10 | # Modified code
11 | import numpy as np
12 | import torch
13 | from torch import flatten, nn
14 | from torch.nn import init
15 | from torch.nn.modules.activation import ReLU
16 | from torch.nn.modules.batchnorm import BatchNorm2d
17 | from torch.nn import functional as F
18 | 
19 | class SEAttention(nn.Module):
20 | 
21 |     def __init__(self, channel=512, reduction=16):
22 |         super().__init__()
23 |         self.channel = channel
24 |         self.reduction = reduction
25 | 
26 |         # Channel Attention Components
27 |         self.avg_pool = nn.AdaptiveAvgPool2d(1)
28 |         self.fc = nn.Sequential(
29 |             nn.Linear(channel, channel // reduction, bias=False),
30 |             nn.ReLU(inplace=True),
31 |             nn.Linear(channel // reduction, channel, bias=False),
32 |             nn.Sigmoid()
33 |         )
34 | 
35 |         # Spatial Attention Components
36 |         self.conv1 = nn.Conv2d(channel, channel // reduction, kernel_size=1)
37 |         self.conv2 = nn.Conv2d(channel // reduction, 1, kernel_size=1)
38 |         self.sigmoid = nn.Sigmoid()
39 | 
40 |         # Decision Layer
41 |         self.decision_layer = nn.Sequential(
42 |             nn.Linear(channel, 2),
43 |             nn.Softmax(dim=1)
44 |         )
45 | 
46 |     def init_weights(self):
47 |         for m in self.modules():
48 |             if isinstance(m, nn.Conv2d):
49 |                 init.kaiming_normal_(m.weight, mode='fan_out')
50 |                 if m.bias is not None:
51 |                     init.constant_(m.bias, 0)
52 |             elif isinstance(m, nn.BatchNorm2d):
53 |                 init.constant_(m.weight, 1)
54 |                 init.constant_(m.bias, 0)
55 |             elif isinstance(m, nn.Linear):
56 |                 init.normal_(m.weight, std=0.001)
57 |                 if m.bias is not None:
58 |                     init.constant_(m.bias, 0)
59 | 
60 |     def channel_attention(self, x, b, c):
61 |         y = self.avg_pool(x).view(b, c)
62 |         y = self.fc(y).view(b, c, 1, 1)
63 |         return x * y.expand_as(x)
64 | 
65 |     def spatial_attention(self, x, b, c, h, w):
66 |         y = self.conv1(x)
67 |         y = self.conv2(y)
68 |         y = self.sigmoid(y)
69 |         return x * y.expand_as(x)
70 | 
71 |     def forward(self, x):
72 |         b, c, h, w = x.size()
73 | 
74 |         # Decision layer based on input features
75 |         avg_features = self.avg_pool(x).view(b, c)
76 |         decision = self.decision_layer(avg_features)
77 | 
78 |         # Split decision into channel and spatial attention weights
79 |         channel_weight, spatial_weight = decision[:, 0], decision[:, 1]
80 | 
81 |         # Apply attention based on decision weights
82 |         channel_attended = self.channel_attention(x, b, c) * channel_weight.view(b, 1, 1, 1)
83 |         spatial_attended = self.spatial_attention(x, b, c, h, w) * spatial_weight.view(b, 1, 1, 1)
84 | 
85 |         return channel_attended + spatial_attended
86 | 
87 | if __name__ == '__main__':
88 |     model = SEAttention()
89 |     model.init_weights()
90 |     input = torch.randn(1, 512, 7, 7)
91 |     output = model(input)
92 |     print(output.shape)
--------------------------------------------------------------------------------
/generation_idea_template/coordattention-gemini/code/pre_pool_spatial_adaptive_channel_attention.py:
--------------------------------------------------------------------------------
1 | """
2 | Modify the `CoordAtt` module
3 | Introduce a spatial adaptive channel attention module before the pooling operations
4 | This module will consist of global average pooling and global max pooling
5 | A 1x1 convolution will be used to generate a spatial weight map for the max pooling output
6 | The weighted max pooling output is then added element-wise to the average pooling output
7 | This result is then passed through a 1x1 convolution, a ReLU activation, and a sigmoid activation
8 | The output of the channel attention will be used to modulate the input feature map before the height and width pooling
9 | Modify the `__init__` function to include the channel attention module and the 1x1 convolution for weight map generation
10 | Modify the `forward` function to implement the channel attention, the weighted sum of the pooling outputs using the spatial weight map, modulation of the input feature map, and then the rest of the operations
11 | Compare the output with the baseline using the same test input and observe the changes
12 | This involves adding global average pooling, global max pooling, a 1x1 conv for spatial weight map, a 1x1 conv, ReLU and sigmoid, and modifying the forward pass to apply the attention before pooling
13 | 
14 | """
15 | 
16 | import torch
17 | import torch.nn as nn
18 | import torch.nn.functional as F
19 | 
20 | 
21 | class h_sigmoid(nn.Module):
22 |     def __init__(self, inplace=True):
23 |         super(h_sigmoid, self).__init__()
24 |         self.relu = nn.ReLU6(inplace=inplace)
25 | 
26 |     def forward(self, x):
27 |         return self.relu(x + 3) / 6
28 | 
29 | 
30 | class h_swish(nn.Module):
31 |     def __init__(self, inplace=True):
32 |         super(h_swish, self).__init__()
33 |         self.sigmoid = h_sigmoid(inplace=inplace)
34 | 
35 |     def forward(self, x):
36 |         return x * self.sigmoid(x)
37 | 
38 | 
39 | class CoordAtt(nn.Module):
40 |     def __init__(self, inp, reduction=32):
41 |         super(CoordAtt, self).__init__()
42 |         self.avg_pool = nn.AdaptiveAvgPool2d(1)
43 |         self.max_pool = nn.AdaptiveMaxPool2d(1)
44 | 
45 |         mip = max(8, inp // reduction)
46 |         self.conv_reduce = nn.Conv2d(inp, mip, kernel_size=1, stride=1, padding=0)
47 | 
48 |         self.spatial_weight = nn.Conv2d(mip, 1, kernel_size=1, stride=1, padding=0)
49 | 
50 |         self.bn1 = nn.BatchNorm2d(mip)
51 |         self.act = h_swish()
52 | 
53 |         self.conv_h = nn.Conv2d(mip, inp, kernel_size=1, stride=1, padding=0)
54 |         self.conv_w = nn.Conv2d(mip, inp, kernel_size=1, stride=1, padding=0)
55 |         self.conv_expand = nn.Conv2d(mip, inp, kernel_size=1, stride=1, padding=0)  # lift the channel attention back to inp channels
56 |     def forward(self, x):
57 |         identity = x
58 |         n, c, h, w = x.size()
59 | 
60 |         x_reduced = self.conv_reduce(x)
61 |         avg_out = self.avg_pool(x_reduced)
62 |         max_out = self.max_pool(x_reduced)
63 | 
64 |         spatial_weight = self.spatial_weight(max_out).sigmoid()
65 | 
66 |         channel_att = avg_out + max_out * spatial_weight
67 | 
68 |         channel_att = self.bn1(channel_att)
69 |         channel_att = self.act(channel_att)
70 | 
71 |         x = x * self.conv_expand(channel_att).sigmoid()  # channel attention now matches the input's channel count
72 | 
73 |         x_h = nn.AdaptiveAvgPool2d((None, 1))(x)
74 |         x_w = nn.AdaptiveAvgPool2d((1, None))(x).permute(0, 1, 3, 2)
75 | 
76 | 
77 |         y = torch.cat([x_h, x_w], dim=2)
78 | 
79 |         y = self.bn1(self.conv_reduce(y))
80 |         y = self.act(y)
81 | 
82 | 
83 |         x_h, x_w = torch.split(y, [h, w], dim=2)
84 |         x_w = x_w.permute(0, 1, 3, 2)
85 | 
86 |         a_h = self.conv_h(x_h).sigmoid()
87 |         a_w = self.conv_w(x_w).sigmoid()
88 | 
89 |         out = identity * a_w * a_h
90 | 
91 |         return out
92 | 
93 | if __name__ == '__main__':
94 |     x = torch.randn(2, 64, 32, 32)
95 |     att = CoordAtt(inp=64, reduction=32)
96 |     out = att(x)
97 |     print("Input size:", x.shape)
98 |     print("Output size:", out.shape)
--------------------------------------------------------------------------------
/generation_idea_template/coordattention/code/xxx-sparse_coordatt.py:
--------------------------------------------------------------------------------
1 | """
2 | Integrate an L1 regularization term into the training process of the CoordAtt module to induce sparsity in the feature maps
3 | Modify the loss function to include this L1 penalty, encouraging sparsity in the output of the initial convolutional layers
4 | Evaluate the impact on feature representation quality and computational efficiency by testing on a small benchmark dataset
5 | Compare the results in terms of accuracy, feature discrimination, and computational overhead with the original CoordAtt and other variants
6 | 
7 | """
8 | 
9 | # Modified code
10 | import torch
11 | import torch.nn as nn
12 | import torch.nn.functional as F
13 | 
14 | class h_sigmoid(nn.Module):
15 |     def __init__(self, inplace=True):
16 |         super(h_sigmoid, self).__init__()
17 |         self.relu = nn.ReLU6(inplace=inplace)
18 | 
19 |     def forward(self, x):
20 |         return self.relu(x + 3) / 6
21 | 
22 | class h_swish(nn.Module):
23 |     def __init__(self, inplace=True):
24 |         super(h_swish, self).__init__()
25 |         self.sigmoid = h_sigmoid(inplace=inplace)
26 | 
27 |     def forward(self, x):
28 |         return x * self.sigmoid(x)
29 | 
30 | class CoordAtt(nn.Module):
31 |     def __init__(self, inp, reduction=32):
32 |         super(CoordAtt, self).__init__()
33 |         self.pool_h = nn.AdaptiveAvgPool2d((None, 1))
34 |         self.pool_w = nn.AdaptiveAvgPool2d((1, None))
35 | 
36 |         mip = max(8, inp // reduction)
37 | 
38 |         self.conv1 = nn.Conv2d(inp, mip, kernel_size=1, stride=1, padding=0)
39 |         self.bn1 = nn.BatchNorm2d(mip)
40 |         self.act = h_swish()
41 | 
42 |         self.conv_h = nn.Conv2d(mip, inp, kernel_size=1, stride=1, padding=0)
43 |         self.conv_w = nn.Conv2d(mip, inp, kernel_size=1, stride=1, padding=0)
44 | 
45 |     def forward(self, x):
46 |         identity = x
47 | 
48 |         n, c, h, w = x.size()
49 |         x_h = self.pool_h(x)
50 |         x_w = self.pool_w(x).permute(0, 1, 3, 2)
51 | 
52 |         y = torch.cat([x_h, x_w], dim=2)
53 |         y = self.conv1(y)
54 |         y = self.bn1(y)
55 |         y = self.act(y)
56 | 
57 |         x_h, x_w = torch.split(y, [h, w], dim=2)
58 |         x_w = x_w.permute(0, 1, 3, 2)
59 | 
60 |         a_h = self.conv_h(x_h).sigmoid()
61 |         a_w = self.conv_w(x_w).sigmoid()
62 | 
63 |         out = identity * a_w * a_h
64 | 
65 |         return out
66 | 
67 | def l1_regularization(model, lambda_l1):
68 |     l1_norm = sum(p.abs().sum() for p in model.parameters())
69 |     return lambda_l1 * l1_norm
70 | 
71 | # Example training loop
72 | def train(model, dataloader, criterion, optimizer, lambda_l1):
73 |     model.train()
74 |     total_loss = 0.0
75 |     for data, target in dataloader:
76 |         optimizer.zero_grad()
77 |         output = model(data)
78 |         loss = criterion(output, target)
79 | 
80 |         # Add L1 regularization penalty
81 |         l1_penalty = l1_regularization(model, lambda_l1)
82 |         loss += l1_penalty
83 | 
84 |         loss.backward()
85 |         optimizer.step()
86 | 
87 |         total_loss += loss.item()
88 |     return total_loss / len(dataloader)
89 | 
90 | if __name__ == '__main__':
91 |     x = torch.randn(2, 64, 32, 32)
92 |     att = CoordAtt(inp=64, reduction=32)
93 |     out = att(x)
94 |     print("Input size:", x.shape)
95 |     print("Output size:", out.shape)
96 | 
97 |     # Example of lambda_l1 for L1 regularization
98 |     lambda_l1 = 0.01
99 | 
100 |     # Mock dataloader, criterion, and optimizer for testing
101 |     dataloader = [(x, torch.randn(2, 64, 32, 32)) for _ in range(10)]
102 |     criterion = nn.MSELoss()
103 |     optimizer = torch.optim.SGD(att.parameters(), lr=0.01)
104 | 
105 |     # Run a single training epoch
106 |     avg_loss = train(att, dataloader, criterion, optimizer, lambda_l1)
107 |     print("Average training loss with L1 regularization:", avg_loss)
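108 | 
109 |     # Hedged variant sketch (an illustrative addition): the idea text targets sparsity of the
110 |     # feature maps themselves, so an activation-level L1 penalty on conv1's output is a
111 |     # closer match than the parameter-level penalty above.
112 |     stripes = torch.cat([att.pool_h(x), att.pool_w(x).permute(0, 1, 3, 2)], dim=2)
113 |     activation_l1 = lambda_l1 * att.act(att.bn1(att.conv1(stripes))).abs().mean()
114 |     print("Activation-level L1 penalty:", activation_l1.item())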
self.fc = nn.Sequential( 23 | nn.Linear(channel, channel // reduction, bias=False), 24 | nn.ReLU(inplace=True), 25 | nn.Linear(channel // reduction, channel, bias=False), 26 | nn.Sigmoid() 27 | ) 28 | 29 | def forward(self, x): 30 | b, c, _, _ = x.size() 31 | y = self.avg_pool(x).view(b, c) 32 | y = self.fc(y).view(b, c, 1, 1) 33 | return x * y.expand_as(x) 34 | 35 | class ColorChannelFusion(nn.Module): 36 | def __init__(self, channel=512, reduction=16): 37 | super().__init__() 38 | # NOTE: each split color plane has a single channel, so the per-color attention must be built with channel=1 (fixes a shape bug; channel=512 made the Linear layers crash on 1-channel inputs) 39 | self.red_attention = ChannelAttention(channel=1, reduction=1) 40 | self.green_attention = ChannelAttention(channel=1, reduction=1) 41 | self.blue_attention = ChannelAttention(channel=1, reduction=1) 42 | self.weighted_fusion = nn.Conv2d(3, channel, kernel_size=1, stride=1, padding=0, bias=False) # the fused map has 3 color channels 43 | 44 | def forward(self, x): 45 | # Assuming input x shape is (batch_size, 3, height, width) 46 | red, green, blue = torch.split(x, 1, dim=1) 47 | red = self.red_attention(red) 48 | green = self.green_attention(green) 49 | blue = self.blue_attention(blue) 50 | 51 | # Concatenate along channel dimension 52 | fused = torch.cat([red, green, blue], dim=1) 53 | 54 | # Apply weighted fusion 55 | fused_feature_map = self.weighted_fusion(fused) 56 | return fused_feature_map 57 | 58 | class EnhancedSEAttention(nn.Module): 59 | def __init__(self, channel=512, reduction=16): 60 | super().__init__() 61 | self.color_fusion = ColorChannelFusion(channel=channel, reduction=reduction) 62 | self.se_attention = ChannelAttention(channel=channel, reduction=reduction) 63 | 64 | def init_weights(self): 65 | for m in self.modules(): 66 | if isinstance(m, nn.Conv2d): 67 | init.kaiming_normal_(m.weight, mode='fan_out') 68 | if m.bias is not None: 69 | init.constant_(m.bias, 0) 70 | elif isinstance(m, nn.BatchNorm2d): 71 | init.constant_(m.weight, 1) 72 | init.constant_(m.bias, 0) 73 | elif isinstance(m, nn.Linear): 74 | init.normal_(m.weight, std=0.001) 75 | if m.bias is not None: 76 | init.constant_(m.bias, 0) 77 | 78 | def forward(self, x): 79 | fused_features = self.color_fusion(x) 80 | attention_output = self.se_attention(fused_features) 81 | return attention_output 82 | 83 | if __name__ == '__main__': 84 | model = EnhancedSEAttention() 85 | model.init_weights() 86 | input = torch.randn(1, 3, 7, 7) # Updated to expect 3 channels (R, G, B) 87 | output = model(input) 88 | print(output.shape) -------------------------------------------------------------------------------- /generation_idea_template/coordattention-gemini/code/pre_pool_conv_spatial_attention.py: -------------------------------------------------------------------------------- 1 | """ 2 | Modify the `CoordAtt` module 3 | Apply a 1x1 convolution to the input feature map before applying max and average pooling 4 | Concatenate the results of the max and average pooling along the channel dimension 5 | Apply a lightweight spatial attention module to the concatenated map which will have a 3x3 depthwise convolution, followed by a 1x1 convolution and finally a sigmoid activation 6 | This spatial attention module modulates the combined attention map before it is split into a_h and a_w 7 | Modify the `__init__` to include the 1x1 convolution before the pooling layers, and the lightweight spatial attention module 8 | Modify the `forward` to implement the new pooling and modulation scheme 9 | The output can be compared to the baseline using the same test input and observing the changes in output 10 | This involves modifying `__init__` to
incorporate pre-pooling conv, depthwise conv, 1x1 conv and sigmoid, and `forward` to implement the pooling and spatial attention 11 | 12 | """ 13 | 14 | import torch 15 | import torch.nn as nn 16 | import torch.nn.functional as F 17 | 18 | 19 | class h_sigmoid(nn.Module): 20 | def __init__(self, inplace=True): 21 | super(h_sigmoid, self).__init__() 22 | self.relu = nn.ReLU6(inplace=inplace) 23 | 24 | def forward(self, x): 25 | return self.relu(x + 3) / 6 26 | 27 | 28 | class h_swish(nn.Module): 29 | def __init__(self, inplace=True): 30 | super(h_swish, self).__init__() 31 | self.sigmoid = h_sigmoid(inplace=inplace) 32 | 33 | def forward(self, x): 34 | return x * self.sigmoid(x) 35 | 36 | 37 | class CoordAtt(nn.Module): 38 | def __init__(self, inp, reduction=32): 39 | super(CoordAtt, self).__init__() 40 | self.pre_conv = nn.Conv2d(inp, inp, kernel_size=1, stride=1, padding=0) # 1x1 conv before pooling 41 | self.pool_h = nn.AdaptiveMaxPool2d((None, 1)) 42 | self.pool_w = nn.AdaptiveAvgPool2d((1, None)) 43 | 44 | mip = max(8, inp // reduction) 45 | 46 | self.conv1 = nn.Conv2d(inp, mip, kernel_size=1, stride=1, padding=0) 47 | self.bn1 = nn.BatchNorm2d(mip) 48 | self.act = h_swish() 49 | 50 | self.conv_h = nn.Conv2d(mip, inp, kernel_size=1, stride=1, padding=0) 51 | self.conv_w = nn.Conv2d(mip, inp, kernel_size=1, stride=1, padding=0) 52 | 53 | # Spatial Attention Module 54 | self.spatial_conv1 = nn.Conv2d(2, 1, kernel_size=3, stride=1, padding=1, groups=1) # 3x3 conv (the idea calls for depthwise, but with only 2 input channels a standard conv is used) 55 | self.spatial_sigmoid = nn.Sigmoid() 56 | 57 | 58 | def forward(self, x): 59 | identity = x 60 | 61 | n, c, h, w = x.size() 62 | x = self.pre_conv(x) # Apply 1x1 convolution 63 | 64 | x_h = self.pool_h(x) 65 | x_w = self.pool_w(x).permute(0, 1, 3, 2) 66 | 67 | # Spatial attention: a 2-channel descriptor (channel-wise max and mean of x) matches spatial_conv1's two input channels (fixes a shape bug: the pooled h/w strips could not be concatenated along dim=1) 68 | attention_map = torch.cat([torch.max(x, dim=1, keepdim=True)[0], torch.mean(x, dim=1, keepdim=True)], dim=1) 69 | attention_map = self.spatial_conv1(attention_map) 70 | attention_map = self.spatial_sigmoid(attention_map) 71 | attention_map = F.interpolate(attention_map, size=(h, w), mode='bilinear', align_corners=False) 72 | 73 | y = torch.cat([x_h, x_w], dim=2) 74 | 75 | y = self.conv1(y) 76 | y = self.bn1(y) 77 | y = self.act(y) 78 | 79 | x_h, x_w = torch.split(y, [h, w], dim=2) 80 | x_w = x_w.permute(0, 1, 3, 2) 81 | 82 | a_h = self.conv_h(x_h) 83 | a_w = self.conv_w(x_w) 84 | 85 | a_h = F.interpolate(a_h, size=(h, w), mode='bilinear', align_corners=False).sigmoid() 86 | a_w = F.interpolate(a_w, size=(h, w), mode='bilinear', align_corners=False).sigmoid() 87 | 88 | 89 | out = identity * a_w * a_h * attention_map 90 | 91 | return out 92 | 93 | if __name__ == '__main__': 94 | x = torch.randn(2, 64, 32, 32) 95 | att = CoordAtt(inp=64, reduction=32) 96 | out = att(x) 97 | print("Input shape:", x.shape) 98 | print("Output shape:", out.shape) -------------------------------------------------------------------------------- /generation_idea_template/small_object_attention/code/uncertainty_guided_attention.py: -------------------------------------------------------------------------------- 1 | """ 2 | Integrate an uncertainty estimation module within the SEAttention framework 3 | Develop functions to compute uncertainty scores for different regions in the input feature map, using methods such as Monte Carlo Dropout or entropy-based measures 4 | Modify the SEAttention class to incorporate these uncertainty scores into the attention mechanism, adjusting attention weights based on uncertainty 5 | Evaluate the model's performance on small target detection tasks using metrics such as precision, recall, and
F1-score, while also analyzing the uncertainty estimation's impact on detection accuracy 6 | Compare results with the baseline SEAttention model and other enhanced models 7 | 8 | """ 9 | 10 | # Modified code 11 | 12 | import numpy as np 13 | import torch 14 | from torch import flatten, nn 15 | from torch.nn import init 16 | from torch.nn.modules.activation import ReLU 17 | from torch.nn.modules.batchnorm import BatchNorm2d 18 | from torch.nn import functional as F 19 | from torch.distributions import Categorical 20 | 21 | class MonteCarloDropout(nn.Module): 22 | def __init__(self, p=0.5): 23 | super(MonteCarloDropout, self).__init__() 24 | self.p = p 25 | 26 | def forward(self, x): 27 | return F.dropout(x, p=self.p, training=True) 28 | 29 | class UncertaintyEstimator(nn.Module): 30 | def __init__(self, channel, num_samples=10): 31 | super(UncertaintyEstimator, self).__init__() 32 | self.num_samples = num_samples 33 | self.dropout = MonteCarloDropout(p=0.5) 34 | self.conv = nn.Conv2d(channel, channel, kernel_size=1) 35 | 36 | def forward(self, x): 37 | # Use Monte Carlo sampling to estimate uncertainty 38 | predictions = torch.stack([self.conv(self.dropout(x)) for _ in range(self.num_samples)], dim=0) 39 | mean_prediction = torch.mean(predictions, dim=0) 40 | uncertainty = torch.var(predictions, dim=0).mean(dim=(2, 3), keepdim=True) # Calculate uncertainty as variance 41 | return mean_prediction, uncertainty 42 | 43 | class SEAttention(nn.Module): 44 | 45 | def __init__(self, channel=512, reduction=16): 46 | super().__init__() 47 | self.avg_pool = nn.AdaptiveAvgPool2d(1) 48 | self.fc = nn.Sequential( 49 | nn.Linear(channel, channel // reduction, bias=False), 50 | nn.ReLU(inplace=True), 51 | nn.Linear(channel // reduction, channel, bias=False), 52 | nn.Sigmoid() 53 | ) 54 | self.uncertainty_estimator = UncertaintyEstimator(channel) 55 | 56 | def init_weights(self): 57 | for m in self.modules(): 58 | if isinstance(m, nn.Conv2d): 59 | init.kaiming_normal_(m.weight, mode='fan_out') 60 | if m.bias is not None: 61 | init.constant_(m.bias, 0) 62 | elif isinstance(m, nn.BatchNorm2d): 63 | init.constant_(m.weight, 1) 64 | init.constant_(m.bias, 0) 65 | elif isinstance(m, nn.Linear): 66 | init.normal_(m.weight, std=0.001) 67 | if m.bias is not None: 68 | init.constant_(m.bias, 0) 69 | 70 | def forward(self, x): 71 | b, c, _, _ = x.size() 72 | y = self.avg_pool(x).view(b, c) 73 | y = self.fc(y).view(b, c, 1, 1) 74 | 75 | # Estimate uncertainty 76 | mean_prediction, uncertainty = self.uncertainty_estimator(x) 77 | 78 | # Integrate uncertainty into attention weights 79 | # Here, uncertainty is used to scale the attention weights, 80 | # with higher uncertainty leading to lower attention weights. 
81 | attention = x * y.expand_as(x) 82 | adjusted_attention = attention * (1 - uncertainty) 83 | 84 | return adjusted_attention 85 | 86 | if __name__ == '__main__': 87 | model = SEAttention() 88 | model.init_weights() 89 | input = torch.randn(1, 512, 7, 7) 90 | output = model(input) 91 | print(output.shape) -------------------------------------------------------------------------------- /generation_idea_template/coordattention/code/xxa-nonlocal_coordatt.py: -------------------------------------------------------------------------------- 1 | """ 2 | Introduce a lightweight non-local attention mechanism within the CoordAtt module 3 | Implement a simplified version of the non-local block that captures global context effectively 4 | Modify the forward method to first compute these non-local attention features and integrate them with the existing coordinate attention features before applying the final attention weights 5 | Evaluate performance on a small benchmark dataset, focusing on improvements in feature representation and capturing long-range dependencies 6 | Compare against the original CoordAtt and other variants to assess computational efficiency and accuracy improvements 7 | 8 | """ 9 | 10 | import torch 11 | import torch.nn as nn 12 | import torch.nn.functional as F 13 | 14 | 15 | class h_sigmoid(nn.Module): 16 | def __init__(self, inplace=True): 17 | super(h_sigmoid, self).__init__() 18 | self.relu = nn.ReLU6(inplace=inplace) 19 | 20 | def forward(self, x): 21 | return self.relu(x + 3) / 6 22 | 23 | 24 | class h_swish(nn.Module): 25 | def __init__(self, inplace=True): 26 | super(h_swish, self).__init__() 27 | self.sigmoid = h_sigmoid(inplace=inplace) 28 | 29 | def forward(self, x): 30 | return x * self.sigmoid(x) 31 | 32 | 33 | class SimplifiedNonLocalBlock(nn.Module): 34 | def __init__(self, in_channels): 35 | super(SimplifiedNonLocalBlock, self).__init__() 36 | self.theta = nn.Conv2d(in_channels, in_channels // 2, kernel_size=1) 37 | self.phi = nn.Conv2d(in_channels, in_channels // 2, kernel_size=1) 38 | self.g = nn.Conv2d(in_channels, in_channels // 2, kernel_size=1) 39 | self.out_conv = nn.Conv2d(in_channels // 2, in_channels, kernel_size=1) 40 | 41 | def forward(self, x): 42 | n, c, h, w = x.size() 43 | 44 | theta = self.theta(x).view(n, c // 2, -1) 45 | phi = self.phi(x).view(n, c // 2, -1) 46 | g = self.g(x).view(n, c // 2, -1) 47 | 48 | attention = torch.bmm(theta.permute(0, 2, 1), phi) 49 | attention = F.softmax(attention, dim=-1) 50 | 51 | out = torch.bmm(g, attention.permute(0, 2, 1)) 52 | out = out.view(n, c // 2, h, w) 53 | out = self.out_conv(out) 54 | 55 | return x + out 56 | 57 | 58 | class CoordAtt(nn.Module): 59 | def __init__(self, inp, reduction=32): 60 | super(CoordAtt, self).__init__() 61 | self.pool_h = nn.AdaptiveAvgPool2d((None, 1)) 62 | self.pool_w = nn.AdaptiveAvgPool2d((1, None)) 63 | 64 | mip = max(8, inp // reduction) 65 | 66 | self.conv1 = nn.Conv2d(inp, mip, kernel_size=1, stride=1, padding=0) 67 | self.bn1 = nn.BatchNorm2d(mip) 68 | self.act = h_swish() 69 | 70 | self.conv_h = nn.Conv2d(mip, inp, kernel_size=1, stride=1, padding=0) 71 | self.conv_w = nn.Conv2d(mip, inp, kernel_size=1, stride=1, padding=0) 72 | 73 | # Add non-local block 74 | self.non_local_block = SimplifiedNonLocalBlock(inp) 75 | 76 | def forward(self, x): 77 | identity = x 78 | 79 | # Compute non-local features 80 | non_local_features = self.non_local_block(x) 81 | 82 | n, c, h, w = x.size() 83 | x_h = self.pool_h(non_local_features) 84 | x_w = 
self.pool_w(non_local_features).permute(0, 1, 3, 2) 85 | 86 | y = torch.cat([x_h, x_w], dim=2) 87 | y = self.conv1(y) 88 | y = self.bn1(y) 89 | y = self.act(y) 90 | 91 | x_h, x_w = torch.split(y, [h, w], dim=2) 92 | x_w = x_w.permute(0, 1, 3, 2) 93 | 94 | a_h = self.conv_h(x_h).sigmoid() 95 | a_w = self.conv_w(x_w).sigmoid() 96 | 97 | out = identity * a_w * a_h 98 | 99 | return out 100 | 101 | if __name__ == '__main__': 102 | x = torch.randn(2, 64, 32, 32) 103 | att = CoordAtt(inp=64, reduction=32) 104 | out = att(x) 105 | print("Input shape:", x.shape) 106 | print("Output shape:", out.shape) -------------------------------------------------------------------------------- /generation_idea_template/coordattention-gemini/code/refined_modulated_cross_spatial_interaction.py: -------------------------------------------------------------------------------- 1 | """ 2 | Modify the `CoordAtt` module 3 | After pooling height and width features, add two 1x1 projection convolutions 4 | One for height and one for width 5 | Introduce a learnable parameter for each of the projected features, and multiply them by the corresponding projected features 6 | Modulate the pooled width feature using the modulated projected height feature via element-wise multiplication, and vice-versa 7 | Concatenate the modulated features along the channel dimension 8 | Feed the concatenated features to the shared `conv1` 9 | In the `__init__` function, add two 1x1 convolution layers for projection, and two learnable parameters 10 | In the `forward` function, implement the projection, learnable parameter multiplication, element-wise modulation, and concatenation before shared `conv1` 11 | The rest of the forward pass remains the same 12 | Compare output with the baseline using same test input, observe changes 13 | This involves adding two 1x1 convs, two learnable parameters, and modifying the forward pass 14 | 15 | """ 16 | 17 | import torch 18 | import torch.nn as nn 19 | import torch.nn.functional as F 20 | 21 | 22 | class h_sigmoid(nn.Module): 23 | def __init__(self, inplace=True): 24 | super(h_sigmoid, self).__init__() 25 | self.relu = nn.ReLU6(inplace=inplace) 26 | 27 | def forward(self, x): 28 | return self.relu(x + 3) / 6 29 | 30 | 31 | class h_swish(nn.Module): 32 | def __init__(self, inplace=True): 33 | super(h_swish, self).__init__() 34 | self.sigmoid = h_sigmoid(inplace=inplace) 35 | 36 | def forward(self, x): 37 | return x * self.sigmoid(x) 38 | 39 | 40 | class CoordAtt(nn.Module): 41 | def __init__(self, inp, reduction=32): 42 | super(CoordAtt, self).__init__() 43 | self.pool_h = nn.AdaptiveAvgPool2d((None, 1)) 44 | self.pool_w = nn.AdaptiveAvgPool2d((1, None)) 45 | 46 | mip = max(8, inp // reduction) 47 | 48 | # Projection Convolutions 49 | self.proj_h = nn.Conv2d(inp, mip, kernel_size=1, stride=1, padding=0) 50 | self.proj_w = nn.Conv2d(inp, mip, kernel_size=1, stride=1, padding=0) 51 | 52 | # Learnable Parameters 53 | self.alpha_h = nn.Parameter(torch.ones(1)) 54 | self.alpha_w = nn.Parameter(torch.ones(1)) 55 | 56 | self.conv1 = nn.Conv2d(mip * 2, mip, kernel_size=1, stride=1, padding=0) # Modified to accept concatenated feature 57 | self.bn1 = nn.BatchNorm2d(mip) 58 | self.act = h_swish() 59 | 60 | self.conv_h = nn.Conv2d(mip, inp, kernel_size=1, stride=1, padding=0) 61 | self.conv_w = nn.Conv2d(mip, inp, kernel_size=1, stride=1, padding=0) 62 | 63 | def forward(self, x): 64 | identity = x 65 | 66 | n, c, h, w = x.size() 67 | x_h = self.pool_h(x) 68 | x_w = self.pool_w(x).permute(0, 1, 3, 2) 69 | 70 | # Projection 71 |
proj_h = self.proj_h(x_h) 72 | proj_w = self.proj_w(x_w) 73 | 74 | # Learnable Parameter Modulation 75 | mod_h = proj_h * self.alpha_h 76 | mod_w = proj_w * self.alpha_w 77 | 78 | 79 | # Cross-modulate: each projection is scaled by the spatial mean of the other branch's modulated projection, one shape-consistent reading of the idea (the original element-wise products mixed incompatible shapes) 80 | mod_x_h = proj_h * mod_w.mean(dim=2, keepdim=True) 81 | mod_x_w = proj_w * mod_h.mean(dim=2, keepdim=True) 82 | 83 | 84 | # Concatenate modulated and raw projections along channels (conv1 expects mip * 2), then join the two branches along the spatial dimension 85 | y = torch.cat([torch.cat([mod_x_h, proj_h], dim=1), torch.cat([mod_x_w, proj_w], dim=1)], dim=2) 86 | 87 | # Shared conv1 88 | y = self.conv1(y) 89 | y = self.bn1(y) 90 | y = self.act(y) 91 | 92 | # Split along the height dimension 93 | x_h, x_w = torch.split(y, [h, w], dim=2) 94 | x_w = x_w.permute(0, 1, 3, 2) 95 | 96 | a_h = self.conv_h(x_h).sigmoid() 97 | a_w = self.conv_w(x_w).sigmoid() 98 | 99 | out = identity * a_w * a_h 100 | 101 | return out 102 | 103 | if __name__ == '__main__': 104 | x = torch.randn(2, 64, 32, 32) 105 | att = CoordAtt(inp=64, reduction=32) 106 | out = att(x) 107 | print("Input shape:", x.shape) 108 | print("Output shape:", out.shape) -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 110 | .pdm.toml 111 | .pdm-python 112 | .pdm-build/ 113 | 114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 115 | __pypackages__/ 116 | 117 | # Celery stuff 118 | celerybeat-schedule 119 | celerybeat.pid 120 | 121 | # SageMath parsed files 122 | *.sage.py 123 | 124 | # Environments 125 | .env 126 | .venv 127 | .venv_jax 128 | env/ 129 | venv/ 130 | ENV/ 131 | env.bak/ 132 | venv.bak/ 133 | 134 | # Spyder project settings 135 | .spyderproject 136 | .spyproject 137 | 138 | # Rope project settings 139 | .ropeproject 140 | 141 | # mkdocs documentation 142 | /site 143 | 144 | # mypy 145 | .mypy_cache/ 146 | .dmypy.json 147 | dmypy.json 148 | 149 | # Pyre type checker 150 | .pyre/ 151 | 152 | # pytype static type analyzer 153 | .pytype/ 154 | 155 | # Cython debug symbols 156 | cython_debug/ 157 | 158 | # PyCharm 159 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 160 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 161 | # and can be added to the global gitignore or merged into this file. For a more nuclear 162 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 163 | .idea/ 164 | .aider* 165 | *.DS_Store 166 | 167 | # Misc folders 168 | data/ 169 | *ckpt.pt 170 | *.zip 171 | ICLR2022-OpenReviewData/ 172 | templates/*/run_0/ 173 | templates/*/*.png 174 | -------------------------------------------------------------------------------- /generation_idea_template/coordattention-gemini/code/early_fusion_addition_projected.py: -------------------------------------------------------------------------------- 1 | """ 2 | Modify the `CoordAtt` module 3 | After pooling the height and width features, perform an element-wise addition of the pooled height and width features 4 | Apply a 1x1 convolution to the summed feature map, followed by a non-linearity (e 5 | g 6 | , ReLU) 7 | Then, apply *two separate* 1x1 convolutions to the fused feature map to generate projected height and width features, respectively 8 | Feed these projected height and width features to the respective `conv_h` and `conv_w` layers 9 | Modify the `__init__` to add the 1x1 convolution and activation for fusion, and *two additional 1x1 convolutions* for projections 10 | Modify the `forward` to implement the element-wise addition, 1x1 convolution, activation, two projection convolutions, and feeding to the subsequent convolution layers 11 | The rest of the forward pass remains unchanged 12 | Compare the output with the baseline using the same test input to observe changes 13 | This involves adding the 1x1 conv, non-linearity, two projection 1x1 convs, and modifying the `forward` pass to implement the fusion using addition and projected representations 14 | 15 | """ 16 | 17 | import torch 18 | import torch.nn as nn 19 | import torch.nn.functional as F 20 | 21 | 22 | class h_sigmoid(nn.Module): 23 | def __init__(self, inplace=True): 24 | super(h_sigmoid, self).__init__() 25 | self.relu = nn.ReLU6(inplace=inplace) 26 | 27 | def forward(self, x): 28 | return self.relu(x + 3) / 6 29 | 30 | 31 | class h_swish(nn.Module): 32 | def __init__(self, inplace=True): 33 | super(h_swish, self).__init__() 34 | self.sigmoid = h_sigmoid(inplace=inplace) 35 | 36 | def 
forward(self, x): 37 | return x * self.sigmoid(x) 38 | 39 | 40 | class CoordAtt(nn.Module): 41 | def __init__(self, inp, reduction=32): 42 | super(CoordAtt, self).__init__() 43 | self.pool_h = nn.AdaptiveAvgPool2d((None, 1)) 44 | self.pool_w = nn.AdaptiveAvgPool2d((1, None)) 45 | 46 | mip = max(8, inp // reduction) 47 | 48 | self.conv1 = nn.Conv2d(inp, mip, kernel_size=1, stride=1, padding=0) 49 | self.bn1 = nn.BatchNorm2d(mip) 50 | self.act = h_swish() 51 | 52 | # Fusion convolution and activation 53 | self.fusion_conv = nn.Conv2d(mip, mip, kernel_size=1, stride=1, padding=0) 54 | self.fusion_act = nn.ReLU() # Changed to ReLU 55 | 56 | # Projection convolutions 57 | self.proj_h = nn.Conv2d(mip, mip, kernel_size=1, stride=1, padding=0) 58 | self.proj_w = nn.Conv2d(mip, mip, kernel_size=1, stride=1, padding=0) 59 | 60 | 61 | self.conv_h = nn.Conv2d(mip, inp, kernel_size=1, stride=1, padding=0) 62 | self.conv_w = nn.Conv2d(mip, inp, kernel_size=1, stride=1, padding=0) 63 | 64 | def forward(self, x): 65 | identity = x 66 | 67 | n, c, h, w = x.size() 68 | x_h = self.pool_h(x) 69 | x_w = self.pool_w(x).permute(0, 1, 3, 2) 70 | 71 | y = torch.cat([x_h, x_w], dim=2) 72 | y = self.conv1(y) 73 | y = self.bn1(y) 74 | y = self.act(y) 75 | 76 | x_h, x_w = torch.split(y, [h, w], dim=2) 77 | x_w = x_w.permute(0, 1, 3, 2) 78 | 79 | # Element-wise addition of pooled features: (n, mip, h, 1) + (n, mip, 1, w) broadcasts to an h x w map 80 | fused_feature = x_h + x_w 81 | 82 | # 1x1 convolution and activation 83 | fused_feature = self.fusion_conv(fused_feature) 84 | fused_feature = self.fusion_act(fused_feature) 85 | 86 | # Projection convolutions 87 | proj_h_feature = self.proj_h(fused_feature) 88 | proj_w_feature = self.proj_w(fused_feature) 89 | 90 | a_h = self.conv_h(proj_h_feature).sigmoid() 91 | a_w = self.conv_w(proj_w_feature).sigmoid() 92 | 93 | out = identity * a_w * a_h 94 | 95 | return out 96 | 97 | if __name__ == '__main__': 98 | x = torch.randn(2, 64, 32, 32) 99 | att = CoordAtt(inp=64, reduction=32) 100 | out = att(x) 101 | print("Input shape:", x.shape) 102 | print("Output shape:", out.shape) -------------------------------------------------------------------------------- /generation_idea_template/coordattention-gemini/code/bottleneck_attention_modulation.py: -------------------------------------------------------------------------------- 1 | """ 2 | Modify the `CoordAtt` module 3 | After pooling height and width features, concatenate them along the channel dimension 4 | Apply a single 1x1 convolution to the concatenated feature map 5 | Then, apply a bottleneck layer consisting of a 1x1 convolution, a non-linearity (e 6 | g 7 | , ReLU), and another 1x1 convolution, followed by a sigmoid activation to produce an attention map 8 | Use this attention map to modulate the *original* pooled height and width features *separately* before concatenating them 9 | Finally, feed the modulated concatenated map to the shared `conv1` 10 | In the `__init__` function, add three 1x1 convolution layers, one for the initial concatenated feature transformation and two for the bottleneck attention map generation, and a non-linearity 11 | In the `forward` function, implement the concatenation, the initial transformation, the bottleneck attention map generation, the separate modulation of the pooled height and width features, and finally the concatenation before feeding to the shared `conv1` 12 | The rest of the forward pass remains the same 13 | Compare output with the baseline using same test input, observe changes 14 | 15 | """ 16 | 17 | import torch 18 | import torch.nn as nn 19 |
import torch.nn.functional as F 20 | 21 | 22 | class h_sigmoid(nn.Module): 23 | def __init__(self, inplace=True): 24 | super(h_sigmoid, self).__init__() 25 | self.relu = nn.ReLU6(inplace=inplace) 26 | 27 | def forward(self, x): 28 | return self.relu(x + 3) / 6 29 | 30 | 31 | class h_swish(nn.Module): 32 | def __init__(self, inplace=True): 33 | super(h_swish, self).__init__() 34 | self.sigmoid = h_sigmoid(inplace=inplace) 35 | 36 | def forward(self, x): 37 | return x * self.sigmoid(x) 38 | 39 | 40 | class CoordAtt(nn.Module): 41 | def __init__(self, inp, reduction=32): 42 | super(CoordAtt, self).__init__() 43 | self.pool_h = nn.AdaptiveAvgPool2d((None, 1)) 44 | self.pool_w = nn.AdaptiveAvgPool2d((1, None)) 45 | 46 | mip = max(8, inp // reduction) 47 | 48 | self.conv1 = nn.Conv2d(mip, mip, kernel_size=1, stride=1, padding=0) # operates on the mip-channel attended map (fixes a shape bug: the original inp in-channels never matched) 49 | self.bn1 = nn.BatchNorm2d(mip) 50 | self.act = h_swish() 51 | 52 | # Bottleneck attention layers 53 | self.conv_concat = nn.Conv2d(inp, mip, kernel_size=1, stride=1, padding=0) # For initial transformation; the pooled maps are joined along the spatial axis, so the input still has inp channels (fixes a shape bug) 54 | self.bottleneck_conv1 = nn.Conv2d(mip, mip // 2, kernel_size=1, stride=1, padding=0) 55 | self.relu = nn.ReLU() 56 | self.bottleneck_conv2 = nn.Conv2d(mip // 2, mip, kernel_size=1, stride=1, padding=0) 57 | 58 | self.conv_h = nn.Conv2d(mip, inp, kernel_size=1, stride=1, padding=0) 59 | self.conv_w = nn.Conv2d(mip, inp, kernel_size=1, stride=1, padding=0) 60 | 61 | def forward(self, x): 62 | identity = x 63 | 64 | n, c, h, w = x.size() 65 | x_h = self.pool_h(x) 66 | x_w = self.pool_w(x).permute(0, 1, 3, 2) 67 | 68 | # Concatenate pooled features 69 | y = torch.cat([x_h, x_w], dim=2) 70 | 71 | # Initial transformation 72 | y_concat = self.conv_concat(y) 73 | 74 | # Bottleneck attention map 75 | attn = self.bottleneck_conv1(y_concat) 76 | attn = self.relu(attn) 77 | attn = self.bottleneck_conv2(attn).sigmoid() 78 | 79 | # Apply bottleneck attention 80 | y_attn = y_concat * attn 81 | 82 | # Shared conv layer 83 | y = self.conv1(y_attn) 84 | y = self.bn1(y) 85 | y = self.act(y) 86 | 87 | 88 | x_h, x_w = torch.split(y, [h, w], dim=2) 89 | x_w = x_w.permute(0, 1, 3, 2) 90 | 91 | a_h = self.conv_h(x_h).sigmoid() 92 | a_w = self.conv_w(x_w).sigmoid() 93 | 94 | out = identity * a_w * a_h 95 | 96 | return out 97 | 98 | if __name__ == '__main__': 99 | x = torch.randn(2, 64, 32, 32) 100 | att = CoordAtt(inp=64, reduction=32) 101 | out = att(x) 102 | print("Input shape:", x.shape) 103 | print("Output shape:", out.shape) -------------------------------------------------------------------------------- /generation_idea_template/coordattention/code/xxa-deformable_pooling_coordatt.py: -------------------------------------------------------------------------------- 1 | """ 2 | Replace the `nn 3 | AdaptiveAvgPool2d` operations in the CoordAtt module with a custom deformable pooling layer 4 | Implement learnable offsets for pooling regions that adjust based on input features, allowing the pooling operation to capture more complex spatial hierarchies 5 | Ensure the deformable pooling layer is lightweight to maintain computational efficiency 6 | Evaluate the modified CoordAtt's performance on a small benchmark dataset, comparing improvements in feature representation quality and accuracy against the original implementation 7 | 8 | """ 9 | 10 | # Modified code 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | 15 | class h_sigmoid(nn.Module): 16 | def __init__(self, inplace=True): 17 | super(h_sigmoid, self).__init__() 18 | self.relu =
nn.ReLU6(inplace=inplace) 19 | 20 | def forward(self, x): 21 | return self.relu(x + 3) / 6 22 | 23 | 24 | class h_swish(nn.Module): 25 | def __init__(self, inplace=True): 26 | super(h_swish, self).__init__() 27 | self.sigmoid = h_sigmoid(inplace=inplace) 28 | 29 | def forward(self, x): 30 | return x * self.sigmoid(x) 31 | 32 | 33 | class DeformablePooling(nn.Module): 34 | def __init__(self, channels, kernel_size=1): 35 | super(DeformablePooling, self).__init__() 36 | self.offset_conv = nn.Conv2d(channels, 2, kernel_size=3, padding=1) 37 | self.kernel_size = kernel_size 38 | 39 | def forward(self, x): 40 | n, c, h, w = x.size() 41 | # Compute offsets 42 | offsets = self.offset_conv(x) 43 | # Create a normalized grid 44 | grid = self.create_grid(h, w, device=x.device) 45 | # Add offsets to the grid 46 | grid = grid + offsets.permute(0, 2, 3, 1) 47 | # Clamp grid values to ensure they are within valid range 48 | grid = torch.clamp(grid, -1, 1) 49 | # Sample with the offset grid (note: grid_sample keeps the full H x W resolution, so this layer deforms rather than truly pools) 50 | sampled = F.grid_sample(x, grid, mode='bilinear', padding_mode='zeros', align_corners=True) 51 | return sampled 52 | 53 | def create_grid(self, height, width, device): 54 | # Create a grid for sampling 55 | theta = torch.tensor([[[1, 0, 0], [0, 1, 0]]], dtype=torch.float, device=device) 56 | grid = F.affine_grid(theta, (1, 1, height, width), align_corners=True) 57 | return grid.repeat(1, 1, 1, 1) 58 | 59 | 60 | class CoordAtt(nn.Module): 61 | def __init__(self, inp, reduction=32): 62 | super(CoordAtt, self).__init__() 63 | # Initialize deformable pooling layers for height and width 64 | self.pool_h = DeformablePooling(inp) 65 | self.pool_w = DeformablePooling(inp) 66 | 67 | mip = max(8, inp // reduction) 68 | 69 | # Convolutional layers for processing pooled features 70 | self.conv1 = nn.Conv2d(inp, mip, kernel_size=1, stride=1, padding=0) 71 | self.bn1 = nn.BatchNorm2d(mip) 72 | self.act = h_swish() 73 | 74 | self.conv_h = nn.Conv2d(mip, inp, kernel_size=1, stride=1, padding=0) 75 | self.conv_w = nn.Conv2d(mip, inp, kernel_size=1, stride=1, padding=0) 76 | 77 | def forward(self, x): 78 | identity = x 79 | 80 | n, c, h, w = x.size() 81 | # Apply deformable pooling to both height and width 82 | x_h = self.pool_h(x) 83 | x_w = self.pool_w(x).permute(0, 1, 3, 2) 84 | 85 | # Concatenate pooled features and process through convolutions (note: with the full-resolution sampling above, the shapes only line up for square inputs such as the 32x32 test) 86 | y = torch.cat([x_h, x_w], dim=2) 87 | y = self.conv1(y) 88 | y = self.bn1(y) 89 | y = self.act(y) 90 | 91 | x_h, x_w = torch.split(y, [h, w], dim=2) 92 | x_w = x_w.permute(0, 1, 3, 2) 93 | 94 | # Generate attention weights 95 | a_h = self.conv_h(x_h).sigmoid() 96 | a_w = self.conv_w(x_w).sigmoid() 97 | 98 | # Apply attention to the identity 99 | out = identity * a_w * a_h 100 | 101 | return out 102 | 103 | if __name__ == '__main__': 104 | x = torch.randn(2, 64, 32, 32) 105 | att = CoordAtt(inp=64, reduction=32) 106 | out = att(x) 107 | print("Input shape:", x.shape) 108 | print("Output shape:", out.shape) -------------------------------------------------------------------------------- /generation_idea_template/coordattention-gemini/code/sigmoid_dynamic_weighted_fusion.py: -------------------------------------------------------------------------------- 1 | """ 2 | Modify the `CoordAtt` module 3 | After pooling height and width features, apply a *single 1x1 convolution* to each of the pooled feature maps *separately* 4 | This 1x1 conv will output *two channels* 5 | The first channel will represent the transformed feature, and the second channel will represent the *dynamic
weight* 6 | Apply a *sigmoid activation* to the dynamic weight channel 7 | Then, perform a weighted addition of the transformed height and width features using their respective sigmoid-activated dynamic weights 8 | Feed the result into the shared `conv1` 9 | In the `__init__` function, add *two 1x1 convolution layers*, one for each of height and width, each of which output *two channels* 10 | In the `forward` function, implement the separate convolutions, the separation of the two output channels into transformed feature and dynamic weight, the application of the sigmoid activation to the dynamic weight channel, the weighted addition using these sigmoid-activated dynamic weights, before passing the result to the shared `conv1` 11 | The rest of the forward pass remains the same 12 | Compare output with the baseline using the same test input and observe the changes 13 | This involves modifying `__init__` to include the 1x1 conv layers with two output channels, and `forward` to implement the channel separation, sigmoid activation and dynamic fusion 14 | 15 | """ 16 | 17 | import torch 18 | import torch.nn as nn 19 | import torch.nn.functional as F 20 | 21 | 22 | class h_sigmoid(nn.Module): 23 | def __init__(self, inplace=True): 24 | super(h_sigmoid, self).__init__() 25 | self.relu = nn.ReLU6(inplace=inplace) 26 | 27 | def forward(self, x): 28 | return self.relu(x + 3) / 6 29 | 30 | 31 | class h_swish(nn.Module): 32 | def __init__(self, inplace=True): 33 | super(h_swish, self).__init__() 34 | self.sigmoid = h_sigmoid(inplace=inplace) 35 | 36 | def forward(self, x): 37 | return x * self.sigmoid(x) 38 | 39 | 40 | class CoordAtt(nn.Module): 41 | def __init__(self, inp, reduction=32): 42 | super(CoordAtt, self).__init__() 43 | self.pool_h = nn.AdaptiveAvgPool2d((None, 1)) 44 | self.pool_w = nn.AdaptiveAvgPool2d((1, None)) 45 | 46 | mip = max(8, inp // reduction) 47 | 48 | self.conv1 = nn.Conv2d(inp, inp, kernel_size=1, stride=1, padding=0) # keeps inp channels so the fused map can gate the identity (fixes a channel-mismatch bug: a mip-channel output could not multiply the inp-channel input) 49 | self.bn1 = nn.BatchNorm2d(inp) 50 | self.act = h_swish() 51 | 52 | # Add two 1x1 conv layers, one for h and one for w, each outputting 2 channels 53 | self.conv_h_sep = nn.Conv2d(inp, 2, kernel_size=1, stride=1, padding=0) 54 | self.conv_w_sep = nn.Conv2d(inp, 2, kernel_size=1, stride=1, padding=0) 55 | self.conv_h_expand = nn.Conv2d(1, inp, kernel_size=1, stride=1, padding=0) 56 | self.conv_w_expand = nn.Conv2d(1, inp, kernel_size=1, stride=1, padding=0) 57 | 58 | 59 | def forward(self, x): 60 | identity = x 61 | 62 | n, c, h, w = x.size() 63 | x_h = self.pool_h(x) 64 | x_w = self.pool_w(x).permute(0, 1, 3, 2) 65 | 66 | # Apply separate 1x1 convs to pooled features 67 | x_h_sep = self.conv_h_sep(x_h) 68 | x_w_sep = self.conv_w_sep(x_w) 69 | 70 | # Separate transformed feature and dynamic weight 71 | x_h_trans, x_h_weight = torch.split(x_h_sep, [1, 1], dim=1) 72 | x_w_trans, x_w_weight = torch.split(x_w_sep, [1, 1], dim=1) 73 | 74 | 75 | # Apply sigmoid to dynamic weights 76 | a_h = x_h_weight.sigmoid() 77 | a_w = x_w_weight.sigmoid() 78 | 79 | # Expand channels before weighted addition 80 | x_h_trans = self.conv_h_expand(x_h_trans) 81 | x_w_trans = self.conv_w_expand(x_w_trans) 82 | 83 | # Weighted addition of transformed features; the width branch is weighted before the permute so the shapes broadcast to (n, inp, h, w) (fixes a shape bug) 84 | y = x_h_trans * a_h + (x_w_trans * a_w).permute(0, 1, 3, 2) 85 | 86 | 87 | y = self.conv1(y) 88 | y = self.bn1(y) 89 | y = self.act(y) 90 | 91 | 92 | out = identity * y 93 | 94 | return out 95 | 96 | if __name__ == '__main__': 97 | x = torch.randn(2, 64, 32, 32) 98 | att = CoordAtt(inp=64, reduction=32) 99 | out = att(x) 100 |
print("输入尺寸:", x.shape) 101 | print("输出尺寸:", out.shape) -------------------------------------------------------------------------------- /generation_idea_template/small_object_attention/code/contrastive_learning_integration.py: -------------------------------------------------------------------------------- 1 | """ 2 | Integrate a contrastive learning module into the SEAttention framework 3 | Develop functions to generate contrastive pairs from input data, either through augmentations or synthetic data creation 4 | Ensure these pairs highlight small target presence or absence 5 | Modify the training loop to include a contrastive loss alongside the standard detection loss 6 | Evaluate the performance improvements using metrics such as precision, recall, and F1-score on small target detection tasks, comparing results with the baseline SEAttention model and other enhanced models 7 | Emphasize robustness in varied detection scenarios 8 | 9 | """ 10 | 11 | # Modified code 12 | import numpy as np 13 | import torch 14 | from torch import flatten, nn 15 | from torch.nn import init 16 | from torch.nn.modules.activation import ReLU 17 | from torch.nn.modules.batchnorm import BatchNorm2d 18 | from torch.nn import functional as F 19 | from torchvision import transforms 20 | 21 | class SEAttention(nn.Module): 22 | 23 | def __init__(self, channel=512, reduction=16): 24 | super().__init__() 25 | self.avg_pool = nn.AdaptiveAvgPool2d(1) 26 | self.fc = nn.Sequential( 27 | nn.Linear(channel, channel // reduction, bias=False), 28 | nn.ReLU(inplace=True), 29 | nn.Linear(channel // reduction, channel, bias=False), 30 | nn.Sigmoid() 31 | ) 32 | 33 | def init_weights(self): 34 | for m in self.modules(): 35 | if isinstance(m, nn.Conv2d): 36 | init.kaiming_normal_(m.weight, mode='fan_out') 37 | if m.bias is not None: 38 | init.constant_(m.bias, 0) 39 | elif isinstance(m, nn.BatchNorm2d): 40 | init.constant_(m.weight, 1) 41 | init.constant_(m.bias, 0) 42 | elif isinstance(m, nn.Linear): 43 | init.normal_(m.weight, std=0.001) 44 | if m.bias is not None: 45 | init.constant_(m.bias, 0) 46 | 47 | def forward(self, x): 48 | b, c, _, _ = x.size() 49 | y = self.avg_pool(x).view(b, c) 50 | y = self.fc(y).view(b, c, 1, 1) 51 | return x * y.expand_as(x) 52 | 53 | class ContrastiveLearningModule(nn.Module): 54 | 55 | def __init__(self, feature_dim): 56 | super().__init__() 57 | self.projector = nn.Sequential( 58 | nn.Linear(feature_dim, feature_dim, bias=False), 59 | nn.ReLU(inplace=True), 60 | nn.Linear(feature_dim, feature_dim, bias=False) 61 | ) 62 | 63 | def forward(self, x1, x2): 64 | z1 = self.projector(x1) 65 | z2 = self.projector(x2) 66 | return z1, z2 67 | 68 | def contrastive_loss(z1, z2, temperature=0.5, device='cpu'): 69 | z1 = F.normalize(z1, dim=1) 70 | z2 = F.normalize(z2, dim=1) 71 | batch_size = z1.size(0) 72 | labels = torch.arange(batch_size).to(device) 73 | similarity_matrix = torch.matmul(z1, z2.T) / temperature 74 | loss = F.cross_entropy(similarity_matrix, labels) 75 | return loss 76 | 77 | def generate_contrastive_pairs(input_data): 78 | # Applying random augmentations to generate pairs 79 | transform = transforms.Compose([ 80 | transforms.RandomResizedCrop(7), 81 | transforms.RandomHorizontalFlip(), 82 | transforms.RandomVerticalFlip() 83 | ]) 84 | augmented_data_1 = transform(input_data) 85 | augmented_data_2 = transform(input_data) 86 | return augmented_data_1, augmented_data_2 87 | 88 | if __name__ == '__main__': 89 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 90 | model = 
SEAttention().to(device) 91 | model.init_weights() 92 | contrastive_model = ContrastiveLearningModule(feature_dim=512).to(device) 93 | 94 | input_data = torch.randn(10, 512, 7, 7).to(device) # Example for a batch size of 10 95 | output = model(input_data) 96 | 97 | augmented_data_1, augmented_data_2 = generate_contrastive_pairs(input_data) 98 | # Embed each augmented view and pool to (batch, channel) so the projector's Linear(512, 512) receives 512-dim inputs (fixes a shape bug: the flattened 512*7*7 vector did not fit, and the two views were previously unused) 99 | feat1 = F.adaptive_avg_pool2d(model(augmented_data_1), 1).flatten(1) 100 | feat2 = F.adaptive_avg_pool2d(model(augmented_data_2), 1).flatten(1) 101 | z1, z2 = contrastive_model(feat1, feat2) 102 | cl_loss = contrastive_loss(z1, z2, device=device) 103 | 104 | print(f'Output shape: {output.shape}, Contrastive Loss: {cl_loss.item()}') --------------------------------------------------------------------------------
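The docstring of contrastive_learning_integration.py calls for a training loop that combines the standard detection loss with the contrastive loss, but the file stops at a single forward pass. Below is a minimal sketch of such a combined step, reusing SEAttention, ContrastiveLearningModule, contrastive_loss, and generate_contrastive_pairs from that file; the detection criterion and the lambda_contrastive weight are illustrative assumptions, not part of the repository.

import torch
import torch.nn.functional as F

def combined_training_step(model, contrastive_model, batch, targets, optimizer,
                           detection_criterion, lambda_contrastive=0.1, device='cpu'):
    # One optimization step: detection loss plus weighted contrastive loss
    model.train()
    contrastive_model.train()
    optimizer.zero_grad()

    # Detection branch
    output = model(batch)
    detection_loss = detection_criterion(output, targets)

    # Contrastive branch: two augmented views, pooled to (batch, channel)
    view1, view2 = generate_contrastive_pairs(batch)
    feat1 = F.adaptive_avg_pool2d(model(view1), 1).flatten(1)
    feat2 = F.adaptive_avg_pool2d(model(view2), 1).flatten(1)
    z1, z2 = contrastive_model(feat1, feat2)
    cl_loss = contrastive_loss(z1, z2, device=device)

    loss = detection_loss + lambda_contrastive * cl_loss
    loss.backward()
    optimizer.step()
    return loss.item()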