├── LICENSE ├── README.md ├── competition ├── 2020DCIC-创新大赛大数据赛道 │ ├── README.md │ └── Task1-EDA.ipynb ├── 2021阿里云供应链大赛——需求预测及单级库存优化 │ └── baseline.ipynb ├── 2023全球智能汽车AI挑战赛——赛道二:智能驾驶汽车虚拟仿真视频数据理解赛道 │ ├── README.md │ └── clip_demo (2).ipynb ├── 2024“大运河杯”数据开发应用创新大赛——城市治理_baseline.ipynb ├── 2024数字中国创新大赛DCIC │ └── 海上风电出力预测赛道 │ │ ├── README.md │ │ └── dcic_baseline.ipynb ├── AIWIN2021 │ ├── AIWIN-保险文本知识问答-baseline.ipynb │ ├── AIWIN-保险文本问答-submit.ipynb │ ├── AIWIN_互联网舆情企业风险-submit.ipynb │ ├── AIWIN_互联网舆情识别-NER.ipynb │ ├── AIWIN_互联网舆情识别-baseline.ipynb │ ├── README.md │ └── 致Great_互联网舆情企业风险事件的识别和预警.ipynb ├── AIWIN2023 │ ├── README.md │ ├── 中文网页自动导航_关键词_baseline.ipynb │ └── 研报类型识别baseline.ipynb ├── ATEC2022 │ ├── README.md │ └── task1-EDA-Model.ipynb ├── DC竞赛-AI助疫·口罩佩戴检测大赛 │ ├── 1_train.py │ ├── 2_predict.py │ └── README.md ├── DIGIX2021 │ └── README.md ├── DataFountain-CCFBDI-2021 │ ├── README.md │ └── 个贷违约预测-860.ipynb ├── DataFountain-三角形图计算算法设计及性能优化 │ └── README.md ├── DataFountain-乘用车细分市场销量预测 │ └── README.md ├── DataFountain-云计算时代的大数据查询分析优化 │ └── README.md ├── DataFountain-互联网新闻情感分析 │ ├── README.md │ └── bert_baseline.ipynb ├── DataFountain-互联网金融新实体发现 │ ├── README.md │ ├── bert-chinese-ner.zip │ └── bert_baseline.ipynb ├── DataFountain-企业网络资产及安全事件分析与可视化 │ └── README.md ├── DataFountain-企业非法集资风险预测 │ ├── 843 (1).ipynb │ └── README.md ├── DataFountain-基于OCR的身份证要素提取 │ └── README.md ├── DataFountain-多人种人脸识别 │ └── README.md ├── DataFountain-技术需求与技术成果项目之间关联度计算模型 │ ├── README.md │ └── bert_baseline.py ├── DataFountain-离散制造过程中典型工件的质量符合率预测 │ └── README.md ├── DataFountain-视频版权检测算法 │ ├── README.md │ └── ccf_video_baseline.ipynb ├── DataFountain-金融信息负面及主体判定 │ └── README.md ├── Kesci-中国华录杯人群密度检测 │ ├── README.md │ └── test.py ├── Tianchi-2020数字中国创新大赛—算法赛:智慧海洋建设 │ └── README.md ├── Tianchi-安泰杯跨境电商智能算法大赛 │ └── README.md ├── Tianchi-心电人机智能大赛心电异常事件预测 │ └── README.md ├── Tianchi-第三届阿里云安全算法挑战赛 │ ├── EDA.ipynb │ ├── GBM_old.ipynb │ ├── LGB_LinuX_0819.py │ ├── README.md │ ├── api.csv │ ├── finetune.ipynb │ └── gbm.py ├── TinyMind人民币面值&冠字号编码识别挑战赛 │ ├── .DS_Store │ ├── README.md │ ├── task1 │ │ ├── 1_train.ipynb │ │ ├── README.md │ │ └── predict_rmb.py │ └── task2 │ │ ├── .DS_Store │ │ ├── 1_train_faster_rcnn.py │ │ ├── 2_predict_faster_rcnn.py │ │ ├── 3_savejson.py │ │ ├── VOC2007.zip │ │ ├── crnn-pytorch │ │ ├── README.md │ │ ├── __init__.py │ │ ├── dataset │ │ │ ├── __init__.py │ │ │ ├── collate_fn.py │ │ │ ├── data_transform.py │ │ │ ├── test_data.py │ │ │ └── text_data.py │ │ ├── fold_tta.pkl │ │ ├── lr_policy.py │ │ ├── models │ │ │ ├── __init__.py │ │ │ ├── crnn.py │ │ │ └── model_loader.py │ │ ├── pb_rcnn_label.csv │ │ ├── submit.py │ │ ├── test.py │ │ ├── test2.py │ │ ├── test2_tta.py │ │ └── train.py │ │ ├── data │ │ └── data.json │ │ └── multi-digit-pytorch │ │ ├── .ipynb_checkpoints │ │ └── 未命名-checkpoint.ipynb │ │ ├── 1_train.py │ │ ├── 2_predict.py │ │ ├── example.log │ │ └── 未命名.ipynb ├── WSDM2022 │ └── README.md ├── biendata-智源&计算所-互联网虚假新闻检测挑战赛 │ ├── README.md │ └── task1_bert.ipynb ├── kaggle-allstate-claims-severity │ ├── README.md │ ├── XGB_encoding(LB1106.33084).py │ └── nn_bagging_1111.84364.py ├── kaggle-atecup-deepfake │ ├── README.md │ └── ffdi-resnet-baseline.ipynb ├── kaggle-quickdraw-doodle-recognition │ ├── 1_save2df.py │ ├── 2_train.py │ ├── EDA.ipynb │ ├── EDA_predict.ipynb │ ├── PlotLoss.ipynb │ ├── README.md │ └── Transform_Example.ipynb ├── kaggle-spaceship-titanic.ipynb ├── kaggle-two-sigma-connect-rental-listing-inquiries │ ├── README.md │ └── lgb.py ├── yanxishe-IMDB评论剧透检测 │ ├── 
README.md │ └── src │ │ └── ml.ipynb ├── yanxishe-人脸年龄识别 │ ├── 1_train.py │ ├── 1_train_pretrain.py │ ├── 2_predit.py │ ├── README.md │ └── 人脸年龄识别练习赛冠军源码_1575964312087.zip ├── yanxishe-喵脸关键点检测 │ ├── 1_train.py │ ├── 2_predict.py │ ├── README.md │ └── train_box.csv ├── yanxishe-白葡萄酒品质预测 │ ├── README.md │ ├── lgb_baseline.py │ └── winequality_dataset.zip ├── yanxishe-美食识别挑战(1):豆腐VS土豆 │ ├── 1_train.py │ ├── 2_predict.py │ └── README.md ├── yanxishe-肌肉活动电信号推测手势 │ ├── README.md │ └── lgb_baseline.ipynb ├── yanxishe-肺炎X光病灶识别 │ ├── 1_train.py │ ├── 2_predict.py │ └── README.md ├── yanxishe-胸腔X光肺炎检测 │ ├── 1_train.py │ ├── 2_predict.py │ └── README.md ├── 全球AI攻防挑战赛 │ ├── README.md │ ├── 全球AI攻防挑战赛—赛道一:大模型生图安全疫苗注入_baseline.ipynb │ └── 全球AI攻防挑战赛—赛道二:金融场景凭证篡改检测_baseline.ipynb ├── 点石-Retention Rate of Baidu Hao Kan APP Users │ ├── 1_splitdf.py │ ├── 2_baseline_1128.py │ ├── 2_baseline_1202.py │ ├── 2_baseline_1203_Train0.75989_Test0.75627.py │ ├── 2_baseline_1203_Train0.76103_Test0.75740.py │ ├── 2_baseline_1203_Train0.77218_Test0.76203.py │ ├── README.md │ └── featselect.py ├── 科大讯飞AI开发者大赛-事件抽取挑战 │ └── README.md ├── 科大讯飞AI开发者大赛-婴儿啼哭声识别挑战赛 │ ├── README.md │ └── cry_baseline.ipynb ├── 科大讯飞AI开发者大赛-温室温度预测挑战赛 │ ├── README.md │ └── baseline.py ├── 科大讯飞AI开发者大赛-脑PET图像分析和疾病预测挑战赛算法挑战大赛 │ ├── 1_train.py │ ├── 2_predict.py │ └── README.md ├── 科大讯飞AI开发者大赛2021 │ ├── 上海海事大学_蛋白质结构预测赛.ipynb │ ├── 中国农业大学_农作物生长情况识别挑战赛.ipynb │ ├── 中国农业大学_引导拍照挑战赛.ipynb │ ├── 中国科学技术大学_新冠肺炎声音诊断挑战赛.ipynb │ ├── 中文成语填空挑战赛 │ │ ├── README.md │ │ ├── baseline.py │ │ ├── gen_train_test.py │ │ └── run.sh │ ├── 中文问题相似度挑战赛 │ │ ├── README.md │ │ └── bert-nsp-xunfei.ipynb │ ├── 人脸关键点检测挑战赛 │ │ ├── README.md │ │ ├── face-keypoint2.ipynb │ │ ├── face-keypoint_kfold.ipynb │ │ └── face-keypoint_kfold_stronger.ipynb │ ├── 人脸情绪识别挑战赛 │ │ ├── README.md │ │ ├── keras_cnn_baseline.ipynb │ │ ├── pytorch_cnn_baseline-66.ipynb │ │ └── pytorch_cnn_baseline.ipynb │ ├── 北京林业大学_环境空气质量评价挑战赛.ipynb │ ├── 学术论文分类挑战赛 │ │ ├── README.md │ │ └── tfidf_baseline.ipynb │ ├── 安徽大学-脑部PETMR图像疾病预测挑战赛.ipynb │ ├── 广告图片素材分类_baseline.ipynb │ ├── 广告点击率预估挑战赛 │ │ ├── README.md │ │ └── 广告点击率预估挑战赛Baseline.ipynb │ ├── 智能硬件语音控制的时频图分类挑战赛.ipynb │ ├── 清华大学_智能硬件语音控制的时频图分类挑战赛.ipynb │ ├── 电商图像检索挑战赛 │ │ ├── README.md │ │ ├── cnn_arcface.ipynb │ │ ├── cnn_arcface_kfold.ipynb │ │ └── cnn_baseline.ipynb │ ├── 科大讯飞商店销量预测 │ │ ├── README.md │ │ └── lgb.py │ ├── 科大讯飞股份有限公司_基于用户画像的商品推荐挑战赛.ipynb │ ├── 科大讯飞股份有限公司_猪只盘点挑战赛.ipynb │ ├── 移动设备用户年龄和性别预测 │ │ ├── README.md │ │ └── 移动设备用户年龄和性别预测Baseline.ipynb │ ├── 蛋白质结构预测挑战赛.md │ └── 车辆贷款违约预测挑战赛 │ │ ├── Baseline.py │ │ └── README.md ├── 科大讯飞AI开发者大赛2022 │ ├── LED生产封装瑕疵识别挑战赛_baseline.ipynb │ ├── README.md │ ├── 中文对话文本匹配挑战赛_baseline.ipynb │ ├── 人员聚集识别挑战赛-baseline.ipynb │ ├── 作物引导拍照挑战赛_baseline.ipynb │ ├── 创意视角下的数字广告CTR预估-数据读取.ipynb │ ├── 创意视角下的数字广告CTR预估-模型搭建.ipynb │ ├── 商品销量智能预测挑战赛_baseline.ipynb │ ├── 国产平台动作识别挑战赛.ipynb │ ├── 基于论文摘要的文本分类与查询性问答-bert.ipynb │ ├── 基于论文摘要的文本分类与查询性问答-tfidf.ipynb │ ├── 房屋租赁价格预测挑战赛-baseline.ipynb │ ├── 智能家居使用场景识别挑战赛_baseline.ipynb │ ├── 智能硬件语音控制的时频图分类挑战赛2.0-baseline.ipynb │ ├── 机动车车牌识别挑战赛-baseline.ipynb │ ├── 汽车领域多语种迁移学习挑战赛-baseline-0.61.py │ ├── 电信客户流失预测_baseline.ipynb │ ├── 疫情微博情绪识别挑战赛-baseline.ipynb │ ├── 神经影像分析与疾病预测挑战赛_baseline.ipynb │ ├── 糖尿病遗传风险检测挑战赛-baseline.ipynb │ └── 非标准化疾病诉求的简单分诊挑战赛_baseline.ipynb ├── 科大讯飞AI开发者大赛2023 │ ├── 5G移动用户使用预测挑战赛_baseline.ipynb │ ├── AI量化模型预测挑战赛_baseline.ipynb │ ├── ChatGPT生成文本检测器_baseline.ipynb │ ├── README.md │ ├── Stable Diffusion鉴别器挑战赛_baseline.ipynb │ ├── 中文语义病句识别纠正_baseline.md │ ├── 
交通场景运输车辆外廓视觉测量挑战赛_baseline.ipynb │ ├── 人岗匹配挑战赛2023_baseline.md │ ├── 企业经营健康评估挑战赛_baseline.ipynb │ ├── 健康成人脑龄预测挑战赛_baseline.ipynb │ ├── 农作物朝向检测挑战赛_baseline.ipynb │ ├── 农机作业轨迹测算挑战赛_baseline.ipynb │ ├── 农民身份识别挑战赛_baseline.ipynb │ ├── 叶片病害识别挑战赛_baseline.ipynb │ ├── 图片文本块检测_baseline.ipynb │ ├── 基于可见光图像的柑橘花果梢语义分割挑战赛_baseline.ipynb │ ├── 基于图像识别算法的无人船障碍物检测挑战赛_baseline.ipynb │ ├── 基于用户画像的商品推荐挑战赛2.0_baseline.ipynb │ ├── 基于论文摘要的文本分类与关键词抽取挑战赛_baseline.ipynb │ ├── 基于近红外光谱的煤质参数预测挑战赛_baseline.ipynb │ ├── 多标签图像检索挑战赛_baseline.ipynb │ ├── 大视角差图像特征提取及匹配挑战赛.ipynb │ ├── 学术文档篇章级结构恢复挑战赛_baseline.ipynb │ ├── 学术文档要素分类挑战赛_baseline.ipynb │ ├── 工业场景下的服装生产力预测挑战赛_baseline.ipynb │ ├── 快速现场细胞学评价中的恶性细胞识别挑战赛_baseline.ipynb │ ├── 旋转机械故障诊断挑战赛_baseline.ipynb │ ├── 机器翻译质量评估挑战赛2023_baseline.ipynb │ ├── 标书实体抽取挑战赛_baseline.ipynb │ ├── 校招简历信息完整性检测挑战赛_baseline.ipynb │ ├── 校招简历应聘岗位与项目技能匹配检测挑战赛_baseline.ipynb │ ├── 水泵状态监测与故障诊断挑战赛_baseline.ipynb │ ├── 汽车保险索赔预测挑战赛_baseline.ipynb │ ├── 汽车领域文本规则泛化性增强挑战赛_baseline.ipynb │ ├── 用户新增预测挑战赛_baseline.ipynb │ ├── 社交账号网络分类挑战赛_baseline.ipynb │ ├── 移动广告营销场景下的人群召回算法挑战赛_baseline.ipynb │ ├── 空气质量指数预测挑战赛_baseline.ipynb │ ├── 糖尿病风险预测挑战赛_baseline.ipynb │ ├── 能源消耗预测挑战赛_baseline.ipynb │ ├── 脑PET图像分析和疾病预测挑战赛_baseline.ipynb │ ├── 自动驾驶疲劳检测挑战赛_baseline.ipynb │ ├── 苹果病害图像识别挑战赛_baseline.ipynb │ ├── 跨境电商效果广告ROI预测挑战赛_baseline.py │ ├── 通信系统调制格式识别与分类挑战赛_baseline.ipynb │ ├── 遥感图像倾斜舰船小目标检测挑战赛_baseline.ipynb │ ├── 酒店住宿价格预测挑战赛_baseline.ipynb │ ├── 锂离子电池生产参数调控及生产温度预测挑战赛_baseline.ipynb │ ├── 高分辨率遥感影像建筑物变化检测挑战赛_baseline.ipynb │ └── 鸟类品种识别挑战赛_baseline.ipynb ├── 科大讯飞AI开发者大赛2024 │ ├── README.md │ ├── 交通标识识别挑战赛_baseline.ipynb │ ├── 人岗匹配挑战赛赛季3_baseline.ipynb │ ├── 低资源文本翻译挑战赛_baseline.ipynb │ ├── 农业行人重识别挑战赛_baseline.ipynb │ ├── 分子性质AI预测挑战赛_baseline.ipynb │ ├── 基于无人机图像的农民劳作行为识别挑战赛_baseline.ipynb │ ├── 基于术语词典干预的机器翻译挑战赛_baseline.ipynb │ ├── 基于热力学定律的电池材料生产参数动态调控挑战赛_baseline.ipynb │ ├── 基于超声数据的多病种疾病预测挑战赛_baseline.ipynb │ ├── 大模型RAG智能问答挑战赛_baseline.ipynb │ ├── 大模型图像风格迁移挑战赛_label.ipynb │ ├── 大模型图文匹配识别挑战赛_baseline.ipynb │ ├── 大模型图表知识问答挑战赛_baseline.ipynb │ ├── 大模型能力评测中文成语释义与解析_baseline.ipynb │ ├── 心理健康辅助诊断挑战赛_0.92308.ipynb │ ├── 心理健康辅助诊断挑战赛_baseline.ipynb │ ├── 机器翻译质量评估挑战赛_baseline.ipynb │ ├── 濒危大型动物种类识别挑战赛_baseline.ipynb │ ├── 玉米雄穗识别挑战赛_baseline.ipynb │ ├── 电力需求预测挑战赛_baseline.ipynb │ ├── 短视频精准推荐挑战赛_baseline.ipynb │ ├── 网络安全入侵检测挑战赛_baseline.ipynb │ ├── 轻度认知障碍疾病预测挑战赛_baseline.ipynb │ ├── 问答意图聚类挑战赛_baseline.ipynb │ └── 高分辨率遥感识别检索挑战赛_baseline.ipynb ├── 第三届“马栏山杯”国际音视频算法大赛 │ ├── README.md │ ├── mgtv-用户下一个观看视频预测-390.ipynb │ └── mgtv-用户下一个观看视频预测-BERT.ipynb ├── 第四届工业大数据创新竞赛:算法赛道 │ ├── README.md │ └── 注塑成型赛道baseline.ipynb ├── 腾讯-2018腾讯广告算法大赛 │ └── README.md ├── 腾讯-2019腾讯广告算法大赛 │ └── README.md └── 阿里灵杰问天引擎电商搜索算法赛 │ ├── README.md │ ├── sentence-bert.ipynb │ └── 无监督baseline.ipynb ├── docs ├── .nojekyll ├── 2021-科大讯飞AI开发者大赛 │ └── README.md ├── README.md ├── _sidebar.md └── index.html └── tutorial ├── bert ├── README.md ├── bert-cls-example.ipynb ├── bert-mlm-example.ipynb ├── bert-ner-example.ipynb ├── bert-nsp-example.ipynb └── bert-qa-example.ipynb ├── jax └── README.md ├── paddlepaddle └── README.md ├── rank-ensemble.ipynb ├── sklearn └── README.md └── tree └── README.md /competition/2020DCIC-创新大赛大数据赛道/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /competition/2023全球智能汽车AI挑战赛——赛道二:智能驾驶汽车虚拟仿真视频数据理解赛道/README.md: -------------------------------------------------------------------------------- 1 | ## 
赛题:智能驾驶汽车虚拟仿真视频数据理解赛道 2 | 3 | 输入:元宇宙仿真平台生成的前视摄像头虚拟视频数据(8-10秒左右); 4 | 5 | 输出:对视频中的信息进行综合理解,以指定的json文件格式,按照数据说明中的关键词(key)填充描述型的文本信息(value,中文/英文均可以); 6 | 7 | 8 | https://tianchi.aliyun.com/competition/entrance/532155/information 9 | 10 | 11 | -------------------------------------------------------------------------------- /competition/2024数字中国创新大赛DCIC/海上风电出力预测赛道/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /competition/AIWIN2021/README.md: -------------------------------------------------------------------------------- 1 | # AIWIN 秋季竞赛 2 | 3 | 4 | ## 赛题1- 手写体 OCR 识别竞赛 5 | 6 | 手写体 OCR 识别竞赛由交通银行命题,设立两个任务,其中任务一由第四范式提供开放数据集,特别针对金额和日期做识别,任务二要求在指定训练环境完成不可下载训练集的训练,增加了银行机构的文本内容。任务一适合新手,并配套学习营和特别的学习奖励。 7 | 8 | 比赛地址:http://ailab.aiwin.org.cn/competitions/65 9 | 10 | baseline地址:https://aistudio.baidu.com/aistudio/projectdetail/2612313 11 | 12 | ## 赛题2- 心电图智能诊断竞赛 13 | 14 | 心电图智能诊断竞赛由数创医疗和复旦大学附属中山医院共同命题,设立两个任务,其中任务一诊断心电图的正常异常与否,任务二对10+种不同症状予以判断综合分类。任务一同步设有学习营和配套的学习奖励,欢迎新手参与。 15 | 16 | 比赛地址:http://ailab.aiwin.org.cn/competitions/64 17 | 18 | baseline地址:https://aistudio.baidu.com/aistudio/projectdetail/2653802 19 | 20 | # AIWIN 春季竞赛 21 | 22 | ## 赛题1—互联网舆情企业风险事件的识别和预警 23 | 24 | 参赛选手从给定的互联网信息中提取、识别出企业主体名称,以及标记风险标签(内容包含新闻标题、正文、及对应标签等)。 25 | 26 | 比赛地址:http://ailab.aiwin.org.cn/competitions/48 27 | 28 | 29 | ## 赛题2-保险文本视觉认知问答竞赛 30 | 31 | 利用OCR技术自动识别影像资料后,再通过AI智能判断所识别文字的内在逻辑,回答关于图片的自然语言问题。问题的答案是可以从图片中提取的任何文本/标记。 32 | 33 | 比赛地址:http://ailab.aiwin.org.cn/competitions/49 34 | 35 | 训练集+测试集OCR识别结果:http://datawhale-cdn.coggle.club/aiwin2021/ocr/ocr_result.zip 36 | 37 | 38 | ## 赛题3-文化传媒数字资产的自动编目 39 | 40 | 基于计算机视觉、NLP和语音识别等多模态技术,以新闻视频为类型,通过AI算法自动将完整新闻节目进行时序解构、添加语义标签、并进行内容归类。 41 | 42 | 比赛地址:http://ailab.aiwin.org.cn/competitions/51 43 | 44 | ## 赛题4-机器学习在债券定价中的应用 45 | 46 | 利用宏观数据、行情数据或者其它特色数据构建特征,进行机器学习建模,对中债10年期国债、中债10年期国开债、中债10年期AAA级地方政府债、中债10年期AAA级城投债以及中债10年期AAA级企业债到期收益率进行预测,预测给定的未来时间段(2021.5.6-2021.6.4期间,包含两端日期,共 23 个交易日)的系列十年期债券利率价格,并分析所用特征重要程度,给出相关逻辑解释。 47 | 48 | 比赛地址:http://ailab.aiwin.org.cn/competitions/52 49 | -------------------------------------------------------------------------------- /competition/AIWIN2023/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /competition/ATEC2022/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /competition/DC竞赛-AI助疫·口罩佩戴检测大赛/2_predict.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os, sys, glob, argparse 3 | import pandas as pd 4 | import numpy as np 5 | from tqdm import tqdm 6 | 7 | import time, datetime 8 | import pdb, traceback 9 | 10 | import cv2 11 | # import imagehash 12 | from PIL import Image 13 | 14 | from sklearn.model_selection import train_test_split, StratifiedKFold, KFold 15 | 16 | from efficientnet_pytorch import EfficientNet 17 | # model = EfficientNet.from_pretrained('efficientnet-b4') 18 | 19 | import torch 20 | torch.manual_seed(0) 21 | torch.backends.cudnn.deterministic = False 22 | torch.backends.cudnn.benchmark = True 23 | 24 | import torchvision.models as models 25 | import torchvision.transforms as transforms 26 | import torchvision.datasets as datasets 27 | import torch.nn as nn 28 | 
import torch.nn.functional as F 29 | import torch.optim as optim 30 | from torch.autograd import Variable 31 | from torch.utils.data.dataset import Dataset 32 | 33 | class QRDataset(Dataset): 34 | def __init__(self, img_path, transform=None): 35 | self.img_path = img_path 36 | if transform is not None: 37 | self.transform = transform 38 | else: 39 | self.transform = None 40 | 41 | def __getitem__(self, index): 42 | start_time = time.time() 43 | img = Image.open(self.img_path[index]).convert('RGB') 44 | 45 | if self.transform is not None: 46 | img = self.transform(img) 47 | return img,torch.from_numpy(np.array(int('PNEUMONIA' in self.img_path[index]))) 48 | 49 | def __len__(self): 50 | return len(self.img_path) 51 | 52 | class VisitNet(nn.Module): 53 | def __init__(self): 54 | super(VisitNet, self).__init__() 55 | 56 | # model = models.resnet18(True) 57 | # model.avgpool = nn.AdaptiveAvgPool2d(1) 58 | # model.fc = nn.Linear(512, 2) 59 | # self.resnet = model 60 | 61 | model = EfficientNet.from_pretrained('efficientnet-b4') 62 | model._fc = nn.Linear(1792, 2) 63 | self.resnet = model 64 | 65 | def forward(self, img): 66 | out = self.resnet(img) 67 | return out 68 | 69 | def predict(test_loader, model, tta=10): 70 | # switch to evaluate mode 71 | model.eval() 72 | 73 | test_pred_tta = None 74 | for _ in range(tta): 75 | test_pred = [] 76 | with torch.no_grad(): 77 | end = time.time() 78 | for i, (input, target) in enumerate(test_loader): 79 | input = input.cuda() 80 | target = target.cuda() 81 | 82 | # compute output 83 | output = model(input) 84 | output = output.data.cpu().numpy() 85 | 86 | test_pred.append(output) 87 | test_pred = np.vstack(test_pred) 88 | 89 | if test_pred_tta is None: 90 | test_pred_tta = test_pred 91 | else: 92 | test_pred_tta += test_pred 93 | 94 | return test_pred_tta 95 | 96 | test_jpg = ['game_gauzeMask_data/toPredict/{0}.jpg'.format(x) for x in range(0, 3802)] 97 | test_jpg = np.array(test_jpg) 98 | 99 | test_pred = None 100 | for model_path in ['resnet18_fold0.pt', 'resnet18_fold1.pt', 'resnet18_fold2.pt', 101 | 'resnet18_fold3.pt'][:]: 102 | 103 | test_loader = torch.utils.data.DataLoader( 104 | QRDataset(test_jpg, 105 | transforms.Compose([ 106 | transforms.Resize((256, 256)), 107 | transforms.RandomHorizontalFlip(), 108 | transforms.RandomVerticalFlip(), 109 | transforms.ToTensor(), 110 | transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) 111 | ]) 112 | ), batch_size=50, shuffle=False, num_workers=10, pin_memory=True 113 | ) 114 | 115 | print(model_path) 116 | model = VisitNet().cuda() 117 | model.load_state_dict(torch.load(model_path)) 118 | # model = nn.DataParallel(model).cuda() 119 | if test_pred is None: 120 | test_pred = predict(test_loader, model, 1) 121 | else: 122 | test_pred += predict(test_loader, model, 1) 123 | 124 | print(test_pred.shape) 125 | 126 | test_csv = pd.DataFrame() 127 | test_csv['ID'] = list(range(0, 3082)) 128 | test_csv['Label'] = np.argmax(test_pred, 1) 129 | test_csv['Label'] = test_csv['Label'].map({1:'pos', 0:'neg'}) 130 | test_csv.to_csv('tmp.csv', index=None) -------------------------------------------------------------------------------- /competition/DC竞赛-AI助疫·口罩佩戴检测大赛/README.md: -------------------------------------------------------------------------------- 1 | 比赛链接:https://www.dcjingsai.com/common/cmpt/AI%E5%8A%A9%E7%96%AB%C2%B7%E5%8F%A3%E7%BD%A9%E4%BD%A9%E6%88%B4%E6%A3%80%E6%B5%8B%E5%A4%A7%E8%B5%9B_%E7%AB%9E%E8%B5%9B%E4%BF%A1%E6%81%AF.html 2 | 3 | 比赛任务:戴口罩、不带口罩,二分类; 4 | 5 | 思路:直接CNN训练 6 | 7 | ``` 8 | python 
1_train.py 9 | python 2_predict.py 10 | ``` 11 | -------------------------------------------------------------------------------- /competition/DIGIX2021/README.md: -------------------------------------------------------------------------------- 1 | ## 2021 DIGIX全球校园AI算法精英大赛 2 | 3 | 报名链接(限在校生):https://developer.huawei.com/consumer/cn/activity/digixActivity/digixdetail/201621215957378831?ha_source=gb_sf&ha_sourceId=89000073 4 | 5 | - 赛题一:通过使用日志数据预测用户的留存周期; 6 | - 赛题二:通过给出的文章数据判别文章质量; 7 | - 赛题三:使用用户数据及历史行为数据建模完成视频推荐; 8 | - 赛题四:为基于多语言多模态的搜索排序任务; 9 | - 赛题五:为识别菜单图片中的文本信息。 10 | 11 | 报名时间:2021 年 8 月 31 日前均可报名 12 | 13 | 组队规则:每支队伍不超过 3 人,队长一人 14 | 15 | --- 16 | 17 | ### 赛题1:基于多目标多视图的用户留存周期预测 18 | - 简介 19 | 20 | 活跃留存周期预测通常使用单一视图做预测。在音乐领域,结合用户关注的音乐话题信息进行表征学习、结合歌曲信息进行音频、歌词、歌曲名、评论文本的多模态表征、结合歌手、用户、歌曲构建知识图谱,基于用户在APP侧信息构造行为链路向量化,辅助多日留存的多目标优化。在业界探索下一代机器学习模型与多模态向量化中有非常深远的价值。 21 | 22 | - 赛题说明 23 | 24 | 本题目基于脱敏和采样后的数据信息,保证数据安全。利用连续30天的用户行为日志,用户信息,歌曲信息,歌手信息,歌曲音频信息,预测未来30天内用户的留存情况,按未来一日、两日、三日、七日、十四日、三十日分段。 25 | 26 | - 评价指标:Area Under Curve (AUC) 27 | - baseline地址:https://gitee.com/coggle/DIGIX2021-BASELINE/tree/main/baseline-game1 28 | 29 | - 提交文件: 30 | 提交的结果需要包含三个文件: 31 | - submission.csv:模型预测的结果文件,格式和给出的标注文件一致 32 | - DIGIX Implementation Instruction.docx:一份 word 文档描述所使用的模型以及所使用的环境 33 | - Source Code.zip:所使用的源码打包文件 34 | 35 | --- 36 | 37 | ### 赛题2:基于多模型迁移预训练文章质量判别 38 | - 简介 39 | 40 | 文章质量判别是信息流领域的核心问题,提升文章质量判别的准确率是提升信息流质量和精准推送的核心技术点。在本次大赛中,主办方提供匿名化的文章质量数据,参赛选手基于给定的数据构建文章质量判别模型。希望通过本次大赛挖掘nlp算法领域的人才,推动 nlp算法的发展。 41 | 42 | - 赛题说明 43 | 44 | 本题目将为选手提供文章数据,参赛选手基于给定的数据构建文章质量判别模型。所提供的数据经过脱敏处理,保证数据安全。 45 | 46 | 基础数据集包含两部分:训练集和测试集。其中训练集给定了该样本的文章质量的相关标签;测试集用于计算参赛选手模型的评分指标,参赛选手需要计算出测试集中每个样本文章质量判断及优质文章的类型。 47 | 48 | - 评价指标:F1 Score 49 | - baseline地址:https://gitee.com/coggle/DIGIX2021-BASELINE/tree/main/baseline-game2 50 | - 赛题讲解视频:https://www.bilibili.com/video/BV1Rf4y157eo 51 | 52 | 53 | - 提交文件: 54 | 提交的结果需要包含三个文件: 55 | - submission.csv:模型预测的结果文件,格式和给出的标注文件一致 56 | - DIGIX Implementation Instruction.docx:一份 word 文档描述所使用的模型以及所使用的环境 57 | - Source Code.zip:所使用的源码打包文件 58 | 59 | --- 60 | 61 | ### 赛题3:基于多目标优化的视频推荐 62 | 63 | - 简介 64 | 65 | 推荐系统大多都是基于隐式反馈来做推荐,比如用户的点击、观看时长、评论、分享等,且不同隐式反馈表达了用户不同的喜好程度。如果仅仅以单目标对推荐结果进行衡量,会存在衡量不全面的问题。如视频场景,假设某个用户打开一个视频看了开头觉得不喜欢立马关掉,如果以点击为目标则体现的是用户感兴趣,但实际情况是用户对这个视频不感兴趣。从这个例子可以看出,在视频推荐中如果仅仅以点击为目标,可能忽视了用户更深层次的隐式反馈。因此,视频推荐除了关注用户点击,还需关注用户观看时长、分享等目标,期望通过多目标能更深入地挖掘用户兴趣,做更精准的推荐。 66 | 67 | - 赛题说明 68 | 69 | 本赛题提供14天数据用于训练,1天数据用于测试,数据包括用户特征,视频内容特征,以及用户历史行为数据,选手基于给出的数据,提供推荐策略,目标是预测每位用户观看视频时长所在区间,且预测是否对视频进行分享。所提供的数据经过脱敏处理,保证数据安全。 70 | 71 | - 评价指标:AUC加权和 72 | - baseline地址:https://gitee.com/coggle/DIGIX2021-BASELINE/tree/main/baseline-game3 73 | - 赛题讲解视频:https://www.bilibili.com/video/BV1gg411M7Bx 74 | 75 | - 提交文件: 76 | 提交的结果需要包含三个文件: 77 | - submission.csv:模型预测的结果文件,格式和给出的标注文件一致 78 | - DIGIX Implementation Instruction.docx:一份 word 文档描述所使用的模型以及所使用的环境 79 | - Source Code.zip:所使用的源码打包文件 80 | 81 | --- 82 | 83 | ### 赛题4:基于多模态多语言的搜索排序 84 | 85 | - 简介 86 | 87 | 搜索,是用户获取信息,找答案最方便快捷的方式。一次用户搜索会经历 Query 解析、召回、排序多个环节,排序作为最后整个过程一环,对用户的体验有最直接的影响。在多语言、多模态的场景下如何充分利用信息、更好的优化用户体验,是业界普遍在探索的难题,也是机器学习算法的明珠。 88 | 89 | - 赛题说明 90 | 91 | 本题目将为选手提供的搜索数据、公开爬取经过清理后的网页属性库,参赛选手基于给定的数据构建召回、排序模型。所提供的数据经过脱敏处理,保证数据安全。 92 | 93 | 基础数据集包含两部分:训练集和测试集。其中训练集为若干个Query下的网页排序;测试集选手需提交对提供Query的网页排序,用于计算与真实排序的HIT@K。 94 | 95 | 本题目标是:在给定搜索关键字(query)和候选网页(doc)集合下,通过对 query 和网页的 title\url 等进行相关性预测,给出这个 query 对应的网页排序结果。本题包含英语和土语两个语言的数据。在训练集中,我们分别提供了这两种语言的 query和与之对应的网页排序结果(约 100 个)。选手可以基于这个训练集数据进行模型训练。预测集,我们提供了待测试的 query,以及用于召回排序的候选网页集合。选手在这个集合上进行召回排序,并将结果提交到系统。 96 | 97 | 
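As a reference for the HIT@K metric listed in the bullet below, here is a minimal sketch of one common way to score it offline: for each query, check whether any known-relevant document appears in the model's top-K ranking, then average over queries. This is only an illustration — the organizers' exact scoring formula may differ, and the names used here (`hit_at_k`, `ranked`, `relevant`) are placeholders, not part of the provided baseline.

```python
from typing import Dict, List, Set

def hit_at_k(ranked: Dict[str, List[str]], relevant: Dict[str, Set[str]], k: int = 10) -> float:
    """Fraction of queries whose top-k candidates contain at least one relevant document."""
    hits = 0
    for qid, docs in ranked.items():
        if any(d in relevant.get(qid, set()) for d in docs[:k]):
            hits += 1
    return hits / max(len(ranked), 1)

# Toy example: q1 has a relevant doc inside its top-2, q2 does not -> HIT@2 = 0.5
ranked = {"q1": ["d3", "d1", "d9"], "q2": ["d7", "d2"]}
relevant = {"q1": {"d1"}, "q2": {"d5"}}
print(hit_at_k(ranked, relevant, k=2))
```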
- 评价指标:HIT@K 98 | - baseline地址:https://gitee.com/coggle/DIGIX2021-BASELINE/tree/main/baseline-game4 99 | 100 | 101 | - 提交文件: 102 | 提交的结果需要包含三个文件: 103 | - submission.csv:模型预测的结果文件,格式和给出的标注文件一致 104 | - DIGIX Implementation Instruction.docx:一份 word 文档描述所使用的模型以及所使用的环境 105 | - Source Code.zip:所使用的源码打包文件 106 | 107 | --- 108 | 109 | ### 赛题5:小样本菜单识别 110 | 111 | - 简介 112 | 113 | 图像文本识别在日常生活中有广泛的应用。在不同应用场景下,图像文本识别存在不同挑战。在菜单文字识别任务中,利用少量样本学习模型,同时解决多语言、艺术字等复杂场景下的问题,是提高识别准确率,提升用户体验的关键。希望通过本次比赛,挖掘计算机视觉方向人才,推动该领域发展。 114 | 115 | - 赛题说明 116 | 117 | 本赛题为选手提供菜单图片数据及其标注。训练集包括通用字符菜单图片及少量包含特殊字符菜单图片,选手使用训练数据进行模型训练。测试集分为A/B两个测试集。测试集仅提供菜单图片,选手使用模型预测菜单图片中文本主体的位置和内容。 118 | 119 | 检测并识别菜单中的文字,给出文字内容和坐标。我们提供了包括菜单图片及其对应的标注文件作为训练数据,标注以 JSON 文件形式给出。选手训练模型后在给定的测试集上预测测试菜单图片中的文字内容及坐标。本赛题使用 F1-score 作为最终效果的评价指标,综合考虑了字符级准确率 precision 和字符级召回率 recall。 120 | 121 | - 赛题数据 122 | 123 | 数据包括菜单图片及其对应的标注文件,标注以 JSON 文件形式给出,格式如下: 124 | ``` 125 | {"imagename.jpg":[{"label":"context","points":[[x1,y1],[x2,y2],…]},…],…} 126 | ``` 127 | 128 | 其中,“imagename.jpg”为图片名,“label”表示文本内容,“points”表示文本区域边缘坐标序列(文本可能是多边形)。标注文件中,“###”表示模糊字符或者其他语言字符,无需处理。 129 | 130 | - 评价指标:F1 Score 131 | - baseline地址:https://gitee.com/coggle/DIGIX2021-BASELINE/tree/main/baseline-game5 132 | - 赛题讲解视频:https://www.bilibili.com/video/BV14f4y1579M 133 | 134 | - 提交文件: 135 | 提交的结果需要包含三个文件: 136 | - label_special.json:模型预测的结果文件,格式和给出的标注文件一致 137 | - DIGIX Implementation Instruction.docx:一份 word 文档描述所使用的模型以及所使用的环境 138 | - Source Code.zip:所使用的源码打包文件 139 | -------------------------------------------------------------------------------- /competition/DataFountain-CCFBDI-2021/README.md: -------------------------------------------------------------------------------- 1 | ## 个贷违约预测 2 | 3 | - 赛题类型:结构化数据挖掘、金融风控 4 | 5 | https://www.datafountain.cn/competitions/530 6 | 7 | 本赛题要求利用已有的与目标客群稍有差异的另一批信贷数据,辅助目标业务风控模型的创建,两者数据集之间存在大量相同的字段和极少的共同用户。此处希望大家可以利用迁移学习捕捉不同业务中用户基本信息与违约行为之间的关联,帮助实现对新业务的用户违约预测。 8 | 9 | - baseline1:[阿水0.86单表思路](https://github.com/datawhalechina/competition-baseline/blob/master/competition/DataFountain-CCFBDI-2021/%E4%B8%AA%E8%B4%B7%E8%BF%9D%E7%BA%A6%E9%A2%84%E6%B5%8B-860.ipynb) 10 | - baseline2:[恒哥0.87多表思路](https://github.com/LogicJake/competition_baselines/tree/master/competitions/2021ccf_loan) 11 | 12 | ## 剧本角色情感识别 13 | 14 | - 赛题类型:NLP、情感分类 15 | 16 | https://www.datafountain.cn/competitions/518 17 | 18 | 本赛题提供一部分电影剧本作为训练集,训练集数据已由人工进行标注,参赛队伍需要对剧本场景中每句对白和动作描述中涉及到的每个角色的情感从多个维度进行分析和识别。该任务的主要难点和挑战包括:1)剧本的行文风格和通常的新闻类语料差别较大,更加口语化;2)剧本中角色情感不仅仅取决于当前的文本,对前文语义可能有深度依赖。 19 | 20 | - basline1:[恒哥 Bert 0.682](https://github.com/LogicJake/competition_baselines/tree/master/competitions/2021ccf_aqy) 21 | - basline2:[强哥 Bert多任务 0.67](https://github.com/China-ChallengeHub/ChallengeHub-Baselines/blob/main/aiqiyi-baseline.ipynb) 22 | 23 | ![](https://coggle.club/assets/img/coggle_qrcode.jpg) 24 | 25 | 26 | ## 用户上网异常行为分析 27 | 28 | - 赛题类型:结构化数据挖掘 29 | 30 | https://www.datafountain.cn/competitions/520 31 | 32 | 利用机器学习、深度学习,UEBA等人工智能方法,基于无标签的用户日常上网日志数据,构建用户上网行为基线和上网行为评价模型,依据上网行为与基线的距离确定偏离程度。 33 | - 通过用户日常上网数据构建行为基线; 34 | - 采用无监督学习模型,基于用户上网行为特征,构建上网行为评价模型,评价上网行为与基线的偏离程度。 35 | 36 | - baseline:[CquptDJ](https://blog.csdn.net/qq_44694861/article/details/120423658) 37 | 38 | 39 | ## 产品评论观点提取 40 | 41 | - 赛题类型:NLP、NER 42 | 43 | https://www.datafountain.cn/competitions/529 44 | 45 | 观点提取旨在从非结构化的评论文本中提取标准化、结构化的信息,如产品名、评论维度、评论观点等。此处希望大家能够通过自然语言处理的语义情感分析技术判断出一段银行产品评论文本的情感倾向,并能进一步通过语义分析和实体识别,标识出评论所讨论的产品名,评价指标和评价关键词。 46 | 47 | - 
baseline:[恒哥](https://github.com/LogicJake/competition_baselines/tree/master/competitions/2021ccf_ner) 48 | 49 | ## 基于飞桨实现花样滑冰选手骨骼点动作识别 50 | 51 | - 赛题类型:计算机视觉、姿态估计 52 | 53 | https://www.datafountain.cn/competitions/519/ 54 | 55 | 基于现实场景的应用需求以及图深度学习模型的发展,本次比赛旨在通过征集各队伍建立的高精度、细粒度、意义明确的动作识别模型,探索基于骨骼点的时空细粒度人体动作识别新方法。本次比赛将基于评价指标Accuracy对各队伍提交结果的评测成绩进行排名,Accuracy得分越高,则认为该模型的动作识别效果越好。 56 | 57 | - baseline:[飞浆](https://aistudio.baidu.com/aistudio/projectdetail/2417717) 58 | 59 | ## 千言-问题匹配鲁棒性评测 60 | 61 | - 赛题类型:NLP、文本匹配 62 | 63 | https://www.datafountain.cn/competitions/516/ 64 | 65 | 问题匹配(Question Matching)任务旨在判断两个自然问句之间的语义是否等价,是自然语言处理领域的一个重要研究方向。问题匹配同时也具有很高的商业价值,在信息检索、智能客服等领域发挥着重要作用。 66 | 67 | - baseline:[飞浆](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/examples/text_matching/question_matching) 68 | -------------------------------------------------------------------------------- /competition/DataFountain-三角形图计算算法设计及性能优化/README.md: -------------------------------------------------------------------------------- 1 | https://www.datafountain.cn/competitions/349 2 | 3 | 第二名开源:https://github.com/wang-zhq/TC_CUDA 4 | -------------------------------------------------------------------------------- /competition/DataFountain-乘用车细分市场销量预测/README.md: -------------------------------------------------------------------------------- 1 | https://www.datafountain.cn/competitions/352 2 | 3 | 冠军思路:https://zhuanlan.zhihu.com/p/98926322 4 | 冠军代码:https://github.com/cxq80803716/2019-CCF-BDCI-Car_sales 5 | -------------------------------------------------------------------------------- /competition/DataFountain-云计算时代的大数据查询分析优化/README.md: -------------------------------------------------------------------------------- 1 | https://www.datafountain.cn/competitions/347/ranking?sch=weekly 2 | 3 | 线上第二名:https://github.com/WenbinHou/CCF-BDCI-2019-Database 4 | -------------------------------------------------------------------------------- /competition/DataFountain-互联网新闻情感分析/README.md: -------------------------------------------------------------------------------- 1 | baseline思路: 2 | 3 | 直接bert梭哈,如果显存不够可以改maxlen和batachsize 4 | 5 | 也可以不同maxlen和batchsize的bert结果进行平均,会有收益。 6 | 7 | https://www.datafountain.cn/competitions/350 8 | 9 | 第一名:https://github.com/cxy229/BDCI2019-SENTIMENT-CLASSIFICATION 10 | -------------------------------------------------------------------------------- /competition/DataFountain-互联网金融新实体发现/README.md: -------------------------------------------------------------------------------- 1 | baseline思路: 2 | 3 | 参考https://github.com/ProHiryu/bert-chinese-ner 4 | 5 | 训练和预测可以修改如下参数: 6 | 7 | ``` 8 | flags.DEFINE_bool( 9 | "do_train", True, 10 | "Whether to run training." 
11 | ) 12 | flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") 13 | 14 | flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.") 15 | 16 | flags.DEFINE_bool("do_predict", False,"Whether to run the model in inference mode on the test set.") 17 | ``` 18 | 19 | 具体用法https://github.com/ProHiryu/bert-chinese-ner 20 | 21 | ## Top选手分享 22 | 23 | 第四名:https://github.com/rebornZH/2019-CCF-BDCI-NLP 24 | 25 | 第五名:https://github.com/light8lee/2019-BDCI-FinancialEntityDiscovery 26 | -------------------------------------------------------------------------------- /competition/DataFountain-互联网金融新实体发现/bert-chinese-ner.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/competition-baseline/99a1c3e3905573326fa2fc6d2ab2cc58286fee37/competition/DataFountain-互联网金融新实体发现/bert-chinese-ner.zip -------------------------------------------------------------------------------- /competition/DataFountain-企业网络资产及安全事件分析与可视化/README.md: -------------------------------------------------------------------------------- 1 | https://www.datafountain.cn/competitions/358 2 | 3 | 第二名开源:https://github.com/Mrzhangxiaohua/2019CCF_Visualization 4 | -------------------------------------------------------------------------------- /competition/DataFountain-企业非法集资风险预测/README.md: -------------------------------------------------------------------------------- 1 | 比赛链接:https://www.datafountain.cn/competitions/469/ 2 | 3 | 比赛直播:https://www.bilibili.com/video/BV1mf4y1q7az?p=2 4 | 5 | 线上843分数 6 | -------------------------------------------------------------------------------- /competition/DataFountain-基于OCR的身份证要素提取/README.md: -------------------------------------------------------------------------------- 1 | https://www.datafountain.cn/competitions/346 2 | 3 | 第一名代码:https://github.com/Mingtzge/2019-CCF-BDCI-OCR-MCZJ-OCR-IdentificationIDElement 4 | 5 | 2019CCF BDCI大赛 最佳创新探索奖获得者、OCR身份证要素提取单赛题冠军 天晨破晓团队 6 | -------------------------------------------------------------------------------- /competition/DataFountain-多人种人脸识别/README.md: -------------------------------------------------------------------------------- 1 | https://www.datafountain.cn/competitions/348 2 | 3 | 第一名:天才儿童代码,[方案分享](https://mp.weixin.qq.com/s?__biz=MzIwNDA5NDYzNA==&mid=2247483935&idx=1&sn=c82806f1c4fdd3c3c6e31a31a6faf75c&chksm=96c42fdaa1b3a6cc95e05cc401b97bb7588b86cf221664e3949de6473240debd6fcae5da3a93&token=1694266337),[代码分享](https://github.com/themostnewone/2019ccf) 4 | -------------------------------------------------------------------------------- /competition/DataFountain-技术需求与技术成果项目之间关联度计算模型/README.md: -------------------------------------------------------------------------------- 1 | baseline思路:pair 分类问题的bert baseline 2 | 3 | 代码存在一定问题,在encode阶段如果ach过长会导致req取不到,所以应该分开切片或者其他切片方式。 4 | 5 | https://www.datafountain.cn/competitions/359 6 | 7 | 第一名:https://github.com/Makaixin/Correlation-between-requirements-and-achievements 8 | 9 | 第二名:https://github.com/rebornZH/2019-CCF-BDCI-NLP 10 | -------------------------------------------------------------------------------- /competition/DataFountain-离散制造过程中典型工件的质量符合率预测/README.md: -------------------------------------------------------------------------------- 1 | https://www.datafountain.cn/competitions/351/ 2 | 3 | 第一名:https://github.com/CcIsHandsome/-TOP1- 4 | -------------------------------------------------------------------------------- /competition/DataFountain-视频版权检测算法/README.md: 
-------------------------------------------------------------------------------- 1 | https://www.datafountain.cn/competitions/354/ 2 | 3 | 4 | 5 | 1. 并行提取视频关键帧; 6 | 2. 通过resnet18提取关键帧特征; 7 | 3. 通过通过CNN特征计算得到query与refer对应关系; 8 | 4. 视频侵权时间段还需要进一步分析; 9 | 5. 截止到现在全网唯一成功提交的思路; 10 | -------------------------------------------------------------------------------- /competition/DataFountain-金融信息负面及主体判定/README.md: -------------------------------------------------------------------------------- 1 | https://www.datafountain.cn/competitions/353 2 | 3 | 冠军:https://github.com/xiong666/ccf_financial_negative 4 | 5 | 线上第一名:https://github.com/A-Rain/BDCI2019-Negative_Finance_Info_Judge 6 | 7 | 第二名:https://github.com/rebornZH/2019-CCF-BDCI-NLP 8 | 9 | 第三名:https://github.com/Chevalier1024/CCF-BDCI-ABSA 10 | -------------------------------------------------------------------------------- /competition/Kesci-中国华录杯人群密度检测/README.md: -------------------------------------------------------------------------------- 1 | 比赛链接:https://www.kesci.com/home/competition/5df1d33d23ea6d002b264ada/content 2 | 3 | 人群密度检测:在一张图片当中统计图片当中行人的数量。特别说明,当画面中行人数量大于 100 时,均按 100 计算。 4 | 5 | ![](https://github.com/weizheliu/Context-Aware-Crowd-Counting/raw/master/images/prediction.png) 6 | 7 | 比赛数据集链接:链接: https://pan.baidu.com/s/1wtmQUlsr_fcUKGTW1K-4oA 提取码: c2ab 8 | 9 | baseline思路,使用Crowd Counting进行预测,使用*Context-Aware Crowd Counting*的预训练权重: 10 | 11 | 1. `git clone https://github.com/weizheliu/Context-Aware-Crowd-Counting` 12 | 2. 下载pretrained model(part_B_pre.pth.tar),在我们分享的数据集中已经包含 13 | 3. `python test.py`即可,线上分数341左右 14 | -------------------------------------------------------------------------------- /competition/Kesci-中国华录杯人群密度检测/test.py: -------------------------------------------------------------------------------- 1 | import h5py 2 | import PIL.Image as Image 3 | import numpy as np 4 | import os 5 | import glob 6 | import scipy 7 | from image import * 8 | from model import CANNet 9 | import torch 10 | from torch.autograd import Variable 11 | 12 | from sklearn.metrics import mean_squared_error,mean_absolute_error 13 | 14 | from torchvision import transforms 15 | 16 | 17 | transform=transforms.Compose([ 18 | transforms.ToTensor(),transforms.Normalize(mean=[0.485, 0.456, 0.406], 19 | std=[0.229, 0.224, 0.225]), 20 | ]) 21 | 22 | # the folder contains all the test images 23 | img_folder='../A/' 24 | img_paths=[] 25 | 26 | for img_path in glob.glob(os.path.join(img_folder, '*')): 27 | img_paths.append(img_path) 28 | 29 | model = CANNet() 30 | 31 | model = model.cuda() 32 | 33 | checkpoint = torch.load('part_B_pre.pth.tar') 34 | 35 | model.load_state_dict(checkpoint['state_dict']) 36 | 37 | model.eval() 38 | 39 | pred= [] 40 | gt = [] 41 | 42 | # for i in xrange(len(img_paths)): 43 | # img = transform(Image.open(img_paths[i]).convert('RGB')).cuda() 44 | # img = img.unsqueeze(0) 45 | # h,w = img.shape[2:4] 46 | # h_d = h/2 47 | # w_d = w/2 48 | # img_1 = Variable(img[:,:,:h_d,:w_d].cuda()) 49 | # img_2 = Variable(img[:,:,:h_d,w_d:].cuda()) 50 | # img_3 = Variable(img[:,:,h_d:,:w_d].cuda()) 51 | # img_4 = Variable(img[:,:,h_d:,w_d:].cuda()) 52 | # density_1 = model(img_1).data.cpu().numpy() 53 | # density_2 = model(img_2).data.cpu().numpy() 54 | # density_3 = model(img_3).data.cpu().numpy() 55 | # density_4 = model(img_4).data.cpu().numpy() 56 | 57 | # pure_name = os.path.splitext(os.path.basename(img_paths[i]))[0] 58 | # # gt_file = h5py.File(img_paths[i].replace('.jpg','.h5').replace('images','ground_truth'),'r') 59 | # # groundtruth = 
np.asarray(gt_file['density']) 60 | # pred_sum = density_1.sum()+density_2.sum()+density_3.sum()+density_4.sum() 61 | # pred.append(pred_sum) 62 | # # gt.append(np.sum(groundtruth)) 63 | # print(img_paths[i], pred_sum) 64 | 65 | for i in xrange(len(img_paths)): 66 | img = Image.open(img_paths[i]) 67 | print('') 68 | print(img.size) 69 | if img.size[0] > 1200: 70 | img = img.resize((1024, int(img.size[1]*1024.0/img.size[0]))) 71 | # elif img.size[1] < 350: 72 | # img = img.resize((1024, int(img.size[1]*1024.0/img.size[0]))) 73 | print(img.size) 74 | 75 | img2 = transform(img.transpose(Image.FLIP_LEFT_RIGHT).convert('RGB')).cuda() 76 | img = transform(img.convert('RGB')).cuda() 77 | img2 = img2.unsqueeze(0) 78 | img = img.unsqueeze(0) 79 | h,w = img.shape[2:4] 80 | h_d = h/2 81 | w_d = w/2 82 | 83 | density_1 = model(img.cuda()).data.cpu().numpy() 84 | density_2 = model(img2.cuda()).data.cpu().numpy() 85 | 86 | # # img = img.unsqueeze(0) 87 | # h,w = img.shape[2:4] 88 | # h_d = h/2 89 | # w_d = w/2 90 | # img_1 = Variable(img[:,:,:h_d,:w_d].cuda()) 91 | # img_2 = Variable(img[:,:,:h_d,w_d:].cuda()) 92 | # img_3 = Variable(img[:,:,h_d:,:w_d].cuda()) 93 | # img_4 = Variable(img[:,:,h_d:,w_d:].cuda()) 94 | # density_3 = model(img_1).data.cpu().numpy() 95 | # density_4 = model(img_2).data.cpu().numpy() 96 | # density_5 = model(img_3).data.cpu().numpy() 97 | # density_6 = model(img_4).data.cpu().numpy() 98 | 99 | pure_name = os.path.splitext(os.path.basename(img_paths[i]))[0] 100 | # gt_file = h5py.File(img_paths[i].replace('.jpg','.h5').replace('images','ground_truth'),'r') 101 | # groundtruth = np.asarray(gt_file['density']) 102 | pred_sum = density_1.sum() + density_2.sum() 103 | pred.append(pred_sum/2) 104 | # gt.append(np.sum(groundtruth)) 105 | print(img_paths[i], pred_sum) 106 | 107 | import pandas as pd 108 | df = pd.DataFrame() 109 | df['file'] = [os.path.basename(x) for x in img_paths] 110 | df['man_count'] = pred 111 | df['man_count'] = df['man_count'].round() 112 | df['man_count'] = df['man_count'].astype(int) 113 | df.loc[df['man_count'] > 100, 'man_count'] = 100 114 | df.loc[df['man_count'] < 0, 'man_count'] = 0 115 | df.to_csv('../tmp2.csv', index=None) -------------------------------------------------------------------------------- /competition/Tianchi-2020数字中国创新大赛—算法赛:智慧海洋建设/README.md: -------------------------------------------------------------------------------- 1 | 比赛链接:https://tianchi.aliyun.com/competition/entrance/231768/introduction 2 | 3 | baseline地址:https://tianchi.aliyun.com/notebook-ai/detail?spm=5176.12586969.1002.3.163c5cfdJTbd9E&postId=87376 4 | 5 | -------------------------------------------------------------------------------- /competition/Tianchi-安泰杯跨境电商智能算法大赛/README.md: -------------------------------------------------------------------------------- 1 | https://tianchi.aliyun.com/competition/entrance/231718/information 2 | 3 | - 比赛数据:https://pan.baidu.com/s/1rHXSI44LjIi_dwDN5dZkFg 提取码: 2h6q 4 | - 冠军:[法国南部代码](https://github.com/RainFung/Tianchi-AntaiCup-International-E-commerce-Artificial-Intelligence-Challenge) 5 | -------------------------------------------------------------------------------- /competition/Tianchi-心电人机智能大赛心电异常事件预测/README.md: -------------------------------------------------------------------------------- 1 | https://tianchi.aliyun.com/competition/entrance/231754/introduction 2 | 3 | 冠军分享:https://github.com/RandomWalk-xzq/Hefei_ECG_TOP1 4 | -------------------------------------------------------------------------------- 
/competition/Tianchi-第三届阿里云安全算法挑战赛/README.md: -------------------------------------------------------------------------------- 1 | https://tianchi.aliyun.com/competition/entrance/231668/information 2 | -------------------------------------------------------------------------------- /competition/Tianchi-第三届阿里云安全算法挑战赛/api.csv: -------------------------------------------------------------------------------- 1 | GetSystemTimeAsFileTime 2 | NtAllocateVirtualMemory 3 | NtFreeVirtualMemory 4 | SetUnhandledExceptionFilter 5 | LdrLoadDll 6 | LdrGetProcedureAddress 7 | LdrUnloadDll 8 | NtCreateMutant 9 | NtCreateSection 10 | NtMapViewOfSection 11 | CoInitializeEx 12 | RegOpenKeyExW 13 | CoUninitialize 14 | NtUnmapViewOfSection 15 | NtClose 16 | LdrGetDllHandle 17 | NtTerminateProcess 18 | NtOpenKey 19 | NtQueryValueKey 20 | __exception__ 21 | SetErrorMode 22 | RegQueryValueExW 23 | RegCloseKey 24 | NtCreateFile 25 | NtWriteFile 26 | CreateProcessInternalW 27 | NtProtectVirtualMemory 28 | RegOpenKeyExA 29 | NtQueryAttributesFile 30 | LoadStringA 31 | GetSystemMetrics 32 | RegQueryValueExA 33 | FindResourceExW 34 | LoadResource 35 | GetSystemWindowsDirectoryW 36 | FindResourceA 37 | SizeofResource 38 | GetFileVersionInfoSizeW 39 | GetFileVersionInfoW 40 | DrawTextExA 41 | WSAStartup 42 | socket 43 | setsockopt 44 | closesocket 45 | bind 46 | NtSetInformationFile 47 | NtDeviceIoControlFile 48 | CreateThread 49 | NtOpenFile 50 | GetSystemDirectoryW 51 | NtOpenMutant 52 | NtOpenSection 53 | RegEnumKeyExW 54 | LoadStringW 55 | GetCursorPos 56 | EnumWindows 57 | GetKeyState 58 | NtQuerySystemInformation 59 | FindFirstFileExW 60 | NtOpenDirectoryObject 61 | GetVolumePathNameW 62 | CreateDirectoryW 63 | GetFileAttributesW 64 | DeleteFileW 65 | CopyFileA 66 | CreateToolhelp32Snapshot 67 | Thread32First 68 | Thread32Next 69 | NtDuplicateObject 70 | GetSystemInfo 71 | NtOpenKeyEx 72 | GetTempPathW 73 | SetFilePointer 74 | NtReadFile 75 | GetFileType 76 | GetTimeZoneInformation 77 | SetWindowsHookExA 78 | NtEnumerateKey 79 | NtQueryInformationFile 80 | listen 81 | connect 82 | gethostbyname 83 | NtOpenProcess 84 | WriteProcessMemory 85 | RtlAddVectoredExceptionHandler 86 | ReadProcessMemory 87 | FindWindowA 88 | SHGetFolderPathW 89 | CreateActCtxW 90 | FindResourceW 91 | SetWindowsHookExW 92 | GetForegroundWindow 93 | RegQueryInfoKeyW 94 | RegEnumValueW 95 | GetFileSizeEx 96 | DrawTextExW 97 | Process32FirstW 98 | Process32NextW 99 | NtReadVirtualMemory 100 | OutputDebugStringA 101 | SearchPathW 102 | OleInitialize 103 | CryptAcquireContextW 104 | GetFileSize 105 | SetEndOfFile 106 | GlobalMemoryStatus 107 | CoGetClassObject 108 | CoCreateInstance 109 | NtQueryKey 110 | NtSetValueKey 111 | NtDelayExecution 112 | RegEnumKeyW 113 | NtQueryDirectoryFile 114 | GetFileInformationByHandleEx 115 | NtEnumerateValueKey 116 | GetUserNameExW 117 | GetComputerNameW 118 | GetUserNameW 119 | DeviceIoControl 120 | FindWindowW 121 | RegCreateKeyExW 122 | SendNotifyMessageW 123 | RegSetValueExW 124 | GetFileAttributesExW 125 | GetFileInformationByHandle 126 | SetFileTime 127 | LookupAccountSidW 128 | IsDebuggerPresent 129 | NtResumeThread 130 | GlobalMemoryStatusEx 131 | GetShortPathNameW 132 | NtCreateKey 133 | CoInitializeSecurity 134 | UuidCreate 135 | NtCreateThreadEx 136 | RtlAddVectoredContinueHandler 137 | LookupPrivilegeValueW 138 | NtOpenThread 139 | Module32FirstW 140 | Module32NextW 141 | GetKeyboardState 142 | WriteConsoleA 143 | GetVolumeNameForVolumeMountPointW 144 | NtQueryFullAttributesFile 145 | SetFilePointerEx 
146 | GetVolumePathNamesForVolumeNameW 147 | system 148 | WriteConsoleW 149 | RemoveDirectoryA 150 | GetNativeSystemInfo 151 | GetSystemDirectoryA 152 | CopyFileW 153 | GetAdaptersInfo 154 | RegEnumValueA 155 | RegDeleteValueW 156 | RegCreateKeyExA 157 | GetUserNameA 158 | SetFileAttributesW 159 | RegEnumKeyExA 160 | OpenSCManagerA 161 | OpenServiceA 162 | RegSetValueExA 163 | RegDeleteValueA 164 | InternetCrackUrlA 165 | InternetSetOptionA 166 | InternetGetConnectedState 167 | InternetOpenW 168 | InternetSetStatusCallback 169 | InternetConnectW 170 | HttpOpenRequestW 171 | InternetQueryOptionA 172 | HttpSendRequestW 173 | HttpQueryInfoA 174 | InternetCloseHandle 175 | getaddrinfo 176 | GetAdaptersAddresses 177 | getsockname 178 | select 179 | CryptProtectMemory 180 | CryptUnprotectMemory 181 | GetComputerNameA 182 | GetFileVersionInfoSizeExW 183 | GetFileVersionInfoExW 184 | InternetCrackUrlW 185 | SHGetSpecialFolderLocation 186 | CryptHashData 187 | NetUserGetInfo 188 | shutdown 189 | CreateServiceA 190 | StartServiceA 191 | ShellExecuteExW 192 | SetStdHandle 193 | NtQueryMultipleValueKey 194 | CreateJobObjectW 195 | SetInformationJobObject 196 | GetSystemWindowsDirectoryA 197 | FindResourceExA 198 | RemoveDirectoryW 199 | GetDiskFreeSpaceExW 200 | MoveFileWithProgressW 201 | NetShareEnum 202 | RegDeleteKeyW 203 | GetDiskFreeSpaceW 204 | RegQueryInfoKeyA 205 | OpenSCManagerW 206 | OpenServiceW 207 | CryptAcquireContextA 208 | GetAddrInfoW 209 | NtTerminateThread 210 | CreateServiceW 211 | NtDeleteKey 212 | GetBestInterfaceEx 213 | timeGetTime 214 | InternetOpenA 215 | CryptEncrypt 216 | InternetConnectA 217 | HttpOpenRequestA 218 | HttpSendRequestA 219 | StartServiceW 220 | ControlService 221 | DeleteService 222 | CryptExportKey 223 | CryptCreateHash 224 | WSASocketW 225 | NtSuspendThread 226 | NtGetContextThread 227 | UnhookWindowsHookEx 228 | CertOpenStore 229 | CryptDecodeObjectEx 230 | CertControlStore 231 | NtDeleteValueKey 232 | GetAsyncKeyState 233 | EnumServicesStatusW 234 | DnsQuery_W 235 | FindWindowExW 236 | FindFirstFileExA 237 | RegDeleteKeyA 238 | FindWindowExA 239 | InternetOpenUrlA 240 | SendNotifyMessageA 241 | CoCreateInstanceEx 242 | IWbemServices_ExecQuery 243 | WSASocketA 244 | URLDownloadToFileW 245 | accept 246 | NtCreateDirectoryObject 247 | CertCreateCertificateContext 248 | AssignProcessToJobObject 249 | SetFileInformationByHandle 250 | NetGetJoinInformation 251 | InternetReadFile 252 | RtlRemoveVectoredExceptionHandler 253 | CryptGenKey 254 | MessageBoxTimeoutA 255 | NetUserGetLocalGroups 256 | DeleteUrlCacheEntryW 257 | send 258 | recv 259 | ioctlsocket 260 | WSARecv 261 | WSASend 262 | sendto 263 | CopyFileExW 264 | RegisterHotKey 265 | MessageBoxTimeoutW 266 | CreateRemoteThread 267 | GetUserNameExA 268 | EnumServicesStatusA 269 | NtQueueApcThread 270 | RtlCreateUserThread 271 | InternetOpenUrlW 272 | CryptProtectData 273 | WSAConnect 274 | CryptDecrypt 275 | CreateDirectoryExW 276 | IWbemServices_ExecMethod 277 | recvfrom 278 | ObtainUserAgentString 279 | DnsQuery_A 280 | ReadCabinetState 281 | NtSetContextThread 282 | WSARecvFrom 283 | WSASendTo 284 | NtLoadKey 285 | NtLoadDriver 286 | DeleteUrlCacheEntryA 287 | GetInterfaceInfo 288 | NtWriteVirtualMemory 289 | RtlCompressBuffer 290 | NtShutdownSystem 291 | TaskDialog 292 | NtDeleteFile 293 | InternetGetConnectedStateExW 294 | CryptUnprotectData 295 | InternetGetConnectedStateExA 296 | NtSaveKeyEx 297 | NtSaveKey 298 | CertOpenSystemStoreA 299 | PRF 300 | ExitWindowsEx 301 | WSAAccept 302 | 
CreateRemoteThreadEx 303 | CertOpenSystemStoreW 304 | NtUnloadDriver 305 | NtCreateThread 306 | NtLoadKeyEx 307 | InternetWriteFile 308 | RtlDecompressBuffer 309 | -------------------------------------------------------------------------------- /competition/TinyMind人民币面值&冠字号编码识别挑战赛/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/competition-baseline/99a1c3e3905573326fa2fc6d2ab2cc58286fee37/competition/TinyMind人民币面值&冠字号编码识别挑战赛/.DS_Store -------------------------------------------------------------------------------- /competition/TinyMind人民币面值&冠字号编码识别挑战赛/README.md: -------------------------------------------------------------------------------- 1 | # TinyMind人民币面值&冠字号编码识别挑战赛 2 | 3 | https://www.tinymind.cn/competitions/47 4 | 5 | 任务1面值分类100分代码,和任务2编码识别第五名代码。 6 | 7 | - 任务1:直接是一个分类问题; 8 | - 任务2:可以抽象成一个字符识别问题; 9 | - 先用检测模型(Fast-RCNN)进行检测; 10 | - 再使用识别模型CRNN或者muti-CNN进行识别 11 | -------------------------------------------------------------------------------- /competition/TinyMind人民币面值&冠字号编码识别挑战赛/task1/README.md: -------------------------------------------------------------------------------- 1 | 1. 修改`predict_rmb.py`文件中对应的路径; 2 | 2. `python predict_rmb.py` 3 | -------------------------------------------------------------------------------- /competition/TinyMind人民币面值&冠字号编码识别挑战赛/task1/predict_rmb.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os, sys, glob, argparse 3 | import pandas as pd 4 | import numpy as np 5 | from tqdm import tqdm 6 | 7 | import time, datetime 8 | import pdb, traceback 9 | 10 | import cv2 11 | from PIL import Image 12 | 13 | from sklearn.preprocessing import LabelEncoder 14 | from sklearn.model_selection import train_test_split, StratifiedKFold 15 | 16 | import torch 17 | torch.manual_seed(0) 18 | torch.backends.cudnn.deterministic = False 19 | torch.backends.cudnn.benchmark = True 20 | 21 | import torchvision.models as models 22 | import torchvision.transforms as transforms 23 | import torchvision.datasets as datasets 24 | import torch.nn as nn 25 | import torch.nn.functional as F 26 | import torch.optim as optim 27 | from torch.autograd import Variable 28 | from torch.utils.data.dataset import Dataset 29 | 30 | class QRDataset(Dataset): 31 | def __init__(self, img_path, img_label, transform=None): 32 | self.img_path = img_path 33 | self.img_label=img_label 34 | 35 | if transform is not None: 36 | self.transform = transform 37 | else: 38 | self.transform = None 39 | 40 | def __getitem__(self, index): 41 | start_time = time.time() 42 | img = Image.open(self.img_path[index]) 43 | 44 | if self.transform is not None: 45 | img = self.transform(img) 46 | 47 | return img, torch.from_numpy(np.array([self.img_label[index]])) 48 | 49 | def __len__(self): 50 | return len(self.img_path) 51 | 52 | class VisitNet(nn.Module): 53 | def __init__(self): 54 | super(VisitNet, self).__init__() 55 | model = models.resnet18(False) 56 | model.avgpool = nn.AdaptiveAvgPool2d(1) 57 | model.fc = nn.Linear(512, 256) 58 | self.resnet = model 59 | 60 | def forward(self, img): 61 | out = self.resnet(img) 62 | return F.log_softmax(out, dim=1) 63 | 64 | def predict(test_loader, model, tta=10): 65 | # switch to evaluate mode 66 | model.eval() 67 | 68 | test_pred_tta = None 69 | for _ in range(tta): 70 | test_pred = [] 71 | with torch.no_grad(): 72 | end = time.time() 73 | for i, (input, target) in enumerate(test_loader): 74 | input = 
input.cuda() 75 | target = target.cuda() 76 | 77 | # compute output 78 | output = model(input) 79 | output = output.data.cpu().numpy() 80 | 81 | test_pred.append(output) 82 | test_pred = np.vstack(test_pred) 83 | 84 | if test_pred_tta is None: 85 | test_pred_tta = test_pred 86 | else: 87 | test_pred_tta += test_pred 88 | 89 | return test_pred_tta 90 | 91 | 92 | def main(): 93 | 94 | # 修改输入的路径 95 | df_train = pd.read_csv('../../input/train_face_value_label.csv', dtype={' label': object, 'name': object}) 96 | lbl = LabelEncoder() 97 | df_train['y'] = lbl.fit_transform(df_train[' label'].values) 98 | 99 | # 修改输入的路径 100 | test_path = glob.glob('../../input/public_test_data/*.jpg') 101 | test_path = np.array(test_path) 102 | 103 | test_loader = torch.utils.data.DataLoader( 104 | QRDataset(test_path, np.zeros(len(test_path)), 105 | transforms.Compose([ 106 | # transforms.Resize((124, 124)), 107 | transforms.Resize(280), 108 | transforms.RandomCrop((256, 256)), 109 | transforms.RandomHorizontalFlip(), 110 | transforms.RandomVerticalFlip(), 111 | transforms.ToTensor(), 112 | transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) 113 | ]) 114 | ), batch_size=10, shuffle=False, num_workers=10, pin_memory=True 115 | ) 116 | 117 | model = VisitNet() 118 | model = model.cuda() 119 | model.load_state_dict(torch.load('./resnet18_fold0_11_Acc@1100.00(100.00).pt')) 120 | 121 | test_pred = predict(test_loader, model, 10) 122 | test_pred = np.vstack(test_pred) 123 | test_pred = np.argmax(test_pred, 1) 124 | 125 | test_pred = lbl.inverse_transform(test_pred) 126 | test_csv = pd.DataFrame() 127 | test_csv['name'] = [x.split('/')[-1] for x in test_path] 128 | test_csv['label'] = test_pred 129 | test_csv.sort_values(by='name', inplace=True) 130 | test_csv.to_csv('tmp_newmodel_resnet18_tta10.csv', index=None, sep=',') 131 | 132 | if __name__== "__main__": 133 | main() 134 | -------------------------------------------------------------------------------- /competition/TinyMind人民币面值&冠字号编码识别挑战赛/task2/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/competition-baseline/99a1c3e3905573326fa2fc6d2ab2cc58286fee37/competition/TinyMind人民币面值&冠字号编码识别挑战赛/task2/.DS_Store -------------------------------------------------------------------------------- /competition/TinyMind人民币面值&冠字号编码识别挑战赛/task2/2_predict_faster_rcnn.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import os, glob, shutil, codecs 3 | 4 | import mxnet as mx 5 | from matplotlib import pyplot as plt 6 | import gluoncv 7 | from gluoncv import model_zoo, data, utils 8 | 9 | net = model_zoo.get_model('faster_rcnn_resnet50_v1b_voc', ctx=mx.gpu(0), pretrained=False) 10 | net.load_parameters('./faster_rcnn_resnet50_v1b_voc_0002_0.0519.params') 11 | net.classes = ['zipcode'] 12 | net.collect_params().reset_ctx(ctx = mx.gpu(0)) 13 | 14 | # MXNET_CUDNN_AUTOTUNE_DEFAULT=0 python 2_predict_faster_rcnn.py 15 | 16 | with codecs.open('./data/train_data_box.csv', 'w') as up: 17 | for path in glob.glob('../input/train_data/*.jpg'): 18 | orig_img_cv2 = cv2.imread(path) 19 | x, orig_img = data.transforms.presets.rcnn.load_test(path) 20 | x = x.as_in_context(mx.gpu(0)) 21 | box_ids, scores, bboxes = net(x) 22 | bboxes = bboxes.asnumpy()[0][0].astype(int) 23 | 24 | y1, x1, y2, x2 = bboxes 25 | x1*=(orig_img_cv2.shape[0]*1.0/orig_img.shape[0]) 26 | x2*=(orig_img_cv2.shape[0]*1.0/orig_img.shape[0]) 27 | 28 | 
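        # rescale the y coordinates the same way, using the width ratio between the original image and the resized inference image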
y1*=(orig_img_cv2.shape[1]*1.0/orig_img.shape[1]) 29 | y2*=(orig_img_cv2.shape[1]*1.0/orig_img.shape[1]) 30 | 31 | x1, x2 = int(x1), int(x2) 32 | y1, y2 = int(y1), int(y2) 33 | 34 | # x1-=10; x2+=10 35 | # y1-=10; y2+=10 36 | 37 | # plt.imshow(orig_img_cv2[int(x1):int(x2), int(y1):int(y2), :]) 38 | cv2.imwrite('./data/data/'+path.split('/')[-1], orig_img_cv2[int(x1):int(x2), int(y1):int(y2)]) 39 | up.write('{0},{1},{2},{3},{4}\n'.format(path, x1, y1, x2, y2)) 40 | 41 | with codecs.open('./data/public_test_data_box.csv', 'w') as up: 42 | for path in glob.glob('../input/public_test_data/*.jpg'): 43 | orig_img_cv2 = cv2.imread(path) 44 | x, orig_img = data.transforms.presets.rcnn.load_test(path) 45 | x = x.as_in_context(mx.gpu(0)) 46 | box_ids, scores, bboxes = net(x) 47 | bboxes = bboxes.asnumpy()[0][0].astype(int) 48 | 49 | y1, x1, y2, x2 = bboxes 50 | x1*=(orig_img_cv2.shape[0]*1.0/orig_img.shape[0]) 51 | x2*=(orig_img_cv2.shape[0]*1.0/orig_img.shape[0]) 52 | 53 | y1*=(orig_img_cv2.shape[1]*1.0/orig_img.shape[1]) 54 | y2*=(orig_img_cv2.shape[1]*1.0/orig_img.shape[1]) 55 | 56 | x1, x2 = int(x1), int(x2) 57 | y1, y2 = int(y1), int(y2) 58 | 59 | #x1-=10; x2+=10 60 | # y1-=10; y2+=10 61 | 62 | # plt.imshow(orig_img_cv2[int(x1):int(x2), int(y1):int(y2), :]) 63 | cv2.imwrite('./data/data/'+path.split('/')[-1], orig_img_cv2[int(x1):int(x2), int(y1):int(y2)]) 64 | up.write('{0},{1},{2},{3},{4}\n'.format(path, x1, y1, x2, y2)) 65 | 66 | with codecs.open('./data/private_test_data_box.csv', 'w') as up: 67 | for path in glob.glob('../input/private_test_data/*.jpg'): 68 | orig_img_cv2 = cv2.imread(path) 69 | x, orig_img = data.transforms.presets.rcnn.load_test(path) 70 | x = x.as_in_context(mx.gpu(0)) 71 | box_ids, scores, bboxes = net(x) 72 | bboxes = bboxes.asnumpy()[0][0].astype(int) 73 | 74 | y1, x1, y2, x2 = bboxes 75 | x1*=(orig_img_cv2.shape[0]*1.0/orig_img.shape[0]) 76 | x2*=(orig_img_cv2.shape[0]*1.0/orig_img.shape[0]) 77 | 78 | y1*=(orig_img_cv2.shape[1]*1.0/orig_img.shape[1]) 79 | y2*=(orig_img_cv2.shape[1]*1.0/orig_img.shape[1]) 80 | 81 | x1, x2 = int(x1), int(x2) 82 | y1, y2 = int(y1), int(y2) 83 | 84 | #x1-=10; x2+=10 85 | # y1-=10; y2+=10 86 | 87 | # plt.imshow(orig_img_cv2[int(x1):int(x2), int(y1):int(y2), :]) 88 | cv2.imwrite('./data/data/'+path.split('/')[-1], orig_img_cv2[int(x1):int(x2), int(y1):int(y2)]) 89 | up.write('{0},{1},{2},{3},{4}\n'.format(path, x1, y1, x2, y2)) -------------------------------------------------------------------------------- /competition/TinyMind人民币面值&冠字号编码识别挑战赛/task2/3_savejson.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import os, glob, shutil, codecs, json 3 | from tqdm import tqdm, tqdm_notebook 4 | # %pylab inline 5 | 6 | 7 | 8 | desc = {} 9 | desc['abc'] = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ' 10 | 11 | desc['train'] = [] 12 | desc['test'] = [] 13 | desc['pb'] = [] 14 | 15 | import pandas as pd 16 | df_train_label = pd.read_csv('../input/train_id_label.csv') 17 | df_submit = pd.read_csv('./crnn-pytorch/pb_rcnn_label.csv') 18 | df_submit['label'] = df_submit['label'].apply(lambda x: ' '+x) 19 | df_submit.columns = ['name', ' label'] 20 | 21 | df_train_label = pd.concat([df_train_label, df_submit], axis=0, ignore_index=True) 22 | print(df_train_label.shape) 23 | 24 | train_guanzi = df_train_label[' label'].apply(lambda x: x[-4:]).unique() 25 | 26 | 27 | def checkImageIsValid(imagePath): 28 | img = cv2.imread(imagePath) 29 | if img is None: 30 | return False 31 | 32 | with open(imagePath, 
'rb') as f: 33 | imageBin = f.read() 34 | 35 | if imageBin is None: 36 | return False 37 | 38 | try: 39 | imageBuf = np.fromstring(imageBin, dtype=np.uint8) 40 | img = cv2.imdecode(imageBuf, cv2.IMREAD_GRAYSCALE) 41 | imgH, imgW = img.shape[0], img.shape[1] 42 | if imgH * imgW == 0: 43 | return False 44 | return True 45 | except: 46 | return False 47 | 48 | bad_img_path = [] 49 | for x in df_train_label['name'].values: 50 | if not checkImageIsValid('./data/data/'+x): 51 | bad_img_path.append(x) 52 | 53 | 54 | import numpy as np 55 | from sklearn.model_selection import KFold, StratifiedKFold 56 | X = np.zeros((df_train_label['name'].shape[0], 2)) 57 | kf = KFold(n_splits=24) 58 | kf.get_n_splits(X) 59 | 60 | print(kf) 61 | fold_idx=0 62 | for train_index, test_index in kf.split(X, df_train_label[' label'].apply(lambda x:x[1:2])): 63 | print("TRAIN:", train_index, "TEST:", test_index) 64 | 65 | desc['fold'+str(fold_idx)+'_train'] = [] 66 | desc['fold'+str(fold_idx)+'_test'] = [] 67 | 68 | for row in df_train_label.iloc[train_index].iterrows(): 69 | # desc['fold'+str(fold_idx)+'_train'].append({'text':row[1][' label'].strip(), 'name':row[1]['name']}) 70 | # continue 71 | 72 | if row[1]['name'] in bad_img_path: 73 | continue 74 | 75 | if checkImageIsValid('./data/data/'+row[1]['name']): 76 | desc['fold'+str(fold_idx)+'_train'].append({'text':row[1][' label'].strip(), 'name':row[1]['name']}) 77 | else: 78 | print('./data/data/'+row[1]['name']) 79 | 80 | for row in df_train_label.iloc[test_index].iterrows(): 81 | # desc['fold'+str(fold_idx)+'_test'].append({'text':row[1][' label'].strip(), 'name':row[1]['name']}) 82 | # continue 83 | 84 | if row[1]['name'] in bad_img_path: 85 | continue 86 | 87 | if checkImageIsValid('./data/data/'+row[1]['name']): 88 | desc['fold'+str(fold_idx)+'_test'].append({'text':row[1][' label'].strip(), 'name':row[1]['name']}) 89 | else: 90 | print('./data/data/'+row[1]['name']) 91 | 92 | fold_idx+=1 93 | 94 | for row in glob.glob('../input/private_test_data/*'): 95 | desc['pb'].append({'text':'QJ69411105', 'name':row.split('/')[-1]}) 96 | 97 | with open('./data/desc.json', 'w') as outfile: 98 | json.dump(desc, outfile) -------------------------------------------------------------------------------- /competition/TinyMind人民币面值&冠字号编码识别挑战赛/task2/VOC2007.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/competition-baseline/99a1c3e3905573326fa2fc6d2ab2cc58286fee37/competition/TinyMind人民币面值&冠字号编码识别挑战赛/task2/VOC2007.zip -------------------------------------------------------------------------------- /competition/TinyMind人民币面值&冠字号编码识别挑战赛/task2/crnn-pytorch/README.md: -------------------------------------------------------------------------------- 1 | Convolutional Recurrent Neural Network 2 | ====================================== 3 | 4 | This software implements OCR system using CNN + RNN + CTCLoss, inspired by CRNN network. 5 | 6 | Usage 7 | ----- 8 | 9 | ` 10 | python ./train.py --help 11 | ` 12 | 13 | Demo 14 | ---- 15 | 16 | 1. Train simple OCR using TestDataset data generator. 17 | Training for ~60-100 epochs. 18 | ``` 19 | python train.py --test-init True --test-epoch 10 --output-dir 20 | ``` 21 | 22 | 2. Run test for trained model with visualization mode. 23 | ``` 24 | python test.py --snapshot /crnn_resnet18_10_best --visualize True 25 | ``` 26 | 27 | Train on custom dataset 28 | ----------------------- 29 | 30 | 1. 
Create dataset 31 | 32 | - Structure of dataset: 33 | ``` 34 | 35 | ---- data 36 | -------- 37 | ... 38 | -------- 39 | ---- desc.json 40 | ``` 41 | 42 | - Structure of desc.json: 43 | ``` 44 | { 45 | "abc": , 46 | "train": [ 47 | { 48 | "text": 49 | "name": 50 | }, 51 | ... 52 | { 53 | "text": 54 | "name": 55 | } 56 | ], 57 | "test": [ 58 | { 59 | "text": 60 | "name": 61 | }, 62 | ... 63 | { 64 | "text": 65 | "name": 66 | } 67 | ] 68 | } 69 | ``` 70 | 71 | 2. Train simple OCR using custom dataset. 72 | ``` 73 | python train.pt --test-init True --test-epoch 10 --output-dir --data-path 74 | ``` 75 | 76 | 3. Run test for trained model with visualization mode. 77 | ``` 78 | python test.py --snapshot /crnn_resnet18_10_best --visualize True --data-path 79 | ``` 80 | 81 | 82 | Dependence 83 | ---------- 84 | * pytorch 0.3.0 + 85 | * [warp-ctc](https://github.com/SeanNaren/warp-ctc) 86 | 87 | Articles 88 | -------- 89 | 90 | * [An End-to-End Trainable Neural Network for Image-based Sequence Recognition and Its Application to Scene Text Recognition](https://arxiv.org/abs/1507.05717) 91 | * [Connectionist Temporal Classification: Labelling Unsegmented Sequence Data with Recurrent Neural Networks](https://dl.acm.org/citation.cfm?id=1143891) 92 | -------------------------------------------------------------------------------- /competition/TinyMind人民币面值&冠字号编码识别挑战赛/task2/crnn-pytorch/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/competition-baseline/99a1c3e3905573326fa2fc6d2ab2cc58286fee37/competition/TinyMind人民币面值&冠字号编码识别挑战赛/task2/crnn-pytorch/__init__.py -------------------------------------------------------------------------------- /competition/TinyMind人民币面值&冠字号编码识别挑战赛/task2/crnn-pytorch/dataset/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/competition-baseline/99a1c3e3905573326fa2fc6d2ab2cc58286fee37/competition/TinyMind人民币面值&冠字号编码识别挑战赛/task2/crnn-pytorch/dataset/__init__.py -------------------------------------------------------------------------------- /competition/TinyMind人民币面值&冠字号编码识别挑战赛/task2/crnn-pytorch/dataset/collate_fn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | def text_collate(batch): 5 | img = list() 6 | seq = list() 7 | seq_len = list() 8 | for sample in batch: 9 | img.append(torch.from_numpy(sample["img"].transpose((2, 0, 1))).float()) 10 | seq.extend(sample["seq"]) 11 | seq_len.append(sample["seq_len"]) 12 | img = torch.stack(img) 13 | seq = torch.Tensor(seq).int() 14 | seq_len = torch.Tensor(seq_len).int() 15 | batch = {"img": img, "seq": seq, "seq_len": seq_len} 16 | return batch 17 | -------------------------------------------------------------------------------- /competition/TinyMind人民币面值&冠字号编码识别挑战赛/task2/crnn-pytorch/dataset/data_transform.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | import cv2 4 | import torch 5 | import albumentations.augmentations.functional as albumentations 6 | 7 | class ToTensor(object): 8 | def __call__(self, sample): 9 | sample["img"] = torch.from_numpy(sample["img"].transpose((2, 0, 1))).float() 10 | # sample["img"][0] = (sample["img"][0] - 0.485)/0.229 11 | # sample["img"][0] = (sample["img"][0] - 0.456)/0.224 12 | # sample["img"][0] = (sample["img"][0] - 0.406)/0.225 13 | 14 | 
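# Descriptive note (not in the original source): the label sequence is converted to an int
# tensor here because the CTC loss this CRNN is trained with (see the warp-ctc dependency in
# the README above) expects integer character indices, with 0 reserved for the blank symbol.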
sample["seq"] = torch.Tensor(sample["seq"]).int() 15 | return sample 16 | 17 | 18 | class Resize(object): 19 | def __init__(self, size=(320, 32)): 20 | self.size = size 21 | 22 | def __call__(self, sample): 23 | if sample["img"] is None: 24 | return np.zeros((320, 32, 3)) 25 | 26 | else: 27 | sample["img"] = cv2.resize(sample["img"], self.size) 28 | sample["img"] = sample["img"].astype(float)/255.0 29 | sample["img"][0] = (sample["img"][0] - 0.485)/0.229 30 | sample["img"][0] = (sample["img"][0] - 0.456)/0.224 31 | sample["img"][0] = (sample["img"][0] - 0.406)/0.225 32 | return sample 33 | 34 | 35 | class Rotation(object): 36 | def __init__(self, angle=5, fill_value=0, p = 0.5): 37 | self.angle = angle 38 | self.fill_value = fill_value 39 | self.p = p 40 | 41 | def __call__(self, sample): 42 | if np.random.uniform(0.0, 1.0) < self.p: 43 | return sample 44 | h,w,_ = sample["img"].shape 45 | ang_rot = np.random.uniform(self.angle) - self.angle/2 46 | transform = cv2.getRotationMatrix2D((w/2, h/2), ang_rot, 1) 47 | sample["img"] = cv2.warpAffine(sample["img"], transform, (w,h), borderValue = self.fill_value) 48 | return sample 49 | 50 | 51 | class Translation(object): 52 | def __init__(self, fill_value=0, p = 0.5): 53 | self.fill_value = fill_value 54 | self.p = p 55 | 56 | def __call__(self, sample): 57 | if np.random.uniform(0.0, 1.0) < self.p: 58 | return sample 59 | h,w,_ = sample["img"].shape 60 | trans_range = [w / 20, h / 20] 61 | tr_x = trans_range[0]*np.random.uniform()-trans_range[0]/2 62 | tr_y = trans_range[1]*np.random.uniform()-trans_range[1]/2 63 | transform = np.float32([[1,0, tr_x], [0,1, tr_y]]) 64 | sample["img"] = cv2.warpAffine(sample["img"], transform, (w,h), borderValue = self.fill_value) 65 | return sample 66 | 67 | 68 | class Scale(object): 69 | def __init__(self, scale=[0.5, 1.2], fill_value=0, p = 0.5): 70 | self.scale = scale 71 | self.fill_value = fill_value 72 | self.p = p 73 | 74 | def __call__(self, sample): 75 | if np.random.uniform(0.0, 1.0) < self.p: 76 | return sample 77 | h, w, _ = sample["img"].shape 78 | scale = np.random.uniform(self.scale[0], self.scale[1]) 79 | transform = np.float32([[scale, 0, 0],[0, scale, 0]]) 80 | sample["img"] = cv2.warpAffine(sample["img"], transform, (w,h), borderValue = self.fill_value) 81 | return sample 82 | 83 | # add lyz 84 | class Snow(object): 85 | def __init__(self, p = 0.5): 86 | self.p = p 87 | 88 | def __call__(self, sample): 89 | if np.random.uniform(0.0, 1.0) < self.p or not sample["aug"]: 90 | return sample 91 | h, w, _ = sample["img"].shape 92 | sample["img"] = albumentations.add_snow(sample["img"], snow_point=0.5, brightness_coeff=2) 93 | return sample 94 | 95 | class Contrast(object): 96 | def __init__(self, p = 0.5): 97 | self.p = p 98 | 99 | def __call__(self, sample): 100 | if np.random.uniform(0.0, 1.0) < self.p: 101 | return sample 102 | h, w, _ = sample["img"].shape 103 | sample["img"] = albumentations.brightness_contrast_adjust(sample["img"], beta=np.random.uniform(0.0, 1.0)+0.1) 104 | # sample["img"] = cv2.GaussianBlur(sample["img"],(3,3),0) 105 | return sample 106 | 107 | class Grid_distortion(object): 108 | def __init__(self, p = 0.5): 109 | self.p = p 110 | 111 | def __call__(self, sample): 112 | # print('grid', np.random.uniform(0.0, 1.0)) 113 | 114 | if np.random.uniform(0.0, 1.0) < self.p: 115 | return sample 116 | h, w, _ = sample["img"].shape 117 | 118 | # grid_distortion 119 | if np.random.uniform(0.0, 1.0) < self.p: 120 | num_steps=15 121 | distort_limit=[-0.05,0.05] 122 | stepsx = [1 + 
random.uniform(distort_limit[0], distort_limit[1]) for i in 123 | range(num_steps + 1)] 124 | stepsy = [1 + random.uniform(distort_limit[0], distort_limit[1]) for i in 125 | range(num_steps + 1)] 126 | sample["img"]=albumentations.grid_distortion(sample["img"],5,stepsx, stepsy) 127 | # elastic_transform 128 | else: 129 | sample["img"]=albumentations.elastic_transform(sample["img"], alpha=5, sigma=1, alpha_affine=random.uniform(0,2), 130 | interpolation=cv2.INTER_LINEAR, border_mode=cv2.BORDER_REFLECT_101,) 131 | 132 | if np.random.uniform(0.0, 1.0) < self.p-0.2: 133 | sample["img"]=albumentations.jpeg_compression(sample["img"], random.randint(20, 100)) 134 | return sample -------------------------------------------------------------------------------- /competition/TinyMind人民币面值&冠字号编码识别挑战赛/task2/crnn-pytorch/dataset/test_data.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import torch 4 | from torch.utils.data import Dataset 5 | import string 6 | import random 7 | 8 | class TestDataset(Dataset): 9 | def __init__(self, 10 | epoch_len = 10000, 11 | seq_len = 8, 12 | transform=None, 13 | abc=string.digits): 14 | super().__init__() 15 | self.abc = abc 16 | self.epoch_len = epoch_len 17 | self.seq_len = seq_len 18 | self.transform = transform 19 | 20 | def __len__(self): 21 | return self.epoch_len 22 | 23 | def get_abc(self): 24 | return self.abc 25 | 26 | def set_mode(self, mode='train'): 27 | return 28 | 29 | def generate_string(self): 30 | return ''.join(random.choice(self.abc) for _ in range(self.seq_len)) 31 | 32 | def get_sample(self): 33 | h, w = 64, int(self.seq_len * 64 * 2.5) 34 | pw = int(w / self.seq_len) 35 | seq = [] 36 | img = np.zeros((h, w), dtype=np.uint8) 37 | text = self.generate_string() 38 | for i in range(len(text)): 39 | c = text[i] 40 | seq.append(self.abc.find(c) + 1) 41 | hs, ws = 32, 32 42 | symb = np.zeros((hs, ws), dtype=np.uint8) 43 | font = cv2.FONT_HERSHEY_SIMPLEX 44 | cv2.putText(symb, str(c), (3, 30), font, 1.2, (255), 2, cv2.LINE_AA) 45 | # Rotation 46 | angle = 60 47 | ang_rot = np.random.uniform(angle) - angle/2 48 | transform = cv2.getRotationMatrix2D((ws/2, hs/2), ang_rot, 1) 49 | symb = cv2.warpAffine(symb, transform, (ws, hs), borderValue = 0) 50 | # Scale 51 | scale = np.random.uniform(0.7, 1.0) 52 | transform = np.float32([[scale, 0, 0],[0, scale, 0]]) 53 | symb = cv2.warpAffine(symb, transform, (ws, hs), borderValue = 0) 54 | y = np.random.randint(hs, h) 55 | x = np.random.randint(i * pw, (i + 1) * pw - ws) 56 | img[y-hs:y, x:x+ws] = symb 57 | nw = int(w * 32 / h) 58 | img = cv2.resize(img, (nw, 32)) 59 | img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) 60 | return img, seq 61 | 62 | def __getitem__(self, idx): 63 | img, seq = self.get_sample() 64 | sample = {"img": img, "seq": seq, "seq_len": len(seq), "aug": True} 65 | if self.transform: 66 | sample = self.transform(sample) 67 | return sample 68 | -------------------------------------------------------------------------------- /competition/TinyMind人民币面值&冠字号编码识别挑战赛/task2/crnn-pytorch/dataset/text_data.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import Dataset 2 | import json 3 | import os 4 | import cv2 5 | 6 | class TextDataset(Dataset): 7 | def __init__(self, data_path, mode="train", transform=None): 8 | super(Dataset, self).__init__() 9 | self.data_path = data_path 10 | self.mode = mode 11 | self.config = json.load(open(os.path.join(data_path, "desc.json"))) 
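# Descriptive note (not in the original source): desc.json is produced by the 3_savejson.py
# script earlier in this task (its "abc"/"train"/"test" layout is also documented in the
# crnn-pytorch README above). It holds an "abc" alphabet string plus one list per split,
# "train", "test", "pb" and "foldN_train"/"foldN_test", whose items look like
# {"name": image filename under data/, "text": label string}.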
12 | self.transform = transform 13 | 14 | def abc_len(self): 15 | return len(self.config["abc"]) 16 | 17 | def get_abc(self): 18 | return self.config["abc"] 19 | 20 | def set_mode(self, mode): 21 | self.mode = mode 22 | 23 | def __len__(self): 24 | if self.mode == "test": 25 | return len(self.config[self.mode]) 26 | return len(self.config[self.mode]) 27 | 28 | def __getitem__(self, idx): 29 | 30 | name = self.config[self.mode][idx]["name"] 31 | text = self.config[self.mode][idx]["text"] 32 | 33 | img = cv2.imread(os.path.join(self.data_path, "data", name)) 34 | # print(os.path.join(self.data_path, "data", name)) 35 | # img = cv2.imread(os.path.join(self.data_path, name)) 36 | seq = self.text_to_seq(text) 37 | sample = {"img": img, "seq": seq, "seq_len": len(seq), "aug": self.mode == "train"} 38 | if self.transform: 39 | # print('trans') 40 | sample = self.transform(sample) 41 | return sample 42 | 43 | def text_to_seq(self, text): 44 | seq = [] 45 | for c in text: 46 | seq.append(self.config["abc"].find(c) + 1) 47 | return seq -------------------------------------------------------------------------------- /competition/TinyMind人民币面值&冠字号编码识别挑战赛/task2/crnn-pytorch/fold_tta.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/competition-baseline/99a1c3e3905573326fa2fc6d2ab2cc58286fee37/competition/TinyMind人民币面值&冠字号编码识别挑战赛/task2/crnn-pytorch/fold_tta.pkl -------------------------------------------------------------------------------- /competition/TinyMind人民币面值&冠字号编码识别挑战赛/task2/crnn-pytorch/lr_policy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class StepLR(object): 5 | def __init__(self, optimizer, step_size=1000, max_iter=10000): 6 | self.optimizer = optimizer 7 | self.max_iter = max_iter 8 | self.step_size = step_size 9 | self.last_iter = -1 10 | self.base_lrs = list(map(lambda group: group['lr'], optimizer.param_groups)) 11 | 12 | def get_lr(self): 13 | return self.optimizer.param_groups[0]['lr'] 14 | 15 | def step(self, last_iter=None): 16 | if last_iter is not None: 17 | self.last_iter = last_iter 18 | if self.last_iter + 1 == self.max_iter: 19 | self.last_iter = -1 20 | self.last_iter = (self.last_iter + 1) % self.max_iter 21 | for ids, param_group in enumerate(self.optimizer.param_groups): 22 | param_group['lr'] = self.base_lrs[ids] * 0.8 ** ( self.last_iter // self.step_size ) 23 | -------------------------------------------------------------------------------- /competition/TinyMind人民币面值&冠字号编码识别挑战赛/task2/crnn-pytorch/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/competition-baseline/99a1c3e3905573326fa2fc6d2ab2cc58286fee37/competition/TinyMind人民币面值&冠字号编码识别挑战赛/task2/crnn-pytorch/models/__init__.py -------------------------------------------------------------------------------- /competition/TinyMind人民币面值&冠字号编码识别挑战赛/task2/crnn-pytorch/models/model_loader.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | import torch 4 | from torch import nn 5 | 6 | from .crnn import CRNN 7 | 8 | def load_weights(target, source_state): 9 | new_dict = OrderedDict() 10 | for k, v in target.state_dict().items(): 11 | if k in source_state and v.size() == source_state[k].size(): 12 | new_dict[k] = source_state[k] 13 | else: 14 | new_dict[k] = v 15 | 
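# Descriptive note (not in the original source): new_dict now holds the snapshot values for
# every parameter whose name and shape match, and the model's current (e.g. freshly
# initialised) values for everything else, so a snapshot from a slightly different
# architecture only restores the compatible layers.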
target.load_state_dict(new_dict) 16 | 17 | def load_model(abc, seq_proj=[0, 0], backend='resnet18', snapshot=None, cuda=True): 18 | net = CRNN(abc=abc, seq_proj=seq_proj, backend=backend) 19 | net = nn.DataParallel(net) 20 | if snapshot is not None: 21 | load_weights(net, torch.load(snapshot)) 22 | if cuda: 23 | net = net.cuda() 24 | return net 25 | -------------------------------------------------------------------------------- /competition/TinyMind人民币面值&冠字号编码识别挑战赛/task2/crnn-pytorch/submit.py: -------------------------------------------------------------------------------- 1 | def check_label(s): 2 | if '*' in s: 3 | return True 4 | if len(s) != 10: 5 | return True 6 | 7 | if len(set(s[3:]) & set(string.ascii_uppercase)) > 0: 8 | return True 9 | 10 | if s[0] in string.digits: 11 | return True 12 | 13 | if s[0] in string.ascii_uppercase and s[1] in string.ascii_uppercase and s[2] in string.ascii_uppercase: 14 | return True 15 | 16 | if s[0] in string.ascii_uppercase and s[1] in string.ascii_uppercase: 17 | return True 18 | elif s[0] in string.ascii_uppercase and s[2] in string.ascii_uppercase and s[1] in string.digits: 19 | return True 20 | else: 21 | return False 22 | 23 | 24 | import pandas as pd 25 | import string 26 | submit_df1 = pd.read_csv('./tmp_rcnn_tta10_pb.csv') 27 | submit_df2 = pd.read_csv('../multi-digit-pytorch/tmp_rcnn_tta10_cnn.csv') 28 | 29 | submit_df1.loc[submit_df1['name'] == 'OFTUHPVE.jpg', 'label'] = submit_df2[submit_df2['name'] == 'OFTUHPVE.jpg']['label'] 30 | submit_df1[~submit_df1['label'].apply(lambda x: check_label(x))] 31 | submit_df1.to_csv('tmp_rcnn_tta10_pb_submit.csv',index=None) -------------------------------------------------------------------------------- /competition/TinyMind人民币面值&冠字号编码识别挑战赛/task2/crnn-pytorch/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import string 4 | from tqdm import tqdm 5 | import click 6 | import numpy as np 7 | import pandas as pd 8 | import torch 9 | from torch.autograd import Variable 10 | from torch.utils.data import DataLoader 11 | 12 | from dataset.test_data import TestDataset 13 | from dataset.text_data import TextDataset 14 | from dataset.collate_fn import text_collate 15 | from dataset.data_transform import Resize, Rotation, Translation, Scale 16 | from models.model_loader import load_model 17 | from torchvision.transforms import Compose 18 | 19 | import editdistance 20 | 21 | def test(net, data, abc, cuda, visualize, batch_size=10): 22 | data_loader = DataLoader(data, batch_size=10, num_workers=1, shuffle=False, collate_fn=text_collate) 23 | 24 | error_idx = [] 25 | idx= 0 26 | count = 0.0 27 | tp = 0.0 28 | avg_ed = 0.0 29 | iterator = tqdm(data_loader) 30 | for sample in iterator: 31 | imgs = Variable(sample["img"]) 32 | if cuda: 33 | imgs = imgs.cuda() 34 | out = net(imgs, decode=True) 35 | gt = (sample["seq"].numpy() - 1).tolist() 36 | lens = sample["seq_len"].numpy().tolist() 37 | pos = 0 38 | key = '' 39 | for i in range(len(out)): 40 | gts = ''.join(abc[c] for c in gt[pos:pos+lens[i]]) 41 | pos += lens[i] 42 | 43 | if gts != out[i]: 44 | # print(out[i], gts, imgs.shape) 45 | error_idx.append(int(count)) 46 | if gts == out[i]: 47 | tp += 1.0 48 | else: 49 | avg_ed += editdistance.eval(out[i], gts) 50 | count += 1.0 51 | if not visualize: 52 | iterator.set_description("acc: {0:.4f}; avg_ed: {1:.4f}".format(tp / count, avg_ed / count)) 53 | idx+=1 54 | 55 | acc = tp / count 56 | avg_ed = avg_ed / count 57 | return acc, avg_ed, 
error_idx 58 | 59 | @click.command() 60 | @click.option('--data-path', type=str, default=None, help='Path to dataset') 61 | @click.option('--abc', type=str, default=string.digits+string.ascii_uppercase, help='Alphabet') 62 | @click.option('--seq-proj', type=str, default="10x20", help='Projection of sequence') 63 | @click.option('--backend', type=str, default="resnet18", help='Backend network') 64 | @click.option('--snapshot', type=str, default=None, help='Pre-trained weights') 65 | @click.option('--input-size', type=str, default="320x32", help='Input size') 66 | @click.option('--gpu', type=str, default='0', help='List of GPUs for parallel training, e.g. 0,1,2,3') 67 | @click.option('--visualize', type=bool, default=False, help='Visualize output') 68 | def main(data_path, abc, seq_proj, backend, snapshot, input_size, gpu, visualize): 69 | os.environ["CUDA_VISIBLE_DEVICES"] = gpu 70 | cuda = True if gpu is not '' else False 71 | 72 | input_size = [int(x) for x in input_size.split('x')] 73 | transform = Compose([ 74 | Rotation(), 75 | Resize(size=(input_size[0], input_size[1])) 76 | ]) 77 | if data_path is not None: 78 | data = TextDataset(data_path=data_path, mode="test", transform=transform) 79 | else: 80 | data = TestDataset(transform=transform, abc=abc) 81 | seq_proj = [int(x) for x in seq_proj.split('x')] 82 | net = load_model(data.get_abc(), seq_proj, backend, snapshot, cuda).eval() 83 | acc, avg_ed = test(net, data, data.get_abc(), cuda, visualize) 84 | 85 | df_submit = pd.DataFrame() 86 | 87 | print("Accuracy: {}".format(acc)) 88 | print("Edit distance: {}".format(avg_ed)) 89 | 90 | if __name__ == '__main__': 91 | main() 92 | -------------------------------------------------------------------------------- /competition/TinyMind人民币面值&冠字号编码识别挑战赛/task2/crnn-pytorch/test2.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2, glob 3 | import string 4 | from tqdm import tqdm 5 | import click 6 | import numpy as np 7 | import pandas as pd 8 | 9 | import torch 10 | from torch.autograd import Variable 11 | from torch.utils.data import DataLoader 12 | 13 | from dataset.test_data import TestDataset 14 | from dataset.text_data import TextDataset 15 | from dataset.collate_fn import text_collate 16 | from dataset.data_transform import Resize, Rotation, Translation, Scale, Contrast, Snow, Grid_distortion 17 | from models.model_loader import load_model 18 | from torchvision.transforms import Compose 19 | 20 | import editdistance 21 | 22 | def pred_to_string(pred): 23 | seq = [] 24 | for i in range(pred.shape[0]): 25 | label = np.argmax(pred[i]) 26 | seq.append(label - 1) 27 | out = [] 28 | for i in range(len(seq)): 29 | if len(out) == 0: 30 | if seq[i] != -1: 31 | out.append(seq[i]) 32 | else: 33 | if seq[i] != -1 and seq[i] != seq[i - 1]: 34 | out.append(seq[i]) 35 | out = ''.join('0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ'[i] for i in out) 36 | return out 37 | 38 | def decode(pred): 39 | seq = [] 40 | for i in range(pred.shape[0]): 41 | seq.append(pred_to_string(pred[i])) 42 | return seq 43 | 44 | def test(net, data, abc, cuda, visualize, batch_size=256): 45 | data_loader = DataLoader(data, batch_size=batch_size, num_workers=4, shuffle=False, collate_fn=text_collate) 46 | 47 | count = 0.0 48 | tp = 0.0 49 | avg_ed = 0.0 50 | pred_pb = [] 51 | iterator = tqdm(data_loader) 52 | for sample in iterator: 53 | imgs = Variable(sample["img"]) 54 | if cuda: 55 | imgs = imgs.cuda() 56 | out = net(imgs, decode=True) 57 | gt = (sample["seq"].numpy() - 
1).tolist() 58 | lens = sample["seq_len"].numpy().tolist() 59 | pos = 0 60 | key = '' 61 | for i in range(len(out)): 62 | gts = ''.join(abc[c] for c in gt[pos:pos+lens[i]]) 63 | pos += lens[i] 64 | pred_pb.append(out[i]) 65 | 66 | if gts == out[i]: 67 | tp += 1.0 68 | else: 69 | avg_ed += editdistance.eval(out[i], gts) 70 | count += 1.0 71 | if not visualize: 72 | iterator.set_description("acc: {0:.4f}; avg_ed: {0:.4f}".format(tp / count, avg_ed / count)) 73 | 74 | acc = tp / count 75 | avg_ed = avg_ed / count 76 | return acc, avg_ed, pred_pb 77 | 78 | 79 | def test_tta(net, data, abc, cuda, visualize, batch_size=256): 80 | pred_pb_tta = None 81 | 82 | for _ in range(10): 83 | data_loader = DataLoader(data, batch_size=batch_size, num_workers=4, shuffle=False, collate_fn=text_collate) 84 | iterator = tqdm(data_loader) 85 | 86 | pred_pb = [] 87 | for sample in iterator: 88 | imgs = Variable(sample["img"]) 89 | if cuda: 90 | imgs = imgs.cuda() 91 | out = net(imgs, decode=False) 92 | out = out.permute(1, 0, 2).cpu().data.numpy() 93 | 94 | pred_pb.append(out) 95 | 96 | if pred_pb_tta is None: 97 | pred_pb_tta = np.concatenate(pred_pb) 98 | else: 99 | pred_pb_tta += np.concatenate(pred_pb) 100 | return 0, 0, decode(pred_pb_tta) 101 | 102 | @click.command() 103 | @click.option('--data-path', type=str, default=None, help='Path to dataset') 104 | @click.option('--abc', type=str, default=string.digits+string.ascii_uppercase, help='Alphabet') 105 | @click.option('--seq-proj', type=str, default="10x20", help='Projection of sequence') 106 | @click.option('--backend', type=str, default="resnet34", help='Backend network') 107 | @click.option('--snapshot', type=str, default=None, help='Pre-trained weights') 108 | @click.option('--input-size', type=str, default="320x32", help='Input size') 109 | @click.option('--gpu', type=str, default='0', help='List of GPUs for parallel training, e.g. 
0,1,2,3') 110 | @click.option('--visualize', type=bool, default=False, help='Visualize output') 111 | def main(data_path, abc, seq_proj, backend, snapshot, input_size, gpu, visualize): 112 | os.environ["CUDA_VISIBLE_DEVICES"] = gpu 113 | cuda = True if gpu is not '' else False 114 | 115 | input_size = [int(x) for x in input_size.split('x')] 116 | transform = Compose([ 117 | Rotation(), 118 | Translation(), 119 | # Scale(), 120 | Contrast(), 121 | Grid_distortion(), 122 | Resize(size=(input_size[0], input_size[1])) 123 | ]) 124 | if data_path is not None: 125 | data = TextDataset(data_path=data_path, mode="pb", transform=transform) 126 | else: 127 | data = TestDataset(transform=transform, abc=abc) 128 | seq_proj = [int(x) for x in seq_proj.split('x')] 129 | net = load_model(data.get_abc(), seq_proj, backend, snapshot, cuda).eval() 130 | acc, avg_ed, pred_pb = test_tta(net, data, data.get_abc(), cuda, visualize) 131 | 132 | df_submit = pd.DataFrame() 133 | df_submit['name'] = [x.split('/')[-1] for x in glob.glob('../../input/public_test_data/*')] 134 | df_submit['label'] = pred_pb 135 | 136 | df_submit.to_csv('tmp_rcnn_tta10.csv', index=None) 137 | print("Accuracy: {}".format(acc)) 138 | print("Edit distance: {}".format(avg_ed)) 139 | 140 | if __name__ == '__main__': 141 | main() 142 | -------------------------------------------------------------------------------- /competition/TinyMind人民币面值&冠字号编码识别挑战赛/task2/data/data.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/competition-baseline/99a1c3e3905573326fa2fc6d2ab2cc58286fee37/competition/TinyMind人民币面值&冠字号编码识别挑战赛/task2/data/data.json -------------------------------------------------------------------------------- /competition/WSDM2022/README.md: -------------------------------------------------------------------------------- 1 | **WSDM会议(CCF B类会议)** 是涉及搜索和数据挖掘的网络启发研究的主要会议之一。WSDM Cup将于10月15日开始,一直持续2022年到1月下旬。 2 | 3 | > 比赛赛题解析录屏(11月28日):https://www.bilibili.com/video/BV1Ng411K7Jm/ 4 | 5 | ## User Retention Score Prediction 6 | 7 | http://challenge.ai.iqiyi.com/detail?raceId=61600f6cef1b65639cd5eaa6 8 | 9 | 举办方:iQIYI 10 | 11 | 赛题类型:用户留存预测、CTR类型 12 | 13 | ### 赛题背景 14 | 15 | 爱奇艺手机端APP,通过深度学习等最新的AI技术,提升用户个性化的产品体验,更好地让用户享受定制化的娱乐服务。我们用“N日留存分”这一关键指标来衡量用户的满意程度。 16 | 17 | 例如,如果一个用户10月1日的“7日留存分”等于3,代表这个用户接下来的7天里(10月2日~8日),有3天会访问爱奇艺APP。预测用户的留存分是个充满挑战的难题:不同用户本身的偏好、活跃度差异很大,另外用户可支配的娱乐时间、热门内容的流行趋势等其他因素,也有很强的周期性特征。 18 | 19 | ### 赛题任务 20 | 21 | 本次大赛基于爱奇艺APP脱敏和采样后的数据信息,预测用户的7日留存分。参赛队伍需要设计相应的算法进行数据分析和预测。 22 | 23 | ### 评价指标 24 | 本次比赛是一个数值预测类问题。评价函数使用:$100*(1-\frac{1}{n}\sum^n_1|\frac{F_t-A_t}{7}|)$ 25 | 26 | $n$是测试集用户数量,$F$是参赛者对用户的7日留存分预测值,$A$是真实的7日留存分真实值。 27 | 28 | ### 赛题开源 29 | 30 | - [第一名思路](https://zhuanlan.zhihu.com/p/462736790), [代码](https://github.com/hansu1017/WSDM2022-Retention-Score-Prediction) 31 | - [第三名代码](https://github.com/Chenfei-Kang/2022_WSDM_iQiYi_Retention_Score_Prediction) 32 | 33 | ### 其他开源 34 | 35 | - [`举办方`开源了84.5分数的代码](http://challenge.ai.iqiyi.com/detail?raceId=61600f6cef1b65639cd5eaa6),基于Keras,需要32G内存 + 4G GPU 36 | - [`阿水`基于举办方改写了模型代码](https://aistudio.baidu.com/aistudio/projectdetail/2715522),线上85.5,基于PaddlePaddle,需要32G内存 + 4G GPU 37 | - [`第一次打比赛`只使用了两个特征](https://github.com/LogicJake/competition_baselines/tree/master/competitions/wsdm_iqiyi_torch),基于Pytorch,需要8G内存 + 4G GPU 38 | 39 | ## Temporal Link Prediction 40 | 41 | https://www.dgl.ai/WSDM2022-Challenge/ 42 | 43 | 举办方:Intel / Amazon 44 | 45 | 比赛类型:图算法 46 | 47 | ### 赛题背景 48 | 49 | Temporal Link 
Prediction是时间图上的经典任务之一。与询问部分观察图上两个节点之间是否存在边的链接预测相反,时间链接预测询问在给定时间跨度内两个节点之间是否存在边。 50 | 51 | 它比传统的链接预测更有用,因为可以围绕模型构建多个应用程序,例如预测电子商务中客户的需求,或预测社交网络中将发生什么事件等。 52 | 53 | ### 赛题任务 54 | 55 | 在这个挑战中,我们希望有一个模型可以同时处理两种数据: 56 | 57 | - 数据集 A:以实体为节点,以不同类型的事件为边的动态事件图。 58 | - 数据集 B:用户-项目图,以用户和项目为节点,以不同类型的交互为边。 59 | 60 | 该任务将预测在给定时间戳之前两个给定节点之间是否存在给定类型的边。 61 | 62 | 63 | ### 评价指标 64 | 65 | 使用 ROC 下的面积 (AUC) 作为两个数据集的评估指标,并使用两个$AUC$的调和平均值作为提交的分数。 66 | 67 | 具体来说设$AUC_A$和$AUC_B$分别为数据集A和数据集B的$AUC$。 68 | 69 | ## Cross- Market Recommendation 70 | 71 | https://xmrec.github.io/wsdmcup/ 72 | 73 | 举办方:University of Amsterdam / University of Massachusetts Amherst / Amazon 74 | 75 | 比赛类型:推荐系统 76 | 77 | ### 赛题背景 78 | 79 | 电子商务公司通常跨市场运营;例如亚马逊已将业务和销售扩展到全球18 个市场(即国家/地区)。跨市场推荐涉及通过利用类似的高资源市场的数据向目标市场的用户推荐相关产品的问题,例如利用美国市场的数据改进目标市场的推荐。 80 | 81 | 然而关键的挑战是数据,例如用户与产品的交互数据(点击、购买、评论),传达了个别市场的某些偏见。因此在源市场上训练的算法在不同的目标市场不一定有效。 82 | 83 | ### 赛题目标 84 | 85 | 在本次WSDM杯挑战赛中,我们提供不同市场的用户购买和评分数据,目标是通过利用来自类似辅助市场的数据来改进这些目标市场中的个人推荐系统。 86 | 87 | ### 评估指标 88 | 89 | 使用NDCG@10进行评估,项目的分数为每个用户排序,前10个项目被考虑进行评估。 90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /competition/biendata-智源&计算所-互联网虚假新闻检测挑战赛/README.md: -------------------------------------------------------------------------------- 1 | https://www.biendata.com/competition/falsenews/ 2 | 3 | - task1:直接使用bert,单折在91.0精度 4 | 5 | 冠军分享:https://mp.weixin.qq.com/s/jS_QUezLyBzfOBeiHN_gkQ 6 | 7 | 冠军代码:https://www.biendata.com/models/category/3529/L_notebook/ 8 | -------------------------------------------------------------------------------- /competition/kaggle-allstate-claims-severity/README.md: -------------------------------------------------------------------------------- 1 | https://www.kaggle.com/c/allstate-claims-severity/ 2 | -------------------------------------------------------------------------------- /competition/kaggle-allstate-claims-severity/nn_bagging_1111.84364.py: -------------------------------------------------------------------------------- 1 | 2 | ''' 3 | Author: Danijel Kivaranovic 4 | Title: Neural network (Keras) with sparse data 5 | ''' 6 | 7 | ## import libraries 8 | import numpy as np 9 | np.random.seed(123) 10 | 11 | import pandas as pd 12 | import subprocess 13 | from scipy.sparse import csr_matrix, hstack 14 | from sklearn.metrics import mean_absolute_error 15 | from sklearn.preprocessing import StandardScaler 16 | from sklearn.model_selection import KFold 17 | from keras.models import Sequential 18 | from keras.layers import Dense, Dropout, Activation 19 | from keras.layers.normalization import BatchNormalization 20 | from keras.layers.advanced_activations import PReLU 21 | 22 | ## Batch generators ################################################################################################################################## 23 | 24 | def batch_generator(X, y, batch_size, shuffle): 25 | #chenglong code for fiting from generator (https://www.kaggle.com/c/talkingdata-mobile-user-demographics/forums/t/22567/neural-network-for-sparse-matrices) 26 | number_of_batches = np.ceil(X.shape[0]/batch_size) 27 | counter = 0 28 | sample_index = np.arange(X.shape[0]) 29 | if shuffle: 30 | np.random.shuffle(sample_index) 31 | while True: 32 | batch_index = sample_index[batch_size*counter:batch_size*(counter+1)] 33 | X_batch = X[batch_index,:].toarray() 34 | y_batch = y[batch_index] 35 | counter += 1 36 | yield X_batch, y_batch 37 | if (counter == number_of_batches): 38 | if shuffle: 39 | np.random.shuffle(sample_index) 40 | counter = 0 41 
| 42 | def batch_generatorp(X, batch_size, shuffle): 43 | number_of_batches = X.shape[0] / np.ceil(X.shape[0]/batch_size) 44 | counter = 0 45 | sample_index = np.arange(X.shape[0]) 46 | while True: 47 | batch_index = sample_index[batch_size * counter:batch_size * (counter + 1)] 48 | X_batch = X[batch_index, :].toarray() 49 | counter += 1 50 | yield X_batch 51 | if (counter == number_of_batches): 52 | counter = 0 53 | 54 | ######################################################################################################################################################## 55 | 56 | ## read data 57 | train = pd.read_csv('../input/train.csv') 58 | test = pd.read_csv('../input/test.csv') 59 | 60 | index = list(train.index) 61 | print (index[0:10]) 62 | np.random.shuffle(index) 63 | print (index[0:10]) 64 | train = train.iloc[index] 65 | 'train = train.iloc[np.random.permutation(len(train))]' 66 | 67 | ## set test loss to NaN 68 | test['loss'] = np.nan 69 | 70 | ## response and IDs 71 | y = np.log(train['loss'].values+200) 72 | id_train = train['id'].values 73 | id_test = test['id'].values 74 | 75 | ## stack train test 76 | ntrain = train.shape[0] 77 | tr_te = pd.concat((train, test), axis = 0) 78 | 79 | ## Preprocessing and transforming to sparse data 80 | sparse_data = [] 81 | 82 | f_cat = [f for f in tr_te.columns if 'cat' in f] 83 | for f in f_cat: 84 | dummy = pd.get_dummies(tr_te[f].astype('category')) 85 | tmp = csr_matrix(dummy) 86 | sparse_data.append(tmp) 87 | 88 | f_num = [f for f in tr_te.columns if 'cont' in f] 89 | scaler = StandardScaler() 90 | tmp = csr_matrix(scaler.fit_transform(tr_te[f_num])) 91 | sparse_data.append(tmp) 92 | 93 | del(tr_te, train, test) 94 | 95 | ## sparse train and test data 96 | xtr_te = hstack(sparse_data, format = 'csr') 97 | xtrain = xtr_te[:ntrain, :] 98 | xtest = xtr_te[ntrain:, :] 99 | 100 | print('Dim train', xtrain.shape) 101 | print('Dim test', xtest.shape) 102 | 103 | del(xtr_te, sparse_data, tmp) 104 | 105 | ## neural net 106 | def nn_model(): 107 | model = Sequential() 108 | 109 | model.add(Dense(400, input_dim = xtrain.shape[1], init = 'he_normal')) 110 | model.add(PReLU()) 111 | model.add(BatchNormalization()) 112 | model.add(Dropout(0.4)) 113 | 114 | model.add(Dense(200, init = 'he_normal')) 115 | model.add(PReLU()) 116 | model.add(BatchNormalization()) 117 | model.add(Dropout(0.2)) 118 | 119 | model.add(Dense(50, init = 'he_normal')) 120 | model.add(PReLU()) 121 | model.add(BatchNormalization()) 122 | model.add(Dropout(0.2)) 123 | 124 | model.add(Dense(1, init = 'he_normal')) 125 | model.compile(loss = 'mae', optimizer = 'adadelta') 126 | return(model) 127 | 128 | ## cv-folds 129 | nfolds = 10 130 | folds = KFold(n_splits = nfolds, shuffle = True, random_state = 111) 131 | 132 | ## train models 133 | i = 0 134 | nbags = 10 135 | nepochs = 55 136 | pred_oob = np.zeros(xtrain.shape[0]) 137 | pred_test = np.zeros(xtest.shape[0]) 138 | 139 | for (inTr, inTe) in folds.split(xtrain): 140 | xtr = xtrain[inTr] 141 | ytr = y[inTr] 142 | xte = xtrain[inTe] 143 | yte = y[inTe] 144 | pred = np.zeros(xte.shape[0]) 145 | for j in range(nbags): 146 | model = nn_model() 147 | fit = model.fit_generator(generator = batch_generator(xtr, ytr, 128, True), 148 | nb_epoch = nepochs, 149 | samples_per_epoch = xtr.shape[0], 150 | verbose = 1) 151 | pred += np.exp(model.predict_generator(generator = batch_generatorp(xte, 800, False), val_samples = xte.shape[0])[:,0])-200 152 | pred_test += np.exp(model.predict_generator(generator = batch_generatorp(xtest, 800, 
False), val_samples = xtest.shape[0])[:,0])-200 153 | pred /= nbags 154 | pred_oob[inTe] = pred 155 | score = mean_absolute_error(np.exp(yte)-200, pred) 156 | i += 1 157 | print('Fold ', i, '- MAE:', score) 158 | 159 | print('Total - MAE:', mean_absolute_error(np.exp(y)-200, pred_oob)) 160 | 161 | ## train predictions 162 | df = pd.DataFrame({'id': id_train, 'loss': pred_oob}) 163 | df.to_csv('preds_oob.csv', index = False) 164 | 165 | ## test predictions 166 | pred_test /= (nfolds*nbags) 167 | df = pd.DataFrame({'id': id_test, 'loss': pred_test}) 168 | df.to_csv('submission_keras_shift_perm.csv', index = False) -------------------------------------------------------------------------------- /competition/kaggle-atecup-deepfake/README.md: -------------------------------------------------------------------------------- 1 | 本赛事由蚂蚁集团主办,在全球知名的数据科学竞赛平台Kaggle进行。赛事针对“AI换脸”的欺诈风险进行攻防实战演练,设立了100万元人民币的奖金池,鼓励推动AI向善的技术人才。 2 | 3 | 近年来,“AI换脸”诈骗事件频发,面对全球范围的技术挑战,大赛设立了百万奖金池,分设图片赛道和音视频赛道,在此诚邀全球的学者、工程师、教育者、学生及独立开发者积极参与。 4 | 5 | 6 | - 赛道一:图像赛道,确定给定的人脸图像是否是深度伪造图像,并输出其为深度伪造图像的概率。 7 | - 赛道二:音视频赛道,确定包含人脸的视频(带音频)是否是Deepfake视频,并输出其深度伪造音视频的概率。 8 | 9 | 赛事地址:https://www.atecup.cn/deepfake 10 | -------------------------------------------------------------------------------- /competition/kaggle-quickdraw-doodle-recognition/1_save2df.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import os, sys, codecs, glob 5 | import numpy as np 6 | import pandas as pd 7 | import cv2 8 | 9 | from sklearn.preprocessing import LabelEncoder 10 | from sklearn.cross_validation import train_test_split 11 | 12 | # 读取单个csv文件 13 | def read_df(path, nrows): 14 | print('Reading...', path) 15 | if nrows.isdigit(): 16 | return pd.read_csv(path, nrows=int(nrows), parse_dates=['timestamp']) 17 | else: 18 | return pd.read_csv(path, parse_dates=['timestamp']) 19 | 20 | # 读取多个csv文件 21 | def contcat_df(paths, nrows): 22 | dfs = [] 23 | for path in paths: 24 | dfs.append(read_df(path, nrows)) 25 | return pd.concat(dfs, axis=0, ignore_index=True) 26 | 27 | def main(): 28 | if not os.path.exists('./data'): 29 | os.mkdir('./data') 30 | 31 | CLASSES_CSV = glob.glob('../input/train_simplified/*.csv') 32 | CLASSES = [x.split('/')[-1][:-4] for x in CLASSES_CSV] 33 | 34 | print('Reading data...') 35 | df = contcat_df(CLASSES_CSV, number) 36 | df = df.reindex(np.random.permutation(df.index)) 37 | 38 | lbl = LabelEncoder().fit(df['word']) 39 | df['word'] = lbl.transform(df['word']) 40 | 41 | if df.shape[0] * 0.05 < 120000: 42 | df_train, df_val = train_test_split(df, test_size=0.05) 43 | else: 44 | df_train, df_val = df.iloc[:-500000], df.iloc[-500000:] 45 | 46 | print('Train:', df_train.shape[0], 'Val', df_val.shape[0]) 47 | print('Save data...') 48 | df_train.to_pickle(os.path.join('./data/', 'train_' + str(number) + '.pkl')) 49 | df_val.to_pickle(os.path.join('./data/', 'val_' + str(number) + '.pkl')) 50 | 51 | # python 1_save2df.py 50000 52 | # python 1_save2df.py all 53 | if __name__ == "__main__": 54 | number = str(sys.argv[1]) 55 | main() -------------------------------------------------------------------------------- /competition/kaggle-quickdraw-doodle-recognition/README.md: -------------------------------------------------------------------------------- 1 | https://quickdraw.withgoogle.com/ 2 | 3 | https://www.kaggle.com/c/quickdraw-doodle-recognition/ 4 | -------------------------------------------------------------------------------- 
/competition/kaggle-two-sigma-connect-rental-listing-inquiries/README.md: -------------------------------------------------------------------------------- 1 | https://www.kaggle.com/c/two-sigma-connect-rental-listing-inquiries/ 2 | -------------------------------------------------------------------------------- /competition/yanxishe-IMDB评论剧透检测/README.md: -------------------------------------------------------------------------------- 1 | # IMDB评论剧透检测 2 | ## 竞赛链接 3 | https://god.yanxishe.com/20 4 | ## score 5 | 74.729 6 | ## 操作说明 7 | 数据放在data目录下 8 | 执行ml.ipynb 9 | ## 优化方向 10 | baseline中只利用了review_text信息 11 | #### 文本方向 12 | review_summary,以及IMDB_movie_details.json信息进行挖掘 13 | #### 时序方向 14 | review_date进行挖掘 15 | #### 其他方向 16 | movie_id,user_id,rating进行挖掘 17 | -------------------------------------------------------------------------------- /competition/yanxishe-人脸年龄识别/2_predit.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os, sys, glob, argparse 3 | import pandas as pd 4 | import numpy as np 5 | from tqdm import tqdm 6 | 7 | import time, datetime 8 | import pdb, traceback 9 | 10 | import cv2 11 | # import imagehash 12 | from PIL import Image 13 | 14 | from sklearn.model_selection import train_test_split, StratifiedKFold, KFold 15 | 16 | from efficientnet_pytorch import EfficientNet 17 | # model = EfficientNet.from_pretrained('efficientnet-b4') 18 | 19 | import torch 20 | torch.manual_seed(0) 21 | torch.backends.cudnn.deterministic = False 22 | torch.backends.cudnn.benchmark = True 23 | 24 | import torchvision.models as models 25 | import torchvision.transforms as transforms 26 | import torchvision.datasets as datasets 27 | import torch.nn as nn 28 | import torch.nn.functional as F 29 | import torch.optim as optim 30 | from torch.autograd import Variable 31 | from torch.utils.data.dataset import Dataset 32 | 33 | class QRDataset(Dataset): 34 | def __init__(self, img_path, transform=None): 35 | self.img_path = img_path 36 | if transform is not None: 37 | self.transform = transform 38 | else: 39 | self.transform = None 40 | 41 | def __getitem__(self, index): 42 | start_time = time.time() 43 | img = Image.open(self.img_path[index]).convert('RGB') 44 | 45 | if self.transform is not None: 46 | img = self.transform(img) 47 | return img,torch.from_numpy(np.array(int('PNEUMONIA' in self.img_path[index]))) 48 | 49 | def __len__(self): 50 | return len(self.img_path) 51 | 52 | class VisitNet(nn.Module): 53 | def __init__(self): 54 | super(VisitNet, self).__init__() 55 | 56 | # model = models.resnet18(True) 57 | # model.avgpool = nn.AdaptiveAvgPool2d(1) 58 | # model.fc = nn.Linear(512, 2) 59 | # self.resnet = model 60 | 61 | model = EfficientNet.from_pretrained('efficientnet-b0') 62 | model._fc = nn.Linear(1280, 100) 63 | self.resnet = model 64 | 65 | def forward(self, img): 66 | out = self.resnet(img) 67 | return out 68 | 69 | def predict(test_loader, model, tta=10): 70 | # switch to evaluate mode 71 | model.eval() 72 | 73 | test_pred_tta = None 74 | for _ in range(tta): 75 | test_pred = [] 76 | with torch.no_grad(): 77 | end = time.time() 78 | for i, (input, target) in enumerate(test_loader): 79 | input = input.cuda() 80 | target = target.cuda() 81 | 82 | # compute output 83 | output = model(input) 84 | output = output.data.cpu().numpy() 85 | 86 | test_pred.append(output) 87 | test_pred = np.vstack(test_pred) 88 | 89 | if test_pred_tta is None: 90 | test_pred_tta = test_pred 91 | else: 92 | test_pred_tta += test_pred 93 | 94 | 
return test_pred_tta 95 | 96 | test_jpg = ['../face_age_dataset/test/{0}.png'.format(x) for x in range(1, 1805)] 97 | test_jpg = np.array(test_jpg) 98 | 99 | test_pred = None 100 | for model_path in ['resnet18_fold0.pt', 'resnet18_fold1.pt', 'resnet18_fold2.pt', 'resnet18_fold3.pt']: 101 | 102 | test_loader = torch.utils.data.DataLoader( 103 | QRDataset(test_jpg, 104 | transforms.Compose([ 105 | transforms.Resize((224, 224)), 106 | transforms.RandomHorizontalFlip(), 107 | transforms.RandomVerticalFlip(), 108 | transforms.ToTensor(), 109 | transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) 110 | ]) 111 | ), batch_size=10, shuffle=False, num_workers=10, pin_memory=True 112 | ) 113 | 114 | 115 | model = VisitNet().cuda() 116 | model = nn.DataParallel(model).cuda() 117 | model.load_state_dict(torch.load(model_path)) 118 | # model = nn.DataParallel(model).cuda() 119 | if test_pred is None: 120 | test_pred = predict(test_loader, model, 5) 121 | else: 122 | test_pred += predict(test_loader, model, 5) 123 | 124 | test_csv = pd.DataFrame() 125 | test_csv[0] = list(range(1, 1805)) 126 | test_csv[1] = np.argmax(test_pred, 1) 127 | test_csv[1] = test_csv[1].apply(lambda x: str(x).zfill(3)) 128 | test_csv.to_csv('tmp.csv', index=None, header=None) -------------------------------------------------------------------------------- /competition/yanxishe-人脸年龄识别/README.md: -------------------------------------------------------------------------------- 1 | 比赛链接:https://god.yanxishe.com/10 2 | 3 | 使用IMDB-WIKI数据集进行pretrain,再到比赛数据集finetune; 4 | https://data.vision.ee.ethz.ch/cvl/rrothe/imdb-wiki/ 5 | 6 | 单模型4折,就可以达到25准确率,复现Top5成绩; 7 | 8 | ``` 9 | python3 1_train.py 10 | python3 2_predict.py 11 | ``` 12 | 13 | 人脸年龄识别练习赛冠军源码_1575964312087.zip为比赛前三名的代码; 14 | -------------------------------------------------------------------------------- /competition/yanxishe-人脸年龄识别/人脸年龄识别练习赛冠军源码_1575964312087.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/competition-baseline/99a1c3e3905573326fa2fc6d2ab2cc58286fee37/competition/yanxishe-人脸年龄识别/人脸年龄识别练习赛冠军源码_1575964312087.zip -------------------------------------------------------------------------------- /competition/yanxishe-喵脸关键点检测/2_predict.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os, sys, glob, argparse 3 | import pandas as pd 4 | import numpy as np 5 | from tqdm import tqdm 6 | 7 | import time, datetime 8 | import pdb, traceback 9 | 10 | import cv2 11 | # import imagehash 12 | from PIL import Image 13 | 14 | from sklearn.model_selection import train_test_split, StratifiedKFold, KFold 15 | 16 | from efficientnet_pytorch import EfficientNet 17 | # model = EfficientNet.from_pretrained('efficientnet-b4') 18 | 19 | import torch 20 | torch.manual_seed(0) 21 | torch.backends.cudnn.deterministic = False 22 | torch.backends.cudnn.benchmark = True 23 | 24 | import torchvision.models as models 25 | import torchvision.transforms as transforms 26 | import torchvision.datasets as datasets 27 | import torch.nn as nn 28 | import torch.nn.functional as F 29 | import torch.optim as optim 30 | from torch.autograd import Variable 31 | from torch.utils.data.dataset import Dataset 32 | 33 | class QRDataset(Dataset): 34 | def __init__(self, img_path, transform=None): 35 | self.img_path = img_path 36 | if transform is not None: 37 | self.transform = transform 38 | else: 39 | self.transform = None 40 | 41 | def 
__getitem__(self, index): 42 | start_time = time.time() 43 | img = Image.open(self.img_path[index]).convert('RGB') 44 | 45 | if self.transform is not None: 46 | img = self.transform(img) 47 | return img,torch.from_numpy(np.array(int('PNEUMONIA' in self.img_path[index]))) 48 | 49 | def __len__(self): 50 | return len(self.img_path) 51 | 52 | class VisitNet(nn.Module): 53 | def __init__(self): 54 | super(VisitNet, self).__init__() 55 | 56 | # model = models.resnet18(True) 57 | # model.avgpool = nn.AdaptiveAvgPool2d(1) 58 | # model.fc = nn.Linear(512, 2) 59 | # self.resnet = model 60 | 61 | model = EfficientNet.from_pretrained('efficientnet-b0') 62 | model._fc = nn.Linear(1280, 18) 63 | self.resnet = model 64 | 65 | def forward(self, img): 66 | out = self.resnet(img) 67 | return out 68 | 69 | def predict(test_loader, model, tta=10): 70 | # switch to evaluate mode 71 | model.eval() 72 | 73 | test_pred_tta = None 74 | for _ in range(tta): 75 | test_pred = [] 76 | with torch.no_grad(): 77 | end = time.time() 78 | for i, (input, target) in tqdm(enumerate(test_loader), total=len(test_loader)): 79 | input = input.cuda() 80 | target = target.cuda() 81 | 82 | # compute output 83 | output = model(input) 84 | output = output.data.cpu().numpy() 85 | 86 | test_pred.append(output) 87 | test_pred = np.vstack(test_pred) 88 | 89 | if test_pred_tta is None: 90 | test_pred_tta = test_pred 91 | else: 92 | test_pred_tta += test_pred 93 | 94 | return test_pred_tta 95 | 96 | test_jpg = ['../test/{0}.jpg'.format(x) for x in range(0, 9526)] 97 | test_jpg = np.array(test_jpg) 98 | 99 | test_pred = None 100 | for model_path in ['./resnet18_fold4.pt']: 101 | 102 | test_loader = torch.utils.data.DataLoader( 103 | QRDataset(test_jpg, 104 | transforms.Compose([ 105 | transforms.Resize((512, 512)), 106 | # transforms.RandomHorizontalFlip(), 107 | # transforms.RandomVerticalFlip(), 108 | transforms.ToTensor(), 109 | transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) 110 | ]) 111 | ), batch_size=20, shuffle=False, num_workers=10, pin_memory=True 112 | ) 113 | 114 | 115 | model = VisitNet().cuda() 116 | model.load_state_dict(torch.load(model_path)) 117 | # model = nn.DataParallel(model).cuda() 118 | if test_pred is None: 119 | test_pred = predict(test_loader, model, 1) 120 | else: 121 | test_pred += predict(test_loader, model, 1) 122 | 123 | # test_csv = pd.DataFrame() 124 | # test_csv[0] = list(range(0, 1047)) 125 | # test_csv[1] = np.argmax(test_pred, 1) 126 | # test_csv.to_csv('tmp.csv', index=None, header=None) 127 | 128 | test_pred = pd.DataFrame(test_pred) 129 | test_pred.columns = ['left_eye_x', 'left_eye_y', 'right_eye_x', 'right_eye_y', 130 | 'mouth_x', 'mouth_y', 'left_ear1_x', 'left_ear1_y', 'left_ear2_x', 131 | 'left_ear2_y', 'left_ear3_x', 'left_ear3_y', 'right_ear1_x', 132 | 'right_ear1_y', 'right_ear2_x', 'right_ear2_y', 'right_ear3_x', 133 | 'right_ear3_y'] 134 | test_pred = test_pred.reset_index() 135 | 136 | img_size = [] 137 | for idx in (range(9526)): 138 | img_size.append(cv2.imread('../test/{0}.jpg'.format(idx)).shape[:2]) 139 | 140 | img_size = np.vstack(img_size) 141 | test_pred['height'] = img_size[:, 0] 142 | test_pred['width'] = img_size[:, 1] 143 | 144 | for col in test_pred.columns: 145 | if '_x' in col: 146 | test_pred[col]*=test_pred['width'] 147 | elif '_y' in col: 148 | test_pred[col]*=test_pred['height'] 149 | 150 | test_pred.astype(int).iloc[:, :-2].to_csv('tmp.csv', index=None, header=None) -------------------------------------------------------------------------------- 
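The prediction script above multiplies the 18 network outputs by each image's width and height, which implies the model regresses keypoints normalised to [0, 1]. Below is a minimal sketch of the matching label-preparation step, assuming train_box.csv stores absolute pixel coordinates under the same column names and that training images live in ../train/; the actual logic is in 1_train.py and may differ.

```python
import cv2
import pandas as pd

# Hypothetical sketch: turn absolute keypoint labels into the [0, 1] targets that
# 2_predict.py (above) maps back to pixels with each image's width and height.
df = pd.read_csv('train_box.csv')  # assumed columns: name, left_eye_x, ..., right_ear3_y

norm_targets = []
for _, row in df.iterrows():
    h, w = cv2.imread('../train/' + row['name']).shape[:2]  # assumed training-image folder
    coords = []
    for col in df.columns:
        if col.endswith('_x'):
            coords.append(row[col] / w)   # x normalised by image width
        elif col.endswith('_y'):
            coords.append(row[col] / h)   # y normalised by image height
    norm_targets.append(coords)           # 18 regression targets per image
```

Normalising by image size lets the fixed 512x512 network input used above share one regression head across photos of different resolutions.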
/competition/yanxishe-喵脸关键点检测/README.md: -------------------------------------------------------------------------------- 1 | https://god.yanxishe.com/19 2 | 3 | 利用CNN进行回归预测 4 | 5 | ``` 6 | python3 1_train.py 7 | python3 2_predict.py 8 | ``` 9 | -------------------------------------------------------------------------------- /competition/yanxishe-白葡萄酒品质预测/README.md: -------------------------------------------------------------------------------- 1 | https://god.yanxishe.com/15 2 | 3 | lgb线上得分96.2667 4 | 5 | -------------------------------------------------------------------------------- /competition/yanxishe-白葡萄酒品质预测/lgb_baseline.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import lightgbm as lgb 4 | from sklearn.preprocessing import LabelEncoder 5 | from itertools import combinations 6 | 7 | train_df = pd.read_csv('../input/train.csv', header=None, sep=';') 8 | test_df = pd.read_csv('../input/test.csv', header=None, sep=';') 9 | 10 | train_df = train_df[train_df[11] != 'quality'] 11 | lbl = LabelEncoder().fit(train_df[11]) 12 | train_df[11] = lbl.transform(train_df[11]) 13 | 14 | for a, b in combinations([0,1,2,3,4,7,8,9,10], 2): 15 | train_df[str(a) + '_' + str(b)] = train_df[a].astype(float) + train_df[b].astype(float) 16 | train_df[str(a) + '/' + str(b)] = train_df[a].astype(float) / train_df[b].astype(float) 17 | train_df[str(a) + '*' + str(b)] = train_df[a].astype(float) * train_df[b].astype(float) 18 | train_df[str(a) + '/log' + str(b)] = train_df[a].astype(float) / np.log1p(train_df[b].astype(float)) 19 | 20 | test_df[str(a) + '_' + str(b)] = test_df[a].astype(float) + test_df[b].astype(float) 21 | test_df[str(a) + '/' + str(b)] = test_df[a].astype(float) / test_df[b].astype(float) 22 | test_df[str(a) + '*' + str(b)] = test_df[a].astype(float) * test_df[b].astype(float) 23 | test_df[str(a) + '/log' + str(b)] = test_df[a].astype(float) / np.log1p(test_df[b].astype(float)) 24 | 25 | from sklearn.model_selection import StratifiedKFold 26 | from sklearn.metrics import roc_auc_score 27 | 28 | n_fold = 10 29 | skf = StratifiedKFold(n_splits = n_fold, shuffle = True) 30 | eval_fun = roc_auc_score 31 | 32 | def run_oof(clf, X_train, y_train, X_test, kf): 33 | print(clf) 34 | preds_train = np.zeros((len(X_train), 7), dtype = np.float) 35 | preds_test = np.zeros((len(X_test), 7), dtype = np.float) 36 | train_loss = []; test_loss = [] 37 | 38 | i = 1 39 | for train_index, test_index in kf.split(X_train, y_train): 40 | x_tr = X_train[train_index]; x_te = X_train[test_index] 41 | y_tr = y_train[train_index]; y_te = y_train[test_index] 42 | clf.fit(x_tr, y_tr, eval_set = [(x_te, y_te)], early_stopping_rounds = 500, verbose = False) 43 | 44 | # train_loss.append(eval_fun(y_tr, clf.predict_proba(x_tr)[:])) 45 | # test_loss.append(eval_fun(y_te, clf.predict_proba(x_te)[:])) 46 | 47 | preds_train[test_index] = clf.predict_proba(x_te)[:] 48 | preds_test += clf.predict_proba(X_test)[:] 49 | 50 | # print('{0}: Train {1:0.7f} Val {2:0.7f}/{3:0.7f}'.format(i, train_loss[-1], test_loss[-1], np.mean(test_loss))) 51 | print('-' * 50) 52 | i += 1 53 | print('Train: ', train_loss) 54 | print('Val: ', test_loss) 55 | print('-' * 50) 56 | # print('Train{0:0.5f}_Test{1:0.5f}\n\n'.format(np.mean(train_loss), np.mean(test_loss))) 57 | preds_test /= n_fold 58 | return preds_train, preds_test 59 | 60 | params = { 61 | 'learning_rate': 0.01, 62 | 'min_child_samples': 5, 63 | 'max_depth': 5, 64 | 'lambda_l1': 2, 65 | 'boosting': 
'gbdt', 66 | 'objective': 'multiclass', 67 | 'n_estimators': 3000, 68 | 'metric': 'multi_error', 69 | 'num_class': 7, 70 | 'feature_fraction': .75, 71 | 'bagging_fraction': .85, 72 | 'seed': 99, 73 | 'num_threads': 20, 74 | 'verbose': -1 75 | } 76 | 77 | train_pred, test_pred = run_oof(lgb.LGBMClassifier(**params), 78 | train_df.drop(11, axis=1).values, 79 | train_df[11].values, 80 | test_df.values, 81 | skf) 82 | 83 | submit = pd.DataFrame() 84 | submit[0] = range(len(test_df)) 85 | submit[1] = lbl.inverse_transform(np.argmax(test_pred, 1)) 86 | submit.to_csv('lgb.csv', index=None, header=None) 87 | -------------------------------------------------------------------------------- /competition/yanxishe-白葡萄酒品质预测/winequality_dataset.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/competition-baseline/99a1c3e3905573326fa2fc6d2ab2cc58286fee37/competition/yanxishe-白葡萄酒品质预测/winequality_dataset.zip -------------------------------------------------------------------------------- /competition/yanxishe-美食识别挑战(1):豆腐VS土豆/2_predict.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os, sys, glob, argparse 3 | import pandas as pd 4 | import numpy as np 5 | from tqdm import tqdm 6 | 7 | import time, datetime 8 | import pdb, traceback 9 | 10 | import cv2 11 | # import imagehash 12 | from PIL import Image 13 | 14 | from sklearn.model_selection import train_test_split, StratifiedKFold, KFold 15 | 16 | from efficientnet_pytorch import EfficientNet 17 | # model = EfficientNet.from_pretrained('efficientnet-b4') 18 | 19 | import torch 20 | torch.manual_seed(0) 21 | torch.backends.cudnn.deterministic = False 22 | torch.backends.cudnn.benchmark = True 23 | 24 | import torchvision.models as models 25 | import torchvision.transforms as transforms 26 | import torchvision.datasets as datasets 27 | import torch.nn as nn 28 | import torch.nn.functional as F 29 | import torch.optim as optim 30 | from torch.autograd import Variable 31 | from torch.utils.data.dataset import Dataset 32 | 33 | class QRDataset(Dataset): 34 | def __init__(self, img_path, transform=None): 35 | self.img_path = img_path 36 | if transform is not None: 37 | self.transform = transform 38 | else: 39 | self.transform = None 40 | 41 | def __getitem__(self, index): 42 | start_time = time.time() 43 | img = Image.open(self.img_path[index]).convert('RGB') 44 | 45 | if self.transform is not None: 46 | img = self.transform(img) 47 | return img,torch.from_numpy(np.array(int('PNEUMONIA' in self.img_path[index]))) 48 | 49 | def __len__(self): 50 | return len(self.img_path) 51 | 52 | class VisitNet(nn.Module): 53 | def __init__(self): 54 | super(VisitNet, self).__init__() 55 | 56 | model = models.resnet18(True) 57 | model.avgpool = nn.AdaptiveAvgPool2d(1) 58 | model.fc = nn.Linear(512, 2) 59 | self.resnet = model 60 | 61 | # model = EfficientNet.from_pretrained('efficientnet-b4') 62 | # model._fc = nn.Linear(1792, 2) 63 | # self.resnet = model 64 | 65 | def forward(self, img): 66 | out = self.resnet(img) 67 | return out 68 | 69 | def predict(test_loader, model, tta=10): 70 | # switch to evaluate mode 71 | model.eval() 72 | 73 | test_pred_tta = None 74 | for _ in range(tta): 75 | test_pred = [] 76 | with torch.no_grad(): 77 | end = time.time() 78 | for i, (input, target) in enumerate(test_loader): 79 | input = input.cuda() 80 | target = target.cuda() 81 | 82 | # compute output 83 | output = model(input) 84 
| output = output.data.cpu().numpy() 85 | 86 | test_pred.append(output) 87 | test_pred = np.vstack(test_pred) 88 | 89 | if test_pred_tta is None: 90 | test_pred_tta = test_pred 91 | else: 92 | test_pred_tta += test_pred 93 | 94 | return test_pred_tta 95 | 96 | test_jpg = ['./豆腐和土豆/test/{0}.jpg'.format(x) for x in range(0, 1047)] 97 | test_jpg = np.array(test_jpg) 98 | 99 | test_pred = None 100 | for model_path in ['resnet18_fold0.pt', 'resnet18_fold1.pt', 'resnet18_fold2.pt', 101 | 'resnet18_fold3.pt', 'resnet18_fold4.pt', 'resnet18_fold5.pt', 102 | 'resnet18_fold6.pt', 'resnet18_fold7.pt', 'resnet18_fold8.pt', 103 | 'resnet18_fold9.pt']: 104 | 105 | test_loader = torch.utils.data.DataLoader( 106 | QRDataset(test_jpg, 107 | transforms.Compose([ 108 | transforms.Resize((512, 512)), 109 | transforms.RandomHorizontalFlip(), 110 | transforms.RandomVerticalFlip(), 111 | transforms.ToTensor(), 112 | transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) 113 | ]) 114 | ), batch_size=10, shuffle=False, num_workers=10, pin_memory=True 115 | ) 116 | 117 | 118 | model = VisitNet().cuda() 119 | model.load_state_dict(torch.load(model_path)) 120 | # model = nn.DataParallel(model).cuda() 121 | if test_pred is None: 122 | test_pred = predict(test_loader, model, 2) 123 | else: 124 | test_pred += predict(test_loader, model, 2) 125 | 126 | test_csv = pd.DataFrame() 127 | test_csv[0] = list(range(0, 1047)) 128 | test_csv[1] = np.argmax(test_pred, 1) 129 | test_csv.to_csv('tmp.csv', index=None, header=None) -------------------------------------------------------------------------------- /competition/yanxishe-美食识别挑战(1):豆腐VS土豆/README.md: -------------------------------------------------------------------------------- 1 | https://god.yanxishe.com/16 2 | 3 | resnet18 10fold tta, 4 | 5 | 修改代码里面文件路径后: 6 | 7 | ``` 8 | python 1_train.py 9 | python 2_predict.py 10 | ``` 11 | -------------------------------------------------------------------------------- /competition/yanxishe-肌肉活动电信号推测手势/README.md: -------------------------------------------------------------------------------- 1 | https://god.yanxishe.com/14 2 | 3 | LBG baseline,线上95.2 4 | -------------------------------------------------------------------------------- /competition/yanxishe-肺炎X光病灶识别/2_predict.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os, sys, glob, argparse 3 | import pandas as pd 4 | import numpy as np 5 | from tqdm import tqdm 6 | 7 | import time, datetime 8 | import pdb, traceback 9 | 10 | import cv2 11 | # import imagehash 12 | from PIL import Image 13 | 14 | from sklearn.model_selection import train_test_split, StratifiedKFold, KFold 15 | 16 | from efficientnet_pytorch import EfficientNet 17 | # model = EfficientNet.from_pretrained('efficientnet-b4') 18 | 19 | import torch 20 | torch.manual_seed(0) 21 | torch.backends.cudnn.deterministic = False 22 | torch.backends.cudnn.benchmark = True 23 | 24 | import torchvision.models as models 25 | import torchvision.transforms as transforms 26 | import torchvision.datasets as datasets 27 | import torch.nn as nn 28 | import torch.nn.functional as F 29 | import torch.optim as optim 30 | from torch.autograd import Variable 31 | from torch.utils.data.dataset import Dataset 32 | 33 | class QRDataset(Dataset): 34 | def __init__(self, img_path, transform=None): 35 | self.img_path = img_path 36 | if transform is not None: 37 | self.transform = transform 38 | else: 39 | self.transform = None 40 | 41 | def __getitem__(self, 
index): 42 | start_time = time.time() 43 | img = Image.open(self.img_path[index]).convert('RGB') 44 | 45 | if self.transform is not None: 46 | img = self.transform(img) 47 | return img,torch.from_numpy(np.array(int('PNEUMONIA' in self.img_path[index]))) 48 | 49 | def __len__(self): 50 | return len(self.img_path) 51 | 52 | class VisitNet(nn.Module): 53 | def __init__(self): 54 | super(VisitNet, self).__init__() 55 | 56 | model = models.resnet18(True) 57 | model.avgpool = nn.AdaptiveAvgPool2d(1) 58 | model.fc = nn.Linear(512, 5) 59 | self.resnet = model 60 | 61 | # model = EfficientNet.from_pretrained('efficientnet-b4') 62 | # model._fc = nn.Linear(1792, 2) 63 | # self.resnet = model 64 | 65 | def forward(self, img): 66 | out = self.resnet(img) 67 | return out 68 | 69 | def predict(test_loader, model, tta=10): 70 | # switch to evaluate mode 71 | model.eval() 72 | 73 | test_pred_tta = None 74 | for _ in range(tta): 75 | test_pred = [] 76 | with torch.no_grad(): 77 | end = time.time() 78 | for i, (input, target) in enumerate(test_loader): 79 | input = input.cuda() 80 | target = target.cuda() 81 | 82 | # compute output 83 | output = model(input) 84 | output = output.data.cpu().numpy() 85 | 86 | test_pred.append(output) 87 | test_pred = np.vstack(test_pred) 88 | 89 | if test_pred_tta is None: 90 | test_pred_tta = test_pred 91 | else: 92 | test_pred_tta += test_pred 93 | 94 | return test_pred_tta 95 | 96 | test_jpg = ['./test/{0}.jpg'.format(x) for x in range(0, 6671)] 97 | test_jpg = np.array(test_jpg) 98 | 99 | test_pred = None 100 | for model_path in ['resnet18_fold0.pt', 'resnet18_fold0.pt']: 101 | 102 | test_loader = torch.utils.data.DataLoader( 103 | QRDataset(test_jpg, 104 | transforms.Compose([ 105 | transforms.Resize((512, 512)), 106 | transforms.RandomHorizontalFlip(), 107 | transforms.RandomVerticalFlip(), 108 | transforms.ToTensor(), 109 | transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) 110 | ]) 111 | ), batch_size=10, shuffle=False, num_workers=10, pin_memory=True 112 | ) 113 | 114 | 115 | model = VisitNet().cuda() 116 | model.load_state_dict(torch.load(model_path)) 117 | # model = nn.DataParallel(model).cuda() 118 | if test_pred is None: 119 | test_pred = predict(test_loader, model, 2) 120 | else: 121 | test_pred += predict(test_loader, model, 2) 122 | 123 | test_csv = pd.DataFrame() 124 | test_csv[0] = list(range(0, 6671)) 125 | test_csv[1] = np.argmax(test_pred, 1) 126 | test_csv.to_csv('tmp.csv', index=None, header=None) -------------------------------------------------------------------------------- /competition/yanxishe-肺炎X光病灶识别/README.md: -------------------------------------------------------------------------------- 1 | https://god.yanxishe.com/18 2 | 3 | 直接用分类的思路线上成绩77.5146,暂时没有用到位置信息; 4 | ``` 5 | python3 1_train.py 6 | python3 2_predict.py 7 | ``` 8 | -------------------------------------------------------------------------------- /competition/yanxishe-胸腔X光肺炎检测/2_predict.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os, sys, glob, argparse 3 | import pandas as pd 4 | import numpy as np 5 | from tqdm import tqdm 6 | 7 | import time, datetime 8 | import pdb, traceback 9 | 10 | import cv2 11 | # import imagehash 12 | from PIL import Image 13 | 14 | from sklearn.model_selection import train_test_split, StratifiedKFold, KFold 15 | 16 | from efficientnet_pytorch import EfficientNet 17 | # model = EfficientNet.from_pretrained('efficientnet-b4') 18 | 19 | import torch 20 | 
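# added note: fixed torch seed; cudnn.benchmark trades strict determinism for speed, and the random flips in the transforms keep each TTA pass different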
torch.manual_seed(0) 21 | torch.backends.cudnn.deterministic = False 22 | torch.backends.cudnn.benchmark = True 23 | 24 | import torchvision.models as models 25 | import torchvision.transforms as transforms 26 | import torchvision.datasets as datasets 27 | import torch.nn as nn 28 | import torch.nn.functional as F 29 | import torch.optim as optim 30 | from torch.autograd import Variable 31 | from torch.utils.data.dataset import Dataset 32 | 33 | class QRDataset(Dataset): 34 | def __init__(self, img_path, transform=None): 35 | self.img_path = img_path 36 | if transform is not None: 37 | self.transform = transform 38 | else: 39 | self.transform = None 40 | 41 | def __getitem__(self, index): 42 | start_time = time.time() 43 | img = Image.open(self.img_path[index]).convert('RGB') 44 | 45 | if self.transform is not None: 46 | img = self.transform(img) 47 | return img,torch.from_numpy(np.array(int('PNEUMONIA' in self.img_path[index]))) 48 | 49 | def __len__(self): 50 | return len(self.img_path) 51 | 52 | class VisitNet(nn.Module): 53 | def __init__(self): 54 | super(VisitNet, self).__init__() 55 | 56 | model = models.resnet18(True) 57 | model.avgpool = nn.AdaptiveAvgPool2d(1) 58 | model.fc = nn.Linear(512, 2) 59 | self.resnet = model 60 | 61 | # model = EfficientNet.from_pretrained('efficientnet-b4') 62 | # model._fc = nn.Linear(1792, 2) 63 | # self.resnet = model 64 | 65 | def forward(self, img): 66 | out = self.resnet(img) 67 | return out 68 | 69 | def predict(test_loader, model, tta=10): 70 | # switch to evaluate mode 71 | model.eval() 72 | 73 | test_pred_tta = None 74 | for _ in range(tta): 75 | test_pred = [] 76 | with torch.no_grad(): 77 | end = time.time() 78 | for i, (input, target) in enumerate(test_loader): 79 | input = input.cuda() 80 | target = target.cuda() 81 | 82 | # compute output 83 | output = model(input) 84 | output = output.data.cpu().numpy() 85 | 86 | test_pred.append(output) 87 | test_pred = np.vstack(test_pred) 88 | 89 | if test_pred_tta is None: 90 | test_pred_tta = test_pred 91 | else: 92 | test_pred_tta += test_pred 93 | 94 | return test_pred_tta 95 | 96 | test_jpg = ['../input/xray_dataset/test/{0}.jpeg'.format(x) for x in range(1, 1758)] 97 | test_jpg = np.array(test_jpg) 98 | 99 | test_pred = None 100 | for model_path in ['resnet18_fold0.pt', 'resnet18_fold1.pt', 'resnet18_fold2.pt', 101 | 'resnet18_fold3.pt']: 102 | 103 | test_loader = torch.utils.data.DataLoader( 104 | QRDataset(test_jpg, 105 | transforms.Compose([ 106 | transforms.Resize((512, 512)), 107 | transforms.RandomHorizontalFlip(), 108 | transforms.RandomVerticalFlip(), 109 | transforms.ToTensor(), 110 | transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) 111 | ]) 112 | ), batch_size=10, shuffle=False, num_workers=10, pin_memory=True 113 | ) 114 | 115 | 116 | model = VisitNet().cuda() 117 | model.load_state_dict(torch.load(model_path)) 118 | # model = nn.DataParallel(model).cuda() 119 | if test_pred is None: 120 | test_pred = predict(test_loader, model, 5) 121 | else: 122 | test_pred += predict(test_loader, model, 5) 123 | 124 | test_csv = pd.DataFrame() 125 | test_csv[0] = list(range(1, 1758)) 126 | test_csv[1] = np.argmax(test_pred, 1) 127 | test_csv.to_csv('tmp.csv', index=None, header=None) 128 | -------------------------------------------------------------------------------- /competition/yanxishe-胸腔X光肺炎检测/README.md: -------------------------------------------------------------------------------- 1 | https://god.yanxishe.com/13 2 | 3 | pytorch resnet18 TTA 4 | 线上99 5 | 6 | ``` 7 | 
python3 1_train.py 8 | python3 2_predict.py 9 | ``` 10 | 11 | -------------------------------------------------------------------------------- /competition/全球AI攻防挑战赛/README.md: -------------------------------------------------------------------------------- 1 | 在全球人工智能发展和治理广受关注的大趋势下,由中国图象图形学学会、蚂蚁集团、云安全联盟CSA大中华区主办,广泛联合学界、机构共同组织发起全球AI攻防挑战赛。本次比赛包含攻防两大赛道,分别聚焦大模型自身安全和大模型生成内容的防伪检测,涉及信用成长、凭证审核、商家入驻、智能助理等多个业务场景,覆盖机器学习、图像处理与计算机视觉、数据处理等多个算法领域,旨在聚合行业及学界力量共同守护AI及大模型的安全,共同推动AI安全可信技术的发展。 2 | 3 | - 赛题 1:https://tianchi.aliyun.com/s/24acb952f488f1f713a5294cf585bea3 4 | - 赛题 2:https://tianchi.aliyun.com/s/14a815673dc09ef786edf5794bf3bce2 5 | -------------------------------------------------------------------------------- /competition/点石-Retention Rate of Baidu Hao Kan APP Users/1_splitdf.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | import os, sys, time, codecs, glob 5 | from tqdm import tqdm, tqdm_notebook 6 | 7 | def read_input(debug=True): 8 | if debug: 9 | nrows = 100000 10 | else: 11 | nrows = None 12 | 13 | train = pd.read_csv('../input/train', sep='\t', nrows=nrows, 14 | names=['user_id', 'user_male', 'user_age', 'user_edu', 'user_district', 'label', 'user_install', 15 | 'video_id', 'video_class', 'video_tag', 'video_creator', 'video_uptime', 'video_duration', 16 | 'behavior_show', 'behavior_click', 'behavior_recommend', 'behavior_playback', 'behavior_timestamp', 17 | 'behavior_comment', 'behavior_like', 'behavior_forard'], 18 | dtype={'user_id':object, 'video_tag':object}) 19 | test = pd.read_csv('../input/test', sep='\t', nrows=nrows, 20 | names=['user_id', 'user_male', 'user_age', 'user_edu', 'user_district', 'user_install', 21 | 'video_id', 'video_class', 'video_tag', 'video_creator', 'video_uptime', 'video_duration', 22 | 'behavior_show', 'behavior_click', 'behavior_recommend', 'behavior_playback', 'behavior_timestamp', 23 | 'behavior_comment', 'behavior_like', 'behavior_forard']) 24 | 25 | # train['video_uptime'] = train['video_uptime'].apply(lambda x: timestamp_datetime(x)) 26 | # train['behavior_timestamp'] = train['behavior_timestamp'].apply(lambda x: timestamp_datetime(x / 1000)) 27 | # train['video_tag'] = train['video_tag'].apply(lambda x: x.split('$')) 28 | # train.sort_values(by=['user_id', 'behavior_timestamp'], inplace=True) 29 | 30 | 31 | # test['video_uptime'] = test['video_uptime'].apply(lambda x: timestamp_datetime(x)) 32 | # test['behavior_timestamp'] = test['behavior_timestamp'].apply(lambda x: timestamp_datetime(x / 1000)) 33 | # test['video_tag'] = test['video_tag'].apply(lambda x: x.split('$')) 34 | # test.sort_values(by=['user_id', 'behavior_timestamp'], inplace=True) 35 | 36 | return train, test 37 | 38 | train, test = read_input(debug=False) 39 | 40 | # idx = train['user_id'].value_counts() 41 | # idx = idx[train['user_id'].unique()] 42 | # idx = idx.reset_index() 43 | # for i, rows in tqdm(enumerate(idx.iterrows())): 44 | # if i == 0: 45 | # start = 0 46 | # else: 47 | # start = idx.iloc[:i]['user_id'].sum() 48 | # span = idx.iloc[i]['user_id'] 49 | 50 | # tmp_df = train.iloc[start :start+span] 51 | # tmp_df.to_csv('./train/{0}.csv'.format(str(idx.iloc[i]['index'])), index=None) 52 | 53 | idx = test['user_id'].value_counts() 54 | idx = idx[test['user_id'].unique()] 55 | idx = idx.reset_index() 56 | for i, rows in tqdm(enumerate(idx.iterrows())): 57 | if i == 0: 58 | start = 0 59 | else: 60 | start = idx.iloc[:i]['user_id'].sum() 61 | span = idx.iloc[i]['user_id'] 62 | 63 | 
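    # added note: rows for each user_id are assumed contiguous (first-appearance order), so this slices one user's block and writes it to ./test/<user_id>.csv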
tmp_df = test.iloc[start :start+span] 64 | tmp_df.to_csv('./test/{0}.csv'.format(str(idx.iloc[i]['index'])), index=None) 65 | -------------------------------------------------------------------------------- /competition/点石-Retention Rate of Baidu Hao Kan APP Users/README.md: -------------------------------------------------------------------------------- 1 | https://dianshi.baidu.com/competition/24/rule 2 | 3 | 比赛数据下载:链接: https://pan.baidu.com/s/1Nw64v5jPAoom3PUxRZqxNw 提取码: w54b 4 | 5 | 第五名代码 6 | -------------------------------------------------------------------------------- /competition/点石-Retention Rate of Baidu Hao Kan APP Users/featselect.py: -------------------------------------------------------------------------------- 1 | import os, sys, codecs 2 | import lightgbm as lgb 3 | 4 | def modelWarpper(clf, data_train, data_label, basescore): 5 | params = { 6 | 'learning_rate': 0.01, 7 | 'min_child_samples': 5, 8 | 'max_depth': 4, 9 | 'lambda_l1': 2, 10 | 'boosting': 'gbdt', 11 | 'objective': 'binary', 12 | 'n_estimators': 2000, 13 | 'metric': 'auc', 14 | # 'num_class': 6, 15 | 'feature_fraction': .85, 16 | 'bagging_fraction': .85, 17 | 'seed': 99, 18 | 'num_threads': -1, 19 | 'verbose': -1 20 | } 21 | for col in data_train.columns: 22 | cv_results1 = lgb.cv( 23 | params, 24 | lgb.Dataset(data_train.drop([col], axis=1).values, label=data_label.values), 25 | num_boost_round=2000, 26 | nfold=7, verbose_eval=False, 27 | early_stopping_rounds=200, 28 | ) 29 | 30 | if cv_results1['auc-mean'][-1] > basescore: 31 | print('+', col, 'CV AUC: ', len(cv_results1['auc-mean']), cv_results1['auc-mean'][-1]) 32 | else: 33 | print('-', col, 'CV AUC: ', len(cv_results1['auc-mean']), cv_results1['auc-mean'][-1]) 34 | 35 | XX -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛-事件抽取挑战/README.md: -------------------------------------------------------------------------------- 1 | 赛题官网:http://challenge.xfyun.cn/topic/info?type=hotspot 2 | 3 | 赛题baseline:https://zhuanlan.zhihu.com/p/150190165 4 | 5 | 思路:使用BERT完成事件抽取 6 | 7 | 硬软件需要: 8 | - 安装bert4keras,https://github.com/bojone/bert4keras 9 | - 有GPU 10 | - 下载BERT预训练参数,https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip 11 | -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛-婴儿啼哭声识别挑战赛/README.md: -------------------------------------------------------------------------------- 1 | 比赛链接:http://challenge.xfyun.cn/topic/info?type=baby-crying 2 | 3 | baseline思路:MFCC特征+CNN模型 4 | -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛-温室温度预测挑战赛/README.md: -------------------------------------------------------------------------------- 1 | 比赛链接:http://challenge.xfyun.cn/topic/info?type=temperature 2 | 3 | 线上得分0.14左右。 4 | -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛-脑PET图像分析和疾病预测挑战赛算法挑战大赛/2_predict.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os, sys, glob, argparse 3 | import pandas as pd 4 | import numpy as np 5 | from tqdm import tqdm 6 | 7 | import time, datetime 8 | import pdb, traceback 9 | 10 | import cv2 11 | # import imagehash 12 | from PIL import Image 13 | 14 | from sklearn.model_selection import train_test_split, StratifiedKFold, KFold 15 | 16 | # from efficientnet_pytorch import EfficientNet 17 | # model = 
EfficientNet.from_pretrained('efficientnet-b4') 18 | 19 | import torch 20 | torch.manual_seed(0) 21 | torch.backends.cudnn.deterministic = False 22 | torch.backends.cudnn.benchmark = True 23 | 24 | import torchvision.models as models 25 | import torchvision.transforms as transforms 26 | import torchvision.datasets as datasets 27 | import torch.nn as nn 28 | import torch.nn.functional as F 29 | import torch.optim as optim 30 | from torch.autograd import Variable 31 | from torch.utils.data.dataset import Dataset 32 | 33 | class QRDataset(Dataset): 34 | def __init__(self, train_jpg, transform=None): 35 | self.train_jpg = train_jpg 36 | if transform is not None: 37 | self.transform = transform 38 | else: 39 | self.transform = None 40 | 41 | def __getitem__(self, index): 42 | start_time = time.time() 43 | img = Image.open(self.train_jpg[index]).convert('RGB') 44 | 45 | if self.transform is not None: 46 | img = self.transform(img) 47 | 48 | return img,torch.from_numpy(np.array(int('AD' in self.train_jpg[index]))) 49 | 50 | def __len__(self): 51 | return len(self.train_jpg) 52 | 53 | class VisitNet(nn.Module): 54 | def __init__(self): 55 | super(VisitNet, self).__init__() 56 | 57 | model = models.resnet34(True) 58 | model.avgpool = nn.AdaptiveAvgPool2d(1) 59 | model.fc = nn.Linear(512, 2) 60 | self.resnet = model 61 | 62 | # model = EfficientNet.from_pretrained('efficientnet-b4') 63 | # model._fc = nn.Linear(1792, 2) 64 | # self.resnet = model 65 | 66 | def forward(self, img): 67 | out = self.resnet(img) 68 | return out 69 | 70 | def predict(test_loader, model, tta=10): 71 | # switch to evaluate mode 72 | model.eval() 73 | 74 | test_pred_tta = None 75 | for _ in range(tta): 76 | test_pred = [] 77 | with torch.no_grad(): 78 | end = time.time() 79 | for i, (input, target) in enumerate(test_loader): 80 | input = input.cuda() 81 | target = target.cuda() 82 | 83 | # compute output 84 | output = model(input) 85 | output = output.data.cpu().numpy() 86 | 87 | test_pred.append(output) 88 | test_pred = np.vstack(test_pred) 89 | 90 | if test_pred_tta is None: 91 | test_pred_tta = test_pred 92 | else: 93 | test_pred_tta += test_pred 94 | 95 | return test_pred_tta 96 | 97 | test_jpg = ['../初赛数据/test/AD&CN/{0}.png'.format(x) for x in range(1, 1001)] 98 | test_jpg = np.array(test_jpg) 99 | 100 | test_pred = None 101 | for model_path in ['resnet18_fold0.pt', 'resnet18_fold1.pt', 'resnet18_fold2.pt', 102 | 'resnet18_fold3.pt', 'resnet18_fold4.pt', 'resnet18_fold5.pt', 103 | 'resnet18_fold6.pt', 'resnet18_fold7.pt', 'resnet18_fold8.pt', 104 | 'resnet18_fold9.pt'][:1]: 105 | 106 | test_loader = torch.utils.data.DataLoader( 107 | QRDataset(test_jpg, 108 | transforms.Compose([ 109 | transforms.Resize((512, 512)), 110 | # transforms.CenterCrop((450, 450)), 111 | transforms.RandomHorizontalFlip(), 112 | transforms.RandomVerticalFlip(), 113 | transforms.ToTensor(), 114 | transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) 115 | ]) 116 | ), batch_size=10, shuffle=False, num_workers=10, pin_memory=True 117 | ) 118 | 119 | 120 | model = VisitNet().cuda() 121 | model.load_state_dict(torch.load(model_path)) 122 | # model = nn.DataParallel(model).cuda() 123 | if test_pred is None: 124 | test_pred = predict(test_loader, model, 5) 125 | else: 126 | test_pred += predict(test_loader, model, 5) 127 | 128 | test_csv = pd.DataFrame() 129 | test_csv['uuid'] = list(range(1, 1001)) 130 | test_csv['label'] = np.argmax(test_pred, 1) 131 | test_csv['label'] = test_csv['label'].map({1: 'AD', 0: 'CN'}) 132 | 
test_csv.to_csv('tmp.csv', index=None) -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛-脑PET图像分析和疾病预测挑战赛算法挑战大赛/README.md: -------------------------------------------------------------------------------- 1 | 赛题链接:http://challenge.xfyun.cn/topic/info?type=PET 2 | 3 | 赛题思路:CNN分类 4 | 5 | ``` 6 | python3 1_train.py 7 | python3 2_predict.py 8 | ``` 9 | -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛2021/中国科学技术大学_新冠肺炎声音诊断挑战赛.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 64, 6 | "id": "0c588760", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import os\n", 11 | "import pandas as pd\n", 12 | "import librosa\n", 13 | "import glob\n", 14 | "import numpy as np\n", 15 | "import xgboost as xgb" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "id": "e41b08d3", 21 | "metadata": {}, 22 | "source": [ 23 | "# 处理训练集" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 70, 29 | "id": "850a8554", 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "path = '初赛训练集\\\\cough'\n", 34 | "fea = []\n", 35 | "label = []\n", 36 | "for i in os.listdir(path):\n", 37 | " for j in os.listdir(label_path+'\\\\'+i):\n", 38 | " y, sr = librosa.load(path=label_path+'\\\\'+i+'\\\\'+j, sr=None, mono=False)\n", 39 | " y = y[::3]\n", 40 | " # 默认提取 20 帧\n", 41 | " audio_mac = librosa.feature.mfcc(y=y, sr=16000)\n", 42 | " y_shape = audio_mac.shape[1]\n", 43 | " max_pad_size=11\n", 44 | " if y_shape < max_pad_size:\n", 45 | " pad_size = max_pad_size - y_shape\n", 46 | " audio_mac = np.pad(audio_mac, ((0, 0), (0, pad_size)), mode='constant')\n", 47 | " else:\n", 48 | " audio_mac = audio_mac[:, :max_pad_size]\n", 49 | " fea.append(audio_mac.flatten())\n", 50 | " label.append(i)" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 71, 56 | "id": "34a055db", 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "df = pd.DataFrame(fea)\n", 61 | "df['label'] = label\n", 62 | "fea_names = [i for i in df.columns if i not in ['label']]" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 72, 68 | "id": "b244421a", 69 | "metadata": {}, 70 | "outputs": [ 71 | { 72 | "name": "stdout", 73 | "output_type": "stream", 74 | "text": [ 75 | "[23:14:32] WARNING: ..\\src\\learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. 
Explicitly set eval_metric if you'd like to restore the old behavior.\n" 76 | ] 77 | }, 78 | { 79 | "data": { 80 | "text/plain": [ 81 | "XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n", 82 | " colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,\n", 83 | " importance_type='gain', interaction_constraints='',\n", 84 | " learning_rate=0.300000012, max_delta_step=0, max_depth=6,\n", 85 | " min_child_weight=1, missing=nan, monotone_constraints='()',\n", 86 | " n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,\n", 87 | " reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,\n", 88 | " tree_method='exact', validate_parameters=1, verbosity=None)" 89 | ] 90 | }, 91 | "execution_count": 72, 92 | "metadata": {}, 93 | "output_type": "execute_result" 94 | } 95 | ], 96 | "source": [ 97 | "model = xgb.XGBClassifier()\n", 98 | "model.fit(df[fea_names],df['label'])" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "id": "d15c937f", 104 | "metadata": {}, 105 | "source": [ 106 | "# 处理测试集" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 55, 112 | "id": "4540e914", 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "path = '初赛测试集'\n", 117 | "fea = []\n", 118 | "files = []\n", 119 | "for j in os.listdir(path):\n", 120 | " files.append(j)\n", 121 | " y, sr = librosa.load(path=path+'\\\\'+j, sr=None, mono=False)\n", 122 | " y = y[::3]\n", 123 | " # 默认提取 20 帧\n", 124 | " audio_mac = librosa.feature.mfcc(y=y, sr=16000)\n", 125 | " y_shape = audio_mac.shape[1]\n", 126 | " max_pad_size=11\n", 127 | " if y_shape < max_pad_size:\n", 128 | " pad_size = max_pad_size - y_shape\n", 129 | " audio_mac = np.pad(audio_mac, ((0, 0), (0, pad_size)), mode='constant')\n", 130 | " else:\n", 131 | " audio_mac = audio_mac[:, :max_pad_size]\n", 132 | " fea.append(audio_mac.flatten())" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 56, 138 | "id": "423e324b", 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "df2 = pd.DataFrame(fea)\n", 143 | "df2['category_id'] = model.predict(df2[fea_names])\n", 144 | "df2['category_id'].value_counts()" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 78, 150 | "id": "64bc86d7", 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "df2['sample_id']=files\n", 155 | "df2[['sample_id','category_id']].to_csv('sub.csv',index=False)" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "id": "127bfc2b", 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "# 使用keras或其他方法\n", 166 | "# df['label'] = df['label'].apply(lambda x:0 if x=='Negative' else 1)\n", 167 | "\n", 168 | "# from sklearn.model_selection import train_test_split\n", 169 | "# x_train, x_val, y_train, y_val = train_test_split(df[fea_names], df['label'], test_size=0.2, random_state=42)\n", 170 | "\n", 171 | "# from keras.models import Sequential\n", 172 | "# from keras.layers import Dense\n", 173 | "# import keras\n", 174 | "\n", 175 | "# model = Sequential()\n", 176 | "# model.add(Dense(64, activation='relu', input_shape=(220,)))\n", 177 | "# model.add(Dense(64, activation='relu'))\n", 178 | "# model.add(Dense(64, activation='relu'))\n", 179 | "# model.add(Dense(1, activation='softmax'))\n", 180 | "\n", 181 | "# model.compile(loss=keras.losses.categorical_crossentropy,\n", 182 | "# optimizer=keras.optimizers.RMSprop(),\n", 183 | "# metrics=['accuracy'])\n", 184 | "# model.fit(x_train, y_train, 
batch_size=30, epochs=20, verbose=1,validation_data=(x_val, y_val))" 185 | ] 186 | } 187 | ], 188 | "metadata": { 189 | "kernelspec": { 190 | "display_name": "Python 3", 191 | "language": "python", 192 | "name": "python3" 193 | }, 194 | "language_info": { 195 | "codemirror_mode": { 196 | "name": "ipython", 197 | "version": 3 198 | }, 199 | "file_extension": ".py", 200 | "mimetype": "text/x-python", 201 | "name": "python", 202 | "nbconvert_exporter": "python", 203 | "pygments_lexer": "ipython3", 204 | "version": "3.8.8" 205 | } 206 | }, 207 | "nbformat": 4, 208 | "nbformat_minor": 5 209 | } 210 | -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛2021/中文成语填空挑战赛/README.md: -------------------------------------------------------------------------------- 1 | ## 中文成语填空挑战赛 2 | 3 | 中国文化博大精深源远流长,其中成语更是中国文化的精华。成语大多由四个字组成,一般都有典故或出处。有些成语从字面上不难理解,如“小题大做”、“后来居上”等。有些成语必须知道来源或典故才能懂得意思,如“朝三暮四”、“杯弓蛇影”等。 4 | 5 | 成语学习是小学语文和初中重要的学习内容,如何在语句中选择合适的成语?本次赛题中希望选手构建模型能理解中文成语。 6 | 7 | 比赛链接:http://challenge.xfyun.cn/topic/info?type=chinese-idioms&ch=dw-sq-1 8 | 9 | | text | 曾经在越南这个全球第四大网游市场占据80%的金山游戏CEO邹涛对记者表示:“海外市场的本土网游企业也在崛起,这一点在越南等东南亚市场表现尤其明显,越南本土游戏公司[MASK][MASK][MASK][MASK],再加上更多的中国企业瞄准这一市场,竞争更加激烈 | 10 | | ------------- | ------------------------------------------------------------ | 11 | | candidate | 张王赵李, 海不波溢, 七男八婿, 异军突起 | 12 | | label | 异军突起 | 13 | | | | 14 | 15 | 训练集5w条数据,测试集1w条数据,均为csv格式,列使用\t分割。测试集中label字段为空,需要选手预测。 16 | 17 | 18 | ## 赛事任务 19 | 20 | 给定一个中文句子的情况下,需要选手在给定上下文的情况下从待选的成语中选择最为合适的成语。即给定句子的上下文,完成合适的成语填入对应位置。 -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛2021/中文成语填空挑战赛/gen_train_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 -*- 3 | # author:quincy qiang 4 | # email:yanqiangmiffy@gmail.com 5 | # datetime:2021/8/16 11:20 6 | # description:"do something" 7 | 8 | import re 9 | import pandas as pd 10 | from tqdm import tqdm 11 | 12 | train = pd.read_csv('data/train.csv', sep='\t') 13 | test = pd.read_csv('data/test.csv', sep='\t') 14 | 15 | print(train) 16 | print(test) 17 | 18 | 19 | def process_text(text): 20 | return re.sub(' +', ' ', text).strip() 21 | 22 | 23 | def get_question(text): 24 | """ 25 | 根据[MASK][MASK][MASK][MASK]获取问题 26 | :param text: 27 | :return: 28 | """ 29 | sentences = re.split('(。|!|\!|\.|?|\?)', text) # 保留分割符 30 | for sent in sentences: 31 | if '[MASK][MASK][MASK][MASK]' in sent: 32 | return sent 33 | return text 34 | 35 | 36 | cols = [ 37 | "Unnamed: 0", 38 | "video-id", 39 | "fold-ind", # q_id 40 | "startphrase", 41 | "sent1", # content 42 | "sent2", # question 43 | "gold-source", 44 | "ending0", "ending1", "ending2", "ending3", # choice 45 | "label"] 46 | 47 | # ====================================================== 48 | # 生成训练集 49 | # ====================================================== 50 | res = [] 51 | 52 | for idx, row in tqdm(train.iterrows()): 53 | q_id = f'train_{idx}' 54 | content = row['text'] 55 | content = process_text(content) 56 | question = get_question(content) 57 | modified_choices = eval(row['candidate']) 58 | label = modified_choices.index(row['label']) 59 | ## Hard-code for swag format! 
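    # added note: each row becomes SWAG-style columns — sent1 = full context, sent2 = the sentence containing [MASK], ending0-3 = the four candidate idioms, label = index of the gold idiom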
60 | res.append(("", 61 | "", 62 | q_id, 63 | "", 64 | content, 65 | question, 66 | "", 67 | modified_choices[0], 68 | modified_choices[1], 69 | modified_choices[2], 70 | modified_choices[3], 71 | label)) 72 | df = pd.DataFrame(res, columns=cols) 73 | 74 | # ====================================================== 75 | # 生成测试集 76 | # ====================================================== 77 | res = [] 78 | print("test.shape", test.shape) 79 | for idx, row in tqdm(test.iterrows()): 80 | q_id = f'test_{idx}' 81 | content = row['text'] 82 | content = process_text(content) 83 | question = get_question(content) 84 | modified_choices = eval(row['candidate']) 85 | ## Hard-code for swag format! 86 | res.append(("", 87 | "", 88 | q_id, 89 | "", 90 | content, 91 | question, 92 | "", 93 | modified_choices[0], 94 | modified_choices[1], 95 | modified_choices[2], 96 | modified_choices[3], 97 | 0)) 98 | df_test = pd.DataFrame(res, columns=cols) 99 | 100 | print(df_test.shape) 101 | 102 | 103 | DEBUG = False 104 | if DEBUG: 105 | df.iloc[:50].to_csv('data/new_train.csv', index=False) 106 | df.iloc[-50:].to_csv('data/new_valid.csv', index=False) 107 | df_test.iloc[:50].to_csv('data/new_test.csv', index=False) 108 | else: 109 | df.iloc[:45000].to_csv('data/new_train.csv', index=False) 110 | df.iloc[5000:].to_csv('data/new_valid.csv', index=False) 111 | df_test.to_csv('data/new_test.csv', index=False) 112 | -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛2021/中文成语填空挑战赛/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -u baseline.py \ 4 | --model_name_or_path 'hfl/chinese-xlnet-base' \ 5 | --do_train \ 6 | --do_eval \ 7 | --do_predict \ 8 | --logging_steps=100 \ 9 | --max_seq_length 200 \ 10 | --train_file data/new_train.csv \ 11 | --validation_file data/new_valid.csv \ 12 | --test_file data/new_test.csv \ 13 | --learning_rate 3e-5 \ 14 | --num_train_epochs 2 \ 15 | --output_dir 'models/xlnet' \ 16 | --gradient_accumulation_steps 4 \ 17 | --per_device_eval_batch_size 16 \ 18 | --per_device_train_batch_size 16 \ 19 | --overwrite_output 20 | -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛2021/中文问题相似度挑战赛/README.md: -------------------------------------------------------------------------------- 1 | ## 中文问题相似度挑战赛 2 | 3 | ### 赛事背景 4 | 问答系统中包括三个主要的部分:问题理解,信息检索和答案抽取。而问题理解是问答系统的第一部分也是非常关键的一部分。问题理解有非常广泛的应用,如重复评论识别、相似问题识别等。 5 | 6 | 重复问题检测是一个常见的文本挖掘任务,在很多实际问答社区都有相应的应用。重复问题检测可以方便进行问题的答案聚合,以及问题答案推荐,自动QA等。由于中文词语的多样性和灵活性,本赛题需要选手构建一个重复问题识别算法。 7 | 8 | ### 赛事任务 9 | 本次赛题希望参赛选手对两个问题完成相似度打分。 10 | 11 | 训练集:约5千条问题对和标签。若两个问题是相同的问题,标签为1;否则为0。 12 | 13 | 测试集:约5千条问题对,需要选手预测标签。 14 | 15 | http://challenge.xfyun.cn/topic/info?type=chinese-question-similarity&ch=dw-sq-1 16 | 17 | ### baseline 18 | 19 | - [BERT NSP方法](https://github.com/datawhalechina/competition-baseline/blob/master/competition/%E7%A7%91%E5%A4%A7%E8%AE%AF%E9%A3%9EAI%E5%BC%80%E5%8F%91%E8%80%85%E5%A4%A7%E8%B5%9B2021/%E4%B8%AD%E6%96%87%E9%97%AE%E9%A2%98%E7%9B%B8%E4%BC%BC%E5%BA%A6%E6%8C%91%E6%88%98%E8%B5%9B/bert-nsp.ipynb) 20 | - [word2vec + LightGBM](https://mp.weixin.qq.com/s/E3sfNaNg8JH-w_7Yv40MWw), 链接:https://pan.baidu.com/s/1WC3vQGlgBFvnlAXcj-0qrA 提取码:v7aj 21 | -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛2021/人脸关键点检测挑战赛/README.md: -------------------------------------------------------------------------------- 1 | 
http://challenge.xfyun.cn/topic/info?type=key-points-of-human-face&ch=dw-sq-1 2 | 3 | ## 赛事背景 4 | 人脸识别是基于人的面部特征信息进行身份识别的一种生物识别技术,金融和安防是目前人脸识别应用最广泛的两个领域。人脸关键点是人脸识别中的关键技术。人脸关键点检测需要识别出人脸的指定位置坐标,例如眉毛、眼睛、鼻子、嘴巴和脸部轮廓等位置坐标等。 5 | 6 | ## 赛事任务 7 | 8 | 给定人脸图像,找到4个人脸关键点,赛题任务可以视为一个关键点检测问题。 9 | 10 | - 训练集:5千张人脸图像,并且给定了具体的人脸关键点标注。 11 | - 测试集:约2千张人脸图像,需要选手识别出具体的关键点位置。 12 | 13 | 14 | ## 赛题数据 15 | 16 | 赛题数据由训练集和测试集组成,train.csv为训练集标注数据,train.npy和test.npy为训练集图片和测试集图片,可以使用numpy.load进行读取。train.csv的信息为左眼坐标、右眼坐标、鼻子坐标和嘴巴坐标,总共8个点。 17 | 18 | 本次竞赛的评价标准回归MAE进行评价,数值越小性能更优,最高分为0。评估代码参考: 19 | 20 | ``` 21 | from sklearn.metrics import mean_absolute_error 22 | y_true = [3, -0.5, 2, 7] 23 | y_pred = [2.5, 0.0, 2, 8] 24 | mean_absolute_error(y_true, y_pred) 25 | ``` 26 | -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛2021/人脸情绪识别挑战赛/README.md: -------------------------------------------------------------------------------- 1 | ## 人脸情绪识别挑战赛 2 | 3 | 人脸表情是传播人类情感信息与协调人际关系的重要方式,表情识别是指从静态照片或视频序列中选择出表情状态,从而确定对人物的情绪与心理变化。在日常生活中人类习惯从面部表情中吸收非言语暗示,那么计算机可以完成类似任务吗?答案是肯定的,但是需要训练它学会识别情绪。 4 | 5 | 赛题链接:http://challenge.xfyun.cn/topic/info?type=facial-emotion-recognition 6 | 7 | ## 赛事任务 8 | 9 | 给定人脸照片完成具体的情绪识别,选手需要根据训练集数据构建情绪识别任务,并对测试集图像进行预测,识别人脸的7种情绪。 10 | 11 | ![](https://ai-contest-static.xfyun.cn/2021/120.jpg) 12 | -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛2021/学术论文分类挑战赛/README.md: -------------------------------------------------------------------------------- 1 | ## 学术论文分类挑战赛 2 | 3 | 随着人工智能技术不断发展,每周都有非常多的论文公开发布。现如今对论文进行分类逐渐成为非常现实的问题,这也是研究人员和研究机构每天都面临的问题。现在希望选手能构建一个论文分类模型。 4 | 5 | 比赛链接:http://challenge.xfyun.cn/topic/info?type=academic-paper-classification 6 | 7 | 8 | ## 赛事任务 9 | 10 | 本次赛题希望参赛选手利用论文信息:论文id、标题、摘要,划分论文具体类别。 11 | -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛2021/广告点击率预估挑战赛/README.md: -------------------------------------------------------------------------------- 1 | http://challenge.xfyun.cn/topic/info?type=Ad-click-through&ch=dw-sq-1 2 | 3 | ## 赛事背景 4 | 广告点击率预估是在线广告交易的核心环节之一,如果说一家公司想知道 CTR(点击率),以确定将他们的钱花在数字广告上是否值得。点击率高表示对该特定广告系列更感兴趣,点击率低可能表明广告可能不那么相关。高点击率表明更多人点击了网站,这有利于在谷歌、必应等在线平台上以更少的钱获得更好的广告位置。 5 | 6 | 近年来,各大有关广告点击率预估的比赛相拥而至,如腾讯广告算法大赛、科大讯飞营销算法大赛、阿里妈妈点击率预估大赛等。可以看出这是一个企业长期关注的问题,也是值得花时间探索的问题。 7 | 8 | ## 赛事任务 9 | 平台展示给用户特定的广告,用户存在点击与不点击两种行为。给定某平台实际广告业务中的用户行为数据,共包含13个用户相关的字段,其中isClick字段表明用户是否会点击广告。 10 | 11 | 任务目标是通过训练集训练模型,来预测测试集中isClick字段的概率结果,即用户点击平台所推荐广告的概率,以此为依据,表示用户对特定广告感兴趣的程度。 12 | -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛2021/电商图像检索挑战赛/README.md: -------------------------------------------------------------------------------- 1 | ## 赛事背景 2 | 在电商应用中每天商家都会上传数以百万的商品图像,商品图像可能是从不同角度拍摄的,也有可能是不同款式的商品图像。对于消费者而言,很难通过肉眼去找到相似的商品。如果有一种人工智能算法,能够找到相同商品的相同图像,则是非常有用的一项技术。 3 | 4 | http://challenge.xfyun.cn/topic/info?type=e-commerce-image-retrieval 5 | 6 | ## 赛事任务 7 | 给定一批电商商品(主要是服务商品)的图像,找到属于同一个商品的图像。任务可以视为一个图像检索问题,或者一个图像聚类问题,需要将同一个商品的图像聚类到一起。 8 | -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛2021/科大讯飞商店销量预测/README.md: -------------------------------------------------------------------------------- 1 | ## 线下商店销量预测挑战赛 2 | 3 | http://challenge.xfyun.cn/topic/info?type=offline-store-sales-forecast&ch=dw-sq-1 4 | 5 | ### 赛事背景 6 | 企业运营效率的提高主要依托于两个要素:销售预测的精度和供应链的反应速度。销售预测精度高,即便供应链反应速度不快,也能够实现库存与资金的高周转;采购管理、补货管理、销售管理等的基础便是销售预测。 7 | 8 | 
销量预测是个非常经典的时序预测问题,通过一段时间内销售数据,预测未来商品的销量,对商品进行合理的分配和调度,解决供货上的不足或者堆积等问题。 9 | 10 | ### 赛事任务 11 | 给定商店销量历史相关数据和时间等信息,预测商店对应商品的周销量。 12 | -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛2021/科大讯飞商店销量预测/lgb.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | from sklearn.metrics import mean_squared_error 5 | from sklearn.model_selection import KFold 6 | 7 | import lightgbm as lgb 8 | 9 | import warnings 10 | warnings.filterwarnings('ignore') 11 | 12 | train = pd.read_csv('train.csv') 13 | test = pd.read_csv('test.csv') 14 | sample_submit = pd.read_csv('sample_submit.csv') 15 | 16 | df = pd.concat([train, test], axis=0, ignore_index=True) 17 | 18 | def lag_feature_adv(df, lags, col): 19 | ''' 20 | 历史N周平移特征 21 | ''' 22 | tmp = df[['week','shop_id','item_id',col]] 23 | for i in lags: 24 | shifted = tmp.copy() 25 | shifted.columns = ['week','shop_id','item_id', col+'_lag_'+str(i)+'_adv'] 26 | shifted['week'] += i 27 | df = pd.merge(df, shifted, on=['week','shop_id','item_id'], how='left') 28 | df[col+'_lag_'+str(i)+'_adv'] = df[col+'_lag_'+str(i)+'_adv'] 29 | return df 30 | 31 | df = lag_feature_adv(df, [1, 2, 3], 'weekly_sales') 32 | 33 | x_train = df[df.week < 33].drop(['weekly_sales'], axis=1) 34 | y_train = df[df.week < 33]['weekly_sales'] 35 | x_test = df[df.week == 33].drop(['weekly_sales'], axis=1) 36 | 37 | 38 | def cv_model(clf, train_x, train_y, test_x, clf_name='lgb'): 39 | folds = 5 40 | seed = 1024 41 | kf = KFold(n_splits=folds, shuffle=True, random_state=seed) 42 | 43 | train = np.zeros(train_x.shape[0]) 44 | test = np.zeros(test_x.shape[0]) 45 | 46 | categorical_feature = ['shop_id','item_id','item_category_id'] 47 | cv_scores = [] 48 | 49 | for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)): 50 | print('************************************ {} ************************************'.format(str(i+1))) 51 | trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y[train_index], train_x.iloc[valid_index], train_y[valid_index] 52 | 53 | train_matrix = clf.Dataset(trn_x, label=trn_y) 54 | valid_matrix = clf.Dataset(val_x, label=val_y) 55 | 56 | params = { 57 | 'boosting_type': 'gbdt', 58 | 'objective': 'mse', 59 | 'metric': 'mse', 60 | 'min_child_weight': 5, 61 | 'num_leaves': 2 ** 7, 62 | 'lambda_l2': 10, 63 | 'feature_fraction': 0.9, 64 | 'bagging_fraction': 0.9, 65 | 'bagging_freq': 4, 66 | 'learning_rate': 0.05, 67 | 'seed': 1024, 68 | 'n_jobs':-1, 69 | 'silent': True, 70 | 'verbose': -1, 71 | } 72 | 73 | model = clf.train(params, train_matrix, 5000, valid_sets=[train_matrix, valid_matrix], 74 | categorical_feature = categorical_feature, 75 | verbose_eval=500,early_stopping_rounds=200) 76 | val_pred = model.predict(val_x, num_iteration=model.best_iteration) 77 | test_pred = model.predict(test_x, num_iteration=model.best_iteration) 78 | 79 | train[valid_index] = val_pred 80 | test += test_pred / kf.n_splits 81 | cv_scores.append(mean_squared_error(val_y, val_pred)) 82 | 83 | print(cv_scores) 84 | 85 | print("%s_scotrainre_list:" % clf_name, cv_scores) 86 | print("%s_score_mean:" % clf_name, np.mean(cv_scores)) 87 | print("%s_score_std:" % clf_name, np.std(cv_scores)) 88 | return train, test 89 | 90 | lgb_train, lgb_test = cv_model(lgb, x_train, y_train, x_test) 91 | 92 | 93 | sample_submit['weekly_sales'] = lgb_test 94 | sample_submit['weekly_sales'] = sample_submit['weekly_sales'].apply(lambda x:x if x>0 else 
0).values 95 | sample_submit.to_csv('baseline_result.csv', index=False) 96 | -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛2021/科大讯飞股份有限公司_基于用户画像的商品推荐挑战赛.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 28, 6 | "id": "9de58cd6", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import pandas as pd\n", 11 | "import numpy as np\n", 12 | "import lightgbm as lgb\n", 13 | "\n", 14 | "from sklearn.preprocessing import LabelEncoder\n", 15 | "from sklearn.metrics import f1_score\n", 16 | "from sklearn.model_selection import StratifiedKFold,KFold\n", 17 | "\n", 18 | "import multiprocessing\n", 19 | "\n", 20 | "from tqdm import tqdm\n", 21 | "import warnings\n", 22 | "warnings.filterwarnings(\"ignore\")" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 2, 28 | "id": "36f84a54", 29 | "metadata": { 30 | "scrolled": true 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "df2 = pd.read_csv('测试集/apply_new.txt',header=None,names=['pid','gender','age','targid','time','province','city','model','make'])\n", 35 | "df = pd.read_csv('train.txt',header=None,names=['pid','label','gender','age','targid','time','province','city','model','make'])\n", 36 | "df.head(5)" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "id": "3a70dd46", 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "def get_fea(df):\n", 47 | " df['targid_list']=df['targid'].apply(lambda x:x[1:-1].split(\",\"))\n", 48 | " for i in range(30):\n", 49 | " df['targid'+str(i)]=df['targid_list'].apply(lambda x:x[i] if len(x)>=i+1 else None)\n", 50 | " return df" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 1, 56 | "id": "88e8225f", 57 | "metadata": { 58 | "scrolled": true 59 | }, 60 | "outputs": [], 61 | "source": [ 62 | "train = get_fea(df)\n", 63 | "test = get_fea(df2)\n", 64 | "for col in ['province','city']:\n", 65 | " le = LabelEncoder()\n", 66 | " test[col] = le.fit_transform(test[col])\n", 67 | " train[col] = le.transform(train[col])" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 39, 73 | "id": "76a876a1", 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "for i in range(30):\n", 78 | " df['targid'+str(i)] = df['targid'+str(i)].astype('float64')\n", 79 | " df2['targid'+str(i)] = df2['targid'+str(i)].astype('float64')" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 41, 85 | "id": "12289c30", 86 | "metadata": { 87 | "scrolled": true 88 | }, 89 | "outputs": [], 90 | "source": [ 91 | "fea = [f for f in train.columns if f not in ['pid','model','make','targid_list','targid','time','label','len']]# \n", 92 | "model = lgb.LGBMRegressor(max_depth=15,num_leaves=20,learning_rate=0.1,n_estimators=100,seed=2020)\n", 93 | "\n", 94 | "model.fit(train[fea],train['label'])\n", 95 | "pre = model.predict(test[fea])" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 46, 101 | "id": "2e2baebd", 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "test['pre'] = pre\n", 106 | "test['pre'] = test['pre'].apply(lambda x:1 if x>0.5 else 0)\n", 107 | "sub = test[['pid','pre']]\n", 108 | "sub = sub.rename(columns=({'pid':'user_id','pre':'category_id'}))\n", 109 | "sub.to_csv('sub.csv',index=False)" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "id": 
"604ec138", 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [] 119 | } 120 | ], 121 | "metadata": { 122 | "kernelspec": { 123 | "display_name": "Python 3", 124 | "language": "python", 125 | "name": "python3" 126 | }, 127 | "language_info": { 128 | "codemirror_mode": { 129 | "name": "ipython", 130 | "version": 3 131 | }, 132 | "file_extension": ".py", 133 | "mimetype": "text/x-python", 134 | "name": "python", 135 | "nbconvert_exporter": "python", 136 | "pygments_lexer": "ipython3", 137 | "version": "3.8.8" 138 | } 139 | }, 140 | "nbformat": 4, 141 | "nbformat_minor": 5 142 | } 143 | -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛2021/科大讯飞股份有限公司_猪只盘点挑战赛.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "5516f498", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import os\n", 11 | "from imageai.Detection import ObjectDetection\n", 12 | "\n", 13 | "# imageai说明请查看官网: https://github.com/OlafenwaMoses/ImageAI/" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 14, 19 | "id": "4ccc8ee3", 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "execution_path = os.getcwd()\n", 24 | "\n", 25 | "detector = ObjectDetection()\n", 26 | "detector.setModelTypeAsRetinaNet()\n", 27 | "detector.setModelPath( os.path.join(execution_path , \"resnet50_coco_best_v2.1.0.h5\")) # 需要提取从官网下载h5文件\n", 28 | "detector.loadModel()" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 8, 34 | "id": "3ae4b045", 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "num = []\n", 39 | "pigs = []\n", 40 | "pic = []" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 13, 46 | "id": "b1e853f8", 47 | "metadata": { 48 | "scrolled": true 49 | }, 50 | "outputs": [], 51 | "source": [ 52 | "# 注意,这里的路径必须是全英文路径\n", 53 | "for file in os.listdir('E:\\\\pig\\\\test'):\n", 54 | " detections = detector.detectObjectsFromImage(input_image=os.path.join(execution_path , \"E:\\\\pig\\\\test\\\\\"+file), output_image_path=os.path.join(execution_path , \"E:\\\\pig\\\\test2\\\\\"+'2a'+file), minimum_percentage_probability=30)\n", 55 | " num.append(len(detections))\n", 56 | " pig = []\n", 57 | " pic.append(file)\n", 58 | " print(len(pic))\n", 59 | " for eachObject in detections: \n", 60 | " pig.append([eachObject[\"percentage_probability\"]]+eachObject[\"box_points\"])\n", 61 | "# print(eachObject[\"name\"] , \" : \", eachObject[\"percentage_probability\"], \" : \", eachObject[\"box_points\"] )\n", 62 | "# print(\"--------------------------------\")\n", 63 | " pigs.append(pig)" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 10, 69 | "id": "f16b5c7f", 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "# 写文件pig_count.txt\n", 74 | "with open('E:\\\\pig\\\\det_files\\\\pig_count.txt','w',encoding='utf-8') as f:\n", 75 | " [f.write('{0} {1}\\n'.format(key, value)) for key,value in zip(pic,num)]" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 12, 81 | "id": "a40234e3", 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "# 写文件det_results\n", 86 | "for i in range(len(num)):\n", 87 | " with open('E:\\\\pig\\\\det_files\\\\det_results\\\\'+pic[i].split('.')[0]+'.txt','w',encoding='utf-8') as f:\n", 88 | " [f.write('pig {0} {1} {2} {3} 
{4}\\n'.format(min(0.01*value[0]+0.4,0.99),(value[3]+value[1])/2,(value[2]+value[4])/2,(value[3]-value[1]),(value[4]-value[2]))) for value in pigs[i]]\n", 89 | " # ****需要注意,最终提交的格式是x_ccenter,y_center,w,h和训练集里面的box格式不同。这个当时提交踩了很多坑" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "id": "f68675b7", 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [] 99 | } 100 | ], 101 | "metadata": { 102 | "kernelspec": { 103 | "display_name": "Python 3", 104 | "language": "python", 105 | "name": "python3" 106 | }, 107 | "language_info": { 108 | "codemirror_mode": { 109 | "name": "ipython", 110 | "version": 3 111 | }, 112 | "file_extension": ".py", 113 | "mimetype": "text/x-python", 114 | "name": "python", 115 | "nbconvert_exporter": "python", 116 | "pygments_lexer": "ipython3", 117 | "version": "3.8.8" 118 | } 119 | }, 120 | "nbformat": 4, 121 | "nbformat_minor": 5 122 | } 123 | -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛2021/移动设备用户年龄和性别预测/README.md: -------------------------------------------------------------------------------- 1 | ## 赛事背景 2 | 对于移动设备厂商而言,获取当前手机用户的人口属性信息是非常困难的。基于用户的手机及日常使用应用程序的偏好准确地预测其人口属性信息是提升个性化体验、构建精准用户画像的基础。 3 | 4 | 需要说明的是,本赛事数据已获得个人用户的充分认可和同意,并已进行适当的匿名处理以保护隐私。由于保密,我们不会提供有关如何获得性别和年龄数据的详细信息。 5 | 6 | 赛题链接:http://challenge.xfyun.cn/topic/info?type=mobile-devices&ch=dw-sq-1 7 | 8 | ## 赛事任务 9 | 10 | 本次比赛有两个任务,分别对移动设备(device_id)进行性别和年龄的预测,这里包含二分类和回归两个问题,最终会将两个部分的分数结合起来进行排名。 -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛2021/蛋白质结构预测挑战赛.md: -------------------------------------------------------------------------------- 1 | ## 赛事背景 2 | 蛋白质是组成人体一切细胞、组织的重要成分,是参与人类生命活动必不可少的一部分。没有蛋白质就没有生命,足以见得蛋白质在生命中的重要性。基于此,越来越多的学者开始关注对生物信息学中蛋白质组学的研究,如蛋白质序列分析,蛋白质表达分析,蛋白质结构预测等。 3 | 4 | 蛋白质结构预测是生物信息学的重要应用之一,蛋白质的结构对于理解蛋白质的功能十分重要。深入了解蛋白质的功能对疾病的基因检测和新型药物的开发有很直接的帮助,研究表明,具有相似结构的蛋白质其功能也相似。蛋白质的一级结构(氨基酸序列)可以由其基因编码序列获得,而蛋白质的结构由氨基酸序列唯一决定,这些结构信息包括二级结构,三级结构,四级结构。目前,实现对这些结构的准确预测仍然是一个亟待突破的关键问题。 5 | 6 | 比赛链接:http://challenge.xfyun.cn/topic/info?type=protein&ch=dw-sq-1 7 | 8 | ## 赛事任务 9 | 蛋白质折叠识别常被用于解决蛋白质结构预测问题,本次大赛提供了蛋白质结构分类数据库SCOP中的ASTRAL SCOPe 2.07数据中蛋白质相似性小于40%的α,β,α+β,α/β类中所属的折叠类型作为研究对象。参赛选手需基于提供的样本集构建模型,实现蛋白质的折叠分类。 10 | 11 | 12 | ## 赛题思路 13 | 14 | https://github.com/HighingLIN/danbaizhi 15 | 16 | 把蛋白质的结构视为一句话,每个句子的精度为一个字母,然后embedding编码,再用宽视野的Conv1D提取每个字的局部特征,用MaxPooling1D再去提取局部特征(为了防止过拟合,所以pool_size比较大)。 17 | 18 | 模型的灵感来自于天池蛋白质比赛的top3开源:https://github.com/yjh126yjh/TianChi_Protein-Secondary-Structure-Prediction . 
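
A minimal Keras sketch of the idea above (character embedding → wide-kernel Conv1D → MaxPooling1D with a large pool_size → softmax). This is not the open-sourced winning code; the vocabulary size, sequence length, kernel/pool sizes and number of fold classes below are assumed placeholders:

```python
from tensorflow.keras import layers, models

VOCAB_SIZE = 26          # assumed: distinct amino-acid letters (plus a padding index)
MAX_LEN = 1000           # assumed: sequences padded / truncated to this length
NUM_FOLD_CLASSES = 245   # assumed: number of SCOPe fold labels in the training set

model = models.Sequential([
    layers.Embedding(VOCAB_SIZE, 64, input_length=MAX_LEN),
    # wide receptive field to capture local residue patterns
    layers.Conv1D(128, kernel_size=15, padding="same", activation="relu"),
    # large pool_size, as described above, to reduce overfitting
    layers.MaxPooling1D(pool_size=20),
    layers.Conv1D(128, kernel_size=15, padding="same", activation="relu"),
    layers.GlobalMaxPooling1D(),
    layers.Dense(NUM_FOLD_CLASSES, activation="softmax"),
])
model.compile(optimizer="adam",
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])
```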
-------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛2021/车辆贷款违约预测挑战赛/Baseline.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | 5 | # 导入第三方包 6 | import pandas as pd 7 | import numpy as np 8 | 9 | import lightgbm as lgb 10 | 11 | from sklearn.model_selection import KFold 12 | from sklearn.metrics import f1_score, roc_auc_score 13 | 14 | import warnings 15 | warnings.filterwarnings('ignore') 16 | 17 | 18 | 19 | # 读取数据集,具体下载方式可见操作手册 20 | train = pd.read_csv('train.csv') 21 | test = pd.read_csv('test.csv') 22 | 23 | sample_submit = pd.read_csv('sample_submit.csv') 24 | 25 | 26 | # 训练数据及测试数据准备 27 | all_cols = [f for f in train.columns if f not in ['customer_id','loan_default']] 28 | 29 | x_train = train[all_cols] 30 | x_test = test[all_cols] 31 | 32 | y_train = train['loan_default'] 33 | 34 | 35 | # 作为baseline部分仅使用经典的**LightGBM**作为训练模型,我们还能尝试**XGBoost、CatBoost和NN(神经网络)** 36 | def cv_model(clf, train_x, train_y, test_x, clf_name='lgb'): 37 | folds = 5 38 | seed = 2021 39 | kf = KFold(n_splits=folds, shuffle=True, random_state=seed) 40 | 41 | train = np.zeros(train_x.shape[0]) 42 | test = np.zeros(test_x.shape[0]) 43 | 44 | cv_scores = [] 45 | 46 | for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)): 47 | print('************************************ {} ************************************'.format(str(i+1))) 48 | trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y[train_index], train_x.iloc[valid_index], train_y[valid_index] 49 | 50 | train_matrix = clf.Dataset(trn_x, label=trn_y) 51 | valid_matrix = clf.Dataset(val_x, label=val_y) 52 | 53 | params = { 54 | 'boosting_type': 'gbdt', 55 | 'objective': 'binary', 56 | 'metric': 'auc', 57 | 'min_child_weight': 5, 58 | 'num_leaves': 2 ** 7, 59 | 'lambda_l2': 10, 60 | 'feature_fraction': 0.9, 61 | 'bagging_fraction': 0.9, 62 | 'bagging_freq': 4, 63 | 'learning_rate': 0.01, 64 | 'seed': 2021, 65 | 'nthread': 28, 66 | 'n_jobs':-1, 67 | 'silent': True, 68 | 'verbose': -1, 69 | } 70 | 71 | model = clf.train(params, train_matrix, 50000, valid_sets=[train_matrix, valid_matrix], verbose_eval=500,early_stopping_rounds=200) 72 | val_pred = model.predict(val_x, num_iteration=model.best_iteration) 73 | test_pred = model.predict(test_x, num_iteration=model.best_iteration) 74 | 75 | # print(list(sorted(zip(features, model.feature_importance("gain")), key=lambda x: x[1], reverse=True))[:20]) 76 | 77 | train[valid_index] = val_pred 78 | test += test_pred / kf.n_splits 79 | cv_scores.append(roc_auc_score(val_y, val_pred)) 80 | 81 | print(cv_scores) 82 | 83 | print("%s_scotrainre_list:" % clf_name, cv_scores) 84 | print("%s_score_mean:" % clf_name, np.mean(cv_scores)) 85 | print("%s_score_std:" % clf_name, np.std(cv_scores)) 86 | return train, test 87 | 88 | 89 | 90 | lgb_train, lgb_test = cv_model(lgb, x_train, y_train, x_test) 91 | 92 | 93 | # 预测结果 94 | sample_submit['loan_default'] = lgb_test 95 | sample_submit['loan_default'] = sample_submit['loan_default'].apply(lambda x:1 if x>0.25 else 0).values 96 | sample_submit.to_csv('baseline_result.csv', index=False) 97 | 98 | 99 | 100 | 101 | -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛2022/README.md: -------------------------------------------------------------------------------- 1 | 
本届大赛按照算法、应用、编程赛、虚拟形象选拔、辩论赛、创意集市创意赛等等方向设置众多赛道;覆盖了智能语音、视觉、自然语言、图文识别等AI热门技术;涵盖了元宇宙、遗址文化、生物与环保、医疗健康、智能家居、电商销售等众多领域。大赛地址: 2 | 3 | https://challenge.xfyun.cn/?ch=ds22-dw-sq04 4 | -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛2022/汽车领域多语种迁移学习挑战赛-baseline-0.61.py: -------------------------------------------------------------------------------- 1 | import pandas as pd # 读取文件 2 | import numpy as np # 数值计算 3 | import nagisa # 日文分词 4 | from sklearn.feature_extraction.text import TfidfVectorizer # 文本特征提取 5 | from sklearn.linear_model import LogisticRegression # 逻辑回归 6 | from sklearn.pipeline import make_pipeline # 组合流水线 7 | 8 | # 读取数据 9 | train_cn = pd.read_excel('汽车领域多语种迁移学习挑战赛初赛训练集/中文_trian.xlsx') 10 | train_ja = pd.read_excel('汽车领域多语种迁移学习挑战赛初赛训练集/日语_train.xlsx') 11 | train_en = pd.read_excel('汽车领域多语种迁移学习挑战赛初赛训练集/英文_train.xlsx') 12 | 13 | test_ja = pd.read_excel('testA.xlsx', sheet_name='日语_testA') 14 | test_en = pd.read_excel('testA.xlsx', sheet_name='英文_testA') 15 | 16 | # 文本分词 17 | train_ja['words'] = train_ja['原始文本'].apply(lambda x: ' '.join(nagisa.tagging(x).words)) 18 | train_en['words'] = train_en['原始文本'].apply(lambda x: x.lower()) 19 | 20 | test_ja['words'] = test_ja['原始文本'].apply(lambda x: ' '.join(nagisa.tagging(x).words)) 21 | test_en['words'] = test_en['原始文本'].apply(lambda x: x.lower()) 22 | 23 | # 训练TFIDF和逻辑回归 24 | pipline = make_pipeline( 25 | TfidfVectorizer(), 26 | LogisticRegression() 27 | ) 28 | pipline.fit( 29 | train_ja['words'].tolist() + train_en['words'].tolist(), 30 | train_ja['意图'].tolist() + train_en['意图'].tolist() 31 | ) 32 | 33 | # 模型预测 34 | test_ja['意图'] = pipline.predict(test_ja['words']) 35 | test_en['意图'] = pipline.predict(test_en['words']) 36 | test_en['槽值1'] = np.nan 37 | test_en['槽值2'] = np.nan 38 | 39 | test_ja['槽值1'] = np.nan 40 | test_ja['槽值2'] = np.nan 41 | 42 | # 写入提交文件 43 | writer = pd.ExcelWriter('submit.xlsx') 44 | test_en.drop(['words'], axis=1).to_excel(writer, sheet_name='英文_testA', index=None) 45 | test_ja.drop(['words'], axis=1).to_excel(writer, sheet_name='日语_testA', index=None) 46 | writer.save() 47 | writer.close() 48 | -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛2023/README.md: -------------------------------------------------------------------------------- 1 | https://challenge.xfyun.cn/?ch=vWxQGFU 2 | -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛2023/中文语义病句识别纠正_baseline.md: -------------------------------------------------------------------------------- 1 | - 赛题名称:中文语义病句识别纠正 2 | - 赛题类型:病句识别、错词纠正 3 | - 赛题报名链接👇: 4 | 5 | https://challenge.xfyun.cn/topic/info?type=identification-and-correction&ch=vWxQGFU 6 | 7 | ## 赛题背景 8 | 9 | 近年来随着自媒体热潮的掀起,人人都是信息的生产者,互联网上文本错误的内容暴增,如何避免这些文本错误,成为了人们迫切关注的问题。因此,各大有关文本校对的比赛相拥而至。 10 | 11 | 然而,过往的文本错误主要针对拼写错误和语法错误,这些错误对于人类来说相对简单,往往是由外国语言学习者和中文母语写作者的疏忽而产生的。对于出版、教育等一些对深层次的中文语义错误识别有需求的行业,中文语义病句的识别将会有更大的帮助。 12 | 13 | 语义病句经常出现在初高中的语文考试题目中,用来衡量学生对语文知识的掌握程度,这类语义病句对于学生来说是比较困难的,对于研究也有重大意义。 14 | 15 | 16 | ## 赛事任务 17 | 18 | 本赛事包含两个任务,分别是:中文语义病句识别和中文语义病句纠正。中文语义病句识别是一个二分类的问题,预测句子是否是语义病句。中文语义病句纠正任务需要针对病句给出纠正后的句子。语义错误和拼写错误、语法错误不同,语义错误更加关注句子语义层面的合法性,语义病句例子如下表所示。 19 | 20 | | 病句 | 纠正后的句子 | 21 | | ---------------------------------------- | ---------------------------------------- | 22 | | 英法联军**烧毁并洗劫**了北京圆明园。 | 英法联军**洗劫并烧毁**了北京圆明园。 | 23 | | 山上的水宝贵,把它留给**晚上来**的人喝。 | 山上的水宝贵,把它留给**上来晚**的人喝。 | 24 | | 国内彩电**市场**严重**滞销**。 | 国内彩电严重**滞销**。 | 25 | 26 | 27 | ## 赛题数据 28 | 
本次比赛使用的数据一部分来自网络上的中小学病句题库,一部分来自人工标注。每条数据包括句子id、句子标签(0:正确句子/1:病句)、原始句子、纠正后的句子。数据格式示例如下表所示: 29 | 30 | | id | 标签 | 原始句子 | 纠正后的句子 | 31 | | ---- | ---- | ------------------------------------ | ------------------------------------ | 32 | | 1 | 1 | 英法联军烧毁并洗劫了北京圆明园。 | 英法联军洗劫并烧毁了北京圆明园。 | 33 | | 2 | 1 | 山上的水宝贵,把它留给晚上来的人喝。 | 山上的水宝贵,把它留给上来晚的人喝。 | 34 | | 3 | 0 | 国内彩电严重滞销。 | 国内彩电严重滞销。 | 35 | 36 | ## 评估指标 37 | - 中文语义病句识别任务 38 | 39 | 本模型依据提交的结果文件,采用针对语义病句的F1-score进行评价。 40 | 41 | - 中文语义病句纠正任务 42 | 43 | 本任务采用ChERRANT(Chinese ERRANT)中文GEC评估工具。ChERRANT的主要功能是通过对比预测编辑和标准编辑,计算预测结果的精确度、召回度、F值指标,从而评估语法纠错模型的性能。 44 | 45 | ## 解题思路 46 | 47 | 赛题本质是错词纠正任务,需要使用错词纠正的模型进行训练和预测。这里给出一种简单的思路,使用`t5`进行错词纠正。 48 | 49 | ### 步骤1:配置pycorrector 50 | 51 | `pycorrector`自带有`t5`预训练的错词纠正模型,首选需要配置如下库: 52 | 53 | ``` 54 | torch 55 | transformers 56 | datasets 57 | loguru 58 | ``` 59 | 60 | 接下来下载`pycorrector`代码: 61 | 62 | ``` 63 | https://github.com/shibing624/pycorrector 64 | ``` 65 | 66 | ### 步骤2:定义数据集 67 | 68 | 将比赛数据集转换为tsv格式,为【原始句子】 + 【\t】 + 【正确句子的格式】。每一行为一条训练样本。 69 | 70 | ``` 71 | 你说的是对,跟那些失业的人比起来你也算是辛运的。 你说的是对,跟那些失业的人比起来你也算是幸运的。 72 | ``` 73 | 74 | ### 步骤3:模型训练 75 | 76 | 77 | 根据自己的GPU大小,修改batch size。 78 | 79 | https://github.com/shibing624/pycorrector/blob/master/pycorrector/t5/train.py 80 | 81 | 命令行运行进行训练: 82 | 83 | ``` 84 | python train.py --do_train --train_path ../../../train.tsv 85 | ``` 86 | 87 | ### 步骤4:模型预测与提交 88 | 89 | 修改预测代码中的模型加载路径,传入待预测句子。最终将生成的结果写为指定json的格式,然后提交。 90 | 91 | 92 | https://github.com/shibing624/pycorrector/blob/master/pycorrector/t5/t5_corrector.py 93 | 94 | ``` 95 | python3.9 t5_corrector.py 96 | ``` 97 | 98 | 这种思路的分数有31分左右,还有很大的提高空间。比如交叉训练、预训练,或者使用ChatGLM进行尝试。 99 | 100 | 101 | 方案开源地址: 102 | 103 | https://github.com/datawhalechina/competition-baseline/tree/master/competition/%E7%A7%91%E5%A4%A7%E8%AE%AF%E9%A3%9EAI%E5%BC%80%E5%8F%91%E8%80%85%E5%A4%A7%E8%B5%9B2023 104 | -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛2023/人岗匹配挑战赛2023_baseline.md: -------------------------------------------------------------------------------- 1 | 本地目录如下: 2 | 3 | ``` 4 | person-post-matching-2023/ 5 | run.py 6 | train.json 从比赛官网下载 7 | job_list.json 从比赛官网下载 8 | ``` 9 | 10 | 打包提交过程 11 | ``` 12 | tar -cvzf person-post-matching-2023.tar.gz person-post-matching-2023/ 13 | s3cmd put person-post-matching-2023.tar.gz s3://ai-competition/你的url/ 14 | ``` 15 | 16 | run.py代码内容如下: 17 | 18 | ```python 19 | import json 20 | import pandas as pd 21 | import numpy as np 22 | from sklearn.linear_model import LogisticRegression 23 | from sklearn.ensemble import RandomForestClassifier 24 | from sklearn.model_selection import cross_val_predict 25 | 26 | train_data = pd.read_json('./train.json') 27 | train_data['解析结果'] = train_data['解析结果'].apply(lambda x : json.dumps(x).replace('"', ' ').replace('"', ' ').split()) 28 | 29 | test_data = pd.read_json('/work/data/personnel-matching-test-set/test.json') 30 | test_data['解析结果'] = test_data['解析结果'].apply(lambda x : json.dumps(x).replace('"', ' ').replace('"', ' ').split()) 31 | 32 | joblist = pd.read_json('./job_list.json') 33 | joblist['解析结果'] = joblist['岗位名称'] + ' ' + joblist['岗位介绍'] + ' ' + joblist['岗位要求'] 34 | joblist['解析结果'] = joblist['解析结果'].apply(lambda x : x.split()) 35 | 36 | train_feat = [] 37 | for row in train_data.iterrows(): 38 | label = row[1]['岗位ID'] 39 | query_text= row[1]['解析结果'] 40 | feat = [ 41 | label, 42 | len(query_text), len(set(query_text)), len(query_text) - len(set(query_text)), 43 | ] 44 | for target_text in joblist['解析结果']: 45 
| feat += [ 46 | len(set(query_text) & set(target_text)), 47 | len(set(query_text) & set(target_text)) / len(query_text), 48 | len(set(query_text) & set(target_text)) / len(target_text), 49 | 50 | len(set(query_text) & set(target_text)) / len(set(target_text)), 51 | len(set(query_text) & set(target_text)) / len(set(query_text)) 52 | 53 | ] 54 | train_feat.append(feat) 55 | train_feat = np.array(train_feat) 56 | m = RandomForestClassifier() 57 | m.fit( 58 | train_feat[:, 1:], 59 | train_feat[:, 0], 60 | ) 61 | 62 | test_feat = [] 63 | for row in test_data.iterrows(): 64 | query_text= row[1]['解析结果'] 65 | feat = [ 66 | len(query_text), len(set(query_text)), len(query_text) - len(set(query_text)), 67 | ] 68 | for target_text in joblist['解析结果']: 69 | feat += [ 70 | len(set(query_text) & set(target_text)), 71 | len(set(query_text) & set(target_text)) / len(query_text), 72 | len(set(query_text) & set(target_text)) / len(target_text), 73 | 74 | len(set(query_text) & set(target_text)) / len(set(target_text)), 75 | len(set(query_text) & set(target_text)) / len(set(query_text)) 76 | 77 | ] 78 | test_feat.append(feat) 79 | test_feat = np.array(test_feat) 80 | pd.DataFrame({ 81 | '简历ID': range(len(test_data)), 82 | '岗位ID': m.predict(test_feat).astype(int) 83 | }).to_csv('/work/output/result.csv', index=None) 84 | 85 | ``` 86 | -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛2023/基于近红外光谱的煤质参数预测挑战赛_baseline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "id": "584f7c00-08a1-4776-a29a-a4e9095fcf48", 7 | "metadata": { 8 | "tags": [] 9 | }, 10 | "outputs": [], 11 | "source": [ 12 | "import pandas as pd" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 13, 18 | "id": "aad799ff-7a71-4c3c-883c-9c7d5b504c10", 19 | "metadata": { 20 | "tags": [] 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "train_data = pd.read_csv('基于近红外光谱的煤质参数预测挑战赛公开数据/train_data.csv')\n", 25 | "train_label = pd.read_csv('基于近红外光谱的煤质参数预测挑战赛公开数据/train_label.csv')\n", 26 | "\n", 27 | "test_data = pd.read_csv('基于近红外光谱的煤质参数预测挑战赛公开数据/test_data.csv')\n", 28 | "submit = pd.read_csv('提交示例.csv')" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 10, 34 | "id": "06b6d303-accd-4aad-84ee-8b3e225714b7", 35 | "metadata": { 36 | "tags": [] 37 | }, 38 | "outputs": [ 39 | { 40 | "data": { 41 | "text/plain": [ 42 | "((100, 126), (500, 126))" 43 | ] 44 | }, 45 | "execution_count": 10, 46 | "metadata": {}, 47 | "output_type": "execute_result" 48 | } 49 | ], 50 | "source": [ 51 | "test_data.shape, train_data.shape" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 19, 57 | "id": "f56185c0-f1cc-46e9-9795-4b394254ca24", 58 | "metadata": { 59 | "tags": [] 60 | }, 61 | "outputs": [ 62 | { 63 | "data": { 64 | "text/html": [ 65 | "
\n", 66 | "\n", 79 | "\n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | "
样品编号水分灰分
0012.26.04
1111.76.03
229.029.58
337.622.95
4414.419.87
\n", 121 | "
" 122 | ], 123 | "text/plain": [ 124 | " 样品编号 水分 灰分\n", 125 | "0 0 12.2 6.04\n", 126 | "1 1 11.7 6.03\n", 127 | "2 2 9.0 29.58\n", 128 | "3 3 7.6 22.95\n", 129 | "4 4 14.4 19.87" 130 | ] 131 | }, 132 | "execution_count": 19, 133 | "metadata": {}, 134 | "output_type": "execute_result" 135 | } 136 | ], 137 | "source": [ 138 | "train_label.head()" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 21, 144 | "id": "62faf00b-b406-4e14-b788-bb227c1c7928", 145 | "metadata": { 146 | "tags": [] 147 | }, 148 | "outputs": [], 149 | "source": [ 150 | "from sklearn.linear_model import LinearRegression\n", 151 | "\n", 152 | "m = LinearRegression()\n", 153 | "m.fit(train_data.iloc[:, 1:], train_label['水分'])\n", 154 | "submit['水分'] = m.predict(test_data.iloc[:, 1:])\n", 155 | "\n", 156 | "m = LinearRegression()\n", 157 | "m.fit(train_data.iloc[:, 1:], train_label['灰分'])\n", 158 | "submit['灰分'] = m.predict(test_data.iloc[:, 1:])\n", 159 | "\n", 160 | "submit.to_csv('lr.csv', index=None)" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "id": "36ed68c8-e3f6-42ba-afb1-4ffc02885968", 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [] 170 | } 171 | ], 172 | "metadata": { 173 | "kernelspec": { 174 | "display_name": "Python 3 (ipykernel)", 175 | "language": "python", 176 | "name": "python3.10" 177 | }, 178 | "language_info": { 179 | "codemirror_mode": { 180 | "name": "ipython", 181 | "version": 3 182 | }, 183 | "file_extension": ".py", 184 | "mimetype": "text/x-python", 185 | "name": "python", 186 | "nbconvert_exporter": "python", 187 | "pygments_lexer": "ipython3", 188 | "version": "3.9.10" 189 | }, 190 | "widgets": { 191 | "application/vnd.jupyter.widget-state+json": { 192 | "state": {}, 193 | "version_major": 2, 194 | "version_minor": 0 195 | } 196 | } 197 | }, 198 | "nbformat": 4, 199 | "nbformat_minor": 5 200 | } 201 | -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛2023/能源消耗预测挑战赛_baseline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 4, 6 | "id": "90ded076-1959-49b5-8021-2f954c96d92f", 7 | "metadata": { 8 | "tags": [] 9 | }, 10 | "outputs": [], 11 | "source": [ 12 | "import pandas as pd" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 17, 18 | "id": "0ecb9839-fe23-47a1-8a5f-9e5ee07bd8a9", 19 | "metadata": { 20 | "tags": [] 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "train_data = pd.read_csv(\"能源消耗预测挑战赛公开数据/train.csv\")\n", 25 | "test_data = pd.read_csv(\"能源消耗预测挑战赛公开数据/test.csv\")" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 18, 31 | "id": "827e9054-cdb6-41f7-a4b3-132d22257400", 32 | "metadata": { 33 | "tags": [] 34 | }, 35 | "outputs": [ 36 | { 37 | "data": { 38 | "text/html": [ 39 | "
\n", 40 | "\n", 53 | "\n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | "
dateid
020191227000
120191227001
220191227002
320191227003
420191227004
.........
598752019123123494
598762019123123495
598772019123123496
598782019123123497
598792019123123498
\n", 119 | "

59880 rows × 2 columns

\n", 120 | "
" 121 | ], 122 | "text/plain": [ 123 | " date id\n", 124 | "0 2019122700 0\n", 125 | "1 2019122700 1\n", 126 | "2 2019122700 2\n", 127 | "3 2019122700 3\n", 128 | "4 2019122700 4\n", 129 | "... ... ...\n", 130 | "59875 2019123123 494\n", 131 | "59876 2019123123 495\n", 132 | "59877 2019123123 496\n", 133 | "59878 2019123123 497\n", 134 | "59879 2019123123 498\n", 135 | "\n", 136 | "[59880 rows x 2 columns]" 137 | ] 138 | }, 139 | "execution_count": 18, 140 | "metadata": {}, 141 | "output_type": "execute_result" 142 | } 143 | ], 144 | "source": [ 145 | "test_data" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 19, 151 | "id": "7bf021cb-9954-4830-b3bc-c9627fc4011a", 152 | "metadata": { 153 | "tags": [] 154 | }, 155 | "outputs": [], 156 | "source": [ 157 | "train_data['new_date'] = pd.to_datetime(train_data['date'], format='%Y%m%d%H')\n", 158 | "test_data['new_date'] = pd.to_datetime(test_data['date'], format='%Y%m%d%H')" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 20, 164 | "id": "682eda08-76c4-446d-8e35-1055a50ab8e8", 165 | "metadata": { 166 | "tags": [] 167 | }, 168 | "outputs": [], 169 | "source": [ 170 | "train_data['new_date_hour'] = train_data['new_date'].dt.hour\n", 171 | "test_data['new_date_hour'] = test_data['new_date'].dt.hour" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 23, 177 | "id": "9e5a60b9-b677-44b0-8d63-87aea9213d0e", 178 | "metadata": { 179 | "tags": [] 180 | }, 181 | "outputs": [], 182 | "source": [ 183 | "mean_target = train_data.groupby(['id', \"new_date_hour\"])['target'].mean().reset_index()" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": 24, 189 | "id": "a2330daa-8885-4756-b905-a404d975f5e0", 190 | "metadata": { 191 | "tags": [] 192 | }, 193 | "outputs": [], 194 | "source": [ 195 | "pd.merge(test_data, mean_target, \n", 196 | " on=['id', \"new_date_hour\"], how='left'\n", 197 | ")[['date', 'id', 'target']].to_csv('submit.csv', index=None)" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": null, 203 | "id": "1de2bb6e-fc05-469c-ba33-b8185778e393", 204 | "metadata": {}, 205 | "outputs": [], 206 | "source": [] 207 | } 208 | ], 209 | "metadata": { 210 | "kernelspec": { 211 | "display_name": "Python 3 (ipykernel)", 212 | "language": "python", 213 | "name": "python3.10" 214 | }, 215 | "language_info": { 216 | "codemirror_mode": { 217 | "name": "ipython", 218 | "version": 3 219 | }, 220 | "file_extension": ".py", 221 | "mimetype": "text/x-python", 222 | "name": "python", 223 | "nbconvert_exporter": "python", 224 | "pygments_lexer": "ipython3", 225 | "version": "3.9.10" 226 | }, 227 | "widgets": { 228 | "application/vnd.jupyter.widget-state+json": { 229 | "state": {}, 230 | "version_major": 2, 231 | "version_minor": 0 232 | } 233 | } 234 | }, 235 | "nbformat": 4, 236 | "nbformat_minor": 5 237 | } 238 | -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛2023/自动驾驶疲劳检测挑战赛_baseline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 8, 6 | "id": "96596192-bd3b-4b9a-8b72-6060f63ba75b", 7 | "metadata": { 8 | "tags": [] 9 | }, 10 | "outputs": [], 11 | "source": [ 12 | "import numpy as np\n", 13 | "import pandas as pd\n", 14 | "from sklearn.svm import LinearSVC" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 11, 20 | "id": 
"d00f1077-eb0c-4dc4-b80d-231590d59655", 21 | "metadata": { 22 | "tags": [] 23 | }, 24 | "outputs": [], 25 | "source": [ 26 | "train_data = np.load('自动驾驶疲劳检测挑战赛公开数据-更新/train.npy')\n", 27 | "test_data = np.load('自动驾驶疲劳检测挑战赛公开数据-更新/test.npy')\n", 28 | "train_label = pd.read_csv('自动驾驶疲劳检测挑战赛公开数据-更新/train_label.csv', header=None)" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 15, 34 | "id": "7afcf7bd-fa8b-41b6-ace9-fbbe884b888b", 35 | "metadata": { 36 | "tags": [] 37 | }, 38 | "outputs": [ 39 | { 40 | "data": { 41 | "text/plain": [ 42 | "((10000, 64, 64, 3), (15000, 64, 64, 3))" 43 | ] 44 | }, 45 | "execution_count": 15, 46 | "metadata": {}, 47 | "output_type": "execute_result" 48 | } 49 | ], 50 | "source": [ 51 | "train_data.shape, test_data.shape" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "id": "53e64dd9-7aee-4248-acf3-03e7106543ab", 58 | "metadata": { 59 | "tags": [] 60 | }, 61 | "outputs": [], 62 | "source": [ 63 | "m = LinearSVC()\n", 64 | "m.fit(train_data.reshape(10000, -1), train_label)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 17, 70 | "id": "5a609443-9af4-4e7c-a899-bd06a492a4dd", 71 | "metadata": { 72 | "tags": [] 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "pd.DataFrame(m.predict(test_data.reshape(15000, -1))).to_csv('submit.csv', index=None, header=None)" 77 | ] 78 | } 79 | ], 80 | "metadata": { 81 | "kernelspec": { 82 | "display_name": "Python 3 (ipykernel)", 83 | "language": "python", 84 | "name": "python3.10" 85 | }, 86 | "language_info": { 87 | "codemirror_mode": { 88 | "name": "ipython", 89 | "version": 3 90 | }, 91 | "file_extension": ".py", 92 | "mimetype": "text/x-python", 93 | "name": "python", 94 | "nbconvert_exporter": "python", 95 | "pygments_lexer": "ipython3", 96 | "version": "3.9.10" 97 | }, 98 | "widgets": { 99 | "application/vnd.jupyter.widget-state+json": { 100 | "state": {}, 101 | "version_major": 2, 102 | "version_minor": 0 103 | } 104 | } 105 | }, 106 | "nbformat": 4, 107 | "nbformat_minor": 5 108 | } 109 | -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛2023/跨境电商效果广告ROI预测挑战赛_baseline.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import seaborn as sns 5 | 6 | train_data = pd.read_csv('跨境电商效果广告ROI预测挑战赛公开数据/train.csv') 7 | test_data = pd.read_csv('跨境电商效果广告ROI预测挑战赛公开数据/testA.csv') 8 | 9 | train_data['datetime'] = pd.to_datetime(train_data['datetime']) 10 | test_data['datetime'] = pd.to_datetime(test_data['datetime']) 11 | train_data['datetime_hour'] = train_data['datetime'].dt.hour 12 | test_data['datetime_hour'] = test_data['datetime'].dt.hour 13 | 14 | train_data.drop('datetime', axis=1, inplace=True) 15 | test_data.drop('datetime', axis=1, inplace=True) 16 | 17 | from sklearn.preprocessing import LabelEncoder 18 | 19 | for col in ['ad_id', 'ad_set_id', 'campaign_id', 'product_id', 'account_id', 'post_id_emb', 'post_type', 'countries']: 20 | lbl = LabelEncoder() 21 | lbl.fit(list(train_data[col]) + list(test_data[col])) 22 | train_data[col] = lbl.transform(list(train_data[col])) 23 | test_data[col] = lbl.transform(list(test_data[col])) 24 | 25 | from lightgbm import LGBMRegressor 26 | model = LGBMRegressor() 27 | 28 | train_data['product_id_roi_mean'] = train_data['product_id'].map(train_data.groupby(['product_id'])['roi'].mean()) 29 | test_data['product_id_roi_mean'] = 
test_data['product_id'].map(train_data.groupby(['product_id'])['roi'].mean()) 30 | 31 | train_data['account_id_roi_mean'] = train_data['account_id'].map(train_data.groupby(['account_id'])['roi'].mean()) 32 | test_data['account_id_roi_mean'] = test_data['account_id'].map(train_data.groupby(['account_id'])['roi'].mean()) 33 | 34 | train_data['countries_roi_mean'] = train_data['countries'].map(train_data.groupby(['countries'])['roi'].mean()) 35 | test_data['countries_roi_mean'] = test_data['countries'].map(train_data.groupby(['countries'])['roi'].mean()) 36 | 37 | train_data['datetime_hour_roi_mean'] = train_data['datetime_hour'].map(train_data.groupby(['datetime_hour'])['roi'].mean()) 38 | test_data['datetime_hour_roi_mean'] = test_data['datetime_hour'].map(train_data.groupby(['datetime_hour'])['roi'].mean()) 39 | 40 | model.fit( 41 | train_data.iloc[:].drop('roi', axis=1), 42 | train_data.iloc[:]['roi'], categorical_feature=['ad_id', 'ad_set_id', 'campaign_id', 'product_id', 'account_id', 'post_id_emb', 'post_type', 'countries'] 43 | ) 44 | 45 | df = pd.read_csv('提交示例.csv') 46 | df['roi'] = model.predict(test_data.iloc[:].drop('uuid', axis=1)) 47 | df.to_csv('submit.csv', index=None) 48 | -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛2023/通信系统调制格式识别与分类挑战赛_baseline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 35, 6 | "id": "617a981e-a469-492b-87ef-4f527e714e19", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import numpy as np\n", 11 | "import pandas as pd" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 36, 17 | "id": "3d880c58-2e16-4afb-bc5d-a7412eb24a62", 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "y_train = np.load('通信调制格式识别与分类数据集公开数据/训练集/Y_train.npy')\n", 22 | "x_train = np.load('通信调制格式识别与分类数据集公开数据/训练集/X_train.npy')\n", 23 | "\n", 24 | "x_test = np.load('通信调制格式识别与分类数据集公开数据/测试集/X_test.npy')" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 37, 30 | "id": "9e3fdb86-5051-49d8-aebc-79e71e2d8e7f", 31 | "metadata": {}, 32 | "outputs": [ 33 | { 34 | "data": { 35 | "text/plain": [ 36 | "(176000, 2, 128)" 37 | ] 38 | }, 39 | "execution_count": 37, 40 | "metadata": {}, 41 | "output_type": "execute_result" 42 | } 43 | ], 44 | "source": [ 45 | "x_train.shape" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 38, 51 | "id": "2393a30d-c902-4507-90e4-e763867e9074", 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "data": { 56 | "text/plain": [ 57 | "1 16152\n", 58 | "7 16063\n", 59 | "2 16047\n", 60 | "4 16024\n", 61 | "6 16018\n", 62 | "8 16010\n", 63 | "9 15977\n", 64 | "0 15944\n", 65 | "5 15942\n", 66 | "3 15923\n", 67 | "10 15900\n", 68 | "dtype: int64" 69 | ] 70 | }, 71 | "execution_count": 38, 72 | "metadata": {}, 73 | "output_type": "execute_result" 74 | } 75 | ], 76 | "source": [ 77 | "pd.DataFrame(y_train.argmax(1)).value_counts()" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 43, 83 | "id": "5b7aa1bd-6f1a-4c9f-b53f-f29264bad295", 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "x_train = x_train.reshape(-1, 256)\n", 88 | "x_test = x_test.reshape(-1, 256)" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 46, 94 | "id": "803aea89-1443-44bb-9853-2cecd0990419", 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "from sklearn.linear_model 
import SGDClassifier\n", 99 | "from sklearn.naive_bayes import GaussianNB\n", 100 | "from sklearn.neighbors import KNeighborsClassifier\n", 101 | "from sklearn.preprocessing import OneHotEncoder\n", 102 | "\n", 103 | "from sklearn.model_selection import cross_val_predict" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 47, 109 | "id": "a835e17e-2659-4b94-81b4-9d3aeb15b647", 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "pred1 = cross_val_predict(GaussianNB(), x_train, y_train.argmax(1))" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 48, 119 | "id": "7be151f1-51a5-4eb3-801b-947ad2884719", 120 | "metadata": {}, 121 | "outputs": [ 122 | { 123 | "data": { 124 | "text/plain": [ 125 | "0.20875" 126 | ] 127 | }, 128 | "execution_count": 48, 129 | "metadata": {}, 130 | "output_type": "execute_result" 131 | } 132 | ], 133 | "source": [ 134 | "(y_train.argmax(1) == pred1).mean()" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 49, 140 | "id": "163513e6-3f89-4755-b787-1cd01ea587da", 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "model = GaussianNB().fit(x_train, y_train.argmax(1))" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 50, 150 | "id": "e9c0d45e-9fb7-4d7c-85a0-93e25258c467", 151 | "metadata": {}, 152 | "outputs": [ 153 | { 154 | "data": { 155 | "text/plain": [ 156 | "array([8, 7, 3, ..., 6, 2, 7])" 157 | ] 158 | }, 159 | "execution_count": 50, 160 | "metadata": {}, 161 | "output_type": "execute_result" 162 | } 163 | ], 164 | "source": [ 165 | "model.predict(x_test)" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 51, 171 | "id": "e2e614c7-6bfc-40a5-9f0c-9d3c53f9b90b", 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "submit = np.zeros((len(x_test), 11))\n", 176 | "submit[np.arange(len(x_test)), model.predict(x_test)] = 1" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 54, 182 | "id": "9df998da-70ad-420c-b3ac-f338c527824a", 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "np.save('submit.npy', submit)" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "id": "835f8204-859d-4dac-ae13-8a88a874b139", 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [] 196 | } 197 | ], 198 | "metadata": { 199 | "kernelspec": { 200 | "display_name": "Python 3 (ipykernel)", 201 | "language": "python", 202 | "name": "python3.10" 203 | }, 204 | "language_info": { 205 | "codemirror_mode": { 206 | "name": "ipython", 207 | "version": 3 208 | }, 209 | "file_extension": ".py", 210 | "mimetype": "text/x-python", 211 | "name": "python", 212 | "nbconvert_exporter": "python", 213 | "pygments_lexer": "ipython3", 214 | "version": "3.9.10" 215 | } 216 | }, 217 | "nbformat": 4, 218 | "nbformat_minor": 5 219 | } 220 | -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛2024/README.md: -------------------------------------------------------------------------------- 1 | https://challenge.xfyun.cn/?ch=dw24_AtTCK9 2 | -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛2024/低资源文本翻译挑战赛_baseline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "7e53efda-8330-4f93-8b0a-a26d25c95ce1", 7 | "metadata": {}, 8 | 
"outputs": [ 9 | { 10 | "name": "stderr", 11 | "output_type": "stream", 12 | "text": [ 13 | "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n" 14 | ] 15 | }, 16 | { 17 | "data": { 18 | "application/vnd.jupyter.widget-view+json": { 19 | "model_id": "c18328a4fa614a84b23facc7e233de76", 20 | "version_major": 2, 21 | "version_minor": 0 22 | }, 23 | "text/plain": [ 24 | "Loading checkpoint shards: 0%| | 0/4 [00:00<|START_OF_TURN_TOKEN|><|USER_TOKEN|>Anneme onu ne kadar sevdiğimi anlatan bir mektup yaz<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>Sevgili Anne,\n", 35 | "\n", 36 | "Bugün sana, seni ne kadar çok sevdiğimi ifade etmek istiyorum. Senin için hissettiklerim kelimelerle ifade edilemeyecek kadar derin ve güçlü. Sen benim hayatımın ışığı, en büyük destekçim ve en sevdiğim insansın.\n", 37 | "\n", 38 | "Her gün seninle geçirdiğim her an için minnettarım. Senin sevgin, rehberliğin ve desteğin olmadan hayatım çok farklı olurdu. Bana verdiğin\n" 39 | ] 40 | } 41 | ], 42 | "source": [ 43 | "# pip install transformers==4.41.1\n", 44 | "from transformers import AutoTokenizer, AutoModelForCausalLM\n", 45 | "\n", 46 | "# https://huggingface.co/CohereForAI/aya-23-8B/\n", 47 | "model_id = \"/home/lyz/hf-models/aya-23-8B/\" # 本地路径\n", 48 | "tokenizer = AutoTokenizer.from_pretrained(model_id)\n", 49 | "model = AutoModelForCausalLM.from_pretrained(model_id)\n", 50 | "\n", 51 | "# Format message with the command-r-plus chat template\n", 52 | "messages = [{\"role\": \"user\", \"content\": \"Anneme onu ne kadar sevdiğimi anlatan bir mektup yaz\"}]\n", 53 | "input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors=\"pt\")\n", 54 | "## <|START_OF_TURN_TOKEN|><|USER_TOKEN|>Anneme onu ne kadar sevdiğimi anlatan bir mektup yaz<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>\n", 55 | "\n", 56 | "gen_tokens = model.generate(\n", 57 | " input_ids, \n", 58 | " max_new_tokens=100, \n", 59 | " do_sample=True, \n", 60 | " temperature=0.3,\n", 61 | " )\n", 62 | "\n", 63 | "gen_text = tokenizer.decode(gen_tokens[0])\n", 64 | "print(gen_text)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "id": "21d274a9-b0da-425c-82bb-bc93c0a8c254", 71 | "metadata": {}, 72 | "outputs": [ 73 | { 74 | "name": "stderr", 75 | "output_type": "stream", 76 | "text": [ 77 | " 0%| | 3/1000 [00:30<2:58:12, 10.72s/it]" 78 | ] 79 | } 80 | ], 81 | "source": [ 82 | "from tqdm import tqdm\n", 83 | "test_lines = open('testA.nl').readlines()\n", 84 | "\n", 85 | "result = []\n", 86 | "for line in tqdm(test_lines):\n", 87 | " # Format message with the command-r-plus chat template\n", 88 | " messages = [{\"role\": \"user\", \"content\": f\"将下面荷兰语翻译为中文:{line}\"}]\n", 89 | " input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors=\"pt\")\n", 90 | " \n", 91 | " gen_tokens = model.generate(\n", 92 | " input_ids, \n", 93 | " max_new_tokens=100, \n", 94 | " do_sample=True, \n", 95 | " temperature=0.3,\n", 96 | " )\n", 97 | " \n", 98 | " gen_text = tokenizer.decode(gen_tokens[0])\n", 99 | " result.append(\n", 100 | " gen_text.split('<|CHATBOT_TOKEN|>')[1].split('<|')[0]\n", 101 | " )\n", 102 | "\n", 103 | " with open('submit.csv', 'a') as up:\n", 104 | " up.write(result[-1].replace('\\n', '') + '\\n')" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "id": 
"06b375b3-8a5f-411d-b1e1-049e6350c5c5", 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [] 114 | } 115 | ], 116 | "metadata": { 117 | "kernelspec": { 118 | "display_name": "py3.11", 119 | "language": "python", 120 | "name": "py3.11" 121 | }, 122 | "language_info": { 123 | "codemirror_mode": { 124 | "name": "ipython", 125 | "version": 3 126 | }, 127 | "file_extension": ".py", 128 | "mimetype": "text/x-python", 129 | "name": "python", 130 | "nbconvert_exporter": "python", 131 | "pygments_lexer": "ipython3", 132 | "version": "3.11.8" 133 | } 134 | }, 135 | "nbformat": 4, 136 | "nbformat_minor": 5 137 | } 138 | -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛2024/大模型能力评测中文成语释义与解析_baseline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "id": "714855ed-9040-44b1-a930-86edf0952277", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "data": { 11 | "application/vnd.jupyter.widget-view+json": { 12 | "model_id": "4428241dd396428683892fe51ed5d438", 13 | "version_major": 2, 14 | "version_minor": 0 15 | }, 16 | "text/plain": [ 17 | "Loading checkpoint shards: 0%| | 0/2 [00:00 2 | 3 | 4 | 5 | 数据竞赛开源项目 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 |
14 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | -------------------------------------------------------------------------------- /tutorial/bert/README.md: -------------------------------------------------------------------------------- 1 | 2 | ## transformers教程 3 | 4 | https://github.com/huggingface/transformers/tree/main/examples/pytorch 5 | 6 | https://github.com/huggingface/transformers/tree/main/examples/tensorflow 7 | -------------------------------------------------------------------------------- /tutorial/jax/README.md: -------------------------------------------------------------------------------- 1 | ## Jax 2 | 3 | - [https://jax.readthedocs.io/en/latest/](https://jax.readthedocs.io/en/latest/) 4 | - [🤗 Hugging Face Models](https://huggingface.co/models?library=jax) 5 | 6 | ## Flax 7 | 8 | - Doc: [https://flax.readthedocs.io/en/latest/index.html](https://flax.readthedocs.io/en/latest/index.html) 9 | - Github: [https://github.com/google/flax](https://github.com/google/flax) 10 | 11 | ## Demo 12 | 13 | - [Digit Recognizer using JAX/FLAX (Toy image dataset)](https://www.kaggle.com/nilaychauhan/digit-recognizer-using-jax-flax) 14 | - [Dog Breed Classification using JAX/FLAX (Image Dataset)](https://www.kaggle.com/nilaychauhan/dog-breed-classification-using-jax-and-flax) 15 | - [Jigsaw toxic comment classification using JAX/FLAX (Text Dataset)](https://www.kaggle.com/nilaychauhan/jigsaw-toxic-comment-classification-using-jax-flax) 16 | - [Cornell BirdCall classification using JAX/FLAX (Audio dataset)](https://www.kaggle.com/nilaychauhan/cornell-birdcall-audio-recognition-using-jax-flax) 17 | -------------------------------------------------------------------------------- /tutorial/paddlepaddle/README.md: -------------------------------------------------------------------------------- 1 | ## 用PaddlePaddle打比赛 2 | 3 | ### 计算机视觉比赛 4 | 5 | - 科大讯飞-人脸关键点检测挑战赛 6 | - [科大讯飞-人脸关键点检测挑战赛:基础思路](https://aistudio.baidu.com/aistudio/projectdetail/2772561) 7 | - [科大讯飞-人脸关键点检测挑战赛:进阶思路预训练模型](https://aistudio.baidu.com/aistudio/projectdetail/2792492) 8 | - 科大讯飞-电商图像检索挑战赛 9 | - [科大讯飞-电商图像检索挑战赛:基础思路CNN相似度](https://aistudio.baidu.com/aistudio/projectdetail/2798206) 10 | - [AIWIN 手写体OCR识别竞赛](https://aistudio.baidu.com/aistudio/projectdetail/2612313) 11 | - [DataFountain-交通标志分类识别:CNN多分类](https://aistudio.baidu.com/aistudio/projectdetail/3171955) 12 | - [DataFountain-天气以及时间分类:CNN多标签分类](https://aistudio.baidu.com/aistudio/projectdetail/3169455) 13 | 14 | ### 自然语言处理比赛 15 | 16 | - [科大讯飞-学术论文分类挑战赛:ERNIE](https://aistudio.baidu.com/aistudio/projectdetail/3162632) 17 | - [科大讯飞-中文问题相似度挑战赛:ERNIE-NSP](https://aistudio.baidu.com/aistudio/projectdetail/3168859) 18 | - [DataFountain-疫情期间网民情绪识别:ERNIE分类](https://aistudio.baidu.com/aistudio/projectdetail/3172384) 19 | - [科大讯飞-中文成语填空挑战赛:ERNIE MASK填空](https://aistudio.baidu.com/aistudio/projectdetail/3169048) 20 | 21 | ### 推荐系统&CTR比赛 22 | 23 | - [使用Paddle完成图书推荐](https://aistudio.baidu.com/aistudio/projectdetail/2556840) 24 | - [WSDM-爱奇艺:用户留存预测挑战赛](https://aistudio.baidu.com/aistudio/projectdetail/2715522) 25 | - [WSDM-亚马逊跨境电商推荐topline](https://aistudio.baidu.com/aistudio/projectdetail/3142643) 26 | 27 | ### 结构化&时间序列 28 | 29 | - [AIWIN 心电图智能诊断竞赛](https://aistudio.baidu.com/aistudio/projectdetail/2653802) 30 | 31 | ## Paddle资料 32 | 33 | 问:AI Studio有什么学习资料? 
34 | - 项目环境介绍:https://ai.baidu.com/ai-doc/AISTUDIO/Dk3e2vxg9 35 | - Notebook环境:https://ai.baidu.com/ai-doc/AISTUDIO/sk3e2z8sb 36 | -------------------------------------------------------------------------------- /tutorial/rank-ensemble.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 66, 6 | "metadata": { 7 | "execution": { 8 | "iopub.execute_input": "2022-07-27T02:17:11.148005Z", 9 | "iopub.status.busy": "2022-07-27T02:17:11.147455Z", 10 | "iopub.status.idle": "2022-07-27T02:17:11.153326Z", 11 | "shell.execute_reply": "2022-07-27T02:17:11.152655Z", 12 | "shell.execute_reply.started": "2022-07-27T02:17:11.147953Z" 13 | }, 14 | "tags": [] 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "from sklearn.metrics import roc_auc_score\n", 19 | "import numpy as np\n", 20 | "from scipy.stats import rankdata\n", 21 | "from sklearn.linear_model import LinearRegression" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 67, 27 | "metadata": { 28 | "execution": { 29 | "iopub.execute_input": "2022-07-27T02:17:11.327557Z", 30 | "iopub.status.busy": "2022-07-27T02:17:11.327015Z", 31 | "iopub.status.idle": "2022-07-27T02:17:11.335082Z", 32 | "shell.execute_reply": "2022-07-27T02:17:11.334607Z", 33 | "shell.execute_reply.started": "2022-07-27T02:17:11.327495Z" 34 | }, 35 | "tags": [] 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "t_true = np.random.randint(0, 2, size=1000)\n", 40 | "\n", 41 | "pred1 = t_true + np.random.randn(1000)\n", 42 | "pred1 = np.clip(pred1, 0, 1)\n", 43 | "\n", 44 | "pred2 = t_true + np.random.randn(1000) - 0.2\n", 45 | "pred2 = np.clip(pred2, 0, 1)\n", 46 | "\n", 47 | "pred3 = t_true + np.random.randn(1000) - 0.1\n", 48 | "pred3 = np.clip(pred3, 0, 1)" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 68, 54 | "metadata": { 55 | "execution": { 56 | "iopub.execute_input": "2022-07-27T02:17:11.898040Z", 57 | "iopub.status.busy": "2022-07-27T02:17:11.897626Z", 58 | "iopub.status.idle": "2022-07-27T02:17:11.908813Z", 59 | "shell.execute_reply": "2022-07-27T02:17:11.908183Z", 60 | "shell.execute_reply.started": "2022-07-27T02:17:11.897992Z" 61 | }, 62 | "tags": [] 63 | }, 64 | "outputs": [ 65 | { 66 | "data": { 67 | "text/plain": [ 68 | "(0.7140375941955907, 0.7406884932307235, 0.7664949835720489)" 69 | ] 70 | }, 71 | "execution_count": 68, 72 | "metadata": {}, 73 | "output_type": "execute_result" 74 | } 75 | ], 76 | "source": [ 77 | "roc_auc_score(t_true, pred1), roc_auc_score(t_true, pred2), roc_auc_score(t_true, pred3)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 69, 83 | "metadata": { 84 | "execution": { 85 | "iopub.execute_input": "2022-07-27T02:17:11.909933Z", 86 | "iopub.status.busy": "2022-07-27T02:17:11.909738Z", 87 | "iopub.status.idle": "2022-07-27T02:17:11.972134Z", 88 | "shell.execute_reply": "2022-07-27T02:17:11.971206Z", 89 | "shell.execute_reply.started": "2022-07-27T02:17:11.909909Z" 90 | }, 91 | "tags": [] 92 | }, 93 | "outputs": [ 94 | { 95 | "data": { 96 | "text/plain": [ 97 | "0.8650706942160006" 98 | ] 99 | }, 100 | "execution_count": 69, 101 | "metadata": {}, 102 | "output_type": "execute_result" 103 | } 104 | ], 105 | "source": [ 106 | "roc_auc_score(t_true, (pred1 + pred2+ pred3) / 3)" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 70, 112 | "metadata": { 113 | "execution": { 114 | "iopub.execute_input": "2022-07-27T02:17:11.977691Z", 115 | 
"iopub.status.busy": "2022-07-27T02:17:11.977440Z", 116 | "iopub.status.idle": "2022-07-27T02:17:12.034941Z", 117 | "shell.execute_reply": "2022-07-27T02:17:12.034030Z", 118 | "shell.execute_reply.started": "2022-07-27T02:17:11.977673Z" 119 | }, 120 | "tags": [] 121 | }, 122 | "outputs": [ 123 | { 124 | "data": { 125 | "text/plain": [ 126 | "0.8575510547104799" 127 | ] 128 | }, 129 | "execution_count": 70, 130 | "metadata": {}, 131 | "output_type": "execute_result" 132 | } 133 | ], 134 | "source": [ 135 | "roc_auc_score(t_true, (pred1 + pred2**0.2+ pred3**0.1))" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 71, 141 | "metadata": { 142 | "execution": { 143 | "iopub.execute_input": "2022-07-27T02:17:12.896308Z", 144 | "iopub.status.busy": "2022-07-27T02:17:12.895739Z", 145 | "iopub.status.idle": "2022-07-27T02:17:12.905122Z", 146 | "shell.execute_reply": "2022-07-27T02:17:12.904599Z", 147 | "shell.execute_reply.started": "2022-07-27T02:17:12.896255Z" 148 | }, 149 | "tags": [] 150 | }, 151 | "outputs": [ 152 | { 153 | "data": { 154 | "text/plain": [ 155 | "0.8664093421215869" 156 | ] 157 | }, 158 | "execution_count": 71, 159 | "metadata": {}, 160 | "output_type": "execute_result" 161 | } 162 | ], 163 | "source": [ 164 | "pred = rankdata(pred1) + rankdata(pred2)+ rankdata(pred3)\n", 165 | "pred /= 1000\n", 166 | "pred /= 3\n", 167 | "roc_auc_score(t_true, pred)" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 65, 173 | "metadata": { 174 | "execution": { 175 | "iopub.execute_input": "2022-07-27T02:14:06.895575Z", 176 | "iopub.status.busy": "2022-07-27T02:14:06.895097Z", 177 | "iopub.status.idle": "2022-07-27T02:14:06.901843Z", 178 | "shell.execute_reply": "2022-07-27T02:14:06.901326Z", 179 | "shell.execute_reply.started": "2022-07-27T02:14:06.895537Z" 180 | } 181 | }, 182 | "outputs": [ 183 | { 184 | "data": { 185 | "text/plain": [ 186 | "array([1., 3., 2.])" 187 | ] 188 | }, 189 | "execution_count": 65, 190 | "metadata": {}, 191 | "output_type": "execute_result" 192 | } 193 | ], 194 | "source": [ 195 | "rankdata([1,3,2])" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [] 204 | } 205 | ], 206 | "metadata": { 207 | "kernelspec": { 208 | "display_name": "Python 3 (ipykernel)", 209 | "language": "python", 210 | "name": "python3.10" 211 | }, 212 | "language_info": { 213 | "codemirror_mode": { 214 | "name": "ipython", 215 | "version": 3 216 | }, 217 | "file_extension": ".py", 218 | "mimetype": "text/x-python", 219 | "name": "python", 220 | "nbconvert_exporter": "python", 221 | "pygments_lexer": "ipython3", 222 | "version": "3.9.10" 223 | } 224 | }, 225 | "nbformat": 4, 226 | "nbformat_minor": 4 227 | } 228 | -------------------------------------------------------------------------------- /tutorial/sklearn/README.md: -------------------------------------------------------------------------------- 1 | https://scikit-learn.org/stable/ 2 | 3 | https://intel.github.io/scikit-learn-intelex/ 4 | -------------------------------------------------------------------------------- /tutorial/tree/README.md: -------------------------------------------------------------------------------- 1 | ## XGBoost 2 | 3 | - XGBoost,https://xgboost.readthedocs.io/ 4 | - 参数介绍:https://xgboost.readthedocs.io/en/latest/parameter.html 5 | 6 | ## LightGBM 7 | 8 | - LightGBM,https://lightgbm.readthedocs.io/en/latest/ 9 | - 
参数介绍:https://lightgbm.readthedocs.io/en/latest/Parameters.html 10 | 11 | ## CatBoost 12 | 13 | - CatBoost,https://yandex.com/dev/catboost/ 14 | - 参数介绍:https://catboost.ai/docs/ 15 | --------------------------------------------------------------------------------
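
As a quick complement to the parameter references above, here is a minimal LightGBM training sketch. It is only an illustration under assumed inputs: the file name `train.csv`, the `label` column, and the parameter values are placeholders, not files or settings from this repository.

```python
# Minimal LightGBM sketch; the file name, column names and parameter values
# below are illustrative placeholders, not taken from this repository.
import lightgbm as lgb
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv('train.csv')                 # hypothetical training table with a binary 'label' column
X, y = df.drop('label', axis=1), df['label']
X_tr, X_va, y_tr, y_va = train_test_split(X, y, test_size=0.2, random_state=42)

model = lgb.LGBMClassifier(
    n_estimators=1000,      # train many trees and let early stopping pick the best iteration
    learning_rate=0.05,
    num_leaves=63,          # main capacity knob, see the LightGBM parameter docs above
    subsample=0.8,          # row sampling per bagging round
    subsample_freq=1,       # enable bagging (subsample is ignored when this is 0)
    colsample_bytree=0.8,   # feature sampling per tree
)
model.fit(
    X_tr, y_tr,
    eval_set=[(X_va, y_va)],
    callbacks=[lgb.early_stopping(100), lgb.log_evaluation(100)],
)
print('best iteration:', model.best_iteration_)
```

The same pattern (a large tree budget plus early stopping on a held-out validation set) carries over to XGBoost and CatBoost, with their own parameter names documented at the links above.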