├── LICENSE ├── README.md ├── competition ├── 2020DCIC-创新大赛大数据赛道 │ ├── README.md │ └── Task1-EDA.ipynb ├── 2021阿里云供应链大赛——需求预测及单级库存优化 │ └── baseline.ipynb ├── 2023全球智能汽车AI挑战赛——赛道二:智能驾驶汽车虚拟仿真视频数据理解赛道 │ ├── README.md │ └── clip_demo (2).ipynb ├── 2024“大运河杯”数据开发应用创新大赛——城市治理_baseline.ipynb ├── 2024数字中国创新大赛DCIC │ └── 海上风电出力预测赛道 │ │ ├── README.md │ │ └── dcic_baseline.ipynb ├── AIWIN2021 │ ├── AIWIN-保险文本知识问答-baseline.ipynb │ ├── AIWIN-保险文本问答-submit.ipynb │ ├── AIWIN_互联网舆情企业风险-submit.ipynb │ ├── AIWIN_互联网舆情识别-NER.ipynb │ ├── AIWIN_互联网舆情识别-baseline.ipynb │ ├── README.md │ └── 致Great_互联网舆情企业风险事件的识别和预警.ipynb ├── AIWIN2023 │ ├── README.md │ ├── 中文网页自动导航_关键词_baseline.ipynb │ └── 研报类型识别baseline.ipynb ├── ATEC2022 │ ├── README.md │ └── task1-EDA-Model.ipynb ├── DC竞赛-AI助疫·口罩佩戴检测大赛 │ ├── 1_train.py │ ├── 2_predict.py │ └── README.md ├── DIGIX2021 │ └── README.md ├── DataFountain-CCFBDI-2021 │ ├── README.md │ └── 个贷违约预测-860.ipynb ├── DataFountain-三角形图计算算法设计及性能优化 │ └── README.md ├── DataFountain-乘用车细分市场销量预测 │ └── README.md ├── DataFountain-云计算时代的大数据查询分析优化 │ └── README.md ├── DataFountain-互联网新闻情感分析 │ ├── README.md │ └── bert_baseline.ipynb ├── DataFountain-互联网金融新实体发现 │ ├── README.md │ ├── bert-chinese-ner.zip │ └── bert_baseline.ipynb ├── DataFountain-企业网络资产及安全事件分析与可视化 │ └── README.md ├── DataFountain-企业非法集资风险预测 │ ├── 843 (1).ipynb │ └── README.md ├── DataFountain-基于OCR的身份证要素提取 │ └── README.md ├── DataFountain-多人种人脸识别 │ └── README.md ├── DataFountain-技术需求与技术成果项目之间关联度计算模型 │ ├── README.md │ └── bert_baseline.py ├── DataFountain-离散制造过程中典型工件的质量符合率预测 │ └── README.md ├── DataFountain-视频版权检测算法 │ ├── README.md │ └── ccf_video_baseline.ipynb ├── DataFountain-金融信息负面及主体判定 │ └── README.md ├── Kesci-中国华录杯人群密度检测 │ ├── README.md │ └── test.py ├── Tianchi-2020数字中国创新大赛—算法赛:智慧海洋建设 │ └── README.md ├── Tianchi-安泰杯跨境电商智能算法大赛 │ └── README.md ├── Tianchi-心电人机智能大赛心电异常事件预测 │ └── README.md ├── Tianchi-第三届阿里云安全算法挑战赛 │ ├── EDA.ipynb │ ├── GBM_old.ipynb │ ├── LGB_LinuX_0819.py │ ├── README.md │ ├── api.csv │ ├── finetune.ipynb │ └── gbm.py ├── TinyMind人民币面值&冠字号编码识别挑战赛 │ ├── .DS_Store │ ├── README.md │ ├── task1 │ │ ├── 1_train.ipynb │ │ ├── README.md │ │ └── predict_rmb.py │ └── task2 │ │ ├── .DS_Store │ │ ├── 1_train_faster_rcnn.py │ │ ├── 2_predict_faster_rcnn.py │ │ ├── 3_savejson.py │ │ ├── VOC2007.zip │ │ ├── crnn-pytorch │ │ ├── README.md │ │ ├── __init__.py │ │ ├── dataset │ │ │ ├── __init__.py │ │ │ ├── collate_fn.py │ │ │ ├── data_transform.py │ │ │ ├── test_data.py │ │ │ └── text_data.py │ │ ├── fold_tta.pkl │ │ ├── lr_policy.py │ │ ├── models │ │ │ ├── __init__.py │ │ │ ├── crnn.py │ │ │ └── model_loader.py │ │ ├── pb_rcnn_label.csv │ │ ├── submit.py │ │ ├── test.py │ │ ├── test2.py │ │ ├── test2_tta.py │ │ └── train.py │ │ ├── data │ │ └── data.json │ │ └── multi-digit-pytorch │ │ ├── .ipynb_checkpoints │ │ └── 未命名-checkpoint.ipynb │ │ ├── 1_train.py │ │ ├── 2_predict.py │ │ ├── example.log │ │ └── 未命名.ipynb ├── WSDM2022 │ └── README.md ├── biendata-智源&计算所-互联网虚假新闻检测挑战赛 │ ├── README.md │ └── task1_bert.ipynb ├── kaggle-allstate-claims-severity │ ├── README.md │ ├── XGB_encoding(LB1106.33084).py │ └── nn_bagging_1111.84364.py ├── kaggle-atecup-deepfake │ ├── README.md │ └── ffdi-resnet-baseline.ipynb ├── kaggle-quickdraw-doodle-recognition │ ├── 1_save2df.py │ ├── 2_train.py │ ├── EDA.ipynb │ ├── EDA_predict.ipynb │ ├── PlotLoss.ipynb │ ├── README.md │ └── Transform_Example.ipynb ├── kaggle-spaceship-titanic.ipynb ├── kaggle-two-sigma-connect-rental-listing-inquiries │ ├── README.md │ └── lgb.py ├── yanxishe-IMDB评论剧透检测 │ ├── 
README.md │ └── src │ │ └── ml.ipynb ├── yanxishe-人脸年龄识别 │ ├── 1_train.py │ ├── 1_train_pretrain.py │ ├── 2_predit.py │ ├── README.md │ └── 人脸年龄识别练习赛冠军源码_1575964312087.zip ├── yanxishe-喵脸关键点检测 │ ├── 1_train.py │ ├── 2_predict.py │ ├── README.md │ └── train_box.csv ├── yanxishe-白葡萄酒品质预测 │ ├── README.md │ ├── lgb_baseline.py │ └── winequality_dataset.zip ├── yanxishe-美食识别挑战(1):豆腐VS土豆 │ ├── 1_train.py │ ├── 2_predict.py │ └── README.md ├── yanxishe-肌肉活动电信号推测手势 │ ├── README.md │ └── lgb_baseline.ipynb ├── yanxishe-肺炎X光病灶识别 │ ├── 1_train.py │ ├── 2_predict.py │ └── README.md ├── yanxishe-胸腔X光肺炎检测 │ ├── 1_train.py │ ├── 2_predict.py │ └── README.md ├── 全球AI攻防挑战赛 │ ├── README.md │ ├── 全球AI攻防挑战赛—赛道一:大模型生图安全疫苗注入_baseline.ipynb │ └── 全球AI攻防挑战赛—赛道二:金融场景凭证篡改检测_baseline.ipynb ├── 点石-Retention Rate of Baidu Hao Kan APP Users │ ├── 1_splitdf.py │ ├── 2_baseline_1128.py │ ├── 2_baseline_1202.py │ ├── 2_baseline_1203_Train0.75989_Test0.75627.py │ ├── 2_baseline_1203_Train0.76103_Test0.75740.py │ ├── 2_baseline_1203_Train0.77218_Test0.76203.py │ ├── README.md │ └── featselect.py ├── 科大讯飞AI开发者大赛-事件抽取挑战 │ └── README.md ├── 科大讯飞AI开发者大赛-婴儿啼哭声识别挑战赛 │ ├── README.md │ └── cry_baseline.ipynb ├── 科大讯飞AI开发者大赛-温室温度预测挑战赛 │ ├── README.md │ └── baseline.py ├── 科大讯飞AI开发者大赛-脑PET图像分析和疾病预测挑战赛算法挑战大赛 │ ├── 1_train.py │ ├── 2_predict.py │ └── README.md ├── 科大讯飞AI开发者大赛2021 │ ├── 上海海事大学_蛋白质结构预测赛.ipynb │ ├── 中国农业大学_农作物生长情况识别挑战赛.ipynb │ ├── 中国农业大学_引导拍照挑战赛.ipynb │ ├── 中国科学技术大学_新冠肺炎声音诊断挑战赛.ipynb │ ├── 中文成语填空挑战赛 │ │ ├── README.md │ │ ├── baseline.py │ │ ├── gen_train_test.py │ │ └── run.sh │ ├── 中文问题相似度挑战赛 │ │ ├── README.md │ │ └── bert-nsp-xunfei.ipynb │ ├── 人脸关键点检测挑战赛 │ │ ├── README.md │ │ ├── face-keypoint2.ipynb │ │ ├── face-keypoint_kfold.ipynb │ │ └── face-keypoint_kfold_stronger.ipynb │ ├── 人脸情绪识别挑战赛 │ │ ├── README.md │ │ ├── keras_cnn_baseline.ipynb │ │ ├── pytorch_cnn_baseline-66.ipynb │ │ └── pytorch_cnn_baseline.ipynb │ ├── 北京林业大学_环境空气质量评价挑战赛.ipynb │ ├── 学术论文分类挑战赛 │ │ ├── README.md │ │ └── tfidf_baseline.ipynb │ ├── 安徽大学-脑部PETMR图像疾病预测挑战赛.ipynb │ ├── 广告图片素材分类_baseline.ipynb │ ├── 广告点击率预估挑战赛 │ │ ├── README.md │ │ └── 广告点击率预估挑战赛Baseline.ipynb │ ├── 智能硬件语音控制的时频图分类挑战赛.ipynb │ ├── 清华大学_智能硬件语音控制的时频图分类挑战赛.ipynb │ ├── 电商图像检索挑战赛 │ │ ├── README.md │ │ ├── cnn_arcface.ipynb │ │ ├── cnn_arcface_kfold.ipynb │ │ └── cnn_baseline.ipynb │ ├── 科大讯飞商店销量预测 │ │ ├── README.md │ │ └── lgb.py │ ├── 科大讯飞股份有限公司_基于用户画像的商品推荐挑战赛.ipynb │ ├── 科大讯飞股份有限公司_猪只盘点挑战赛.ipynb │ ├── 移动设备用户年龄和性别预测 │ │ ├── README.md │ │ └── 移动设备用户年龄和性别预测Baseline.ipynb │ ├── 蛋白质结构预测挑战赛.md │ └── 车辆贷款违约预测挑战赛 │ │ ├── Baseline.py │ │ └── README.md ├── 科大讯飞AI开发者大赛2022 │ ├── LED生产封装瑕疵识别挑战赛_baseline.ipynb │ ├── README.md │ ├── 中文对话文本匹配挑战赛_baseline.ipynb │ ├── 人员聚集识别挑战赛-baseline.ipynb │ ├── 作物引导拍照挑战赛_baseline.ipynb │ ├── 创意视角下的数字广告CTR预估-数据读取.ipynb │ ├── 创意视角下的数字广告CTR预估-模型搭建.ipynb │ ├── 商品销量智能预测挑战赛_baseline.ipynb │ ├── 国产平台动作识别挑战赛.ipynb │ ├── 基于论文摘要的文本分类与查询性问答-bert.ipynb │ ├── 基于论文摘要的文本分类与查询性问答-tfidf.ipynb │ ├── 房屋租赁价格预测挑战赛-baseline.ipynb │ ├── 智能家居使用场景识别挑战赛_baseline.ipynb │ ├── 智能硬件语音控制的时频图分类挑战赛2.0-baseline.ipynb │ ├── 机动车车牌识别挑战赛-baseline.ipynb │ ├── 汽车领域多语种迁移学习挑战赛-baseline-0.61.py │ ├── 电信客户流失预测_baseline.ipynb │ ├── 疫情微博情绪识别挑战赛-baseline.ipynb │ ├── 神经影像分析与疾病预测挑战赛_baseline.ipynb │ ├── 糖尿病遗传风险检测挑战赛-baseline.ipynb │ └── 非标准化疾病诉求的简单分诊挑战赛_baseline.ipynb ├── 科大讯飞AI开发者大赛2023 │ ├── 5G移动用户使用预测挑战赛_baseline.ipynb │ ├── AI量化模型预测挑战赛_baseline.ipynb │ ├── ChatGPT生成文本检测器_baseline.ipynb │ ├── README.md │ ├── Stable Diffusion鉴别器挑战赛_baseline.ipynb │ ├── 中文语义病句识别纠正_baseline.md │ ├── 
交通场景运输车辆外廓视觉测量挑战赛_baseline.ipynb │ ├── 人岗匹配挑战赛2023_baseline.md │ ├── 企业经营健康评估挑战赛_baseline.ipynb │ ├── 健康成人脑龄预测挑战赛_baseline.ipynb │ ├── 农作物朝向检测挑战赛_baseline.ipynb │ ├── 农机作业轨迹测算挑战赛_baseline.ipynb │ ├── 农民身份识别挑战赛_baseline.ipynb │ ├── 叶片病害识别挑战赛_baseline.ipynb │ ├── 图片文本块检测_baseline.ipynb │ ├── 基于可见光图像的柑橘花果梢语义分割挑战赛_baseline.ipynb │ ├── 基于图像识别算法的无人船障碍物检测挑战赛_baseline.ipynb │ ├── 基于用户画像的商品推荐挑战赛2.0_baseline.ipynb │ ├── 基于论文摘要的文本分类与关键词抽取挑战赛_baseline.ipynb │ ├── 基于近红外光谱的煤质参数预测挑战赛_baseline.ipynb │ ├── 多标签图像检索挑战赛_baseline.ipynb │ ├── 大视角差图像特征提取及匹配挑战赛.ipynb │ ├── 学术文档篇章级结构恢复挑战赛_baseline.ipynb │ ├── 学术文档要素分类挑战赛_baseline.ipynb │ ├── 工业场景下的服装生产力预测挑战赛_baseline.ipynb │ ├── 快速现场细胞学评价中的恶性细胞识别挑战赛_baseline.ipynb │ ├── 旋转机械故障诊断挑战赛_baseline.ipynb │ ├── 机器翻译质量评估挑战赛2023_baseline.ipynb │ ├── 标书实体抽取挑战赛_baseline.ipynb │ ├── 校招简历信息完整性检测挑战赛_baseline.ipynb │ ├── 校招简历应聘岗位与项目技能匹配检测挑战赛_baseline.ipynb │ ├── 水泵状态监测与故障诊断挑战赛_baseline.ipynb │ ├── 汽车保险索赔预测挑战赛_baseline.ipynb │ ├── 汽车领域文本规则泛化性增强挑战赛_baseline.ipynb │ ├── 用户新增预测挑战赛_baseline.ipynb │ ├── 社交账号网络分类挑战赛_baseline.ipynb │ ├── 移动广告营销场景下的人群召回算法挑战赛_baseline.ipynb │ ├── 空气质量指数预测挑战赛_baseline.ipynb │ ├── 糖尿病风险预测挑战赛_baseline.ipynb │ ├── 能源消耗预测挑战赛_baseline.ipynb │ ├── 脑PET图像分析和疾病预测挑战赛_baseline.ipynb │ ├── 自动驾驶疲劳检测挑战赛_baseline.ipynb │ ├── 苹果病害图像识别挑战赛_baseline.ipynb │ ├── 跨境电商效果广告ROI预测挑战赛_baseline.py │ ├── 通信系统调制格式识别与分类挑战赛_baseline.ipynb │ ├── 遥感图像倾斜舰船小目标检测挑战赛_baseline.ipynb │ ├── 酒店住宿价格预测挑战赛_baseline.ipynb │ ├── 锂离子电池生产参数调控及生产温度预测挑战赛_baseline.ipynb │ ├── 高分辨率遥感影像建筑物变化检测挑战赛_baseline.ipynb │ └── 鸟类品种识别挑战赛_baseline.ipynb ├── 科大讯飞AI开发者大赛2024 │ ├── README.md │ ├── 交通标识识别挑战赛_baseline.ipynb │ ├── 人岗匹配挑战赛赛季3_baseline.ipynb │ ├── 低资源文本翻译挑战赛_baseline.ipynb │ ├── 农业行人重识别挑战赛_baseline.ipynb │ ├── 分子性质AI预测挑战赛_baseline.ipynb │ ├── 基于无人机图像的农民劳作行为识别挑战赛_baseline.ipynb │ ├── 基于术语词典干预的机器翻译挑战赛_baseline.ipynb │ ├── 基于热力学定律的电池材料生产参数动态调控挑战赛_baseline.ipynb │ ├── 基于超声数据的多病种疾病预测挑战赛_baseline.ipynb │ ├── 大模型RAG智能问答挑战赛_baseline.ipynb │ ├── 大模型图像风格迁移挑战赛_label.ipynb │ ├── 大模型图文匹配识别挑战赛_baseline.ipynb │ ├── 大模型图表知识问答挑战赛_baseline.ipynb │ ├── 大模型能力评测中文成语释义与解析_baseline.ipynb │ ├── 心理健康辅助诊断挑战赛_0.92308.ipynb │ ├── 心理健康辅助诊断挑战赛_baseline.ipynb │ ├── 机器翻译质量评估挑战赛_baseline.ipynb │ ├── 濒危大型动物种类识别挑战赛_baseline.ipynb │ ├── 玉米雄穗识别挑战赛_baseline.ipynb │ ├── 电力需求预测挑战赛_baseline.ipynb │ ├── 短视频精准推荐挑战赛_baseline.ipynb │ ├── 网络安全入侵检测挑战赛_baseline.ipynb │ ├── 轻度认知障碍疾病预测挑战赛_baseline.ipynb │ ├── 问答意图聚类挑战赛_baseline.ipynb │ └── 高分辨率遥感识别检索挑战赛_baseline.ipynb ├── 第三届“马栏山杯”国际音视频算法大赛 │ ├── README.md │ ├── mgtv-用户下一个观看视频预测-390.ipynb │ └── mgtv-用户下一个观看视频预测-BERT.ipynb ├── 第四届工业大数据创新竞赛:算法赛道 │ ├── README.md │ └── 注塑成型赛道baseline.ipynb ├── 腾讯-2018腾讯广告算法大赛 │ └── README.md ├── 腾讯-2019腾讯广告算法大赛 │ └── README.md └── 阿里灵杰问天引擎电商搜索算法赛 │ ├── README.md │ ├── sentence-bert.ipynb │ └── 无监督baseline.ipynb ├── docs ├── .nojekyll ├── 2021-科大讯飞AI开发者大赛 │ └── README.md ├── README.md ├── _sidebar.md └── index.html └── tutorial ├── bert ├── README.md ├── bert-cls-example.ipynb ├── bert-mlm-example.ipynb ├── bert-ner-example.ipynb ├── bert-nsp-example.ipynb └── bert-qa-example.ipynb ├── jax └── README.md ├── paddlepaddle └── README.md ├── rank-ensemble.ipynb ├── sklearn └── README.md └── tree └── README.md /competition/2020DCIC-创新大赛大数据赛道/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /competition/2023全球智能汽车AI挑战赛——赛道二:智能驾驶汽车虚拟仿真视频数据理解赛道/README.md: -------------------------------------------------------------------------------- 1 | ## 
赛题:智能驾驶汽车虚拟仿真视频数据理解赛道 2 | 3 | 输入:元宇宙仿真平台生成的前视摄像头虚拟视频数据(8-10秒左右); 4 | 5 | 输出:对视频中的信息进行综合理解,以指定的json文件格式,按照数据说明中的关键词(key)填充描述型的文本信息(value,中文/英文均可以); 6 | 7 | 8 | https://tianchi.aliyun.com/competition/entrance/532155/information 9 | 10 | 11 | -------------------------------------------------------------------------------- /competition/2024数字中国创新大赛DCIC/海上风电出力预测赛道/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /competition/AIWIN2021/README.md: -------------------------------------------------------------------------------- 1 | # AIWIN 秋季竞赛 2 | 3 | 4 | ## 赛题1- 手写体 OCR 识别竞赛 5 | 6 | 手写体 OCR 识别竞赛由交通银行命题,设立两个任务,其中任务一由第四范式提供开放数据集,特别针对金额和日期做识别,任务二要求在指定训练环境完成不可下载训练集的训练,增加了银行机构的文本内容。任务一适合新手,并配套学习营和特别的学习奖励。 7 | 8 | 比赛地址:http://ailab.aiwin.org.cn/competitions/65 9 | 10 | baseline地址:https://aistudio.baidu.com/aistudio/projectdetail/2612313 11 | 12 | ## 赛题2- 心电图智能诊断竞赛 13 | 14 | 心电图智能诊断竞赛由数创医疗和复旦大学附属中山医院共同命题,设立两个任务,其中任务一诊断心电图的正常异常与否,任务二对10+种不同症状予以判断综合分类。任务一同步设有学习营和配套的学习奖励,欢迎新手参与。 15 | 16 | 比赛地址:http://ailab.aiwin.org.cn/competitions/64 17 | 18 | baseline地址:https://aistudio.baidu.com/aistudio/projectdetail/2653802 19 | 20 | # AIWIN 春季竞赛 21 | 22 | ## 赛题1—互联网舆情企业风险事件的识别和预警 23 | 24 | 参赛选手从给定的互联网信息中提取、识别出企业主体名称,以及标记风险标签(内容包含新闻标题、正文、及对应标签等)。 25 | 26 | 比赛地址:http://ailab.aiwin.org.cn/competitions/48 27 | 28 | 29 | ## 赛题2-保险文本视觉认知问答竞赛 30 | 31 | 利用OCR技术自动识别影像资料后,再通过AI智能判断所识别文字的内在逻辑,回答关于图片的自然语言问题。问题的答案是可以从图片中提取的任何文本/标记。 32 | 33 | 比赛地址:http://ailab.aiwin.org.cn/competitions/49 34 | 35 | 训练集+测试集OCR识别结果:http://datawhale-cdn.coggle.club/aiwin2021/ocr/ocr_result.zip 36 | 37 | 38 | ## 赛题3-文化传媒数字资产的自动编目 39 | 40 | 基于计算机视觉、NLP和语音识别等多模态技术,以新闻视频为类型,通过AI算法自动将完整新闻节目进行时序解构、添加语义标签、并进行内容归类。 41 | 42 | 比赛地址:http://ailab.aiwin.org.cn/competitions/51 43 | 44 | ## 赛题4-机器学习在债券定价中的应用 45 | 46 | 利用宏观数据、行情数据或者其它特色数据构建特征,进行机器学习建模,对中债10年期国债、中债10年期国开债、中债10年期AAA级地方政府债、中债10年期AAA级城投债以及中债10年期AAA级企业债到期收益率进行预测,预测给定的未来时间段(2021.5.6-2021.6.4期间,包含两端日期,共 23 个交易日)的系列十年期债券利率价格,并分析所用特征重要程度,给出相关逻辑解释。 47 | 48 | 比赛地址:http://ailab.aiwin.org.cn/competitions/52 49 | -------------------------------------------------------------------------------- /competition/AIWIN2023/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /competition/ATEC2022/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /competition/DC竞赛-AI助疫·口罩佩戴检测大赛/2_predict.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os, sys, glob, argparse 3 | import pandas as pd 4 | import numpy as np 5 | from tqdm import tqdm 6 | 7 | import time, datetime 8 | import pdb, traceback 9 | 10 | import cv2 11 | # import imagehash 12 | from PIL import Image 13 | 14 | from sklearn.model_selection import train_test_split, StratifiedKFold, KFold 15 | 16 | from efficientnet_pytorch import EfficientNet 17 | # model = EfficientNet.from_pretrained('efficientnet-b4') 18 | 19 | import torch 20 | torch.manual_seed(0) 21 | torch.backends.cudnn.deterministic = False 22 | torch.backends.cudnn.benchmark = True 23 | 24 | import torchvision.models as models 25 | import torchvision.transforms as transforms 26 | import torchvision.datasets as datasets 27 | import torch.nn as nn 28 | 
import torch.nn.functional as F 29 | import torch.optim as optim 30 | from torch.autograd import Variable 31 | from torch.utils.data.dataset import Dataset 32 | 33 | class QRDataset(Dataset): 34 | def __init__(self, img_path, transform=None): 35 | self.img_path = img_path 36 | if transform is not None: 37 | self.transform = transform 38 | else: 39 | self.transform = None 40 | 41 | def __getitem__(self, index): 42 | start_time = time.time() 43 | img = Image.open(self.img_path[index]).convert('RGB') 44 | 45 | if self.transform is not None: 46 | img = self.transform(img) 47 | return img,torch.from_numpy(np.array(int('PNEUMONIA' in self.img_path[index]))) 48 | 49 | def __len__(self): 50 | return len(self.img_path) 51 | 52 | class VisitNet(nn.Module): 53 | def __init__(self): 54 | super(VisitNet, self).__init__() 55 | 56 | # model = models.resnet18(True) 57 | # model.avgpool = nn.AdaptiveAvgPool2d(1) 58 | # model.fc = nn.Linear(512, 2) 59 | # self.resnet = model 60 | 61 | model = EfficientNet.from_pretrained('efficientnet-b4') 62 | model._fc = nn.Linear(1792, 2) 63 | self.resnet = model 64 | 65 | def forward(self, img): 66 | out = self.resnet(img) 67 | return out 68 | 69 | def predict(test_loader, model, tta=10): 70 | # switch to evaluate mode 71 | model.eval() 72 | 73 | test_pred_tta = None 74 | for _ in range(tta): 75 | test_pred = [] 76 | with torch.no_grad(): 77 | end = time.time() 78 | for i, (input, target) in enumerate(test_loader): 79 | input = input.cuda() 80 | target = target.cuda() 81 | 82 | # compute output 83 | output = model(input) 84 | output = output.data.cpu().numpy() 85 | 86 | test_pred.append(output) 87 | test_pred = np.vstack(test_pred) 88 | 89 | if test_pred_tta is None: 90 | test_pred_tta = test_pred 91 | else: 92 | test_pred_tta += test_pred 93 | 94 | return test_pred_tta 95 | 96 | test_jpg = ['game_gauzeMask_data/toPredict/{0}.jpg'.format(x) for x in range(0, 3802)] 97 | test_jpg = np.array(test_jpg) 98 | 99 | test_pred = None 100 | for model_path in ['resnet18_fold0.pt', 'resnet18_fold1.pt', 'resnet18_fold2.pt', 101 | 'resnet18_fold3.pt'][:]: 102 | 103 | test_loader = torch.utils.data.DataLoader( 104 | QRDataset(test_jpg, 105 | transforms.Compose([ 106 | transforms.Resize((256, 256)), 107 | transforms.RandomHorizontalFlip(), 108 | transforms.RandomVerticalFlip(), 109 | transforms.ToTensor(), 110 | transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) 111 | ]) 112 | ), batch_size=50, shuffle=False, num_workers=10, pin_memory=True 113 | ) 114 | 115 | print(model_path) 116 | model = VisitNet().cuda() 117 | model.load_state_dict(torch.load(model_path)) 118 | # model = nn.DataParallel(model).cuda() 119 | if test_pred is None: 120 | test_pred = predict(test_loader, model, 1) 121 | else: 122 | test_pred += predict(test_loader, model, 1) 123 | 124 | print(test_pred.shape) 125 | 126 | test_csv = pd.DataFrame() 127 | test_csv['ID'] = list(range(0, 3082)) 128 | test_csv['Label'] = np.argmax(test_pred, 1) 129 | test_csv['Label'] = test_csv['Label'].map({1:'pos', 0:'neg'}) 130 | test_csv.to_csv('tmp.csv', index=None) -------------------------------------------------------------------------------- /competition/DC竞赛-AI助疫·口罩佩戴检测大赛/README.md: -------------------------------------------------------------------------------- 1 | 比赛链接:https://www.dcjingsai.com/common/cmpt/AI%E5%8A%A9%E7%96%AB%C2%B7%E5%8F%A3%E7%BD%A9%E4%BD%A9%E6%88%B4%E6%A3%80%E6%B5%8B%E5%A4%A7%E8%B5%9B_%E7%AB%9E%E8%B5%9B%E4%BF%A1%E6%81%AF.html 2 | 3 | 比赛任务:戴口罩、不带口罩,二分类; 4 | 5 | 思路:直接CNN训练 6 | 7 | ``` 8 | python 
1_train.py 9 | python 2_predict.py 10 | ``` 11 | -------------------------------------------------------------------------------- /competition/DIGIX2021/README.md: -------------------------------------------------------------------------------- 1 | ## 2021 DIGIX全球校园AI算法精英大赛 2 | 3 | 报名链接(限在校生):https://developer.huawei.com/consumer/cn/activity/digixActivity/digixdetail/201621215957378831?ha_source=gb_sf&ha_sourceId=89000073 4 | 5 | - 赛题一:通过使用日志数据预测用户的留存周期; 6 | - 赛题二:通过给出的文章数据判别文章质量; 7 | - 赛题三:使用用户数据及历史行为数据建模完成视频推荐; 8 | - 赛题四:为基于多语言多模态的搜索排序任务; 9 | - 赛题五:为识别菜单图片中的文本信息。 10 | 11 | 报名时间:2021 年 8 月 31 日前均可报名 12 | 13 | 组队规则:每支队伍不超过 3 人,队长一人 14 | 15 | --- 16 | 17 | ### 赛题1:基于多目标多视图的用户留存周期预测 18 | - 简介 19 | 20 | 活跃留存周期预测通常使用单一视图做预测。在音乐领域,结合用户关注的音乐话题信息进行表征学习、结合歌曲信息进行音频、歌词、歌曲名、评论文本的多模态表征、结合歌手、用户、歌曲构建知识图谱,基于用户在APP侧信息构造行为链路向量化,辅助多日留存的多目标优化。在业界探索下一代机器学习模型与多模态向量化中有非常深远的价值。 21 | 22 | - 赛题说明 23 | 24 | 本题目基于脱敏和采样后的数据信息,保证数据安全。利用连续30天的用户行为日志,用户信息,歌曲信息,歌手信息,歌曲音频信息,预测未来30天内用户的留存情况,按未来一日、两日、三日、七日、十四日、三十日分段。 25 | 26 | - 评价指标:Area Under Curve (AUC) 27 | - baseline地址:https://gitee.com/coggle/DIGIX2021-BASELINE/tree/main/baseline-game1 28 | 29 | - 提交文件: 30 | 提交的结果需要包含三个文件: 31 | - submission.csv:模型预测的结果文件,格式和给出的标注文件一致 32 | - DIGIX Implementation Instruction.docx:一份 word 文档描述所使用的模型以及所使用的环境 33 | - Source Code.zip:所使用的源码打包文件 34 | 35 | --- 36 | 37 | ### 赛题2:基于多模型迁移预训练文章质量判别 38 | - 简介 39 | 40 | 文章质量判别是信息流领域的核心问题,提升文章质量判别的准确率是提升信息流质量和精准推送的核心技术点。在本次大赛中,主办方提供匿名化的文章质量数据,参赛选手基于给定的数据构建文章质量判别模型。希望通过本次大赛挖掘nlp算法领域的人才,推动 nlp算法的发展。 41 | 42 | - 赛题说明 43 | 44 | 本题目将为选手提供文章数据,参赛选手基于给定的数据构建文章质量判别模型。所提供的数据经过脱敏处理,保证数据安全。 45 | 46 | 基础数据集包含两部分:训练集和测试集。其中训练集给定了该样本的文章质量的相关标签;测试集用于计算参赛选手模型的评分指标,参赛选手需要计算出测试集中每个样本文章质量判断及优质文章的类型。 47 | 48 | - 评价指标:F1 Score 49 | - baseline地址:https://gitee.com/coggle/DIGIX2021-BASELINE/tree/main/baseline-game2 50 | - 赛题讲解视频:https://www.bilibili.com/video/BV1Rf4y157eo 51 | 52 | 53 | - 提交文件: 54 | 提交的结果需要包含三个文件: 55 | - submission.csv:模型预测的结果文件,格式和给出的标注文件一致 56 | - DIGIX Implementation Instruction.docx:一份 word 文档描述所使用的模型以及所使用的环境 57 | - Source Code.zip:所使用的源码打包文件 58 | 59 | --- 60 | 61 | ### 赛题3:基于多目标优化的视频推荐 62 | 63 | - 简介 64 | 65 | 推荐系统大多都是基于隐式反馈来做推荐,比如用户的点击、观看时长、评论、分享等,且不同隐式反馈表达了用户不同的喜好程度。如果仅仅以单目标对推荐结果进行衡量,会存在衡量不全面的问题。如视频场景,假设某个用户打开一个视频看了开头觉得不喜欢立马关掉,如果以点击为目标则体现的是用户感兴趣,但实际情况是用户对这个视频不感兴趣。从这个例子可以看出,在视频推荐中如果仅仅以点击为目标,可能忽视了用户更深层次的隐式反馈。因此,视频推荐除了关注用户点击,还需关注用户观看时长、分享等目标,期望通过多目标能更深入地挖掘用户兴趣,做更精准的推荐。 66 | 67 | - 赛题说明 68 | 69 | 本赛题提供14天数据用于训练,1天数据用于测试,数据包括用户特征,视频内容特征,以及用户历史行为数据,选手基于给出的数据,提供推荐策略,目标是预测每位用户观看视频时长所在区间,且预测是否对视频进行分享。所提供的数据经过脱敏处理,保证数据安全。 70 | 71 | - 评价指标:AUC加权和 72 | - baseline地址:https://gitee.com/coggle/DIGIX2021-BASELINE/tree/main/baseline-game3 73 | - 赛题讲解视频:https://www.bilibili.com/video/BV1gg411M7Bx 74 | 75 | - 提交文件: 76 | 提交的结果需要包含三个文件: 77 | - submission.csv:模型预测的结果文件,格式和给出的标注文件一致 78 | - DIGIX Implementation Instruction.docx:一份 word 文档描述所使用的模型以及所使用的环境 79 | - Source Code.zip:所使用的源码打包文件 80 | 81 | --- 82 | 83 | ### 赛题4:基于多模态多语言的搜索排序 84 | 85 | - 简介 86 | 87 | 搜索,是用户获取信息,找答案最方便快捷的方式。一次用户搜索会经历 Query 解析、召回、排序多个环节,排序作为最后整个过程一环,对用户的体验有最直接的影响。在多语言、多模态的场景下如何充分利用信息、更好的优化用户体验,是业界普遍在探索的难题,也是机器学习算法的明珠。 88 | 89 | - 赛题说明 90 | 91 | 本题目将为选手提供的搜索数据、公开爬取经过清理后的网页属性库,参赛选手基于给定的数据构建召回、排序模型。所提供的数据经过脱敏处理,保证数据安全。 92 | 93 | 基础数据集包含两部分:训练集和测试集。其中训练集为若干个Query下的网页排序;测试集选手需提交对提供Query的网页排序,用于计算与真实排序的HIT@K。 94 | 95 | 本题目标是:在给定搜索关键字(query)和候选网页(doc)集合下,通过对 query 和网页的 title\url 等进行相关性预测,给出这个 query 对应的网页排序结果。本题包含英语和土语两个语言的数据。在训练集中,我们分别提供了这两种语言的 query和与之对应的网页排序结果(约 100 个)。选手可以基于这个训练集数据进行模型训练。预测集,我们提供了待测试的 query,以及用于召回排序的候选网页集合。选手在这个集合上进行召回排序,并将结果提交到系统。 96 | 97 | 
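As a reference for the HIT@K metric listed in the bullet below, here is a minimal sketch of one common way to score it offline: for each query, check whether any known-relevant document appears in the model's top-K ranking, then average over queries. This is only an illustration — the organizers' exact scoring formula may differ, and the names used here (`hit_at_k`, `ranked`, `relevant`) are placeholders, not part of the provided baseline.

```python
from typing import Dict, List, Set

def hit_at_k(ranked: Dict[str, List[str]], relevant: Dict[str, Set[str]], k: int = 10) -> float:
    """Fraction of queries whose top-k candidates contain at least one relevant document."""
    hits = 0
    for qid, docs in ranked.items():
        if any(d in relevant.get(qid, set()) for d in docs[:k]):
            hits += 1
    return hits / max(len(ranked), 1)

# Toy example: q1 has a relevant doc inside its top-2, q2 does not -> HIT@2 = 0.5
ranked = {"q1": ["d3", "d1", "d9"], "q2": ["d7", "d2"]}
relevant = {"q1": {"d1"}, "q2": {"d5"}}
print(hit_at_k(ranked, relevant, k=2))
```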
- 评价指标:HIT@K 98 | - baseline地址:https://gitee.com/coggle/DIGIX2021-BASELINE/tree/main/baseline-game4 99 | 100 | 101 | - 提交文件: 102 | 提交的结果需要包含三个文件: 103 | - submission.csv:模型预测的结果文件,格式和给出的标注文件一致 104 | - DIGIX Implementation Instruction.docx:一份 word 文档描述所使用的模型以及所使用的环境 105 | - Source Code.zip:所使用的源码打包文件 106 | 107 | --- 108 | 109 | ### 赛题5:小样本菜单识别 110 | 111 | - 简介 112 | 113 | 图像文本识别在日常生活中有广泛的应用。在不同应用场景下,图像文本识别存在不同挑战。在菜单文字识别任务中,利用少量样本学习模型,同时解决多语言、艺术字等复杂场景下的问题,是提高识别准确率,提升用户体验的关键。希望通过本次比赛,挖掘计算机视觉方向人才,推动该领域发展。 114 | 115 | - 赛题说明 116 | 117 | 本赛题为选手提供菜单图片数据及其标注。训练集包括通用字符菜单图片及少量包含特殊字符菜单图片,选手使用训练数据进行模型训练。测试集分为A/B两个测试集。测试集仅提供菜单图片,选手使用模型预测菜单图片中文本主体的位置和内容。 118 | 119 | 检测并识别菜单中的文字,给出文字内容和坐标。我们提供了包括菜单图片及其对应的标注文件作为训练数据,标注以 JSON 文件形式给出。选手训练模型后在给定的测试集上预测测试菜单图片中的文字内容及坐标。本赛题使用 F1-score 作为最终效果的评价指标,综合考虑了字符级准确率 precision 和字符级召回率 recall。 120 | 121 | - 赛题数据 122 | 123 | 数据包括菜单图片及其对应的标注文件,标注以 JSON 文件形式给出,格式如下: 124 | ``` 125 | {"imagename.jpg":[{"label":"context","points":[[x1,y1],[x2,y2],…]},…],…} 126 | ``` 127 | 128 | 其中,“imagename.jpg”为图片名,“label”表示文本内容,“points”表示文本区域边缘坐标序列(文本可能是多边形)。标注文件中,“###”表示模糊字符或者其他语言字符,无需处理。 129 | 130 | - 评价指标:F1 Score 131 | - baseline地址:https://gitee.com/coggle/DIGIX2021-BASELINE/tree/main/baseline-game5 132 | - 赛题讲解视频:https://www.bilibili.com/video/BV14f4y1579M 133 | 134 | - 提交文件: 135 | 提交的结果需要包含三个文件: 136 | - label_special.json:模型预测的结果文件,格式和给出的标注文件一致 137 | - DIGIX Implementation Instruction.docx:一份 word 文档描述所使用的模型以及所使用的环境 138 | - Source Code.zip:所使用的源码打包文件 139 | -------------------------------------------------------------------------------- /competition/DataFountain-CCFBDI-2021/README.md: -------------------------------------------------------------------------------- 1 | ## 个贷违约预测 2 | 3 | - 赛题类型:结构化数据挖掘、金融风控 4 | 5 | https://www.datafountain.cn/competitions/530 6 | 7 | 本赛题要求利用已有的与目标客群稍有差异的另一批信贷数据,辅助目标业务风控模型的创建,两者数据集之间存在大量相同的字段和极少的共同用户。此处希望大家可以利用迁移学习捕捉不同业务中用户基本信息与违约行为之间的关联,帮助实现对新业务的用户违约预测。 8 | 9 | - baseline1:[阿水0.86单表思路](https://github.com/datawhalechina/competition-baseline/blob/master/competition/DataFountain-CCFBDI-2021/%E4%B8%AA%E8%B4%B7%E8%BF%9D%E7%BA%A6%E9%A2%84%E6%B5%8B-860.ipynb) 10 | - baseline2:[恒哥0.87多表思路](https://github.com/LogicJake/competition_baselines/tree/master/competitions/2021ccf_loan) 11 | 12 | ## 剧本角色情感识别 13 | 14 | - 赛题类型:NLP、情感分类 15 | 16 | https://www.datafountain.cn/competitions/518 17 | 18 | 本赛题提供一部分电影剧本作为训练集,训练集数据已由人工进行标注,参赛队伍需要对剧本场景中每句对白和动作描述中涉及到的每个角色的情感从多个维度进行分析和识别。该任务的主要难点和挑战包括:1)剧本的行文风格和通常的新闻类语料差别较大,更加口语化;2)剧本中角色情感不仅仅取决于当前的文本,对前文语义可能有深度依赖。 19 | 20 | - basline1:[恒哥 Bert 0.682](https://github.com/LogicJake/competition_baselines/tree/master/competitions/2021ccf_aqy) 21 | - basline2:[强哥 Bert多任务 0.67](https://github.com/China-ChallengeHub/ChallengeHub-Baselines/blob/main/aiqiyi-baseline.ipynb) 22 | 23 | ![](https://coggle.club/assets/img/coggle_qrcode.jpg) 24 | 25 | 26 | ## 用户上网异常行为分析 27 | 28 | - 赛题类型:结构化数据挖掘 29 | 30 | https://www.datafountain.cn/competitions/520 31 | 32 | 利用机器学习、深度学习,UEBA等人工智能方法,基于无标签的用户日常上网日志数据,构建用户上网行为基线和上网行为评价模型,依据上网行为与基线的距离确定偏离程度。 33 | - 通过用户日常上网数据构建行为基线; 34 | - 采用无监督学习模型,基于用户上网行为特征,构建上网行为评价模型,评价上网行为与基线的偏离程度。 35 | 36 | - baseline:[CquptDJ](https://blog.csdn.net/qq_44694861/article/details/120423658) 37 | 38 | 39 | ## 产品评论观点提取 40 | 41 | - 赛题类型:NLP、NER 42 | 43 | https://www.datafountain.cn/competitions/529 44 | 45 | 观点提取旨在从非结构化的评论文本中提取标准化、结构化的信息,如产品名、评论维度、评论观点等。此处希望大家能够通过自然语言处理的语义情感分析技术判断出一段银行产品评论文本的情感倾向,并能进一步通过语义分析和实体识别,标识出评论所讨论的产品名,评价指标和评价关键词。 46 | 47 | - 
baseline:[恒哥](https://github.com/LogicJake/competition_baselines/tree/master/competitions/2021ccf_ner) 48 | 49 | ## 基于飞桨实现花样滑冰选手骨骼点动作识别 50 | 51 | - 赛题类型:计算机视觉、姿态估计 52 | 53 | https://www.datafountain.cn/competitions/519/ 54 | 55 | 基于现实场景的应用需求以及图深度学习模型的发展,本次比赛旨在通过征集各队伍建立的高精度、细粒度、意义明确的动作识别模型,探索基于骨骼点的时空细粒度人体动作识别新方法。本次比赛将基于评价指标Accuracy对各队伍提交结果的评测成绩进行排名,Accuracy得分越高,则认为该模型的动作识别效果越好。 56 | 57 | - baseline:[飞浆](https://aistudio.baidu.com/aistudio/projectdetail/2417717) 58 | 59 | ## 千言-问题匹配鲁棒性评测 60 | 61 | - 赛题类型:NLP、文本匹配 62 | 63 | https://www.datafountain.cn/competitions/516/ 64 | 65 | 问题匹配(Question Matching)任务旨在判断两个自然问句之间的语义是否等价,是自然语言处理领域的一个重要研究方向。问题匹配同时也具有很高的商业价值,在信息检索、智能客服等领域发挥着重要作用。 66 | 67 | - baseline:[飞浆](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/examples/text_matching/question_matching) 68 | -------------------------------------------------------------------------------- /competition/DataFountain-三角形图计算算法设计及性能优化/README.md: -------------------------------------------------------------------------------- 1 | https://www.datafountain.cn/competitions/349 2 | 3 | 第二名开源:https://github.com/wang-zhq/TC_CUDA 4 | -------------------------------------------------------------------------------- /competition/DataFountain-乘用车细分市场销量预测/README.md: -------------------------------------------------------------------------------- 1 | https://www.datafountain.cn/competitions/352 2 | 3 | 冠军思路:https://zhuanlan.zhihu.com/p/98926322 4 | 冠军代码:https://github.com/cxq80803716/2019-CCF-BDCI-Car_sales 5 | -------------------------------------------------------------------------------- /competition/DataFountain-云计算时代的大数据查询分析优化/README.md: -------------------------------------------------------------------------------- 1 | https://www.datafountain.cn/competitions/347/ranking?sch=weekly 2 | 3 | 线上第二名:https://github.com/WenbinHou/CCF-BDCI-2019-Database 4 | -------------------------------------------------------------------------------- /competition/DataFountain-互联网新闻情感分析/README.md: -------------------------------------------------------------------------------- 1 | baseline思路: 2 | 3 | 直接bert梭哈,如果显存不够可以改maxlen和batachsize 4 | 5 | 也可以不同maxlen和batchsize的bert结果进行平均,会有收益。 6 | 7 | https://www.datafountain.cn/competitions/350 8 | 9 | 第一名:https://github.com/cxy229/BDCI2019-SENTIMENT-CLASSIFICATION 10 | -------------------------------------------------------------------------------- /competition/DataFountain-互联网金融新实体发现/README.md: -------------------------------------------------------------------------------- 1 | baseline思路: 2 | 3 | 参考https://github.com/ProHiryu/bert-chinese-ner 4 | 5 | 训练和预测可以修改如下参数: 6 | 7 | ``` 8 | flags.DEFINE_bool( 9 | "do_train", True, 10 | "Whether to run training." 
11 | ) 12 | flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") 13 | 14 | flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.") 15 | 16 | flags.DEFINE_bool("do_predict", False,"Whether to run the model in inference mode on the test set.") 17 | ``` 18 | 19 | 具体用法https://github.com/ProHiryu/bert-chinese-ner 20 | 21 | ## Top选手分享 22 | 23 | 第四名:https://github.com/rebornZH/2019-CCF-BDCI-NLP 24 | 25 | 第五名:https://github.com/light8lee/2019-BDCI-FinancialEntityDiscovery 26 | -------------------------------------------------------------------------------- /competition/DataFountain-互联网金融新实体发现/bert-chinese-ner.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/competition-baseline/99a1c3e3905573326fa2fc6d2ab2cc58286fee37/competition/DataFountain-互联网金融新实体发现/bert-chinese-ner.zip -------------------------------------------------------------------------------- /competition/DataFountain-企业网络资产及安全事件分析与可视化/README.md: -------------------------------------------------------------------------------- 1 | https://www.datafountain.cn/competitions/358 2 | 3 | 第二名开源:https://github.com/Mrzhangxiaohua/2019CCF_Visualization 4 | -------------------------------------------------------------------------------- /competition/DataFountain-企业非法集资风险预测/README.md: -------------------------------------------------------------------------------- 1 | 比赛链接:https://www.datafountain.cn/competitions/469/ 2 | 3 | 比赛直播:https://www.bilibili.com/video/BV1mf4y1q7az?p=2 4 | 5 | 线上843分数 6 | -------------------------------------------------------------------------------- /competition/DataFountain-基于OCR的身份证要素提取/README.md: -------------------------------------------------------------------------------- 1 | https://www.datafountain.cn/competitions/346 2 | 3 | 第一名代码:https://github.com/Mingtzge/2019-CCF-BDCI-OCR-MCZJ-OCR-IdentificationIDElement 4 | 5 | 2019CCF BDCI大赛 最佳创新探索奖获得者、OCR身份证要素提取单赛题冠军 天晨破晓团队 6 | -------------------------------------------------------------------------------- /competition/DataFountain-多人种人脸识别/README.md: -------------------------------------------------------------------------------- 1 | https://www.datafountain.cn/competitions/348 2 | 3 | 第一名:天才儿童代码,[方案分享](https://mp.weixin.qq.com/s?__biz=MzIwNDA5NDYzNA==&mid=2247483935&idx=1&sn=c82806f1c4fdd3c3c6e31a31a6faf75c&chksm=96c42fdaa1b3a6cc95e05cc401b97bb7588b86cf221664e3949de6473240debd6fcae5da3a93&token=1694266337),[代码分享](https://github.com/themostnewone/2019ccf) 4 | -------------------------------------------------------------------------------- /competition/DataFountain-技术需求与技术成果项目之间关联度计算模型/README.md: -------------------------------------------------------------------------------- 1 | baseline思路:pair 分类问题的bert baseline 2 | 3 | 代码存在一定问题,在encode阶段如果ach过长会导致req取不到,所以应该分开切片或者其他切片方式。 4 | 5 | https://www.datafountain.cn/competitions/359 6 | 7 | 第一名:https://github.com/Makaixin/Correlation-between-requirements-and-achievements 8 | 9 | 第二名:https://github.com/rebornZH/2019-CCF-BDCI-NLP 10 | -------------------------------------------------------------------------------- /competition/DataFountain-离散制造过程中典型工件的质量符合率预测/README.md: -------------------------------------------------------------------------------- 1 | https://www.datafountain.cn/competitions/351/ 2 | 3 | 第一名:https://github.com/CcIsHandsome/-TOP1- 4 | -------------------------------------------------------------------------------- /competition/DataFountain-视频版权检测算法/README.md: 
-------------------------------------------------------------------------------- 1 | https://www.datafountain.cn/competitions/354/ 2 | 3 | 4 | 5 | 1. 并行提取视频关键帧; 6 | 2. 通过resnet18提取关键帧特征; 7 | 3. 通过通过CNN特征计算得到query与refer对应关系; 8 | 4. 视频侵权时间段还需要进一步分析; 9 | 5. 截止到现在全网唯一成功提交的思路; 10 | -------------------------------------------------------------------------------- /competition/DataFountain-金融信息负面及主体判定/README.md: -------------------------------------------------------------------------------- 1 | https://www.datafountain.cn/competitions/353 2 | 3 | 冠军:https://github.com/xiong666/ccf_financial_negative 4 | 5 | 线上第一名:https://github.com/A-Rain/BDCI2019-Negative_Finance_Info_Judge 6 | 7 | 第二名:https://github.com/rebornZH/2019-CCF-BDCI-NLP 8 | 9 | 第三名:https://github.com/Chevalier1024/CCF-BDCI-ABSA 10 | -------------------------------------------------------------------------------- /competition/Kesci-中国华录杯人群密度检测/README.md: -------------------------------------------------------------------------------- 1 | 比赛链接:https://www.kesci.com/home/competition/5df1d33d23ea6d002b264ada/content 2 | 3 | 人群密度检测:在一张图片当中统计图片当中行人的数量。特别说明,当画面中行人数量大于 100 时,均按 100 计算。 4 | 5 | ![](https://github.com/weizheliu/Context-Aware-Crowd-Counting/raw/master/images/prediction.png) 6 | 7 | 比赛数据集链接:链接: https://pan.baidu.com/s/1wtmQUlsr_fcUKGTW1K-4oA 提取码: c2ab 8 | 9 | baseline思路,使用Crowd Counting进行预测,使用*Context-Aware Crowd Counting*的预训练权重: 10 | 11 | 1. `git clone https://github.com/weizheliu/Context-Aware-Crowd-Counting` 12 | 2. 下载pretrained model(part_B_pre.pth.tar),在我们分享的数据集中已经包含 13 | 3. `python test.py`即可,线上分数341左右 14 | -------------------------------------------------------------------------------- /competition/Kesci-中国华录杯人群密度检测/test.py: -------------------------------------------------------------------------------- 1 | import h5py 2 | import PIL.Image as Image 3 | import numpy as np 4 | import os 5 | import glob 6 | import scipy 7 | from image import * 8 | from model import CANNet 9 | import torch 10 | from torch.autograd import Variable 11 | 12 | from sklearn.metrics import mean_squared_error,mean_absolute_error 13 | 14 | from torchvision import transforms 15 | 16 | 17 | transform=transforms.Compose([ 18 | transforms.ToTensor(),transforms.Normalize(mean=[0.485, 0.456, 0.406], 19 | std=[0.229, 0.224, 0.225]), 20 | ]) 21 | 22 | # the folder contains all the test images 23 | img_folder='../A/' 24 | img_paths=[] 25 | 26 | for img_path in glob.glob(os.path.join(img_folder, '*')): 27 | img_paths.append(img_path) 28 | 29 | model = CANNet() 30 | 31 | model = model.cuda() 32 | 33 | checkpoint = torch.load('part_B_pre.pth.tar') 34 | 35 | model.load_state_dict(checkpoint['state_dict']) 36 | 37 | model.eval() 38 | 39 | pred= [] 40 | gt = [] 41 | 42 | # for i in xrange(len(img_paths)): 43 | # img = transform(Image.open(img_paths[i]).convert('RGB')).cuda() 44 | # img = img.unsqueeze(0) 45 | # h,w = img.shape[2:4] 46 | # h_d = h/2 47 | # w_d = w/2 48 | # img_1 = Variable(img[:,:,:h_d,:w_d].cuda()) 49 | # img_2 = Variable(img[:,:,:h_d,w_d:].cuda()) 50 | # img_3 = Variable(img[:,:,h_d:,:w_d].cuda()) 51 | # img_4 = Variable(img[:,:,h_d:,w_d:].cuda()) 52 | # density_1 = model(img_1).data.cpu().numpy() 53 | # density_2 = model(img_2).data.cpu().numpy() 54 | # density_3 = model(img_3).data.cpu().numpy() 55 | # density_4 = model(img_4).data.cpu().numpy() 56 | 57 | # pure_name = os.path.splitext(os.path.basename(img_paths[i]))[0] 58 | # # gt_file = h5py.File(img_paths[i].replace('.jpg','.h5').replace('images','ground_truth'),'r') 59 | # # groundtruth = 
np.asarray(gt_file['density']) 60 | # pred_sum = density_1.sum()+density_2.sum()+density_3.sum()+density_4.sum() 61 | # pred.append(pred_sum) 62 | # # gt.append(np.sum(groundtruth)) 63 | # print(img_paths[i], pred_sum) 64 | 65 | for i in xrange(len(img_paths)): 66 | img = Image.open(img_paths[i]) 67 | print('') 68 | print(img.size) 69 | if img.size[0] > 1200: 70 | img = img.resize((1024, int(img.size[1]*1024.0/img.size[0]))) 71 | # elif img.size[1] < 350: 72 | # img = img.resize((1024, int(img.size[1]*1024.0/img.size[0]))) 73 | print(img.size) 74 | 75 | img2 = transform(img.transpose(Image.FLIP_LEFT_RIGHT).convert('RGB')).cuda() 76 | img = transform(img.convert('RGB')).cuda() 77 | img2 = img2.unsqueeze(0) 78 | img = img.unsqueeze(0) 79 | h,w = img.shape[2:4] 80 | h_d = h/2 81 | w_d = w/2 82 | 83 | density_1 = model(img.cuda()).data.cpu().numpy() 84 | density_2 = model(img2.cuda()).data.cpu().numpy() 85 | 86 | # # img = img.unsqueeze(0) 87 | # h,w = img.shape[2:4] 88 | # h_d = h/2 89 | # w_d = w/2 90 | # img_1 = Variable(img[:,:,:h_d,:w_d].cuda()) 91 | # img_2 = Variable(img[:,:,:h_d,w_d:].cuda()) 92 | # img_3 = Variable(img[:,:,h_d:,:w_d].cuda()) 93 | # img_4 = Variable(img[:,:,h_d:,w_d:].cuda()) 94 | # density_3 = model(img_1).data.cpu().numpy() 95 | # density_4 = model(img_2).data.cpu().numpy() 96 | # density_5 = model(img_3).data.cpu().numpy() 97 | # density_6 = model(img_4).data.cpu().numpy() 98 | 99 | pure_name = os.path.splitext(os.path.basename(img_paths[i]))[0] 100 | # gt_file = h5py.File(img_paths[i].replace('.jpg','.h5').replace('images','ground_truth'),'r') 101 | # groundtruth = np.asarray(gt_file['density']) 102 | pred_sum = density_1.sum() + density_2.sum() 103 | pred.append(pred_sum/2) 104 | # gt.append(np.sum(groundtruth)) 105 | print(img_paths[i], pred_sum) 106 | 107 | import pandas as pd 108 | df = pd.DataFrame() 109 | df['file'] = [os.path.basename(x) for x in img_paths] 110 | df['man_count'] = pred 111 | df['man_count'] = df['man_count'].round() 112 | df['man_count'] = df['man_count'].astype(int) 113 | df.loc[df['man_count'] > 100, 'man_count'] = 100 114 | df.loc[df['man_count'] < 0, 'man_count'] = 0 115 | df.to_csv('../tmp2.csv', index=None) -------------------------------------------------------------------------------- /competition/Tianchi-2020数字中国创新大赛—算法赛:智慧海洋建设/README.md: -------------------------------------------------------------------------------- 1 | 比赛链接:https://tianchi.aliyun.com/competition/entrance/231768/introduction 2 | 3 | baseline地址:https://tianchi.aliyun.com/notebook-ai/detail?spm=5176.12586969.1002.3.163c5cfdJTbd9E&postId=87376 4 | 5 | -------------------------------------------------------------------------------- /competition/Tianchi-安泰杯跨境电商智能算法大赛/README.md: -------------------------------------------------------------------------------- 1 | https://tianchi.aliyun.com/competition/entrance/231718/information 2 | 3 | - 比赛数据:https://pan.baidu.com/s/1rHXSI44LjIi_dwDN5dZkFg 提取码: 2h6q 4 | - 冠军:[法国南部代码](https://github.com/RainFung/Tianchi-AntaiCup-International-E-commerce-Artificial-Intelligence-Challenge) 5 | -------------------------------------------------------------------------------- /competition/Tianchi-心电人机智能大赛心电异常事件预测/README.md: -------------------------------------------------------------------------------- 1 | https://tianchi.aliyun.com/competition/entrance/231754/introduction 2 | 3 | 冠军分享:https://github.com/RandomWalk-xzq/Hefei_ECG_TOP1 4 | -------------------------------------------------------------------------------- 
/competition/Tianchi-第三届阿里云安全算法挑战赛/README.md: -------------------------------------------------------------------------------- 1 | https://tianchi.aliyun.com/competition/entrance/231668/information 2 | -------------------------------------------------------------------------------- /competition/Tianchi-第三届阿里云安全算法挑战赛/api.csv: -------------------------------------------------------------------------------- 1 | GetSystemTimeAsFileTime 2 | NtAllocateVirtualMemory 3 | NtFreeVirtualMemory 4 | SetUnhandledExceptionFilter 5 | LdrLoadDll 6 | LdrGetProcedureAddress 7 | LdrUnloadDll 8 | NtCreateMutant 9 | NtCreateSection 10 | NtMapViewOfSection 11 | CoInitializeEx 12 | RegOpenKeyExW 13 | CoUninitialize 14 | NtUnmapViewOfSection 15 | NtClose 16 | LdrGetDllHandle 17 | NtTerminateProcess 18 | NtOpenKey 19 | NtQueryValueKey 20 | __exception__ 21 | SetErrorMode 22 | RegQueryValueExW 23 | RegCloseKey 24 | NtCreateFile 25 | NtWriteFile 26 | CreateProcessInternalW 27 | NtProtectVirtualMemory 28 | RegOpenKeyExA 29 | NtQueryAttributesFile 30 | LoadStringA 31 | GetSystemMetrics 32 | RegQueryValueExA 33 | FindResourceExW 34 | LoadResource 35 | GetSystemWindowsDirectoryW 36 | FindResourceA 37 | SizeofResource 38 | GetFileVersionInfoSizeW 39 | GetFileVersionInfoW 40 | DrawTextExA 41 | WSAStartup 42 | socket 43 | setsockopt 44 | closesocket 45 | bind 46 | NtSetInformationFile 47 | NtDeviceIoControlFile 48 | CreateThread 49 | NtOpenFile 50 | GetSystemDirectoryW 51 | NtOpenMutant 52 | NtOpenSection 53 | RegEnumKeyExW 54 | LoadStringW 55 | GetCursorPos 56 | EnumWindows 57 | GetKeyState 58 | NtQuerySystemInformation 59 | FindFirstFileExW 60 | NtOpenDirectoryObject 61 | GetVolumePathNameW 62 | CreateDirectoryW 63 | GetFileAttributesW 64 | DeleteFileW 65 | CopyFileA 66 | CreateToolhelp32Snapshot 67 | Thread32First 68 | Thread32Next 69 | NtDuplicateObject 70 | GetSystemInfo 71 | NtOpenKeyEx 72 | GetTempPathW 73 | SetFilePointer 74 | NtReadFile 75 | GetFileType 76 | GetTimeZoneInformation 77 | SetWindowsHookExA 78 | NtEnumerateKey 79 | NtQueryInformationFile 80 | listen 81 | connect 82 | gethostbyname 83 | NtOpenProcess 84 | WriteProcessMemory 85 | RtlAddVectoredExceptionHandler 86 | ReadProcessMemory 87 | FindWindowA 88 | SHGetFolderPathW 89 | CreateActCtxW 90 | FindResourceW 91 | SetWindowsHookExW 92 | GetForegroundWindow 93 | RegQueryInfoKeyW 94 | RegEnumValueW 95 | GetFileSizeEx 96 | DrawTextExW 97 | Process32FirstW 98 | Process32NextW 99 | NtReadVirtualMemory 100 | OutputDebugStringA 101 | SearchPathW 102 | OleInitialize 103 | CryptAcquireContextW 104 | GetFileSize 105 | SetEndOfFile 106 | GlobalMemoryStatus 107 | CoGetClassObject 108 | CoCreateInstance 109 | NtQueryKey 110 | NtSetValueKey 111 | NtDelayExecution 112 | RegEnumKeyW 113 | NtQueryDirectoryFile 114 | GetFileInformationByHandleEx 115 | NtEnumerateValueKey 116 | GetUserNameExW 117 | GetComputerNameW 118 | GetUserNameW 119 | DeviceIoControl 120 | FindWindowW 121 | RegCreateKeyExW 122 | SendNotifyMessageW 123 | RegSetValueExW 124 | GetFileAttributesExW 125 | GetFileInformationByHandle 126 | SetFileTime 127 | LookupAccountSidW 128 | IsDebuggerPresent 129 | NtResumeThread 130 | GlobalMemoryStatusEx 131 | GetShortPathNameW 132 | NtCreateKey 133 | CoInitializeSecurity 134 | UuidCreate 135 | NtCreateThreadEx 136 | RtlAddVectoredContinueHandler 137 | LookupPrivilegeValueW 138 | NtOpenThread 139 | Module32FirstW 140 | Module32NextW 141 | GetKeyboardState 142 | WriteConsoleA 143 | GetVolumeNameForVolumeMountPointW 144 | NtQueryFullAttributesFile 145 | SetFilePointerEx 
146 | GetVolumePathNamesForVolumeNameW 147 | system 148 | WriteConsoleW 149 | RemoveDirectoryA 150 | GetNativeSystemInfo 151 | GetSystemDirectoryA 152 | CopyFileW 153 | GetAdaptersInfo 154 | RegEnumValueA 155 | RegDeleteValueW 156 | RegCreateKeyExA 157 | GetUserNameA 158 | SetFileAttributesW 159 | RegEnumKeyExA 160 | OpenSCManagerA 161 | OpenServiceA 162 | RegSetValueExA 163 | RegDeleteValueA 164 | InternetCrackUrlA 165 | InternetSetOptionA 166 | InternetGetConnectedState 167 | InternetOpenW 168 | InternetSetStatusCallback 169 | InternetConnectW 170 | HttpOpenRequestW 171 | InternetQueryOptionA 172 | HttpSendRequestW 173 | HttpQueryInfoA 174 | InternetCloseHandle 175 | getaddrinfo 176 | GetAdaptersAddresses 177 | getsockname 178 | select 179 | CryptProtectMemory 180 | CryptUnprotectMemory 181 | GetComputerNameA 182 | GetFileVersionInfoSizeExW 183 | GetFileVersionInfoExW 184 | InternetCrackUrlW 185 | SHGetSpecialFolderLocation 186 | CryptHashData 187 | NetUserGetInfo 188 | shutdown 189 | CreateServiceA 190 | StartServiceA 191 | ShellExecuteExW 192 | SetStdHandle 193 | NtQueryMultipleValueKey 194 | CreateJobObjectW 195 | SetInformationJobObject 196 | GetSystemWindowsDirectoryA 197 | FindResourceExA 198 | RemoveDirectoryW 199 | GetDiskFreeSpaceExW 200 | MoveFileWithProgressW 201 | NetShareEnum 202 | RegDeleteKeyW 203 | GetDiskFreeSpaceW 204 | RegQueryInfoKeyA 205 | OpenSCManagerW 206 | OpenServiceW 207 | CryptAcquireContextA 208 | GetAddrInfoW 209 | NtTerminateThread 210 | CreateServiceW 211 | NtDeleteKey 212 | GetBestInterfaceEx 213 | timeGetTime 214 | InternetOpenA 215 | CryptEncrypt 216 | InternetConnectA 217 | HttpOpenRequestA 218 | HttpSendRequestA 219 | StartServiceW 220 | ControlService 221 | DeleteService 222 | CryptExportKey 223 | CryptCreateHash 224 | WSASocketW 225 | NtSuspendThread 226 | NtGetContextThread 227 | UnhookWindowsHookEx 228 | CertOpenStore 229 | CryptDecodeObjectEx 230 | CertControlStore 231 | NtDeleteValueKey 232 | GetAsyncKeyState 233 | EnumServicesStatusW 234 | DnsQuery_W 235 | FindWindowExW 236 | FindFirstFileExA 237 | RegDeleteKeyA 238 | FindWindowExA 239 | InternetOpenUrlA 240 | SendNotifyMessageA 241 | CoCreateInstanceEx 242 | IWbemServices_ExecQuery 243 | WSASocketA 244 | URLDownloadToFileW 245 | accept 246 | NtCreateDirectoryObject 247 | CertCreateCertificateContext 248 | AssignProcessToJobObject 249 | SetFileInformationByHandle 250 | NetGetJoinInformation 251 | InternetReadFile 252 | RtlRemoveVectoredExceptionHandler 253 | CryptGenKey 254 | MessageBoxTimeoutA 255 | NetUserGetLocalGroups 256 | DeleteUrlCacheEntryW 257 | send 258 | recv 259 | ioctlsocket 260 | WSARecv 261 | WSASend 262 | sendto 263 | CopyFileExW 264 | RegisterHotKey 265 | MessageBoxTimeoutW 266 | CreateRemoteThread 267 | GetUserNameExA 268 | EnumServicesStatusA 269 | NtQueueApcThread 270 | RtlCreateUserThread 271 | InternetOpenUrlW 272 | CryptProtectData 273 | WSAConnect 274 | CryptDecrypt 275 | CreateDirectoryExW 276 | IWbemServices_ExecMethod 277 | recvfrom 278 | ObtainUserAgentString 279 | DnsQuery_A 280 | ReadCabinetState 281 | NtSetContextThread 282 | WSARecvFrom 283 | WSASendTo 284 | NtLoadKey 285 | NtLoadDriver 286 | DeleteUrlCacheEntryA 287 | GetInterfaceInfo 288 | NtWriteVirtualMemory 289 | RtlCompressBuffer 290 | NtShutdownSystem 291 | TaskDialog 292 | NtDeleteFile 293 | InternetGetConnectedStateExW 294 | CryptUnprotectData 295 | InternetGetConnectedStateExA 296 | NtSaveKeyEx 297 | NtSaveKey 298 | CertOpenSystemStoreA 299 | PRF 300 | ExitWindowsEx 301 | WSAAccept 302 | 
CreateRemoteThreadEx 303 | CertOpenSystemStoreW 304 | NtUnloadDriver 305 | NtCreateThread 306 | NtLoadKeyEx 307 | InternetWriteFile 308 | RtlDecompressBuffer 309 | -------------------------------------------------------------------------------- /competition/TinyMind人民币面值&冠字号编码识别挑战赛/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/competition-baseline/99a1c3e3905573326fa2fc6d2ab2cc58286fee37/competition/TinyMind人民币面值&冠字号编码识别挑战赛/.DS_Store -------------------------------------------------------------------------------- /competition/TinyMind人民币面值&冠字号编码识别挑战赛/README.md: -------------------------------------------------------------------------------- 1 | # TinyMind人民币面值&冠字号编码识别挑战赛 2 | 3 | https://www.tinymind.cn/competitions/47 4 | 5 | 任务1面值分类100分代码,和任务2编码识别第五名代码。 6 | 7 | - 任务1:直接是一个分类问题; 8 | - 任务2:可以抽象成一个字符识别问题; 9 | - 先用检测模型(Fast-RCNN)进行检测; 10 | - 再使用识别模型CRNN或者muti-CNN进行识别 11 | -------------------------------------------------------------------------------- /competition/TinyMind人民币面值&冠字号编码识别挑战赛/task1/README.md: -------------------------------------------------------------------------------- 1 | 1. 修改`predict_rmb.py`文件中对应的路径; 2 | 2. `python predict_rmb.py` 3 | -------------------------------------------------------------------------------- /competition/TinyMind人民币面值&冠字号编码识别挑战赛/task1/predict_rmb.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os, sys, glob, argparse 3 | import pandas as pd 4 | import numpy as np 5 | from tqdm import tqdm 6 | 7 | import time, datetime 8 | import pdb, traceback 9 | 10 | import cv2 11 | from PIL import Image 12 | 13 | from sklearn.preprocessing import LabelEncoder 14 | from sklearn.model_selection import train_test_split, StratifiedKFold 15 | 16 | import torch 17 | torch.manual_seed(0) 18 | torch.backends.cudnn.deterministic = False 19 | torch.backends.cudnn.benchmark = True 20 | 21 | import torchvision.models as models 22 | import torchvision.transforms as transforms 23 | import torchvision.datasets as datasets 24 | import torch.nn as nn 25 | import torch.nn.functional as F 26 | import torch.optim as optim 27 | from torch.autograd import Variable 28 | from torch.utils.data.dataset import Dataset 29 | 30 | class QRDataset(Dataset): 31 | def __init__(self, img_path, img_label, transform=None): 32 | self.img_path = img_path 33 | self.img_label=img_label 34 | 35 | if transform is not None: 36 | self.transform = transform 37 | else: 38 | self.transform = None 39 | 40 | def __getitem__(self, index): 41 | start_time = time.time() 42 | img = Image.open(self.img_path[index]) 43 | 44 | if self.transform is not None: 45 | img = self.transform(img) 46 | 47 | return img, torch.from_numpy(np.array([self.img_label[index]])) 48 | 49 | def __len__(self): 50 | return len(self.img_path) 51 | 52 | class VisitNet(nn.Module): 53 | def __init__(self): 54 | super(VisitNet, self).__init__() 55 | model = models.resnet18(False) 56 | model.avgpool = nn.AdaptiveAvgPool2d(1) 57 | model.fc = nn.Linear(512, 256) 58 | self.resnet = model 59 | 60 | def forward(self, img): 61 | out = self.resnet(img) 62 | return F.log_softmax(out, dim=1) 63 | 64 | def predict(test_loader, model, tta=10): 65 | # switch to evaluate mode 66 | model.eval() 67 | 68 | test_pred_tta = None 69 | for _ in range(tta): 70 | test_pred = [] 71 | with torch.no_grad(): 72 | end = time.time() 73 | for i, (input, target) in enumerate(test_loader): 74 | input = 
input.cuda() 75 | target = target.cuda() 76 | 77 | # compute output 78 | output = model(input) 79 | output = output.data.cpu().numpy() 80 | 81 | test_pred.append(output) 82 | test_pred = np.vstack(test_pred) 83 | 84 | if test_pred_tta is None: 85 | test_pred_tta = test_pred 86 | else: 87 | test_pred_tta += test_pred 88 | 89 | return test_pred_tta 90 | 91 | 92 | def main(): 93 | 94 | # 修改输入的路径 95 | df_train = pd.read_csv('../../input/train_face_value_label.csv', dtype={' label': object, 'name': object}) 96 | lbl = LabelEncoder() 97 | df_train['y'] = lbl.fit_transform(df_train[' label'].values) 98 | 99 | # 修改输入的路径 100 | test_path = glob.glob('../../input/public_test_data/*.jpg') 101 | test_path = np.array(test_path) 102 | 103 | test_loader = torch.utils.data.DataLoader( 104 | QRDataset(test_path, np.zeros(len(test_path)), 105 | transforms.Compose([ 106 | # transforms.Resize((124, 124)), 107 | transforms.Resize(280), 108 | transforms.RandomCrop((256, 256)), 109 | transforms.RandomHorizontalFlip(), 110 | transforms.RandomVerticalFlip(), 111 | transforms.ToTensor(), 112 | transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) 113 | ]) 114 | ), batch_size=10, shuffle=False, num_workers=10, pin_memory=True 115 | ) 116 | 117 | model = VisitNet() 118 | model = model.cuda() 119 | model.load_state_dict(torch.load('./resnet18_fold0_11_Acc@1100.00(100.00).pt')) 120 | 121 | test_pred = predict(test_loader, model, 10) 122 | test_pred = np.vstack(test_pred) 123 | test_pred = np.argmax(test_pred, 1) 124 | 125 | test_pred = lbl.inverse_transform(test_pred) 126 | test_csv = pd.DataFrame() 127 | test_csv['name'] = [x.split('/')[-1] for x in test_path] 128 | test_csv['label'] = test_pred 129 | test_csv.sort_values(by='name', inplace=True) 130 | test_csv.to_csv('tmp_newmodel_resnet18_tta10.csv', index=None, sep=',') 131 | 132 | if __name__== "__main__": 133 | main() 134 | -------------------------------------------------------------------------------- /competition/TinyMind人民币面值&冠字号编码识别挑战赛/task2/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/competition-baseline/99a1c3e3905573326fa2fc6d2ab2cc58286fee37/competition/TinyMind人民币面值&冠字号编码识别挑战赛/task2/.DS_Store -------------------------------------------------------------------------------- /competition/TinyMind人民币面值&冠字号编码识别挑战赛/task2/2_predict_faster_rcnn.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import os, glob, shutil, codecs 3 | 4 | import mxnet as mx 5 | from matplotlib import pyplot as plt 6 | import gluoncv 7 | from gluoncv import model_zoo, data, utils 8 | 9 | net = model_zoo.get_model('faster_rcnn_resnet50_v1b_voc', ctx=mx.gpu(0), pretrained=False) 10 | net.load_parameters('./faster_rcnn_resnet50_v1b_voc_0002_0.0519.params') 11 | net.classes = ['zipcode'] 12 | net.collect_params().reset_ctx(ctx = mx.gpu(0)) 13 | 14 | # MXNET_CUDNN_AUTOTUNE_DEFAULT=0 python 2_predict_faster_rcnn.py 15 | 16 | with codecs.open('./data/train_data_box.csv', 'w') as up: 17 | for path in glob.glob('../input/train_data/*.jpg'): 18 | orig_img_cv2 = cv2.imread(path) 19 | x, orig_img = data.transforms.presets.rcnn.load_test(path) 20 | x = x.as_in_context(mx.gpu(0)) 21 | box_ids, scores, bboxes = net(x) 22 | bboxes = bboxes.asnumpy()[0][0].astype(int) 23 | 24 | y1, x1, y2, x2 = bboxes 25 | x1*=(orig_img_cv2.shape[0]*1.0/orig_img.shape[0]) 26 | x2*=(orig_img_cv2.shape[0]*1.0/orig_img.shape[0]) 27 | 28 | 
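        # rescale the y coordinates the same way, using the width ratio between the original image and the resized inference image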
y1*=(orig_img_cv2.shape[1]*1.0/orig_img.shape[1]) 29 | y2*=(orig_img_cv2.shape[1]*1.0/orig_img.shape[1]) 30 | 31 | x1, x2 = int(x1), int(x2) 32 | y1, y2 = int(y1), int(y2) 33 | 34 | # x1-=10; x2+=10 35 | # y1-=10; y2+=10 36 | 37 | # plt.imshow(orig_img_cv2[int(x1):int(x2), int(y1):int(y2), :]) 38 | cv2.imwrite('./data/data/'+path.split('/')[-1], orig_img_cv2[int(x1):int(x2), int(y1):int(y2)]) 39 | up.write('{0},{1},{2},{3},{4}\n'.format(path, x1, y1, x2, y2)) 40 | 41 | with codecs.open('./data/public_test_data_box.csv', 'w') as up: 42 | for path in glob.glob('../input/public_test_data/*.jpg'): 43 | orig_img_cv2 = cv2.imread(path) 44 | x, orig_img = data.transforms.presets.rcnn.load_test(path) 45 | x = x.as_in_context(mx.gpu(0)) 46 | box_ids, scores, bboxes = net(x) 47 | bboxes = bboxes.asnumpy()[0][0].astype(int) 48 | 49 | y1, x1, y2, x2 = bboxes 50 | x1*=(orig_img_cv2.shape[0]*1.0/orig_img.shape[0]) 51 | x2*=(orig_img_cv2.shape[0]*1.0/orig_img.shape[0]) 52 | 53 | y1*=(orig_img_cv2.shape[1]*1.0/orig_img.shape[1]) 54 | y2*=(orig_img_cv2.shape[1]*1.0/orig_img.shape[1]) 55 | 56 | x1, x2 = int(x1), int(x2) 57 | y1, y2 = int(y1), int(y2) 58 | 59 | #x1-=10; x2+=10 60 | # y1-=10; y2+=10 61 | 62 | # plt.imshow(orig_img_cv2[int(x1):int(x2), int(y1):int(y2), :]) 63 | cv2.imwrite('./data/data/'+path.split('/')[-1], orig_img_cv2[int(x1):int(x2), int(y1):int(y2)]) 64 | up.write('{0},{1},{2},{3},{4}\n'.format(path, x1, y1, x2, y2)) 65 | 66 | with codecs.open('./data/private_test_data_box.csv', 'w') as up: 67 | for path in glob.glob('../input/private_test_data/*.jpg'): 68 | orig_img_cv2 = cv2.imread(path) 69 | x, orig_img = data.transforms.presets.rcnn.load_test(path) 70 | x = x.as_in_context(mx.gpu(0)) 71 | box_ids, scores, bboxes = net(x) 72 | bboxes = bboxes.asnumpy()[0][0].astype(int) 73 | 74 | y1, x1, y2, x2 = bboxes 75 | x1*=(orig_img_cv2.shape[0]*1.0/orig_img.shape[0]) 76 | x2*=(orig_img_cv2.shape[0]*1.0/orig_img.shape[0]) 77 | 78 | y1*=(orig_img_cv2.shape[1]*1.0/orig_img.shape[1]) 79 | y2*=(orig_img_cv2.shape[1]*1.0/orig_img.shape[1]) 80 | 81 | x1, x2 = int(x1), int(x2) 82 | y1, y2 = int(y1), int(y2) 83 | 84 | #x1-=10; x2+=10 85 | # y1-=10; y2+=10 86 | 87 | # plt.imshow(orig_img_cv2[int(x1):int(x2), int(y1):int(y2), :]) 88 | cv2.imwrite('./data/data/'+path.split('/')[-1], orig_img_cv2[int(x1):int(x2), int(y1):int(y2)]) 89 | up.write('{0},{1},{2},{3},{4}\n'.format(path, x1, y1, x2, y2)) -------------------------------------------------------------------------------- /competition/TinyMind人民币面值&冠字号编码识别挑战赛/task2/3_savejson.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import os, glob, shutil, codecs, json 3 | from tqdm import tqdm, tqdm_notebook 4 | # %pylab inline 5 | 6 | 7 | 8 | desc = {} 9 | desc['abc'] = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ' 10 | 11 | desc['train'] = [] 12 | desc['test'] = [] 13 | desc['pb'] = [] 14 | 15 | import pandas as pd 16 | df_train_label = pd.read_csv('../input/train_id_label.csv') 17 | df_submit = pd.read_csv('./crnn-pytorch/pb_rcnn_label.csv') 18 | df_submit['label'] = df_submit['label'].apply(lambda x: ' '+x) 19 | df_submit.columns = ['name', ' label'] 20 | 21 | df_train_label = pd.concat([df_train_label, df_submit], axis=0, ignore_index=True) 22 | print(df_train_label.shape) 23 | 24 | train_guanzi = df_train_label[' label'].apply(lambda x: x[-4:]).unique() 25 | 26 | 27 | def checkImageIsValid(imagePath): 28 | img = cv2.imread(imagePath) 29 | if img is None: 30 | return False 31 | 32 | with open(imagePath, 
'rb') as f: 33 | imageBin = f.read() 34 | 35 | if imageBin is None: 36 | return False 37 | 38 | try: 39 | imageBuf = np.fromstring(imageBin, dtype=np.uint8) 40 | img = cv2.imdecode(imageBuf, cv2.IMREAD_GRAYSCALE) 41 | imgH, imgW = img.shape[0], img.shape[1] 42 | if imgH * imgW == 0: 43 | return False 44 | return True 45 | except: 46 | return False 47 | 48 | bad_img_path = [] 49 | for x in df_train_label['name'].values: 50 | if not checkImageIsValid('./data/data/'+x): 51 | bad_img_path.append(x) 52 | 53 | 54 | import numpy as np 55 | from sklearn.model_selection import KFold, StratifiedKFold 56 | X = np.zeros((df_train_label['name'].shape[0], 2)) 57 | kf = KFold(n_splits=24) 58 | kf.get_n_splits(X) 59 | 60 | print(kf) 61 | fold_idx=0 62 | for train_index, test_index in kf.split(X, df_train_label[' label'].apply(lambda x:x[1:2])): 63 | print("TRAIN:", train_index, "TEST:", test_index) 64 | 65 | desc['fold'+str(fold_idx)+'_train'] = [] 66 | desc['fold'+str(fold_idx)+'_test'] = [] 67 | 68 | for row in df_train_label.iloc[train_index].iterrows(): 69 | # desc['fold'+str(fold_idx)+'_train'].append({'text':row[1][' label'].strip(), 'name':row[1]['name']}) 70 | # continue 71 | 72 | if row[1]['name'] in bad_img_path: 73 | continue 74 | 75 | if checkImageIsValid('./data/data/'+row[1]['name']): 76 | desc['fold'+str(fold_idx)+'_train'].append({'text':row[1][' label'].strip(), 'name':row[1]['name']}) 77 | else: 78 | print('./data/data/'+row[1]['name']) 79 | 80 | for row in df_train_label.iloc[test_index].iterrows(): 81 | # desc['fold'+str(fold_idx)+'_test'].append({'text':row[1][' label'].strip(), 'name':row[1]['name']}) 82 | # continue 83 | 84 | if row[1]['name'] in bad_img_path: 85 | continue 86 | 87 | if checkImageIsValid('./data/data/'+row[1]['name']): 88 | desc['fold'+str(fold_idx)+'_test'].append({'text':row[1][' label'].strip(), 'name':row[1]['name']}) 89 | else: 90 | print('./data/data/'+row[1]['name']) 91 | 92 | fold_idx+=1 93 | 94 | for row in glob.glob('../input/private_test_data/*'): 95 | desc['pb'].append({'text':'QJ69411105', 'name':row.split('/')[-1]}) 96 | 97 | with open('./data/desc.json', 'w') as outfile: 98 | json.dump(desc, outfile) -------------------------------------------------------------------------------- /competition/TinyMind人民币面值&冠字号编码识别挑战赛/task2/VOC2007.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/competition-baseline/99a1c3e3905573326fa2fc6d2ab2cc58286fee37/competition/TinyMind人民币面值&冠字号编码识别挑战赛/task2/VOC2007.zip -------------------------------------------------------------------------------- /competition/TinyMind人民币面值&冠字号编码识别挑战赛/task2/crnn-pytorch/README.md: -------------------------------------------------------------------------------- 1 | Convolutional Recurrent Neural Network 2 | ====================================== 3 | 4 | This software implements OCR system using CNN + RNN + CTCLoss, inspired by CRNN network. 5 | 6 | Usage 7 | ----- 8 | 9 | ` 10 | python ./train.py --help 11 | ` 12 | 13 | Demo 14 | ---- 15 | 16 | 1. Train simple OCR using TestDataset data generator. 17 | Training for ~60-100 epochs. 18 | ``` 19 | python train.py --test-init True --test-epoch 10 --output-dir 20 | ``` 21 | 22 | 2. Run test for trained model with visualization mode. 23 | ``` 24 | python test.py --snapshot /crnn_resnet18_10_best --visualize True 25 | ``` 26 | 27 | Train on custom dataset 28 | ----------------------- 29 | 30 | 1. 
Create dataset 31 | 32 | - Structure of dataset: 33 | ``` 34 | 35 | ---- data 36 | -------- 37 | ... 38 | -------- 39 | ---- desc.json 40 | ``` 41 | 42 | - Structure of desc.json: 43 | ``` 44 | { 45 | "abc": , 46 | "train": [ 47 | { 48 | "text": 49 | "name": 50 | }, 51 | ... 52 | { 53 | "text": 54 | "name": 55 | } 56 | ], 57 | "test": [ 58 | { 59 | "text": 60 | "name": 61 | }, 62 | ... 63 | { 64 | "text": 65 | "name": 66 | } 67 | ] 68 | } 69 | ``` 70 | 71 | 2. Train simple OCR using custom dataset. 72 | ``` 73 | python train.pt --test-init True --test-epoch 10 --output-dir --data-path 74 | ``` 75 | 76 | 3. Run test for trained model with visualization mode. 77 | ``` 78 | python test.py --snapshot /crnn_resnet18_10_best --visualize True --data-path 79 | ``` 80 | 81 | 82 | Dependence 83 | ---------- 84 | * pytorch 0.3.0 + 85 | * [warp-ctc](https://github.com/SeanNaren/warp-ctc) 86 | 87 | Articles 88 | -------- 89 | 90 | * [An End-to-End Trainable Neural Network for Image-based Sequence Recognition and Its Application to Scene Text Recognition](https://arxiv.org/abs/1507.05717) 91 | * [Connectionist Temporal Classification: Labelling Unsegmented Sequence Data with Recurrent Neural Networks](https://dl.acm.org/citation.cfm?id=1143891) 92 | -------------------------------------------------------------------------------- /competition/TinyMind人民币面值&冠字号编码识别挑战赛/task2/crnn-pytorch/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/competition-baseline/99a1c3e3905573326fa2fc6d2ab2cc58286fee37/competition/TinyMind人民币面值&冠字号编码识别挑战赛/task2/crnn-pytorch/__init__.py -------------------------------------------------------------------------------- /competition/TinyMind人民币面值&冠字号编码识别挑战赛/task2/crnn-pytorch/dataset/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/competition-baseline/99a1c3e3905573326fa2fc6d2ab2cc58286fee37/competition/TinyMind人民币面值&冠字号编码识别挑战赛/task2/crnn-pytorch/dataset/__init__.py -------------------------------------------------------------------------------- /competition/TinyMind人民币面值&冠字号编码识别挑战赛/task2/crnn-pytorch/dataset/collate_fn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | def text_collate(batch): 5 | img = list() 6 | seq = list() 7 | seq_len = list() 8 | for sample in batch: 9 | img.append(torch.from_numpy(sample["img"].transpose((2, 0, 1))).float()) 10 | seq.extend(sample["seq"]) 11 | seq_len.append(sample["seq_len"]) 12 | img = torch.stack(img) 13 | seq = torch.Tensor(seq).int() 14 | seq_len = torch.Tensor(seq_len).int() 15 | batch = {"img": img, "seq": seq, "seq_len": seq_len} 16 | return batch 17 | -------------------------------------------------------------------------------- /competition/TinyMind人民币面值&冠字号编码识别挑战赛/task2/crnn-pytorch/dataset/data_transform.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | import cv2 4 | import torch 5 | import albumentations.augmentations.functional as albumentations 6 | 7 | class ToTensor(object): 8 | def __call__(self, sample): 9 | sample["img"] = torch.from_numpy(sample["img"].transpose((2, 0, 1))).float() 10 | # sample["img"][0] = (sample["img"][0] - 0.485)/0.229 11 | # sample["img"][0] = (sample["img"][0] - 0.456)/0.224 12 | # sample["img"][0] = (sample["img"][0] - 0.406)/0.225 13 | 14 | 
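# Descriptive note (not in the original source): the label sequence is converted to an int
# tensor here because the CTC loss this CRNN is trained with (see the warp-ctc dependency in
# the README above) expects integer character indices, with 0 reserved for the blank symbol.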
sample["seq"] = torch.Tensor(sample["seq"]).int() 15 | return sample 16 | 17 | 18 | class Resize(object): 19 | def __init__(self, size=(320, 32)): 20 | self.size = size 21 | 22 | def __call__(self, sample): 23 | if sample["img"] is None: 24 | return np.zeros((320, 32, 3)) 25 | 26 | else: 27 | sample["img"] = cv2.resize(sample["img"], self.size) 28 | sample["img"] = sample["img"].astype(float)/255.0 29 | sample["img"][0] = (sample["img"][0] - 0.485)/0.229 30 | sample["img"][0] = (sample["img"][0] - 0.456)/0.224 31 | sample["img"][0] = (sample["img"][0] - 0.406)/0.225 32 | return sample 33 | 34 | 35 | class Rotation(object): 36 | def __init__(self, angle=5, fill_value=0, p = 0.5): 37 | self.angle = angle 38 | self.fill_value = fill_value 39 | self.p = p 40 | 41 | def __call__(self, sample): 42 | if np.random.uniform(0.0, 1.0) < self.p: 43 | return sample 44 | h,w,_ = sample["img"].shape 45 | ang_rot = np.random.uniform(self.angle) - self.angle/2 46 | transform = cv2.getRotationMatrix2D((w/2, h/2), ang_rot, 1) 47 | sample["img"] = cv2.warpAffine(sample["img"], transform, (w,h), borderValue = self.fill_value) 48 | return sample 49 | 50 | 51 | class Translation(object): 52 | def __init__(self, fill_value=0, p = 0.5): 53 | self.fill_value = fill_value 54 | self.p = p 55 | 56 | def __call__(self, sample): 57 | if np.random.uniform(0.0, 1.0) < self.p: 58 | return sample 59 | h,w,_ = sample["img"].shape 60 | trans_range = [w / 20, h / 20] 61 | tr_x = trans_range[0]*np.random.uniform()-trans_range[0]/2 62 | tr_y = trans_range[1]*np.random.uniform()-trans_range[1]/2 63 | transform = np.float32([[1,0, tr_x], [0,1, tr_y]]) 64 | sample["img"] = cv2.warpAffine(sample["img"], transform, (w,h), borderValue = self.fill_value) 65 | return sample 66 | 67 | 68 | class Scale(object): 69 | def __init__(self, scale=[0.5, 1.2], fill_value=0, p = 0.5): 70 | self.scale = scale 71 | self.fill_value = fill_value 72 | self.p = p 73 | 74 | def __call__(self, sample): 75 | if np.random.uniform(0.0, 1.0) < self.p: 76 | return sample 77 | h, w, _ = sample["img"].shape 78 | scale = np.random.uniform(self.scale[0], self.scale[1]) 79 | transform = np.float32([[scale, 0, 0],[0, scale, 0]]) 80 | sample["img"] = cv2.warpAffine(sample["img"], transform, (w,h), borderValue = self.fill_value) 81 | return sample 82 | 83 | # add lyz 84 | class Snow(object): 85 | def __init__(self, p = 0.5): 86 | self.p = p 87 | 88 | def __call__(self, sample): 89 | if np.random.uniform(0.0, 1.0) < self.p or not sample["aug"]: 90 | return sample 91 | h, w, _ = sample["img"].shape 92 | sample["img"] = albumentations.add_snow(sample["img"], snow_point=0.5, brightness_coeff=2) 93 | return sample 94 | 95 | class Contrast(object): 96 | def __init__(self, p = 0.5): 97 | self.p = p 98 | 99 | def __call__(self, sample): 100 | if np.random.uniform(0.0, 1.0) < self.p: 101 | return sample 102 | h, w, _ = sample["img"].shape 103 | sample["img"] = albumentations.brightness_contrast_adjust(sample["img"], beta=np.random.uniform(0.0, 1.0)+0.1) 104 | # sample["img"] = cv2.GaussianBlur(sample["img"],(3,3),0) 105 | return sample 106 | 107 | class Grid_distortion(object): 108 | def __init__(self, p = 0.5): 109 | self.p = p 110 | 111 | def __call__(self, sample): 112 | # print('grid', np.random.uniform(0.0, 1.0)) 113 | 114 | if np.random.uniform(0.0, 1.0) < self.p: 115 | return sample 116 | h, w, _ = sample["img"].shape 117 | 118 | # grid_distortion 119 | if np.random.uniform(0.0, 1.0) < self.p: 120 | num_steps=15 121 | distort_limit=[-0.05,0.05] 122 | stepsx = [1 + 
random.uniform(distort_limit[0], distort_limit[1]) for i in 123 | range(num_steps + 1)] 124 | stepsy = [1 + random.uniform(distort_limit[0], distort_limit[1]) for i in 125 | range(num_steps + 1)] 126 | sample["img"]=albumentations.grid_distortion(sample["img"],5,stepsx, stepsy) 127 | # elastic_transform 128 | else: 129 | sample["img"]=albumentations.elastic_transform(sample["img"], alpha=5, sigma=1, alpha_affine=random.uniform(0,2), 130 | interpolation=cv2.INTER_LINEAR, border_mode=cv2.BORDER_REFLECT_101,) 131 | 132 | if np.random.uniform(0.0, 1.0) < self.p-0.2: 133 | sample["img"]=albumentations.jpeg_compression(sample["img"], random.randint(20, 100)) 134 | return sample -------------------------------------------------------------------------------- /competition/TinyMind人民币面值&冠字号编码识别挑战赛/task2/crnn-pytorch/dataset/test_data.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import torch 4 | from torch.utils.data import Dataset 5 | import string 6 | import random 7 | 8 | class TestDataset(Dataset): 9 | def __init__(self, 10 | epoch_len = 10000, 11 | seq_len = 8, 12 | transform=None, 13 | abc=string.digits): 14 | super().__init__() 15 | self.abc = abc 16 | self.epoch_len = epoch_len 17 | self.seq_len = seq_len 18 | self.transform = transform 19 | 20 | def __len__(self): 21 | return self.epoch_len 22 | 23 | def get_abc(self): 24 | return self.abc 25 | 26 | def set_mode(self, mode='train'): 27 | return 28 | 29 | def generate_string(self): 30 | return ''.join(random.choice(self.abc) for _ in range(self.seq_len)) 31 | 32 | def get_sample(self): 33 | h, w = 64, int(self.seq_len * 64 * 2.5) 34 | pw = int(w / self.seq_len) 35 | seq = [] 36 | img = np.zeros((h, w), dtype=np.uint8) 37 | text = self.generate_string() 38 | for i in range(len(text)): 39 | c = text[i] 40 | seq.append(self.abc.find(c) + 1) 41 | hs, ws = 32, 32 42 | symb = np.zeros((hs, ws), dtype=np.uint8) 43 | font = cv2.FONT_HERSHEY_SIMPLEX 44 | cv2.putText(symb, str(c), (3, 30), font, 1.2, (255), 2, cv2.LINE_AA) 45 | # Rotation 46 | angle = 60 47 | ang_rot = np.random.uniform(angle) - angle/2 48 | transform = cv2.getRotationMatrix2D((ws/2, hs/2), ang_rot, 1) 49 | symb = cv2.warpAffine(symb, transform, (ws, hs), borderValue = 0) 50 | # Scale 51 | scale = np.random.uniform(0.7, 1.0) 52 | transform = np.float32([[scale, 0, 0],[0, scale, 0]]) 53 | symb = cv2.warpAffine(symb, transform, (ws, hs), borderValue = 0) 54 | y = np.random.randint(hs, h) 55 | x = np.random.randint(i * pw, (i + 1) * pw - ws) 56 | img[y-hs:y, x:x+ws] = symb 57 | nw = int(w * 32 / h) 58 | img = cv2.resize(img, (nw, 32)) 59 | img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) 60 | return img, seq 61 | 62 | def __getitem__(self, idx): 63 | img, seq = self.get_sample() 64 | sample = {"img": img, "seq": seq, "seq_len": len(seq), "aug": True} 65 | if self.transform: 66 | sample = self.transform(sample) 67 | return sample 68 | -------------------------------------------------------------------------------- /competition/TinyMind人民币面值&冠字号编码识别挑战赛/task2/crnn-pytorch/dataset/text_data.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import Dataset 2 | import json 3 | import os 4 | import cv2 5 | 6 | class TextDataset(Dataset): 7 | def __init__(self, data_path, mode="train", transform=None): 8 | super(Dataset, self).__init__() 9 | self.data_path = data_path 10 | self.mode = mode 11 | self.config = json.load(open(os.path.join(data_path, "desc.json"))) 
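# Descriptive note (not in the original source): desc.json is produced by the 3_savejson.py
# script earlier in this task (its "abc"/"train"/"test" layout is also documented in the
# crnn-pytorch README above). It holds an "abc" alphabet string plus one list per split,
# "train", "test", "pb" and "foldN_train"/"foldN_test", whose items look like
# {"name": image filename under data/, "text": label string}.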
12 | self.transform = transform 13 | 14 | def abc_len(self): 15 | return len(self.config["abc"]) 16 | 17 | def get_abc(self): 18 | return self.config["abc"] 19 | 20 | def set_mode(self, mode): 21 | self.mode = mode 22 | 23 | def __len__(self): 24 | if self.mode == "test": 25 | return len(self.config[self.mode]) 26 | return len(self.config[self.mode]) 27 | 28 | def __getitem__(self, idx): 29 | 30 | name = self.config[self.mode][idx]["name"] 31 | text = self.config[self.mode][idx]["text"] 32 | 33 | img = cv2.imread(os.path.join(self.data_path, "data", name)) 34 | # print(os.path.join(self.data_path, "data", name)) 35 | # img = cv2.imread(os.path.join(self.data_path, name)) 36 | seq = self.text_to_seq(text) 37 | sample = {"img": img, "seq": seq, "seq_len": len(seq), "aug": self.mode == "train"} 38 | if self.transform: 39 | # print('trans') 40 | sample = self.transform(sample) 41 | return sample 42 | 43 | def text_to_seq(self, text): 44 | seq = [] 45 | for c in text: 46 | seq.append(self.config["abc"].find(c) + 1) 47 | return seq -------------------------------------------------------------------------------- /competition/TinyMind人民币面值&冠字号编码识别挑战赛/task2/crnn-pytorch/fold_tta.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/competition-baseline/99a1c3e3905573326fa2fc6d2ab2cc58286fee37/competition/TinyMind人民币面值&冠字号编码识别挑战赛/task2/crnn-pytorch/fold_tta.pkl -------------------------------------------------------------------------------- /competition/TinyMind人民币面值&冠字号编码识别挑战赛/task2/crnn-pytorch/lr_policy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class StepLR(object): 5 | def __init__(self, optimizer, step_size=1000, max_iter=10000): 6 | self.optimizer = optimizer 7 | self.max_iter = max_iter 8 | self.step_size = step_size 9 | self.last_iter = -1 10 | self.base_lrs = list(map(lambda group: group['lr'], optimizer.param_groups)) 11 | 12 | def get_lr(self): 13 | return self.optimizer.param_groups[0]['lr'] 14 | 15 | def step(self, last_iter=None): 16 | if last_iter is not None: 17 | self.last_iter = last_iter 18 | if self.last_iter + 1 == self.max_iter: 19 | self.last_iter = -1 20 | self.last_iter = (self.last_iter + 1) % self.max_iter 21 | for ids, param_group in enumerate(self.optimizer.param_groups): 22 | param_group['lr'] = self.base_lrs[ids] * 0.8 ** ( self.last_iter // self.step_size ) 23 | -------------------------------------------------------------------------------- /competition/TinyMind人民币面值&冠字号编码识别挑战赛/task2/crnn-pytorch/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/competition-baseline/99a1c3e3905573326fa2fc6d2ab2cc58286fee37/competition/TinyMind人民币面值&冠字号编码识别挑战赛/task2/crnn-pytorch/models/__init__.py -------------------------------------------------------------------------------- /competition/TinyMind人民币面值&冠字号编码识别挑战赛/task2/crnn-pytorch/models/model_loader.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | import torch 4 | from torch import nn 5 | 6 | from .crnn import CRNN 7 | 8 | def load_weights(target, source_state): 9 | new_dict = OrderedDict() 10 | for k, v in target.state_dict().items(): 11 | if k in source_state and v.size() == source_state[k].size(): 12 | new_dict[k] = source_state[k] 13 | else: 14 | new_dict[k] = v 15 | 
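# Descriptive note (not in the original source): new_dict now holds the snapshot values for
# every parameter whose name and shape match, and the model's current (e.g. freshly
# initialised) values for everything else, so a snapshot from a slightly different
# architecture only restores the compatible layers.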
target.load_state_dict(new_dict) 16 | 17 | def load_model(abc, seq_proj=[0, 0], backend='resnet18', snapshot=None, cuda=True): 18 | net = CRNN(abc=abc, seq_proj=seq_proj, backend=backend) 19 | net = nn.DataParallel(net) 20 | if snapshot is not None: 21 | load_weights(net, torch.load(snapshot)) 22 | if cuda: 23 | net = net.cuda() 24 | return net 25 | -------------------------------------------------------------------------------- /competition/TinyMind人民币面值&冠字号编码识别挑战赛/task2/crnn-pytorch/submit.py: -------------------------------------------------------------------------------- 1 | def check_label(s): 2 | if '*' in s: 3 | return True 4 | if len(s) != 10: 5 | return True 6 | 7 | if len(set(s[3:]) & set(string.ascii_uppercase)) > 0: 8 | return True 9 | 10 | if s[0] in string.digits: 11 | return True 12 | 13 | if s[0] in string.ascii_uppercase and s[1] in string.ascii_uppercase and s[2] in string.ascii_uppercase: 14 | return True 15 | 16 | if s[0] in string.ascii_uppercase and s[1] in string.ascii_uppercase: 17 | return True 18 | elif s[0] in string.ascii_uppercase and s[2] in string.ascii_uppercase and s[1] in string.digits: 19 | return True 20 | else: 21 | return False 22 | 23 | 24 | import pandas as pd 25 | import string 26 | submit_df1 = pd.read_csv('./tmp_rcnn_tta10_pb.csv') 27 | submit_df2 = pd.read_csv('../multi-digit-pytorch/tmp_rcnn_tta10_cnn.csv') 28 | 29 | submit_df1.loc[submit_df1['name'] == 'OFTUHPVE.jpg', 'label'] = submit_df2[submit_df2['name'] == 'OFTUHPVE.jpg']['label'] 30 | submit_df1[~submit_df1['label'].apply(lambda x: check_label(x))] 31 | submit_df1.to_csv('tmp_rcnn_tta10_pb_submit.csv',index=None) -------------------------------------------------------------------------------- /competition/TinyMind人民币面值&冠字号编码识别挑战赛/task2/crnn-pytorch/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import string 4 | from tqdm import tqdm 5 | import click 6 | import numpy as np 7 | import pandas as pd 8 | import torch 9 | from torch.autograd import Variable 10 | from torch.utils.data import DataLoader 11 | 12 | from dataset.test_data import TestDataset 13 | from dataset.text_data import TextDataset 14 | from dataset.collate_fn import text_collate 15 | from dataset.data_transform import Resize, Rotation, Translation, Scale 16 | from models.model_loader import load_model 17 | from torchvision.transforms import Compose 18 | 19 | import editdistance 20 | 21 | def test(net, data, abc, cuda, visualize, batch_size=10): 22 | data_loader = DataLoader(data, batch_size=10, num_workers=1, shuffle=False, collate_fn=text_collate) 23 | 24 | error_idx = [] 25 | idx= 0 26 | count = 0.0 27 | tp = 0.0 28 | avg_ed = 0.0 29 | iterator = tqdm(data_loader) 30 | for sample in iterator: 31 | imgs = Variable(sample["img"]) 32 | if cuda: 33 | imgs = imgs.cuda() 34 | out = net(imgs, decode=True) 35 | gt = (sample["seq"].numpy() - 1).tolist() 36 | lens = sample["seq_len"].numpy().tolist() 37 | pos = 0 38 | key = '' 39 | for i in range(len(out)): 40 | gts = ''.join(abc[c] for c in gt[pos:pos+lens[i]]) 41 | pos += lens[i] 42 | 43 | if gts != out[i]: 44 | # print(out[i], gts, imgs.shape) 45 | error_idx.append(int(count)) 46 | if gts == out[i]: 47 | tp += 1.0 48 | else: 49 | avg_ed += editdistance.eval(out[i], gts) 50 | count += 1.0 51 | if not visualize: 52 | iterator.set_description("acc: {0:.4f}; avg_ed: {1:.4f}".format(tp / count, avg_ed / count)) 53 | idx+=1 54 | 55 | acc = tp / count 56 | avg_ed = avg_ed / count 57 | return acc, avg_ed, 
error_idx 58 | 59 | @click.command() 60 | @click.option('--data-path', type=str, default=None, help='Path to dataset') 61 | @click.option('--abc', type=str, default=string.digits+string.ascii_uppercase, help='Alphabet') 62 | @click.option('--seq-proj', type=str, default="10x20", help='Projection of sequence') 63 | @click.option('--backend', type=str, default="resnet18", help='Backend network') 64 | @click.option('--snapshot', type=str, default=None, help='Pre-trained weights') 65 | @click.option('--input-size', type=str, default="320x32", help='Input size') 66 | @click.option('--gpu', type=str, default='0', help='List of GPUs for parallel training, e.g. 0,1,2,3') 67 | @click.option('--visualize', type=bool, default=False, help='Visualize output') 68 | def main(data_path, abc, seq_proj, backend, snapshot, input_size, gpu, visualize): 69 | os.environ["CUDA_VISIBLE_DEVICES"] = gpu 70 | cuda = True if gpu is not '' else False 71 | 72 | input_size = [int(x) for x in input_size.split('x')] 73 | transform = Compose([ 74 | Rotation(), 75 | Resize(size=(input_size[0], input_size[1])) 76 | ]) 77 | if data_path is not None: 78 | data = TextDataset(data_path=data_path, mode="test", transform=transform) 79 | else: 80 | data = TestDataset(transform=transform, abc=abc) 81 | seq_proj = [int(x) for x in seq_proj.split('x')] 82 | net = load_model(data.get_abc(), seq_proj, backend, snapshot, cuda).eval() 83 | acc, avg_ed = test(net, data, data.get_abc(), cuda, visualize) 84 | 85 | df_submit = pd.DataFrame() 86 | 87 | print("Accuracy: {}".format(acc)) 88 | print("Edit distance: {}".format(avg_ed)) 89 | 90 | if __name__ == '__main__': 91 | main() 92 | -------------------------------------------------------------------------------- /competition/TinyMind人民币面值&冠字号编码识别挑战赛/task2/crnn-pytorch/test2.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2, glob 3 | import string 4 | from tqdm import tqdm 5 | import click 6 | import numpy as np 7 | import pandas as pd 8 | 9 | import torch 10 | from torch.autograd import Variable 11 | from torch.utils.data import DataLoader 12 | 13 | from dataset.test_data import TestDataset 14 | from dataset.text_data import TextDataset 15 | from dataset.collate_fn import text_collate 16 | from dataset.data_transform import Resize, Rotation, Translation, Scale, Contrast, Snow, Grid_distortion 17 | from models.model_loader import load_model 18 | from torchvision.transforms import Compose 19 | 20 | import editdistance 21 | 22 | def pred_to_string(pred): 23 | seq = [] 24 | for i in range(pred.shape[0]): 25 | label = np.argmax(pred[i]) 26 | seq.append(label - 1) 27 | out = [] 28 | for i in range(len(seq)): 29 | if len(out) == 0: 30 | if seq[i] != -1: 31 | out.append(seq[i]) 32 | else: 33 | if seq[i] != -1 and seq[i] != seq[i - 1]: 34 | out.append(seq[i]) 35 | out = ''.join('0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ'[i] for i in out) 36 | return out 37 | 38 | def decode(pred): 39 | seq = [] 40 | for i in range(pred.shape[0]): 41 | seq.append(pred_to_string(pred[i])) 42 | return seq 43 | 44 | def test(net, data, abc, cuda, visualize, batch_size=256): 45 | data_loader = DataLoader(data, batch_size=batch_size, num_workers=4, shuffle=False, collate_fn=text_collate) 46 | 47 | count = 0.0 48 | tp = 0.0 49 | avg_ed = 0.0 50 | pred_pb = [] 51 | iterator = tqdm(data_loader) 52 | for sample in iterator: 53 | imgs = Variable(sample["img"]) 54 | if cuda: 55 | imgs = imgs.cuda() 56 | out = net(imgs, decode=True) 57 | gt = (sample["seq"].numpy() - 
1).tolist() 58 | lens = sample["seq_len"].numpy().tolist() 59 | pos = 0 60 | key = '' 61 | for i in range(len(out)): 62 | gts = ''.join(abc[c] for c in gt[pos:pos+lens[i]]) 63 | pos += lens[i] 64 | pred_pb.append(out[i]) 65 | 66 | if gts == out[i]: 67 | tp += 1.0 68 | else: 69 | avg_ed += editdistance.eval(out[i], gts) 70 | count += 1.0 71 | if not visualize: 72 | iterator.set_description("acc: {0:.4f}; avg_ed: {0:.4f}".format(tp / count, avg_ed / count)) 73 | 74 | acc = tp / count 75 | avg_ed = avg_ed / count 76 | return acc, avg_ed, pred_pb 77 | 78 | 79 | def test_tta(net, data, abc, cuda, visualize, batch_size=256): 80 | pred_pb_tta = None 81 | 82 | for _ in range(10): 83 | data_loader = DataLoader(data, batch_size=batch_size, num_workers=4, shuffle=False, collate_fn=text_collate) 84 | iterator = tqdm(data_loader) 85 | 86 | pred_pb = [] 87 | for sample in iterator: 88 | imgs = Variable(sample["img"]) 89 | if cuda: 90 | imgs = imgs.cuda() 91 | out = net(imgs, decode=False) 92 | out = out.permute(1, 0, 2).cpu().data.numpy() 93 | 94 | pred_pb.append(out) 95 | 96 | if pred_pb_tta is None: 97 | pred_pb_tta = np.concatenate(pred_pb) 98 | else: 99 | pred_pb_tta += np.concatenate(pred_pb) 100 | return 0, 0, decode(pred_pb_tta) 101 | 102 | @click.command() 103 | @click.option('--data-path', type=str, default=None, help='Path to dataset') 104 | @click.option('--abc', type=str, default=string.digits+string.ascii_uppercase, help='Alphabet') 105 | @click.option('--seq-proj', type=str, default="10x20", help='Projection of sequence') 106 | @click.option('--backend', type=str, default="resnet34", help='Backend network') 107 | @click.option('--snapshot', type=str, default=None, help='Pre-trained weights') 108 | @click.option('--input-size', type=str, default="320x32", help='Input size') 109 | @click.option('--gpu', type=str, default='0', help='List of GPUs for parallel training, e.g. 
0,1,2,3') 110 | @click.option('--visualize', type=bool, default=False, help='Visualize output') 111 | def main(data_path, abc, seq_proj, backend, snapshot, input_size, gpu, visualize): 112 | os.environ["CUDA_VISIBLE_DEVICES"] = gpu 113 | cuda = True if gpu is not '' else False 114 | 115 | input_size = [int(x) for x in input_size.split('x')] 116 | transform = Compose([ 117 | Rotation(), 118 | Translation(), 119 | # Scale(), 120 | Contrast(), 121 | Grid_distortion(), 122 | Resize(size=(input_size[0], input_size[1])) 123 | ]) 124 | if data_path is not None: 125 | data = TextDataset(data_path=data_path, mode="pb", transform=transform) 126 | else: 127 | data = TestDataset(transform=transform, abc=abc) 128 | seq_proj = [int(x) for x in seq_proj.split('x')] 129 | net = load_model(data.get_abc(), seq_proj, backend, snapshot, cuda).eval() 130 | acc, avg_ed, pred_pb = test_tta(net, data, data.get_abc(), cuda, visualize) 131 | 132 | df_submit = pd.DataFrame() 133 | df_submit['name'] = [x.split('/')[-1] for x in glob.glob('../../input/public_test_data/*')] 134 | df_submit['label'] = pred_pb 135 | 136 | df_submit.to_csv('tmp_rcnn_tta10.csv', index=None) 137 | print("Accuracy: {}".format(acc)) 138 | print("Edit distance: {}".format(avg_ed)) 139 | 140 | if __name__ == '__main__': 141 | main() 142 | -------------------------------------------------------------------------------- /competition/TinyMind人民币面值&冠字号编码识别挑战赛/task2/data/data.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/competition-baseline/99a1c3e3905573326fa2fc6d2ab2cc58286fee37/competition/TinyMind人民币面值&冠字号编码识别挑战赛/task2/data/data.json -------------------------------------------------------------------------------- /competition/WSDM2022/README.md: -------------------------------------------------------------------------------- 1 | **WSDM会议(CCF B类会议)** 是涉及搜索和数据挖掘的网络启发研究的主要会议之一。WSDM Cup将于10月15日开始,一直持续2022年到1月下旬。 2 | 3 | > 比赛赛题解析录屏(11月28日):https://www.bilibili.com/video/BV1Ng411K7Jm/ 4 | 5 | ## User Retention Score Prediction 6 | 7 | http://challenge.ai.iqiyi.com/detail?raceId=61600f6cef1b65639cd5eaa6 8 | 9 | 举办方:iQIYI 10 | 11 | 赛题类型:用户留存预测、CTR类型 12 | 13 | ### 赛题背景 14 | 15 | 爱奇艺手机端APP,通过深度学习等最新的AI技术,提升用户个性化的产品体验,更好地让用户享受定制化的娱乐服务。我们用“N日留存分”这一关键指标来衡量用户的满意程度。 16 | 17 | 例如,如果一个用户10月1日的“7日留存分”等于3,代表这个用户接下来的7天里(10月2日~8日),有3天会访问爱奇艺APP。预测用户的留存分是个充满挑战的难题:不同用户本身的偏好、活跃度差异很大,另外用户可支配的娱乐时间、热门内容的流行趋势等其他因素,也有很强的周期性特征。 18 | 19 | ### 赛题任务 20 | 21 | 本次大赛基于爱奇艺APP脱敏和采样后的数据信息,预测用户的7日留存分。参赛队伍需要设计相应的算法进行数据分析和预测。 22 | 23 | ### 评价指标 24 | 本次比赛是一个数值预测类问题。评价函数使用:$100*(1-\frac{1}{n}\sum^n_1|\frac{F_t-A_t}{7}|)$ 25 | 26 | $n$是测试集用户数量,$F$是参赛者对用户的7日留存分预测值,$A$是真实的7日留存分真实值。 27 | 28 | ### 赛题开源 29 | 30 | - [第一名思路](https://zhuanlan.zhihu.com/p/462736790), [代码](https://github.com/hansu1017/WSDM2022-Retention-Score-Prediction) 31 | - [第三名代码](https://github.com/Chenfei-Kang/2022_WSDM_iQiYi_Retention_Score_Prediction) 32 | 33 | ### 其他开源 34 | 35 | - [`举办方`开源了84.5分数的代码](http://challenge.ai.iqiyi.com/detail?raceId=61600f6cef1b65639cd5eaa6),基于Keras,需要32G内存 + 4G GPU 36 | - [`阿水`基于举办方改写了模型代码](https://aistudio.baidu.com/aistudio/projectdetail/2715522),线上85.5,基于PaddlePaddle,需要32G内存 + 4G GPU 37 | - [`第一次打比赛`只使用了两个特征](https://github.com/LogicJake/competition_baselines/tree/master/competitions/wsdm_iqiyi_torch),基于Pytorch,需要8G内存 + 4G GPU 38 | 39 | ## Temporal Link Prediction 40 | 41 | https://www.dgl.ai/WSDM2022-Challenge/ 42 | 43 | 举办方:Intel / Amazon 44 | 45 | 比赛类型:图算法 46 | 47 | ### 赛题背景 48 | 49 | Temporal Link 
Prediction是时间图上的经典任务之一。与询问部分观察图上两个节点之间是否存在边的链接预测相反,时间链接预测询问在给定时间跨度内两个节点之间是否存在边。 50 | 51 | 它比传统的链接预测更有用,因为可以围绕模型构建多个应用程序,例如预测电子商务中客户的需求,或预测社交网络中将发生什么事件等。 52 | 53 | ### 赛题任务 54 | 55 | 在这个挑战中,我们希望有一个模型可以同时处理两种数据: 56 | 57 | - 数据集 A:以实体为节点,以不同类型的事件为边的动态事件图。 58 | - 数据集 B:用户-项目图,以用户和项目为节点,以不同类型的交互为边。 59 | 60 | 该任务将预测在给定时间戳之前两个给定节点之间是否存在给定类型的边。 61 | 62 | 63 | ### 评价指标 64 | 65 | 使用 ROC 下的面积 (AUC) 作为两个数据集的评估指标,并使用两个$AUC$的调和平均值作为提交的分数。 66 | 67 | 具体来说设$AUC_A$和$AUC_B$分别为数据集A和数据集B的$AUC$。 68 | 69 | ## Cross- Market Recommendation 70 | 71 | https://xmrec.github.io/wsdmcup/ 72 | 73 | 举办方:University of Amsterdam / University of Massachusetts Amherst / Amazon 74 | 75 | 比赛类型:推荐系统 76 | 77 | ### 赛题背景 78 | 79 | 电子商务公司通常跨市场运营;例如亚马逊已将业务和销售扩展到全球18 个市场(即国家/地区)。跨市场推荐涉及通过利用类似的高资源市场的数据向目标市场的用户推荐相关产品的问题,例如利用美国市场的数据改进目标市场的推荐。 80 | 81 | 然而关键的挑战是数据,例如用户与产品的交互数据(点击、购买、评论),传达了个别市场的某些偏见。因此在源市场上训练的算法在不同的目标市场不一定有效。 82 | 83 | ### 赛题目标 84 | 85 | 在本次WSDM杯挑战赛中,我们提供不同市场的用户购买和评分数据,目标是通过利用来自类似辅助市场的数据来改进这些目标市场中的个人推荐系统。 86 | 87 | ### 评估指标 88 | 89 | 使用NDCG@10进行评估,项目的分数为每个用户排序,前10个项目被考虑进行评估。 90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /competition/biendata-智源&计算所-互联网虚假新闻检测挑战赛/README.md: -------------------------------------------------------------------------------- 1 | https://www.biendata.com/competition/falsenews/ 2 | 3 | - task1:直接使用bert,单折在91.0精度 4 | 5 | 冠军分享:https://mp.weixin.qq.com/s/jS_QUezLyBzfOBeiHN_gkQ 6 | 7 | 冠军代码:https://www.biendata.com/models/category/3529/L_notebook/ 8 | -------------------------------------------------------------------------------- /competition/kaggle-allstate-claims-severity/README.md: -------------------------------------------------------------------------------- 1 | https://www.kaggle.com/c/allstate-claims-severity/ 2 | -------------------------------------------------------------------------------- /competition/kaggle-allstate-claims-severity/nn_bagging_1111.84364.py: -------------------------------------------------------------------------------- 1 | 2 | ''' 3 | Author: Danijel Kivaranovic 4 | Title: Neural network (Keras) with sparse data 5 | ''' 6 | 7 | ## import libraries 8 | import numpy as np 9 | np.random.seed(123) 10 | 11 | import pandas as pd 12 | import subprocess 13 | from scipy.sparse import csr_matrix, hstack 14 | from sklearn.metrics import mean_absolute_error 15 | from sklearn.preprocessing import StandardScaler 16 | from sklearn.model_selection import KFold 17 | from keras.models import Sequential 18 | from keras.layers import Dense, Dropout, Activation 19 | from keras.layers.normalization import BatchNormalization 20 | from keras.layers.advanced_activations import PReLU 21 | 22 | ## Batch generators ################################################################################################################################## 23 | 24 | def batch_generator(X, y, batch_size, shuffle): 25 | #chenglong code for fiting from generator (https://www.kaggle.com/c/talkingdata-mobile-user-demographics/forums/t/22567/neural-network-for-sparse-matrices) 26 | number_of_batches = np.ceil(X.shape[0]/batch_size) 27 | counter = 0 28 | sample_index = np.arange(X.shape[0]) 29 | if shuffle: 30 | np.random.shuffle(sample_index) 31 | while True: 32 | batch_index = sample_index[batch_size*counter:batch_size*(counter+1)] 33 | X_batch = X[batch_index,:].toarray() 34 | y_batch = y[batch_index] 35 | counter += 1 36 | yield X_batch, y_batch 37 | if (counter == number_of_batches): 38 | if shuffle: 39 | np.random.shuffle(sample_index) 40 | counter = 0 41 
| 42 | def batch_generatorp(X, batch_size, shuffle): 43 | number_of_batches = X.shape[0] / np.ceil(X.shape[0]/batch_size) 44 | counter = 0 45 | sample_index = np.arange(X.shape[0]) 46 | while True: 47 | batch_index = sample_index[batch_size * counter:batch_size * (counter + 1)] 48 | X_batch = X[batch_index, :].toarray() 49 | counter += 1 50 | yield X_batch 51 | if (counter == number_of_batches): 52 | counter = 0 53 | 54 | ######################################################################################################################################################## 55 | 56 | ## read data 57 | train = pd.read_csv('../input/train.csv') 58 | test = pd.read_csv('../input/test.csv') 59 | 60 | index = list(train.index) 61 | print (index[0:10]) 62 | np.random.shuffle(index) 63 | print (index[0:10]) 64 | train = train.iloc[index] 65 | 'train = train.iloc[np.random.permutation(len(train))]' 66 | 67 | ## set test loss to NaN 68 | test['loss'] = np.nan 69 | 70 | ## response and IDs 71 | y = np.log(train['loss'].values+200) 72 | id_train = train['id'].values 73 | id_test = test['id'].values 74 | 75 | ## stack train test 76 | ntrain = train.shape[0] 77 | tr_te = pd.concat((train, test), axis = 0) 78 | 79 | ## Preprocessing and transforming to sparse data 80 | sparse_data = [] 81 | 82 | f_cat = [f for f in tr_te.columns if 'cat' in f] 83 | for f in f_cat: 84 | dummy = pd.get_dummies(tr_te[f].astype('category')) 85 | tmp = csr_matrix(dummy) 86 | sparse_data.append(tmp) 87 | 88 | f_num = [f for f in tr_te.columns if 'cont' in f] 89 | scaler = StandardScaler() 90 | tmp = csr_matrix(scaler.fit_transform(tr_te[f_num])) 91 | sparse_data.append(tmp) 92 | 93 | del(tr_te, train, test) 94 | 95 | ## sparse train and test data 96 | xtr_te = hstack(sparse_data, format = 'csr') 97 | xtrain = xtr_te[:ntrain, :] 98 | xtest = xtr_te[ntrain:, :] 99 | 100 | print('Dim train', xtrain.shape) 101 | print('Dim test', xtest.shape) 102 | 103 | del(xtr_te, sparse_data, tmp) 104 | 105 | ## neural net 106 | def nn_model(): 107 | model = Sequential() 108 | 109 | model.add(Dense(400, input_dim = xtrain.shape[1], init = 'he_normal')) 110 | model.add(PReLU()) 111 | model.add(BatchNormalization()) 112 | model.add(Dropout(0.4)) 113 | 114 | model.add(Dense(200, init = 'he_normal')) 115 | model.add(PReLU()) 116 | model.add(BatchNormalization()) 117 | model.add(Dropout(0.2)) 118 | 119 | model.add(Dense(50, init = 'he_normal')) 120 | model.add(PReLU()) 121 | model.add(BatchNormalization()) 122 | model.add(Dropout(0.2)) 123 | 124 | model.add(Dense(1, init = 'he_normal')) 125 | model.compile(loss = 'mae', optimizer = 'adadelta') 126 | return(model) 127 | 128 | ## cv-folds 129 | nfolds = 10 130 | folds = KFold(n_splits = nfolds, shuffle = True, random_state = 111) 131 | 132 | ## train models 133 | i = 0 134 | nbags = 10 135 | nepochs = 55 136 | pred_oob = np.zeros(xtrain.shape[0]) 137 | pred_test = np.zeros(xtest.shape[0]) 138 | 139 | for (inTr, inTe) in folds.split(xtrain): 140 | xtr = xtrain[inTr] 141 | ytr = y[inTr] 142 | xte = xtrain[inTe] 143 | yte = y[inTe] 144 | pred = np.zeros(xte.shape[0]) 145 | for j in range(nbags): 146 | model = nn_model() 147 | fit = model.fit_generator(generator = batch_generator(xtr, ytr, 128, True), 148 | nb_epoch = nepochs, 149 | samples_per_epoch = xtr.shape[0], 150 | verbose = 1) 151 | pred += np.exp(model.predict_generator(generator = batch_generatorp(xte, 800, False), val_samples = xte.shape[0])[:,0])-200 152 | pred_test += np.exp(model.predict_generator(generator = batch_generatorp(xtest, 800, 
False), val_samples = xtest.shape[0])[:,0])-200 153 | pred /= nbags 154 | pred_oob[inTe] = pred 155 | score = mean_absolute_error(np.exp(yte)-200, pred) 156 | i += 1 157 | print('Fold ', i, '- MAE:', score) 158 | 159 | print('Total - MAE:', mean_absolute_error(np.exp(y)-200, pred_oob)) 160 | 161 | ## train predictions 162 | df = pd.DataFrame({'id': id_train, 'loss': pred_oob}) 163 | df.to_csv('preds_oob.csv', index = False) 164 | 165 | ## test predictions 166 | pred_test /= (nfolds*nbags) 167 | df = pd.DataFrame({'id': id_test, 'loss': pred_test}) 168 | df.to_csv('submission_keras_shift_perm.csv', index = False) -------------------------------------------------------------------------------- /competition/kaggle-atecup-deepfake/README.md: -------------------------------------------------------------------------------- 1 | 本赛事由蚂蚁集团主办,在全球知名的数据科学竞赛平台Kaggle进行。赛事针对“AI换脸”的欺诈风险进行攻防实战演练,设立了100万元人民币的奖金池,鼓励推动AI向善的技术人才。 2 | 3 | 近年来,“AI换脸”诈骗事件频发,面对全球范围的技术挑战,大赛设立了百万奖金池,分设图片赛道和音视频赛道,在此诚邀全球的学者、工程师、教育者、学生及独立开发者积极参与。 4 | 5 | 6 | - 赛道一:图像赛道,确定给定的人脸图像是否是深度伪造图像,并输出其为深度伪造图像的概率。 7 | - 赛道二:音视频赛道,确定包含人脸的视频(带音频)是否是Deepfake视频,并输出其深度伪造音视频的概率。 8 | 9 | 赛事地址:https://www.atecup.cn/deepfake 10 | -------------------------------------------------------------------------------- /competition/kaggle-quickdraw-doodle-recognition/1_save2df.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import os, sys, codecs, glob 5 | import numpy as np 6 | import pandas as pd 7 | import cv2 8 | 9 | from sklearn.preprocessing import LabelEncoder 10 | from sklearn.cross_validation import train_test_split 11 | 12 | # 读取单个csv文件 13 | def read_df(path, nrows): 14 | print('Reading...', path) 15 | if nrows.isdigit(): 16 | return pd.read_csv(path, nrows=int(nrows), parse_dates=['timestamp']) 17 | else: 18 | return pd.read_csv(path, parse_dates=['timestamp']) 19 | 20 | # 读取多个csv文件 21 | def contcat_df(paths, nrows): 22 | dfs = [] 23 | for path in paths: 24 | dfs.append(read_df(path, nrows)) 25 | return pd.concat(dfs, axis=0, ignore_index=True) 26 | 27 | def main(): 28 | if not os.path.exists('./data'): 29 | os.mkdir('./data') 30 | 31 | CLASSES_CSV = glob.glob('../input/train_simplified/*.csv') 32 | CLASSES = [x.split('/')[-1][:-4] for x in CLASSES_CSV] 33 | 34 | print('Reading data...') 35 | df = contcat_df(CLASSES_CSV, number) 36 | df = df.reindex(np.random.permutation(df.index)) 37 | 38 | lbl = LabelEncoder().fit(df['word']) 39 | df['word'] = lbl.transform(df['word']) 40 | 41 | if df.shape[0] * 0.05 < 120000: 42 | df_train, df_val = train_test_split(df, test_size=0.05) 43 | else: 44 | df_train, df_val = df.iloc[:-500000], df.iloc[-500000:] 45 | 46 | print('Train:', df_train.shape[0], 'Val', df_val.shape[0]) 47 | print('Save data...') 48 | df_train.to_pickle(os.path.join('./data/', 'train_' + str(number) + '.pkl')) 49 | df_val.to_pickle(os.path.join('./data/', 'val_' + str(number) + '.pkl')) 50 | 51 | # python 1_save2df.py 50000 52 | # python 1_save2df.py all 53 | if __name__ == "__main__": 54 | number = str(sys.argv[1]) 55 | main() -------------------------------------------------------------------------------- /competition/kaggle-quickdraw-doodle-recognition/README.md: -------------------------------------------------------------------------------- 1 | https://quickdraw.withgoogle.com/ 2 | 3 | https://www.kaggle.com/c/quickdraw-doodle-recognition/ 4 | -------------------------------------------------------------------------------- 
/competition/kaggle-two-sigma-connect-rental-listing-inquiries/README.md: -------------------------------------------------------------------------------- 1 | https://www.kaggle.com/c/two-sigma-connect-rental-listing-inquiries/ 2 | -------------------------------------------------------------------------------- /competition/yanxishe-IMDB评论剧透检测/README.md: -------------------------------------------------------------------------------- 1 | # IMDB评论剧透检测 2 | ## 竞赛链接 3 | https://god.yanxishe.com/20 4 | ## score 5 | 74.729 6 | ## 操作说明 7 | 数据放在data目录下 8 | 执行ml.ipynb 9 | ## 优化方向 10 | baseline中只利用了review_text信息 11 | #### 文本方向 12 | review_summary,以及IMDB_movie_details.json信息进行挖掘 13 | #### 时序方向 14 | review_date进行挖掘 15 | #### 其他方向 16 | movie_id,user_id,rating进行挖掘 17 | -------------------------------------------------------------------------------- /competition/yanxishe-人脸年龄识别/2_predit.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os, sys, glob, argparse 3 | import pandas as pd 4 | import numpy as np 5 | from tqdm import tqdm 6 | 7 | import time, datetime 8 | import pdb, traceback 9 | 10 | import cv2 11 | # import imagehash 12 | from PIL import Image 13 | 14 | from sklearn.model_selection import train_test_split, StratifiedKFold, KFold 15 | 16 | from efficientnet_pytorch import EfficientNet 17 | # model = EfficientNet.from_pretrained('efficientnet-b4') 18 | 19 | import torch 20 | torch.manual_seed(0) 21 | torch.backends.cudnn.deterministic = False 22 | torch.backends.cudnn.benchmark = True 23 | 24 | import torchvision.models as models 25 | import torchvision.transforms as transforms 26 | import torchvision.datasets as datasets 27 | import torch.nn as nn 28 | import torch.nn.functional as F 29 | import torch.optim as optim 30 | from torch.autograd import Variable 31 | from torch.utils.data.dataset import Dataset 32 | 33 | class QRDataset(Dataset): 34 | def __init__(self, img_path, transform=None): 35 | self.img_path = img_path 36 | if transform is not None: 37 | self.transform = transform 38 | else: 39 | self.transform = None 40 | 41 | def __getitem__(self, index): 42 | start_time = time.time() 43 | img = Image.open(self.img_path[index]).convert('RGB') 44 | 45 | if self.transform is not None: 46 | img = self.transform(img) 47 | return img,torch.from_numpy(np.array(int('PNEUMONIA' in self.img_path[index]))) 48 | 49 | def __len__(self): 50 | return len(self.img_path) 51 | 52 | class VisitNet(nn.Module): 53 | def __init__(self): 54 | super(VisitNet, self).__init__() 55 | 56 | # model = models.resnet18(True) 57 | # model.avgpool = nn.AdaptiveAvgPool2d(1) 58 | # model.fc = nn.Linear(512, 2) 59 | # self.resnet = model 60 | 61 | model = EfficientNet.from_pretrained('efficientnet-b0') 62 | model._fc = nn.Linear(1280, 100) 63 | self.resnet = model 64 | 65 | def forward(self, img): 66 | out = self.resnet(img) 67 | return out 68 | 69 | def predict(test_loader, model, tta=10): 70 | # switch to evaluate mode 71 | model.eval() 72 | 73 | test_pred_tta = None 74 | for _ in range(tta): 75 | test_pred = [] 76 | with torch.no_grad(): 77 | end = time.time() 78 | for i, (input, target) in enumerate(test_loader): 79 | input = input.cuda() 80 | target = target.cuda() 81 | 82 | # compute output 83 | output = model(input) 84 | output = output.data.cpu().numpy() 85 | 86 | test_pred.append(output) 87 | test_pred = np.vstack(test_pred) 88 | 89 | if test_pred_tta is None: 90 | test_pred_tta = test_pred 91 | else: 92 | test_pred_tta += test_pred 93 | 94 | 
return test_pred_tta 95 | 96 | test_jpg = ['../face_age_dataset/test/{0}.png'.format(x) for x in range(1, 1805)] 97 | test_jpg = np.array(test_jpg) 98 | 99 | test_pred = None 100 | for model_path in ['resnet18_fold0.pt', 'resnet18_fold1.pt', 'resnet18_fold2.pt', 'resnet18_fold3.pt']: 101 | 102 | test_loader = torch.utils.data.DataLoader( 103 | QRDataset(test_jpg, 104 | transforms.Compose([ 105 | transforms.Resize((224, 224)), 106 | transforms.RandomHorizontalFlip(), 107 | transforms.RandomVerticalFlip(), 108 | transforms.ToTensor(), 109 | transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) 110 | ]) 111 | ), batch_size=10, shuffle=False, num_workers=10, pin_memory=True 112 | ) 113 | 114 | 115 | model = VisitNet().cuda() 116 | model = nn.DataParallel(model).cuda() 117 | model.load_state_dict(torch.load(model_path)) 118 | # model = nn.DataParallel(model).cuda() 119 | if test_pred is None: 120 | test_pred = predict(test_loader, model, 5) 121 | else: 122 | test_pred += predict(test_loader, model, 5) 123 | 124 | test_csv = pd.DataFrame() 125 | test_csv[0] = list(range(1, 1805)) 126 | test_csv[1] = np.argmax(test_pred, 1) 127 | test_csv[1] = test_csv[1].apply(lambda x: str(x).zfill(3)) 128 | test_csv.to_csv('tmp.csv', index=None, header=None) -------------------------------------------------------------------------------- /competition/yanxishe-人脸年龄识别/README.md: -------------------------------------------------------------------------------- 1 | 比赛链接:https://god.yanxishe.com/10 2 | 3 | 使用IMDB-WIKI数据集进行pretrain,再到比赛数据集finetune; 4 | https://data.vision.ee.ethz.ch/cvl/rrothe/imdb-wiki/ 5 | 6 | 单模型4折,就可以达到25准确率,复现Top5成绩; 7 | 8 | ``` 9 | python3 1_train.py 10 | python3 2_predict.py 11 | ``` 12 | 13 | 人脸年龄识别练习赛冠军源码_1575964312087.zip为比赛前三名的代码; 14 | -------------------------------------------------------------------------------- /competition/yanxishe-人脸年龄识别/人脸年龄识别练习赛冠军源码_1575964312087.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/competition-baseline/99a1c3e3905573326fa2fc6d2ab2cc58286fee37/competition/yanxishe-人脸年龄识别/人脸年龄识别练习赛冠军源码_1575964312087.zip -------------------------------------------------------------------------------- /competition/yanxishe-喵脸关键点检测/2_predict.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os, sys, glob, argparse 3 | import pandas as pd 4 | import numpy as np 5 | from tqdm import tqdm 6 | 7 | import time, datetime 8 | import pdb, traceback 9 | 10 | import cv2 11 | # import imagehash 12 | from PIL import Image 13 | 14 | from sklearn.model_selection import train_test_split, StratifiedKFold, KFold 15 | 16 | from efficientnet_pytorch import EfficientNet 17 | # model = EfficientNet.from_pretrained('efficientnet-b4') 18 | 19 | import torch 20 | torch.manual_seed(0) 21 | torch.backends.cudnn.deterministic = False 22 | torch.backends.cudnn.benchmark = True 23 | 24 | import torchvision.models as models 25 | import torchvision.transforms as transforms 26 | import torchvision.datasets as datasets 27 | import torch.nn as nn 28 | import torch.nn.functional as F 29 | import torch.optim as optim 30 | from torch.autograd import Variable 31 | from torch.utils.data.dataset import Dataset 32 | 33 | class QRDataset(Dataset): 34 | def __init__(self, img_path, transform=None): 35 | self.img_path = img_path 36 | if transform is not None: 37 | self.transform = transform 38 | else: 39 | self.transform = None 40 | 41 | def 
__getitem__(self, index): 42 | start_time = time.time() 43 | img = Image.open(self.img_path[index]).convert('RGB') 44 | 45 | if self.transform is not None: 46 | img = self.transform(img) 47 | return img,torch.from_numpy(np.array(int('PNEUMONIA' in self.img_path[index]))) 48 | 49 | def __len__(self): 50 | return len(self.img_path) 51 | 52 | class VisitNet(nn.Module): 53 | def __init__(self): 54 | super(VisitNet, self).__init__() 55 | 56 | # model = models.resnet18(True) 57 | # model.avgpool = nn.AdaptiveAvgPool2d(1) 58 | # model.fc = nn.Linear(512, 2) 59 | # self.resnet = model 60 | 61 | model = EfficientNet.from_pretrained('efficientnet-b0') 62 | model._fc = nn.Linear(1280, 18) 63 | self.resnet = model 64 | 65 | def forward(self, img): 66 | out = self.resnet(img) 67 | return out 68 | 69 | def predict(test_loader, model, tta=10): 70 | # switch to evaluate mode 71 | model.eval() 72 | 73 | test_pred_tta = None 74 | for _ in range(tta): 75 | test_pred = [] 76 | with torch.no_grad(): 77 | end = time.time() 78 | for i, (input, target) in tqdm(enumerate(test_loader), total=len(test_loader)): 79 | input = input.cuda() 80 | target = target.cuda() 81 | 82 | # compute output 83 | output = model(input) 84 | output = output.data.cpu().numpy() 85 | 86 | test_pred.append(output) 87 | test_pred = np.vstack(test_pred) 88 | 89 | if test_pred_tta is None: 90 | test_pred_tta = test_pred 91 | else: 92 | test_pred_tta += test_pred 93 | 94 | return test_pred_tta 95 | 96 | test_jpg = ['../test/{0}.jpg'.format(x) for x in range(0, 9526)] 97 | test_jpg = np.array(test_jpg) 98 | 99 | test_pred = None 100 | for model_path in ['./resnet18_fold4.pt']: 101 | 102 | test_loader = torch.utils.data.DataLoader( 103 | QRDataset(test_jpg, 104 | transforms.Compose([ 105 | transforms.Resize((512, 512)), 106 | # transforms.RandomHorizontalFlip(), 107 | # transforms.RandomVerticalFlip(), 108 | transforms.ToTensor(), 109 | transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) 110 | ]) 111 | ), batch_size=20, shuffle=False, num_workers=10, pin_memory=True 112 | ) 113 | 114 | 115 | model = VisitNet().cuda() 116 | model.load_state_dict(torch.load(model_path)) 117 | # model = nn.DataParallel(model).cuda() 118 | if test_pred is None: 119 | test_pred = predict(test_loader, model, 1) 120 | else: 121 | test_pred += predict(test_loader, model, 1) 122 | 123 | # test_csv = pd.DataFrame() 124 | # test_csv[0] = list(range(0, 1047)) 125 | # test_csv[1] = np.argmax(test_pred, 1) 126 | # test_csv.to_csv('tmp.csv', index=None, header=None) 127 | 128 | test_pred = pd.DataFrame(test_pred) 129 | test_pred.columns = ['left_eye_x', 'left_eye_y', 'right_eye_x', 'right_eye_y', 130 | 'mouth_x', 'mouth_y', 'left_ear1_x', 'left_ear1_y', 'left_ear2_x', 131 | 'left_ear2_y', 'left_ear3_x', 'left_ear3_y', 'right_ear1_x', 132 | 'right_ear1_y', 'right_ear2_x', 'right_ear2_y', 'right_ear3_x', 133 | 'right_ear3_y'] 134 | test_pred = test_pred.reset_index() 135 | 136 | img_size = [] 137 | for idx in (range(9526)): 138 | img_size.append(cv2.imread('../test/{0}.jpg'.format(idx)).shape[:2]) 139 | 140 | img_size = np.vstack(img_size) 141 | test_pred['height'] = img_size[:, 0] 142 | test_pred['width'] = img_size[:, 1] 143 | 144 | for col in test_pred.columns: 145 | if '_x' in col: 146 | test_pred[col]*=test_pred['width'] 147 | elif '_y' in col: 148 | test_pred[col]*=test_pred['height'] 149 | 150 | test_pred.astype(int).iloc[:, :-2].to_csv('tmp.csv', index=None, header=None) -------------------------------------------------------------------------------- 
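The prediction script above multiplies the 18 network outputs by each image's width and height, which implies the model regresses keypoints normalised to [0, 1]. Below is a minimal sketch of the matching label-preparation step, assuming train_box.csv stores absolute pixel coordinates under the same column names and that training images live in ../train/; the actual logic is in 1_train.py and may differ.

```python
import cv2
import pandas as pd

# Hypothetical sketch: turn absolute keypoint labels into the [0, 1] targets that
# 2_predict.py (above) maps back to pixels with each image's width and height.
df = pd.read_csv('train_box.csv')  # assumed columns: name, left_eye_x, ..., right_ear3_y

norm_targets = []
for _, row in df.iterrows():
    h, w = cv2.imread('../train/' + row['name']).shape[:2]  # assumed training-image folder
    coords = []
    for col in df.columns:
        if col.endswith('_x'):
            coords.append(row[col] / w)   # x normalised by image width
        elif col.endswith('_y'):
            coords.append(row[col] / h)   # y normalised by image height
    norm_targets.append(coords)           # 18 regression targets per image
```

Normalising by image size lets the fixed 512x512 network input used above share one regression head across photos of different resolutions.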
/competition/yanxishe-喵脸关键点检测/README.md: -------------------------------------------------------------------------------- 1 | https://god.yanxishe.com/19 2 | 3 | 利用CNN进行回归预测 4 | 5 | ``` 6 | python3 1_train.py 7 | python3 2_predict.py 8 | ``` 9 | -------------------------------------------------------------------------------- /competition/yanxishe-白葡萄酒品质预测/README.md: -------------------------------------------------------------------------------- 1 | https://god.yanxishe.com/15 2 | 3 | lgb线上得分96.2667 4 | 5 | -------------------------------------------------------------------------------- /competition/yanxishe-白葡萄酒品质预测/lgb_baseline.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import lightgbm as lgb 4 | from sklearn.preprocessing import LabelEncoder 5 | from itertools import combinations 6 | 7 | train_df = pd.read_csv('../input/train.csv', header=None, sep=';') 8 | test_df = pd.read_csv('../input/test.csv', header=None, sep=';') 9 | 10 | train_df = train_df[train_df[11] != 'quality'] 11 | lbl = LabelEncoder().fit(train_df[11]) 12 | train_df[11] = lbl.transform(train_df[11]) 13 | 14 | for a, b in combinations([0,1,2,3,4,7,8,9,10], 2): 15 | train_df[str(a) + '_' + str(b)] = train_df[a].astype(float) + train_df[b].astype(float) 16 | train_df[str(a) + '/' + str(b)] = train_df[a].astype(float) / train_df[b].astype(float) 17 | train_df[str(a) + '*' + str(b)] = train_df[a].astype(float) * train_df[b].astype(float) 18 | train_df[str(a) + '/log' + str(b)] = train_df[a].astype(float) / np.log1p(train_df[b].astype(float)) 19 | 20 | test_df[str(a) + '_' + str(b)] = test_df[a].astype(float) + test_df[b].astype(float) 21 | test_df[str(a) + '/' + str(b)] = test_df[a].astype(float) / test_df[b].astype(float) 22 | test_df[str(a) + '*' + str(b)] = test_df[a].astype(float) * test_df[b].astype(float) 23 | test_df[str(a) + '/log' + str(b)] = test_df[a].astype(float) / np.log1p(test_df[b].astype(float)) 24 | 25 | from sklearn.model_selection import StratifiedKFold 26 | from sklearn.metrics import roc_auc_score 27 | 28 | n_fold = 10 29 | skf = StratifiedKFold(n_splits = n_fold, shuffle = True) 30 | eval_fun = roc_auc_score 31 | 32 | def run_oof(clf, X_train, y_train, X_test, kf): 33 | print(clf) 34 | preds_train = np.zeros((len(X_train), 7), dtype = np.float) 35 | preds_test = np.zeros((len(X_test), 7), dtype = np.float) 36 | train_loss = []; test_loss = [] 37 | 38 | i = 1 39 | for train_index, test_index in kf.split(X_train, y_train): 40 | x_tr = X_train[train_index]; x_te = X_train[test_index] 41 | y_tr = y_train[train_index]; y_te = y_train[test_index] 42 | clf.fit(x_tr, y_tr, eval_set = [(x_te, y_te)], early_stopping_rounds = 500, verbose = False) 43 | 44 | # train_loss.append(eval_fun(y_tr, clf.predict_proba(x_tr)[:])) 45 | # test_loss.append(eval_fun(y_te, clf.predict_proba(x_te)[:])) 46 | 47 | preds_train[test_index] = clf.predict_proba(x_te)[:] 48 | preds_test += clf.predict_proba(X_test)[:] 49 | 50 | # print('{0}: Train {1:0.7f} Val {2:0.7f}/{3:0.7f}'.format(i, train_loss[-1], test_loss[-1], np.mean(test_loss))) 51 | print('-' * 50) 52 | i += 1 53 | print('Train: ', train_loss) 54 | print('Val: ', test_loss) 55 | print('-' * 50) 56 | # print('Train{0:0.5f}_Test{1:0.5f}\n\n'.format(np.mean(train_loss), np.mean(test_loss))) 57 | preds_test /= n_fold 58 | return preds_train, preds_test 59 | 60 | params = { 61 | 'learning_rate': 0.01, 62 | 'min_child_samples': 5, 63 | 'max_depth': 5, 64 | 'lambda_l1': 2, 65 | 'boosting': 
'gbdt', 66 | 'objective': 'multiclass', 67 | 'n_estimators': 3000, 68 | 'metric': 'multi_error', 69 | 'num_class': 7, 70 | 'feature_fraction': .75, 71 | 'bagging_fraction': .85, 72 | 'seed': 99, 73 | 'num_threads': 20, 74 | 'verbose': -1 75 | } 76 | 77 | train_pred, test_pred = run_oof(lgb.LGBMClassifier(**params), 78 | train_df.drop(11, axis=1).values, 79 | train_df[11].values, 80 | test_df.values, 81 | skf) 82 | 83 | submit = pd.DataFrame() 84 | submit[0] = range(len(test_df)) 85 | submit[1] = lbl.inverse_transform(np.argmax(test_pred, 1)) 86 | submit.to_csv('lgb.csv', index=None, header=None) 87 | -------------------------------------------------------------------------------- /competition/yanxishe-白葡萄酒品质预测/winequality_dataset.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/competition-baseline/99a1c3e3905573326fa2fc6d2ab2cc58286fee37/competition/yanxishe-白葡萄酒品质预测/winequality_dataset.zip -------------------------------------------------------------------------------- /competition/yanxishe-美食识别挑战(1):豆腐VS土豆/2_predict.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os, sys, glob, argparse 3 | import pandas as pd 4 | import numpy as np 5 | from tqdm import tqdm 6 | 7 | import time, datetime 8 | import pdb, traceback 9 | 10 | import cv2 11 | # import imagehash 12 | from PIL import Image 13 | 14 | from sklearn.model_selection import train_test_split, StratifiedKFold, KFold 15 | 16 | from efficientnet_pytorch import EfficientNet 17 | # model = EfficientNet.from_pretrained('efficientnet-b4') 18 | 19 | import torch 20 | torch.manual_seed(0) 21 | torch.backends.cudnn.deterministic = False 22 | torch.backends.cudnn.benchmark = True 23 | 24 | import torchvision.models as models 25 | import torchvision.transforms as transforms 26 | import torchvision.datasets as datasets 27 | import torch.nn as nn 28 | import torch.nn.functional as F 29 | import torch.optim as optim 30 | from torch.autograd import Variable 31 | from torch.utils.data.dataset import Dataset 32 | 33 | class QRDataset(Dataset): 34 | def __init__(self, img_path, transform=None): 35 | self.img_path = img_path 36 | if transform is not None: 37 | self.transform = transform 38 | else: 39 | self.transform = None 40 | 41 | def __getitem__(self, index): 42 | start_time = time.time() 43 | img = Image.open(self.img_path[index]).convert('RGB') 44 | 45 | if self.transform is not None: 46 | img = self.transform(img) 47 | return img,torch.from_numpy(np.array(int('PNEUMONIA' in self.img_path[index]))) 48 | 49 | def __len__(self): 50 | return len(self.img_path) 51 | 52 | class VisitNet(nn.Module): 53 | def __init__(self): 54 | super(VisitNet, self).__init__() 55 | 56 | model = models.resnet18(True) 57 | model.avgpool = nn.AdaptiveAvgPool2d(1) 58 | model.fc = nn.Linear(512, 2) 59 | self.resnet = model 60 | 61 | # model = EfficientNet.from_pretrained('efficientnet-b4') 62 | # model._fc = nn.Linear(1792, 2) 63 | # self.resnet = model 64 | 65 | def forward(self, img): 66 | out = self.resnet(img) 67 | return out 68 | 69 | def predict(test_loader, model, tta=10): 70 | # switch to evaluate mode 71 | model.eval() 72 | 73 | test_pred_tta = None 74 | for _ in range(tta): 75 | test_pred = [] 76 | with torch.no_grad(): 77 | end = time.time() 78 | for i, (input, target) in enumerate(test_loader): 79 | input = input.cuda() 80 | target = target.cuda() 81 | 82 | # compute output 83 | output = model(input) 84 
| output = output.data.cpu().numpy() 85 | 86 | test_pred.append(output) 87 | test_pred = np.vstack(test_pred) 88 | 89 | if test_pred_tta is None: 90 | test_pred_tta = test_pred 91 | else: 92 | test_pred_tta += test_pred 93 | 94 | return test_pred_tta 95 | 96 | test_jpg = ['./豆腐和土豆/test/{0}.jpg'.format(x) for x in range(0, 1047)] 97 | test_jpg = np.array(test_jpg) 98 | 99 | test_pred = None 100 | for model_path in ['resnet18_fold0.pt', 'resnet18_fold1.pt', 'resnet18_fold2.pt', 101 | 'resnet18_fold3.pt', 'resnet18_fold4.pt', 'resnet18_fold5.pt', 102 | 'resnet18_fold6.pt', 'resnet18_fold7.pt', 'resnet18_fold8.pt', 103 | 'resnet18_fold9.pt']: 104 | 105 | test_loader = torch.utils.data.DataLoader( 106 | QRDataset(test_jpg, 107 | transforms.Compose([ 108 | transforms.Resize((512, 512)), 109 | transforms.RandomHorizontalFlip(), 110 | transforms.RandomVerticalFlip(), 111 | transforms.ToTensor(), 112 | transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) 113 | ]) 114 | ), batch_size=10, shuffle=False, num_workers=10, pin_memory=True 115 | ) 116 | 117 | 118 | model = VisitNet().cuda() 119 | model.load_state_dict(torch.load(model_path)) 120 | # model = nn.DataParallel(model).cuda() 121 | if test_pred is None: 122 | test_pred = predict(test_loader, model, 2) 123 | else: 124 | test_pred += predict(test_loader, model, 2) 125 | 126 | test_csv = pd.DataFrame() 127 | test_csv[0] = list(range(0, 1047)) 128 | test_csv[1] = np.argmax(test_pred, 1) 129 | test_csv.to_csv('tmp.csv', index=None, header=None) -------------------------------------------------------------------------------- /competition/yanxishe-美食识别挑战(1):豆腐VS土豆/README.md: -------------------------------------------------------------------------------- 1 | https://god.yanxishe.com/16 2 | 3 | resnet18 10fold tta, 4 | 5 | 修改代码里面文件路径后: 6 | 7 | ``` 8 | python 1_train.py 9 | python 2_predict.py 10 | ``` 11 | -------------------------------------------------------------------------------- /competition/yanxishe-肌肉活动电信号推测手势/README.md: -------------------------------------------------------------------------------- 1 | https://god.yanxishe.com/14 2 | 3 | LBG baseline,线上95.2 4 | -------------------------------------------------------------------------------- /competition/yanxishe-肺炎X光病灶识别/2_predict.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os, sys, glob, argparse 3 | import pandas as pd 4 | import numpy as np 5 | from tqdm import tqdm 6 | 7 | import time, datetime 8 | import pdb, traceback 9 | 10 | import cv2 11 | # import imagehash 12 | from PIL import Image 13 | 14 | from sklearn.model_selection import train_test_split, StratifiedKFold, KFold 15 | 16 | from efficientnet_pytorch import EfficientNet 17 | # model = EfficientNet.from_pretrained('efficientnet-b4') 18 | 19 | import torch 20 | torch.manual_seed(0) 21 | torch.backends.cudnn.deterministic = False 22 | torch.backends.cudnn.benchmark = True 23 | 24 | import torchvision.models as models 25 | import torchvision.transforms as transforms 26 | import torchvision.datasets as datasets 27 | import torch.nn as nn 28 | import torch.nn.functional as F 29 | import torch.optim as optim 30 | from torch.autograd import Variable 31 | from torch.utils.data.dataset import Dataset 32 | 33 | class QRDataset(Dataset): 34 | def __init__(self, img_path, transform=None): 35 | self.img_path = img_path 36 | if transform is not None: 37 | self.transform = transform 38 | else: 39 | self.transform = None 40 | 41 | def __getitem__(self, 
index): 42 | start_time = time.time() 43 | img = Image.open(self.img_path[index]).convert('RGB') 44 | 45 | if self.transform is not None: 46 | img = self.transform(img) 47 | return img,torch.from_numpy(np.array(int('PNEUMONIA' in self.img_path[index]))) 48 | 49 | def __len__(self): 50 | return len(self.img_path) 51 | 52 | class VisitNet(nn.Module): 53 | def __init__(self): 54 | super(VisitNet, self).__init__() 55 | 56 | model = models.resnet18(True) 57 | model.avgpool = nn.AdaptiveAvgPool2d(1) 58 | model.fc = nn.Linear(512, 5) 59 | self.resnet = model 60 | 61 | # model = EfficientNet.from_pretrained('efficientnet-b4') 62 | # model._fc = nn.Linear(1792, 2) 63 | # self.resnet = model 64 | 65 | def forward(self, img): 66 | out = self.resnet(img) 67 | return out 68 | 69 | def predict(test_loader, model, tta=10): 70 | # switch to evaluate mode 71 | model.eval() 72 | 73 | test_pred_tta = None 74 | for _ in range(tta): 75 | test_pred = [] 76 | with torch.no_grad(): 77 | end = time.time() 78 | for i, (input, target) in enumerate(test_loader): 79 | input = input.cuda() 80 | target = target.cuda() 81 | 82 | # compute output 83 | output = model(input) 84 | output = output.data.cpu().numpy() 85 | 86 | test_pred.append(output) 87 | test_pred = np.vstack(test_pred) 88 | 89 | if test_pred_tta is None: 90 | test_pred_tta = test_pred 91 | else: 92 | test_pred_tta += test_pred 93 | 94 | return test_pred_tta 95 | 96 | test_jpg = ['./test/{0}.jpg'.format(x) for x in range(0, 6671)] 97 | test_jpg = np.array(test_jpg) 98 | 99 | test_pred = None 100 | for model_path in ['resnet18_fold0.pt', 'resnet18_fold0.pt']: 101 | 102 | test_loader = torch.utils.data.DataLoader( 103 | QRDataset(test_jpg, 104 | transforms.Compose([ 105 | transforms.Resize((512, 512)), 106 | transforms.RandomHorizontalFlip(), 107 | transforms.RandomVerticalFlip(), 108 | transforms.ToTensor(), 109 | transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) 110 | ]) 111 | ), batch_size=10, shuffle=False, num_workers=10, pin_memory=True 112 | ) 113 | 114 | 115 | model = VisitNet().cuda() 116 | model.load_state_dict(torch.load(model_path)) 117 | # model = nn.DataParallel(model).cuda() 118 | if test_pred is None: 119 | test_pred = predict(test_loader, model, 2) 120 | else: 121 | test_pred += predict(test_loader, model, 2) 122 | 123 | test_csv = pd.DataFrame() 124 | test_csv[0] = list(range(0, 6671)) 125 | test_csv[1] = np.argmax(test_pred, 1) 126 | test_csv.to_csv('tmp.csv', index=None, header=None) -------------------------------------------------------------------------------- /competition/yanxishe-肺炎X光病灶识别/README.md: -------------------------------------------------------------------------------- 1 | https://god.yanxishe.com/18 2 | 3 | 直接用分类的思路线上成绩77.5146,暂时没有用到位置信息; 4 | ``` 5 | python3 1_train.py 6 | python3 2_predict.py 7 | ``` 8 | -------------------------------------------------------------------------------- /competition/yanxishe-胸腔X光肺炎检测/2_predict.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os, sys, glob, argparse 3 | import pandas as pd 4 | import numpy as np 5 | from tqdm import tqdm 6 | 7 | import time, datetime 8 | import pdb, traceback 9 | 10 | import cv2 11 | # import imagehash 12 | from PIL import Image 13 | 14 | from sklearn.model_selection import train_test_split, StratifiedKFold, KFold 15 | 16 | from efficientnet_pytorch import EfficientNet 17 | # model = EfficientNet.from_pretrained('efficientnet-b4') 18 | 19 | import torch 20 | 
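# added note: fixed torch seed; cudnn.benchmark trades strict determinism for speed, and the random flips in the transforms keep each TTA pass different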
torch.manual_seed(0) 21 | torch.backends.cudnn.deterministic = False 22 | torch.backends.cudnn.benchmark = True 23 | 24 | import torchvision.models as models 25 | import torchvision.transforms as transforms 26 | import torchvision.datasets as datasets 27 | import torch.nn as nn 28 | import torch.nn.functional as F 29 | import torch.optim as optim 30 | from torch.autograd import Variable 31 | from torch.utils.data.dataset import Dataset 32 | 33 | class QRDataset(Dataset): 34 | def __init__(self, img_path, transform=None): 35 | self.img_path = img_path 36 | if transform is not None: 37 | self.transform = transform 38 | else: 39 | self.transform = None 40 | 41 | def __getitem__(self, index): 42 | start_time = time.time() 43 | img = Image.open(self.img_path[index]).convert('RGB') 44 | 45 | if self.transform is not None: 46 | img = self.transform(img) 47 | return img,torch.from_numpy(np.array(int('PNEUMONIA' in self.img_path[index]))) 48 | 49 | def __len__(self): 50 | return len(self.img_path) 51 | 52 | class VisitNet(nn.Module): 53 | def __init__(self): 54 | super(VisitNet, self).__init__() 55 | 56 | model = models.resnet18(True) 57 | model.avgpool = nn.AdaptiveAvgPool2d(1) 58 | model.fc = nn.Linear(512, 2) 59 | self.resnet = model 60 | 61 | # model = EfficientNet.from_pretrained('efficientnet-b4') 62 | # model._fc = nn.Linear(1792, 2) 63 | # self.resnet = model 64 | 65 | def forward(self, img): 66 | out = self.resnet(img) 67 | return out 68 | 69 | def predict(test_loader, model, tta=10): 70 | # switch to evaluate mode 71 | model.eval() 72 | 73 | test_pred_tta = None 74 | for _ in range(tta): 75 | test_pred = [] 76 | with torch.no_grad(): 77 | end = time.time() 78 | for i, (input, target) in enumerate(test_loader): 79 | input = input.cuda() 80 | target = target.cuda() 81 | 82 | # compute output 83 | output = model(input) 84 | output = output.data.cpu().numpy() 85 | 86 | test_pred.append(output) 87 | test_pred = np.vstack(test_pred) 88 | 89 | if test_pred_tta is None: 90 | test_pred_tta = test_pred 91 | else: 92 | test_pred_tta += test_pred 93 | 94 | return test_pred_tta 95 | 96 | test_jpg = ['../input/xray_dataset/test/{0}.jpeg'.format(x) for x in range(1, 1758)] 97 | test_jpg = np.array(test_jpg) 98 | 99 | test_pred = None 100 | for model_path in ['resnet18_fold0.pt', 'resnet18_fold1.pt', 'resnet18_fold2.pt', 101 | 'resnet18_fold3.pt']: 102 | 103 | test_loader = torch.utils.data.DataLoader( 104 | QRDataset(test_jpg, 105 | transforms.Compose([ 106 | transforms.Resize((512, 512)), 107 | transforms.RandomHorizontalFlip(), 108 | transforms.RandomVerticalFlip(), 109 | transforms.ToTensor(), 110 | transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) 111 | ]) 112 | ), batch_size=10, shuffle=False, num_workers=10, pin_memory=True 113 | ) 114 | 115 | 116 | model = VisitNet().cuda() 117 | model.load_state_dict(torch.load(model_path)) 118 | # model = nn.DataParallel(model).cuda() 119 | if test_pred is None: 120 | test_pred = predict(test_loader, model, 5) 121 | else: 122 | test_pred += predict(test_loader, model, 5) 123 | 124 | test_csv = pd.DataFrame() 125 | test_csv[0] = list(range(1, 1758)) 126 | test_csv[1] = np.argmax(test_pred, 1) 127 | test_csv.to_csv('tmp.csv', index=None, header=None) 128 | -------------------------------------------------------------------------------- /competition/yanxishe-胸腔X光肺炎检测/README.md: -------------------------------------------------------------------------------- 1 | https://god.yanxishe.com/13 2 | 3 | pytorch resnet18 TTA 4 | 线上99 5 | 6 | ``` 7 | 
python3 1_train.py 8 | python3 2_predict.py 9 | ``` 10 | 11 | -------------------------------------------------------------------------------- /competition/全球AI攻防挑战赛/README.md: -------------------------------------------------------------------------------- 1 | 在全球人工智能发展和治理广受关注的大趋势下,由中国图象图形学学会、蚂蚁集团、云安全联盟CSA大中华区主办,广泛联合学界、机构共同组织发起全球AI攻防挑战赛。本次比赛包含攻防两大赛道,分别聚焦大模型自身安全和大模型生成内容的防伪检测,涉及信用成长、凭证审核、商家入驻、智能助理等多个业务场景,覆盖机器学习、图像处理与计算机视觉、数据处理等多个算法领域,旨在聚合行业及学界力量共同守护AI及大模型的安全,共同推动AI安全可信技术的发展。 2 | 3 | - 赛题 1:https://tianchi.aliyun.com/s/24acb952f488f1f713a5294cf585bea3 4 | - 赛题 2:https://tianchi.aliyun.com/s/14a815673dc09ef786edf5794bf3bce2 5 | -------------------------------------------------------------------------------- /competition/点石-Retention Rate of Baidu Hao Kan APP Users/1_splitdf.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | import os, sys, time, codecs, glob 5 | from tqdm import tqdm, tqdm_notebook 6 | 7 | def read_input(debug=True): 8 | if debug: 9 | nrows = 100000 10 | else: 11 | nrows = None 12 | 13 | train = pd.read_csv('../input/train', sep='\t', nrows=nrows, 14 | names=['user_id', 'user_male', 'user_age', 'user_edu', 'user_district', 'label', 'user_install', 15 | 'video_id', 'video_class', 'video_tag', 'video_creator', 'video_uptime', 'video_duration', 16 | 'behavior_show', 'behavior_click', 'behavior_recommend', 'behavior_playback', 'behavior_timestamp', 17 | 'behavior_comment', 'behavior_like', 'behavior_forard'], 18 | dtype={'user_id':object, 'video_tag':object}) 19 | test = pd.read_csv('../input/test', sep='\t', nrows=nrows, 20 | names=['user_id', 'user_male', 'user_age', 'user_edu', 'user_district', 'user_install', 21 | 'video_id', 'video_class', 'video_tag', 'video_creator', 'video_uptime', 'video_duration', 22 | 'behavior_show', 'behavior_click', 'behavior_recommend', 'behavior_playback', 'behavior_timestamp', 23 | 'behavior_comment', 'behavior_like', 'behavior_forard']) 24 | 25 | # train['video_uptime'] = train['video_uptime'].apply(lambda x: timestamp_datetime(x)) 26 | # train['behavior_timestamp'] = train['behavior_timestamp'].apply(lambda x: timestamp_datetime(x / 1000)) 27 | # train['video_tag'] = train['video_tag'].apply(lambda x: x.split('$')) 28 | # train.sort_values(by=['user_id', 'behavior_timestamp'], inplace=True) 29 | 30 | 31 | # test['video_uptime'] = test['video_uptime'].apply(lambda x: timestamp_datetime(x)) 32 | # test['behavior_timestamp'] = test['behavior_timestamp'].apply(lambda x: timestamp_datetime(x / 1000)) 33 | # test['video_tag'] = test['video_tag'].apply(lambda x: x.split('$')) 34 | # test.sort_values(by=['user_id', 'behavior_timestamp'], inplace=True) 35 | 36 | return train, test 37 | 38 | train, test = read_input(debug=False) 39 | 40 | # idx = train['user_id'].value_counts() 41 | # idx = idx[train['user_id'].unique()] 42 | # idx = idx.reset_index() 43 | # for i, rows in tqdm(enumerate(idx.iterrows())): 44 | # if i == 0: 45 | # start = 0 46 | # else: 47 | # start = idx.iloc[:i]['user_id'].sum() 48 | # span = idx.iloc[i]['user_id'] 49 | 50 | # tmp_df = train.iloc[start :start+span] 51 | # tmp_df.to_csv('./train/{0}.csv'.format(str(idx.iloc[i]['index'])), index=None) 52 | 53 | idx = test['user_id'].value_counts() 54 | idx = idx[test['user_id'].unique()] 55 | idx = idx.reset_index() 56 | for i, rows in tqdm(enumerate(idx.iterrows())): 57 | if i == 0: 58 | start = 0 59 | else: 60 | start = idx.iloc[:i]['user_id'].sum() 61 | span = idx.iloc[i]['user_id'] 62 | 63 | 
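    # added note: rows for each user_id are assumed contiguous (first-appearance order), so this slices one user's block and writes it to ./test/<user_id>.csv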
tmp_df = test.iloc[start :start+span] 64 | tmp_df.to_csv('./test/{0}.csv'.format(str(idx.iloc[i]['index'])), index=None) 65 | -------------------------------------------------------------------------------- /competition/点石-Retention Rate of Baidu Hao Kan APP Users/README.md: -------------------------------------------------------------------------------- 1 | https://dianshi.baidu.com/competition/24/rule 2 | 3 | 比赛数据下载:链接: https://pan.baidu.com/s/1Nw64v5jPAoom3PUxRZqxNw 提取码: w54b 4 | 5 | 第五名代码 6 | -------------------------------------------------------------------------------- /competition/点石-Retention Rate of Baidu Hao Kan APP Users/featselect.py: -------------------------------------------------------------------------------- 1 | import os, sys, codecs 2 | import lightgbm as lgb 3 | 4 | def modelWarpper(clf, data_train, data_label, basescore): 5 | params = { 6 | 'learning_rate': 0.01, 7 | 'min_child_samples': 5, 8 | 'max_depth': 4, 9 | 'lambda_l1': 2, 10 | 'boosting': 'gbdt', 11 | 'objective': 'binary', 12 | 'n_estimators': 2000, 13 | 'metric': 'auc', 14 | # 'num_class': 6, 15 | 'feature_fraction': .85, 16 | 'bagging_fraction': .85, 17 | 'seed': 99, 18 | 'num_threads': -1, 19 | 'verbose': -1 20 | } 21 | for col in data_train.columns: 22 | cv_results1 = lgb.cv( 23 | params, 24 | lgb.Dataset(data_train.drop([col], axis=1).values, label=data_label.values), 25 | num_boost_round=2000, 26 | nfold=7, verbose_eval=False, 27 | early_stopping_rounds=200, 28 | ) 29 | 30 | if cv_results1['auc-mean'][-1] > basescore: 31 | print('+', col, 'CV AUC: ', len(cv_results1['auc-mean']), cv_results1['auc-mean'][-1]) 32 | else: 33 | print('-', col, 'CV AUC: ', len(cv_results1['auc-mean']), cv_results1['auc-mean'][-1]) 34 | 35 | XX -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛-事件抽取挑战/README.md: -------------------------------------------------------------------------------- 1 | 赛题官网:http://challenge.xfyun.cn/topic/info?type=hotspot 2 | 3 | 赛题baseline:https://zhuanlan.zhihu.com/p/150190165 4 | 5 | 思路:使用BERT完成事件抽取 6 | 7 | 硬软件需要: 8 | - 安装bert4keras,https://github.com/bojone/bert4keras 9 | - 有GPU 10 | - 下载BERT预训练参数,https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip 11 | -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛-婴儿啼哭声识别挑战赛/README.md: -------------------------------------------------------------------------------- 1 | 比赛链接:http://challenge.xfyun.cn/topic/info?type=baby-crying 2 | 3 | baseline思路:MFCC特征+CNN模型 4 | -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛-温室温度预测挑战赛/README.md: -------------------------------------------------------------------------------- 1 | 比赛链接:http://challenge.xfyun.cn/topic/info?type=temperature 2 | 3 | 线上得分0.14左右。 4 | -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛-脑PET图像分析和疾病预测挑战赛算法挑战大赛/2_predict.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os, sys, glob, argparse 3 | import pandas as pd 4 | import numpy as np 5 | from tqdm import tqdm 6 | 7 | import time, datetime 8 | import pdb, traceback 9 | 10 | import cv2 11 | # import imagehash 12 | from PIL import Image 13 | 14 | from sklearn.model_selection import train_test_split, StratifiedKFold, KFold 15 | 16 | # from efficientnet_pytorch import EfficientNet 17 | # model = 
EfficientNet.from_pretrained('efficientnet-b4') 18 | 19 | import torch 20 | torch.manual_seed(0) 21 | torch.backends.cudnn.deterministic = False 22 | torch.backends.cudnn.benchmark = True 23 | 24 | import torchvision.models as models 25 | import torchvision.transforms as transforms 26 | import torchvision.datasets as datasets 27 | import torch.nn as nn 28 | import torch.nn.functional as F 29 | import torch.optim as optim 30 | from torch.autograd import Variable 31 | from torch.utils.data.dataset import Dataset 32 | 33 | class QRDataset(Dataset): 34 | def __init__(self, train_jpg, transform=None): 35 | self.train_jpg = train_jpg 36 | if transform is not None: 37 | self.transform = transform 38 | else: 39 | self.transform = None 40 | 41 | def __getitem__(self, index): 42 | start_time = time.time() 43 | img = Image.open(self.train_jpg[index]).convert('RGB') 44 | 45 | if self.transform is not None: 46 | img = self.transform(img) 47 | 48 | return img,torch.from_numpy(np.array(int('AD' in self.train_jpg[index]))) 49 | 50 | def __len__(self): 51 | return len(self.train_jpg) 52 | 53 | class VisitNet(nn.Module): 54 | def __init__(self): 55 | super(VisitNet, self).__init__() 56 | 57 | model = models.resnet34(True) 58 | model.avgpool = nn.AdaptiveAvgPool2d(1) 59 | model.fc = nn.Linear(512, 2) 60 | self.resnet = model 61 | 62 | # model = EfficientNet.from_pretrained('efficientnet-b4') 63 | # model._fc = nn.Linear(1792, 2) 64 | # self.resnet = model 65 | 66 | def forward(self, img): 67 | out = self.resnet(img) 68 | return out 69 | 70 | def predict(test_loader, model, tta=10): 71 | # switch to evaluate mode 72 | model.eval() 73 | 74 | test_pred_tta = None 75 | for _ in range(tta): 76 | test_pred = [] 77 | with torch.no_grad(): 78 | end = time.time() 79 | for i, (input, target) in enumerate(test_loader): 80 | input = input.cuda() 81 | target = target.cuda() 82 | 83 | # compute output 84 | output = model(input) 85 | output = output.data.cpu().numpy() 86 | 87 | test_pred.append(output) 88 | test_pred = np.vstack(test_pred) 89 | 90 | if test_pred_tta is None: 91 | test_pred_tta = test_pred 92 | else: 93 | test_pred_tta += test_pred 94 | 95 | return test_pred_tta 96 | 97 | test_jpg = ['../初赛数据/test/AD&CN/{0}.png'.format(x) for x in range(1, 1001)] 98 | test_jpg = np.array(test_jpg) 99 | 100 | test_pred = None 101 | for model_path in ['resnet18_fold0.pt', 'resnet18_fold1.pt', 'resnet18_fold2.pt', 102 | 'resnet18_fold3.pt', 'resnet18_fold4.pt', 'resnet18_fold5.pt', 103 | 'resnet18_fold6.pt', 'resnet18_fold7.pt', 'resnet18_fold8.pt', 104 | 'resnet18_fold9.pt'][:1]: 105 | 106 | test_loader = torch.utils.data.DataLoader( 107 | QRDataset(test_jpg, 108 | transforms.Compose([ 109 | transforms.Resize((512, 512)), 110 | # transforms.CenterCrop((450, 450)), 111 | transforms.RandomHorizontalFlip(), 112 | transforms.RandomVerticalFlip(), 113 | transforms.ToTensor(), 114 | transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) 115 | ]) 116 | ), batch_size=10, shuffle=False, num_workers=10, pin_memory=True 117 | ) 118 | 119 | 120 | model = VisitNet().cuda() 121 | model.load_state_dict(torch.load(model_path)) 122 | # model = nn.DataParallel(model).cuda() 123 | if test_pred is None: 124 | test_pred = predict(test_loader, model, 5) 125 | else: 126 | test_pred += predict(test_loader, model, 5) 127 | 128 | test_csv = pd.DataFrame() 129 | test_csv['uuid'] = list(range(1, 1001)) 130 | test_csv['label'] = np.argmax(test_pred, 1) 131 | test_csv['label'] = test_csv['label'].map({1: 'AD', 0: 'CN'}) 132 | 
test_csv.to_csv('tmp.csv', index=None) -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛-脑PET图像分析和疾病预测挑战赛算法挑战大赛/README.md: -------------------------------------------------------------------------------- 1 | 赛题链接:http://challenge.xfyun.cn/topic/info?type=PET 2 | 3 | 赛题思路:CNN分类 4 | 5 | ``` 6 | python3 1_train.py 7 | python3 2_predict.py 8 | ``` 9 | -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛2021/中国科学技术大学_新冠肺炎声音诊断挑战赛.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 64, 6 | "id": "0c588760", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import os\n", 11 | "import pandas as pd\n", 12 | "import librosa\n", 13 | "import glob\n", 14 | "import numpy as np\n", 15 | "import xgboost as xgb" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "id": "e41b08d3", 21 | "metadata": {}, 22 | "source": [ 23 | "# 处理训练集" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 70, 29 | "id": "850a8554", 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "path = '初赛训练集\\\\cough'\n", 34 | "fea = []\n", 35 | "label = []\n", 36 | "for i in os.listdir(path):\n", 37 | " for j in os.listdir(label_path+'\\\\'+i):\n", 38 | " y, sr = librosa.load(path=label_path+'\\\\'+i+'\\\\'+j, sr=None, mono=False)\n", 39 | " y = y[::3]\n", 40 | " # 默认提取 20 帧\n", 41 | " audio_mac = librosa.feature.mfcc(y=y, sr=16000)\n", 42 | " y_shape = audio_mac.shape[1]\n", 43 | " max_pad_size=11\n", 44 | " if y_shape < max_pad_size:\n", 45 | " pad_size = max_pad_size - y_shape\n", 46 | " audio_mac = np.pad(audio_mac, ((0, 0), (0, pad_size)), mode='constant')\n", 47 | " else:\n", 48 | " audio_mac = audio_mac[:, :max_pad_size]\n", 49 | " fea.append(audio_mac.flatten())\n", 50 | " label.append(i)" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 71, 56 | "id": "34a055db", 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "df = pd.DataFrame(fea)\n", 61 | "df['label'] = label\n", 62 | "fea_names = [i for i in df.columns if i not in ['label']]" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 72, 68 | "id": "b244421a", 69 | "metadata": {}, 70 | "outputs": [ 71 | { 72 | "name": "stdout", 73 | "output_type": "stream", 74 | "text": [ 75 | "[23:14:32] WARNING: ..\\src\\learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. 
Explicitly set eval_metric if you'd like to restore the old behavior.\n" 76 | ] 77 | }, 78 | { 79 | "data": { 80 | "text/plain": [ 81 | "XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n", 82 | " colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,\n", 83 | " importance_type='gain', interaction_constraints='',\n", 84 | " learning_rate=0.300000012, max_delta_step=0, max_depth=6,\n", 85 | " min_child_weight=1, missing=nan, monotone_constraints='()',\n", 86 | " n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,\n", 87 | " reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,\n", 88 | " tree_method='exact', validate_parameters=1, verbosity=None)" 89 | ] 90 | }, 91 | "execution_count": 72, 92 | "metadata": {}, 93 | "output_type": "execute_result" 94 | } 95 | ], 96 | "source": [ 97 | "model = xgb.XGBClassifier()\n", 98 | "model.fit(df[fea_names],df['label'])" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "id": "d15c937f", 104 | "metadata": {}, 105 | "source": [ 106 | "# 处理测试集" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 55, 112 | "id": "4540e914", 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "path = '初赛测试集'\n", 117 | "fea = []\n", 118 | "files = []\n", 119 | "for j in os.listdir(path):\n", 120 | " files.append(j)\n", 121 | " y, sr = librosa.load(path=path+'\\\\'+j, sr=None, mono=False)\n", 122 | " y = y[::3]\n", 123 | " # 默认提取 20 帧\n", 124 | " audio_mac = librosa.feature.mfcc(y=y, sr=16000)\n", 125 | " y_shape = audio_mac.shape[1]\n", 126 | " max_pad_size=11\n", 127 | " if y_shape < max_pad_size:\n", 128 | " pad_size = max_pad_size - y_shape\n", 129 | " audio_mac = np.pad(audio_mac, ((0, 0), (0, pad_size)), mode='constant')\n", 130 | " else:\n", 131 | " audio_mac = audio_mac[:, :max_pad_size]\n", 132 | " fea.append(audio_mac.flatten())" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 56, 138 | "id": "423e324b", 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "df2 = pd.DataFrame(fea)\n", 143 | "df2['category_id'] = model.predict(df2[fea_names])\n", 144 | "df2['category_id'].value_counts()" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 78, 150 | "id": "64bc86d7", 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "df2['sample_id']=files\n", 155 | "df2[['sample_id','category_id']].to_csv('sub.csv',index=False)" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "id": "127bfc2b", 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "# 使用keras或其他方法\n", 166 | "# df['label'] = df['label'].apply(lambda x:0 if x=='Negative' else 1)\n", 167 | "\n", 168 | "# from sklearn.model_selection import train_test_split\n", 169 | "# x_train, x_val, y_train, y_val = train_test_split(df[fea_names], df['label'], test_size=0.2, random_state=42)\n", 170 | "\n", 171 | "# from keras.models import Sequential\n", 172 | "# from keras.layers import Dense\n", 173 | "# import keras\n", 174 | "\n", 175 | "# model = Sequential()\n", 176 | "# model.add(Dense(64, activation='relu', input_shape=(220,)))\n", 177 | "# model.add(Dense(64, activation='relu'))\n", 178 | "# model.add(Dense(64, activation='relu'))\n", 179 | "# model.add(Dense(1, activation='softmax'))\n", 180 | "\n", 181 | "# model.compile(loss=keras.losses.categorical_crossentropy,\n", 182 | "# optimizer=keras.optimizers.RMSprop(),\n", 183 | "# metrics=['accuracy'])\n", 184 | "# model.fit(x_train, y_train, 
batch_size=30, epochs=20, verbose=1,validation_data=(x_val, y_val))" 185 | ] 186 | } 187 | ], 188 | "metadata": { 189 | "kernelspec": { 190 | "display_name": "Python 3", 191 | "language": "python", 192 | "name": "python3" 193 | }, 194 | "language_info": { 195 | "codemirror_mode": { 196 | "name": "ipython", 197 | "version": 3 198 | }, 199 | "file_extension": ".py", 200 | "mimetype": "text/x-python", 201 | "name": "python", 202 | "nbconvert_exporter": "python", 203 | "pygments_lexer": "ipython3", 204 | "version": "3.8.8" 205 | } 206 | }, 207 | "nbformat": 4, 208 | "nbformat_minor": 5 209 | } 210 | -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛2021/中文成语填空挑战赛/README.md: -------------------------------------------------------------------------------- 1 | ## 中文成语填空挑战赛 2 | 3 | 中国文化博大精深源远流长,其中成语更是中国文化的精华。成语大多由四个字组成,一般都有典故或出处。有些成语从字面上不难理解,如“小题大做”、“后来居上”等。有些成语必须知道来源或典故才能懂得意思,如“朝三暮四”、“杯弓蛇影”等。 4 | 5 | 成语学习是小学语文和初中重要的学习内容,如何在语句中选择合适的成语?本次赛题中希望选手构建模型能理解中文成语。 6 | 7 | 比赛链接:http://challenge.xfyun.cn/topic/info?type=chinese-idioms&ch=dw-sq-1 8 | 9 | | text | 曾经在越南这个全球第四大网游市场占据80%的金山游戏CEO邹涛对记者表示:“海外市场的本土网游企业也在崛起,这一点在越南等东南亚市场表现尤其明显,越南本土游戏公司[MASK][MASK][MASK][MASK],再加上更多的中国企业瞄准这一市场,竞争更加激烈 | 10 | | ------------- | ------------------------------------------------------------ | 11 | | candidate | 张王赵李, 海不波溢, 七男八婿, 异军突起 | 12 | | label | 异军突起 | 13 | | | | 14 | 15 | 训练集5w条数据,测试集1w条数据,均为csv格式,列使用\t分割。测试集中label字段为空,需要选手预测。 16 | 17 | 18 | ## 赛事任务 19 | 20 | 给定一个中文句子的情况下,需要选手在给定上下文的情况下从待选的成语中选择最为合适的成语。即给定句子的上下文,完成合适的成语填入对应位置。 -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛2021/中文成语填空挑战赛/gen_train_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 -*- 3 | # author:quincy qiang 4 | # email:yanqiangmiffy@gmail.com 5 | # datetime:2021/8/16 11:20 6 | # description:"do something" 7 | 8 | import re 9 | import pandas as pd 10 | from tqdm import tqdm 11 | 12 | train = pd.read_csv('data/train.csv', sep='\t') 13 | test = pd.read_csv('data/test.csv', sep='\t') 14 | 15 | print(train) 16 | print(test) 17 | 18 | 19 | def process_text(text): 20 | return re.sub(' +', ' ', text).strip() 21 | 22 | 23 | def get_question(text): 24 | """ 25 | 根据[MASK][MASK][MASK][MASK]获取问题 26 | :param text: 27 | :return: 28 | """ 29 | sentences = re.split('(。|!|\!|\.|?|\?)', text) # 保留分割符 30 | for sent in sentences: 31 | if '[MASK][MASK][MASK][MASK]' in sent: 32 | return sent 33 | return text 34 | 35 | 36 | cols = [ 37 | "Unnamed: 0", 38 | "video-id", 39 | "fold-ind", # q_id 40 | "startphrase", 41 | "sent1", # content 42 | "sent2", # question 43 | "gold-source", 44 | "ending0", "ending1", "ending2", "ending3", # choice 45 | "label"] 46 | 47 | # ====================================================== 48 | # 生成训练集 49 | # ====================================================== 50 | res = [] 51 | 52 | for idx, row in tqdm(train.iterrows()): 53 | q_id = f'train_{idx}' 54 | content = row['text'] 55 | content = process_text(content) 56 | question = get_question(content) 57 | modified_choices = eval(row['candidate']) 58 | label = modified_choices.index(row['label']) 59 | ## Hard-code for swag format! 
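    # added note: each row becomes SWAG-style columns — sent1 = full context, sent2 = the sentence containing [MASK], ending0-3 = the four candidate idioms, label = index of the gold idiom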
60 | res.append(("", 61 | "", 62 | q_id, 63 | "", 64 | content, 65 | question, 66 | "", 67 | modified_choices[0], 68 | modified_choices[1], 69 | modified_choices[2], 70 | modified_choices[3], 71 | label)) 72 | df = pd.DataFrame(res, columns=cols) 73 | 74 | # ====================================================== 75 | # 生成测试集 76 | # ====================================================== 77 | res = [] 78 | print("test.shape", test.shape) 79 | for idx, row in tqdm(test.iterrows()): 80 | q_id = f'test_{idx}' 81 | content = row['text'] 82 | content = process_text(content) 83 | question = get_question(content) 84 | modified_choices = eval(row['candidate']) 85 | ## Hard-code for swag format! 86 | res.append(("", 87 | "", 88 | q_id, 89 | "", 90 | content, 91 | question, 92 | "", 93 | modified_choices[0], 94 | modified_choices[1], 95 | modified_choices[2], 96 | modified_choices[3], 97 | 0)) 98 | df_test = pd.DataFrame(res, columns=cols) 99 | 100 | print(df_test.shape) 101 | 102 | 103 | DEBUG = False 104 | if DEBUG: 105 | df.iloc[:50].to_csv('data/new_train.csv', index=False) 106 | df.iloc[-50:].to_csv('data/new_valid.csv', index=False) 107 | df_test.iloc[:50].to_csv('data/new_test.csv', index=False) 108 | else: 109 | df.iloc[:45000].to_csv('data/new_train.csv', index=False) 110 | df.iloc[5000:].to_csv('data/new_valid.csv', index=False) 111 | df_test.to_csv('data/new_test.csv', index=False) 112 | -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛2021/中文成语填空挑战赛/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -u baseline.py \ 4 | --model_name_or_path 'hfl/chinese-xlnet-base' \ 5 | --do_train \ 6 | --do_eval \ 7 | --do_predict \ 8 | --logging_steps=100 \ 9 | --max_seq_length 200 \ 10 | --train_file data/new_train.csv \ 11 | --validation_file data/new_valid.csv \ 12 | --test_file data/new_test.csv \ 13 | --learning_rate 3e-5 \ 14 | --num_train_epochs 2 \ 15 | --output_dir 'models/xlnet' \ 16 | --gradient_accumulation_steps 4 \ 17 | --per_device_eval_batch_size 16 \ 18 | --per_device_train_batch_size 16 \ 19 | --overwrite_output 20 | -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛2021/中文问题相似度挑战赛/README.md: -------------------------------------------------------------------------------- 1 | ## 中文问题相似度挑战赛 2 | 3 | ### 赛事背景 4 | 问答系统中包括三个主要的部分:问题理解,信息检索和答案抽取。而问题理解是问答系统的第一部分也是非常关键的一部分。问题理解有非常广泛的应用,如重复评论识别、相似问题识别等。 5 | 6 | 重复问题检测是一个常见的文本挖掘任务,在很多实际问答社区都有相应的应用。重复问题检测可以方便进行问题的答案聚合,以及问题答案推荐,自动QA等。由于中文词语的多样性和灵活性,本赛题需要选手构建一个重复问题识别算法。 7 | 8 | ### 赛事任务 9 | 本次赛题希望参赛选手对两个问题完成相似度打分。 10 | 11 | 训练集:约5千条问题对和标签。若两个问题是相同的问题,标签为1;否则为0。 12 | 13 | 测试集:约5千条问题对,需要选手预测标签。 14 | 15 | http://challenge.xfyun.cn/topic/info?type=chinese-question-similarity&ch=dw-sq-1 16 | 17 | ### baseline 18 | 19 | - [BERT NSP方法](https://github.com/datawhalechina/competition-baseline/blob/master/competition/%E7%A7%91%E5%A4%A7%E8%AE%AF%E9%A3%9EAI%E5%BC%80%E5%8F%91%E8%80%85%E5%A4%A7%E8%B5%9B2021/%E4%B8%AD%E6%96%87%E9%97%AE%E9%A2%98%E7%9B%B8%E4%BC%BC%E5%BA%A6%E6%8C%91%E6%88%98%E8%B5%9B/bert-nsp.ipynb) 20 | - [word2vec + LightGBM](https://mp.weixin.qq.com/s/E3sfNaNg8JH-w_7Yv40MWw), 链接:https://pan.baidu.com/s/1WC3vQGlgBFvnlAXcj-0qrA 提取码:v7aj 21 | -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛2021/人脸关键点检测挑战赛/README.md: -------------------------------------------------------------------------------- 1 | 
http://challenge.xfyun.cn/topic/info?type=key-points-of-human-face&ch=dw-sq-1 2 | 3 | ## 赛事背景 4 | 人脸识别是基于人的面部特征信息进行身份识别的一种生物识别技术,金融和安防是目前人脸识别应用最广泛的两个领域。人脸关键点是人脸识别中的关键技术。人脸关键点检测需要识别出人脸的指定位置坐标,例如眉毛、眼睛、鼻子、嘴巴和脸部轮廓等位置坐标等。 5 | 6 | ## 赛事任务 7 | 8 | 给定人脸图像,找到4个人脸关键点,赛题任务可以视为一个关键点检测问题。 9 | 10 | - 训练集:5千张人脸图像,并且给定了具体的人脸关键点标注。 11 | - 测试集:约2千张人脸图像,需要选手识别出具体的关键点位置。 12 | 13 | 14 | ## 赛题数据 15 | 16 | 赛题数据由训练集和测试集组成,train.csv为训练集标注数据,train.npy和test.npy为训练集图片和测试集图片,可以使用numpy.load进行读取。train.csv的信息为左眼坐标、右眼坐标、鼻子坐标和嘴巴坐标,总共8个点。 17 | 18 | 本次竞赛的评价标准回归MAE进行评价,数值越小性能更优,最高分为0。评估代码参考: 19 | 20 | ``` 21 | from sklearn.metrics import mean_absolute_error 22 | y_true = [3, -0.5, 2, 7] 23 | y_pred = [2.5, 0.0, 2, 8] 24 | mean_absolute_error(y_true, y_pred) 25 | ``` 26 | -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛2021/人脸情绪识别挑战赛/README.md: -------------------------------------------------------------------------------- 1 | ## 人脸情绪识别挑战赛 2 | 3 | 人脸表情是传播人类情感信息与协调人际关系的重要方式,表情识别是指从静态照片或视频序列中选择出表情状态,从而确定对人物的情绪与心理变化。在日常生活中人类习惯从面部表情中吸收非言语暗示,那么计算机可以完成类似任务吗?答案是肯定的,但是需要训练它学会识别情绪。 4 | 5 | 赛题链接:http://challenge.xfyun.cn/topic/info?type=facial-emotion-recognition 6 | 7 | ## 赛事任务 8 | 9 | 给定人脸照片完成具体的情绪识别,选手需要根据训练集数据构建情绪识别任务,并对测试集图像进行预测,识别人脸的7种情绪。 10 | 11 | ![](https://ai-contest-static.xfyun.cn/2021/120.jpg) 12 | -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛2021/学术论文分类挑战赛/README.md: -------------------------------------------------------------------------------- 1 | ## 学术论文分类挑战赛 2 | 3 | 随着人工智能技术不断发展,每周都有非常多的论文公开发布。现如今对论文进行分类逐渐成为非常现实的问题,这也是研究人员和研究机构每天都面临的问题。现在希望选手能构建一个论文分类模型。 4 | 5 | 比赛链接:http://challenge.xfyun.cn/topic/info?type=academic-paper-classification 6 | 7 | 8 | ## 赛事任务 9 | 10 | 本次赛题希望参赛选手利用论文信息:论文id、标题、摘要,划分论文具体类别。 11 | -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛2021/广告点击率预估挑战赛/README.md: -------------------------------------------------------------------------------- 1 | http://challenge.xfyun.cn/topic/info?type=Ad-click-through&ch=dw-sq-1 2 | 3 | ## 赛事背景 4 | 广告点击率预估是在线广告交易的核心环节之一,如果说一家公司想知道 CTR(点击率),以确定将他们的钱花在数字广告上是否值得。点击率高表示对该特定广告系列更感兴趣,点击率低可能表明广告可能不那么相关。高点击率表明更多人点击了网站,这有利于在谷歌、必应等在线平台上以更少的钱获得更好的广告位置。 5 | 6 | 近年来,各大有关广告点击率预估的比赛相拥而至,如腾讯广告算法大赛、科大讯飞营销算法大赛、阿里妈妈点击率预估大赛等。可以看出这是一个企业长期关注的问题,也是值得花时间探索的问题。 7 | 8 | ## 赛事任务 9 | 平台展示给用户特定的广告,用户存在点击与不点击两种行为。给定某平台实际广告业务中的用户行为数据,共包含13个用户相关的字段,其中isClick字段表明用户是否会点击广告。 10 | 11 | 任务目标是通过训练集训练模型,来预测测试集中isClick字段的概率结果,即用户点击平台所推荐广告的概率,以此为依据,表示用户对特定广告感兴趣的程度。 12 | -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛2021/电商图像检索挑战赛/README.md: -------------------------------------------------------------------------------- 1 | ## 赛事背景 2 | 在电商应用中每天商家都会上传数以百万的商品图像,商品图像可能是从不同角度拍摄的,也有可能是不同款式的商品图像。对于消费者而言,很难通过肉眼去找到相似的商品。如果有一种人工智能算法,能够找到相同商品的相同图像,则是非常有用的一项技术。 3 | 4 | http://challenge.xfyun.cn/topic/info?type=e-commerce-image-retrieval 5 | 6 | ## 赛事任务 7 | 给定一批电商商品(主要是服务商品)的图像,找到属于同一个商品的图像。任务可以视为一个图像检索问题,或者一个图像聚类问题,需要将同一个商品的图像聚类到一起。 8 | -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛2021/科大讯飞商店销量预测/README.md: -------------------------------------------------------------------------------- 1 | ## 线下商店销量预测挑战赛 2 | 3 | http://challenge.xfyun.cn/topic/info?type=offline-store-sales-forecast&ch=dw-sq-1 4 | 5 | ### 赛事背景 6 | 企业运营效率的提高主要依托于两个要素:销售预测的精度和供应链的反应速度。销售预测精度高,即便供应链反应速度不快,也能够实现库存与资金的高周转;采购管理、补货管理、销售管理等的基础便是销售预测。 7 | 8 | 
销量预测是个非常经典的时序预测问题,通过一段时间内销售数据,预测未来商品的销量,对商品进行合理的分配和调度,解决供货上的不足或者堆积等问题。 9 | 10 | ### 赛事任务 11 | 给定商店销量历史相关数据和时间等信息,预测商店对应商品的周销量。 12 | -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛2021/科大讯飞商店销量预测/lgb.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | from sklearn.metrics import mean_squared_error 5 | from sklearn.model_selection import KFold 6 | 7 | import lightgbm as lgb 8 | 9 | import warnings 10 | warnings.filterwarnings('ignore') 11 | 12 | train = pd.read_csv('train.csv') 13 | test = pd.read_csv('test.csv') 14 | sample_submit = pd.read_csv('sample_submit.csv') 15 | 16 | df = pd.concat([train, test], axis=0, ignore_index=True) 17 | 18 | def lag_feature_adv(df, lags, col): 19 | ''' 20 | 历史N周平移特征 21 | ''' 22 | tmp = df[['week','shop_id','item_id',col]] 23 | for i in lags: 24 | shifted = tmp.copy() 25 | shifted.columns = ['week','shop_id','item_id', col+'_lag_'+str(i)+'_adv'] 26 | shifted['week'] += i 27 | df = pd.merge(df, shifted, on=['week','shop_id','item_id'], how='left') 28 | df[col+'_lag_'+str(i)+'_adv'] = df[col+'_lag_'+str(i)+'_adv'] 29 | return df 30 | 31 | df = lag_feature_adv(df, [1, 2, 3], 'weekly_sales') 32 | 33 | x_train = df[df.week < 33].drop(['weekly_sales'], axis=1) 34 | y_train = df[df.week < 33]['weekly_sales'] 35 | x_test = df[df.week == 33].drop(['weekly_sales'], axis=1) 36 | 37 | 38 | def cv_model(clf, train_x, train_y, test_x, clf_name='lgb'): 39 | folds = 5 40 | seed = 1024 41 | kf = KFold(n_splits=folds, shuffle=True, random_state=seed) 42 | 43 | train = np.zeros(train_x.shape[0]) 44 | test = np.zeros(test_x.shape[0]) 45 | 46 | categorical_feature = ['shop_id','item_id','item_category_id'] 47 | cv_scores = [] 48 | 49 | for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)): 50 | print('************************************ {} ************************************'.format(str(i+1))) 51 | trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y[train_index], train_x.iloc[valid_index], train_y[valid_index] 52 | 53 | train_matrix = clf.Dataset(trn_x, label=trn_y) 54 | valid_matrix = clf.Dataset(val_x, label=val_y) 55 | 56 | params = { 57 | 'boosting_type': 'gbdt', 58 | 'objective': 'mse', 59 | 'metric': 'mse', 60 | 'min_child_weight': 5, 61 | 'num_leaves': 2 ** 7, 62 | 'lambda_l2': 10, 63 | 'feature_fraction': 0.9, 64 | 'bagging_fraction': 0.9, 65 | 'bagging_freq': 4, 66 | 'learning_rate': 0.05, 67 | 'seed': 1024, 68 | 'n_jobs':-1, 69 | 'silent': True, 70 | 'verbose': -1, 71 | } 72 | 73 | model = clf.train(params, train_matrix, 5000, valid_sets=[train_matrix, valid_matrix], 74 | categorical_feature = categorical_feature, 75 | verbose_eval=500,early_stopping_rounds=200) 76 | val_pred = model.predict(val_x, num_iteration=model.best_iteration) 77 | test_pred = model.predict(test_x, num_iteration=model.best_iteration) 78 | 79 | train[valid_index] = val_pred 80 | test += test_pred / kf.n_splits 81 | cv_scores.append(mean_squared_error(val_y, val_pred)) 82 | 83 | print(cv_scores) 84 | 85 | print("%s_scotrainre_list:" % clf_name, cv_scores) 86 | print("%s_score_mean:" % clf_name, np.mean(cv_scores)) 87 | print("%s_score_std:" % clf_name, np.std(cv_scores)) 88 | return train, test 89 | 90 | lgb_train, lgb_test = cv_model(lgb, x_train, y_train, x_test) 91 | 92 | 93 | sample_submit['weekly_sales'] = lgb_test 94 | sample_submit['weekly_sales'] = sample_submit['weekly_sales'].apply(lambda x:x if x>0 else 
0).values 95 | sample_submit.to_csv('baseline_result.csv', index=False) 96 | -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛2021/科大讯飞股份有限公司_基于用户画像的商品推荐挑战赛.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 28, 6 | "id": "9de58cd6", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import pandas as pd\n", 11 | "import numpy as np\n", 12 | "import lightgbm as lgb\n", 13 | "\n", 14 | "from sklearn.preprocessing import LabelEncoder\n", 15 | "from sklearn.metrics import f1_score\n", 16 | "from sklearn.model_selection import StratifiedKFold,KFold\n", 17 | "\n", 18 | "import multiprocessing\n", 19 | "\n", 20 | "from tqdm import tqdm\n", 21 | "import warnings\n", 22 | "warnings.filterwarnings(\"ignore\")" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 2, 28 | "id": "36f84a54", 29 | "metadata": { 30 | "scrolled": true 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "df2 = pd.read_csv('测试集/apply_new.txt',header=None,names=['pid','gender','age','targid','time','province','city','model','make'])\n", 35 | "df = pd.read_csv('train.txt',header=None,names=['pid','label','gender','age','targid','time','province','city','model','make'])\n", 36 | "df.head(5)" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "id": "3a70dd46", 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "def get_fea(df):\n", 47 | " df['targid_list']=df['targid'].apply(lambda x:x[1:-1].split(\",\"))\n", 48 | " for i in range(30):\n", 49 | " df['targid'+str(i)]=df['targid_list'].apply(lambda x:x[i] if len(x)>=i+1 else None)\n", 50 | " return df" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 1, 56 | "id": "88e8225f", 57 | "metadata": { 58 | "scrolled": true 59 | }, 60 | "outputs": [], 61 | "source": [ 62 | "train = get_fea(df)\n", 63 | "test = get_fea(df2)\n", 64 | "for col in ['province','city']:\n", 65 | " le = LabelEncoder()\n", 66 | " test[col] = le.fit_transform(test[col])\n", 67 | " train[col] = le.transform(train[col])" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 39, 73 | "id": "76a876a1", 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "for i in range(30):\n", 78 | " df['targid'+str(i)] = df['targid'+str(i)].astype('float64')\n", 79 | " df2['targid'+str(i)] = df2['targid'+str(i)].astype('float64')" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 41, 85 | "id": "12289c30", 86 | "metadata": { 87 | "scrolled": true 88 | }, 89 | "outputs": [], 90 | "source": [ 91 | "fea = [f for f in train.columns if f not in ['pid','model','make','targid_list','targid','time','label','len']]# \n", 92 | "model = lgb.LGBMRegressor(max_depth=15,num_leaves=20,learning_rate=0.1,n_estimators=100,seed=2020)\n", 93 | "\n", 94 | "model.fit(train[fea],train['label'])\n", 95 | "pre = model.predict(test[fea])" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 46, 101 | "id": "2e2baebd", 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "test['pre'] = pre\n", 106 | "test['pre'] = test['pre'].apply(lambda x:1 if x>0.5 else 0)\n", 107 | "sub = test[['pid','pre']]\n", 108 | "sub = sub.rename(columns=({'pid':'user_id','pre':'category_id'}))\n", 109 | "sub.to_csv('sub.csv',index=False)" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "id": 
"604ec138", 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [] 119 | } 120 | ], 121 | "metadata": { 122 | "kernelspec": { 123 | "display_name": "Python 3", 124 | "language": "python", 125 | "name": "python3" 126 | }, 127 | "language_info": { 128 | "codemirror_mode": { 129 | "name": "ipython", 130 | "version": 3 131 | }, 132 | "file_extension": ".py", 133 | "mimetype": "text/x-python", 134 | "name": "python", 135 | "nbconvert_exporter": "python", 136 | "pygments_lexer": "ipython3", 137 | "version": "3.8.8" 138 | } 139 | }, 140 | "nbformat": 4, 141 | "nbformat_minor": 5 142 | } 143 | -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛2021/科大讯飞股份有限公司_猪只盘点挑战赛.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "5516f498", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import os\n", 11 | "from imageai.Detection import ObjectDetection\n", 12 | "\n", 13 | "# imageai说明请查看官网: https://github.com/OlafenwaMoses/ImageAI/" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 14, 19 | "id": "4ccc8ee3", 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "execution_path = os.getcwd()\n", 24 | "\n", 25 | "detector = ObjectDetection()\n", 26 | "detector.setModelTypeAsRetinaNet()\n", 27 | "detector.setModelPath( os.path.join(execution_path , \"resnet50_coco_best_v2.1.0.h5\")) # 需要提取从官网下载h5文件\n", 28 | "detector.loadModel()" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 8, 34 | "id": "3ae4b045", 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "num = []\n", 39 | "pigs = []\n", 40 | "pic = []" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 13, 46 | "id": "b1e853f8", 47 | "metadata": { 48 | "scrolled": true 49 | }, 50 | "outputs": [], 51 | "source": [ 52 | "# 注意,这里的路径必须是全英文路径\n", 53 | "for file in os.listdir('E:\\\\pig\\\\test'):\n", 54 | " detections = detector.detectObjectsFromImage(input_image=os.path.join(execution_path , \"E:\\\\pig\\\\test\\\\\"+file), output_image_path=os.path.join(execution_path , \"E:\\\\pig\\\\test2\\\\\"+'2a'+file), minimum_percentage_probability=30)\n", 55 | " num.append(len(detections))\n", 56 | " pig = []\n", 57 | " pic.append(file)\n", 58 | " print(len(pic))\n", 59 | " for eachObject in detections: \n", 60 | " pig.append([eachObject[\"percentage_probability\"]]+eachObject[\"box_points\"])\n", 61 | "# print(eachObject[\"name\"] , \" : \", eachObject[\"percentage_probability\"], \" : \", eachObject[\"box_points\"] )\n", 62 | "# print(\"--------------------------------\")\n", 63 | " pigs.append(pig)" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 10, 69 | "id": "f16b5c7f", 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "# 写文件pig_count.txt\n", 74 | "with open('E:\\\\pig\\\\det_files\\\\pig_count.txt','w',encoding='utf-8') as f:\n", 75 | " [f.write('{0} {1}\\n'.format(key, value)) for key,value in zip(pic,num)]" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 12, 81 | "id": "a40234e3", 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "# 写文件det_results\n", 86 | "for i in range(len(num)):\n", 87 | " with open('E:\\\\pig\\\\det_files\\\\det_results\\\\'+pic[i].split('.')[0]+'.txt','w',encoding='utf-8') as f:\n", 88 | " [f.write('pig {0} {1} {2} {3} 
{4}\\n'.format(min(0.01*value[0]+0.4,0.99),(value[3]+value[1])/2,(value[2]+value[4])/2,(value[3]-value[1]),(value[4]-value[2]))) for value in pigs[i]]\n", 89 | " # ****需要注意,最终提交的格式是x_ccenter,y_center,w,h和训练集里面的box格式不同。这个当时提交踩了很多坑" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "id": "f68675b7", 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [] 99 | } 100 | ], 101 | "metadata": { 102 | "kernelspec": { 103 | "display_name": "Python 3", 104 | "language": "python", 105 | "name": "python3" 106 | }, 107 | "language_info": { 108 | "codemirror_mode": { 109 | "name": "ipython", 110 | "version": 3 111 | }, 112 | "file_extension": ".py", 113 | "mimetype": "text/x-python", 114 | "name": "python", 115 | "nbconvert_exporter": "python", 116 | "pygments_lexer": "ipython3", 117 | "version": "3.8.8" 118 | } 119 | }, 120 | "nbformat": 4, 121 | "nbformat_minor": 5 122 | } 123 | -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛2021/移动设备用户年龄和性别预测/README.md: -------------------------------------------------------------------------------- 1 | ## 赛事背景 2 | 对于移动设备厂商而言,获取当前手机用户的人口属性信息是非常困难的。基于用户的手机及日常使用应用程序的偏好准确地预测其人口属性信息是提升个性化体验、构建精准用户画像的基础。 3 | 4 | 需要说明的是,本赛事数据已获得个人用户的充分认可和同意,并已进行适当的匿名处理以保护隐私。由于保密,我们不会提供有关如何获得性别和年龄数据的详细信息。 5 | 6 | 赛题链接:http://challenge.xfyun.cn/topic/info?type=mobile-devices&ch=dw-sq-1 7 | 8 | ## 赛事任务 9 | 10 | 本次比赛有两个任务,分别对移动设备(device_id)进行性别和年龄的预测,这里包含二分类和回归两个问题,最终会将两个部分的分数结合起来进行排名。 -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛2021/蛋白质结构预测挑战赛.md: -------------------------------------------------------------------------------- 1 | ## 赛事背景 2 | 蛋白质是组成人体一切细胞、组织的重要成分,是参与人类生命活动必不可少的一部分。没有蛋白质就没有生命,足以见得蛋白质在生命中的重要性。基于此,越来越多的学者开始关注对生物信息学中蛋白质组学的研究,如蛋白质序列分析,蛋白质表达分析,蛋白质结构预测等。 3 | 4 | 蛋白质结构预测是生物信息学的重要应用之一,蛋白质的结构对于理解蛋白质的功能十分重要。深入了解蛋白质的功能对疾病的基因检测和新型药物的开发有很直接的帮助,研究表明,具有相似结构的蛋白质其功能也相似。蛋白质的一级结构(氨基酸序列)可以由其基因编码序列获得,而蛋白质的结构由氨基酸序列唯一决定,这些结构信息包括二级结构,三级结构,四级结构。目前,实现对这些结构的准确预测仍然是一个亟待突破的关键问题。 5 | 6 | 比赛链接:http://challenge.xfyun.cn/topic/info?type=protein&ch=dw-sq-1 7 | 8 | ## 赛事任务 9 | 蛋白质折叠识别常被用于解决蛋白质结构预测问题,本次大赛提供了蛋白质结构分类数据库SCOP中的ASTRAL SCOPe 2.07数据中蛋白质相似性小于40%的α,β,α+β,α/β类中所属的折叠类型作为研究对象。参赛选手需基于提供的样本集构建模型,实现蛋白质的折叠分类。 10 | 11 | 12 | ## 赛题思路 13 | 14 | https://github.com/HighingLIN/danbaizhi 15 | 16 | 把蛋白质的结构视为一句话,每个句子的精度为一个字母,然后embedding编码,再用宽视野的Conv1D提取每个字的局部特征,用MaxPooling1D再去提取局部特征(为了防止过拟合,所以pool_size比较大)。 17 | 18 | 模型的灵感来自于天池蛋白质比赛的top3开源:https://github.com/yjh126yjh/TianChi_Protein-Secondary-Structure-Prediction . 
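
A minimal Keras sketch of the idea above (character embedding → wide-kernel Conv1D → MaxPooling1D with a large pool_size → softmax). This is not the open-sourced winning code; the vocabulary size, sequence length, kernel/pool sizes and number of fold classes below are assumed placeholders:

```python
from tensorflow.keras import layers, models

VOCAB_SIZE = 26          # assumed: distinct amino-acid letters (plus a padding index)
MAX_LEN = 1000           # assumed: sequences padded / truncated to this length
NUM_FOLD_CLASSES = 245   # assumed: number of SCOPe fold labels in the training set

model = models.Sequential([
    layers.Embedding(VOCAB_SIZE, 64, input_length=MAX_LEN),
    # wide receptive field to capture local residue patterns
    layers.Conv1D(128, kernel_size=15, padding="same", activation="relu"),
    # large pool_size, as described above, to reduce overfitting
    layers.MaxPooling1D(pool_size=20),
    layers.Conv1D(128, kernel_size=15, padding="same", activation="relu"),
    layers.GlobalMaxPooling1D(),
    layers.Dense(NUM_FOLD_CLASSES, activation="softmax"),
])
model.compile(optimizer="adam",
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])
```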
-------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛2021/车辆贷款违约预测挑战赛/Baseline.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | 5 | # 导入第三方包 6 | import pandas as pd 7 | import numpy as np 8 | 9 | import lightgbm as lgb 10 | 11 | from sklearn.model_selection import KFold 12 | from sklearn.metrics import f1_score, roc_auc_score 13 | 14 | import warnings 15 | warnings.filterwarnings('ignore') 16 | 17 | 18 | 19 | # 读取数据集,具体下载方式可见操作手册 20 | train = pd.read_csv('train.csv') 21 | test = pd.read_csv('test.csv') 22 | 23 | sample_submit = pd.read_csv('sample_submit.csv') 24 | 25 | 26 | # 训练数据及测试数据准备 27 | all_cols = [f for f in train.columns if f not in ['customer_id','loan_default']] 28 | 29 | x_train = train[all_cols] 30 | x_test = test[all_cols] 31 | 32 | y_train = train['loan_default'] 33 | 34 | 35 | # 作为baseline部分仅使用经典的**LightGBM**作为训练模型,我们还能尝试**XGBoost、CatBoost和NN(神经网络)** 36 | def cv_model(clf, train_x, train_y, test_x, clf_name='lgb'): 37 | folds = 5 38 | seed = 2021 39 | kf = KFold(n_splits=folds, shuffle=True, random_state=seed) 40 | 41 | train = np.zeros(train_x.shape[0]) 42 | test = np.zeros(test_x.shape[0]) 43 | 44 | cv_scores = [] 45 | 46 | for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)): 47 | print('************************************ {} ************************************'.format(str(i+1))) 48 | trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y[train_index], train_x.iloc[valid_index], train_y[valid_index] 49 | 50 | train_matrix = clf.Dataset(trn_x, label=trn_y) 51 | valid_matrix = clf.Dataset(val_x, label=val_y) 52 | 53 | params = { 54 | 'boosting_type': 'gbdt', 55 | 'objective': 'binary', 56 | 'metric': 'auc', 57 | 'min_child_weight': 5, 58 | 'num_leaves': 2 ** 7, 59 | 'lambda_l2': 10, 60 | 'feature_fraction': 0.9, 61 | 'bagging_fraction': 0.9, 62 | 'bagging_freq': 4, 63 | 'learning_rate': 0.01, 64 | 'seed': 2021, 65 | 'nthread': 28, 66 | 'n_jobs':-1, 67 | 'silent': True, 68 | 'verbose': -1, 69 | } 70 | 71 | model = clf.train(params, train_matrix, 50000, valid_sets=[train_matrix, valid_matrix], verbose_eval=500,early_stopping_rounds=200) 72 | val_pred = model.predict(val_x, num_iteration=model.best_iteration) 73 | test_pred = model.predict(test_x, num_iteration=model.best_iteration) 74 | 75 | # print(list(sorted(zip(features, model.feature_importance("gain")), key=lambda x: x[1], reverse=True))[:20]) 76 | 77 | train[valid_index] = val_pred 78 | test += test_pred / kf.n_splits 79 | cv_scores.append(roc_auc_score(val_y, val_pred)) 80 | 81 | print(cv_scores) 82 | 83 | print("%s_scotrainre_list:" % clf_name, cv_scores) 84 | print("%s_score_mean:" % clf_name, np.mean(cv_scores)) 85 | print("%s_score_std:" % clf_name, np.std(cv_scores)) 86 | return train, test 87 | 88 | 89 | 90 | lgb_train, lgb_test = cv_model(lgb, x_train, y_train, x_test) 91 | 92 | 93 | # 预测结果 94 | sample_submit['loan_default'] = lgb_test 95 | sample_submit['loan_default'] = sample_submit['loan_default'].apply(lambda x:1 if x>0.25 else 0).values 96 | sample_submit.to_csv('baseline_result.csv', index=False) 97 | 98 | 99 | 100 | 101 | -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛2022/README.md: -------------------------------------------------------------------------------- 1 | 
本届大赛按照算法、应用、编程赛、虚拟形象选拔、辩论赛、创意集市创意赛等等方向设置众多赛道;覆盖了智能语音、视觉、自然语言、图文识别等AI热门技术;涵盖了元宇宙、遗址文化、生物与环保、医疗健康、智能家居、电商销售等众多领域。大赛地址: 2 | 3 | https://challenge.xfyun.cn/?ch=ds22-dw-sq04 4 | -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛2022/汽车领域多语种迁移学习挑战赛-baseline-0.61.py: -------------------------------------------------------------------------------- 1 | import pandas as pd # 读取文件 2 | import numpy as np # 数值计算 3 | import nagisa # 日文分词 4 | from sklearn.feature_extraction.text import TfidfVectorizer # 文本特征提取 5 | from sklearn.linear_model import LogisticRegression # 逻辑回归 6 | from sklearn.pipeline import make_pipeline # 组合流水线 7 | 8 | # 读取数据 9 | train_cn = pd.read_excel('汽车领域多语种迁移学习挑战赛初赛训练集/中文_trian.xlsx') 10 | train_ja = pd.read_excel('汽车领域多语种迁移学习挑战赛初赛训练集/日语_train.xlsx') 11 | train_en = pd.read_excel('汽车领域多语种迁移学习挑战赛初赛训练集/英文_train.xlsx') 12 | 13 | test_ja = pd.read_excel('testA.xlsx', sheet_name='日语_testA') 14 | test_en = pd.read_excel('testA.xlsx', sheet_name='英文_testA') 15 | 16 | # 文本分词 17 | train_ja['words'] = train_ja['原始文本'].apply(lambda x: ' '.join(nagisa.tagging(x).words)) 18 | train_en['words'] = train_en['原始文本'].apply(lambda x: x.lower()) 19 | 20 | test_ja['words'] = test_ja['原始文本'].apply(lambda x: ' '.join(nagisa.tagging(x).words)) 21 | test_en['words'] = test_en['原始文本'].apply(lambda x: x.lower()) 22 | 23 | # 训练TFIDF和逻辑回归 24 | pipline = make_pipeline( 25 | TfidfVectorizer(), 26 | LogisticRegression() 27 | ) 28 | pipline.fit( 29 | train_ja['words'].tolist() + train_en['words'].tolist(), 30 | train_ja['意图'].tolist() + train_en['意图'].tolist() 31 | ) 32 | 33 | # 模型预测 34 | test_ja['意图'] = pipline.predict(test_ja['words']) 35 | test_en['意图'] = pipline.predict(test_en['words']) 36 | test_en['槽值1'] = np.nan 37 | test_en['槽值2'] = np.nan 38 | 39 | test_ja['槽值1'] = np.nan 40 | test_ja['槽值2'] = np.nan 41 | 42 | # 写入提交文件 43 | writer = pd.ExcelWriter('submit.xlsx') 44 | test_en.drop(['words'], axis=1).to_excel(writer, sheet_name='英文_testA', index=None) 45 | test_ja.drop(['words'], axis=1).to_excel(writer, sheet_name='日语_testA', index=None) 46 | writer.save() 47 | writer.close() 48 | -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛2023/README.md: -------------------------------------------------------------------------------- 1 | https://challenge.xfyun.cn/?ch=vWxQGFU 2 | -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛2023/中文语义病句识别纠正_baseline.md: -------------------------------------------------------------------------------- 1 | - 赛题名称:中文语义病句识别纠正 2 | - 赛题类型:病句识别、错词纠正 3 | - 赛题报名链接👇: 4 | 5 | https://challenge.xfyun.cn/topic/info?type=identification-and-correction&ch=vWxQGFU 6 | 7 | ## 赛题背景 8 | 9 | 近年来随着自媒体热潮的掀起,人人都是信息的生产者,互联网上文本错误的内容暴增,如何避免这些文本错误,成为了人们迫切关注的问题。因此,各大有关文本校对的比赛相拥而至。 10 | 11 | 然而,过往的文本错误主要针对拼写错误和语法错误,这些错误对于人类来说相对简单,往往是由外国语言学习者和中文母语写作者的疏忽而产生的。对于出版、教育等一些对深层次的中文语义错误识别有需求的行业,中文语义病句的识别将会有更大的帮助。 12 | 13 | 语义病句经常出现在初高中的语文考试题目中,用来衡量学生对语文知识的掌握程度,这类语义病句对于学生来说是比较困难的,对于研究也有重大意义。 14 | 15 | 16 | ## 赛事任务 17 | 18 | 本赛事包含两个任务,分别是:中文语义病句识别和中文语义病句纠正。中文语义病句识别是一个二分类的问题,预测句子是否是语义病句。中文语义病句纠正任务需要针对病句给出纠正后的句子。语义错误和拼写错误、语法错误不同,语义错误更加关注句子语义层面的合法性,语义病句例子如下表所示。 19 | 20 | | 病句 | 纠正后的句子 | 21 | | ---------------------------------------- | ---------------------------------------- | 22 | | 英法联军**烧毁并洗劫**了北京圆明园。 | 英法联军**洗劫并烧毁**了北京圆明园。 | 23 | | 山上的水宝贵,把它留给**晚上来**的人喝。 | 山上的水宝贵,把它留给**上来晚**的人喝。 | 24 | | 国内彩电**市场**严重**滞销**。 | 国内彩电严重**滞销**。 | 25 | 26 | 27 | ## 赛题数据 28 | 
本次比赛使用的数据一部分来自网络上的中小学病句题库,一部分来自人工标注。每条数据包括句子id、句子标签(0:正确句子/1:病句)、原始句子、纠正后的句子。数据格式示例如下表所示: 29 | 30 | | id | 标签 | 原始句子 | 纠正后的句子 | 31 | | ---- | ---- | ------------------------------------ | ------------------------------------ | 32 | | 1 | 1 | 英法联军烧毁并洗劫了北京圆明园。 | 英法联军洗劫并烧毁了北京圆明园。 | 33 | | 2 | 1 | 山上的水宝贵,把它留给晚上来的人喝。 | 山上的水宝贵,把它留给上来晚的人喝。 | 34 | | 3 | 0 | 国内彩电严重滞销。 | 国内彩电严重滞销。 | 35 | 36 | ## 评估指标 37 | - 中文语义病句识别任务 38 | 39 | 本模型依据提交的结果文件,采用针对语义病句的F1-score进行评价。 40 | 41 | - 中文语义病句纠正任务 42 | 43 | 本任务采用ChERRANT(Chinese ERRANT)中文GEC评估工具。ChERRANT的主要功能是通过对比预测编辑和标准编辑,计算预测结果的精确度、召回度、F值指标,从而评估语法纠错模型的性能。 44 | 45 | ## 解题思路 46 | 47 | 赛题本质是错词纠正任务,需要使用错词纠正的模型进行训练和预测。这里给出一种简单的思路,使用`t5`进行错词纠正。 48 | 49 | ### 步骤1:配置pycorrector 50 | 51 | `pycorrector`自带有`t5`预训练的错词纠正模型,首选需要配置如下库: 52 | 53 | ``` 54 | torch 55 | transformers 56 | datasets 57 | loguru 58 | ``` 59 | 60 | 接下来下载`pycorrector`代码: 61 | 62 | ``` 63 | https://github.com/shibing624/pycorrector 64 | ``` 65 | 66 | ### 步骤2:定义数据集 67 | 68 | 将比赛数据集转换为tsv格式,为【原始句子】 + 【\t】 + 【正确句子的格式】。每一行为一条训练样本。 69 | 70 | ``` 71 | 你说的是对,跟那些失业的人比起来你也算是辛运的。 你说的是对,跟那些失业的人比起来你也算是幸运的。 72 | ``` 73 | 74 | ### 步骤3:模型训练 75 | 76 | 77 | 根据自己的GPU大小,修改batch size。 78 | 79 | https://github.com/shibing624/pycorrector/blob/master/pycorrector/t5/train.py 80 | 81 | 命令行运行进行训练: 82 | 83 | ``` 84 | python train.py --do_train --train_path ../../../train.tsv 85 | ``` 86 | 87 | ### 步骤4:模型预测与提交 88 | 89 | 修改预测代码中的模型加载路径,传入待预测句子。最终将生成的结果写为指定json的格式,然后提交。 90 | 91 | 92 | https://github.com/shibing624/pycorrector/blob/master/pycorrector/t5/t5_corrector.py 93 | 94 | ``` 95 | python3.9 t5_corrector.py 96 | ``` 97 | 98 | 这种思路的分数有31分左右,还有很大的提高空间。比如交叉训练、预训练,或者使用ChatGLM进行尝试。 99 | 100 | 101 | 方案开源地址: 102 | 103 | https://github.com/datawhalechina/competition-baseline/tree/master/competition/%E7%A7%91%E5%A4%A7%E8%AE%AF%E9%A3%9EAI%E5%BC%80%E5%8F%91%E8%80%85%E5%A4%A7%E8%B5%9B2023 104 | -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛2023/人岗匹配挑战赛2023_baseline.md: -------------------------------------------------------------------------------- 1 | 本地目录如下: 2 | 3 | ``` 4 | person-post-matching-2023/ 5 | run.py 6 | train.json 从比赛官网下载 7 | job_list.json 从比赛官网下载 8 | ``` 9 | 10 | 打包提交过程 11 | ``` 12 | tar -cvzf person-post-matching-2023.tar.gz person-post-matching-2023/ 13 | s3cmd put person-post-matching-2023.tar.gz s3://ai-competition/你的url/ 14 | ``` 15 | 16 | run.py代码内容如下: 17 | 18 | ```python 19 | import json 20 | import pandas as pd 21 | import numpy as np 22 | from sklearn.linear_model import LogisticRegression 23 | from sklearn.ensemble import RandomForestClassifier 24 | from sklearn.model_selection import cross_val_predict 25 | 26 | train_data = pd.read_json('./train.json') 27 | train_data['解析结果'] = train_data['解析结果'].apply(lambda x : json.dumps(x).replace('"', ' ').replace('"', ' ').split()) 28 | 29 | test_data = pd.read_json('/work/data/personnel-matching-test-set/test.json') 30 | test_data['解析结果'] = test_data['解析结果'].apply(lambda x : json.dumps(x).replace('"', ' ').replace('"', ' ').split()) 31 | 32 | joblist = pd.read_json('./job_list.json') 33 | joblist['解析结果'] = joblist['岗位名称'] + ' ' + joblist['岗位介绍'] + ' ' + joblist['岗位要求'] 34 | joblist['解析结果'] = joblist['解析结果'].apply(lambda x : x.split()) 35 | 36 | train_feat = [] 37 | for row in train_data.iterrows(): 38 | label = row[1]['岗位ID'] 39 | query_text= row[1]['解析结果'] 40 | feat = [ 41 | label, 42 | len(query_text), len(set(query_text)), len(query_text) - len(set(query_text)), 43 | ] 44 | for target_text in joblist['解析结果']: 45 
| feat += [ 46 | len(set(query_text) & set(target_text)), 47 | len(set(query_text) & set(target_text)) / len(query_text), 48 | len(set(query_text) & set(target_text)) / len(target_text), 49 | 50 | len(set(query_text) & set(target_text)) / len(set(target_text)), 51 | len(set(query_text) & set(target_text)) / len(set(query_text)) 52 | 53 | ] 54 | train_feat.append(feat) 55 | train_feat = np.array(train_feat) 56 | m = RandomForestClassifier() 57 | m.fit( 58 | train_feat[:, 1:], 59 | train_feat[:, 0], 60 | ) 61 | 62 | test_feat = [] 63 | for row in test_data.iterrows(): 64 | query_text= row[1]['解析结果'] 65 | feat = [ 66 | len(query_text), len(set(query_text)), len(query_text) - len(set(query_text)), 67 | ] 68 | for target_text in joblist['解析结果']: 69 | feat += [ 70 | len(set(query_text) & set(target_text)), 71 | len(set(query_text) & set(target_text)) / len(query_text), 72 | len(set(query_text) & set(target_text)) / len(target_text), 73 | 74 | len(set(query_text) & set(target_text)) / len(set(target_text)), 75 | len(set(query_text) & set(target_text)) / len(set(query_text)) 76 | 77 | ] 78 | test_feat.append(feat) 79 | test_feat = np.array(test_feat) 80 | pd.DataFrame({ 81 | '简历ID': range(len(test_data)), 82 | '岗位ID': m.predict(test_feat).astype(int) 83 | }).to_csv('/work/output/result.csv', index=None) 84 | 85 | ``` 86 | -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛2023/基于近红外光谱的煤质参数预测挑战赛_baseline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "id": "584f7c00-08a1-4776-a29a-a4e9095fcf48", 7 | "metadata": { 8 | "tags": [] 9 | }, 10 | "outputs": [], 11 | "source": [ 12 | "import pandas as pd" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 13, 18 | "id": "aad799ff-7a71-4c3c-883c-9c7d5b504c10", 19 | "metadata": { 20 | "tags": [] 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "train_data = pd.read_csv('基于近红外光谱的煤质参数预测挑战赛公开数据/train_data.csv')\n", 25 | "train_label = pd.read_csv('基于近红外光谱的煤质参数预测挑战赛公开数据/train_label.csv')\n", 26 | "\n", 27 | "test_data = pd.read_csv('基于近红外光谱的煤质参数预测挑战赛公开数据/test_data.csv')\n", 28 | "submit = pd.read_csv('提交示例.csv')" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 10, 34 | "id": "06b6d303-accd-4aad-84ee-8b3e225714b7", 35 | "metadata": { 36 | "tags": [] 37 | }, 38 | "outputs": [ 39 | { 40 | "data": { 41 | "text/plain": [ 42 | "((100, 126), (500, 126))" 43 | ] 44 | }, 45 | "execution_count": 10, 46 | "metadata": {}, 47 | "output_type": "execute_result" 48 | } 49 | ], 50 | "source": [ 51 | "test_data.shape, train_data.shape" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 19, 57 | "id": "f56185c0-f1cc-46e9-9795-4b394254ca24", 58 | "metadata": { 59 | "tags": [] 60 | }, 61 | "outputs": [ 62 | { 63 | "data": { 64 | "text/html": [ 65 | "
\n", 66 | "\n", 79 | "\n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | "
样品编号水分灰分
0012.26.04
1111.76.03
229.029.58
337.622.95
4414.419.87
\n", 121 | "
" 122 | ], 123 | "text/plain": [ 124 | " 样品编号 水分 灰分\n", 125 | "0 0 12.2 6.04\n", 126 | "1 1 11.7 6.03\n", 127 | "2 2 9.0 29.58\n", 128 | "3 3 7.6 22.95\n", 129 | "4 4 14.4 19.87" 130 | ] 131 | }, 132 | "execution_count": 19, 133 | "metadata": {}, 134 | "output_type": "execute_result" 135 | } 136 | ], 137 | "source": [ 138 | "train_label.head()" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 21, 144 | "id": "62faf00b-b406-4e14-b788-bb227c1c7928", 145 | "metadata": { 146 | "tags": [] 147 | }, 148 | "outputs": [], 149 | "source": [ 150 | "from sklearn.linear_model import LinearRegression\n", 151 | "\n", 152 | "m = LinearRegression()\n", 153 | "m.fit(train_data.iloc[:, 1:], train_label['水分'])\n", 154 | "submit['水分'] = m.predict(test_data.iloc[:, 1:])\n", 155 | "\n", 156 | "m = LinearRegression()\n", 157 | "m.fit(train_data.iloc[:, 1:], train_label['灰分'])\n", 158 | "submit['灰分'] = m.predict(test_data.iloc[:, 1:])\n", 159 | "\n", 160 | "submit.to_csv('lr.csv', index=None)" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "id": "36ed68c8-e3f6-42ba-afb1-4ffc02885968", 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [] 170 | } 171 | ], 172 | "metadata": { 173 | "kernelspec": { 174 | "display_name": "Python 3 (ipykernel)", 175 | "language": "python", 176 | "name": "python3.10" 177 | }, 178 | "language_info": { 179 | "codemirror_mode": { 180 | "name": "ipython", 181 | "version": 3 182 | }, 183 | "file_extension": ".py", 184 | "mimetype": "text/x-python", 185 | "name": "python", 186 | "nbconvert_exporter": "python", 187 | "pygments_lexer": "ipython3", 188 | "version": "3.9.10" 189 | }, 190 | "widgets": { 191 | "application/vnd.jupyter.widget-state+json": { 192 | "state": {}, 193 | "version_major": 2, 194 | "version_minor": 0 195 | } 196 | } 197 | }, 198 | "nbformat": 4, 199 | "nbformat_minor": 5 200 | } 201 | -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛2023/能源消耗预测挑战赛_baseline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 4, 6 | "id": "90ded076-1959-49b5-8021-2f954c96d92f", 7 | "metadata": { 8 | "tags": [] 9 | }, 10 | "outputs": [], 11 | "source": [ 12 | "import pandas as pd" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 17, 18 | "id": "0ecb9839-fe23-47a1-8a5f-9e5ee07bd8a9", 19 | "metadata": { 20 | "tags": [] 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "train_data = pd.read_csv(\"能源消耗预测挑战赛公开数据/train.csv\")\n", 25 | "test_data = pd.read_csv(\"能源消耗预测挑战赛公开数据/test.csv\")" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 18, 31 | "id": "827e9054-cdb6-41f7-a4b3-132d22257400", 32 | "metadata": { 33 | "tags": [] 34 | }, 35 | "outputs": [ 36 | { 37 | "data": { 38 | "text/html": [ 39 | "
\n", 40 | "\n", 53 | "\n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | "
dateid
020191227000
120191227001
220191227002
320191227003
420191227004
.........
598752019123123494
598762019123123495
598772019123123496
598782019123123497
598792019123123498
\n", 119 | "

59880 rows × 2 columns

\n", 120 | "
" 121 | ], 122 | "text/plain": [ 123 | " date id\n", 124 | "0 2019122700 0\n", 125 | "1 2019122700 1\n", 126 | "2 2019122700 2\n", 127 | "3 2019122700 3\n", 128 | "4 2019122700 4\n", 129 | "... ... ...\n", 130 | "59875 2019123123 494\n", 131 | "59876 2019123123 495\n", 132 | "59877 2019123123 496\n", 133 | "59878 2019123123 497\n", 134 | "59879 2019123123 498\n", 135 | "\n", 136 | "[59880 rows x 2 columns]" 137 | ] 138 | }, 139 | "execution_count": 18, 140 | "metadata": {}, 141 | "output_type": "execute_result" 142 | } 143 | ], 144 | "source": [ 145 | "test_data" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 19, 151 | "id": "7bf021cb-9954-4830-b3bc-c9627fc4011a", 152 | "metadata": { 153 | "tags": [] 154 | }, 155 | "outputs": [], 156 | "source": [ 157 | "train_data['new_date'] = pd.to_datetime(train_data['date'], format='%Y%m%d%H')\n", 158 | "test_data['new_date'] = pd.to_datetime(test_data['date'], format='%Y%m%d%H')" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 20, 164 | "id": "682eda08-76c4-446d-8e35-1055a50ab8e8", 165 | "metadata": { 166 | "tags": [] 167 | }, 168 | "outputs": [], 169 | "source": [ 170 | "train_data['new_date_hour'] = train_data['new_date'].dt.hour\n", 171 | "test_data['new_date_hour'] = test_data['new_date'].dt.hour" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 23, 177 | "id": "9e5a60b9-b677-44b0-8d63-87aea9213d0e", 178 | "metadata": { 179 | "tags": [] 180 | }, 181 | "outputs": [], 182 | "source": [ 183 | "mean_target = train_data.groupby(['id', \"new_date_hour\"])['target'].mean().reset_index()" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": 24, 189 | "id": "a2330daa-8885-4756-b905-a404d975f5e0", 190 | "metadata": { 191 | "tags": [] 192 | }, 193 | "outputs": [], 194 | "source": [ 195 | "pd.merge(test_data, mean_target, \n", 196 | " on=['id', \"new_date_hour\"], how='left'\n", 197 | ")[['date', 'id', 'target']].to_csv('submit.csv', index=None)" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": null, 203 | "id": "1de2bb6e-fc05-469c-ba33-b8185778e393", 204 | "metadata": {}, 205 | "outputs": [], 206 | "source": [] 207 | } 208 | ], 209 | "metadata": { 210 | "kernelspec": { 211 | "display_name": "Python 3 (ipykernel)", 212 | "language": "python", 213 | "name": "python3.10" 214 | }, 215 | "language_info": { 216 | "codemirror_mode": { 217 | "name": "ipython", 218 | "version": 3 219 | }, 220 | "file_extension": ".py", 221 | "mimetype": "text/x-python", 222 | "name": "python", 223 | "nbconvert_exporter": "python", 224 | "pygments_lexer": "ipython3", 225 | "version": "3.9.10" 226 | }, 227 | "widgets": { 228 | "application/vnd.jupyter.widget-state+json": { 229 | "state": {}, 230 | "version_major": 2, 231 | "version_minor": 0 232 | } 233 | } 234 | }, 235 | "nbformat": 4, 236 | "nbformat_minor": 5 237 | } 238 | -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛2023/自动驾驶疲劳检测挑战赛_baseline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 8, 6 | "id": "96596192-bd3b-4b9a-8b72-6060f63ba75b", 7 | "metadata": { 8 | "tags": [] 9 | }, 10 | "outputs": [], 11 | "source": [ 12 | "import numpy as np\n", 13 | "import pandas as pd\n", 14 | "from sklearn.svm import LinearSVC" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 11, 20 | "id": 
"d00f1077-eb0c-4dc4-b80d-231590d59655", 21 | "metadata": { 22 | "tags": [] 23 | }, 24 | "outputs": [], 25 | "source": [ 26 | "train_data = np.load('自动驾驶疲劳检测挑战赛公开数据-更新/train.npy')\n", 27 | "test_data = np.load('自动驾驶疲劳检测挑战赛公开数据-更新/test.npy')\n", 28 | "train_label = pd.read_csv('自动驾驶疲劳检测挑战赛公开数据-更新/train_label.csv', header=None)" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 15, 34 | "id": "7afcf7bd-fa8b-41b6-ace9-fbbe884b888b", 35 | "metadata": { 36 | "tags": [] 37 | }, 38 | "outputs": [ 39 | { 40 | "data": { 41 | "text/plain": [ 42 | "((10000, 64, 64, 3), (15000, 64, 64, 3))" 43 | ] 44 | }, 45 | "execution_count": 15, 46 | "metadata": {}, 47 | "output_type": "execute_result" 48 | } 49 | ], 50 | "source": [ 51 | "train_data.shape, test_data.shape" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "id": "53e64dd9-7aee-4248-acf3-03e7106543ab", 58 | "metadata": { 59 | "tags": [] 60 | }, 61 | "outputs": [], 62 | "source": [ 63 | "m = LinearSVC()\n", 64 | "m.fit(train_data.reshape(10000, -1), train_label)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 17, 70 | "id": "5a609443-9af4-4e7c-a899-bd06a492a4dd", 71 | "metadata": { 72 | "tags": [] 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "pd.DataFrame(m.predict(test_data.reshape(15000, -1))).to_csv('submit.csv', index=None, header=None)" 77 | ] 78 | } 79 | ], 80 | "metadata": { 81 | "kernelspec": { 82 | "display_name": "Python 3 (ipykernel)", 83 | "language": "python", 84 | "name": "python3.10" 85 | }, 86 | "language_info": { 87 | "codemirror_mode": { 88 | "name": "ipython", 89 | "version": 3 90 | }, 91 | "file_extension": ".py", 92 | "mimetype": "text/x-python", 93 | "name": "python", 94 | "nbconvert_exporter": "python", 95 | "pygments_lexer": "ipython3", 96 | "version": "3.9.10" 97 | }, 98 | "widgets": { 99 | "application/vnd.jupyter.widget-state+json": { 100 | "state": {}, 101 | "version_major": 2, 102 | "version_minor": 0 103 | } 104 | } 105 | }, 106 | "nbformat": 4, 107 | "nbformat_minor": 5 108 | } 109 | -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛2023/跨境电商效果广告ROI预测挑战赛_baseline.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import seaborn as sns 5 | 6 | train_data = pd.read_csv('跨境电商效果广告ROI预测挑战赛公开数据/train.csv') 7 | test_data = pd.read_csv('跨境电商效果广告ROI预测挑战赛公开数据/testA.csv') 8 | 9 | train_data['datetime'] = pd.to_datetime(train_data['datetime']) 10 | test_data['datetime'] = pd.to_datetime(test_data['datetime']) 11 | train_data['datetime_hour'] = train_data['datetime'].dt.hour 12 | test_data['datetime_hour'] = test_data['datetime'].dt.hour 13 | 14 | train_data.drop('datetime', axis=1, inplace=True) 15 | test_data.drop('datetime', axis=1, inplace=True) 16 | 17 | from sklearn.preprocessing import LabelEncoder 18 | 19 | for col in ['ad_id', 'ad_set_id', 'campaign_id', 'product_id', 'account_id', 'post_id_emb', 'post_type', 'countries']: 20 | lbl = LabelEncoder() 21 | lbl.fit(list(train_data[col]) + list(test_data[col])) 22 | train_data[col] = lbl.transform(list(train_data[col])) 23 | test_data[col] = lbl.transform(list(test_data[col])) 24 | 25 | from lightgbm import LGBMRegressor 26 | model = LGBMRegressor() 27 | 28 | train_data['product_id_roi_mean'] = train_data['product_id'].map(train_data.groupby(['product_id'])['roi'].mean()) 29 | test_data['product_id_roi_mean'] = 
test_data['product_id'].map(train_data.groupby(['product_id'])['roi'].mean()) 30 | 31 | train_data['account_id_roi_mean'] = train_data['account_id'].map(train_data.groupby(['account_id'])['roi'].mean()) 32 | test_data['account_id_roi_mean'] = test_data['account_id'].map(train_data.groupby(['account_id'])['roi'].mean()) 33 | 34 | train_data['countries_roi_mean'] = train_data['countries'].map(train_data.groupby(['countries'])['roi'].mean()) 35 | test_data['countries_roi_mean'] = test_data['countries'].map(train_data.groupby(['countries'])['roi'].mean()) 36 | 37 | train_data['datetime_hour_roi_mean'] = train_data['datetime_hour'].map(train_data.groupby(['datetime_hour'])['roi'].mean()) 38 | test_data['datetime_hour_roi_mean'] = test_data['datetime_hour'].map(train_data.groupby(['datetime_hour'])['roi'].mean()) 39 | 40 | model.fit( 41 | train_data.iloc[:].drop('roi', axis=1), 42 | train_data.iloc[:]['roi'], categorical_feature=['ad_id', 'ad_set_id', 'campaign_id', 'product_id', 'account_id', 'post_id_emb', 'post_type', 'countries'] 43 | ) 44 | 45 | df = pd.read_csv('提交示例.csv') 46 | df['roi'] = model.predict(test_data.iloc[:].drop('uuid', axis=1)) 47 | df.to_csv('submit.csv', index=None) 48 | -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛2023/通信系统调制格式识别与分类挑战赛_baseline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 35, 6 | "id": "617a981e-a469-492b-87ef-4f527e714e19", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import numpy as np\n", 11 | "import pandas as pd" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 36, 17 | "id": "3d880c58-2e16-4afb-bc5d-a7412eb24a62", 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "y_train = np.load('通信调制格式识别与分类数据集公开数据/训练集/Y_train.npy')\n", 22 | "x_train = np.load('通信调制格式识别与分类数据集公开数据/训练集/X_train.npy')\n", 23 | "\n", 24 | "x_test = np.load('通信调制格式识别与分类数据集公开数据/测试集/X_test.npy')" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 37, 30 | "id": "9e3fdb86-5051-49d8-aebc-79e71e2d8e7f", 31 | "metadata": {}, 32 | "outputs": [ 33 | { 34 | "data": { 35 | "text/plain": [ 36 | "(176000, 2, 128)" 37 | ] 38 | }, 39 | "execution_count": 37, 40 | "metadata": {}, 41 | "output_type": "execute_result" 42 | } 43 | ], 44 | "source": [ 45 | "x_train.shape" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 38, 51 | "id": "2393a30d-c902-4507-90e4-e763867e9074", 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "data": { 56 | "text/plain": [ 57 | "1 16152\n", 58 | "7 16063\n", 59 | "2 16047\n", 60 | "4 16024\n", 61 | "6 16018\n", 62 | "8 16010\n", 63 | "9 15977\n", 64 | "0 15944\n", 65 | "5 15942\n", 66 | "3 15923\n", 67 | "10 15900\n", 68 | "dtype: int64" 69 | ] 70 | }, 71 | "execution_count": 38, 72 | "metadata": {}, 73 | "output_type": "execute_result" 74 | } 75 | ], 76 | "source": [ 77 | "pd.DataFrame(y_train.argmax(1)).value_counts()" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 43, 83 | "id": "5b7aa1bd-6f1a-4c9f-b53f-f29264bad295", 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "x_train = x_train.reshape(-1, 256)\n", 88 | "x_test = x_test.reshape(-1, 256)" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 46, 94 | "id": "803aea89-1443-44bb-9853-2cecd0990419", 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "from sklearn.linear_model 
import SGDClassifier\n", 99 | "from sklearn.naive_bayes import GaussianNB\n", 100 | "from sklearn.neighbors import KNeighborsClassifier\n", 101 | "from sklearn.preprocessing import OneHotEncoder\n", 102 | "\n", 103 | "from sklearn.model_selection import cross_val_predict" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 47, 109 | "id": "a835e17e-2659-4b94-81b4-9d3aeb15b647", 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "pred1 = cross_val_predict(GaussianNB(), x_train, y_train.argmax(1))" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 48, 119 | "id": "7be151f1-51a5-4eb3-801b-947ad2884719", 120 | "metadata": {}, 121 | "outputs": [ 122 | { 123 | "data": { 124 | "text/plain": [ 125 | "0.20875" 126 | ] 127 | }, 128 | "execution_count": 48, 129 | "metadata": {}, 130 | "output_type": "execute_result" 131 | } 132 | ], 133 | "source": [ 134 | "(y_train.argmax(1) == pred1).mean()" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 49, 140 | "id": "163513e6-3f89-4755-b787-1cd01ea587da", 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "model = GaussianNB().fit(x_train, y_train.argmax(1))" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 50, 150 | "id": "e9c0d45e-9fb7-4d7c-85a0-93e25258c467", 151 | "metadata": {}, 152 | "outputs": [ 153 | { 154 | "data": { 155 | "text/plain": [ 156 | "array([8, 7, 3, ..., 6, 2, 7])" 157 | ] 158 | }, 159 | "execution_count": 50, 160 | "metadata": {}, 161 | "output_type": "execute_result" 162 | } 163 | ], 164 | "source": [ 165 | "model.predict(x_test)" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 51, 171 | "id": "e2e614c7-6bfc-40a5-9f0c-9d3c53f9b90b", 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "submit = np.zeros((len(x_test), 11))\n", 176 | "submit[np.arange(len(x_test)), model.predict(x_test)] = 1" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 54, 182 | "id": "9df998da-70ad-420c-b3ac-f338c527824a", 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "np.save('submit.npy', submit)" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "id": "835f8204-859d-4dac-ae13-8a88a874b139", 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [] 196 | } 197 | ], 198 | "metadata": { 199 | "kernelspec": { 200 | "display_name": "Python 3 (ipykernel)", 201 | "language": "python", 202 | "name": "python3.10" 203 | }, 204 | "language_info": { 205 | "codemirror_mode": { 206 | "name": "ipython", 207 | "version": 3 208 | }, 209 | "file_extension": ".py", 210 | "mimetype": "text/x-python", 211 | "name": "python", 212 | "nbconvert_exporter": "python", 213 | "pygments_lexer": "ipython3", 214 | "version": "3.9.10" 215 | } 216 | }, 217 | "nbformat": 4, 218 | "nbformat_minor": 5 219 | } 220 | -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛2024/README.md: -------------------------------------------------------------------------------- 1 | https://challenge.xfyun.cn/?ch=dw24_AtTCK9 2 | -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛2024/低资源文本翻译挑战赛_baseline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "7e53efda-8330-4f93-8b0a-a26d25c95ce1", 7 | "metadata": {}, 8 | 
"outputs": [ 9 | { 10 | "name": "stderr", 11 | "output_type": "stream", 12 | "text": [ 13 | "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n" 14 | ] 15 | }, 16 | { 17 | "data": { 18 | "application/vnd.jupyter.widget-view+json": { 19 | "model_id": "c18328a4fa614a84b23facc7e233de76", 20 | "version_major": 2, 21 | "version_minor": 0 22 | }, 23 | "text/plain": [ 24 | "Loading checkpoint shards: 0%| | 0/4 [00:00<|START_OF_TURN_TOKEN|><|USER_TOKEN|>Anneme onu ne kadar sevdiğimi anlatan bir mektup yaz<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>Sevgili Anne,\n", 35 | "\n", 36 | "Bugün sana, seni ne kadar çok sevdiğimi ifade etmek istiyorum. Senin için hissettiklerim kelimelerle ifade edilemeyecek kadar derin ve güçlü. Sen benim hayatımın ışığı, en büyük destekçim ve en sevdiğim insansın.\n", 37 | "\n", 38 | "Her gün seninle geçirdiğim her an için minnettarım. Senin sevgin, rehberliğin ve desteğin olmadan hayatım çok farklı olurdu. Bana verdiğin\n" 39 | ] 40 | } 41 | ], 42 | "source": [ 43 | "# pip install transformers==4.41.1\n", 44 | "from transformers import AutoTokenizer, AutoModelForCausalLM\n", 45 | "\n", 46 | "# https://huggingface.co/CohereForAI/aya-23-8B/\n", 47 | "model_id = \"/home/lyz/hf-models/aya-23-8B/\" # 本地路径\n", 48 | "tokenizer = AutoTokenizer.from_pretrained(model_id)\n", 49 | "model = AutoModelForCausalLM.from_pretrained(model_id)\n", 50 | "\n", 51 | "# Format message with the command-r-plus chat template\n", 52 | "messages = [{\"role\": \"user\", \"content\": \"Anneme onu ne kadar sevdiğimi anlatan bir mektup yaz\"}]\n", 53 | "input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors=\"pt\")\n", 54 | "## <|START_OF_TURN_TOKEN|><|USER_TOKEN|>Anneme onu ne kadar sevdiğimi anlatan bir mektup yaz<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>\n", 55 | "\n", 56 | "gen_tokens = model.generate(\n", 57 | " input_ids, \n", 58 | " max_new_tokens=100, \n", 59 | " do_sample=True, \n", 60 | " temperature=0.3,\n", 61 | " )\n", 62 | "\n", 63 | "gen_text = tokenizer.decode(gen_tokens[0])\n", 64 | "print(gen_text)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "id": "21d274a9-b0da-425c-82bb-bc93c0a8c254", 71 | "metadata": {}, 72 | "outputs": [ 73 | { 74 | "name": "stderr", 75 | "output_type": "stream", 76 | "text": [ 77 | " 0%| | 3/1000 [00:30<2:58:12, 10.72s/it]" 78 | ] 79 | } 80 | ], 81 | "source": [ 82 | "from tqdm import tqdm\n", 83 | "test_lines = open('testA.nl').readlines()\n", 84 | "\n", 85 | "result = []\n", 86 | "for line in tqdm(test_lines):\n", 87 | " # Format message with the command-r-plus chat template\n", 88 | " messages = [{\"role\": \"user\", \"content\": f\"将下面荷兰语翻译为中文:{line}\"}]\n", 89 | " input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors=\"pt\")\n", 90 | " \n", 91 | " gen_tokens = model.generate(\n", 92 | " input_ids, \n", 93 | " max_new_tokens=100, \n", 94 | " do_sample=True, \n", 95 | " temperature=0.3,\n", 96 | " )\n", 97 | " \n", 98 | " gen_text = tokenizer.decode(gen_tokens[0])\n", 99 | " result.append(\n", 100 | " gen_text.split('<|CHATBOT_TOKEN|>')[1].split('<|')[0]\n", 101 | " )\n", 102 | "\n", 103 | " with open('submit.csv', 'a') as up:\n", 104 | " up.write(result[-1].replace('\\n', '') + '\\n')" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "id": 
"06b375b3-8a5f-411d-b1e1-049e6350c5c5", 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [] 114 | } 115 | ], 116 | "metadata": { 117 | "kernelspec": { 118 | "display_name": "py3.11", 119 | "language": "python", 120 | "name": "py3.11" 121 | }, 122 | "language_info": { 123 | "codemirror_mode": { 124 | "name": "ipython", 125 | "version": 3 126 | }, 127 | "file_extension": ".py", 128 | "mimetype": "text/x-python", 129 | "name": "python", 130 | "nbconvert_exporter": "python", 131 | "pygments_lexer": "ipython3", 132 | "version": "3.11.8" 133 | } 134 | }, 135 | "nbformat": 4, 136 | "nbformat_minor": 5 137 | } 138 | -------------------------------------------------------------------------------- /competition/科大讯飞AI开发者大赛2024/大模型能力评测中文成语释义与解析_baseline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "id": "714855ed-9040-44b1-a930-86edf0952277", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "data": { 11 | "application/vnd.jupyter.widget-view+json": { 12 | "model_id": "4428241dd396428683892fe51ed5d438", 13 | "version_major": 2, 14 | "version_minor": 0 15 | }, 16 | "text/plain": [ 17 | "Loading checkpoint shards: 0%| | 0/2 [00:00 2 | 3 | 4 | 5 | 数据竞赛开源项目 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 |
14 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | -------------------------------------------------------------------------------- /tutorial/bert/README.md: -------------------------------------------------------------------------------- 1 | 2 | ## transformers教程 3 | 4 | https://github.com/huggingface/transformers/tree/main/examples/pytorch 5 | 6 | https://github.com/huggingface/transformers/tree/main/examples/tensorflow 7 | -------------------------------------------------------------------------------- /tutorial/jax/README.md: -------------------------------------------------------------------------------- 1 | ## Jax 2 | 3 | - [https://jax.readthedocs.io/en/latest/](https://jax.readthedocs.io/en/latest/) 4 | - [🤗 Hugging Face Models](https://huggingface.co/models?library=jax) 5 | 6 | ## Flax 7 | 8 | - Doc: [https://flax.readthedocs.io/en/latest/index.html](https://flax.readthedocs.io/en/latest/index.html) 9 | - Github: [https://github.com/google/flax](https://github.com/google/flax) 10 | 11 | ## Demo 12 | 13 | - [Digit Recognizer using JAX/FLAX (Toy image dataset)](https://www.kaggle.com/nilaychauhan/digit-recognizer-using-jax-flax) 14 | - [Dog Breed Classification using JAX/FLAX (Image Dataset)](https://www.kaggle.com/nilaychauhan/dog-breed-classification-using-jax-and-flax) 15 | - [Jigsaw toxic comment classification using JAX/FLAX (Text Dataset)](https://www.kaggle.com/nilaychauhan/jigsaw-toxic-comment-classification-using-jax-flax) 16 | - [Cornell BirdCall classification using JAX/FLAX (Audio dataset)](https://www.kaggle.com/nilaychauhan/cornell-birdcall-audio-recognition-using-jax-flax) 17 | -------------------------------------------------------------------------------- /tutorial/paddlepaddle/README.md: -------------------------------------------------------------------------------- 1 | ## 用PaddlePaddle打比赛 2 | 3 | ### 计算机视觉比赛 4 | 5 | - 科大讯飞-人脸关键点检测挑战赛 6 | - [科大讯飞-人脸关键点检测挑战赛:基础思路](https://aistudio.baidu.com/aistudio/projectdetail/2772561) 7 | - [科大讯飞-人脸关键点检测挑战赛:进阶思路预训练模型](https://aistudio.baidu.com/aistudio/projectdetail/2792492) 8 | - 科大讯飞-电商图像检索挑战赛 9 | - [科大讯飞-电商图像检索挑战赛:基础思路CNN相似度](https://aistudio.baidu.com/aistudio/projectdetail/2798206) 10 | - [AIWIN 手写体OCR识别竞赛](https://aistudio.baidu.com/aistudio/projectdetail/2612313) 11 | - [DataFountain-交通标志分类识别:CNN多分类](https://aistudio.baidu.com/aistudio/projectdetail/3171955) 12 | - [DataFountain-天气以及时间分类:CNN多标签分类](https://aistudio.baidu.com/aistudio/projectdetail/3169455) 13 | 14 | ### 自然语言处理比赛 15 | 16 | - [科大讯飞-学术论文分类挑战赛:ERNIE](https://aistudio.baidu.com/aistudio/projectdetail/3162632) 17 | - [科大讯飞-中文问题相似度挑战赛:ERNIE-NSP](https://aistudio.baidu.com/aistudio/projectdetail/3168859) 18 | - [DataFountain-疫情期间网民情绪识别:ERNIE分类](https://aistudio.baidu.com/aistudio/projectdetail/3172384) 19 | - [科大讯飞-中文成语填空挑战赛:ERNIE MASK填空](https://aistudio.baidu.com/aistudio/projectdetail/3169048) 20 | 21 | ### 推荐系统&CTR比赛 22 | 23 | - [使用Paddle完成图书推荐](https://aistudio.baidu.com/aistudio/projectdetail/2556840) 24 | - [WSDM-爱奇艺:用户留存预测挑战赛](https://aistudio.baidu.com/aistudio/projectdetail/2715522) 25 | - [WSDM-亚马逊跨境电商推荐topline](https://aistudio.baidu.com/aistudio/projectdetail/3142643) 26 | 27 | ### 结构化&时间序列 28 | 29 | - [AIWIN 心电图智能诊断竞赛](https://aistudio.baidu.com/aistudio/projectdetail/2653802) 30 | 31 | ## Paddle资料 32 | 33 | 问:AI Studio有什么学习资料? 
34 | - 项目环境介绍:https://ai.baidu.com/ai-doc/AISTUDIO/Dk3e2vxg9 35 | - Notebook环境:https://ai.baidu.com/ai-doc/AISTUDIO/sk3e2z8sb 36 | -------------------------------------------------------------------------------- /tutorial/rank-ensemble.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 66, 6 | "metadata": { 7 | "execution": { 8 | "iopub.execute_input": "2022-07-27T02:17:11.148005Z", 9 | "iopub.status.busy": "2022-07-27T02:17:11.147455Z", 10 | "iopub.status.idle": "2022-07-27T02:17:11.153326Z", 11 | "shell.execute_reply": "2022-07-27T02:17:11.152655Z", 12 | "shell.execute_reply.started": "2022-07-27T02:17:11.147953Z" 13 | }, 14 | "tags": [] 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "from sklearn.metrics import roc_auc_score\n", 19 | "import numpy as np\n", 20 | "from scipy.stats import rankdata\n", 21 | "from sklearn.linear_model import LinearRegression" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 67, 27 | "metadata": { 28 | "execution": { 29 | "iopub.execute_input": "2022-07-27T02:17:11.327557Z", 30 | "iopub.status.busy": "2022-07-27T02:17:11.327015Z", 31 | "iopub.status.idle": "2022-07-27T02:17:11.335082Z", 32 | "shell.execute_reply": "2022-07-27T02:17:11.334607Z", 33 | "shell.execute_reply.started": "2022-07-27T02:17:11.327495Z" 34 | }, 35 | "tags": [] 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "t_true = np.random.randint(0, 2, size=1000)\n", 40 | "\n", 41 | "pred1 = t_true + np.random.randn(1000)\n", 42 | "pred1 = np.clip(pred1, 0, 1)\n", 43 | "\n", 44 | "pred2 = t_true + np.random.randn(1000) - 0.2\n", 45 | "pred2 = np.clip(pred2, 0, 1)\n", 46 | "\n", 47 | "pred3 = t_true + np.random.randn(1000) - 0.1\n", 48 | "pred3 = np.clip(pred3, 0, 1)" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 68, 54 | "metadata": { 55 | "execution": { 56 | "iopub.execute_input": "2022-07-27T02:17:11.898040Z", 57 | "iopub.status.busy": "2022-07-27T02:17:11.897626Z", 58 | "iopub.status.idle": "2022-07-27T02:17:11.908813Z", 59 | "shell.execute_reply": "2022-07-27T02:17:11.908183Z", 60 | "shell.execute_reply.started": "2022-07-27T02:17:11.897992Z" 61 | }, 62 | "tags": [] 63 | }, 64 | "outputs": [ 65 | { 66 | "data": { 67 | "text/plain": [ 68 | "(0.7140375941955907, 0.7406884932307235, 0.7664949835720489)" 69 | ] 70 | }, 71 | "execution_count": 68, 72 | "metadata": {}, 73 | "output_type": "execute_result" 74 | } 75 | ], 76 | "source": [ 77 | "roc_auc_score(t_true, pred1), roc_auc_score(t_true, pred2), roc_auc_score(t_true, pred3)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 69, 83 | "metadata": { 84 | "execution": { 85 | "iopub.execute_input": "2022-07-27T02:17:11.909933Z", 86 | "iopub.status.busy": "2022-07-27T02:17:11.909738Z", 87 | "iopub.status.idle": "2022-07-27T02:17:11.972134Z", 88 | "shell.execute_reply": "2022-07-27T02:17:11.971206Z", 89 | "shell.execute_reply.started": "2022-07-27T02:17:11.909909Z" 90 | }, 91 | "tags": [] 92 | }, 93 | "outputs": [ 94 | { 95 | "data": { 96 | "text/plain": [ 97 | "0.8650706942160006" 98 | ] 99 | }, 100 | "execution_count": 69, 101 | "metadata": {}, 102 | "output_type": "execute_result" 103 | } 104 | ], 105 | "source": [ 106 | "roc_auc_score(t_true, (pred1 + pred2+ pred3) / 3)" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 70, 112 | "metadata": { 113 | "execution": { 114 | "iopub.execute_input": "2022-07-27T02:17:11.977691Z", 115 | 
"iopub.status.busy": "2022-07-27T02:17:11.977440Z", 116 | "iopub.status.idle": "2022-07-27T02:17:12.034941Z", 117 | "shell.execute_reply": "2022-07-27T02:17:12.034030Z", 118 | "shell.execute_reply.started": "2022-07-27T02:17:11.977673Z" 119 | }, 120 | "tags": [] 121 | }, 122 | "outputs": [ 123 | { 124 | "data": { 125 | "text/plain": [ 126 | "0.8575510547104799" 127 | ] 128 | }, 129 | "execution_count": 70, 130 | "metadata": {}, 131 | "output_type": "execute_result" 132 | } 133 | ], 134 | "source": [ 135 | "roc_auc_score(t_true, (pred1 + pred2**0.2+ pred3**0.1))" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 71, 141 | "metadata": { 142 | "execution": { 143 | "iopub.execute_input": "2022-07-27T02:17:12.896308Z", 144 | "iopub.status.busy": "2022-07-27T02:17:12.895739Z", 145 | "iopub.status.idle": "2022-07-27T02:17:12.905122Z", 146 | "shell.execute_reply": "2022-07-27T02:17:12.904599Z", 147 | "shell.execute_reply.started": "2022-07-27T02:17:12.896255Z" 148 | }, 149 | "tags": [] 150 | }, 151 | "outputs": [ 152 | { 153 | "data": { 154 | "text/plain": [ 155 | "0.8664093421215869" 156 | ] 157 | }, 158 | "execution_count": 71, 159 | "metadata": {}, 160 | "output_type": "execute_result" 161 | } 162 | ], 163 | "source": [ 164 | "pred = rankdata(pred1) + rankdata(pred2)+ rankdata(pred3)\n", 165 | "pred /= 1000\n", 166 | "pred /= 3\n", 167 | "roc_auc_score(t_true, pred)" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 65, 173 | "metadata": { 174 | "execution": { 175 | "iopub.execute_input": "2022-07-27T02:14:06.895575Z", 176 | "iopub.status.busy": "2022-07-27T02:14:06.895097Z", 177 | "iopub.status.idle": "2022-07-27T02:14:06.901843Z", 178 | "shell.execute_reply": "2022-07-27T02:14:06.901326Z", 179 | "shell.execute_reply.started": "2022-07-27T02:14:06.895537Z" 180 | } 181 | }, 182 | "outputs": [ 183 | { 184 | "data": { 185 | "text/plain": [ 186 | "array([1., 3., 2.])" 187 | ] 188 | }, 189 | "execution_count": 65, 190 | "metadata": {}, 191 | "output_type": "execute_result" 192 | } 193 | ], 194 | "source": [ 195 | "rankdata([1,3,2])" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [] 204 | } 205 | ], 206 | "metadata": { 207 | "kernelspec": { 208 | "display_name": "Python 3 (ipykernel)", 209 | "language": "python", 210 | "name": "python3.10" 211 | }, 212 | "language_info": { 213 | "codemirror_mode": { 214 | "name": "ipython", 215 | "version": 3 216 | }, 217 | "file_extension": ".py", 218 | "mimetype": "text/x-python", 219 | "name": "python", 220 | "nbconvert_exporter": "python", 221 | "pygments_lexer": "ipython3", 222 | "version": "3.9.10" 223 | } 224 | }, 225 | "nbformat": 4, 226 | "nbformat_minor": 4 227 | } 228 | -------------------------------------------------------------------------------- /tutorial/sklearn/README.md: -------------------------------------------------------------------------------- 1 | https://scikit-learn.org/stable/ 2 | 3 | https://intel.github.io/scikit-learn-intelex/ 4 | -------------------------------------------------------------------------------- /tutorial/tree/README.md: -------------------------------------------------------------------------------- 1 | ## XGBoost 2 | 3 | - XGBoost,https://xgboost.readthedocs.io/ 4 | - 参数介绍:https://xgboost.readthedocs.io/en/latest/parameter.html 5 | 6 | ## LightGBM 7 | 8 | - LightGBM,https://lightgbm.readthedocs.io/en/latest/ 9 | - 
参数介绍:https://lightgbm.readthedocs.io/en/latest/Parameters.html 10 | 11 | ## CatBoost 12 | 13 | - CatBoost,https://yandex.com/dev/catboost/ 14 | - 参数介绍:https://catboost.ai/docs/ 15 | --------------------------------------------------------------------------------
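
As a quick complement to the parameter references above, here is a minimal LightGBM training sketch. It is only an illustration under assumed inputs: the file name `train.csv`, the `label` column, and the parameter values are placeholders, not files or settings from this repository.

```python
# Minimal LightGBM sketch; the file name, column names and parameter values
# below are illustrative placeholders, not taken from this repository.
import lightgbm as lgb
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv('train.csv')                 # hypothetical training table with a binary 'label' column
X, y = df.drop('label', axis=1), df['label']
X_tr, X_va, y_tr, y_va = train_test_split(X, y, test_size=0.2, random_state=42)

model = lgb.LGBMClassifier(
    n_estimators=1000,      # train many trees and let early stopping pick the best iteration
    learning_rate=0.05,
    num_leaves=63,          # main capacity knob, see the LightGBM parameter docs above
    subsample=0.8,          # row sampling per bagging round
    subsample_freq=1,       # enable bagging (subsample is ignored when this is 0)
    colsample_bytree=0.8,   # feature sampling per tree
)
model.fit(
    X_tr, y_tr,
    eval_set=[(X_va, y_va)],
    callbacks=[lgb.early_stopping(100), lgb.log_evaluation(100)],
)
print('best iteration:', model.best_iteration_)
```

The same pattern (a large tree budget plus early stopping on a held-out validation set) carries over to XGBoost and CatBoost, with their own parameter names documented at the links above.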