61 | {/* Back Button */}
62 |
63 |
64 |
68 |
69 |
70 |
71 | {/* Header */}
72 |
73 |
74 |
75 |
{t('smartDatasetCreator.title')}
76 |
77 |
78 | {t('smartDatasetCreator.description')}
79 |
80 |
81 |
82 | {/* Error Message */}
83 |
84 |
85 | {/* Step Indicator */}
86 |
87 |
88 | {/* Step Content */}
89 |
90 | {renderStepContent()}
91 |
92 |
93 | {/* Navigation Buttons */}
94 |
95 |
96 | );
97 | };
--------------------------------------------------------------------------------
/backend/app/models/project_data_source.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 | from sqlalchemy import Column, String, Text, DateTime, Enum, JSON, ForeignKey, Boolean, Integer
3 | from sqlalchemy.orm import relationship
4 | import enum
5 | import uuid
6 |
7 | from app.db import db
8 |
9 |
class DataSourceType(enum.Enum):
    """Enumeration of data source types."""

    UPLOAD = "upload"
    DATABASE = "database"
    API = "api"
    STORAGE = "storage"
    URL = "url"
17 |
18 |
class DataSourceStatus(enum.Enum):
    """Enumeration of data source statuses."""

    CONNECTED = "connected"
    DISCONNECTED = "disconnected"
    ERROR = "error"
    SYNCING = "syncing"
25 |
26 |
class ProjectDataSource(db.Model):
    """Project data source configuration model."""

    __tablename__ = 'project_data_sources'

    # --- Basic information ---
    id = Column(String(36), primary_key=True)
    project_id = Column(String(36), ForeignKey('data_governance_projects.id'), nullable=False)
    name = Column(String(255), nullable=False)
    description = Column(Text)

    # --- Data source configuration ---
    source_type = Column(Enum(DataSourceType), nullable=False)
    status = Column(Enum(DataSourceStatus), default=DataSourceStatus.DISCONNECTED)
    config = Column(JSON)  # source-specific configuration
    # Connection string; intended to be stored encrypted.
    # NOTE(review): no encryption happens in this model - confirm where it is applied.
    connection_string = Column(Text)

    # --- Synchronisation ---
    last_sync_at = Column(DateTime)
    sync_frequency = Column(String(50))  # sync frequency (hourly, daily, weekly, manual)
    auto_sync_enabled = Column(Boolean, default=False)

    # --- Statistics ---
    file_count = Column(Integer, default=0)
    # Total size in bytes.
    # NOTE(review): Integer is 32-bit on several backends, which would cap this
    # near 2 GiB - confirm whether BigInteger (plus a migration) is needed.
    total_size = Column(Integer, default=0)

    # --- Timestamps ---
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)

    # --- Relationships ---
    project = relationship("DataGovernanceProject", back_populates="data_sources")
    raw_data_items = relationship(
        "RawData",
        foreign_keys="RawData.data_source_id",
        cascade="all, delete-orphan",
        overlaps="data_source",
    )

    def __init__(self, **kwargs):
        """Build the instance and assign a UUID4 string id when none was given."""
        super().__init__(**kwargs)
        if self.id:
            return
        # Generated eagerly so the id is available right after construction.
        self.id = str(uuid.uuid4())

    def to_dict(self):
        """Return a plain-dict representation of this data source.

        Enum columns are emitted as their ``.value`` and datetime columns as
        ``isoformat()`` strings; falsy values of either become ``None``.
        ``connection_string`` is not part of the output.
        """
        def _enum_value(member):
            return member.value if member else None

        def _iso(moment):
            return moment.isoformat() if moment else None

        payload = {
            'id': self.id,
            'project_id': self.project_id,
            'name': self.name,
            'description': self.description,
            'source_type': _enum_value(self.source_type),
            'status': _enum_value(self.status),
            'config': self.config,
            'last_sync_at': _iso(self.last_sync_at),
            'sync_frequency': self.sync_frequency,
            'auto_sync_enabled': self.auto_sync_enabled,
            'file_count': self.file_count,
            'total_size': self.total_size,
            'created_at': _iso(self.created_at),
            'updated_at': _iso(self.updated_at),
        }
        return payload
--------------------------------------------------------------------------------
/frontend/src/i18n/locales/zh/datasets.json:
--------------------------------------------------------------------------------
1 | {
2 | "title": "数据集",
3 | "totalDatasets": "{{count}} 个数据集",
4 | "smartCreate": "智能创建",
5 | "createDataset": "创建数据集",
6 | "searchPlaceholder": "搜索数据集名称或描述...",
7 | "taskType": "任务类型",
8 | "allTaskTypes": "全部",
9 | "nlp": "自然语言处理",
10 | "qa": "问答系统",
11 | "textClassification": "文本分类",
12 | "loading": "加载中...",
13 | "error": "错误",
14 | "retry": "重试",
15 | "noDatasets": "没有找到数据集",
16 | "recommended": "推荐",
17 | "detail": {
18 | "backToList": "返回列表",
19 | "dataPreview": "数据预览",
20 | "versionManagement": "版本管理",
21 | "detailInfo": "详细信息",
22 | "settings": "设置",
23 | "datasetId": "数据集ID",
24 | "name": "名称",
25 | "status": "状态",
26 | "published": "已发布",
27 | "createdAt": "创建时间"
28 | },
29 | "create": {
30 | "title": "创建数据集",
31 | "description": "数据集描述",
32 | "backToList": "返回数据集列表",
33 | "subtitle": "选择数据集的创建方式,并录入相关信息。",
34 | "creationFailed": "创建失败",
35 | "selectMethod": "选择创建方式",
36 | "createEmpty": "创建空数据集",
37 | "createEmptyDesc": "创建一个新的空数据集目录,后续可以上传和管理数据文件。",
38 | "importHuggingface": "从 Hugging Face 导入",
39 | "importHuggingfaceDesc": "从 Hugging Face Hub 导入现有的数据集,支持 URL 或数据集路径。",
40 | "importModelscope": "从 ModelScope 导入",
41 | "importModelscopeDesc": "从魔搭社区导入面向中文优化的数据集,专注于中文AI应用场景。",
42 | "importInfo": "导入信息",
43 | "datasetUrl": "数据集URL或路径",
44 | "required": "必填",
45 | "urlPlaceholder": "例如: squad 或 https://huggingface.co/datasets/squad",
46 | "modelscopeUrlPlaceholder": "例如: open-r1/Mixture-of-Thoughts 或完整URL",
47 | "urlHint": "支持数据集名称 (例如 open-r1/Mixture-of-Thoughts) 或完整URL",
48 | "modelscopeUrlHint": "支持路径格式 (例如 open-r1/Mixture-of-Thoughts) 或完整URL,系统将自动获取数据集信息",
49 | "visitHuggingface": "访问 Hugging Face",
50 | "visitModelscope": "访问魔搭社区",
51 | "basicInfo": "基本信息",
52 | "datasetName": "数据集名称",
53 | "datasetNamePlaceholder": "输入数据集名称...",
54 | "owner": "所有者",
55 | "ownerPlaceholder": "输入所有者名称或组织名称...",
56 | "ownerHint": "所有者和数据集名称的组合必须唯一",
57 | "license": "许可证",
58 | "taskType": "任务类型",
59 | "tags": "标签",
60 | "tagsPlaceholder": "使用逗号分隔标签, 例如 nlp, japanese, qa...",
61 | "descriptionPlaceholder": "描述数据集的内容、用途和特点...",
62 | "createExplanation": "创建说明",
63 | "importExplanation": "导入说明",
64 | "emptyDatasetSteps": [
65 | "• 系统将创建一个新的空数据集目录",
66 | "• 您可以稍后从文件管理界面上传数据文件",
67 | "• 支持多种格式: JSON, CSV, Parquet, TXT等"
68 | ],
69 | "huggingfaceSteps": [
70 | "• 系统将从Hugging Face Hub下载指定的数据集",
71 | "• 数据集信息(名称、描述等)将自动从源获取",
72 | "• 导入过程可能需要几分钟,具体取决于数据集的大小"
73 | ],
74 | "modelscopeSteps": [
75 | "• 系统将从魔搭社区下载指定的数据集",
76 | "• 数据集信息(名称、描述等)将自动从源获取",
77 | "• 导入过程可能需要几分钟,具体取决于数据集的大小"
78 | ],
79 | "reselect": "重新选择",
80 | "creating": "创建中...",
81 | "importing": "导入中...",
82 | "success": {
83 | "title": "数据集创建成功!",
84 | "redirecting": "正在重定向到数据集列表...",
85 | "importStarted": "导入任务已开始!",
86 | "importMessage": "正在从 {{source}} 导入数据集。您可以在任务管理页面查看进度。",
87 | "viewTaskProgress": "查看任务进度",
88 | "backToDatasetList": "返回数据集列表"
89 | }
90 | }
91 | }
--------------------------------------------------------------------------------
/backend/app/api/v1/schemas/library_schemas.py:
--------------------------------------------------------------------------------
1 | from marshmallow import Schema, fields, validate, validates_schema, ValidationError
2 | import re
3 |
def validate_library_name(name):
    """Validate a library name and return it with surrounding whitespace removed.

    Raises:
        ValidationError: if the name is empty or whitespace-only, or if the
            stripped name is longer than 255 characters.
    """
    trimmed = name.strip() if name else ''
    if not trimmed:
        raise ValidationError('文件库名称不能为空')
    if len(trimmed) > 255:
        raise ValidationError('文件库名称不能超过255个字符')
    return trimmed
11 |
class LibraryCreateSchema(Schema):
    """Validation schema for creating a file library."""

    # Length is checked on the raw value; validate_library_name additionally
    # rejects blank names and checks the length of the stripped name.
    name = fields.Str(
        required=True,
        validate=[validate.Length(min=1, max=255), validate_library_name],
    )
    description = fields.Str(missing=None, validate=validate.Length(max=1000))
    data_type = fields.Str(
        required=True,
        validate=validate.OneOf(['training', 'evaluation', 'mixed']),
    )
    # A callable default builds a fresh list for every load. A literal [] is a
    # single object handed back to each caller that omits "tags", so a
    # mutation by one caller would leak into later loads.
    tags = fields.List(fields.Str(), missing=list)
21 |
class LibraryUpdateSchema(Schema):
    """Validation schema for updating a file library.

    No field is required; only the fields present in the input are validated.
    """

    # Same validators as LibraryCreateSchema.name, so a whitespace-only name
    # cannot be introduced through an update after being rejected on create.
    name = fields.Str(
        validate=[validate.Length(min=1, max=255), validate_library_name],
    )
    description = fields.Str(validate=validate.Length(max=1000), allow_none=True)
    data_type = fields.Str(validate=validate.OneOf(['training', 'evaluation', 'mixed']))
    tags = fields.List(fields.Str())
28 |
class LibraryQuerySchema(Schema):
    """Validation schema for file library list/query parameters."""

    # Pagination: page starts at 1, per_page is limited to 1..100.
    page = fields.Int(missing=1, validate=validate.Range(min=1))
    per_page = fields.Int(missing=20, validate=validate.Range(min=1, max=100))

    # Optional filters.
    name = fields.Str(missing=None)
    data_type = fields.Str(
        missing=None,
        validate=validate.OneOf(['training', 'evaluation', 'mixed']),
    )
    # A callable default builds a fresh list for every load. A literal [] is a
    # single object handed back to each caller that omits "tags", so a
    # mutation by one caller would leak into later loads.
    tags = fields.List(fields.Str(), missing=list)

    # Sorting: defaults to created_at, descending.
    sort_by = fields.Str(
        missing='created_at',
        validate=validate.OneOf([
            'created_at', 'updated_at', 'name', 'file_count', 'total_size',
        ]),
    )
    sort_order = fields.Str(missing='desc', validate=validate.OneOf(['asc', 'desc']))
40 |
class LibraryFileUploadSchema(Schema):
    """Validation schema for a file upload request."""

    # Required list; the items are Raw fields, so each entry is passed through
    # without type-specific validation.
    files = fields.List(
        fields.Raw(),
        required=True,
    )
44 |
class LibraryFileQuerySchema(Schema):
    """Validation schema for file list/query parameters."""

    # Pagination: page starts at 1, per_page is limited to 1..100.
    page = fields.Int(missing=1, validate=validate.Range(min=1))
    per_page = fields.Int(missing=20, validate=validate.Range(min=1, max=100))

    # Optional filters.
    filename = fields.Str(missing=None)
    file_type = fields.Str(missing=None)
    process_status = fields.Str(
        missing=None,
        validate=validate.OneOf(['pending', 'processing', 'completed', 'failed']),
    )

    # Sorting: defaults to uploaded_at, descending.
    sort_by = fields.Str(
        missing='uploaded_at',
        validate=validate.OneOf(['uploaded_at', 'filename', 'file_size', 'process_status']),
    )
    sort_order = fields.Str(
        missing='desc',
        validate=validate.OneOf(['asc', 'desc']),
    )
58 |
class LibraryFileUpdateSchema(Schema):
    """Validation schema for updating a file's names.

    Both fields are optional; when present each must be 1..255 characters.
    """

    filename = fields.Str(
        validate=validate.Length(min=1, max=255),
    )
    original_filename = fields.Str(
        validate=validate.Length(min=1, max=255),
    )
63 |
class LibraryStatisticsSchema(Schema):
    """Schema describing the statistics payload that is returned."""

    # Counters.
    total_libraries = fields.Int()
    total_files = fields.Int()
    total_processed = fields.Int()

    # Declared as a string rather than a number.
    # NOTE(review): presumably a pre-formatted human-readable size - confirm with the producer.
    total_size = fields.Str()

    conversion_rate = fields.Float()
--------------------------------------------------------------------------------
/backend/app/models/organization.py:
--------------------------------------------------------------------------------
1 | from sqlalchemy import Column, String, Integer, Text, DateTime, Enum as SQLEnum, ForeignKey
2 | from sqlalchemy.orm import relationship
3 | from datetime import datetime
4 | import enum
5 | import uuid
6 |
7 | from app.db import db
8 |
9 |
class OrganizationStatus(enum.Enum):
    """Enumeration of organization statuses."""

    ACTIVE = "ACTIVE"
    INACTIVE = "INACTIVE"
13 |
14 |
15 | class Organization(db.Model):
16 | __tablename__ = 'organizations'
17 |
18 | id = Column(String(36), primary_key=True, default=lambda: str(uuid.uuid4()))
19 | name = Column(String(255), nullable=False)
20 | code = Column(String(100), unique=True, index=True)
21 | description = Column(Text)
22 | parent_id = Column(String(36), ForeignKey('organizations.id'), index=True)
23 | path = Column(String(1000), index=True) # 层级路径,如 /root/dept1/team1
24 | level = Column(Integer, default=1, index=True)
25 | sort_order = Column(Integer, default=0)
26 | status = Column(SQLEnum(OrganizationStatus, native_enum=True), default=OrganizationStatus.ACTIVE)
27 | created_at = Column(DateTime, default=datetime.utcnow)
28 | updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
29 | created_by = Column(String(36))
30 | updated_by = Column(String(36))
31 |
32 | # Self-referential relationship
33 | parent = relationship("Organization", remote_side="Organization.id", back_populates="children")
34 | children = relationship("Organization", back_populates="parent", cascade="all, delete-orphan")
35 |
36 | # Relationships
37 | user_organizations = relationship("UserOrganization", back_populates="organization", cascade="all, delete-orphan")
38 | user_roles = relationship("UserRole", back_populates="organization")
39 |
def to_dict(self, include_children=False):
    """Serialize this organization to a plain dict.

    Args:
        include_children: when true, add a ``'children'`` list holding each
            direct child's ``to_dict()``. Children are serialized with the
            default arguments, so their own children are not included.

    Returns:
        A dict of the scalar columns. ``status`` is emitted as the enum's
        ``.value`` and the timestamps as ``isoformat()`` strings; each of
        these is ``None`` when the attribute is unset.
    """
    result = {
        'id': self.id,
        'name': self.name,
        'code': self.code,
        'description': self.description,
        'parent_id': self.parent_id,
        'path': self.path,
        'level': self.level,
        'sort_order': self.sort_order,
        # status can be None (nullable column; the column default is only
        # applied when the row is flushed), so guard before reading .value.
        'status': self.status.value if self.status else None,
        'created_at': self.created_at.isoformat() if self.created_at else None,
        'updated_at': self.updated_at.isoformat() if self.updated_at else None,
    }

    if include_children:
        result['children'] = [child.to_dict() for child in self.children]

    return result
59 |
def get_ancestors(self):
    """Return every parent organization above this one.

    The list starts with the direct parent and follows ``.parent`` links
    until a falsy parent is reached.
    """
    def _walk_up(node):
        while node:
            yield node
            node = node.parent

    return list(_walk_up(self.parent))
68 |
def get_descendants(self):
    """Return every organization below this one.

    Each child is listed first, immediately followed by that child's own
    descendants (obtained through its ``get_descendants()``).
    """
    collected = []
    for node in self.children:
        collected += [node, *node.get_descendants()]
    return collected
76 |
77 | def __repr__(self):
78 | return f'