"""Initial schema Revision ID: 001 Revises: Create Date: 2024-01-01 00:00:00.000000 """ from typing import Sequence, Union from alembic import op import sqlalchemy as sa from sqlalchemy.dialects import postgresql # revision identifiers, used by Alembic. revision: str = '001' down_revision: Union[str, None] = None branch_labels: Union[str, Sequence[str], None] = None depends_on: Union[str, Sequence[str], None] = None def upgrade() -> None: """ 创建初始数据库架构 包含以下表: - documents: 文档表 - document_chunks: 文档块表 - knowledge_bases: 知识库表 - prompt_dimensions: 提示词维度表 - parsed_documents: 解析文档表 - search_history: 搜索历史表 - document_knowledge_base: 文档和知识库关联表 """ # 创建文档表 op.create_table( 'documents', sa.Column('id', sa.String(length=255), nullable=False, comment='文档唯一标识'), sa.Column('content', sa.Text(), nullable=False, comment='文档内容'), sa.Column('title', sa.String(length=500), nullable=True, comment='文档标题'), sa.Column('metadata', sa.JSON(), nullable=False, comment='文档元数据'), sa.Column('has_embedding', sa.Boolean(), nullable=True, comment='是否已生成向量'), sa.Column('embedding_model', sa.String(length=100), nullable=True, comment='使用的向量模型'), sa.Column('vector_dimension', sa.Integer(), nullable=True, comment='向量维度'), sa.Column('created_at', sa.DateTime(), server_default=sa.text('now()'), nullable=False, comment='创建时间'), sa.Column('updated_at', sa.DateTime(), server_default=sa.text('now()'), nullable=False, comment='更新时间'), sa.PrimaryKeyConstraint('id') ) # 创建文档表索引 op.create_index('ix_documents_created_at', 'documents', ['created_at']) op.create_index('ix_documents_has_embedding', 'documents', ['has_embedding']) # 创建文档块表 op.create_table( 'document_chunks', sa.Column('id', sa.String(length=255), nullable=False, comment='文档块唯一标识'), sa.Column('document_id', sa.String(length=255), nullable=False, comment='所属文档ID'), sa.Column('content', sa.Text(), nullable=False, comment='文档块内容'), sa.Column('position', sa.Integer(), nullable=False, comment='在文档中的位置序号'), sa.Column('page_number', sa.Integer(), nullable=True, comment='页码(如果适用)'), sa.Column('metadata', sa.JSON(), nullable=False, comment='文档块元数据'), sa.Column('has_embedding', sa.Boolean(), nullable=True, comment='是否已生成向量'), sa.Column('created_at', sa.DateTime(), server_default=sa.text('now()'), nullable=False, comment='创建时间'), sa.ForeignKeyConstraint(['document_id'], ['documents.id'], ondelete='CASCADE'), sa.PrimaryKeyConstraint('id') ) # 创建文档块表索引 op.create_index('ix_document_chunks_document_id', 'document_chunks', ['document_id']) op.create_index('ix_document_chunks_position', 'document_chunks', ['position']) # 创建知识库表 op.create_table( 'knowledge_bases', sa.Column('id', sa.String(length=255), nullable=False, comment='知识库唯一标识'), sa.Column('name', sa.String(length=255), nullable=False, comment='知识库名称'), sa.Column('description', sa.Text(), nullable=True, comment='知识库描述'), sa.Column('config', sa.JSON(), nullable=False, comment='知识库配置'), sa.Column('tags', sa.JSON(), nullable=False, comment='知识库标签'), sa.Column('document_count', sa.Integer(), nullable=True, comment='文档数量'), sa.Column('created_at', sa.DateTime(), server_default=sa.text('now()'), nullable=False, comment='创建时间'), sa.Column('updated_at', sa.DateTime(), server_default=sa.text('now()'), nullable=False, comment='更新时间'), sa.PrimaryKeyConstraint('id') ) # 创建知识库表索引 op.create_index('ix_knowledge_bases_name', 'knowledge_bases', ['name']) op.create_index('ix_knowledge_bases_created_at', 'knowledge_bases', ['created_at']) # 创建文档和知识库关联表 op.create_table( 'document_knowledge_base', sa.Column('document_id', sa.String(length=255), nullable=False), sa.Column('knowledge_base_id', sa.String(length=255), nullable=False), sa.Column('created_at', sa.DateTime(), server_default=sa.text('now()'), nullable=False), sa.ForeignKeyConstraint(['document_id'], ['documents.id'], ), sa.ForeignKeyConstraint(['knowledge_base_id'], ['knowledge_bases.id'], ), sa.PrimaryKeyConstraint('document_id', 'knowledge_base_id') ) # 创建关联表索引 op.create_index('ix_doc_kb_document_id', 'document_knowledge_base', ['document_id']) op.create_index('ix_doc_kb_knowledge_base_id', 'document_knowledge_base', ['knowledge_base_id']) # 创建提示词维度表 op.create_table( 'prompt_dimensions', sa.Column('id', sa.String(length=255), nullable=False, comment='提示词维度唯一标识'), sa.Column('knowledge_base_id', sa.String(length=255), nullable=False, comment='所属知识库ID'), sa.Column('name', sa.String(length=255), nullable=False, comment='维度名称'), sa.Column('description', sa.Text(), nullable=True, comment='维度描述'), sa.Column('template', sa.Text(), nullable=False, comment='提示词模板'), sa.Column('variables', sa.JSON(), nullable=False, comment='模板变量列表'), sa.Column('config', sa.JSON(), nullable=False, comment='维度配置'), sa.Column('created_at', sa.DateTime(), server_default=sa.text('now()'), nullable=False, comment='创建时间'), sa.Column('updated_at', sa.DateTime(), server_default=sa.text('now()'), nullable=False, comment='更新时间'), sa.ForeignKeyConstraint(['knowledge_base_id'], ['knowledge_bases.id'], ondelete='CASCADE'), sa.PrimaryKeyConstraint('id') ) # 创建提示词维度表索引 op.create_index('ix_prompt_dimensions_kb_id', 'prompt_dimensions', ['knowledge_base_id']) op.create_index('ix_prompt_dimensions_name', 'prompt_dimensions', ['name']) # 创建解析文档表 op.create_table( 'parsed_documents', sa.Column('id', sa.String(length=255), nullable=False, comment='解析文档唯一标识'), sa.Column('original_filename', sa.String(length=500), nullable=False, comment='原始文件名'), sa.Column('document_type', sa.String(length=50), nullable=False, comment='文档类型(pdf/image/text/qa_pair)'), sa.Column('file_path', sa.String(length=1000), nullable=True, comment='文件存储路径'), sa.Column('file_size', sa.Integer(), nullable=True, comment='文件大小(字节)'), sa.Column('status', sa.String(length=50), nullable=False, comment='解析状态(pending/processing/completed/failed)'), sa.Column('error_message', sa.Text(), nullable=True, comment='错误信息(如果解析失败)'), sa.Column('chunk_count', sa.Integer(), nullable=True, comment='分块数量'), sa.Column('metadata', sa.JSON(), nullable=False, comment='文档元数据'), sa.Column('created_at', sa.DateTime(), server_default=sa.text('now()'), nullable=False, comment='创建时间'), sa.Column('updated_at', sa.DateTime(), server_default=sa.text('now()'), nullable=False, comment='更新时间'), sa.Column('completed_at', sa.DateTime(), nullable=True, comment='解析完成时间'), sa.PrimaryKeyConstraint('id') ) # 创建解析文档表索引 op.create_index('ix_parsed_documents_status', 'parsed_documents', ['status']) op.create_index('ix_parsed_documents_document_type', 'parsed_documents', ['document_type']) op.create_index('ix_parsed_documents_created_at', 'parsed_documents', ['created_at']) # 创建搜索历史表 op.create_table( 'search_history', sa.Column('id', sa.String(length=255), nullable=False, comment='搜索历史唯一标识'), sa.Column('query_text', sa.Text(), nullable=False, comment='搜索查询文本'), sa.Column('search_type', sa.String(length=50), nullable=False, comment='搜索类型(vector/text/hybrid)'), sa.Column('top_k', sa.Integer(), nullable=False, comment='返回结果数量'), sa.Column('filters', sa.JSON(), nullable=True, comment='过滤条件'), sa.Column('result_count', sa.Integer(), nullable=True, comment='返回结果数量'), sa.Column('execution_time_ms', sa.Integer(), nullable=True, comment='执行时间(毫秒)'), sa.Column('user_id', sa.String(length=255), nullable=True, comment='用户ID'), sa.Column('session_id', sa.String(length=255), nullable=True, comment='会话ID'), sa.Column('created_at', sa.DateTime(), server_default=sa.text('now()'), nullable=False, comment='创建时间'), sa.PrimaryKeyConstraint('id') ) # 创建搜索历史表索引 op.create_index('ix_search_history_created_at', 'search_history', ['created_at']) op.create_index('ix_search_history_search_type', 'search_history', ['search_type']) op.create_index('ix_search_history_user_id', 'search_history', ['user_id']) def downgrade() -> None: """ 删除所有表 """ # 按照依赖关系的逆序删除表 op.drop_table('search_history') op.drop_table('parsed_documents') op.drop_table('prompt_dimensions') op.drop_table('document_knowledge_base') op.drop_table('knowledge_bases') op.drop_table('document_chunks') op.drop_table('documents')