| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182 |
- """Initial schema
- Revision ID: 001
- Revises:
- Create Date: 2024-01-01 00:00:00.000000
- """
- from typing import Sequence, Union
- from alembic import op
- import sqlalchemy as sa
- from sqlalchemy.dialects import postgresql
- # revision identifiers, used by Alembic.
- revision: str = '001'
- down_revision: Union[str, None] = None
- branch_labels: Union[str, Sequence[str], None] = None
- depends_on: Union[str, Sequence[str], None] = None
- def upgrade() -> None:
- """
- 创建初始数据库架构
-
- 包含以下表:
- - documents: 文档表
- - document_chunks: 文档块表
- - knowledge_bases: 知识库表
- - prompt_dimensions: 提示词维度表
- - parsed_documents: 解析文档表
- - search_history: 搜索历史表
- - document_knowledge_base: 文档和知识库关联表
- """
-
- # 创建文档表
- op.create_table(
- 'documents',
- sa.Column('id', sa.String(length=255), nullable=False, comment='文档唯一标识'),
- sa.Column('content', sa.Text(), nullable=False, comment='文档内容'),
- sa.Column('title', sa.String(length=500), nullable=True, comment='文档标题'),
- sa.Column('metadata', sa.JSON(), nullable=False, comment='文档元数据'),
- sa.Column('has_embedding', sa.Boolean(), nullable=True, comment='是否已生成向量'),
- sa.Column('embedding_model', sa.String(length=100), nullable=True, comment='使用的向量模型'),
- sa.Column('vector_dimension', sa.Integer(), nullable=True, comment='向量维度'),
- sa.Column('created_at', sa.DateTime(), server_default=sa.text('now()'), nullable=False, comment='创建时间'),
- sa.Column('updated_at', sa.DateTime(), server_default=sa.text('now()'), nullable=False, comment='更新时间'),
- sa.PrimaryKeyConstraint('id')
- )
-
- # 创建文档表索引
- op.create_index('ix_documents_created_at', 'documents', ['created_at'])
- op.create_index('ix_documents_has_embedding', 'documents', ['has_embedding'])
-
- # 创建文档块表
- op.create_table(
- 'document_chunks',
- sa.Column('id', sa.String(length=255), nullable=False, comment='文档块唯一标识'),
- sa.Column('document_id', sa.String(length=255), nullable=False, comment='所属文档ID'),
- sa.Column('content', sa.Text(), nullable=False, comment='文档块内容'),
- sa.Column('position', sa.Integer(), nullable=False, comment='在文档中的位置序号'),
- sa.Column('page_number', sa.Integer(), nullable=True, comment='页码(如果适用)'),
- sa.Column('metadata', sa.JSON(), nullable=False, comment='文档块元数据'),
- sa.Column('has_embedding', sa.Boolean(), nullable=True, comment='是否已生成向量'),
- sa.Column('created_at', sa.DateTime(), server_default=sa.text('now()'), nullable=False, comment='创建时间'),
- sa.ForeignKeyConstraint(['document_id'], ['documents.id'], ondelete='CASCADE'),
- sa.PrimaryKeyConstraint('id')
- )
-
- # 创建文档块表索引
- op.create_index('ix_document_chunks_document_id', 'document_chunks', ['document_id'])
- op.create_index('ix_document_chunks_position', 'document_chunks', ['position'])
-
- # 创建知识库表
- op.create_table(
- 'knowledge_bases',
- sa.Column('id', sa.String(length=255), nullable=False, comment='知识库唯一标识'),
- sa.Column('name', sa.String(length=255), nullable=False, comment='知识库名称'),
- sa.Column('description', sa.Text(), nullable=True, comment='知识库描述'),
- sa.Column('config', sa.JSON(), nullable=False, comment='知识库配置'),
- sa.Column('tags', sa.JSON(), nullable=False, comment='知识库标签'),
- sa.Column('document_count', sa.Integer(), nullable=True, comment='文档数量'),
- sa.Column('created_at', sa.DateTime(), server_default=sa.text('now()'), nullable=False, comment='创建时间'),
- sa.Column('updated_at', sa.DateTime(), server_default=sa.text('now()'), nullable=False, comment='更新时间'),
- sa.PrimaryKeyConstraint('id')
- )
-
- # 创建知识库表索引
- op.create_index('ix_knowledge_bases_name', 'knowledge_bases', ['name'])
- op.create_index('ix_knowledge_bases_created_at', 'knowledge_bases', ['created_at'])
-
- # 创建文档和知识库关联表
- op.create_table(
- 'document_knowledge_base',
- sa.Column('document_id', sa.String(length=255), nullable=False),
- sa.Column('knowledge_base_id', sa.String(length=255), nullable=False),
- sa.Column('created_at', sa.DateTime(), server_default=sa.text('now()'), nullable=False),
- sa.ForeignKeyConstraint(['document_id'], ['documents.id'], ),
- sa.ForeignKeyConstraint(['knowledge_base_id'], ['knowledge_bases.id'], ),
- sa.PrimaryKeyConstraint('document_id', 'knowledge_base_id')
- )
-
- # 创建关联表索引
- op.create_index('ix_doc_kb_document_id', 'document_knowledge_base', ['document_id'])
- op.create_index('ix_doc_kb_knowledge_base_id', 'document_knowledge_base', ['knowledge_base_id'])
-
- # 创建提示词维度表
- op.create_table(
- 'prompt_dimensions',
- sa.Column('id', sa.String(length=255), nullable=False, comment='提示词维度唯一标识'),
- sa.Column('knowledge_base_id', sa.String(length=255), nullable=False, comment='所属知识库ID'),
- sa.Column('name', sa.String(length=255), nullable=False, comment='维度名称'),
- sa.Column('description', sa.Text(), nullable=True, comment='维度描述'),
- sa.Column('template', sa.Text(), nullable=False, comment='提示词模板'),
- sa.Column('variables', sa.JSON(), nullable=False, comment='模板变量列表'),
- sa.Column('config', sa.JSON(), nullable=False, comment='维度配置'),
- sa.Column('created_at', sa.DateTime(), server_default=sa.text('now()'), nullable=False, comment='创建时间'),
- sa.Column('updated_at', sa.DateTime(), server_default=sa.text('now()'), nullable=False, comment='更新时间'),
- sa.ForeignKeyConstraint(['knowledge_base_id'], ['knowledge_bases.id'], ondelete='CASCADE'),
- sa.PrimaryKeyConstraint('id')
- )
-
- # 创建提示词维度表索引
- op.create_index('ix_prompt_dimensions_kb_id', 'prompt_dimensions', ['knowledge_base_id'])
- op.create_index('ix_prompt_dimensions_name', 'prompt_dimensions', ['name'])
-
- # 创建解析文档表
- op.create_table(
- 'parsed_documents',
- sa.Column('id', sa.String(length=255), nullable=False, comment='解析文档唯一标识'),
- sa.Column('original_filename', sa.String(length=500), nullable=False, comment='原始文件名'),
- sa.Column('document_type', sa.String(length=50), nullable=False, comment='文档类型(pdf/image/text/qa_pair)'),
- sa.Column('file_path', sa.String(length=1000), nullable=True, comment='文件存储路径'),
- sa.Column('file_size', sa.Integer(), nullable=True, comment='文件大小(字节)'),
- sa.Column('status', sa.String(length=50), nullable=False, comment='解析状态(pending/processing/completed/failed)'),
- sa.Column('error_message', sa.Text(), nullable=True, comment='错误信息(如果解析失败)'),
- sa.Column('chunk_count', sa.Integer(), nullable=True, comment='分块数量'),
- sa.Column('metadata', sa.JSON(), nullable=False, comment='文档元数据'),
- sa.Column('created_at', sa.DateTime(), server_default=sa.text('now()'), nullable=False, comment='创建时间'),
- sa.Column('updated_at', sa.DateTime(), server_default=sa.text('now()'), nullable=False, comment='更新时间'),
- sa.Column('completed_at', sa.DateTime(), nullable=True, comment='解析完成时间'),
- sa.PrimaryKeyConstraint('id')
- )
-
- # 创建解析文档表索引
- op.create_index('ix_parsed_documents_status', 'parsed_documents', ['status'])
- op.create_index('ix_parsed_documents_document_type', 'parsed_documents', ['document_type'])
- op.create_index('ix_parsed_documents_created_at', 'parsed_documents', ['created_at'])
-
- # 创建搜索历史表
- op.create_table(
- 'search_history',
- sa.Column('id', sa.String(length=255), nullable=False, comment='搜索历史唯一标识'),
- sa.Column('query_text', sa.Text(), nullable=False, comment='搜索查询文本'),
- sa.Column('search_type', sa.String(length=50), nullable=False, comment='搜索类型(vector/text/hybrid)'),
- sa.Column('top_k', sa.Integer(), nullable=False, comment='返回结果数量'),
- sa.Column('filters', sa.JSON(), nullable=True, comment='过滤条件'),
- sa.Column('result_count', sa.Integer(), nullable=True, comment='返回结果数量'),
- sa.Column('execution_time_ms', sa.Integer(), nullable=True, comment='执行时间(毫秒)'),
- sa.Column('user_id', sa.String(length=255), nullable=True, comment='用户ID'),
- sa.Column('session_id', sa.String(length=255), nullable=True, comment='会话ID'),
- sa.Column('created_at', sa.DateTime(), server_default=sa.text('now()'), nullable=False, comment='创建时间'),
- sa.PrimaryKeyConstraint('id')
- )
-
- # 创建搜索历史表索引
- op.create_index('ix_search_history_created_at', 'search_history', ['created_at'])
- op.create_index('ix_search_history_search_type', 'search_history', ['search_type'])
- op.create_index('ix_search_history_user_id', 'search_history', ['user_id'])
- def downgrade() -> None:
- """
- 删除所有表
- """
- # 按照依赖关系的逆序删除表
- op.drop_table('search_history')
- op.drop_table('parsed_documents')
- op.drop_table('prompt_dimensions')
- op.drop_table('document_knowledge_base')
- op.drop_table('knowledge_bases')
- op.drop_table('document_chunks')
- op.drop_table('documents')
|