001_initial_schema.py 9.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182
  1. """Initial schema
  2. Revision ID: 001
  3. Revises:
  4. Create Date: 2024-01-01 00:00:00.000000
  5. """
  6. from typing import Sequence, Union
  7. from alembic import op
  8. import sqlalchemy as sa
  9. from sqlalchemy.dialects import postgresql
  10. # revision identifiers, used by Alembic.
  11. revision: str = '001'
  12. down_revision: Union[str, None] = None
  13. branch_labels: Union[str, Sequence[str], None] = None
  14. depends_on: Union[str, Sequence[str], None] = None
  15. def upgrade() -> None:
  16. """
  17. 创建初始数据库架构
  18. 包含以下表:
  19. - documents: 文档表
  20. - document_chunks: 文档块表
  21. - knowledge_bases: 知识库表
  22. - prompt_dimensions: 提示词维度表
  23. - parsed_documents: 解析文档表
  24. - search_history: 搜索历史表
  25. - document_knowledge_base: 文档和知识库关联表
  26. """
  27. # 创建文档表
  28. op.create_table(
  29. 'documents',
  30. sa.Column('id', sa.String(length=255), nullable=False, comment='文档唯一标识'),
  31. sa.Column('content', sa.Text(), nullable=False, comment='文档内容'),
  32. sa.Column('title', sa.String(length=500), nullable=True, comment='文档标题'),
  33. sa.Column('metadata', sa.JSON(), nullable=False, comment='文档元数据'),
  34. sa.Column('has_embedding', sa.Boolean(), nullable=True, comment='是否已生成向量'),
  35. sa.Column('embedding_model', sa.String(length=100), nullable=True, comment='使用的向量模型'),
  36. sa.Column('vector_dimension', sa.Integer(), nullable=True, comment='向量维度'),
  37. sa.Column('created_at', sa.DateTime(), server_default=sa.text('now()'), nullable=False, comment='创建时间'),
  38. sa.Column('updated_at', sa.DateTime(), server_default=sa.text('now()'), nullable=False, comment='更新时间'),
  39. sa.PrimaryKeyConstraint('id')
  40. )
  41. # 创建文档表索引
  42. op.create_index('ix_documents_created_at', 'documents', ['created_at'])
  43. op.create_index('ix_documents_has_embedding', 'documents', ['has_embedding'])
  44. # 创建文档块表
  45. op.create_table(
  46. 'document_chunks',
  47. sa.Column('id', sa.String(length=255), nullable=False, comment='文档块唯一标识'),
  48. sa.Column('document_id', sa.String(length=255), nullable=False, comment='所属文档ID'),
  49. sa.Column('content', sa.Text(), nullable=False, comment='文档块内容'),
  50. sa.Column('position', sa.Integer(), nullable=False, comment='在文档中的位置序号'),
  51. sa.Column('page_number', sa.Integer(), nullable=True, comment='页码(如果适用)'),
  52. sa.Column('metadata', sa.JSON(), nullable=False, comment='文档块元数据'),
  53. sa.Column('has_embedding', sa.Boolean(), nullable=True, comment='是否已生成向量'),
  54. sa.Column('created_at', sa.DateTime(), server_default=sa.text('now()'), nullable=False, comment='创建时间'),
  55. sa.ForeignKeyConstraint(['document_id'], ['documents.id'], ondelete='CASCADE'),
  56. sa.PrimaryKeyConstraint('id')
  57. )
  58. # 创建文档块表索引
  59. op.create_index('ix_document_chunks_document_id', 'document_chunks', ['document_id'])
  60. op.create_index('ix_document_chunks_position', 'document_chunks', ['position'])
  61. # 创建知识库表
  62. op.create_table(
  63. 'knowledge_bases',
  64. sa.Column('id', sa.String(length=255), nullable=False, comment='知识库唯一标识'),
  65. sa.Column('name', sa.String(length=255), nullable=False, comment='知识库名称'),
  66. sa.Column('description', sa.Text(), nullable=True, comment='知识库描述'),
  67. sa.Column('config', sa.JSON(), nullable=False, comment='知识库配置'),
  68. sa.Column('tags', sa.JSON(), nullable=False, comment='知识库标签'),
  69. sa.Column('document_count', sa.Integer(), nullable=True, comment='文档数量'),
  70. sa.Column('created_at', sa.DateTime(), server_default=sa.text('now()'), nullable=False, comment='创建时间'),
  71. sa.Column('updated_at', sa.DateTime(), server_default=sa.text('now()'), nullable=False, comment='更新时间'),
  72. sa.PrimaryKeyConstraint('id')
  73. )
  74. # 创建知识库表索引
  75. op.create_index('ix_knowledge_bases_name', 'knowledge_bases', ['name'])
  76. op.create_index('ix_knowledge_bases_created_at', 'knowledge_bases', ['created_at'])
  77. # 创建文档和知识库关联表
  78. op.create_table(
  79. 'document_knowledge_base',
  80. sa.Column('document_id', sa.String(length=255), nullable=False),
  81. sa.Column('knowledge_base_id', sa.String(length=255), nullable=False),
  82. sa.Column('created_at', sa.DateTime(), server_default=sa.text('now()'), nullable=False),
  83. sa.ForeignKeyConstraint(['document_id'], ['documents.id'], ),
  84. sa.ForeignKeyConstraint(['knowledge_base_id'], ['knowledge_bases.id'], ),
  85. sa.PrimaryKeyConstraint('document_id', 'knowledge_base_id')
  86. )
  87. # 创建关联表索引
  88. op.create_index('ix_doc_kb_document_id', 'document_knowledge_base', ['document_id'])
  89. op.create_index('ix_doc_kb_knowledge_base_id', 'document_knowledge_base', ['knowledge_base_id'])
  90. # 创建提示词维度表
  91. op.create_table(
  92. 'prompt_dimensions',
  93. sa.Column('id', sa.String(length=255), nullable=False, comment='提示词维度唯一标识'),
  94. sa.Column('knowledge_base_id', sa.String(length=255), nullable=False, comment='所属知识库ID'),
  95. sa.Column('name', sa.String(length=255), nullable=False, comment='维度名称'),
  96. sa.Column('description', sa.Text(), nullable=True, comment='维度描述'),
  97. sa.Column('template', sa.Text(), nullable=False, comment='提示词模板'),
  98. sa.Column('variables', sa.JSON(), nullable=False, comment='模板变量列表'),
  99. sa.Column('config', sa.JSON(), nullable=False, comment='维度配置'),
  100. sa.Column('created_at', sa.DateTime(), server_default=sa.text('now()'), nullable=False, comment='创建时间'),
  101. sa.Column('updated_at', sa.DateTime(), server_default=sa.text('now()'), nullable=False, comment='更新时间'),
  102. sa.ForeignKeyConstraint(['knowledge_base_id'], ['knowledge_bases.id'], ondelete='CASCADE'),
  103. sa.PrimaryKeyConstraint('id')
  104. )
  105. # 创建提示词维度表索引
  106. op.create_index('ix_prompt_dimensions_kb_id', 'prompt_dimensions', ['knowledge_base_id'])
  107. op.create_index('ix_prompt_dimensions_name', 'prompt_dimensions', ['name'])
  108. # 创建解析文档表
  109. op.create_table(
  110. 'parsed_documents',
  111. sa.Column('id', sa.String(length=255), nullable=False, comment='解析文档唯一标识'),
  112. sa.Column('original_filename', sa.String(length=500), nullable=False, comment='原始文件名'),
  113. sa.Column('document_type', sa.String(length=50), nullable=False, comment='文档类型(pdf/image/text/qa_pair)'),
  114. sa.Column('file_path', sa.String(length=1000), nullable=True, comment='文件存储路径'),
  115. sa.Column('file_size', sa.Integer(), nullable=True, comment='文件大小(字节)'),
  116. sa.Column('status', sa.String(length=50), nullable=False, comment='解析状态(pending/processing/completed/failed)'),
  117. sa.Column('error_message', sa.Text(), nullable=True, comment='错误信息(如果解析失败)'),
  118. sa.Column('chunk_count', sa.Integer(), nullable=True, comment='分块数量'),
  119. sa.Column('metadata', sa.JSON(), nullable=False, comment='文档元数据'),
  120. sa.Column('created_at', sa.DateTime(), server_default=sa.text('now()'), nullable=False, comment='创建时间'),
  121. sa.Column('updated_at', sa.DateTime(), server_default=sa.text('now()'), nullable=False, comment='更新时间'),
  122. sa.Column('completed_at', sa.DateTime(), nullable=True, comment='解析完成时间'),
  123. sa.PrimaryKeyConstraint('id')
  124. )
  125. # 创建解析文档表索引
  126. op.create_index('ix_parsed_documents_status', 'parsed_documents', ['status'])
  127. op.create_index('ix_parsed_documents_document_type', 'parsed_documents', ['document_type'])
  128. op.create_index('ix_parsed_documents_created_at', 'parsed_documents', ['created_at'])
  129. # 创建搜索历史表
  130. op.create_table(
  131. 'search_history',
  132. sa.Column('id', sa.String(length=255), nullable=False, comment='搜索历史唯一标识'),
  133. sa.Column('query_text', sa.Text(), nullable=False, comment='搜索查询文本'),
  134. sa.Column('search_type', sa.String(length=50), nullable=False, comment='搜索类型(vector/text/hybrid)'),
  135. sa.Column('top_k', sa.Integer(), nullable=False, comment='返回结果数量'),
  136. sa.Column('filters', sa.JSON(), nullable=True, comment='过滤条件'),
  137. sa.Column('result_count', sa.Integer(), nullable=True, comment='返回结果数量'),
  138. sa.Column('execution_time_ms', sa.Integer(), nullable=True, comment='执行时间(毫秒)'),
  139. sa.Column('user_id', sa.String(length=255), nullable=True, comment='用户ID'),
  140. sa.Column('session_id', sa.String(length=255), nullable=True, comment='会话ID'),
  141. sa.Column('created_at', sa.DateTime(), server_default=sa.text('now()'), nullable=False, comment='创建时间'),
  142. sa.PrimaryKeyConstraint('id')
  143. )
  144. # 创建搜索历史表索引
  145. op.create_index('ix_search_history_created_at', 'search_history', ['created_at'])
  146. op.create_index('ix_search_history_search_type', 'search_history', ['search_type'])
  147. op.create_index('ix_search_history_user_id', 'search_history', ['user_id'])
  148. def downgrade() -> None:
  149. """
  150. 删除所有表
  151. """
  152. # 按照依赖关系的逆序删除表
  153. op.drop_table('search_history')
  154. op.drop_table('parsed_documents')
  155. op.drop_table('prompt_dimensions')
  156. op.drop_table('document_knowledge_base')
  157. op.drop_table('knowledge_bases')
  158. op.drop_table('document_chunks')
  159. op.drop_table('documents')