本文档说明如何设置开发环境、运行测试、以及贡献代码。
git clone https://github.com/YOUR_USERNAME/rag-system.git
cd rag-system
# 创建虚拟环境
python -m venv venv
# 激活虚拟环境
# Linux/macOS:
source venv/bin/activate
# Windows:
venv\Scripts\activate
# 安装生产依赖
pip install -r requirements.txt
# 安装开发依赖
pip install pytest pytest-asyncio pytest-cov hypothesis httpx
pip install flake8 black isort mypy
pip install safety bandit
# 复制示例配置
cp .env.example .env
# 编辑配置(使用你喜欢的编辑器)
nano .env
使用 Docker Compose 启动开发环境所需的服务:
cd docker
docker-compose up -d postgres infinity redis
# 创建数据库(若上一步执行了 cd docker,请先 cd .. 返回项目根目录)
createdb rag_system_dev
# 运行迁移
alembic upgrade head
# 运行测试
pytest
# 启动应用
uvicorn src.main:app --reload
访问 http://localhost:8000/docs 查看 API 文档。
rag_system/
├── src/ # 源代码
│ ├── domain/ # 领域层(业务逻辑)
│ │ ├── shared/ # 共享领域概念
│ │ ├── vector_search/ # 向量搜索领域
│ │ ├── document_parsing/# 文档解析领域
│ │ └── knowledge_base/ # 知识库领域
│ ├── application/ # 应用层(用例)
│ │ ├── shared/ # 共享应用概念
│ │ ├── vector_search/ # 向量搜索用例
│ │ ├── document_parsing/# 文档解析用例
│ │ └── knowledge_base/ # 知识库用例
│ ├── infrastructure/ # 基础设施层(技术实现)
│ │ ├── database/ # 数据库实现
│ │ ├── vector_db/ # 向量数据库适配器
│ │ ├── parsers/ # 文档解析器
│ │ ├── external_services/# 外部服务集成
│ │ └── file_storage/ # 文件存储
│ ├── presentation/ # 表现层(API)
│ │ ├── api/ # API 路由和中间件
│ │ └── schemas/ # 请求/响应模型
│ ├── config/ # 配置管理
│ └── main.py # 应用入口
├── tests/ # 测试代码
│ ├── unit/ # 单元测试
│ ├── integration/ # 集成测试
│ ├── e2e/ # 端到端测试
│ └── fixtures/ # 测试夹具
├── docs/ # 文档
├── scripts/ # 工具脚本
├── docker/ # Docker 配置
└── .github/ # GitHub 配置
详细的目录结构说明请参考 Directory Structure。
# 从 main 分支创建新分支
git checkout main
git pull origin main
git checkout -b feature/your-feature-name
遵循 Clean Architecture 原则,将代码放入 src/ 下对应的层:领域层(domain)、应用层(application)、基础设施层(infrastructure)、表现层(presentation)。
为每个层编写相应的测试:
# 运行测试
pytest
# 运行特定测试
pytest tests/unit/domain/
# 运行并查看覆盖率
pytest --cov=src --cov-report=html
# 格式化代码
black src tests
isort src tests
# 代码检查
flake8 src tests
# 类型检查
mypy src
# 安全检查
safety check
bandit -r src
使用规范的提交信息:
git add .
git commit -m "feat: add new feature"
提交信息格式:
- feat: 新功能
- fix: 修复 bug
- docs: 文档更新
- style: 代码格式(不影响功能)
- refactor: 重构
- test: 测试相关
- chore: 构建/工具相关

git push origin feature/your-feature-name
然后在 GitHub 上创建 Pull Request。
遵循 PEP 8 规范:
# 好的示例
def calculate_similarity(vector_a: List[float], vector_b: List[float]) -> float:
"""
Calculate cosine similarity between two vectors.
Args:
vector_a: First vector
vector_b: Second vector
Returns:
Similarity score between 0 and 1
Raises:
ValueError: If vectors have different dimensions
"""
if len(vector_a) != len(vector_b):
raise ValueError("Vectors must have same dimension")
# Implementation
return similarity_score
所有函数都应该有类型提示:
from typing import List, Optional, Dict, Any
def process_document(
content: str,
metadata: Optional[Dict[str, Any]] = None
) -> List[str]:
"""Process document and return chunks."""
pass
使用 Google 风格的文档字符串:
def search_documents(query: str, limit: int = 10) -> List[Document]:
"""
Search documents using hybrid search.
This function combines vector search and full-text search
to provide better search results.
Args:
query: Search query string
limit: Maximum number of results to return
Returns:
List of matching documents sorted by relevance
Raises:
ValueError: If query is empty
DatabaseError: If database connection fails
Example:
>>> results = search_documents("machine learning", limit=5)
>>> len(results)
5
"""
pass
- 类名: PascalCase (DocumentParser, VectorDatabase)
- 函数和变量: snake_case (parse_document, search_vectors)
- 常量: UPPER_SNAKE_CASE (MAX_CHUNK_SIZE, DEFAULT_LIMIT)
- 私有成员: 前导下划线 (_internal_method, _cache)

# 1. 标准库
import os
import sys
from typing import List, Optional
# 2. 第三方库
import numpy as np
from fastapi import FastAPI, HTTPException
# 3. 本地模块
from src.domain.entities import Document
from src.application.handlers import SearchHandler
tests/
├── unit/ # 单元测试(快速、隔离)
│ ├── domain/ # 领域层测试
│ ├── application/ # 应用层测试
│ └── infrastructure/ # 基础设施层测试
├── integration/ # 集成测试(组件交互)
│ ├── api/ # API 集成测试
│ └── database/ # 数据库集成测试
└── e2e/ # 端到端测试(完整流程)
import pytest
from src.domain.vector_search.entities import Document
def test_document_creation():
"""Test document entity creation."""
doc = Document(
id="doc_123",
content="Test content",
metadata={"title": "Test"}
)
assert doc.id == "doc_123"
assert doc.content == "Test content"
assert doc.metadata["title"] == "Test"
def test_document_validation():
"""Test document validation."""
with pytest.raises(ValueError):
Document(id="", content="Test") # Empty ID should fail
import pytest
from src.infrastructure.database.session import get_session
@pytest.fixture
def db_session():
"""Provide a database session for testing."""
session = get_session()
yield session
session.rollback()
session.close()
@pytest.fixture
def sample_document():
"""Provide a sample document for testing."""
return Document(
id="test_doc",
content="Sample content",
metadata={"title": "Sample"}
)
def test_save_document(db_session, sample_document):
"""Test saving document to database."""
repository = DocumentRepository(db_session)
saved_doc = repository.save(sample_document)
assert saved_doc.id == sample_document.id
from unittest.mock import Mock, patch
def test_search_with_mock():
"""Test search with mocked vector database."""
mock_vector_db = Mock()
mock_vector_db.search.return_value = [
{"id": "doc_1", "score": 0.9},
{"id": "doc_2", "score": 0.8}
]
service = SearchService(vector_db=mock_vector_db)
results = service.search("test query")
assert len(results) == 2
mock_vector_db.search.assert_called_once()
使用 Hypothesis 进行属性测试:
from hypothesis import given, strategies as st
@given(st.lists(st.floats(min_value=0, max_value=1), min_size=1, max_size=1000))
def test_vector_normalization(vector):
"""Test that vector normalization always produces unit vectors."""
normalized = normalize_vector(vector)
magnitude = sum(x**2 for x in normalized) ** 0.5
assert abs(magnitude - 1.0) < 1e-6
# 运行所有测试
pytest
# 运行特定测试文件
pytest tests/unit/domain/test_entities.py
# 运行特定测试函数
pytest tests/unit/domain/test_entities.py::test_document_creation
# 运行特定标记的测试
pytest -m unit # 只运行单元测试
pytest -m integration # 只运行集成测试
pytest -m "not slow" # 跳过慢速测试
# 并行运行测试
pytest -n auto
# 查看覆盖率
pytest --cov=src --cov-report=html
open htmlcov/index.html
# 在代码中设置断点
import pdb; pdb.set_trace()
# 或使用 breakpoint()(Python 3.7+)
breakpoint()
创建 .vscode/launch.json:
{
"version": "0.2.0",
"configurations": [
{
"name": "Python: FastAPI",
"type": "python",
"request": "launch",
"module": "uvicorn",
"args": [
"src.main:app",
"--reload"
],
"jinja": true,
"justMyCode": false
},
{
"name": "Python: Pytest",
"type": "python",
"request": "launch",
"module": "pytest",
"args": [
"-v"
],
"console": "integratedTerminal",
"justMyCode": false
}
]
}
import logging
logger = logging.getLogger(__name__)
def process_document(doc: Document):
logger.debug(f"Processing document: {doc.id}")
logger.info(f"Document size: {len(doc.content)}")
try:
result = parse(doc)
logger.info(f"Successfully parsed document: {doc.id}")
return result
except Exception as e:
logger.error(f"Failed to parse document: {doc.id}", exc_info=True)
raise
import cProfile
import pstats
# 分析函数性能
profiler = cProfile.Profile()
profiler.enable()
# 运行代码
result = expensive_function()
profiler.disable()
stats = pstats.Stats(profiler)
stats.sort_stats('cumulative')
stats.print_stats(10) # 显示前 10 个最慢的函数
运行所有检查:
python scripts/run_tests.py --all
提交代码
推送到你的 fork
创建 Pull Request
使用 GitHub Issues 报告 bug,请包含:问题描述、复现步骤、预期行为与实际行为,以及环境信息(操作系统、Python 版本)。
使用 GitHub Issues 提出功能请求,请包含:功能描述、使用场景,以及期望的行为。
# 启动开发服务器
uvicorn src.main:app --reload
# 运行测试
pytest
# 代码格式化
black src tests
isort src tests
# 代码检查
flake8 src tests
mypy src
# 生成覆盖率报告
pytest --cov=src --cov-report=html
# 运行所有检查
python scripts/run_tests.py --all
# 创建迁移
alembic revision --autogenerate -m "描述"
# 应用迁移
alembic upgrade head
# 回滚迁移
alembic downgrade -1
# 查看迁移历史
alembic history
# 启动服务
docker-compose up -d
# 查看日志
docker-compose logs -f
# 停止服务
docker-compose down
# 重建镜像
docker-compose build --no-cache
VS Code: 推荐插件
PyCharm: 专业的 Python IDE
httpie: 更友好的 HTTP 客户端
pip install httpie
http POST localhost:8000/api/v1/documents/ content="test"
jq: JSON 处理工具
curl localhost:8000/api/v1/documents/ | jq '.data'
祝你开发愉快!🚀