RAG Construction
Overview
Based on DDC methodology (Chapter 2.3), this skill builds Retrieval-Augmented Generation (RAG) systems for construction knowledge bases, enabling semantic search and AI-powered question answering over construction documents.
Book Reference: "Pandas DataFrame и LLM ChatGPT" / "Pandas DataFrame and LLM ChatGPT"
Quick Start
from dataclasses import dataclass, field
from enum import Enum
from typing import List, Dict, Optional, Any, Callable
from datetime import datetime
import json
import hashlib
import re
class DocumentType(Enum):
    """Closed set of construction document categories.

    Every member's value is the lowercase, underscore-separated form of
    its name, so a member can be recovered from its stored string with
    ``DocumentType("rfi")``.
    """

    SPECIFICATION = "specification"
    DRAWING = "drawing"
    CONTRACT = "contract"
    RFI = "rfi"
    SUBMITTAL = "submittal"
    CHANGE_ORDER = "change_order"
    MEETING_MINUTES = "meeting_minutes"
    DAILY_REPORT = "daily_report"
    SAFETY_REPORT = "safety_report"
    INSPECTION = "inspection"
    MANUAL = "manual"
    STANDARD = "standard"
class ChunkingStrategy(Enum):
    """Selectable ways of splitting document text into chunks.

    Each member's value is the lowercase form of its name.
    """

    FIXED_SIZE = "fixed_size"
    PARAGRAPH = "paragraph"
    SECTION = "section"
    SEMANTIC = "semantic"
    SENTENCE = "sentence"
@dataclass
class DocumentChunk:
    """One piece of a document's text, plus the bookkeeping needed to index it."""

    # Identifier of this chunk.
    id: str
    # Identifier of the document the chunk was cut from.
    document_id: str
    # The chunk's text.
    content: str
    # Free-form key/value data carried alongside the text.
    metadata: Dict[str, Any]
    # Embedding vector; stays None until one is assigned.
    embedding: Optional[List[float]] = None
    # Size of the chunk in tokens; defaults to 0 when not supplied.
    token_count: int = 0
    # Ordinal assigned by whoever produced the chunk; defaults to 0.
    position: int = 0
@dataclass
class Document:
    """A construction document together with any chunks derived from it."""

    # Identifier of the document.
    id: str
    # Human-readable title.
    title: str
    # Category of the document.
    doc_type: DocumentType
    # Full text of the document.
    content: str
    # Where the document came from (format not constrained here).
    source: str
    # Free-form key/value data; each instance gets its own empty dict.
    metadata: Dict[str, Any] = field(default_factory=dict)
    # Chunks attached to the document; each instance gets its own empty list.
    chunks: List[DocumentChunk] = field(default_factory=list)
    # Set to datetime.now() (naive, local clock) when the instance is created.
    created_at: datetime = field(default_factory=datetime.now)
@dataclass
class SearchResult:
    """A single hit returned by a vector-store search."""

    # The chunk that matched.
    chunk: DocumentChunk
    # Score attached to the hit; its scale and direction are not defined here.
    score: float
    # Title of the document the chunk belongs to.
    document_title: str
    # Category of that document.
    doc_type: DocumentType
@dataclass
class RAGResponse:
    """The outcome of answering one query with the RAG system."""

    # The question that was asked.
    query: str
    # The generated answer text.
    answer: str
    # Search hits associated with the answer.
    sources: List[SearchResult]
    # Confidence value for the answer; range is not defined here.
    confidence: float
    # Token count reported for producing the answer.
    tokens_used: int
class TextChunker:
"""Split documents into chunks for embedding"""
def __init__(
    self,
    strategy: ChunkingStrategy = ChunkingStrategy.PARAGRAPH,
    chunk_size: int = 500,
    chunk_overlap: int = 50
):
    """Record the chunking configuration on the instance.

    Args:
        strategy: Splitting approach that ``chunk_document`` dispatches on.
        chunk_size: Target chunk length; the fixed-size and paragraph
            chunkers measure it in characters.
        chunk_overlap: Number of characters by which consecutive
            fixed-size chunks overlap.
    """
    self.chunk_overlap = chunk_overlap
    self.chunk_size = chunk_size
    self.strategy = strategy
def chunk_document(self, document: Document) -> List[DocumentChunk]:
    """Split *document* using the strategy configured on this chunker.

    Strategies without a dedicated handler (for example SEMANTIC) are
    served by the fixed-size chunker.
    """
    # Checked in order; handler names are resolved lazily so only the
    # selected method is ever looked up.
    handlers = (
        (ChunkingStrategy.FIXED_SIZE, "_chunk_fixed_size"),
        (ChunkingStrategy.PARAGRAPH, "_chunk_by_paragraph"),
        (ChunkingStrategy.SECTION, "_chunk_by_section"),
        (ChunkingStrategy.SENTENCE, "_chunk_by_sentence"),
    )
    for candidate, method_name in handlers:
        if self.strategy == candidate:
            return getattr(self, method_name)(document)
    return self._chunk_fixed_size(document)
def _chunk_fixed_size(self, document: Document) -> List[DocumentChunk]:
    """Chunk by fixed character size with overlap.

    A window of ``chunk_size`` characters slides over the text. Each
    window is shortened to the nearest preceding space, newline or tab
    so words are not split, and the next window starts ``chunk_overlap``
    characters before the previous cut.

    The loop always advances: if backing up to whitespace would leave
    the next window starting at or before the current one (no whitespace
    in the window, or the only whitespace lies inside the overlap
    region), the window is cut at its full size instead. The overlap is
    capped below the window size for the same reason, and the window is
    clamped to the end of the text so no redundant tail chunk is emitted
    after the text has been fully covered.

    Args:
        document: Document whose ``content`` is split.

    Returns:
        Chunks in text order, with consecutive ``position`` values
        starting at 0. Whitespace-only windows produce no chunk.
    """
    text = document.content
    text_len = len(text)

    # A non-positive window or an overlap >= window can never advance.
    window = max(1, self.chunk_size)
    overlap = min(max(0, self.chunk_overlap), window - 1)

    chunks = []
    start = 0
    position = 0

    while start < text_len:
        end = min(start + window, text_len)

        if end < text_len:
            # Find word boundary
            boundary = end
            while boundary > start and text[boundary] not in ' \n\t':
                boundary -= 1
            # Accept the boundary only if the next start (boundary - overlap)
            # lands strictly after the current start; otherwise hard-cut.
            if boundary > start + overlap:
                end = boundary

        chunk_text = text[start:end].strip()
        if chunk_text:
            chunk_id = self._generate_chunk_id(document.id, position)
            chunks.append(DocumentChunk(
                id=chunk_id,
                document_id=document.id,
                content=chunk_text,
                metadata={
                    "doc_type": document.doc_type.value,
                    "title": document.title,
                    **document.metadata
                },
                # Whitespace-separated word count used as the token estimate.
                token_count=len(chunk_text.split()),
                position=position
            ))
            position += 1

        # The window reached the end of the text: everything is covered.
        if end >= text_len:
            break
        start = end - overlap

    return chunks
def _chunk_by_paragraph(self, document: Document) -> List[DocumentChunk]:
    """Group blank-line-separated paragraphs into chunks.

    Paragraphs are stripped, empty ones are dropped, and the rest are
    joined with a blank line for as long as the accumulated length plus
    the next paragraph stays below ``chunk_size``. A paragraph that does
    not fit starts a new chunk; a single paragraph is never split.
    """
    result = []

    def emit(text):
        # Positions are consecutive, so the next one equals the count so far.
        index = len(result)
        identifier = self._generate_chunk_id(document.id, index)
        result.append(DocumentChunk(
            id=identifier,
            document_id=document.id,
            content=text,
            metadata={
                "doc_type": document.doc_type.value,
                "title": document.title,
                **document.metadata
            },
            token_count=len(text.split()),
            position=index
        ))

    buffer = ""
    for raw in document.content.split('\n\n'):
        paragraph = raw.strip()
        if not paragraph:
            continue

        if len(buffer) + len(paragraph) >= self.chunk_size:
            # Does not fit: flush what has accumulated and start afresh.
            if buffer:
                emit(buffer)
            buffer = paragraph
            continue

        buffer = f"{buffer}\n\n{paragraph}" if buffer else paragraph

    # Whatever is still buffered becomes the final chunk.
    if buffer:
        emit(buffer)

    return result
def _chunk_by_section(self, document: Document) -> List[DocumentChunk]:
"""Chunk by document sections (headers)"""
# Split by common section patterns
section_pattern = r'\n(?=(?:\d+\.|\d+\s|SECTION|ARTICLE|PART)\s+[A-Z])'
sections = re.split(section_pattern, document.content)
chunks = []
for position, section in enumerate(sections):
section = section.strip()
if section:
# If section is too large, further split it
if len(section) > self.chunk_size * 2:
sub_chunker = TextChunker(ChunkingStrategy.PARAGRAPH, self.chunk_size)
sub_doc = Document(
id=f"{document.id}_sec{position}",
title=document.title,
doc_type=document.doc_type,
content=section,
source=document.source,
metadata=document.metadata
)
sub_chunks = sub_chunker.chunk_document(sub_doc)
for i, chunk in enumerate(sub_chunks):
chunk.id = self._generate_chunk_id(document.id, position * 100 + i)
chunk.position = position * 100 + i
chunks.extend(sub_chunks)
else:
chunk_id = self._generate_chunk_id(document.id, position)
chunks.append(DocumentChunk(
id=chunk_id,
document_id=document.id,
content=section,
metadata={
"doc_type": document.doc_type.value,
"title": document.title,
**document.metadata
},