Data Type Classifier
Overview
Based on DDC methodology (Chapter 2.1), this skill classifies construction data by type, analyzes data sources, and recommends appropriate storage, processing, and integration methods.
Book Reference: "Типы данных в строительстве" / "Data Types in Construction"
Quick Start
from dataclasses import dataclass, field
from enum import Enum
from typing import List, Dict, Optional, Any, Tuple
from datetime import datetime
import json
import re
import mimetypes
class DataStructure(Enum):
    """Structural category assigned to a construction data source."""

    # Tables, databases, spreadsheets.
    STRUCTURED = "structured"

    # JSON, XML, IFC.
    SEMI_STRUCTURED = "semi_structured"

    # Documents, images, videos.
    UNSTRUCTURED = "unstructured"

    # CAD and BIM geometry.
    GEOMETRIC = "geometric"

    # Time-series data and schedules.
    TEMPORAL = "temporal"

    # GIS data and coordinates.
    SPATIAL = "spatial"
class DataFormat(Enum):
    """File and storage formats commonly met in construction data."""

    # --- Structured ---------------------------------------------------
    CSV = "csv"
    EXCEL = "excel"
    SQL = "sql"
    PARQUET = "parquet"

    # --- Semi-structured ----------------------------------------------
    JSON = "json"
    XML = "xml"
    IFC = "ifc"
    BCF = "bcf"

    # --- Unstructured -------------------------------------------------
    PDF = "pdf"
    DOCX = "docx"
    IMAGE = "image"
    VIDEO = "video"

    # --- Geometric ----------------------------------------------------
    DWG = "dwg"
    DXF = "dxf"
    RVT = "rvt"
    NWD = "nwd"
    OBJ = "obj"
    STL = "stl"

    # --- Schedule -----------------------------------------------------
    MPP = "mpp"
    P6 = "p6"
    XER = "xer"
class StorageRecommendation(Enum):
    """Kinds of storage system the classifier can recommend."""

    # Database-style targets.
    RELATIONAL_DB = "relational_database"
    DOCUMENT_DB = "document_database"

    # Blob-style target.
    OBJECT_STORAGE = "object_storage"

    # Specialised databases.
    GRAPH_DB = "graph_database"
    TIME_SERIES_DB = "time_series_database"
    VECTOR_DB = "vector_database"

    # File-oriented targets.
    FILE_SYSTEM = "file_system"
    DATA_LAKE = "data_lake"
@dataclass
class DataCharacteristics:
    """Describes the observable traits of one data source.

    The seven boolean flags are required and positional; the three
    trailing descriptors are optional and default to ``None``.
    """

    # --- Required boolean flags ---
    has_schema: bool
    has_relationships: bool
    is_queryable: bool
    is_binary: bool
    has_geometry: bool
    has_temporal: bool
    has_text_content: bool

    # --- Optional descriptors ---
    # Average size of one record, in bytes.
    avg_record_size: Optional[int] = None
    # Rough volume bucket: small / medium / large / huge.
    estimated_volume: Optional[str] = None
    # Free-form string; the accepted values are not defined in this section.
    update_frequency: Optional[str] = None
@dataclass
class DataClassification:
    """Outcome of classifying a single data source.

    All fields are required and are accepted in the order declared below.
    """

    # --- Identification of the source ---
    source_name: str
    source_type: str

    # --- What was detected ---
    detected_format: DataFormat
    structure: DataStructure
    characteristics: DataCharacteristics

    # --- Recommendations derived from the detection ---
    storage_recommendation: StorageRecommendation
    processing_tools: List[str]
    integration_options: List[str]
    quality_considerations: List[str]

    # Confidence score of the classification.
    # NOTE(review): presumably in the range 0.0-1.0 - confirm with the producer.
    confidence: float
@dataclass
class ClassificationReport:
    """Aggregated classification results for a set of data sources.

    All fields are required and are accepted in the order declared below.
    """

    # Number of sources covered by the report.
    total_sources: int
    # Per-source classification results.
    classifications: List[DataClassification]

    # --- Summary counters (string key -> count) ---
    summary_by_structure: Dict[str, int]
    summary_by_format: Dict[str, int]

    # --- Recommendations ---
    # String key -> list of strings.
    # NOTE(review): presumably storage type -> source names - confirm.
    storage_recommendations: Dict[str, List[str]]
    # String key -> string describing the integration approach.
    integration_strategy: Dict[str, str]
class DataTypeClassifier:
"""
Classify construction data by type and recommend processing methods.
Based on DDC methodology Chapter 2.1.
"""
def __init__(self) -> None:
    """Populate the classifier's lookup tables.

    Each table is produced by its matching ``_define_*`` method and
    stored on the instance under the attribute of the same stem.
    """
    # Format detection signatures.
    self.format_signatures = self._define_format_signatures()

    # Characteristics of each structure type.
    self.structure_mapping = self._define_structure_mapping()

    # Storage recommendation per structure.
    self.storage_mapping = self._define_storage_mapping()

    # Processing tools per format.
    self.processing_tools = self._define_processing_tools()
def _define_format_signatures(self) -> Dict[str, Dict]:
    """Define format detection signatures, keyed by file extension.

    Keys are lower-case extensions including the leading dot. Every value
    is a fresh dict with two entries: ``"format"`` (a ``DataFormat``
    member) and ``"structure"`` (a ``DataStructure`` member).

    In addition to each format's primary extension, common alias
    extensions are registered (``.xlsm``, ``.bcfzip``, ``.jpeg``,
    ``.tif``, ``.tiff``) so that those files resolve to a format that is
    already supported instead of going unrecognised. All previously
    defined keys are kept with the same values and relative order.

    Returns:
        The extension -> signature table.
    """

    def signature(fmt: DataFormat, structure: DataStructure) -> Dict:
        # A new dict per entry, so one entry can never alias another.
        return {"format": fmt, "structure": structure}

    structured = DataStructure.STRUCTURED
    semi_structured = DataStructure.SEMI_STRUCTURED
    unstructured = DataStructure.UNSTRUCTURED
    geometric = DataStructure.GEOMETRIC
    temporal = DataStructure.TEMPORAL

    # NOTE(review): DataFormat also declares SQL, OBJ, STL and P6, but no
    # extension maps to them here. That is left unchanged because the
    # consumers of those formats are outside this section.
    # NOTE(review): keys are lower-case only; callers are presumably
    # expected to lower-case the extension before lookup - confirm.
    return {
        # Structured
        ".csv": signature(DataFormat.CSV, structured),
        ".xlsx": signature(DataFormat.EXCEL, structured),
        ".xls": signature(DataFormat.EXCEL, structured),
        ".xlsm": signature(DataFormat.EXCEL, structured),  # macro-enabled workbook
        # Semi-structured
        ".json": signature(DataFormat.JSON, semi_structured),
        ".xml": signature(DataFormat.XML, semi_structured),
        ".ifc": signature(DataFormat.IFC, semi_structured),
        ".bcf": signature(DataFormat.BCF, semi_structured),
        ".bcfzip": signature(DataFormat.BCF, semi_structured),  # BCF 2.x container
        # Unstructured documents
        ".pdf": signature(DataFormat.PDF, unstructured),
        ".docx": signature(DataFormat.DOCX, unstructured),
        # Geometric
        ".dwg": signature(DataFormat.DWG, geometric),
        ".dxf": signature(DataFormat.DXF, geometric),
        ".rvt": signature(DataFormat.RVT, geometric),
        ".nwd": signature(DataFormat.NWD, geometric),
        # Schedules
        ".mpp": signature(DataFormat.MPP, temporal),
        ".xer": signature(DataFormat.XER, temporal),
        # Columnar
        ".parquet": signature(DataFormat.PARQUET, structured),
        # Images and video
        ".jpg": signature(DataFormat.IMAGE, unstructured),
        ".jpeg": signature(DataFormat.IMAGE, unstructured),
        ".png": signature(DataFormat.IMAGE, unstructured),
        ".tif": signature(DataFormat.IMAGE, unstructured),
        ".tiff": signature(DataFormat.IMAGE, unstructured),
        ".mp4": signature(DataFormat.VIDEO, unstructured),
    }
def _define_structure_mapping(self) -> Dict[DataStructure, Dict]:
    """Build the descriptive profile for every ``DataStructure`` member.

    Returns:
        A dict keyed by ``DataStructure``. Each value holds a
        ``"description"`` string, an ``"examples"`` list, and the
        booleans ``"query_support"`` and ``"schema_required"``.
    """
    # One row per structure:
    # (structure, description, examples, query_support, schema_required)
    profiles = (
        (
            DataStructure.STRUCTURED,
            "Tabular data with fixed schema",
            ["Cost databases", "Material lists", "Vendor records"],
            True,
            True,
        ),
        (
            DataStructure.SEMI_STRUCTURED,
            "Hierarchical data with flexible schema",
            ["BIM models (IFC)", "API responses", "Configuration files"],
            True,
            False,
        ),
        (
            DataStructure.UNSTRUCTURED,
            "No predefined schema or format",
            ["Contracts", "Photos", "Emails", "Meeting notes"],
            False,
            False,
        ),
        (
            DataStructure.GEOMETRIC,
            "3D/2D geometric and spatial data",
            ["CAD drawings", "BIM geometry", "Point clouds"],
            True,
            True,
        ),
        (
            DataStructure.TEMPORAL,
            "Time-based sequential data",
            ["Schedules", "Progress data", "Sensor readings"],
            True,
            True,
        ),
        (
            DataStructure.SPATIAL,
            "Geographic and location data",
            ["Site maps", "GPS tracks", "GIS layers"],
            True,
            True,
        ),
    )

    return {
        structure: {
            "description": description,
            "examples": examples,
            "query_support": queryable,
            "schema_required": needs_schema,
        }
        for structure, description, examples, queryable, needs_schema in profiles
    }
def _define_storage_mapping(self) -> Dict[DataStructure, StorageRecommendation]:
    """Build the table that pairs each data structure with a storage type.

    Returns:
        A dict with one ``StorageRecommendation`` per ``DataStructure``
        member.
    """
    # (structure, recommended storage) pairs, in declaration order.
    pairs = (
        (DataStructure.STRUCTURED, StorageRecommendation.RELATIONAL_DB),
        (DataStructure.SEMI_STRUCTURED, StorageRecommendation.DOCUMENT_DB),
        (DataStructure.UNSTRUCTURED, StorageRecommendation.OBJECT_STORAGE),
        (DataStructure.GEOMETRIC, StorageRecommendation.FILE_SYSTEM),
        (DataStructure.TEMPORAL, StorageRecommendation.TIME_SERIES_DB),
        (DataStructure.SPATIAL, StorageRecommendation.RELATIONAL_DB),
    )
    return dict(pairs)
def _define_processing_tools(self) -> Dict[DataFormat, List[str]]:
"""Define processing tools for each format"""
return {
DataFormat.CSV: ["pandas", "polars", "duckdb"],
DataFormat.EXCEL: ["panda