Data Silo Detection
Overview
Based on the DDC methodology (Chapter 1.2), this skill detects and maps data silos in construction organizations, identifying disconnected data sources, duplicate data, and integration opportunities.
Book Reference: "Технологии и системы управления в современном строительстве" / "Technologies and Management Systems in Modern Construction"
Quick Start
from dataclasses import dataclass, field
from enum import Enum
from typing import List, Dict, Optional, Set, Tuple
from datetime import datetime
import json
from collections import defaultdict
class DataDomain(Enum):
    """Construction data domains.

    Each member's value is the lowercase string form of its name, so a
    member can be recovered from stored text with ``DataDomain("cost")``.
    """

    DESIGN = "design"
    COST = "cost"
    SCHEDULE = "schedule"
    QUALITY = "quality"
    SAFETY = "safety"
    PROCUREMENT = "procurement"
    SITE = "site"
    DOCUMENT = "document"
    FINANCIAL = "financial"
    HR = "hr"
class SiloSeverity(Enum):
    """Severity level of data silo.

    Members are declared from most to least severe; the values are plain
    strings, so they carry no built-in ordering of their own.
    """

    CRITICAL = "critical"  # Major business impact
    HIGH = "high"          # Significant inefficiency
    MEDIUM = "medium"      # Noticeable issues
    LOW = "low"            # Minor inconvenience
class DataSourceType(Enum):
    """Types of data sources.

    Each member's value is the lowercase string form of its name.
    """

    DATABASE = "database"
    SPREADSHEET = "spreadsheet"
    FILE_SHARE = "file_share"
    CLOUD_APP = "cloud_app"
    DESKTOP_APP = "desktop_app"
    PAPER = "paper"
    EMAIL = "email"
    PERSONAL = "personal"
@dataclass
class DataSource:
    """Represents a data source in the organization.

    The first eight fields are required; the rest have defaults so a
    source can be registered before its integration details are known.
    """

    id: str                   # key used for this source in the connectivity graph
    name: str
    type: DataSourceType
    domain: DataDomain
    owner: str
    department: str
    users: List[str]
    data_entities: List[str]
    # Identifiers this source is linked to; the connectivity graph treats
    # each entry as a node key (presumably ids of other DataSource objects
    # -- verify against callers).
    connections: List[str] = field(default_factory=list)
    update_frequency: str = "unknown"
    access_level: str = "department"  # personal, department, organization
    has_api: bool = False
    last_modified: Optional[datetime] = None
@dataclass
class DataSilo:
    """Detected data silo.

    One record per finding: the sources involved, how severe the finding
    is, and what to do about it.
    """

    id: str
    sources: List[DataSource]
    domain: DataDomain
    severity: SiloSeverity
    issue_type: str
    description: str
    impact: str
    affected_users: int
    affected_processes: List[str]
    recommendations: List[str]
    # Optional cost figure; no unit or currency is fixed by this class.
    estimated_cost: Optional[float] = None
@dataclass
class DuplicateData:
    """Detected duplicate data across sources.

    Records one entity that is held in more than one place, together with
    how far the copies disagree.
    """

    entity_name: str
    sources: List[str]
    discrepancy_rate: float  # 0-1 (documented range; not enforced here)
    master_source: Optional[str] = None
    # default_factory gives every instance its own list rather than a
    # single list shared between instances.
    issues: List[str] = field(default_factory=list)
@dataclass
class SiloAnalysis:
    """Complete silo analysis results.

    Result object for one analysis run over an organization's data
    sources. All fields are required.
    """

    organization: str
    analysis_date: datetime
    total_sources: int
    silos_detected: List[DataSilo]
    duplicates: List[DuplicateData]
    connectivity_score: float  # scale is set by the scoring code, not by this class
    data_flow_gaps: List[Dict]
    priority_actions: List[str]
    integration_roadmap: Dict
class DataSiloDetector:
"""
Detect and analyze data silos in construction organizations.
Based on DDC methodology Chapter 1.2.
"""
def __init__(self):
    """Build the two lookup tables the detector keeps on the instance.

    ``domain_relationships`` comes from ``_define_domain_relationships``
    and ``critical_entities`` from ``_define_critical_entities``; both
    are computed once here and stored as plain attributes.
    """
    self.domain_relationships = self._define_domain_relationships()
    self.critical_entities = self._define_critical_entities()
def _define_domain_relationships(self) -> Dict[DataDomain, List[DataDomain]]:
    """Define expected relationships between domains.

    Returns:
        A new dict mapping a domain to the list of domains it is
        expected to exchange data with. A fresh dict is built on every
        call.

    NOTE(review): DataDomain.DOCUMENT and DataDomain.HR appear as related
    domains below but have no key of their own, so indexing the result
    with either raises KeyError; use ``.get`` or confirm this is intended.
    """
    return {
        DataDomain.DESIGN: [
            DataDomain.COST, DataDomain.SCHEDULE,
            DataDomain.PROCUREMENT, DataDomain.QUALITY
        ],
        DataDomain.COST: [
            DataDomain.DESIGN, DataDomain.SCHEDULE,
            DataDomain.FINANCIAL, DataDomain.PROCUREMENT
        ],
        DataDomain.SCHEDULE: [
            DataDomain.DESIGN, DataDomain.COST,
            DataDomain.SITE, DataDomain.HR
        ],
        DataDomain.PROCUREMENT: [
            DataDomain.COST, DataDomain.DESIGN,
            DataDomain.SITE, DataDomain.FINANCIAL
        ],
        DataDomain.SITE: [
            DataDomain.SCHEDULE, DataDomain.SAFETY,
            DataDomain.QUALITY, DataDomain.HR
        ],
        DataDomain.QUALITY: [
            DataDomain.DESIGN, DataDomain.SITE,
            DataDomain.DOCUMENT
        ],
        DataDomain.SAFETY: [
            DataDomain.SITE, DataDomain.HR,
            DataDomain.DOCUMENT
        ],
        DataDomain.FINANCIAL: [
            DataDomain.COST, DataDomain.PROCUREMENT,
            DataDomain.HR
        ]
    }
def _define_critical_entities(self) -> Dict[str, List[DataDomain]]:
    """Define entities that should be shared across domains.

    Returns:
        A new dict keyed by lowercase entity name; each value lists the
        domains that entity is expected to be shared between.
    """
    return {
        "project": [DataDomain.DESIGN, DataDomain.COST, DataDomain.SCHEDULE],
        "budget": [DataDomain.COST, DataDomain.FINANCIAL, DataDomain.PROCUREMENT],
        "schedule": [DataDomain.SCHEDULE, DataDomain.SITE, DataDomain.PROCUREMENT],
        "material": [DataDomain.DESIGN, DataDomain.COST, DataDomain.PROCUREMENT],
        "labor": [DataDomain.HR, DataDomain.COST, DataDomain.SCHEDULE],
        "subcontractor": [DataDomain.PROCUREMENT, DataDomain.COST, DataDomain.SCHEDULE],
        "rfi": [DataDomain.DESIGN, DataDomain.DOCUMENT, DataDomain.SITE],
        "change_order": [DataDomain.COST, DataDomain.DESIGN, DataDomain.SCHEDULE]
    }
def detect_silos(
    self,
    organization: str,
    data_sources: List[DataSource],
    process_flows: Optional[List[Dict]] = None
) -> SiloAnalysis:
    """
    Detect data silos in the organization.

    Runs each detection pass over ``data_sources``, merges the isolated-
    source and domain silos into one prioritized list, and packages every
    result into a single SiloAnalysis.

    Args:
        organization: Organization name
        data_sources: List of data sources to analyze. It is passed to
            several helpers in turn and to ``len``, so it must be a
            re-iterable, sized collection rather than a one-shot iterator.
        process_flows: Optional business process flows; forwarded
            unchanged (including None) to the flow-gap detection.

    Returns:
        Complete silo analysis. ``analysis_date`` is taken from
        ``datetime.now()``, i.e. a naive local timestamp.
    """
    # Build connectivity graph
    connectivity = self._build_connectivity_graph(data_sources)

    # Detect isolated sources
    isolated_silos = self._detect_isolated_sources(
        data_sources, connectivity
    )

    # Detect domain silos
    domain_silos = self._detect_domain_silos(data_sources)

    # Detect duplicate data
    duplicates = self._detect_duplicates(data_sources)

    # Detect data flow gaps
    flow_gaps = self._detect_flow_gaps(
        data_sources, process_flows
    )

    # Calculate connectivity score
    connectivity_score = self._calculate_connectivity_score(
        data_sources, connectivity
    )

    # Combine all silos (duplicates and flow gaps are reported in their
    # own fields and are not part of this list)
    all_silos = isolated_silos + domain_silos

    # Prioritize silos
    prioritized_silos = self._prioritize_silos(all_silos)

    # Generate priority actions
    priority_actions = self._generate_priority_actions(
        prioritized_silos, duplicates
    )

    # Create integration roadmap
    roadmap = self._create_integration_roadmap(
        prioritized_silos, flow_gaps
    )

    return SiloAnalysis(
        organization=organization,
        analysis_date=datetime.now(),
        total_sources=len(data_sources),
        silos_detected=prioritized_silos,
        duplicates=duplicates,
        connectivity_score=connectivity_score,
        data_flow_gaps=flow_gaps,
        priority_actions=priority_actions,
        integration_roadmap=roadmap
    )
def _build_connectivity_graph(
    self,
    sources: List[DataSource]
) -> Dict[str, Set[str]]:
    """Build graph of source connections.

    Every entry in a source's ``connections`` becomes an undirected edge:
    the connection is recorded under the source's id and the source's id
    is recorded under the connection.

    Args:
        sources: Data sources whose ``id`` and ``connections`` are read.

    Returns:
        Adjacency mapping from node id to the set of ids it is linked to.
        Things to keep in mind when consuming it:

        * A source with an empty ``connections`` list that nobody else
          points at gets no key at all.
        * A connection id that matches no source in ``sources`` still
          becomes a node.
        * The object returned is a ``defaultdict(set)``, so reading a
          missing key with ``graph[key]`` inserts an empty set; use
          ``.get`` for a read that must not modify the graph.
    """
    graph = defaultdict(set)
    for source in sources:
        for connection in source.connections:
            # Record the edge in both directions so lookups work from
            # either end regardless of which side declared the link.
            graph[source.id].add(connection)
            graph[connection].add(source.id)
    return graph
def _detect_isolated_sources(
self,
sources: List[DataSource],
connectivity: Dict[str, Set[str]]
) -> List[DataSilo]: