Data Source Audit for Construction
Overview
Perform comprehensive audits of construction data sources to identify silos, map data flows, assess quality, and plan integration strategies. These audits are essential for digital transformation and data-driven construction initiatives.
Business Case
Construction organizations typically have 10-50+ data sources:
- Project management systems
- Estimating software
- Scheduling tools
- Accounting/ERP systems
- BIM platforms
- Document management systems
- Field apps
- Spreadsheets
Note: This skill is vendor-agnostic and works with any data source. Product names mentioned elsewhere in examples are trademarks of their respective owners.
This skill helps:
- Discover all data sources
- Map data flows and dependencies
- Identify integration opportunities
- Prioritize data improvement efforts
Technical Implementation
from dataclasses import dataclass, field
from typing import List, Dict, Any, Optional, Set
from enum import Enum
from datetime import datetime
import pandas as pd
import json
class DataSourceType(Enum):
    """Kinds of data source the audit distinguishes.

    Each member's value is a lowercase string, so a member can be looked up
    from raw text with ``DataSourceType("api")``.
    """

    DATABASE = "database"
    API = "api"
    FILE_SHARE = "file_share"
    CLOUD_APP = "cloud_app"
    SPREADSHEET = "spreadsheet"
    LEGACY_SYSTEM = "legacy_system"
    IOT_SENSOR = "iot_sensor"
    MANUAL_ENTRY = "manual_entry"
class DataDomain(Enum):
    """Business domains a data source can hold data for.

    Each member's value is a lowercase string, so a member can be looked up
    from raw text with ``DataDomain("cost")``.
    """

    COST = "cost"
    SCHEDULE = "schedule"
    BIM = "bim"
    DOCUMENT = "document"
    FIELD = "field"
    SAFETY = "safety"
    QUALITY = "quality"
    HR = "hr"
    ACCOUNTING = "accounting"
    PROCUREMENT = "procurement"
@dataclass
class DataSource:
    """Inventory record for a single data source under audit.

    The first twelve fields are required; everything from the quality
    scores onward has a default, so a source can be registered before it
    has been assessed. List-valued fields use ``default_factory`` so each
    instance gets its own list.
    """

    name: str
    source_type: DataSourceType
    domains: List[DataDomain]
    owner: str
    department: str
    description: str

    # Technical details
    technology: str
    location: str  # cloud, on-prem, hybrid
    access_method: str  # API, ODBC, file export, manual

    # Data characteristics
    update_frequency: str  # real-time, daily, weekly, monthly, ad-hoc
    data_volume: str  # small, medium, large
    retention_period: str

    # Quality metrics (default 0.0 until assessed; scale is not defined here)
    completeness_score: float = 0.0
    accuracy_score: float = 0.0
    timeliness_score: float = 0.0

    # Integration status
    integrations: List[str] = field(default_factory=list)
    is_master: bool = False  # Is this the master source for any entity?
    master_for: List[str] = field(default_factory=list)

    # Issues
    known_issues: List[str] = field(default_factory=list)

    # Metadata
    last_audit_date: Optional[datetime] = None
    audit_notes: str = ""
@dataclass
class DataFlow:
    """A data movement from one named source to another.

    ``source`` and ``target`` are source names (plain strings), not
    ``DataSource`` objects. All fields are required.
    """

    source: str
    target: str
    flow_type: str  # push, pull, bidirectional, manual
    frequency: str
    entities: List[str]  # What data entities flow
    transformation: str  # none, simple, complex
    status: str  # active, planned, deprecated
@dataclass
class DataSilo:
    """An audit finding: a group of sources that form a data silo.

    ``sources`` holds source names (plain strings). All fields are required.
    """

    name: str
    sources: List[str]
    impact: str  # high, medium, low
    description: str
    resolution_options: List[str]
class DataSourceAuditor:
"""Audit and analyze construction data sources."""
def __init__(self):
self.sources: Dict[str, DataSource] = {}
self.flows: List[DataFlow] = []
self.silos: List[DataSilo] = []
def add_source(self, source: DataSource):
"""Register a data source."""
self.sources[source.name] = source
def add_flow(self, flow: DataFlow):
"""Register a data flow between sources."""
self.flows.append(flow)
def discover_sources_from_survey(self, survey_responses: List[Dict]) -> List[DataSource]:
    """Create and register data sources from survey responses.

    Each response must provide the keys ``system_name``, ``type``,
    ``domains``, ``owner``, ``department``, ``description``, ``technology``,
    ``location``, ``access_method``, ``update_frequency``, ``data_volume``
    and ``retention_period``. ``type`` must be a ``DataSourceType`` value and
    every entry of ``domains`` a ``DataDomain`` value.

    Returns:
        The newly created sources, in the order of ``survey_responses``.

    Raises:
        KeyError: a response is missing one of the required keys.
        ValueError: ``type`` or a domain is not a known enum value.

    Sources are registered one at a time, so responses that precede a
    failing one stay registered.
    """
    sources = []
    for response in survey_responses:
        source = DataSource(
            name=response['system_name'],
            source_type=DataSourceType(response['type']),
            domains=[DataDomain(d) for d in response['domains']],
            owner=response['owner'],
            department=response['department'],
            description=response['description'],
            technology=response['technology'],
            location=response['location'],
            access_method=response['access_method'],
            update_frequency=response['update_frequency'],
            data_volume=response['data_volume'],
            retention_period=response['retention_period'],
        )
        sources.append(source)
        self.add_source(source)
    return sources
def identify_silos(self) -> List[DataSilo]:
    """Identify data silos based on integration analysis.

    Three checks are run over the registered sources and flows:

    1. Sources with an empty ``integrations`` list (manual-entry sources
       excluded) are grouped into one high-impact "Isolated Systems" silo.
    2. Every domain held by more than one source, none of which has
       ``is_master`` set, yields a medium-impact "No Master" silo.
    3. Every pair of sources joined by exactly one non-bidirectional flow
       and sharing at least one domain yields a low-impact "One-way flow"
       silo.

    Returns:
        The silos found. The same list replaces ``self.silos``.
    """
    silos = []

    # Find sources with no integrations
    isolated_sources = [
        name for name, source in self.sources.items()
        if not source.integrations and source.source_type != DataSourceType.MANUAL_ENTRY
    ]
    if isolated_sources:
        silos.append(DataSilo(
            name="Isolated Systems",
            sources=isolated_sources,
            impact="high",
            description="Systems with no integrations, requiring manual data transfer",
            resolution_options=[
                "Implement API integration",
                "Set up automated file exports",
                "Migrate to integrated platform"
            ]
        ))

    # Find duplicate data domains without master
    domain_sources: Dict[DataDomain, List[str]] = {}
    for name, source in self.sources.items():
        # dict.fromkeys de-duplicates while keeping order, so a source that
        # lists the same domain twice is not counted as two sources.
        for domain in dict.fromkeys(source.domains):
            domain_sources.setdefault(domain, []).append(name)

    for domain, sources in domain_sources.items():
        if len(sources) > 1:
            # Check if any is designated master
            masters = [s for s in sources if self.sources[s].is_master]
            if not masters:
                silos.append(DataSilo(
                    name=f"No Master for {domain.value}",
                    sources=sources,
                    impact="medium",
                    description=f"Multiple sources for {domain.value} data without designated master",
                    resolution_options=[
                        "Designate master data source",
                        "Implement MDM solution",
                        "Create data reconciliation process"
                    ]
                ))

    # Find one-way flows that should be bidirectional.
    # Flows are grouped by unordered endpoint pair so A->B and B->A land in
    # the same bucket.
    flow_pairs = {}
    for flow in self.flows:
        key = tuple(sorted([flow.source, flow.target]))
        flow_pairs.setdefault(key, []).append(flow)

    for (s1, s2), flows in flow_pairs.items():
        if len(flows) != 1 or flows[0].flow_type == 'bidirectional':
            continue
        only_flow = flows[0]
        first = self.sources.get(s1)
        second = self.sources.get(s2)
        if first is None or second is None:
            # The flow names a source that was never registered; its
            # domains are unknown, so overlap cannot be evaluated.
            continue
        # Check if bidirectional would make sense
        if set(first.domains) & set(second.domains):  # Overlapping domains
            silos.append(DataSilo(
                # Label with the flow's real direction; the sorted pair
                # (s1, s2) may be the reverse of it.
                name=f"One-way flow: {only_flow.source} -> {only_flow.target}",
                sources=[s1, s2],
                impact="low",
                description="Data flows one direction only between systems with overlapping domains",
                resolution_options=[
                    "Evaluate need for bidirectional sync",
                    "Implement change data capture"
                ]
            ))

    self.silos = silos
    return silos
def assess_source_quality(self, source_name: str, sample_data: pd.DataFrame) -> Dict[str, float]:
"""Assess data quality for a source based on sample data."""
if source_name not in self.sources:
raise ValueError(f"Unknown sour