Digital archive methodology
Patterns for building production-quality digital archives with AI-powered analysis and knowledge graph construction.
Archive architecture
Multi-source integration pattern
┌─────────────────┐    ┌──────────────────┐    ┌────────────────┐
│  OCR Pipeline   │    │   Web Scraping   │    │  Social Media  │
│  (newspapers)   │    │    (articles)    │    │ (transcripts)  │
└────────┬────────┘    └────────┬─────────┘    └───────┬────────┘
         │                      │                      │
         └──────────────────────┼──────────────────────┘
                                │
                    ┌───────────▼───────────┐
                    │    Unified Schema     │
                    │     (35+ fields)      │
                    └───────────┬───────────┘
                                │
         ┌──────────────────────┼──────────────────────┐
         │                      │                      │
┌────────▼────────┐  ┌──────────▼──────────┐   ┌───────▼───────┐
│  AI Enrichment  │  │  Entity Extraction  │   │  PDF Archive  │
│    (Gemini)     │  │  (Knowledge Graph)  │   │  (WCAG 2.1)   │
└────────┬────────┘  └──────────┬──────────┘   └───────┬───────┘
         │                      │                      │
         └──────────────────────┼──────────────────────┘
                                │
                    ┌───────────▼───────────┐
                    │     Google Sheets     │
                    │  (primary database)   │
                    └───────────┬───────────┘
                                │
                    ┌───────────▼───────────┐
                    │    Frontend Export    │
                    │      (JSON/CSV)       │
                    └───────────────────────┘
Unified schema design
from dataclasses import dataclass, field
from datetime import date
from typing import Optional
from enum import Enum
class ContentType(Enum):
    """Content type of an archive record.

    Each member's value is the human-readable label for that type.
    """

    ARTICLE = 'Article'
    VIDEO = 'Video'
    AUDIO = 'Audio'
    SOCIAL = 'Social Post'
    NEWSPAPER = 'Newspaper Article'
class ThematicCategory(Enum):
    """Thematic categories a record can be filed under.

    The values are the same label strings listed under
    ``TAXONOMY["thematic_categories"]``; keep the two in sync when adding
    or renaming a category.
    """

    PRESS_CRITICISM = 'Press & Media Criticism'
    JOURNALISM_THEORY = 'Journalism Theory'
    POLITICS = 'Politics & Democracy'
    TECHNOLOGY = 'Technology & Digital Media'
    EDUCATION = 'Journalism Education'
    AUDIENCE = 'Audience & Public Engagement'
class HistoricalEra(Enum):
    """Publication-period buckets, each valued as a ``'start-end'`` string.

    The buckets are not uniform in width: the first spans ten years
    (1990-1999), ``'2010-2015'`` spans six, the rest five, and the last
    one is open-ended (``'2026-present'``).
    """

    ERA_1990s = '1990-1999'
    # Alias following the ERA_<start>_<end> pattern used by the other
    # members. ERA_1990s is defined first, so it stays the canonical name
    # and the alias does not show up when iterating the enum.
    ERA_1990_99 = '1990-1999'
    ERA_2000_04 = '2000-2004'
    ERA_2005_09 = '2005-2009'
    ERA_2010_15 = '2010-2015'
    ERA_2016_20 = '2016-2020'
    ERA_2021_25 = '2021-2025'
    ERA_2026_PRESENT = '2026-present'
@dataclass
class ArchiveRecord:
    """One item in the archive, in the unified schema.

    Only ``id``, ``url`` and ``title`` are required. Every other field has
    a default, so a record can be created first and filled in later.
    List-valued fields use ``default_factory`` so instances never share a
    list object.
    """

    # --- Core identifiers (required) ---
    id: str  # Format: SOURCE-00001
    url: str
    title: str

    # --- Content ---
    author: Optional[str] = None
    publication_date: Optional[date] = None
    publication: Optional[str] = None
    content_type: ContentType = ContentType.ARTICLE
    text: str = ''

    # --- AI-enriched fields ---
    summary: Optional[str] = None
    pull_quote: Optional[str] = None
    categories: list[ThematicCategory] = field(default_factory=list)
    key_concepts: list[str] = field(default_factory=list)
    tags: list[str] = field(default_factory=list)
    era: Optional[HistoricalEra] = None
    scope: Optional[str] = None  # Theoretical, Commentary, Case Study, etc.

    # --- Entity references ---
    entities_mentioned: list[str] = field(default_factory=list)
    # NOTE(review): presumably IDs of other records — confirm with callers.
    related_to: list[str] = field(default_factory=list)
    responds_to: list[str] = field(default_factory=list)

    # --- Archive metadata ---
    pdf_url: Optional[str] = None
    transcript_url: Optional[str] = None
    verified: bool = False
    processing_status: str = 'pending'
    last_updated: Optional[date] = None
# Known source names (lowercase, single-spaced) mapped to their ID prefix.
# Built once at import instead of on every call.
_SOURCE_PREFIXES = {
    'nytimes': 'NYT',
    'columbia journalism review': 'CJR',
    'pressthink': 'PT',
    'twitter': 'TW',
    'youtube': 'YT',
    'newspaper': 'NEWS',
}


def generate_record_id(source: str, sequence: int) -> str:
    """Build a record ID of the form ``PREFIX-00001``.

    The source name is matched case-insensitively, ignoring leading,
    trailing and repeated whitespace. Unknown, empty or ``None`` sources
    get the ``MISC`` prefix, so sequence numbers must be unique across all
    of those sources for the resulting IDs to be unique.

    Args:
        source: Name of the originating source, e.g. ``'PressThink'``.
        sequence: Non-negative running number. It is zero-padded to five
            digits; larger numbers simply use more digits.

    Returns:
        The ID string, e.g. ``'PT-00042'``.

    Raises:
        ValueError: If ``sequence`` is negative, which would otherwise
            produce a malformed ID such as ``'PT--0001'``.
    """
    if sequence < 0:
        raise ValueError(f"sequence must be non-negative, got {sequence}")

    # split()/join collapses any run of whitespace to a single space and
    # trims both ends, so '  Columbia   Journalism Review ' still matches.
    normalized = ' '.join((source or '').split()).lower()
    prefix = _SOURCE_PREFIXES.get(normalized, 'MISC')
    return f"{prefix}-{sequence:05d}"
AI-powered categorization
Taxonomy-based classification
# pip install google-genai
# (the legacy `google-generativeai` SDK was deprecated in 2024 — the
# new `google-genai` package is the supported path. Imports below
# use the new shape.)
import os
from google import genai
from google.genai import types
import json
from typing import Optional
# Model used when a caller does not pick one: the current Gemini 2.5
# family. For 2026 production workloads the Gemini 3 family
# (gemini-3-flash, gemini-3-pro) is also available — change this string
# once the response shape has been verified against your taxonomy prompts.
DEFAULT_GEMINI_MODEL = 'gemini-2.5-flash'

# Shared module-level client, created at import time. The key is read
# from the GOOGLE_API_KEY environment variable (None when it is unset);
# construct your own genai.Client(api_key=...) to supply one explicitly.
_client = genai.Client(api_key=os.getenv('GOOGLE_API_KEY'))
# Controlled vocabulary sent to the model and used to validate its answer.
# It is serialized into the prompt as JSON, so key order and exact
# spelling of every label matter.
TAXONOMY = {
    # Broad subject areas.
    "thematic_categories": [
        "Press & Media Criticism",
        "Journalism Theory",
        "Politics & Democracy",
        "Technology & Digital Media",
        "Journalism Education",
        "Audience & Public Engagement",
    ],
    # Named ideas a piece may discuss.
    "key_concepts": [
        "The View from Nowhere",
        "Verification vs. Assertion",
        "Citizens vs. Consumers",
        "Public Journalism",
        "The Rosen Test",
        "Savvy vs. Naive",
        "Professional vs. Amateur",
        "Production vs. Distribution",
        "Trust vs. Transparency",
        "Horse Race Coverage",
        "Both Sides Journalism",
        "Audience Atomization",
        "The Church of the Savvy",
    ],
    # The kind of piece it is.
    "scope_types": [
        "Theoretical",
        "Commentary",
        "Historical",
        "Case Study",
        "Pedagogical",
        "Personal Reflection",
    ],
}
class ArchiveCategorizer:
def __init__(self, model: str = DEFAULT_GEMINI_MODEL, client: Optional[genai.Client] = None):
    """Store the model name and the client used for requests.

    Args:
        model: Model name passed to ``generate_content``; defaults to
            ``DEFAULT_GEMINI_MODEL``.
        client: ``genai.Client`` to send requests with. When ``None``,
            the shared module-level ``_client`` is used.
    """
    self.model = model
    # Explicit None check: an injected client is used even if the object
    # happens to be falsy (e.g. a test double).
    self.client = client if client is not None else _client
def categorize(self, record: ArchiveRecord) -> dict:
    """Ask the model to classify ``record`` and return the checked result.

    The prompt contains the record's title, author, date, the first 8000
    characters of its text, and the taxonomy (including the valid era
    buckets taken from ``HistoricalEra``). The parsed answer is then
    checked against that vocabulary.

    Args:
        record: The archive record to classify.

    Returns:
        The dict produced by ``_parse_response`` with these keys
        normalized:

        * ``categories`` / ``key_concepts``: lists containing only values
          present in ``TAXONOMY`` (empty if the model sent none, or sent
          something that is not a list).
        * ``scope``: one of ``TAXONOMY['scope_types']``, else ``None``.
        * ``era``: one of the ``HistoricalEra`` values, else ``None``.

        Other keys (``tags``, ``summary``, ``pull_quote``) are passed
        through unchecked.
    """
    # The model must choose from the same buckets the schema stores; a
    # free-form range such as "2000-2009" could not be mapped back to a
    # HistoricalEra member.
    valid_eras = [era.value for era in HistoricalEra]
    taxonomy = {**TAXONOMY, "eras": valid_eras}

    prompt = f"""Analyze this archival content and categorize it according to the taxonomy.
CONTENT:
Title: {record.title}
Author: {record.author or 'Unknown'}
Date: {record.publication_date or 'Unknown'}
Text (first 8000 chars):
{record.text[:8000]}
TAXONOMY:
{json.dumps(taxonomy, indent=2)}
Respond with JSON containing:
{{
"categories": ["category1", "category2"], // 1-3 from thematic_categories
"key_concepts": ["concept1", "concept2"], // 0-5 from key_concepts list
"scope": "scope_type", // one from scope_types
"era": "era", // one from eras
"tags": ["tag1", "tag2", "tag3", "tag4", "tag5"], // 5 contextual keywords
"summary": "2-3 sentence summary",
"pull_quote": "Most impactful quote from the text"
}}
IMPORTANT:
- Only use categories/concepts from the taxonomy
- Tags should be lowercase, hyphenated keywords
- Summary should capture the main argument
- Pull quote must be an exact excerpt from the text
"""
    # response_mime_type='application/json' asks for raw JSON without
    # Markdown code fences.
    # NOTE(review): the original comment says _parse_response() still
    # strips fences as a fallback for older models — not visible here.
    response = self.client.models.generate_content(
        model=self.model,
        contents=prompt,
        config=types.GenerateContentConfig(
            response_mime_type='application/json',
        ),
    )
    # NOTE(review): response.text is understood to be None when the reply
    # has no text part (e.g. a blocked response) — confirm against the
    # SDK reference. Passing '' keeps the argument a str, as
    # _parse_response's signature declares.
    result = self._parse_response(response.text or '')

    def keep_allowed(values, allowed):
        # A null or non-list answer from the model yields an empty list
        # instead of raising while iterating.
        if not isinstance(values, list):
            return []
        return [value for value in values if value in allowed]

    # Validate against taxonomy
    result['categories'] = keep_allowed(
        result.get('categories'), TAXONOMY['thematic_categories'])
    result['key_concepts'] = keep_allowed(
        result.get('key_concepts'), TAXONOMY['key_concepts'])

    # Single-valued fields: keep only a recognized value, otherwise None.
    scope = result.get('scope')
    result['scope'] = scope if scope in TAXONOMY['scope_types'] else None
    era = result.get('era')
    result['era'] = era if era in valid_eras else None
    return result
def _parse_response(self, text: str) ->