Digital archive methodology

Name: digital-archive
Rating: 5 (232 reviews)
Author: jamditis

Patterns for building production-quality digital archives with AI-powered analysis and knowledge graph construction.

Archive architecture

Multi-source integration pattern

┌─────────────────┐    ┌──────────────────┐    ┌────────────────┐
│  OCR Pipeline   │    │  Web Scraping    │    │  Social Media  │
│  (newspapers)   │    │  (articles)      │    │  (transcripts) │
└────────┬────────┘    └────────┬─────────┘    └───────┬────────┘
         │                      │                      │
         └──────────────────────┼──────────────────────┘
                                │
                    ┌───────────▼───────────┐
                    │  Unified Schema       │
                    │  (35+ fields)         │
                    └───────────┬───────────┘
                                │
         ┌──────────────────────┼──────────────────────┐
         │                      │                      │
┌────────▼────────┐  ┌──────────▼──────────┐  ┌───────▼───────┐
│  AI Enrichment  │  │  Entity Extraction  │  │  PDF Archive  │
│  (Gemini)       │  │  (Knowledge Graph)  │  │  (WCAG 2.1)   │
└────────┬────────┘  └──────────┬──────────┘  └───────┬───────┘
         │                      │                      │
         └──────────────────────┼──────────────────────┘
                                │
                    ┌───────────▼───────────┐
                    │  Google Sheets        │
                    │  (primary database)   │
                    └───────────┬───────────┘
                                │
                    ┌───────────▼───────────┐
                    │  Frontend Export      │
                    │  (JSON/CSV)           │
                    └───────────────────────┘

Unified schema design

from dataclasses import dataclass, field
from datetime import date
from typing import Optional
from enum import Enum

class ContentType(Enum):
    """Kind of media an archive record represents.

    Each member's value is a human-readable label string. ``ARTICLE`` is
    the default for ``ArchiveRecord.content_type``.
    """

    ARTICLE = 'Article'
    VIDEO = 'Video'
    AUDIO = 'Audio'
    SOCIAL = 'Social Post'
    NEWSPAPER = 'Newspaper Article'

class ThematicCategory(Enum):
    """Top-level thematic categories a record can be filed under.

    The values are the same label strings listed under
    ``TAXONOMY["thematic_categories"]``; keep the two in sync when adding
    or renaming a category.
    """

    PRESS_CRITICISM = 'Press & Media Criticism'
    JOURNALISM_THEORY = 'Journalism Theory'
    POLITICS = 'Politics & Democracy'
    TECHNOLOGY = 'Technology & Digital Media'
    EDUCATION = 'Journalism Education'
    AUDIENCE = 'Audience & Public Engagement'

class HistoricalEra(Enum):
    """Publication-period buckets used for ``ArchiveRecord.era``.

    Values are 'START-END' year-range strings. The buckets are not of
    uniform width: the first spans a full decade, later ones span five or
    six years, and the last is open-ended.
    """

    ERA_1990s = '1990-1999'
    ERA_2000_04 = '2000-2004'
    ERA_2005_09 = '2005-2009'
    ERA_2010_15 = '2010-2015'
    ERA_2016_20 = '2016-2020'
    ERA_2021_25 = '2021-2025'
    # Open-ended bucket for anything from 2026 onward.
    ERA_2026_PRESENT = '2026-present'

@dataclass
class ArchiveRecord:
    """One item in the archive, in the unified schema shared by all sources.

    Only ``id``, ``url`` and ``title`` are required; every other field has
    a default so a record can be created at ingestion time and filled in
    by later enrichment steps. List fields use ``default_factory`` so each
    instance gets its own list.
    """

    # Core identifiers
    id: str                              # Format: SOURCE-00001 (see generate_record_id)
    url: str
    title: str

    # Content
    author: Optional[str] = None
    publication_date: Optional[date] = None
    publication: Optional[str] = None
    content_type: ContentType = ContentType.ARTICLE
    text: str = ''                       # Empty string (not None) when no text is available

    # AI-enriched fields
    summary: Optional[str] = None
    pull_quote: Optional[str] = None
    categories: list[ThematicCategory] = field(default_factory=list)
    key_concepts: list[str] = field(default_factory=list)
    tags: list[str] = field(default_factory=list)
    era: Optional[HistoricalEra] = None
    scope: Optional[str] = None  # Theoretical, Commentary, Case Study, etc.

    # Entity references
    entities_mentioned: list[str] = field(default_factory=list)
    # NOTE(review): presumably these two hold ids of other records — confirm.
    related_to: list[str] = field(default_factory=list)
    responds_to: list[str] = field(default_factory=list)

    # Archive metadata
    pdf_url: Optional[str] = None
    transcript_url: Optional[str] = None
    verified: bool = False
    processing_status: str = 'pending'   # Free-form status string; new records start as 'pending'
    last_updated: Optional[date] = None

# Known source names (lowercase, single-spaced) mapped to their ID prefix.
# Built once at import time instead of on every call.
_SOURCE_PREFIXES: dict[str, str] = {
    'nytimes': 'NYT',
    'columbia journalism review': 'CJR',
    'pressthink': 'PT',
    'twitter': 'TW',
    'youtube': 'YT',
    'newspaper': 'NEWS',
}


def generate_record_id(source: str, sequence: int) -> str:
    """Build a record ID of the form ``PREFIX-00001``.

    The source name is matched case-insensitively, ignoring leading,
    trailing and repeated internal whitespace, so values such as
    ``' NYTimes '`` resolve to the same prefix as ``'nytimes'``.

    Args:
        source: Name of the originating source.
        sequence: Sequence number within that source; zero-padded to at
            least five digits (larger numbers are not truncated).

    Returns:
        ``'<prefix>-<sequence>'``, using ``'MISC'`` as the prefix for any
        source that is not in the known-prefix table. Note that all
        unknown sources share the ``MISC`` prefix, so IDs are only unique
        if the caller keeps sequence numbers unique within each prefix.
    """
    # Collapse whitespace so stray spaces from scraped or spreadsheet data
    # do not push a known source into the MISC bucket.
    normalized = ' '.join(source.lower().split())
    prefix = _SOURCE_PREFIXES.get(normalized, 'MISC')
    return f"{prefix}-{sequence:05d}"

AI-powered categorization

Taxonomy-based classification

# pip install google-genai
# (the legacy `google-generativeai` SDK was deprecated in 2024 — the
# new `google-genai` package is the supported path. Imports below
# use the new shape.)
import os
from google import genai
from google.genai import types
import json
from typing import Optional

# Model used when a caller does not choose one: the current Gemini 2.5
# family. For 2026 production workloads the Gemini 3 family
# (gemini-3-flash, gemini-3-pro) is also available — switch the string
# below only after checking that the response shape still matches your
# taxonomy prompts.
DEFAULT_GEMINI_MODEL = 'gemini-2.5-flash'

# One shared client for the whole module, created at import time. The key
# is read from the GOOGLE_API_KEY environment variable; callers that need
# a different key can build their own client with api_key=... instead.
_client = genai.Client(api_key=os.getenv('GOOGLE_API_KEY'))

# Closed vocabularies for classification. The assembled TAXONOMY dict is
# serialized into the categorization prompt, and its lists are used to
# filter the model's answer, so every entry must be the exact label text.

# Broad subject areas.
_THEMATIC_CATEGORIES = [
    "Press & Media Criticism",
    "Journalism Theory",
    "Politics & Democracy",
    "Technology & Digital Media",
    "Journalism Education",
    "Audience & Public Engagement",
]

# Named ideas a piece may engage with.
_KEY_CONCEPTS = [
    "The View from Nowhere",
    "Verification vs. Assertion",
    "Citizens vs. Consumers",
    "Public Journalism",
    "The Rosen Test",
    "Savvy vs. Naive",
    "Professional vs. Amateur",
    "Production vs. Distribution",
    "Trust vs. Transparency",
    "Horse Race Coverage",
    "Both Sides Journalism",
    "Audience Atomization",
    "The Church of the Savvy",
]

# The kind of writing a piece is.
_SCOPE_TYPES = [
    "Theoretical",
    "Commentary",
    "Historical",
    "Case Study",
    "Pedagogical",
    "Personal Reflection",
]

TAXONOMY = {
    "thematic_categories": _THEMATIC_CATEGORIES,
    "key_concepts": _KEY_CONCEPTS,
    "scope_types": _SCOPE_TYPES,
}

class ArchiveCategorizer:
    def __init__(
        self,
        model: str = DEFAULT_GEMINI_MODEL,
        client: Optional[genai.Client] = None,
    ):
        """Create a categorizer bound to a model name and a Gemini client.

        Args:
            model: Model identifier sent with every request; defaults to
                ``DEFAULT_GEMINI_MODEL``.
            client: Client used to issue requests. When omitted (or
                falsy), the module-level ``_client`` is used, which lets
                callers and tests inject their own client.
        """
        self.model = model
        self.client = client or _client

    def categorize(self, record: ArchiveRecord) -> dict:
        prompt = f"""Analyze this archival content and categorize it according to the taxonomy.

CONTENT:
Title: {record.title}
Author: {record.author or 'Unknown'}
Date: {record.publication_date or 'Unknown'}
Text (first 8000 chars):
{record.text[:8000]}

TAXONOMY:
{json.dumps(TAXONOMY, indent=2)}

Respond with JSON containing:
{{
  "categories": ["category1", "category2"],  // 1-3 from thematic_categories
  "key_concepts": ["concept1", "concept2"],  // 0-5 from key_concepts list
  "scope": "scope_type",                     // one from scope_types
  "era": "YYYY-YYYY",                        // decade range
  "tags": ["tag1", "tag2", "tag3", "tag4", "tag5"],  // 5 contextual keywords
  "summary": "2-3 sentence summary",
  "pull_quote": "Most impactful quote from the text"
}}

IMPORTANT:
- Only use categories/concepts from the taxonomy
- Tags should be lowercase, hyphenated keywords
- Summary should capture the main argument
- Pull quote must be an exact excerpt from the text
"""

        # response_mime_type='application/json' makes Gemini emit raw
        # JSON without ```json fences — the markdown-stripping fallback
        # in _parse_response() is kept as defense-in-depth for older
        # models that still wrap output.
        response = self.client.models.generate_content(
            model=self.model,
            contents=prompt,
            config=types.GenerateContentConfig(
                response_mime_type='application/json',
            ),
        )
        result = self._parse_response(response.text)

        # Validate against taxonomy
        result['categories'] = [c for c in result.get('categories', [])
                               if c in TAXONOMY['thematic_categories']]
        result['key_concepts'] = [c for c in result.get('key_concepts', [])
                                  if c in TAXONOMY['key_concepts']]

        return result

    def _parse_response(self, text: str) ->

digital-archive

Cómo agregar

Pega en el README de tu repo

Skills relacionadas

doc-coauthoring

algorithmic-art

seo-aeo-blog-writer

wordpress-centric-high-seo-optimized-blogwriting-skill

Recibe nuevas skills de Escrita e Conteúdo todos los lunes

Digital archive methodology

Archive architecture

Multi-source integration pattern

Unified schema design

AI-powered categorization

Taxonomy-based classification

Comentarios · Sin comentarios