Weaviate Data Ingestion Skill

This skill helps you upload data to your local Weaviate collections efficiently, handling everything from single objects to large batch imports.

Important Note

This skill is designed for LOCAL Weaviate instances only. Ensure you have Weaviate running locally in Docker before using this skill.

Purpose

Add data to your local Weaviate collections with automatic vectorization, proper error handling, and progress tracking.

When to Use This Skill

- User wants to add data to a collection
- User needs to upload documents, articles, or records
- User has images or multi-modal content to ingest
- User wants to import data from files (JSON, CSV, text)
- User asks about batch uploading or bulk data import

Prerequisites Check

Claude should verify these prerequisites before proceeding:

✅ weaviate-local-setup completed - Python environment and dependencies installed
✅ weaviate-connection completed - Successfully connected to Weaviate
✅ weaviate-collection-manager used - Target collection exists
✅ Docker container running - Weaviate is accessible at localhost:8080

If any prerequisites are missing, Claude should:

1. Load the required prerequisite skill first
2. Guide the user through the setup
3. Then return to this skill

Prerequisites

- Local Weaviate running in Docker (see weaviate-local-setup skill)
- Active Weaviate connection (use weaviate-connection skill first)
- Existing collection (use weaviate-collection-manager skill to create)
- Python weaviate-client library installed

Operations

1. Add a Single Object

import weaviate
from weaviate.classes.data import DataObject

# `client` must already be connected before this snippet runs
collection = client.collections.get("Articles")

# Build the property payload up front, then insert it as one object
article_properties = {
    "title": "Introduction to Vector Databases",
    "content": "Vector databases enable semantic search by storing embeddings...",
    "author": "John Doe",
    "publishDate": "2025-01-20T10:00:00Z",
}
uuid = collection.data.insert(properties=article_properties)

print(f"✅ Object created with UUID: {uuid}")

2. Add Object with Custom Vector

# Bring-your-own-embeddings: supply the vector explicitly on insert
collection = client.collections.get("CustomEmbeddings")

record_properties = {
    "text": "Your content here",
    "metadata": "Additional info",
}
# The trailing `...` is a placeholder for the rest of your pre-computed embedding
embedding = [0.1, 0.2, 0.3, ...]

uuid = collection.data.insert(properties=record_properties, vector=embedding)

3. Batch Upload Multiple Objects

Simple Batch Insert

from weaviate.util import generate_uuid5

collection = client.collections.get("Articles")

# Prepare your data
articles = [
    {
        "title": "AI in 2025",
        "content": "Artificial intelligence continues to evolve...",
        "author": "Jane Smith",
        "publishDate": "2025-01-15T00:00:00Z"
    },
    {
        "title": "Vector Search Explained",
        "content": "Vector search allows you to find similar items...",
        "author": "Bob Johnson",
        "publishDate": "2025-01-18T00:00:00Z"
    },
    # ... more articles
]

# Batch insert with context manager (recommended)
with collection.batch.dynamic() as batch:
    for article in articles:
        batch.add_object(
            properties=article,
            # Optional: provide deterministic UUID based on content
            uuid=generate_uuid5(article["title"]),
        )

# Objects rejected during the batch do not raise; the client collects them on
# collection.batch.failed_objects, so check it before reporting success.
failed = collection.batch.failed_objects
if failed:
    print(f"❌ {len(failed)} of {len(articles)} articles failed to import")
else:
    print(f"✅ Inserted {len(articles)} articles")

Batch with Error Handling

from weaviate.util import generate_uuid5

collection = client.collections.get("Articles")
# Client-side failures: records for which add_object() itself raised
failed_objects = []

with collection.batch.dynamic() as batch:
    for i, article in enumerate(articles):
        try:
            batch.add_object(
                properties=article,
                uuid=generate_uuid5(article.get('title', str(i)))
            )
        except Exception as e:
            # Deliberately broad: one malformed record must not abort the whole import
            failed_objects.append({"index": i, "error": str(e), "data": article})
            print(f"⚠️  Failed to add article {i}: {str(e)}")

# Check batch results.
# Failures reported by the batch are exposed on collection.batch (not on the
# inner `batch` object) once the context manager has exited.
server_failures = collection.batch.failed_objects
if server_failures or failed_objects:
    # Count both kinds of failure so success is never reported after an error
    print(f"❌ {len(server_failures) + len(failed_objects)} objects failed")
    for failed in server_failures:
        print(f"   Error: {failed.message}")
else:
    print(f"✅ All {len(articles)} objects inserted successfully")

4. Upload Data from JSON File

import json

# Read JSON file. JSON text is UTF-8, so do not rely on the platform's
# default encoding (which differs between operating systems).
with open("articles.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# The import loop expects a top-level array with one property dict per object;
# iterating a JSON object would silently yield its keys instead.
if not isinstance(data, list):
    raise ValueError("articles.json must contain a JSON array of objects")

collection = client.collections.get("Articles")

# Batch insert
with collection.batch.dynamic() as batch:
    for item in data:
        batch.add_object(properties=item)

# Rejected objects do not raise; they are collected on collection.batch.failed_objects
failed = collection.batch.failed_objects
if failed:
    print(f"❌ {len(failed)} of {len(data)} objects failed to import")
else:
    print(f"✅ Imported {len(data)} objects from JSON")

5. Upload Data from CSV File

import csv
from datetime import datetime, timezone

collection = client.collections.get("Articles")

# newline="" lets the csv module handle line endings inside quoted fields itself.
# "utf-8-sig" reads plain UTF-8 and also strips a leading BOM (common in
# spreadsheet exports) that would otherwise become part of the first header name.
with open("articles.csv", "r", newline="", encoding="utf-8-sig") as csvfile:
    reader = csv.DictReader(csvfile)

    with collection.batch.dynamic() as batch:
        for row in reader:
            published = datetime.fromisoformat(row["date"])
            if published.tzinfo is None:
                # Weaviate dates must carry a timezone offset (RFC 3339).
                # Assumes naive CSV timestamps are UTC — adjust if yours are local time.
                published = published.replace(tzinfo=timezone.utc)

            # Transform CSV row to match schema
            batch.add_object(
                properties={
                    "title": row["title"],
                    "content": row["content"],
                    "author": row["author"],
                    "publishDate": published.isoformat(),
                }
            )

# Rejected rows do not raise; they are collected on collection.batch.failed_objects
failed = collection.batch.failed_objects
if failed:
    print(f"❌ {len(failed)} rows failed to import")
else:
    print("✅ CSV import complete")

6. Upload Images (Multi-modal Collections)

⚠️ IMPORTANT: Verify Multimodal Vectorizer First

Before uploading images, verify your collection uses a multimodal vectorizer:

# Check collection configuration
collection = client.collections.get("ProductCatalog")
config = collection.config.get()

# Get vectorizer info.
# config.vectorizer is an enum member or plain string (or None), not an object
# with a `.config` attribute. Prefer the enum's value (e.g. "multi2vec-clip"),
# because str() of the enum member does not contain the hyphenated module name.
raw_vectorizer = config.vectorizer
vectorizer = str(getattr(raw_vectorizer, "value", raw_vectorizer))
# NOTE(review): collections defined with named vectors may report their
# vectorizers per vector rather than here — confirm for your schema.

# List of multimodal-compatible vectorizers
MULTIMODAL_VECTORIZERS = [
    'multi2vec-clip',
    'multi2vec-bind',
    'img2vec-neural',
]

# Validate vectorizer
is_multimodal = any(mv in vectorizer.lower() for mv in MULTIMODAL_VECTORIZERS)

if is_multimodal:
    print(f"✅ Collection uses multimodal vectorizer: {vectorizer}")
else:
    print(f"❌ Warning: Collection uses '{vectorizer}' which may not support images properly")
    print(f"   Recommended vectorizers for images: {', '.join(MULTIMODAL_VECTORIZERS)}")
    print(f"   Images will be stored but may not be vectorized correctly for semantic search")

    # Prompt user to continue
    response = input("\nContinue anyway? (y/n): ")
    if response.strip().lower() != 'y':
        print("Aborted. Use weaviate-collection-manager skill to create a multimodal collection.")
        # exit() is a helper added by the `site` module and may be missing;
        # raising SystemExit works in every interpreter configuration.
        raise SystemExit(0)

Single Image Upload

import base64
from pathlib import Path

# Target collection for the product record; `client` must already be connected
collection = client.collections.get("ProductCatalog")

def encode_image(image_path: str) -> str:
    """Return the contents of the file at *image_path* as a base64 text string."""
    with open(image_path, mode="rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")

# Encode the photo once, then attach it to the product record
image_base64 = encode_image("product_photo.jpg")

product_properties = {
    "name": "Wireless Headphones",
    "description": "Premium noise-cancelling headphones",
    "image": image_base64,  # Base64 encoded
    "price": 299.99,
    "category": "Electronics",
}
collection.data.insert(properties=product_properties)

print("✅ Product with image uploaded")

Batch Upload Multiple Images

from pathlib import Path
import base64

collection = client.collections.get("ProductCatalog")

# Directory with product images
image_dir = Path("products/")
products = [
    {"name": "Headphones", "price": 299.99, "image_file": "headphones.jpg"},
    {"name": "Laptop", "price": 1299.99, "image_file": "laptop.jpg"},
    {"name": "Phone", "price": 799.99, "image_file": "phone.jpg"},
]

with collection.batch.dynamic() as batch:
    for product in products:
        # Encode image
        image_path = image_dir / product["image_file"]
        with open(image_path, "rb") as img:

weaviate-data-ingestion

How to add

Drop this on your repo README

Related skills

learn-codebase

remove-deadcode

sendgrid-automation

seo

Get new Marketing skills every Monday