data-validationValidate data with schemas across languages and formats. Use when defining JSON Schema, using Zod (TypeScript) or Pydantic (Python), validating API request/response shapes, checking CSV/JSON data integrity, or setting up data contracts between services.
Install via ClawdBot CLI:
clawdbot install gitgoodordietrying/data-validationSchema-based data validation across languages and formats. Covers JSON Schema, Zod (TypeScript), Pydantic (Python), API boundary validation, data contracts, and integrity checking.
{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"type": "object",
"required": ["name", "email", "age"],
"properties": {
"name": {
"type": "string",
"minLength": 1,
"maxLength": 100
},
"email": {
"type": "string",
"format": "email"
},
"age": {
"type": "integer",
"minimum": 0,
"maximum": 150
},
"role": {
"type": "string",
"enum": ["user", "admin", "moderator"],
"default": "user"
},
"tags": {
"type": "array",
"items": { "type": "string" },
"uniqueItems": true,
"maxItems": 10
},
"address": {
"type": "object",
"properties": {
"street": { "type": "string" },
"city": { "type": "string" },
"zip": { "type": "string", "pattern": "^\\d{5}(-\\d{4})?$" }
},
"required": ["street", "city"]
}
},
"additionalProperties": false
}
// Nullable field
{ "type": ["string", "null"] }
// Union type (string or number)
{ "oneOf": [{ "type": "string" }, { "type": "number" }] }
// Conditional: if role is admin, require permissions
{
"if": { "properties": { "role": { "const": "admin" } } },
"then": { "required": ["permissions"] }
}
// Pattern properties (dynamic keys)
{
"type": "object",
"patternProperties": {
"^env_": { "type": "string" }
}
}
// Reusable definitions
{
"$defs": {
"address": {
"type": "object",
"properties": {
"street": { "type": "string" },
"city": { "type": "string" }
}
}
},
"properties": {
"home": { "$ref": "#/$defs/address" },
"work": { "$ref": "#/$defs/address" }
}
}
# Using ajv-cli (Node.js)
npx ajv-cli validate -s schema.json -d data.json
# Using jsonschema (Python)
pip install jsonschema
python3 -c "
import json, jsonschema
schema = json.load(open('schema.json'))
data = json.load(open('data.json'))
jsonschema.validate(data, schema)
print('Valid')
"
# Validate multiple files
for f in data/*.json; do
npx ajv-cli validate -s schema.json -d "$f" 2>&1 || echo "INVALID: $f"
done
import { z } from 'zod';
// Primitives
const nameSchema = z.string().min(1).max(100);
const ageSchema = z.number().int().min(0).max(150);
const emailSchema = z.string().email();
const urlSchema = z.string().url();
// Objects
const userSchema = z.object({
name: z.string().min(1),
email: z.string().email(),
age: z.number().int().min(0),
role: z.enum(['user', 'admin', 'moderator']).default('user'),
tags: z.array(z.string()).max(10).default([]),
createdAt: z.string().datetime(),
});
// Infer TypeScript type from schema
type User = z.infer<typeof userSchema>;
// { name: string; email: string; age: number; role: "user" | "admin" | "moderator"; ... }
// Validate
const result = userSchema.safeParse(data);
if (result.success) {
console.log(result.data); // typed as User
} else {
console.log(result.error.issues); // validation errors
}
// Parse (throws on invalid)
const user = userSchema.parse(data);
// Optional and nullable
const schema = z.object({
name: z.string(),
nickname: z.string().optional(), // string | undefined
middleName: z.string().nullable(), // string | null
suffix: z.string().nullish(), // string | null | undefined
});
// Transforms (validate then transform)
const dateSchema = z.string().datetime().transform(s => new Date(s));
const trimmed = z.string().trim().toLowerCase();
const parsed = z.string().transform(s => parseInt(s, 10)).pipe(z.number().int());
// Discriminated unions (tagged unions)
const eventSchema = z.discriminatedUnion('type', [
z.object({ type: z.literal('click'), x: z.number(), y: z.number() }),
z.object({ type: z.literal('keypress'), key: z.string() }),
z.object({ type: z.literal('scroll'), delta: z.number() }),
]);
// Recursive types
const categorySchema: z.ZodType<Category> = z.object({
name: z.string(),
children: z.lazy(() => z.array(categorySchema)).default([]),
});
// Refinements (custom validation)
const passwordSchema = z.string()
.min(8)
.refine(s => /[A-Z]/.test(s), 'Must contain uppercase')
.refine(s => /[0-9]/.test(s), 'Must contain digit')
.refine(s => /[^a-zA-Z0-9]/.test(s), 'Must contain special character');
// Extend/merge objects
const baseUser = z.object({ name: z.string(), email: z.string() });
const adminUser = baseUser.extend({ permissions: z.array(z.string()) });
// Pick/omit
const createUser = userSchema.omit({ createdAt: true });
const userSummary = userSchema.pick({ name: true, email: true });
// Passthrough (allow extra fields)
const flexible = userSchema.passthrough();
// Strip unknown fields
const strict = userSchema.strict(); // Error on extra fields
// Express middleware
import { z } from 'zod';
const createUserBody = z.object({
name: z.string().min(1),
email: z.string().email(),
password: z.string().min(8),
});
app.post('/api/users', (req, res) => {
const result = createUserBody.safeParse(req.body);
if (!result.success) {
return res.status(400).json({ errors: result.error.issues });
}
const { name, email, password } = result.data;
// ... create user
});
// Query parameter validation
const listParams = z.object({
page: z.coerce.number().int().min(1).default(1),
limit: z.coerce.number().int().min(1).max(100).default(20),
sort: z.enum(['newest', 'oldest', 'name']).default('newest'),
q: z.string().optional(),
});
app.get('/api/users', (req, res) => {
const params = listParams.parse(req.query);
// params.page is a number, params.sort is typed
});
from pydantic import BaseModel, Field, EmailStr, field_validator
from typing import Optional
from datetime import datetime
from enum import Enum
class Role(str, Enum):
USER = "user"
ADMIN = "admin"
MODERATOR = "moderator"
class Address(BaseModel):
street: str
city: str
zip_code: str = Field(pattern=r"^\d{5}(-\d{4})?$")
class User(BaseModel):
name: str = Field(min_length=1, max_length=100)
email: EmailStr
age: int = Field(ge=0, le=150)
role: Role = Role.USER
tags: list[str] = Field(default_factory=list, max_length=10)
address: Optional[Address] = None
created_at: datetime = Field(default_factory=datetime.now)
@field_validator("name")
@classmethod
def name_must_not_be_empty(cls, v: str) -> str:
if not v.strip():
raise ValueError("name cannot be blank")
return v.strip()
# Validate
user = User(name="Alice", email="alice@example.com", age=30)
print(user.model_dump()) # dict
print(user.model_dump_json()) # JSON string
# Validation errors
try:
User(name="", email="bad", age=-1)
except Exception as e:
print(e) # Detailed validation errors
from pydantic import BaseModel, model_validator, ConfigDict
from typing import Literal, Union, Annotated
# Discriminated union
class ClickEvent(BaseModel):
type: Literal["click"]
x: int
y: int
class KeypressEvent(BaseModel):
type: Literal["keypress"]
key: str
Event = Annotated[Union[ClickEvent, KeypressEvent], Field(discriminator="type")]
# Model-level validation (cross-field)
class DateRange(BaseModel):
start: datetime
end: datetime
@model_validator(mode="after")
def end_after_start(self):
if self.end <= self.start:
raise ValueError("end must be after start")
return self
# Strict mode (no type coercion)
class StrictUser(BaseModel):
model_config = ConfigDict(strict=True)
age: int # "30" will be rejected, must be int 30
# Alias (accept different field names in input)
class APIResponse(BaseModel):
user_name: str = Field(alias="userName")
created_at: datetime = Field(alias="createdAt")
model_config = ConfigDict(populate_by_name=True)
# Computed fields
from pydantic import computed_field
class Order(BaseModel):
items: list[dict]
tax_rate: float = 0.08
@computed_field
@property
def total(self) -> float:
subtotal = sum(i.get("price", 0) * i.get("qty", 1) for i in self.items)
return round(subtotal * (1 + self.tax_rate), 2)
# Generate JSON Schema
print(User.model_json_schema())
from fastapi import FastAPI, Query
from pydantic import BaseModel
app = FastAPI()
class CreateUser(BaseModel):
name: str = Field(min_length=1)
email: EmailStr
password: str = Field(min_length=8)
class UserResponse(BaseModel):
id: int
name: str
email: str
@app.post("/api/users", response_model=UserResponse)
async def create_user(body: CreateUser):
# body is already validated
return {"id": 1, "name": body.name, "email": body.email}
@app.get("/api/users")
async def list_users(
page: int = Query(default=1, ge=1),
limit: int = Query(default=20, ge=1, le=100),
q: str | None = Query(default=None),
):
# All params validated and typed
pass
#!/bin/bash
# validate-csv.sh — Check CSV structure and data quality
FILE="${1:?Usage: validate-csv.sh <file.csv>}"
echo "=== CSV Validation: $FILE ==="
# Row count
ROWS=$(wc -l < "$FILE")
echo "Rows: $ROWS (including header)"
# Column count consistency
HEADER_COLS=$(head -1 "$FILE" | awk -F',' '{print NF}')
echo "Columns (header): $HEADER_COLS"
BAD_ROWS=$(awk -F',' -v expected="$HEADER_COLS" 'NR>1 && NF!=expected {count++} END {print count+0}' "$FILE")
if [ "$BAD_ROWS" -gt 0 ]; then
echo "ERROR: $BAD_ROWS rows have wrong column count"
awk -F',' -v expected="$HEADER_COLS" 'NR>1 && NF!=expected {print " Line "NR": "NF" columns (expected "expected")"}' "$FILE" | head -5
else
echo "Column count: consistent"
fi
# Empty fields
EMPTY=$(awk -F',' '{for(i=1;i<=NF;i++) if($i=="") count++} END {print count}' "$FILE")
echo "Empty fields: $EMPTY"
# Duplicate rows
DUPES=$(($(sort "$FILE" | uniq -d | wc -l)))
echo "Duplicate rows: $DUPES"
echo "=== Done ==="
# Check if file is valid JSON
jq empty data.json && echo "Valid JSON" || echo "Invalid JSON"
# Validate structure of each object in an array
jq -e '
.[] |
select(
(.name | type) != "string" or
(.email | type) != "string" or
(.age | type) != "number" or
.age < 0
)
' data.json && echo "INVALID records found" || echo "All records valid"
# Check for required fields
jq -e '.[] | select(.id == null or .name == null)' data.json
# Check for unique IDs
jq '[.[].id] | length != (. | unique | length)' data.json
# true = duplicates exist
# Compare record counts between source and target
SRC=$(jq length source.json)
TGT=$(jq length target.json)
echo "Source: $SRC, Target: $TGT, Match: $([ "$SRC" = "$TGT" ] && echo yes || echo NO)"
#!/usr/bin/env python3
"""Validate that a data migration preserved all records."""
import json
import sys
def validate_migration(source_path, target_path, key_field="id"):
with open(source_path) as f:
source = {r[key_field]: r for r in json.load(f)}
with open(target_path) as f:
target = {r[key_field]: r for r in json.load(f)}
missing = set(source) - set(target)
extra = set(target) - set(source)
changed = []
for key in set(source) & set(target):
if source[key] != target[key]:
changed.append(key)
print(f"Source records: {len(source)}")
print(f"Target records: {len(target)}")
print(f"Missing in target: {len(missing)}")
print(f"Extra in target: {len(extra)}")
print(f"Changed: {len(changed)}")
if missing:
print(f"\nMissing IDs (first 10): {list(missing)[:10]}")
if extra:
print(f"\nExtra IDs (first 10): {list(extra)[:10]}")
if changed:
print(f"\nChanged IDs (first 5): {changed[:5]}")
for key in changed[:3]:
print(f"\n {key}:")
for field in set(source[key]) | set(target[key]):
s = source[key].get(field)
t = target[key].get(field)
if s != t:
print(f" {field}: {s!r} → {t!r}")
return len(missing) == 0 and len(extra) == 0
if __name__ == "__main__":
ok = validate_migration(sys.argv[1], sys.argv[2], sys.argv[3] if len(sys.argv) > 3 else "id")
sys.exit(0 if ok else 1)
additionalProperties: false in JSON Schema catches typos in field names. Use it for strict APIs.model_config = ConfigDict(strict=True) when you want no implicit type coercion..safeParse() returns a result object; .parse() throws. Use safeParse in API handlers to return structured errors.Generated Mar 1, 2026
Validate incoming API request bodies against JSON Schema or Zod schemas to ensure data integrity before processing. This prevents malformed data from entering backend systems, reducing errors and improving security by rejecting invalid payloads.
Use Pydantic or JSON Schema to validate data during ETL processes, ensuring that migrated data preserves required fields and formats. This helps maintain data consistency across systems and prevents corruption during database upgrades or service migrations.
Validate CSV or JSON files before importing into applications using command-line tools like ajv-cli or Python scripts. This ensures data integrity for batch processing, such as customer uploads or inventory updates, reducing manual review and errors.
Define and enforce data contracts using schemas to ensure compatibility between microservices. This prevents breaking changes in API responses or events, facilitating smoother integrations and reducing downtime in distributed systems.
Validate and sanitize user inputs from web forms using Zod or Pydantic schemas to enforce constraints like email formats or numeric ranges. This improves data quality and security by preventing injection attacks and ensuring compliance with business rules.
Offer a cloud service that validates API requests and responses in real-time using custom schemas, charging based on request volume. This helps businesses ensure data quality and compliance without maintaining validation infrastructure.
Provide consulting services to integrate validation tools like Zod or Pydantic into client workflows, with revenue from project fees and ongoing support contracts. This assists companies in improving data governance and reducing operational risks.
Develop and distribute open-source validation libraries with basic features, monetizing through premium add-ons like advanced schema generators or enterprise support. This attracts developers while generating revenue from larger organizations.
💬 Integration Tip
Start by integrating validation into CI/CD pipelines to catch errors early, and use schema-first approaches to generate documentation automatically from validation rules.
Use the @steipete/oracle CLI to bundle a prompt plus the right files and get a second-model review (API or browser) for debugging, refactors, design checks, or cross-validation.
Manage Things 3 via the `things` CLI on macOS (add/update projects+todos via URL scheme; read/search/list from the local Things database). Use when a user asks Clawdbot to add a task to Things, list inbox/today/upcoming, search tasks, or inspect projects/areas/tags.
Local search/indexing CLI (BM25 + vectors + rerank) with MCP mode.
Use when designing database schemas, writing migrations, optimizing SQL queries, fixing N+1 problems, creating indexes, setting up PostgreSQL, configuring EF Core, implementing caching, partitioning tables, or any database performance question.
Connect to Supabase for database operations, vector search, and storage. Use for storing data, running SQL queries, similarity search with pgvector, and managing tables. Triggers on requests involving databases, vector stores, embeddings, or Supabase specifically.
Query, design, migrate, and optimize SQL databases. Use when working with SQLite, PostgreSQL, or MySQL — schema design, writing queries, creating migrations, indexing, backup/restore, and debugging slow queries. No ORMs required.