CSV Processor Skill
Parse, transform, and analyze CSV files with advanced data manipulation capabilities.
Instructions
You are a CSV processing expert. When invoked:
-
Parse CSV Files:
- Auto-detect delimiters (comma, tab, semicolon, pipe)
- Handle different encodings (UTF-8, Latin-1, Windows-1252)
- Process quoted fields and escaped characters
- Handle multi-line fields correctly
- Detect and use header rows
-
Transform Data:
- Filter rows based on conditions
- Select specific columns
- Sort and group data
- Merge multiple CSV files
- Split large files into smaller chunks
- Pivot and unpivot data
-
Clean Data:
- Remove duplicates
- Handle missing values
- Trim whitespace
- Normalize data formats
- Fix encoding issues
- Validate data types
-
Analyze Data:
- Generate statistics (sum, average, min, max, count)
- Identify data quality issues
- Detect outliers
- Profile column data types
- Calculate distributions
Usage Examples
@csv-processor data.csv
@csv-processor --filter "age > 30"
@csv-processor --select "name,email,age"
@csv-processor --merge file1.csv file2.csv
@csv-processor --stats
@csv-processor --clean --remove-duplicates
Basic CSV Operations
Reading CSV Files
Python (pandas)
import pandas as pd
# Basic read
df = pd.read_csv('data.csv')
# Custom delimiter
df = pd.read_csv('data.tsv', delimiter='\t')
# Specify encoding
df = pd.read_csv('data.csv', encoding='latin-1')
# Skip rows
df = pd.read_csv('data.csv', skiprows=2)
# Select specific columns
df = pd.read_csv('data.csv', usecols=['name', 'email', 'age'])
# Parse dates
df = pd.read_csv('data.csv', parse_dates=['created_at', 'updated_at'])
# Handle missing values
df = pd.read_csv('data.csv', na_values=['NA', 'N/A', 'null', ''])
# Specify data types
df = pd.read_csv('data.csv', dtype={
'user_id': int,
'age': int,
'score': float,
'active': bool
})
JavaScript (csv-parser)
const fs = require('fs');
const csv = require('csv-parser');
// Basic parsing
const results = [];
fs.createReadStream('data.csv')
.pipe(csv())
.on('data', (row) => {
results.push(row);
})
.on('end', () => {
console.log(`Processed ${results.length} rows`);
});
// With custom options
const Papa = require('papaparse');
Papa.parse(fs.createReadStream('data.csv'), {
header: true,
delimiter: ',',
skipEmptyLines: true,
transformHeader: (header) => header.trim().toLowerCase(),
complete: (results) => {
console.log('Parsed:', results.data);
}
});
Python (csv module)
import csv
# Basic reading
with open('data.csv', 'r', encoding='utf-8') as file:
reader = csv.DictReader(file)
for row in reader:
print(row['name'], row['age'])
# Custom delimiter
with open('data.csv', 'r') as file:
reader = csv.reader(file, delimiter='\t')
for row in reader:
print(row)
# Handle different dialects
with open('data.csv', 'r') as file:
dialect = csv.Sniffer().sniff(file.read(1024))
file.seek(0)
reader = csv.reader(file, dialect)
for row in reader:
print(row)
Writing CSV Files
Python (pandas)
# Basic write
df.to_csv('output.csv', index=False)
# Custom delimiter
df.to_csv('output.tsv', sep='\t', index=False)
# Specify encoding
df.to_csv('output.csv', encoding='utf-8-sig', index=False)
# Write only specific columns
df[['name', 'email']].to_csv('output.csv', index=False)
# Append to existing file
df.to_csv('output.csv', mode='a', header=False, index=False)
# Quote all fields
df.to_csv('output.csv', quoting=csv.QUOTE_ALL, index=False)
JavaScript (csv-writer)
const createCsvWriter = require('csv-writer').createObjectCsvWriter;
const csvWriter = createCsvWriter({
path: 'output.csv',
header: [
{id: 'name', title: 'Name'},
{id: 'email', title: 'Email'},
{id: 'age', title: 'Age'}
]
});
const records = [
{name: 'John Doe', email: 'john@example.com', age: 30},
{name: 'Jane Smith', email: 'jane@example.com', age: 25}
];
csvWriter.writeRecords(records)
.then(() => console.log('CSV file written successfully'));
Data Transformation Patterns
Filtering Rows
Python (pandas)
# Single condition
filtered = df[df['age'] > 30]
# Multiple conditions (AND)
filtered = df[(df['age'] > 30) & (df['country'] == 'USA')]
# Multiple conditions (OR)
filtered = df[(df['age'] < 18) | (df['age'] > 65)]
# String operations
filtered = df[df['email'].str.contains('@gmail.com')]
filtered = df[df['name'].str.startswith('John')]
# Is in list
filtered = df[df['country'].isin(['USA', 'Canada', 'Mexico'])]
# Not null values
filtered = df[df['email'].notna()]
# Complex conditions
filtered = df.query('age > 30 and country == "USA" and active == True')
JavaScript
// Filter with arrow function
const filtered = data.filter(row => row.age > 30);
// Multiple conditions
const filtered = data.filter(row =>
row.age > 30 && row.country === 'USA'
);
// String operations
const filtered = data.filter(row =>
row.email.includes('@gmail.com')
);
// Complex filtering
const filtered = data.filter(row => {
const age = parseInt(row.age);
return age >= 18 && age <= 65 && row.active === 'true';
});
Selecting Columns
Python (pandas)
# Select single column
names = df['name']
# Select multiple columns
subset = df[['name', 'email', 'age']]
# Select by column type
numeric_cols = df.select_dtypes(include=['int64', 'float64'])
string_cols = df.select_dtypes(include=['object'])
# Select columns matching pattern
email_cols = df.filter(regex='.*email.*')
# Drop columns
df_without = df.drop(['temporary', 'unused'], axis=1)
# Rename columns
df_renamed = df.rename(columns={
'old_name': 'new_name',
'email_address': 'email'
})
JavaScript
// Map to select columns
const subset = data.map(row => ({
name: row.name,
email: row.email,
age: row.age
}));
// Destructuring
const subset = data.map(({name, email, age}) => ({name, email, age}));
// Dynamic column selection
const columns = ['name', 'email', 'age'];
const subset = data.map(row =>
Object.fromEntries(
columns.map(col => [col, row[col]])
)
);
Sorting Data
Python (pandas)
# Sort by single column
sorted_df = df.sort_values('age')
# Sort descending
sorted_df = df.sort_values('age', ascending=False)
# Sort by multiple columns
sorted_df = df.sort_values(['country', 'age'], ascending=[True, False])
# Sort by index
sorted_df = df.sort_index()
JavaScript
// Sort by single field
const sorted = data.sort((a, b) => a.age - b.age);
// Sort descending
const sorted = data.sort((a, b) => b.age - a.age);
// Sort by string
const sorted = data.sort((a, b) => a.name.localeCompare(b.name));
// Sort by multiple fields
const sorted = data.sort((a, b) => {
if (a.country !== b.country) {
return a.country.localeCompare(b.country);
}
return b.age - a.age;
});
Grouping and Aggregation
Python (pandas)
# Group by single column
grouped = df.groupby('country')
# Count by group
counts = df.groupby('country').size()
# Multiple aggregations
stats = df.groupby('country').agg({
'age': ['mean', 'min', 'max'],
'salary': ['sum', 'mean'],
'user_id': 'count'
})
# Group by multiple columns
grouped = df.groupby(['country', 'city']).agg({
'revenue': 'sum',
'user_id': 'count'
})
# Custom aggregation
df.groupby('country').apply(lambda x: x['salary'].max() - x['salary'].min())
# Pivot table
pivot = df.pivot_table(
values='revenue',
index='country',
columns='year',
aggfunc='sum',
fill_value=0
)
JavaScript (lodash)
const _ = require('lodash');
// Group by field
const grouped = _.groupBy(data, 'country');
// Count by group
const counts = _.mapValues(
_.groupBy(data,