Parquet Capacity Planning
Parquet Format Capacity Planning Columnar Storage Compression Row Group Partition Strategy Data Lake Storage Estimation Production
| Data Type | Raw Size/Row | Parquet (Snappy) | Parquet (ZSTD) | Compression Ratio |
|---|---|---|---|---|
| Integer (INT64) | 8 bytes | 1-2 bytes | 0.8-1.5 bytes | 75-90% |
| Float (DOUBLE) | 8 bytes | 4-6 bytes | 3-5 bytes | 25-60% |
| String (avg 50 chars) | 50 bytes | 10-20 bytes | 8-15 bytes | 60-85% |
| Boolean | 1 byte | 0.05-0.1 bytes | 0.03-0.08 bytes | 90-97% |
| Timestamp | 8 bytes | 1-3 bytes | 0.8-2 bytes | 62-90% |
| Enum/Category | 20 bytes | 1-3 bytes | 0.5-2 bytes | 85-97% |
Storage Estimation
# === Parquet Storage Calculator ===
from dataclasses import dataclass
@dataclass
class Column:
    """One table column, described just well enough to size its storage."""
    name: str                 # column name (informational only)
    dtype: str                # logical type label, e.g. "INT64", "STRING", "ENUM"
    avg_bytes_raw: float      # average uncompressed bytes per row for this column
    compression_ratio: float  # fraction of bytes SAVED by compression, 0.0-1.0
@dataclass
class StorageEstimate:
    """Inputs for a single storage-sizing scenario fed to estimate_storage()."""
    description: str     # human-readable scenario label
    rows_per_day: int    # ingest rate in rows per day
    columns: list        # list of Column definitions for the table
    retention_days: int  # how long data is kept
    replicas: int        # number of stored copies (replication factor)
def estimate_storage(est):
    """Print a storage breakdown for a StorageEstimate and return total GB.

    Total = compressed GB/day x retention days x replicas. Compressed bytes
    per row are derived from each column's compression_ratio (fraction saved).
    """
    GIB = 1024 ** 3
    bytes_raw = 0.0
    bytes_packed = 0.0
    for col in est.columns:
        bytes_raw += col.avg_bytes_raw
        bytes_packed += col.avg_bytes_raw * (1 - col.compression_ratio)
    daily_raw_gb = bytes_raw * est.rows_per_day / GIB
    daily_packed_gb = bytes_packed * est.rows_per_day / GIB
    total_gb = daily_packed_gb * est.retention_days * est.replicas
    print(f" [{est.description}]")
    print(f" Rows/day: {est.rows_per_day:,} | Columns: {len(est.columns)}")
    print(f" Raw/row: {bytes_raw:.0f} bytes | Compressed/row: {bytes_packed:.1f} bytes")
    print(f" Raw/day: {daily_raw_gb:.1f} GB | Compressed/day: {daily_packed_gb:.2f} GB")
    print(f" Retention: {est.retention_days} days × {est.replicas} replicas")
    print(f" Total Storage: {total_gb:.1f} GB ({total_gb/1024:.2f} TB)")
    return total_gb
# Example: E-commerce Event Log
# Column(name, dtype, avg raw bytes/row, fraction saved by compression)
ecommerce_cols = [
    Column("event_id", "STRING", 36, 0.70),
    Column("timestamp", "TIMESTAMP", 8, 0.85),
    Column("user_id", "INT64", 8, 0.80),
    Column("event_type", "ENUM", 15, 0.95),
    Column("product_id", "INT64", 8, 0.75),
    Column("category", "ENUM", 20, 0.95),
    Column("price", "DOUBLE", 8, 0.40),
    Column("quantity", "INT32", 4, 0.80),
    Column("session_id", "STRING", 36, 0.65),
    Column("device", "ENUM", 10, 0.95),
    Column("country", "ENUM", 5, 0.97),
    Column("page_url", "STRING", 100, 0.75),
]
print("=== Storage Estimates ===\n")
e1 = StorageEstimate("E-commerce Events", 50_000_000, ecommerce_cols, 365, 2)
total = estimate_storage(e1)
# Cost estimation. S3 Standard is priced per GB-month, so the steady-state
# monthly bill is total stored GB x rate.
# Fixed two bugs: the original divided by 12 (treating the per-GB-month rate
# as annual, under-billing 12x) and the f-string had lost its placeholders.
s3_cost_per_gb = 0.023  # USD per GB-month (S3 Standard)
monthly_cost = total * s3_cost_per_gb
print(f"\n S3 Cost: ${monthly_cost:,.2f}/month (${s3_cost_per_gb}/GB)")
Row Group and Compression
# === Row Group Optimization ===
# PyArrow — Write optimized Parquet
# import pyarrow as pa
# import pyarrow.parquet as pq
#
# table = pa.Table.from_pandas(df)
# pq.write_table(table, 'data.parquet',
# compression='zstd',
# compression_level=3,
# row_group_size=1_000_000, # rows per group
# use_dictionary=True, # dictionary encoding
# write_statistics=True, # min/max stats for pushdown
# )
# Spark — Parquet settings
# spark.conf.set("spark.sql.parquet.compression.codec", "zstd")
# spark.conf.set("spark.sql.parquet.block.size", str(256 * 1024 * 1024))
# spark.conf.set("spark.sql.parquet.enableVectorizedReader", "true")
# spark.conf.set("spark.sql.parquet.filterPushdown", "true")
# DuckDB — Read Parquet efficiently
# SELECT product_id, SUM(price * quantity) as revenue
# FROM read_parquet('s3://data/events/year=2024/**/*.parquet',
# hive_partitioning=true)
# WHERE event_type = 'purchase'
# AND year = 2024 AND month = 6
# GROUP BY product_id
# ORDER BY revenue DESC
# LIMIT 100;
@dataclass
class CompressionBenchmark:
    """Indicative performance profile of one Parquet compression codec."""
    codec: str        # codec name, e.g. "Snappy", "ZSTD (level 3)"
    ratio: str        # typical size reduction, as a display string
    write_speed: str  # approximate write throughput, display string
    read_speed: str   # approximate read throughput, display string
    cpu_usage: str    # relative CPU cost, display string
    best_for: str     # recommended use case
# Codec comparison — records built from a compact tuple table:
# (codec, ratio, write speed, read speed, cpu usage, best use case)
_codec_rows = [
    ("Snappy", "60-70%", "Fast (500 MB/s)", "Fast (800 MB/s)", "Low",
     "Default, balanced speed/compression"),
    ("ZSTD (level 3)", "75-85%", "Medium (300 MB/s)", "Fast (700 MB/s)", "Medium",
     "Best overall, good compression + speed"),
    ("ZSTD (level 9)", "80-90%", "Slow (100 MB/s)", "Fast (700 MB/s)", "High",
     "Cold storage, archive"),
    ("Gzip", "75-85%", "Slow (80 MB/s)", "Medium (400 MB/s)", "High",
     "Compatibility, legacy systems"),
    ("LZ4", "55-65%", "Very fast (700 MB/s)", "Very fast (1 GB/s)", "Very low",
     "Hot data, latency-sensitive"),
    ("Uncompressed", "0%", "Very fast", "Very fast", "None",
     "Temporary data, debugging"),
]
benchmarks = [CompressionBenchmark(*row) for row in _codec_rows]
print("\n=== Compression Benchmarks ===")
for bench in benchmarks:
    print(f" [{bench.codec}] Ratio: {bench.ratio}")
    print(f" Write: {bench.write_speed} | Read: {bench.read_speed}")
    print(f" CPU: {bench.cpu_usage} | Best for: {bench.best_for}")
Partition and Lifecycle
# === Partition Strategy ===
@dataclass
class PartitionPlan:
table: str
partition_by: str
files_per_partition: str
file_size: str
compaction: str
# Partition layouts per table — records built from a compact tuple table:
# (table, partition key, files/partition, file size, compaction policy)
_plan_rows = [
    ("events", "year/month/day", "1-10 files", "128 MB - 1 GB",
     "Daily compaction at 02:00 UTC"),
    ("users", "country", "1-5 files", "256 MB - 1 GB",
     "Weekly compaction"),
    ("transactions", "year/month", "5-20 files", "256 MB - 1 GB",
     "Monthly compaction"),
    ("logs", "year/month/day/hour", "1-3 files", "64 MB - 512 MB",
     "Hourly micro-batch, daily compaction"),
]
plans = [PartitionPlan(*row) for row in _plan_rows]
print("=== Partition Plans ===")
for plan in plans:
    print(f" [{plan.table}] Partition: {plan.partition_by}")
    print(f" Files: {plan.files_per_partition} | Size: {plan.file_size}")
    print(f" Compaction: {plan.compaction}")
# Lifecycle Management
@dataclass
class StorageTier:
    """One rung of the data lifecycle ladder (hot -> archive)."""
    tier: str       # tier label, e.g. "Hot", "Archive"
    age: str        # data age range covered by this tier, display string
    storage: str    # storage class name, e.g. "S3 Standard"
    cost_gb: float  # price in USD per GB per month
    access: str     # expected access pattern at this tier
# Lifecycle tiers: StorageTier(tier, age range, storage class, $/GB/month, access pattern)
tiers = [
    StorageTier("Hot", "0-30 days", "S3 Standard", 0.023, "Frequent queries"),
    StorageTier("Warm", "30-90 days", "S3 Infrequent Access", 0.0125, "Weekly queries"),
    StorageTier("Cold", "90-365 days", "S3 Glacier Instant", 0.004, "Monthly queries"),
    StorageTier("Archive", "1-7 years", "S3 Glacier Deep", 0.00099, "Compliance only"),
]
print("\n\n=== Storage Tiers ===")
for t in tiers:
    print(f" [{t.tier}] Age: {t.age} | Storage: {t.storage}")
    # Fixed: the cost placeholder had been lost from the f-string, so the
    # line printed literally "Cost: /GB/month".
    print(f" Cost: ${t.cost_gb}/GB/month | Access: {t.access}")
เคล็ดลับ
- ZSTD: ใช้ ZSTD level 3 เป็น Default ได้ Compression ดีกว่า Snappy โดย Read Speed ใกล้เคียงกัน (Write Speed ช้ากว่าเล็กน้อย ตามตาราง Benchmark ด้านบน)
- Dictionary: เปิด Dictionary Encoding สำหรับ Column ที่มี Cardinality ต่ำ
- Compact: รวมไฟล์เล็กให้เป็น 128 MB - 1 GB ทุกวัน
- Partition: อย่า Over-partition ไม่เกิน 10000 Partitions
- Statistics: เปิด write_statistics สำหรับ Predicate Pushdown
Parquet Format คืออะไร
Apache Parquet Columnar Storage Column-oriented I/O Compression Snappy Gzip ZSTD LZ4 Predicate Pushdown Schema Spark Hive Presto DuckDB Pandas
Capacity Planning ทำอย่างไร
Raw Size Compression Ratio Data Type Algorithm Growth Rate Partition Retention Hot Warm Cold Storage Cost Cloud Provider
Row Group Size ตั้งเท่าไหร่ดี
Default 128 MB Analytical 256 MB - 1 GB ใหญ่ Compression ดี Sequential เร็ว เล็ก Random Access Memory Spark PyArrow Schema Columns
Partition Strategy ควรเป็นอย่างไร
Filter Date Region Category Hive-style ไม่เกิน 10000 Partition 128 MB - 1 GB Compaction รวมไฟล์ Partition Pruning Engine
สรุป
Parquet Format Capacity Planning Columnar Storage ZSTD Compression Row Group Partition Strategy Data Lake S3 Lifecycle Hot Warm Cold Production
