Parquet Format Capacity Planning — วางแผน Storage สำหรับ Columnar Data
Parquet Capacity Planning

Parquet Format Capacity Planning Columnar Storage Compression Row Group Partition Strategy Data Lake Storage Estimation Production
| Data Type | Raw Size/Row | Parquet (Snappy) | Parquet (ZSTD) | Compression Ratio |
|---|---|---|---|---|
| Integer (INT64) | 8 bytes | 1-2 bytes | 0.8-1.5 bytes | 75-90% |
| Float (DOUBLE) | 8 bytes | 4-6 bytes | 3-5 bytes | 25-60% |
| String (avg 50 chars) | 50 bytes | 10-20 bytes | 8-15 bytes | 60-85% |
| Boolean | 1 byte | 0.05-0.1 bytes | 0.03-0.08 bytes | 90-97% |
| Timestamp | 8 bytes | 1-3 bytes | 0.8-2 bytes | 62-90% |
| Enum/Category | 20 bytes | 1-3 bytes | 0.5-2 bytes | 85-97% |
Storage Estimation
# === Parquet Storage Calculator ===
from dataclasses import dataclass
@dataclass
class Column:
name: str
dtype: str
avg_bytes_raw: float
compression_ratio: float
@dataclass
class StorageEstimate:
description: str
rows_per_day: int
columns: list
retention_days: int
replicas: int
def estimate_storage(est):
raw_per_row = sum(c.avg_bytes_raw for c in est.columns)
compressed_per_row = sum(c.avg_bytes_raw * (1 - c.compression_ratio) for c in est.columns)
raw_daily_gb = (raw_per_row * est.rows_per_day) / (1024**3)
compressed_daily_gb = (compressed_per_row * est.rows_per_day) / (1024**3)
total_gb = compressed_daily_gb * est.retention_days * est.replicas
print(f" [{est.description}]")
print(f" Rows/day: {est.rows_per_day:,} | Columns: {len(est.columns)}")
print(f" Raw/row: {raw_per_row:.0f} bytes | Compressed/row: {compressed_per_row:.1f} bytes")
print(f" Raw/day: {raw_daily_gb:.1f} GB | Compressed/day: {compressed_daily_gb:.2f} GB")
print(f" Retention: {est.retention_days} days × {est.replicas} replicas")
print(f" Total Storage: {total_gb:.1f} GB ({total_gb/1024:.2f} TB)")
return total_gb
# Example: E-commerce Event Log
ecommerce_cols = [
Column("event_id", "STRING", 36, 0.70),
Column("timestamp", "TIMESTAMP", 8, 0.85),
Column("user_id", "INT64", 8, 0.80),
Column("event_type", "ENUM", 15, 0.95),
Column("product_id", "INT64", 8, 0.75),
Column("category", "ENUM", 20, 0.95),
Column("price", "DOUBLE", 8, 0.40),
Column("quantity", "INT32", 4, 0.80),
Column("session_id", "STRING", 36, 0.65),
Column("device", "ENUM", 10, 0.95),
Column("country", "ENUM", 5, 0.97),
Column("page_url", "STRING", 100, 0.75),
]
print("=== Storage Estimates ===\n")
e1 = StorageEstimate("E-commerce Events", 50_000_000, ecommerce_cols, 365, 2)
total = estimate_storage(e1)
# Cost estimation
s3_cost_per_gb = 0.023
monthly_cost = (total / 12) * s3_cost_per_gb
print(f"\n S3 Cost: /month (/GB)")
Row Group and Compression
# === Row Group Optimization ===
# PyArrow — Write optimized Parquet
# import pyarrow as pa
# import pyarrow.parquet as pq
#
# table = pa.Table.from_pandas(df)
# pq.write_table(table, 'data.parquet',
# compression='zstd',
# compression_level=3,
# row_group_size=1_000_000, # rows per group
# use_dictionary=True, # dictionary encoding
# write_statistics=True, # min/max stats for pushdown
# )
# Spark — Parquet settings
# spark.conf.set("spark.sql.parquet.compression.codec", "zstd")
# spark.conf.set("spark.sql.parquet.block.size", str(256 * 1024 * 1024))
# spark.conf.set("spark.sql.parquet.enableVectorizedReader", "true")
# spark.conf.set("spark.sql.parquet.filterPushdown", "true")
# DuckDB — Read Parquet efficiently
# SELECT product_id, SUM(price * quantity) as revenue
# FROM read_parquet('s3://data/events/year=2024/**/*.parquet',
# hive_partitioning=true)
# WHERE event_type = 'purchase'
# AND year = 2024 AND month = 6
# GROUP BY product_id
# ORDER BY revenue DESC
# LIMIT 100;
@dataclass
class CompressionBenchmark:
codec: str
ratio: str
write_speed: str
read_speed: str
cpu_usage: str
best_for: str
benchmarks = [
CompressionBenchmark("Snappy", "60-70%", "Fast (500 MB/s)", "Fast (800 MB/s)", "Low",
"Default, balanced speed/compression"),
CompressionBenchmark("ZSTD (level 3)", "75-85%", "Medium (300 MB/s)", "Fast (700 MB/s)", "Medium",
"Best overall, good compression + speed"),
CompressionBenchmark("ZSTD (level 9)", "80-90%", "Slow (100 MB/s)", "Fast (700 MB/s)", "High",
"Cold storage, archive"),
CompressionBenchmark("Gzip", "75-85%", "Slow (80 MB/s)", "Medium (400 MB/s)", "High",
"Compatibility, legacy systems"),
CompressionBenchmark("LZ4", "55-65%", "Very fast (700 MB/s)", "Very fast (1 GB/s)", "Very low",
"Hot data, latency-sensitive"),
CompressionBenchmark("Uncompressed", "0%", "Very fast", "Very fast", "None",
"Temporary data, debugging"),
]
print("\n=== Compression Benchmarks ===")
for b in benchmarks:
print(f" [{b.codec}] Ratio: {b.ratio}")
print(f" Write: {b.write_speed} | Read: {b.read_speed}")
print(f" CPU: {b.cpu_usage} | Best for: {b.best_for}")
Partition and Lifecycle
# === Partition Strategy ===
@dataclass
class PartitionPlan:
table: str
partition_by: str
files_per_partition: str
file_size: str
compaction: str
plans = [
PartitionPlan("events", "year/month/day", "1-10 files", "128 MB - 1 GB",
"Daily compaction at 02:00 UTC"),
PartitionPlan("users", "country", "1-5 files", "256 MB - 1 GB",
"Weekly compaction"),
PartitionPlan("transactions", "year/month", "5-20 files", "256 MB - 1 GB",
"Monthly compaction"),
PartitionPlan("logs", "year/month/day/hour", "1-3 files", "64 MB - 512 MB",
"Hourly micro-batch, daily compaction"),
]
print("=== Partition Plans ===")
for p in plans:
print(f" [{p.table}] Partition: {p.partition_by}")
print(f" Files: {p.files_per_partition} | Size: {p.file_size}")
print(f" Compaction: {p.compaction}")
# Lifecycle Management
@dataclass
class StorageTier:
tier: str
age: str
storage: str
cost_gb: float
access: str
tiers = [
StorageTier("Hot", "0-30 days", "S3 Standard", 0.023, "Frequent queries"),
StorageTier("Warm", "30-90 days", "S3 Infrequent Access", 0.0125, "Weekly queries"),
StorageTier("Cold", "90-365 days", "S3 Glacier Instant", 0.004, "Monthly queries"),
StorageTier("Archive", "1-7 years", "S3 Glacier Deep", 0.00099, "Compliance only"),
]
print(f"\n\n=== Storage Tiers ===")
for t in tiers:
print(f" [{t.tier}] Age: {t.age} | Storage: {t.storage}")
print(f" Cost: /GB/month | Access: {t.access}")
เคล็ดลับ

- ZSTD: ใช้ ZSTD level 3 เป็น Default ดีกว่า Snappy ทั้ง Compression และ Speed
- Dictionary: เปิด Dictionary Encoding สำหรับ Column ที่มี Cardinality ต่ำ
- Compact: รวมไฟล์เล็กให้เป็น 128 MB - 1 GB ทุกวัน
- Partition: อย่า Over-partition ไม่เกิน 10000 Partitions
- Statistics: เปิด write_statistics สำหรับ Predicate Pushdown
Parquet Format คืออะไร
Apache Parquet Columnar Storage Column-oriented I/O Compression Snappy Gzip ZSTD LZ4 Predicate Pushdown Schema Spark Hive Presto DuckDB Pandas
เนื้อหาเกี่ยวข้อง — แนะนำให้อ่าน Void OOP คืออะไร — ข้อมูลครบถ้วน 2026
Capacity Planning ทำอย่างไร
Raw Size Compression Ratio Data Type Algorithm Growth Rate Partition Retention Hot Warm Cold Storage Cost Cloud Provider
แนะนำเพิ่มเติม — เรียนเทรดกับ iCafeForex
เนื้อหาเกี่ยวข้อง — ดูเพิ่มเติมเรื่อง TensorFlow Serving Home Lab Setup
Row Group Size ตั้งเท่าไหร่ดี
Default 128 MB Analytical 256 MB - 1 GB ใหญ่ Compression ดี Sequential เร็ว เล็ก Random Access Memory Spark PyArrow Schema Columns
Partition Strategy ควรเป็นอย่างไร
Filter Date Region Category Hive-style ไม่เกิน 10000 Partition 128 MB - 1 GB Compaction รวมไฟล์ Partition Pruning Engine
แนะนำเพิ่มเติม — ติดตาม XM Signal
เนื้อหาเกี่ยวข้อง — อ่านต่อ: Ca SSL คืออะไร — คู่มือ IT Infrastructure 2026 — คู่มือฉบับสมบูรณ์ 2026
สรุป
Parquet Format Capacity Planning Columnar Storage ZSTD Compression Row Group Partition Strategy Data Lake S3 Lifecycle Hot Warm Cold Production
เนื้อหาเกี่ยวข้อง — แนะนำให้อ่าน PagerDuty Incident Business Continuity





