Parquet Capacity Planning
Parquet Format Capacity Planning Columnar Storage Compression Row Group Partition Strategy Data Lake Storage Estimation Production
| Data Type | Raw Size/Row | Parquet (Snappy) | Parquet (ZSTD) | Compression Ratio |
|---|---|---|---|---|
| Integer (INT64) | 8 bytes | 1-2 bytes | 0.8-1.5 bytes | 75-90% |
| Float (DOUBLE) | 8 bytes | 4-6 bytes | 3-5 bytes | 25-60% |
| String (avg 50 chars) | 50 bytes | 10-20 bytes | 8-15 bytes | 60-85% |
| Boolean | 1 byte | 0.05-0.1 bytes | 0.03-0.08 bytes | 90-97% |
| Timestamp | 8 bytes | 1-3 bytes | 0.8-2 bytes | 62-90% |
| Enum/Category | 20 bytes | 1-3 bytes | 0.5-2 bytes | 85-97% |
Storage Estimation
# === Parquet Storage Calculator ===
from dataclasses import dataclass
@dataclass
class Column:
    """One table column, described just well enough to size its storage."""
    name: str                 # column name (informational only)
    dtype: str                # logical type label, e.g. "INT64", "STRING", "ENUM"
    avg_bytes_raw: float      # average uncompressed bytes per row for this column
    compression_ratio: float  # fraction of bytes SAVED by compression, 0.0-1.0
@dataclass
class StorageEstimate:
    """Inputs for a single storage-sizing scenario fed to estimate_storage()."""
    description: str     # human-readable scenario label
    rows_per_day: int    # ingest rate in rows per day
    columns: list        # list of Column definitions for the table
    retention_days: int  # how long data is kept
    replicas: int        # number of stored copies (replication factor)
def estimate_storage(est):
    """Print a storage breakdown for a StorageEstimate and return total GB.

    Total = compressed GB/day x retention days x replicas. Compressed bytes
    per row are derived from each column's compression_ratio (fraction saved).
    """
    GIB = 1024 ** 3
    bytes_raw = 0.0
    bytes_packed = 0.0
    for col in est.columns:
        bytes_raw += col.avg_bytes_raw
        bytes_packed += col.avg_bytes_raw * (1 - col.compression_ratio)
    daily_raw_gb = bytes_raw * est.rows_per_day / GIB
    daily_packed_gb = bytes_packed * est.rows_per_day / GIB
    total_gb = daily_packed_gb * est.retention_days * est.replicas
    print(f" [{est.description}]")
    print(f" Rows/day: {est.rows_per_day:,} | Columns: {len(est.columns)}")
    print(f" Raw/row: {bytes_raw:.0f} bytes | Compressed/row: {bytes_packed:.1f} bytes")
    print(f" Raw/day: {daily_raw_gb:.1f} GB | Compressed/day: {daily_packed_gb:.2f} GB")
    print(f" Retention: {est.retention_days} days × {est.replicas} replicas")
    print(f" Total Storage: {total_gb:.1f} GB ({total_gb/1024:.2f} TB)")
    return total_gb
# Example: E-commerce Event Log
# Column(name, dtype, avg raw bytes/row, fraction saved by compression)
ecommerce_cols = [
    Column("event_id", "STRING", 36, 0.70),
    Column("timestamp", "TIMESTAMP", 8, 0.85),
    Column("user_id", "INT64", 8, 0.80),
    Column("event_type", "ENUM", 15, 0.95),
    Column("product_id", "INT64", 8, 0.75),
    Column("category", "ENUM", 20, 0.95),
    Column("price", "DOUBLE", 8, 0.40),
    Column("quantity", "INT32", 4, 0.80),
    Column("session_id", "STRING", 36, 0.65),
    Column("device", "ENUM", 10, 0.95),
    Column("country", "ENUM", 5, 0.97),
    Column("page_url", "STRING", 100, 0.75),
]
print("=== Storage Estimates ===\n")
e1 = StorageEstimate("E-commerce Events", 50_000_000, ecommerce_cols, 365, 2)
total = estimate_storage(e1)
# Cost estimation. S3 Standard is priced per GB-month, so the steady-state
# monthly bill is total stored GB x rate.
# Fixed two bugs: the original divided by 12 (treating the per-GB-month rate
# as annual, under-billing 12x) and the f-string had lost its placeholders.
s3_cost_per_gb = 0.023  # USD per GB-month (S3 Standard)
monthly_cost = total * s3_cost_per_gb
print(f"\n S3 Cost: ${monthly_cost:,.2f}/month (${s3_cost_per_gb}/GB)")
Row Group and Compression
# === Row Group Optimization ===
# PyArrow — Write optimized Parquet
# import pyarrow as pa
# import pyarrow.parquet as pq
#
# table = pa.Table.from_pandas(df)
# pq.write_table(table, 'data.parquet',
# compression='zstd',
# compression_level=3,
# row_group_size=1_000_000, # rows per group
# use_dictionary=True, # dictionary encoding
# write_statistics=True, # min/max stats for pushdown
# )
# Spark — Parquet settings
# spark.conf.set("spark.sql.parquet.compression.codec", "zstd")
# spark.conf.set("spark.sql.parquet.block.size", str(256 * 1024 * 1024))
# spark.conf.set("spark.sql.parquet.enableVectorizedReader", "true")
# spark.conf.set("spark.sql.parquet.filterPushdown", "true")
# DuckDB — Read Parquet efficiently
# SELECT product_id, SUM(price * quantity) as revenue
# FROM read_parquet('s3://data/events/year=2024/**/*.parquet',
# hive_partitioning=true)
# WHERE event_type = 'purchase'
# AND year = 2024 AND month = 6
# GROUP BY product_id
# ORDER BY revenue DESC
# LIMIT 100;
@dataclass
class CompressionBenchmark:
    """Indicative performance profile of one Parquet compression codec."""
    codec: str        # codec name, e.g. "Snappy", "ZSTD (level 3)"
    ratio: str        # typical size reduction, as a display string
    write_speed: str  # approximate write throughput, display string
    read_speed: str   # approximate read throughput, display string
    cpu_usage: str    # relative CPU cost, display string
    best_for: str     # recommended use case
# Codec comparison — records built from a compact tuple table:
# (codec, ratio, write speed, read speed, cpu usage, best use case)
_codec_rows = [
    ("Snappy", "60-70%", "Fast (500 MB/s)", "Fast (800 MB/s)", "Low",
     "Default, balanced speed/compression"),
    ("ZSTD (level 3)", "75-85%", "Medium (300 MB/s)", "Fast (700 MB/s)", "Medium",
     "Best overall, good compression + speed"),
    ("ZSTD (level 9)", "80-90%", "Slow (100 MB/s)", "Fast (700 MB/s)", "High",
     "Cold storage, archive"),
    ("Gzip", "75-85%", "Slow (80 MB/s)", "Medium (400 MB/s)", "High",
     "Compatibility, legacy systems"),
    ("LZ4", "55-65%", "Very fast (700 MB/s)", "Very fast (1 GB/s)", "Very low",
     "Hot data, latency-sensitive"),
    ("Uncompressed", "0%", "Very fast", "Very fast", "None",
     "Temporary data, debugging"),
]
benchmarks = [CompressionBenchmark(*row) for row in _codec_rows]
print("\n=== Compression Benchmarks ===")
for bench in benchmarks:
    print(f" [{bench.codec}] Ratio: {bench.ratio}")
    print(f" Write: {bench.write_speed} | Read: {bench.read_speed}")
    print(f" CPU: {bench.cpu_usage} | Best for: {bench.best_for}")
Partition and Lifecycle
# === Partition Strategy ===
@dataclass
class PartitionPlan:
table: str
partition_by: str
files_per_partition: str
file_size: str
compaction: str
# Partition layouts per table — records built from a compact tuple table:
# (table, partition key, files/partition, file size, compaction policy)
_plan_rows = [
    ("events", "year/month/day", "1-10 files", "128 MB - 1 GB",
     "Daily compaction at 02:00 UTC"),
    ("users", "country", "1-5 files", "256 MB - 1 GB",
     "Weekly compaction"),
    ("transactions", "year/month", "5-20 files", "256 MB - 1 GB",
     "Monthly compaction"),
    ("logs", "year/month/day/hour", "1-3 files", "64 MB - 512 MB",
     "Hourly micro-batch, daily compaction"),
]
plans = [PartitionPlan(*row) for row in _plan_rows]
print("=== Partition Plans ===")
for plan in plans:
    print(f" [{plan.table}] Partition: {plan.partition_by}")
    print(f" Files: {plan.files_per_partition} | Size: {plan.file_size}")
    print(f" Compaction: {plan.compaction}")
# Lifecycle Management
@dataclass
class StorageTier:
    """One rung of the data lifecycle ladder (hot -> archive)."""
    tier: str       # tier label, e.g. "Hot", "Archive"
    age: str        # data age range covered by this tier, display string
    storage: str    # storage class name, e.g. "S3 Standard"
    cost_gb: float  # price in USD per GB per month
    access: str     # expected access pattern at this tier
# Lifecycle tiers: StorageTier(tier, age range, storage class, $/GB/month, access pattern)
tiers = [
    StorageTier("Hot", "0-30 days", "S3 Standard", 0.023, "Frequent queries"),
    StorageTier("Warm", "30-90 days", "S3 Infrequent Access", 0.0125, "Weekly queries"),
    StorageTier("Cold", "90-365 days", "S3 Glacier Instant", 0.004, "Monthly queries"),
    StorageTier("Archive", "1-7 years", "S3 Glacier Deep", 0.00099, "Compliance only"),
]
print("\n\n=== Storage Tiers ===")
for t in tiers:
    print(f" [{t.tier}] Age: {t.age} | Storage: {t.storage}")
    # Fixed: the cost placeholder had been lost from the f-string, so the
    # line printed literally "Cost: /GB/month".
    print(f" Cost: ${t.cost_gb}/GB/month | Access: {t.access}")
เคล็ดลับ
- ZSTD: ใช้ ZSTD level 3 เป็น Default ได้ Compression ดีกว่า Snappy โดย Read Speed ใกล้เคียงกัน (Write Speed ช้ากว่าเล็กน้อย ตามตาราง Benchmark ด้านบน)
- Dictionary: เปิด Dictionary Encoding สำหรับ Column ที่มี Cardinality ต่ำ
- Compact: รวมไฟล์เล็กให้เป็น 128 MB - 1 GB ทุกวัน
- Partition: อย่า Over-partition ไม่เกิน 10000 Partitions
- Statistics: เปิด write_statistics สำหรับ Predicate Pushdown
Parquet Format คืออะไร
Apache Parquet Columnar Storage Column-oriented I/O Compression Snappy Gzip ZSTD LZ4 Predicate Pushdown Schema Spark Hive Presto DuckDB Pandas
Capacity Planning ทำอย่างไร
Raw Size Compression Ratio Data Type Algorithm Growth Rate Partition Retention Hot Warm Cold Storage Cost Cloud Provider
Row Group Size ตั้งเท่าไหร่ดี
Default 128 MB Analytical 256 MB - 1 GB ใหญ่ Compression ดี Sequential เร็ว เล็ก Random Access Memory Spark PyArrow Schema Columns
Partition Strategy ควรเป็นอย่างไร
Filter Date Region Category Hive-style ไม่เกิน 10000 Partition 128 MB - 1 GB Compaction รวมไฟล์ Partition Pruning Engine
สรุป
Parquet Format Capacity Planning Columnar Storage ZSTD Compression Row Group Partition Strategy Data Lake S3 Lifecycle Hot Warm Cold Production
