SiamCafe.net Blog
Technology

Parquet Format Capacity Planning

parquet format capacity planning
Parquet Format Capacity Planning | SiamCafe Blog
2025-07-12· อ. บอม — SiamCafe.net· 9,304 คำ

Parquet Capacity Planning

Parquet Format Capacity Planning Columnar Storage Compression Row Group Partition Strategy Data Lake Storage Estimation Production

Data Type | Raw Size/Row | Parquet (Snappy) | Parquet (ZSTD) | Compression Ratio
Integer (INT64) | 8 bytes | 1-2 bytes | 0.8-1.5 bytes | 75-90%
Float (DOUBLE) | 8 bytes | 4-6 bytes | 3-5 bytes | 25-60%
String (avg 50 chars) | 50 bytes | 10-20 bytes | 8-15 bytes | 60-85%
Boolean | 1 byte | 0.05-0.1 bytes | 0.03-0.08 bytes | 90-97%
Timestamp | 8 bytes | 1-3 bytes | 0.8-2 bytes | 62-90%
Enum/Category | 20 bytes | 1-3 bytes | 0.5-2 bytes | 85-97%

Storage Estimation

# === Parquet Storage Calculator ===

from dataclasses import dataclass

@dataclass
class Column:
 """One logical column: its raw per-value width and expected Parquet savings."""
 name: str  # column name (informational, not used in the math)
 dtype: str  # logical type label, e.g. "STRING", "INT64" (informational)
 avg_bytes_raw: float  # average uncompressed size of one value, in bytes
 compression_ratio: float  # fraction of bytes Parquet saves, 0.0-1.0 (0.8 = 80% smaller)

@dataclass
class StorageEstimate:
 """Inputs for estimate_storage(): daily volume, schema, retention, replication."""
 description: str  # human-readable label printed in the report
 rows_per_day: int  # ingest rate, rows per day
 columns: list  # list of Column describing the table schema
 retention_days: int  # how long data is kept before expiry
 replicas: int  # number of stored copies (multiplies the footprint)

def estimate_storage(est):
    """Print a storage-footprint report for *est* and return total compressed GB.

    Sums each column's raw and compressed per-row sizes (compressed size is
    raw * (1 - compression_ratio)), scales by daily row count, retention and
    replica count, and prints the breakdown.
    """
    per_row_raw = 0.0
    per_row_compressed = 0.0
    for col in est.columns:
        per_row_raw += col.avg_bytes_raw
        per_row_compressed += col.avg_bytes_raw * (1 - col.compression_ratio)

    gib = 1024 ** 3  # bytes per GiB
    daily_raw_gb = (per_row_raw * est.rows_per_day) / gib
    daily_compressed_gb = (per_row_compressed * est.rows_per_day) / gib
    # Steady-state footprint once the retention window has filled up.
    total_gb = daily_compressed_gb * est.retention_days * est.replicas

    print(f" [{est.description}]")
    print(f" Rows/day: {est.rows_per_day:,} | Columns: {len(est.columns)}")
    print(f" Raw/row: {per_row_raw:.0f} bytes | Compressed/row: {per_row_compressed:.1f} bytes")
    print(f" Raw/day: {daily_raw_gb:.1f} GB | Compressed/day: {daily_compressed_gb:.2f} GB")
    print(f" Retention: {est.retention_days} days × {est.replicas} replicas")
    print(f" Total Storage: {total_gb:.1f} GB ({total_gb/1024:.2f} TB)")
    return total_gb

# Example: E-commerce Event Log
# Per-column raw sizes (bytes) and compression ratios, roughly following the
# data-type table above.
ecommerce_cols = [
    Column("event_id", "STRING", 36, 0.70),
    Column("timestamp", "TIMESTAMP", 8, 0.85),
    Column("user_id", "INT64", 8, 0.80),
    Column("event_type", "ENUM", 15, 0.95),
    Column("product_id", "INT64", 8, 0.75),
    Column("category", "ENUM", 20, 0.95),
    Column("price", "DOUBLE", 8, 0.40),
    Column("quantity", "INT32", 4, 0.80),
    Column("session_id", "STRING", 36, 0.65),
    Column("device", "ENUM", 10, 0.95),
    Column("country", "ENUM", 5, 0.97),
    Column("page_url", "STRING", 100, 0.75),
]

print("=== Storage Estimates ===\n")
e1 = StorageEstimate("E-commerce Events", 50_000_000, ecommerce_cols, 365, 2)
total = estimate_storage(e1)

# Cost estimation.
# S3 bills per GB-month, and `total` is the steady-state footprint in GB, so
# the monthly bill is simply total * rate.
# BUG FIX: the original divided by 12 (confusing GB-months with GB-years) and
# its f-string had lost the `${...}` placeholders, printing "S3 Cost: /month (/GB)".
s3_cost_per_gb = 0.023  # USD per GB-month (S3 Standard)
monthly_cost = total * s3_cost_per_gb
print(f"\n S3 Cost: ${monthly_cost:,.2f}/month (${s3_cost_per_gb}/GB)")

Row Group and Compression

# === Row Group Optimization ===

# PyArrow — Write optimized Parquet
# import pyarrow as pa
# import pyarrow.parquet as pq
#
# table = pa.Table.from_pandas(df)
# pq.write_table(table, 'data.parquet',
# compression='zstd',
# compression_level=3,
# row_group_size=1_000_000, # rows per group
# use_dictionary=True, # dictionary encoding
# write_statistics=True, # min/max stats for pushdown
# )

# Spark — Parquet settings
# spark.conf.set("spark.sql.parquet.compression.codec", "zstd")
# spark.conf.set("spark.sql.parquet.block.size", str(256 * 1024 * 1024))
# spark.conf.set("spark.sql.parquet.enableVectorizedReader", "true")
# spark.conf.set("spark.sql.parquet.filterPushdown", "true")

# DuckDB — Read Parquet efficiently
# SELECT product_id, SUM(price * quantity) as revenue
# FROM read_parquet('s3://data/events/year=2024/**/*.parquet',
# hive_partitioning=true)
# WHERE event_type = 'purchase'
# AND year = 2024 AND month = 6
# GROUP BY product_id
# ORDER BY revenue DESC
# LIMIT 100;

@dataclass
class CompressionBenchmark:
    """Trade-off summary for one Parquet compression codec."""
    codec: str
    ratio: str
    write_speed: str
    read_speed: str
    cpu_usage: str
    best_for: str


# (codec, ratio, write speed, read speed, CPU usage, recommended use)
_BENCHMARK_ROWS = [
    ("Snappy", "60-70%", "Fast (500 MB/s)", "Fast (800 MB/s)", "Low",
     "Default, balanced speed/compression"),
    ("ZSTD (level 3)", "75-85%", "Medium (300 MB/s)", "Fast (700 MB/s)", "Medium",
     "Best overall, good compression + speed"),
    ("ZSTD (level 9)", "80-90%", "Slow (100 MB/s)", "Fast (700 MB/s)", "High",
     "Cold storage, archive"),
    ("Gzip", "75-85%", "Slow (80 MB/s)", "Medium (400 MB/s)", "High",
     "Compatibility, legacy systems"),
    ("LZ4", "55-65%", "Very fast (700 MB/s)", "Very fast (1 GB/s)", "Very low",
     "Hot data, latency-sensitive"),
    ("Uncompressed", "0%", "Very fast", "Very fast", "None",
     "Temporary data, debugging"),
]

benchmarks = [CompressionBenchmark(*row) for row in _BENCHMARK_ROWS]

print("\n=== Compression Benchmarks ===")
for bench in benchmarks:
    print(f" [{bench.codec}] Ratio: {bench.ratio}")
    print(f" Write: {bench.write_speed} | Read: {bench.read_speed}")
    print(f" CPU: {bench.cpu_usage} | Best for: {bench.best_for}")

Partition and Lifecycle

# === Partition Strategy ===

@dataclass
class PartitionPlan:
    """Partitioning layout and compaction policy for one table."""
    table: str
    partition_by: str
    files_per_partition: str
    file_size: str
    compaction: str


plans = [
    PartitionPlan(table="events", partition_by="year/month/day",
                  files_per_partition="1-10 files", file_size="128 MB - 1 GB",
                  compaction="Daily compaction at 02:00 UTC"),
    PartitionPlan(table="users", partition_by="country",
                  files_per_partition="1-5 files", file_size="256 MB - 1 GB",
                  compaction="Weekly compaction"),
    PartitionPlan(table="transactions", partition_by="year/month",
                  files_per_partition="5-20 files", file_size="256 MB - 1 GB",
                  compaction="Monthly compaction"),
    PartitionPlan(table="logs", partition_by="year/month/day/hour",
                  files_per_partition="1-3 files", file_size="64 MB - 512 MB",
                  compaction="Hourly micro-batch, daily compaction"),
]

print("=== Partition Plans ===")
for plan in plans:
    print(f" [{plan.table}] Partition: {plan.partition_by}")
    print(f" Files: {plan.files_per_partition} | Size: {plan.file_size}")
    print(f" Compaction: {plan.compaction}")

# Lifecycle Management
# Lifecycle Management
@dataclass
class StorageTier:
    """One lifecycle tier: age window, S3 storage class, price, access pattern."""
    tier: str
    age: str
    storage: str
    cost_gb: float  # USD per GB-month
    access: str

tiers = [
    StorageTier("Hot", "0-30 days", "S3 Standard", 0.023, "Frequent queries"),
    StorageTier("Warm", "30-90 days", "S3 Infrequent Access", 0.0125, "Weekly queries"),
    StorageTier("Cold", "90-365 days", "S3 Glacier Instant", 0.004, "Monthly queries"),
    StorageTier("Archive", "1-7 years", "S3 Glacier Deep", 0.00099, "Compliance only"),
]

print("\n\n=== Storage Tiers ===")
for t in tiers:
    print(f" [{t.tier}] Age: {t.age} | Storage: {t.storage}")
    # BUG FIX: the original f-string had lost its `${t.cost_gb}` placeholder
    # and printed "Cost: /GB/month"; restore the per-GB price.
    print(f" Cost: ${t.cost_gb}/GB/month | Access: {t.access}")

เคล็ดลับ

Parquet Format คืออะไร

Apache Parquet Columnar Storage Column-oriented I/O Compression Snappy Gzip ZSTD LZ4 Predicate Pushdown Schema Spark Hive Presto DuckDB Pandas

Capacity Planning ทำอย่างไร

Raw Size Compression Ratio Data Type Algorithm Growth Rate Partition Retention Hot Warm Cold Storage Cost Cloud Provider

Row Group Size ตั้งเท่าไหร่ดี

Default 128 MB Analytical 256 MB - 1 GB ใหญ่ Compression ดี Sequential เร็ว เล็ก Random Access Memory Spark PyArrow Schema Columns

Partition Strategy ควรเป็นอย่างไร

Filter Date Region Category Hive-style ไม่เกิน 10000 Partition 128 MB - 1 GB Compaction รวมไฟล์ Partition Pruning Engine

สรุป

Parquet Format Capacity Planning Columnar Storage ZSTD Compression Row Group Partition Strategy Data Lake S3 Lifecycle Hot Warm Cold Production

📖 บทความที่เกี่ยวข้อง

Parquet Format Automation Scriptอ่านบทความ → Parquet Format Zero Downtime Deploymentอ่านบทความ → Parquet Format Progressive Deliveryอ่านบทความ → Parquet Format Database Migrationอ่านบทความ → Parquet Format Cost Optimization ลดค่าใช้จ่ายอ่านบทความ →

📚 ดูบทความทั้งหมด →