SiamCafe.net Blog
Technology

Parquet Format Capacity Planning

parquet format capacity planning
Parquet Format Capacity Planning | SiamCafe Blog
2025-07-12· อ. บอม — SiamCafe.net· 9,304 คำ

Parquet Capacity Planning

Parquet Format Capacity Planning Columnar Storage Compression Row Group Partition Strategy Data Lake Storage Estimation Production

| Data Type | Raw Size/Row | Parquet (Snappy) | Parquet (ZSTD) | Compression Ratio |
|---|---|---|---|---|
| Integer (INT64) | 8 bytes | 1-2 bytes | 0.8-1.5 bytes | 75-90% |
| Float (DOUBLE) | 8 bytes | 4-6 bytes | 3-5 bytes | 25-60% |
| String (avg 50 chars) | 50 bytes | 10-20 bytes | 8-15 bytes | 60-85% |
| Boolean | 1 byte | 0.05-0.1 bytes | 0.03-0.08 bytes | 90-97% |
| Timestamp | 8 bytes | 1-3 bytes | 0.8-2 bytes | 62-90% |
| Enum/Category | 20 bytes | 1-3 bytes | 0.5-2 bytes | 85-97% |

Storage Estimation

# === Parquet Storage Calculator ===

from dataclasses import dataclass

@dataclass
class Column:
    """One schema column: its raw width and expected Parquet compression ratio."""
    # Display name of the column.
    name: str
    # Logical type label (e.g. "STRING", "INT64") — informational only.
    dtype: str
    # Average uncompressed size of one value, in bytes.
    avg_bytes_raw: float
    # Fraction of the raw size removed by compression, in [0.0, 1.0]
    # (0.95 means the compressed value is 5% of its raw size).
    compression_ratio: float

@dataclass
class StorageEstimate:
    """Inputs for one storage-sizing scenario: volume, schema, retention, replication."""
    # Human-readable scenario label, used in the printed report.
    description: str
    # Ingest volume in rows per day.
    rows_per_day: int
    # Table schema as a list of Column objects.
    columns: list
    # How many days of data are kept.
    retention_days: int
    # Storage replication factor (total bytes are multiplied by this).
    replicas: int

def estimate_storage(est):
    """Print a storage breakdown for *est* and return the total compressed size in GB.

    The total includes every retained day and every replica. Sizes are derived
    from each column's average raw width and its compression ratio.
    """
    GIB = 1024 ** 3
    bytes_raw = 0.0
    bytes_packed = 0.0
    for col in est.columns:
        bytes_raw += col.avg_bytes_raw
        # compression_ratio is the fraction *removed*, so (1 - ratio) remains.
        bytes_packed += col.avg_bytes_raw * (1 - col.compression_ratio)

    daily_raw_gb = bytes_raw * est.rows_per_day / GIB
    daily_packed_gb = bytes_packed * est.rows_per_day / GIB
    grand_total_gb = daily_packed_gb * est.retention_days * est.replicas

    print(f"  [{est.description}]")
    print(f"    Rows/day: {est.rows_per_day:,} | Columns: {len(est.columns)}")
    print(f"    Raw/row: {bytes_raw:.0f} bytes | Compressed/row: {bytes_packed:.1f} bytes")
    print(f"    Raw/day: {daily_raw_gb:.1f} GB | Compressed/day: {daily_packed_gb:.2f} GB")
    print(f"    Retention: {est.retention_days} days × {est.replicas} replicas")
    print(f"    Total Storage: {grand_total_gb:.1f} GB ({grand_total_gb/1024:.2f} TB)")
    return grand_total_gb

# Example: E-commerce Event Log
# Per-column raw widths (bytes) and compression ratios for a typical
# clickstream table; values follow the data-type table above.
ecommerce_cols = [
    Column("event_id", "STRING", 36, 0.70),
    Column("timestamp", "TIMESTAMP", 8, 0.85),
    Column("user_id", "INT64", 8, 0.80),
    Column("event_type", "ENUM", 15, 0.95),
    Column("product_id", "INT64", 8, 0.75),
    Column("category", "ENUM", 20, 0.95),
    Column("price", "DOUBLE", 8, 0.40),
    Column("quantity", "INT32", 4, 0.80),
    Column("session_id", "STRING", 36, 0.65),
    Column("device", "ENUM", 10, 0.95),
    Column("country", "ENUM", 5, 0.97),
    Column("page_url", "STRING", 100, 0.75),
]

print("=== Storage Estimates ===\n")
e1 = StorageEstimate("E-commerce Events", 50_000_000, ecommerce_cols, 365, 2)
total = estimate_storage(e1)  # total compressed GB across retention × replicas

# Cost estimation — S3 Standard is billed per GB *per month*, so the monthly
# bill is the stored volume times the per-GB-month rate.
# FIX: the original computed `(total / 12) * rate`, which treated the rate as
# per GB-year and under-billed 12×, and its print had lost the interpolations
# entirely ("S3 Cost: /month (/GB)").
s3_cost_per_gb = 0.023  # USD per GB-month, S3 Standard
monthly_cost = total * s3_cost_per_gb
print(f"\n  S3 Cost: ${monthly_cost:,.2f}/month (${s3_cost_per_gb}/GB)")

Row Group and Compression

# === Row Group Optimization ===

# PyArrow — Write optimized Parquet
# import pyarrow as pa
# import pyarrow.parquet as pq
#
# table = pa.Table.from_pandas(df)
# pq.write_table(table, 'data.parquet',
#     compression='zstd',
#     compression_level=3,
#     row_group_size=1_000_000,     # rows per group
#     use_dictionary=True,           # dictionary encoding
#     write_statistics=True,         # min/max stats for pushdown
# )

# Spark — Parquet settings
# spark.conf.set("spark.sql.parquet.compression.codec", "zstd")
# spark.conf.set("spark.sql.parquet.block.size", str(256 * 1024 * 1024))
# spark.conf.set("spark.sql.parquet.enableVectorizedReader", "true")
# spark.conf.set("spark.sql.parquet.filterPushdown", "true")

# DuckDB — Read Parquet efficiently
# SELECT product_id, SUM(price * quantity) as revenue
# FROM read_parquet('s3://data/events/year=2024/**/*.parquet',
#     hive_partitioning=true)
# WHERE event_type = 'purchase'
#   AND year = 2024 AND month = 6
# GROUP BY product_id
# ORDER BY revenue DESC
# LIMIT 100;

@dataclass
class CompressionBenchmark:
    """Ballpark characteristics of one Parquet compression codec."""
    # Codec name, e.g. "Snappy" or "ZSTD (level 3)".
    codec: str
    # Typical size reduction versus raw data, as a text range.
    ratio: str
    # Approximate compression (write) throughput.
    write_speed: str
    # Approximate decompression (read) throughput.
    read_speed: str
    # Relative CPU cost while (de)compressing.
    cpu_usage: str
    # Recommended workload for this codec.
    best_for: str

# Reference numbers for common Parquet codecs (approximate, order-of-magnitude).
benchmarks = [
    CompressionBenchmark(codec="Snappy", ratio="60-70%",
        write_speed="Fast (500 MB/s)", read_speed="Fast (800 MB/s)",
        cpu_usage="Low", best_for="Default, balanced speed/compression"),
    CompressionBenchmark(codec="ZSTD (level 3)", ratio="75-85%",
        write_speed="Medium (300 MB/s)", read_speed="Fast (700 MB/s)",
        cpu_usage="Medium", best_for="Best overall, good compression + speed"),
    CompressionBenchmark(codec="ZSTD (level 9)", ratio="80-90%",
        write_speed="Slow (100 MB/s)", read_speed="Fast (700 MB/s)",
        cpu_usage="High", best_for="Cold storage, archive"),
    CompressionBenchmark(codec="Gzip", ratio="75-85%",
        write_speed="Slow (80 MB/s)", read_speed="Medium (400 MB/s)",
        cpu_usage="High", best_for="Compatibility, legacy systems"),
    CompressionBenchmark(codec="LZ4", ratio="55-65%",
        write_speed="Very fast (700 MB/s)", read_speed="Very fast (1 GB/s)",
        cpu_usage="Very low", best_for="Hot data, latency-sensitive"),
    CompressionBenchmark(codec="Uncompressed", ratio="0%",
        write_speed="Very fast", read_speed="Very fast",
        cpu_usage="None", best_for="Temporary data, debugging"),
]

print("\n=== Compression Benchmarks ===")
for bench in benchmarks:
    print(f"  [{bench.codec}] Ratio: {bench.ratio}")
    print(f"    Write: {bench.write_speed} | Read: {bench.read_speed}")
    print(f"    CPU: {bench.cpu_usage} | Best for: {bench.best_for}")

Partition and Lifecycle

# === Partition Strategy ===

@dataclass
class PartitionPlan:
    """Partitioning and compaction policy for one table."""
    # Table name.
    table: str
    # Hive-style partition key path, e.g. "year/month/day".
    partition_by: str
    # Expected file count per partition, as a text range.
    files_per_partition: str
    # Target size range per file.
    file_size: str
    # Compaction schedule description.
    compaction: str

# Illustrative production partition plans, one per table.
plans = [
    PartitionPlan(table="events", partition_by="year/month/day",
        files_per_partition="1-10 files", file_size="128 MB - 1 GB",
        compaction="Daily compaction at 02:00 UTC"),
    PartitionPlan(table="users", partition_by="country",
        files_per_partition="1-5 files", file_size="256 MB - 1 GB",
        compaction="Weekly compaction"),
    PartitionPlan(table="transactions", partition_by="year/month",
        files_per_partition="5-20 files", file_size="256 MB - 1 GB",
        compaction="Monthly compaction"),
    PartitionPlan(table="logs", partition_by="year/month/day/hour",
        files_per_partition="1-3 files", file_size="64 MB - 512 MB",
        compaction="Hourly micro-batch, daily compaction"),
]

print("=== Partition Plans ===")
for plan in plans:
    print(f"  [{plan.table}] Partition: {plan.partition_by}")
    print(f"    Files: {plan.files_per_partition} | Size: {plan.file_size}")
    print(f"    Compaction: {plan.compaction}")

# Lifecycle Management
@dataclass
class StorageTier:
    """One lifecycle tier in the hot/warm/cold/archive storage policy."""
    # Tier label ("Hot", "Warm", "Cold", "Archive").
    tier: str
    # Data age range that lands in this tier.
    age: str
    # Backing storage class, e.g. "S3 Standard".
    storage: str
    # Price in USD per GB per month.
    cost_gb: float
    # Expected access pattern at this tier.
    access: str

# Lifecycle tiers: age thresholds, S3 storage class, and per-GB-month price.
tiers = [
    StorageTier("Hot", "0-30 days", "S3 Standard", 0.023, "Frequent queries"),
    StorageTier("Warm", "30-90 days", "S3 Infrequent Access", 0.0125, "Weekly queries"),
    StorageTier("Cold", "90-365 days", "S3 Glacier Instant", 0.004, "Monthly queries"),
    StorageTier("Archive", "1-7 years", "S3 Glacier Deep", 0.00099, "Compliance only"),
]

# FIX: the cost line had lost its interpolation ("Cost: /GB/month"); restore
# the per-GB-month price. Also drop the needless f-prefix on the header.
print("\n\n=== Storage Tiers ===")
for t in tiers:
    print(f"  [{t.tier}] Age: {t.age} | Storage: {t.storage}")
    print(f"    Cost: ${t.cost_gb}/GB/month | Access: {t.access}")

เคล็ดลับ

Parquet Format คืออะไร

Apache Parquet is a column-oriented (columnar) storage format: reading only the columns a query needs cuts I/O, and the format supports compression codecs such as Snappy, Gzip, ZSTD, and LZ4, predicate pushdown via column statistics, and an embedded schema. It is supported by Spark, Hive, Presto, DuckDB, and Pandas.

Capacity Planning ทำอย่างไร

Start from the raw size per row and the compression ratio expected for each data type and algorithm, apply the projected growth rate, then factor in partitioning, retention, and hot/warm/cold tiering to estimate storage cost on your cloud provider.

Row Group Size ตั้งเท่าไหร่ดี

The default row group size is 128 MB; for analytical workloads 256 MB – 1 GB is common. Larger row groups compress better and scan faster sequentially, while smaller ones favor random access and lower memory pressure — tune per engine (Spark, PyArrow) and per schema/column count.

Partition Strategy ควรเป็นอย่างไร

Partition by the columns queries filter on most (date, region, category) using Hive-style paths, keep the table under roughly 10,000 partitions with files of 128 MB – 1 GB, and run compaction to merge small files so the query engine can benefit from partition pruning.

สรุป

Parquet Format Capacity Planning Columnar Storage ZSTD Compression Row Group Partition Strategy Data Lake S3 Lifecycle Hot Warm Cold Production

📖 บทความที่เกี่ยวข้อง

Parquet Format Automation Scriptอ่านบทความ → Parquet Format Zero Downtime Deploymentอ่านบทความ → Parquet Format Progressive Deliveryอ่านบทความ → Parquet Format Database Migrationอ่านบทความ → Parquet Format Cost Optimization ลดค่าใช้จ่ายอ่านบทความ →

📚 ดูบทความทั้งหมด →