What Is Parquet Format Open Source Contribution?
Apache Parquet is a columnar storage format for big data, designed for efficient data storage and retrieval. It supports complex nested data structures and a variety of compression algorithms. Parquet is an open-source project under the Apache Software Foundation. Contributing to the Parquet project strengthens the entire big data ecosystem, because Parquet is used by Spark, Hive, Presto, Pandas, DuckDB, and many other tools. This article explains how to contribute to the Apache Parquet project, from first steps to advanced contributions.
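Before diving into the internals, it helps to see how little code a Parquet round trip takes. A minimal sketch, assuming pyarrow is installed (the filename is illustrative):
# quick_start.py — a minimal sketch of a Parquet round trip (filename is illustrative)
import pyarrow as pa
import pyarrow.parquet as pq

# Build a small in-memory table
table = pa.table({
    "user_id": [1, 2, 3],
    "country": ["TH", "US", "JP"],
    "score": [0.91, 0.45, 0.78],
})

# Write it as Parquet with Snappy compression (pyarrow's default codec)
pq.write_table(table, "example.parquet", compression="snappy")

# Read back only one column; the columnar layout makes this cheap
scores = pq.read_table("example.parquet", columns=["score"])
print(scores.to_pydict())  # {'score': [0.91, 0.45, 0.78]}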
Parquet Format Fundamentals
# parquet_basics.py — Parquet format fundamentals

class ParquetBasics:
    FEATURES = {
        "columnar": {
            "name": "Columnar Storage",
            "description": "Stores data column-oriented, so queries read only the columns they need",
            "benefit": "10-100x faster than row-based formats for analytical workloads",
        },
        "compression": {
            "name": "Efficient Compression",
            "description": "Supports Snappy, Gzip, LZ4, Zstd, and Brotli, compressed per column",
            "benefit": "50-90% smaller than CSV/JSON, which saves storage cost",
        },
        "encoding": {
            "name": "Column Encoding",
            "description": "Dictionary, RLE, Delta, and bit-packing encodings, chosen per data type",
            "benefit": "Shrinks data further on top of compression, for efficient I/O",
        },
        "schema": {
            "name": "Self-describing Schema",
            "description": "The schema is embedded in the file, so no external schema registry is needed",
            "benefit": "Schema evolution is supported: columns can be added or renamed",
        },
        "predicate_pushdown": {
            "name": "Predicate Pushdown",
            "description": "Filters data at the storage level, skipping row groups that cannot match",
            "benefit": "Less data read means faster queries",
        },
    }
    FILE_STRUCTURE = {
        "header": "Magic number (PAR1), 4 bytes",
        "row_groups": "Data divided into row groups (128MB default), each independently readable",
        "column_chunks": "Each row group contains column chunks, one per column",
        "pages": "Column chunks divided into pages (1MB default), the unit of compression/encoding",
        "footer": "Schema + metadata + column statistics; readers parse the footer first to plan the query",
    }

    def show_features(self):
        print("=== Parquet Features ===\n")
        for key, feat in self.FEATURES.items():
            print(f"[{feat['name']}]")
            print(f"  {feat['description']}")
            print(f"  Benefit: {feat['benefit']}")
            print()

    def show_structure(self):
        print("=== File Structure ===")
        for part, desc in self.FILE_STRUCTURE.items():
            print(f"  [{part}] {desc}")

basics = ParquetBasics()
basics.show_features()
basics.show_structure()
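The structure above can be checked directly against a real file: every Parquet file begins and ends with the 4-byte ASCII magic PAR1, and readers parse the footer before touching any data pages. A small sketch, assuming the example.parquet file from the earlier sketch exists:
# structure_check.py — a small sketch verifying Parquet file structure
# (assumes example.parquet from the quick-start sketch exists)
import pyarrow.parquet as pq

path = "example.parquet"

# The PAR1 magic number appears at both ends of the file
with open(path, "rb") as f:
    head = f.read(4)
    f.seek(-4, 2)  # whence=2: seek relative to end of file
    tail = f.read(4)
print("header magic:", head, "footer magic:", tail)  # both b'PAR1'

# The footer carries the schema, row-group layout, and column statistics,
# so readers can plan a scan without reading any data pages
meta = pq.ParquetFile(path).metadata
print("row groups:", meta.num_row_groups, "rows:", meta.num_rows)
print(meta.row_group(0).column(0).statistics)  # min/max drive row-group skipping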
Getting Started with Contributions
# contributing.py — How to contribute to Apache Parquet

class ContributingGuide:
    REPOSITORIES = {
        "parquet_format": {
            "name": "apache/parquet-format",
            "language": "Thrift/Markdown",
            "description": "Format specification: Thrift definitions and documentation",
            "url": "github.com/apache/parquet-format",
        },
        "parquet_java": {
            "name": "apache/parquet-java",
            "language": "Java",
            "description": "Java implementation, used by Spark, Hive, Presto",
            "url": "github.com/apache/parquet-java",
        },
        "arrow": {
            "name": "apache/arrow (includes parquet-cpp)",
            "language": "C++/Python",
            "description": "C++ implementation + Python bindings (pyarrow)",
            "url": "github.com/apache/arrow",
        },
        "parquet_rs": {
            "name": "apache/arrow-rs (parquet crate)",
            "language": "Rust",
            "description": "Rust implementation, used by DataFusion and Polars",
            "url": "github.com/apache/arrow-rs",
        },
    }
    STEPS = {
        "step1": {
            "name": "1. Start with Good First Issues",
            "description": "Look for issues tagged 'good first issue' or 'beginner' on GitHub",
            "tip": "Begin with documentation fixes, typos, and test improvements",
        },
        "step2": {
            "name": "2. Set Up the Development Environment",
            "description": "Clone the repo, set up the build tools, run the tests locally",
            "tip": "Java: Maven/Gradle, C++: CMake, Rust: Cargo, Python: pip install -e .",
        },
        "step3": {
            "name": "3. Understand the Codebase",
            "description": "Read the format spec, study existing PRs, review other people's code",
            "tip": "Start by reviewing PRs to learn the code style and the process",
        },
        "step4": {
            "name": "4. Submit Your First PR",
            "description": "Fork → branch → change → test → PR → code review",
            "tip": "Small PRs beat big PRs: they are much easier to review",
        },
        "step5": {
            "name": "5. Engage with the Community",
            "description": "Subscribe to the dev mailing list, join Slack, attend community meetings",
            "tip": "Apache projects run on mailing lists; subscribe to dev@parquet.apache.org",
        },
    }

    def show_repos(self):
        print("=== Parquet Repositories ===\n")
        for key, repo in self.REPOSITORIES.items():
            print(f"[{repo['name']}] ({repo['language']})")
            print(f"  {repo['description']}")
            print()

    def show_steps(self):
        print("=== Contributing Steps ===")
        for key, step in self.STEPS.items():
            print(f"\n[{step['name']}]")
            print(f"  {step['description']}")
            print(f"  Tip: {step['tip']}")

guide = ContributingGuide()
guide.show_repos()
guide.show_steps()
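For step 4, a PR (or the issue behind it) lands much faster when it ships with a minimal reproduction script. A sketch of the shape such a script takes; the timestamp round trip here is a hypothetical example for illustration, not a known bug:
# repro.py — a minimal reproduction script template (hypothetical case, for illustration)
import pyarrow as pa
import pyarrow.parquet as pq

print("pyarrow version:", pa.__version__)  # always state versions in a bug report

# Smallest possible input that triggers the behavior under discussion
table = pa.table({"ts": pa.array([0, None, 2], type=pa.timestamp("ns"))})
pq.write_table(table, "repro.parquet")
result = pq.read_table("repro.parquet")

# State expected vs. actual behavior explicitly
assert result.equals(table), "round-trip changed the data"
print("round-trip OK")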
Python Parquet Tools
# python_tools.py — Python tools for working with Parquet
class PythonParquetTools:
CODE = """
# parquet_tools.py — Python tools for Parquet analysis and contribution
import pyarrow as pa
import pyarrow.parquet as pq
import json
from pathlib import Path
class ParquetAnalyzer:
'''Analyze Parquet files — useful for debugging and contributing'''
def __init__(self, file_path):
self.file_path = file_path
self.pf = pq.ParquetFile(file_path)
def metadata(self):
'''Get file metadata'''
meta = self.pf.metadata
return {
'format_version': meta.format_version,
'created_by': meta.created_by,
'num_columns': meta.num_columns,
'num_rows': meta.num_rows,
'num_row_groups': meta.num_row_groups,
'serialized_size': meta.serialized_size,
'file_size_mb': round(Path(self.file_path).stat().st_size / 1e6, 2),
}
def schema_info(self):
'''Get detailed schema information'''
schema = self.pf.schema_arrow
columns = []
for i, field in enumerate(schema):
columns.append({
'name': field.name,
'type': str(field.type),
'nullable': field.nullable,
'metadata': dict(field.metadata) if field.metadata else None,
})
return {
'num_columns': len(columns),
'columns': columns,
'pandas_metadata': schema.pandas_metadata,
}
def row_group_stats(self):
'''Get statistics for each row group'''
meta = self.pf.metadata
groups = []
for i in range(meta.num_row_groups):
rg = meta.row_group(i)
columns = []
for j in range(rg.num_columns):
col = rg.column(j)
columns.append({
'name': col.path_in_schema,
'compression': col.compression,
'encodings': str(col.encodings),
'total_compressed_size': col.total_compressed_size,
'total_uncompressed_size': col.total_uncompressed_size,
'compression_ratio': round(
col.total_uncompressed_size / max(col.total_compressed_size, 1), 1
),
'has_statistics': col.is_stats_set,
})
groups.append({
'row_group_index': i,
'num_rows': rg.num_rows,
'total_byte_size': rg.total_byte_size,
'columns': columns,
})
return groups
def compression_report(self):
'''Generate compression efficiency report'''
groups = self.row_group_stats()
total_compressed = 0
total_uncompressed = 0
for group in groups:
for col in group['columns']:
total_compressed += col['total_compressed_size']
total_uncompressed += col['total_uncompressed_size']
return {
'total_compressed_mb': round(total_compressed / 1e6, 2),
'total_uncompressed_mb': round(total_uncompressed / 1e6, 2),
'overall_ratio': round(total_uncompressed / max(total_compressed, 1), 1),
'space_saved_pct': round((1 - total_compressed / max(total_uncompressed, 1)) * 100, 1),
}
class ParquetWriter:
'''Write Parquet files with various configurations — for testing'''
@staticmethod
    def write_with_options(df, output_path, compression='snappy',
                           row_group_size=1_000_000):
        # NOTE: pyarrow's row_group_size is a maximum row count, not bytes;
        # the 128MB row-group figure above is parquet-java's byte-based default
table = pa.Table.from_pandas(df)
pq.write_table(
table, output_path,
compression=compression,
row_group_size=row_group_size,
use_dictionary=True,
write_statistics=True,
)
return ParquetAnalyzer(output_path).metadata()
# analyzer = ParquetAnalyzer("data.parquet")
# print(json.dumps(analyzer.metadata(), indent=2))
# print(json.dumps(analyzer.compression_report(), indent=2))
"""
def show_code(self):
print("=== Parquet Analyzer ===")
print(self.CODE[:600])
tools = PythonParquetTools()
tools.show_code()
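To drive the analyzer end to end, generate a small file and inspect it. A quick sketch, assuming the ParquetAnalyzer code above has been saved as parquet_tools.py and pandas is installed (column names are illustrative):
# analyzer_demo.py — a sketch driving ParquetAnalyzer end to end (names are illustrative)
import json
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

# Generate a small file with repetitive data so dictionary encoding pays off
df = pd.DataFrame({
    "country": ["TH", "US", "JP"] * 10_000,
    "value": range(30_000),
})
pq.write_table(pa.Table.from_pandas(df), "demo.parquet", compression="zstd")

# Assumes the analyzer code above is saved as parquet_tools.py on the path
from parquet_tools import ParquetAnalyzer

analyzer = ParquetAnalyzer("demo.parquet")
print(json.dumps(analyzer.metadata(), indent=2))
print(json.dumps(analyzer.compression_report(), indent=2))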
Contribution Areas
# areas.py — Areas to contribute to Parquet
class ContributionAreas:
AREAS = {
"documentation": {
"name": "Documentation",
"difficulty": "Easy",
"examples": ["Fix typos in spec", "Add examples", "Improve API docs", "Write tutorials"],
"impact": "High — ช่วย new users เข้าใจ format ง่ายขึ้น",
},
"testing": {
"name": "Testing",
"difficulty": "Easy-Medium",
"examples": ["Add unit tests", "Add integration tests", "Improve test coverage", "Fuzz testing"],
"impact": "High — ป้องกัน regressions, improve reliability",
},
"performance": {
"name": "Performance Optimization",
"difficulty": "Medium-Hard",
"examples": ["Optimize encoding", "Improve compression", "Reduce memory usage", "Parallel I/O"],
"impact": "Very High — affects all users of Parquet ecosystem",
},
"new_features": {
"name": "New Features",
"difficulty": "Hard",
"examples": ["New encoding types", "New compression codecs", "Bloom filter improvements", "Encryption"],
"impact": "Very High — requires format spec change + multi-language implementation",
},
"bug_fixes": {
"name": "Bug Fixes",
"difficulty": "Medium",
"examples": ["Fix edge cases", "Fix compatibility issues", "Fix memory leaks"],
"impact": "High — directly helps users experiencing issues",
},
}
def show_areas(self):
print("=== Contribution Areas ===\n")
for key, area in self.AREAS.items():
print(f"[{area['name']}] (Difficulty: {area['difficulty']})")
print(f" Impact: {area['impact']}")
for ex in area['examples'][:3]:
print(f" • {ex}")
print()
areas = ContributionAreas()
areas.show_areas()
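Of these, performance is the easiest area to start probing locally: comparing compression codecs on your own data is a one-file experiment. A rough sketch, assuming pyarrow is installed; sizes and timings will vary by dataset:
# codec_bench.py — a rough sketch comparing compression codecs (results vary by data)
import os
import time
import pyarrow as pa
import pyarrow.parquet as pq

# Repetitive text data, favorable to dictionary encoding and compression
table = pa.table({"text": [f"row-{i % 100}" for i in range(200_000)]})

for codec in ["snappy", "gzip", "zstd", "lz4"]:
    path = f"bench_{codec}.parquet"
    start = time.perf_counter()
    pq.write_table(table, path, compression=codec)
    elapsed = time.perf_counter() - start
    size_kb = os.path.getsize(path) / 1024
    print(f"{codec:>8}: {size_kb:8.1f} KB written in {elapsed * 1000:6.1f} ms")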
Open Source Best Practices
# best_practices.py — Open source contribution best practices
class OpenSourceBestPractices:
PRACTICES = {
"communication": {
"name": "Communication First",
"rules": [
"อ่าน CONTRIBUTING.md ก่อนเริ่ม",
"Comment บน issue ก่อนเริ่มทำ — announce intent",
"ถาม dev mailing list ถ้าไม่แน่ใจ design decision",
"Be patient — Apache projects มี review process ที่ thorough",
],
},
"code_quality": {
"name": "Code Quality",
"rules": [
"Follow existing code style — ไม่ reformat code ที่ไม่เกี่ยว",
"Write tests สำหรับ every change",
"Keep PRs small and focused — 1 PR = 1 logical change",
"Write clear commit messages",
],
},
"review_process": {
"name": "Review Process",
"rules": [
"Respond to review comments promptly",
"Review PRs ของคนอื่น — build trust with maintainers",
"Don't take review feedback personally — it's about code quality",
"Squash commits before merge (if required)",
],
},
}
def show_practices(self):
print("=== Best Practices ===\n")
for key, p in self.PRACTICES.items():
print(f"[{p['name']}]")
for rule in p['rules'][:3]:
print(f" • {rule}")
print()
bp = OpenSourceBestPractices()
bp.show_practices()
FAQ - Frequently Asked Questions
Q: Do I need to be good at Java/C++ to contribute?
A: Not necessarily. There are implementations in several languages: Python (pyarrow) if you know Python; Rust (arrow-rs), where the Parquet crate is growing fast; documentation, which needs no code at all, just doc fixes and tutorials; and testing, where writing test cases teaches you the format as you go. Start with the language you know best, then expand to others later.
Q: Which is better, Parquet or ORC?
A: Parquet has the wider ecosystem (Spark, Pandas, DuckDB, Arrow), a self-describing schema, and good nested data support. ORC is optimized for Hive, suits highly structured data, and backs ACID support in Hive. Today Parquet is the de facto standard for data engineering and is more widely used, so contributing to Parquet has broader impact: more tools depend on it.
Q: Is the Apache process complicated?
A: It is structured: you sign a CLA (Contributor License Agreement) before your first PR, code review is done by committers and can take one to two weeks, and the mailing list is the official communication channel. The upside: you learn world-class software engineering and build a network of experts. Start with good first issues and reviewing other people's PRs to build credibility.
Q: What do I get out of contributing?
A: Skills: data engineering, distributed systems, and file format design. Portfolio: verified GitHub contributions, which carry more weight than personal projects. Network: you get to know engineers from Databricks, Google, Meta, and Apple. Career: Apache committer/PMC status is respected in the industry. Impact: the code you write is used by millions of users worldwide.
