Technology

data science life cycle คือ

data science life cycle คือ
data science life cycle คือ | SiamCafe Blog
2025-09-20· อ. บอม — SiamCafe.net· 11,598 คำ

Data Science Life Cycle

Data Science Life Cycle CRISP-DM Business Understanding Data Collection EDA Modeling Evaluation Deployment Monitoring Python Pandas Scikit-learn MLflow

| ขั้นตอน | สัดส่วนเวลา | เครื่องมือหลัก | Output |
|---|---|---|---|
| Business Understanding | 10% | Meeting, Docs | Problem Statement |
| Data Collection | 10% | SQL, API, Web Scraping | Raw Dataset |
| Data Preparation | 60% | Pandas, NumPy | Clean Dataset |
| Modeling | 10% | Scikit-learn, XGBoost | Trained Model |
| Evaluation | 5% | Metrics, Cross-validation | Performance Report |
| Deployment | 5% | Docker, FastAPI, MLflow | Production API |

Data Collection และ EDA

# === Data Collection & EDA ===

# import pandas as pd
# import numpy as np
# import matplotlib.pyplot as plt
# import seaborn as sns
#
# # Data Collection
# # From CSV
# df = pd.read_csv('sales_data.csv')
#
# # From SQL
# import sqlalchemy
# engine = sqlalchemy.create_engine('postgresql://user:pass@host/db')
# df = pd.read_sql('SELECT * FROM orders WHERE date > %s', engine,
#                   params=['2024-01-01'])
#
# # From API
# import requests
# response = requests.get('https://api.example.com/data')
# df = pd.DataFrame(response.json())
#
# # EDA — Exploratory Data Analysis
# print(df.shape)           # (10000, 15)
# print(df.dtypes)          # Column types
# print(df.describe())      # Statistics
# print(df.isnull().sum())  # Missing values
# print(df.duplicated().sum())  # Duplicates
#
# # Visualization
# fig, axes = plt.subplots(2, 2, figsize=(12, 10))
# df['revenue'].hist(ax=axes[0,0], bins=50)
# axes[0,0].set_title('Revenue Distribution')
# sns.boxplot(data=df, x='category', y='revenue', ax=axes[0,1])
# sns.heatmap(df.corr(numeric_only=True), annot=True, ax=axes[1,0])
# df.groupby('month')['revenue'].sum().plot(ax=axes[1,1])
# plt.tight_layout()
# plt.savefig('eda_report.png')

from dataclasses import dataclass
from typing import List


@dataclass
class DatasetInfo:
    """Summary statistics describing one source dataset."""
    name: str           # dataset identifier
    rows: int           # number of rows
    columns: int        # number of columns
    missing_pct: float  # percentage of missing cells
    duplicates: int     # count of duplicated rows
    size_mb: float      # approximate size in megabytes


# Raw rows: (name, rows, columns, missing_pct, duplicates, size_mb)
_DATASET_ROWS = (
    ("sales_orders", 500000, 15, 2.3, 150, 85.5),
    ("customer_profiles", 100000, 25, 5.1, 0, 45.2),
    ("product_catalog", 8500, 12, 0.5, 10, 3.1),
    ("web_analytics", 2000000, 20, 1.2, 5000, 320.0),
    ("support_tickets", 50000, 18, 8.5, 200, 25.6),
)
datasets = [DatasetInfo(*row) for row in _DATASET_ROWS]

print("=== Dataset Overview ===")
for info in datasets:
    print(f"  [{info.name}] {info.rows:,} rows x {info.columns} cols ({info.size_mb} MB)")
    print(f"    Missing: {info.missing_pct}% | Duplicates: {info.duplicates:,}")

Data Preparation และ Modeling

# === Data Preparation & Modeling ===

# Data Cleaning
# df = df.drop_duplicates()
# df['email'] = df['email'].str.lower().str.strip()
# df['date'] = pd.to_datetime(df['date'], errors='coerce')
#
# # Handle Missing Values
# df['age'] = df['age'].fillna(df['age'].median())
# df['category'] = df['category'].fillna('unknown')
# df = df.dropna(subset=['customer_id'])
#
# # Feature Engineering
# df['order_month'] = df['date'].dt.month
# df['is_weekend'] = df['date'].dt.dayofweek >= 5
# df['total_orders'] = df.groupby('customer_id')['order_id'].transform('count')
# df['avg_order_value'] = df.groupby('customer_id')['amount'].transform('mean')
#
# # Encoding & Scaling
# from sklearn.preprocessing import LabelEncoder, StandardScaler
# le = LabelEncoder()
# df['category_encoded'] = le.fit_transform(df['category'])
# scaler = StandardScaler()
# df[['amount_scaled', 'age_scaled']] = scaler.fit_transform(df[['amount', 'age']])

# Modeling
# from sklearn.model_selection import train_test_split, cross_val_score
# from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# from sklearn.metrics import classification_report, roc_auc_score
#
# X = df[features]
# y = df['target']
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
#
# models = {
#     'RF': RandomForestClassifier(n_estimators=100, random_state=42),
#     'GBM': GradientBoostingClassifier(n_estimators=100, random_state=42),
# }
#
# for name, model in models.items():
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)
#     auc = roc_auc_score(y_test, model.predict_proba(X_test)[:,1])
#     print(f"{name}: AUC={auc:.4f}")
#     print(classification_report(y_test, y_pred))

@dataclass
class ModelResult:
    """Evaluation metrics for one trained candidate model."""
    model: str        # human-readable model name
    accuracy: float   # accuracy on the test set
    precision: float  # precision on the test set
    recall: float     # recall on the test set
    f1: float         # F1 score on the test set
    auc: float        # ROC AUC on the test set
    train_time: str   # wall-clock training time, e.g. "15s"


# Raw rows: (model, accuracy, precision, recall, f1, auc, train_time)
_MODEL_ROWS = (
    ("Logistic Regression", 0.82, 0.80, 0.78, 0.79, 0.85, "2s"),
    ("Random Forest", 0.88, 0.87, 0.85, 0.86, 0.92, "15s"),
    ("XGBoost", 0.90, 0.89, 0.88, 0.88, 0.94, "30s"),
    ("LightGBM", 0.91, 0.90, 0.89, 0.89, 0.95, "10s"),
    ("Neural Network", 0.89, 0.88, 0.87, 0.87, 0.93, "120s"),
)
results = [ModelResult(*row) for row in _MODEL_ROWS]

print("\n=== Model Comparison ===")
for res in results:
    print(f"  [{res.model}] AUC: {res.auc} | F1: {res.f1} | Train: {res.train_time}")
    print(f"    Acc: {res.accuracy} | Prec: {res.precision} | Recall: {res.recall}")

Deployment และ Monitoring

# === Deployment & Monitoring ===

# MLflow — Experiment Tracking
# import mlflow
# import mlflow.sklearn
#
# mlflow.set_experiment("churn_prediction")
# with mlflow.start_run(run_name="xgboost_v2"):
#     model = XGBClassifier(n_estimators=200, max_depth=6)
#     model.fit(X_train, y_train)
#     mlflow.log_params({"n_estimators": 200, "max_depth": 6})
#     mlflow.log_metrics({"auc": 0.94, "f1": 0.88})
#     mlflow.sklearn.log_model(model, "model")

# FastAPI — Model Serving
# from fastapi import FastAPI
# import joblib
#
# app = FastAPI()
# model = joblib.load("model.pkl")
#
# @app.post("/predict")
# async def predict(features: dict):
#     X = pd.DataFrame([features])
#     prediction = model.predict(X)[0]
#     probability = model.predict_proba(X)[0][1]
#     return {"prediction": int(prediction), "probability": float(probability)}

# Docker Deployment
# FROM python:3.11-slim
# WORKDIR /app
# COPY requirements.txt .
# RUN pip install -r requirements.txt
# COPY . .
# CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]

# Tooling commonly associated with each life-cycle phase, as
# (phase, tools) pairs kept in presentation order.
_PHASE_TOOLS = (
    ("Business Understanding", "Jira, Confluence, Miro"),
    ("Data Collection", "SQL, APIs, Web Scraping, Spark"),
    ("Data Preparation", "Pandas, NumPy, Polars, dbt"),
    ("EDA", "Jupyter, Matplotlib, Seaborn, Plotly"),
    ("Modeling", "Scikit-learn, XGBoost, PyTorch, TF"),
    ("Evaluation", "MLflow, Weights & Biases"),
    ("Deployment", "Docker, FastAPI, KServe, SageMaker"),
    ("Monitoring", "Evidently, WhyLabs, Grafana"),
)
lifecycle_tools = dict(_PHASE_TOOLS)

print("Data Science Toolkit:")
for stage, toolset in lifecycle_tools.items():
    print(f"  [{stage}]: {toolset}")

เคล็ดลับ

การบริหารจัดการฐานข้อมูลอย่างมืออาชีพ

Database Management ที่ดีเริ่มจากการออกแบบ Schema ที่เหมาะสม ใช้ Normalization ลด Data Redundancy สร้าง Index บน Column ที่ Query บ่อย วิเคราะห์ Query Plan เพื่อ Optimize Performance และทำ Regular Maintenance เช่น VACUUM สำหรับ PostgreSQL หรือ OPTIMIZE TABLE สำหรับ MySQL

เรื่อง High Availability ควรติดตั้ง Replication อย่างน้อย 1 Replica สำหรับ Read Scaling และ Disaster Recovery ใช้ Connection Pooling เช่น PgBouncer หรือ ProxySQL ลดภาระ Connection ที่เปิดพร้อมกัน และตั้ง Automated Failover ให้ระบบสลับไป Replica อัตโนมัติเมื่อ Primary ล่ม

Backup ต้องทำทั้ง Full Backup รายวัน และ Incremental Backup ทุก 1-4 ชั่วโมง เก็บ Binary Log หรือ WAL สำหรับ Point-in-Time Recovery ทดสอบ Restore เป็นประจำ และเก็บ Backup ไว้ Off-site ด้วยเสมอ

Data Science Life Cycle คืออะไร

ขั้นตอนทั้งหมด Business Data Collection Preparation Modeling Evaluation Deployment Monitor วงจรซ้ำ CRISP-DM

CRISP-DM คืออะไร

Framework 6 ขั้นตอน Business Understanding Data Understanding Preparation Modeling Evaluation Deployment วนกลับได้

ขั้นตอนไหนใช้เวลามากที่สุด

Data Preparation 60-80% Cleaning Missing Values Feature Engineering Transformation Encoding Scaling ข้อมูลดี Model ดี

เครื่องมือ Data Science มีอะไรบ้าง

Python Pandas NumPy Matplotlib Scikit-learn TensorFlow PyTorch Jupyter SQL Git MLflow Docker Airflow

สรุป

Data Science Life Cycle CRISP-DM Business Understanding Data Collection Preparation EDA Modeling Evaluation Deployment Monitoring Python Pandas Scikit-learn MLflow Docker FastAPI

📖 บทความที่เกี่ยวข้อง

- react life cycle คือ — อ่านบทความ →
- vue life cycle คือ — อ่านบทความ →
- angular life cycle คือ — อ่านบทความ →
- redis map data structure — อ่านบทความ →
- data analyst career path — อ่านบทความ →

📚 ดูบทความทั้งหมด →