介绍
DuckDB 是一个 嵌入式的、面向分析型查询(OLAP)的关系型数据库,你可以把它理解为:
✅ “数据库界的 SQLite + 数据仓库能力”
✅ 在应用程序内直接运行,不需要独立服务
✅ 读写 Parquet/CSV 非常强
✅ SQL 兼容性好,性能接近甚至超过部分专用分析系统
本地分析
数据科学
ETL/查询大型文件
内嵌在应用程序做快速数据分析
替代 pandas 加速数据处理
python示例
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import logging
from pathlib import Path
import tempfile
import duckdb
logging.basicConfig(
level=logging.DEBUG,
format="%(asctime)s - %(name)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s",
datefmt="%H:%M:%S",
)
logger = logging.getLogger(__name__)
temp_directory = tempfile.gettempdir()
base_path = str(Path(temp_directory) / "duckdb")
if not Path(base_path).exists():
Path(base_path).mkdir(parents=True, exist_ok=True)
conn = duckdb.connect()
df = duckdb.sql(
f"SELECT ip,count(ip) FROM '{str(Path(base_path)/'request_log.parquet')}' group by ip order by count(ip) desc"
)
logger.info(df)
# duckdb.install_extension("mysql")
# duckdb.load_extension("mysql")
HOST = ""
USER = ""
PORT = 3306
DATABASE = ""
PASSWORD = ""
ALIAS = "mysqldb"
# with duckdb.connect() as conn:
# conn.install_extension("mysql")
# conn.load_extension("mysql")
# conn.sql("CALL mysql_clear_cache();")
# conn.sql(
# f" ATTACH 'host={HOST} user={USER} port={PORT} database={DATABASE} password={PASSWORD}' AS {ALIAS} (TYPE mysql, READ_ONLY);"
# )
# conn.sql(f" use {ALIAS} ;")
# rows = conn.sql("SHOW TABLES").fetchall()
# logger.info(rows)
# for row in rows:
# table_name = row[0]
# logger.info(table_name)
# conn.sql(
# f"COPY {ALIAS}.{table_name} TO '{str(Path(base_path)/table_name)}.parquet';"
# )