{
"cells": [
{
"cell_type": "markdown",
"id": "7f1f5a4a-ae34-4a27-9efa-399edc0e384a",
"metadata": {
"tags": []
},
"source": [
"## Benchmark: ClickHouse Vs. InfluxDB Vs. Postgresql Vs. Parquet \n",
"\n",
"-----\n",
"\n",
"#### How to use:\n",
"* Rename the file \"properties-model.ini\" to \"properties.ini\"\n",
"* Fill with your own credentials\n",
"----\n",
"\n",
"The proposal of this work is to compare the speed in read/writing a midle level of data ( a dataset with 9 columns and 50.000 lines) to four diferent databases:\n",
"* ClickHouse\n",
"* InfluxDB\n",
"* Postgresql\n",
"* Parquet (in a S3 Minio Storage)
\n",
"ToDo:
\n",
"* DuckDB with Polars\n",
"* MongoDB\n",
"* Kdb+\n",
"\n",
" \n",
"Deve-se relevar:\n",
"é uma \"cold-storage\" ou \"frezze-storage\"?
\n",
"influxdb: alta leitura e possui a vantagem da indexaçõa para vizualização de dados em gráficos.\n",
"\n",
"notas: \n",
"* comparar tamanho do csv com parquet"
]
},
{
"cell_type": "markdown",
"id": "6bb26ce7-1e84-4665-accd-916bb977f95d",
"metadata": {
"tags": []
},
"source": [
"### Imports "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ab6c6c81-6ac1-4668-a79b-a9a0341fb35a",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import configparser\n",
"import time\n",
"import timeit\n",
"from datetime import datetime\n",
"\n",
"import duckdb\n",
"import influxdb_client\n",
"import pandas as pd\n",
"\n",
"# import pymongo\n",
"from clickhouse_driver import Client\n",
"from dotenv import load_dotenv\n",
"from minio import Minio\n",
"from pymongo import MongoClient\n",
"from pytz import timezone\n",
"from sqlalchemy import create_engine\n",
"\n",
"load_dotenv()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "55c3cd57-0996-4723-beb5-8f3196c96009",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Variables\n",
"dbname = \"EURUSDtest\""
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "968403e3-2e5e-4834-b969-be4600e2963a",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"arq = configparser.RawConfigParser()\n",
"arq.read(\"properties.ini\")\n",
"ClickHouseUser = arq.get(\"CLICKHOUSE\", \"user\")\n",
"ClickHouseKey = arq.get(\"CLICKHOUSE\", \"key\")\n",
"ClickHouseUrl = arq.get(\"CLICKHOUSE\", \"url\")\n",
"\n",
"InfluxDBUser = arq.get(\"INFLUXDB\", \"user\")\n",
"InfluxDBKey = arq.get(\"INFLUXDB\", \"key\")\n",
"InfluxDBUrl = arq.get(\"INFLUXDB\", \"url\")\n",
"InfluxDBBucket = arq.get(\"INFLUXDB\", \"bucket\")\n",
"\n",
"PostgresqlUser = arq.get(\"POSTGRESQL\", \"user\")\n",
"PostgresqlKey = arq.get(\"POSTGRESQL\", \"key\")\n",
"PostgresqlUrl = arq.get(\"POSTGRESQL\", \"url\")\n",
"PostgresqlDB = arq.get(\"POSTGRESQL\", \"database\")\n",
"\n",
"S3MinioUser = arq.get(\"S3MINIO\", \"user\")\n",
"S3MinioKey = arq.get(\"S3MINIO\", \"key\")\n",
"S3MinioUrl = arq.get(\"S3MINIO\", \"url\")\n",
"S3MinioRegion = arq.get(\"S3MINIO\", \"region\")\n",
"\n",
"MongoUser = arq.get(\"MONGODB\", \"user\")\n",
"MongoKey = arq.get(\"MONGODB\", \"key\")\n",
"MongoUrl = arq.get(\"MONGODB\", \"url\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3634a4ec-04c2-4f1e-8659-5d22eb17a254",
"metadata": {},
"outputs": [],
"source": [
"# %%time\n",
"# Load Dataset\n",
"df = pd.read_csv(\"out.csv\", index_col=0)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "76199f91-31d6-416b-9f15-5d435b3792c9",
"metadata": {},
"outputs": [],
"source": [
"df[\"from\"] = pd.to_datetime(df[\"from\"], unit=\"s\")\n",
"df[\"to\"] = pd.to_datetime(df[\"to\"], unit=\"s\")\n",
"# Optional use when not transoformed yet\n",
"# Transform Datetime"
]
},
{
"cell_type": "markdown",
"id": "274cc026-2f48-4e38-b80f-b1a9ff982060",
"metadata": {
"tags": []
},
"source": [
"#### Funçoes\n",
"\n",
"-> Class"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "27de1ec8-4de1-440a-b555-b4a46c5ef7ce",
"metadata": {},
"outputs": [],
"source": [
"def timestamp2dataHora(x, timezone_=\"America/Sao_Paulo\"):\n",
" d = datetime.fromtimestamp(x, tz=timezone(timezone_))\n",
" return d"
]
},
{
"cell_type": "markdown",
"id": "4a8d5703-9bc9-4d38-83ff-457159304d58",
"metadata": {
"tags": []
},
"source": [
"### ClickHouse"
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "c3202bbb-2655-45b2-b166-9f45a3ef854c",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"'Database created'"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# !! O client oficial usa um driver http, nesse exemplo vamos usar a biblioteca\n",
"# de terceirtos clickhouse_driver recomendada, por sua vez que usa tcp.\n",
"def cHouseConnect():\n",
" client = Client(\n",
" host=ClickHouseUrl,\n",
" user=ClickHouseUser,\n",
" password=ClickHouseKey,\n",
" settings={\"use_numpy\": True},\n",
" )\n",
" return client\n",
"\n",
"\n",
"# Create Tables in ClickHouse\n",
"# !! ALTERAR TIPOS !!\n",
"# ENGINE: 'Memory' desaparece quando server é reiniciado\n",
"def cHouseCreateDb(databasename):\n",
" client = cHouseConnect()\n",
" client.execute(\n",
" \"CREATE TABLE IF NOT EXISTS {} (id UInt32,\"\n",
" \"from DateTime, at UInt64, to DateTime, open Float64,\"\n",
" \"close Float64, min Float64, max Float64, volume UInt32)\"\n",
" \"ENGINE MergeTree ORDER BY to\".format(databasename)\n",
" )\n",
" client.disconnect()\n",
" return \"Database created\"\n",
"\n",
"\n",
"# Write dataframe to db\n",
"def cHouseInsertDf(dbName, dataframe):\n",
" client = cHouseConnect()\n",
" client.insert_dataframe(\"INSERT INTO {} VALUES\".format(dbName), dataframe)\n",
" client.disconnect()\n",
" return \" dataframe {} inserted in clickhouse database\".format(dataframe)\n",
"\n",
"\n",
"def cHouseQueryDf(databaseName):\n",
" client = cHouseConnect()\n",
" dfQuery = client.query_dataframe(\n",
" \"SELECT * FROM default.{}\".format(databaseName)\n",
" ) # LIMIT 10000\n",
" client.disconnect()\n",
" return dfQuery\n",
"\n",
"\n",
"cHouseCreateDb(dbname)"
]
},
{
"cell_type": "code",
"execution_count": 47,
"id": "cc4865b3-a1bc-4a35-9624-15334754b3a1",
"metadata": {},
"outputs": [],
"source": [
"# Insert to db and benchmark time\n",
"start = timeit.default_timer()\n",
"cHouseInsertDf(dbname, df)\n",
"stop = timeit.default_timer()\n",
"cHouse_write_execution_time = stop - start"
]
},
{
"cell_type": "code",
"execution_count": 48,
"id": "1fac82c1-2d04-44ef-893a-dc13b755e6d8",
"metadata": {},
"outputs": [],
"source": [
"# read from db and benchmark time\n",
"start = timeit.default_timer()\n",
"dfCh = cHouseQueryDf(dbname)\n",
"stop = timeit.default_timer()\n",
"cHouse_read_execution_time = stop - start"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "597ae7bd-2eea-44d7-b379-f0eb7e745c15",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"
| \n", " | id | \n", "from | \n", "at | \n", "to | \n", "open | \n", "close | \n", "min | \n", "max | \n", "volume | \n", "
|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "7730801 | \n", "2023-01-02 15:58:45 | \n", "1672675140000000000 | \n", "2023-01-02 15:59:00 | \n", "1.065995 | \n", "1.066035 | \n", "1.065930 | \n", "1.066070 | \n", "57 | \n", "
| 1 | \n", "7730801 | \n", "2023-01-02 15:58:45 | \n", "1672675140000000000 | \n", "2023-01-02 15:59:00 | \n", "1.065995 | \n", "1.066035 | \n", "1.065930 | \n", "1.066070 | \n", "57 | \n", "
| 2 | \n", "7730802 | \n", "2023-01-02 15:59:00 | \n", "1672675155000000000 | \n", "2023-01-02 15:59:15 | \n", "1.066055 | \n", "1.066085 | \n", "1.066005 | \n", "1.066115 | \n", "52 | \n", "
| 3 | \n", "7730802 | \n", "2023-01-02 15:59:00 | \n", "1672675155000000000 | \n", "2023-01-02 15:59:15 | \n", "1.066055 | \n", "1.066085 | \n", "1.066005 | \n", "1.066115 | \n", "52 | \n", "
| 4 | \n", "7730803 | \n", "2023-01-02 15:59:15 | \n", "1672675170000000000 | \n", "2023-01-02 15:59:30 | \n", "1.066080 | \n", "1.066025 | \n", "1.066025 | \n", "1.066110 | \n", "57 | \n", "