
Benchmark of the Fastest Time-Series Alternatives: ClickHouse vs. InfluxDB vs. Postgresql vs. Parquet vs. MongoDB vs. DuckDB vs. Kdb+


How to use:

  • Rename the file "properties-model.ini" to "properties.ini"
  • Fill it in with your own credentials (a template is sketched below)
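
A minimal properties.ini template, with section and key names taken from the configparser reads in the setup cell below; the values are placeholders, not real endpoints or credentials:

[CLICKHOUSE]
user = <user>
key = <password>
url = <host>

[INFLUXDB]
user = <org>
key = <token>
url = <http://host:8086>
bucket = <bucket>

[POSTGRESQL]
user = <user>
key = <password>
url = <host>
database = <database>

[S3MINIO]
user = <access-key>
key = <secret-key>
url = <host:9000>
region = <region>

[MONGODB]
user = <user>
key = <password>
url = <host:27017>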

The goal of this work is to compare read and write speed on a medium-sized dataset (9 columns, 1,000,000 rows) across seven different backends: ClickHouse, InfluxDB, Postgresql, S3 Parquet (MinIO), MongoDB, DuckDB, and Kdb+. All benchmarks use the simple timing pattern sketched below.
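
Each write and read is timed with timeit.default_timer() around a single call to the relevant helper. A minimal sketch of that pattern (the benchmark helper is only illustrative; the notebook repeats the pattern inline for each backend):

import timeit

def benchmark(fn, *args):
    # time one call of a write or read helper and return the elapsed seconds
    start = timeit.default_timer()
    fn(*args)
    stop = timeit.default_timer()
    return stop - start

# example usage with the ClickHouse helpers defined later:
# write_seconds = benchmark(cHouseInsertDf, dbname, df)
# read_seconds = benchmark(cHouseQueryDf, dbname)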

Jump to: Imports, Functions, ClickHouse, InfluxDB, Postgresql, S3-Parquet, MongoDB, DuckDB, Kdb+, Table, Graph, Size

Imports

In [170]:
import configparser
import io
import timeit
from datetime import datetime

import duckdb
import influxdb_client
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pdmongo as pdm
from clickhouse_driver import Client
from dotenv import load_dotenv
from minio import Minio
from pymongo import MongoClient
from pytz import timezone
from qbstyles import mpl_style
from qpython import qconnection
from sqlalchemy import create_engine

load_dotenv()
Out[170]:
False
In [89]:
# Variables
dbname = "EURUSDtest"
collection = "finance"
In [50]:
arq = configparser.RawConfigParser()
arq.read("properties.ini")
ClickHouseUser = arq.get("CLICKHOUSE", "user")
ClickHouseKey = arq.get("CLICKHOUSE", "key")
ClickHouseUrl = arq.get("CLICKHOUSE", "url")

InfluxDBUser = arq.get("INFLUXDB", "user")
InfluxDBKey = arq.get("INFLUXDB", "key")
InfluxDBUrl = arq.get("INFLUXDB", "url")
InfluxDBBucket = arq.get("INFLUXDB", "bucket")

PostgresqlUser = arq.get("POSTGRESQL", "user")
PostgresqlKey = arq.get("POSTGRESQL", "key")
PostgresqlUrl = arq.get("POSTGRESQL", "url")
PostgresqlDB = arq.get("POSTGRESQL", "database")

S3MinioUser = arq.get("S3MINIO", "user")
S3MinioKey = arq.get("S3MINIO", "key")
S3MinioUrl = arq.get("S3MINIO", "url")
S3MinioRegion = arq.get("S3MINIO", "region")

MongoUser = arq.get("MONGODB", "user")
MongoKey = arq.get("MONGODB", "key")
MongoUrl = arq.get("MONGODB", "url")
In [51]:
# %%time
# Load Dataset
df = pd.read_csv("out.csv", index_col=0)
df.tail()
Out[51]:
id from at to open close min max volume
999995 7984748 2023-03-03 18:13:30 1677867225000000000 2023-03-03 18:13:45 1.062695 1.062635 1.062630 1.062700 64
999996 7984749 2023-03-03 18:13:45 1677867240000000000 2023-03-03 18:14:00 1.062645 1.062650 1.062625 1.062650 43
999997 7984750 2023-03-03 18:14:00 1677867255000000000 2023-03-03 18:14:15 1.062640 1.062625 1.062620 1.062665 47
999998 7984751 2023-03-03 18:14:15 1677867270000000000 2023-03-03 18:14:30 1.062625 1.062535 1.062535 1.062645 43
999999 7984752 2023-03-03 18:14:30 1677867285000000000 2023-03-03 18:14:45 1.062535 1.062520 1.062520 1.062580 59
In [ ]:
df["from"] = pd.to_datetime(df["from"], unit="s")
df["to"] = pd.to_datetime(df["to"], unit="s")
# Optional use when not transoformed yet
# Transform Datetime

Functions

-> Class

In [5]:
def timestamp2dataHora(x, timezone_="America/Sao_Paulo"):
    # Convert a Unix timestamp to a timezone-aware datetime
    d = datetime.fromtimestamp(x, tz=timezone(timezone_))
    return d

ClickHouse

In [62]:
# !! uses the ClickHouse native TCP driver
def cHouseConnect():
    client = Client(
        host=ClickHouseUrl,
        user=ClickHouseUser,
        password=ClickHouseKey,
        settings={"use_numpy": True},
    )
    return client


def cHouseDropTables(databasename):
    client = cHouseConnect()
    client.execute("DROP TABLE {}".format(databasename))
    client.disconnect()
    return "Database deleted"


# Create Tables in ClickHouse
# !! ADJUST COLUMN TYPES IF NEEDED !!
# ENGINE 'Memory' is lost when the server restarts, so MergeTree is used instead
def cHouseCreateDb(databasename):
    client = cHouseConnect()
    client.execute(
        "CREATE TABLE IF NOT EXISTS {} (id UInt32,"
        "from DateTime, at UInt64, to DateTime, open Float64,"
        "close Float64, min Float64, max  Float64, volume UInt32)"
        "ENGINE MergeTree ORDER BY to".format(databasename)
    )
    client.disconnect()
    return "Database created"


# Write dataframe to db
def cHouseInsertDf(dbName, dataframe):
    client = cHouseConnect()
    client.insert_dataframe("INSERT INTO {} VALUES".format(dbName), dataframe)
    client.disconnect()
    return " dataframe {} inserted in clickhouse database".format(dataframe)


def cHouseQueryDf(databaseName):
    client = cHouseConnect()
    dfQuery = client.query_dataframe("SELECT * FROM default.{}".format(databaseName))
    client.disconnect()
    return dfQuery


cHouseCreateDb(dbname)
Out[62]:
'Database created'
In [63]:
# Insert to db and benchmark time
start = timeit.default_timer()
cHouseInsertDf(dbname, df)
stop = timeit.default_timer()
cHouse_write_execution_time = stop - start
In [64]:
# read from db and benchmark time
start = timeit.default_timer()
dfCh = cHouseQueryDf(dbname)
stop = timeit.default_timer()
cHouse_read_execution_time = stop - start
In [57]:
dfCh.tail()
Out[57]:
id from at to open close min max volume
1999995 8083054 2023-03-28 18:54:00 1680029655003247667 2023-03-28 18:54:15 1.084015 1.084015 1.083995 1.084040 42
1999996 8083055 2023-03-28 18:54:15 1680029670000000000 2023-03-28 18:54:30 1.084020 1.083950 1.083940 1.084045 55
1999997 8083055 2023-03-28 18:54:15 1680029670000000000 2023-03-28 18:54:30 1.084020 1.083950 1.083940 1.084045 55
1999998 8083056 2023-03-28 18:54:30 1680029685000000000 2023-03-28 18:54:45 1.083965 1.083955 1.083945 1.083975 40
1999999 8083056 2023-03-28 18:54:30 1680029685000000000 2023-03-28 18:54:45 1.083965 1.083955 1.083945 1.083975 40
In [65]:
print(cHouse_read_execution_time)
7.571660671004793
In [66]:
print(cHouse_write_execution_time)
5.804937635999522
In [60]:
# Utils
# %%time
# dfCh = cHouseQueryDf(dbname)
# drop table
cHouseDropTables(dbname)
Out[60]:
'Database deleted'

InfluxDB

In [68]:
query = """
from(bucket: "EURUSDtest")
|> range(start:2023-03-03T18:14:30Z, stop: now())
|> filter(fn: (r) => r._measurement == "id")
|> pivot(rowKey:["_time"], columnKey: ["_field"], valueColumn: "_value")"""
In [69]:
def influxdbConnect():
    client = influxdb_client.InfluxDBClient(
        url=InfluxDBUrl, token=InfluxDBKey, org=InfluxDBUser
    )
    return client


def influxdbLoadCsv(csv="out.csv", dateColumns=["from", "to"], index="from"):
    # Read the CSV without an index column and parse the date columns
    df = pd.read_csv(csv, sep=",", index_col=False, parse_dates=dateColumns)
    # Set the 'from' field as the DataFrame index (other index columns could be tested)
    df.set_index(index, inplace=True)
    return df


def influxdbWriteCsv(dataFrame, bucket, measurement="id", tag="volume"):
    client = influxdbConnect()
    # write_options=SYNCHRONOUS
    with client.write_api() as writer:
        writer.write(
            bucket=bucket,
            record=dataFrame,
            data_frame_measurement_name=measurement,
            data_frame_tag_columns=[tag],
        )
    client.close()
    return 0


def influxdRead(org="librography", query=query):
    client = influxdbConnect()
    InfluxDf = client.query_api().query_data_frame(org=org, query=query)
    # display(InfluxDf.head())
    return InfluxDf
In [83]:
def delete_data(measurement="id", bucket=InfluxDBBucket, org="librography"):
    client = influxdbConnect()
    delete_api = client.delete_api()
    start = "1970-01-01T00:00:00Z"
    stop = "2024-10-30T00:00:00Z"
    delete_api.delete(start, stop, f'_measurement="{measurement}"', bucket, org)
In [ ]:
delete_data()
In [70]:
dafr = influxdbLoadCsv()
# dafr.head()
In [74]:
start = timeit.default_timer()
influxdbWriteCsv(dafr, InfluxDBBucket)
# dfIdw = cHouseQueryDf(dbname)
stop = timeit.default_timer()
influxdb_write_execution_time = stop - start
In [75]:
print(influxdb_write_execution_time)
87.55419652500132
In [78]:
# read from db and benchmark time
start = timeit.default_timer()
influxdRead()
stop = timeit.default_timer()
influxdb_read_execution_time = stop - start
In [79]:
print(influxdb_read_execution_time)
69.66793818100268

Postgresql

In [201]:
# Connect / Create Tables
def psqlConnect():
    engine = create_engine(
        "postgresql+psycopg2://{}:{}@{}:5432/{}".format(
            PostgresqlUser, PostgresqlKey, PostgresqlUrl, PostgresqlDB
        )
    )
    return engine


psqlConnect()
# test the function
df = pd.read_csv("out.csv", index_col=0)
df.tail()
Out[201]:
id from at to open close min max volume
999995 7984748 2023-03-03 18:13:30 1677867225000000000 2023-03-03 18:13:45 1.062695 1.062635 1.062630 1.062700 64
999996 7984749 2023-03-03 18:13:45 1677867240000000000 2023-03-03 18:14:00 1.062645 1.062650 1.062625 1.062650 43
999997 7984750 2023-03-03 18:14:00 1677867255000000000 2023-03-03 18:14:15 1.062640 1.062625 1.062620 1.062665 47
999998 7984751 2023-03-03 18:14:15 1677867270000000000 2023-03-03 18:14:30 1.062625 1.062535 1.062535 1.062645 43
999999 7984752 2023-03-03 18:14:30 1677867285000000000 2023-03-03 18:14:45 1.062535 1.062520 1.062520 1.062580 59
In [204]:
# Replace the table and bulk-load the DataFrame via COPY
def psqlCreateTables(databaseName):
    engine = psqlConnect()
    df.head(0).to_sql(databaseName, engine, if_exists="replace", index=False)
    # Write
    conn = engine.raw_connection()
    cur = conn.cursor()
    output = io.StringIO()
    df.to_csv(output, sep="\t", header=False, index=False)
    output.seek(0)
    cur.copy_from(output, databaseName)  # optionally null="" to read empty fields as NULL
    conn.commit()
    cur.close()
    conn.close()
    # disconnect()
    return 0


# function: read the table back into a DataFrame
def psqlReadTables():
    engine = psqlConnect()
    df = pd.read_sql_query('select * from "EURUSDtest"', con=engine)
    return df


# test the function
In [205]:
# Insert to db and benchmark time
start = timeit.default_timer()
psqlCreateTables(dbname)
stop = timeit.default_timer()
psql_write_execution_time = stop - start
In [206]:
print(psql_write_execution_time)
24.818096771996352
In [207]:
start = timeit.default_timer()
psqlReadTables()
stop = timeit.default_timer()
psql_read_execution_time = stop - start
In [208]:
print(psql_read_execution_time)
18.282347426000342
In [96]:
# df.head()

S3-Parquet

In [99]:
# try this without a function wrapper to see if it improves
# make sure the files in the git folder are on the SSD
def s3Connect():
    client = Minio(
        S3MinioUrl,
        secure=False,
        region=S3MinioRegion,
        access_key=S3MinioUser,
        secret_key=S3MinioKey,
    )
    return client


def s3CreateBucket(bucketName="data"):
    client = s3Connect()
    found = client.bucket_exists(bucketName)
    if not found:
        return client.make_bucket(bucketName)
    else:
        return "Bucket '{}' already exists".format(bucketName)


def s3uploadCsv():
    client = s3Connect()
    client.fput_object(
        "data",
        "data.parquet",
        "data/data.parquet",
    )
    return (
        "'data/data.parquet' is successfully uploaded as "
        "object 'data.parquet' to bucket 'data'."
    )
In [100]:
# Insert to db and benchmark time
df.to_parquet("data/data.parquet")
s3CreateBucket()
start = timeit.default_timer()
s3uploadCsv()
stop = timeit.default_timer()
s3_write_execution_time = stop - start
In [101]:
print(s3_write_execution_time)
4.160801975995128
In [102]:
start = timeit.default_timer()
pq = pd.read_parquet("data/data.parquet", engine="pyarrow")
stop = timeit.default_timer()
s3_read_execution_time = stop - start
In [106]:
pq.head()
Out[106]:
id from at to open close min max volume
0 7730801 2023-01-02 15:58:45 1672675140000000000 2023-01-02 15:59:00 1.065995 1.066035 1.065930 1.066070 57
1 7730802 2023-01-02 15:59:00 1672675155000000000 2023-01-02 15:59:15 1.066055 1.066085 1.066005 1.066115 52
2 7730803 2023-01-02 15:59:15 1672675170000000000 2023-01-02 15:59:30 1.066080 1.066025 1.066025 1.066110 57
3 7730804 2023-01-02 15:59:30 1672675185000000000 2023-01-02 15:59:45 1.065980 1.065985 1.065885 1.066045 64
4 7730805 2023-01-02 15:59:45 1672675200000000000 2023-01-02 16:00:00 1.065975 1.066055 1.065830 1.066055 50
In [103]:
print(s3_read_execution_time)
0.5090432809956837

MongoDB

In [107]:
port = "27017"


def mongoLoadCsv(csvfile):
    data = pd.read_csv(csvfile)
    return data


def mongoConnect():
    client = MongoClient(
        "mongodb://{}:{}@{}/EURUSDtest?retryWrites=true&w=majority".format(
            MongoUser, MongoKey, MongoUrl
        ),
        authSource="admin",
    )
    return client


def mongoWriteDict(dados, dbs, collection):
    client = mongoConnect()
    db = client[dbs]
    collection = db[collection]
    # data.reset_index(inplace=True)
    data_dict = dados.to_dict("records")
    # return data_dict
    collection.insert_many(data_dict)
    return 0


def mongoRead():
    # read the collection back into a DataFrame via pdmongo
    df = pdm.read_mongo(
        collection, [], "mongodb://192.168.1.162:27017/{}".format(dbname)
    )
    return df
In [108]:
data = mongoLoadCsv("out.csv")
data.head()
Out[108]:
Unnamed: 0 id from at to open close min max volume
0 0 7730801 2023-01-02 15:58:45 1672675140000000000 2023-01-02 15:59:00 1.065995 1.066035 1.065930 1.066070 57
1 1 7730802 2023-01-02 15:59:00 1672675155000000000 2023-01-02 15:59:15 1.066055 1.066085 1.066005 1.066115 52
2 2 7730803 2023-01-02 15:59:15 1672675170000000000 2023-01-02 15:59:30 1.066080 1.066025 1.066025 1.066110 57
3 3 7730804 2023-01-02 15:59:30 1672675185000000000 2023-01-02 15:59:45 1.065980 1.065985 1.065885 1.066045 64
4 4 7730805 2023-01-02 15:59:45 1672675200000000000 2023-01-02 16:00:00 1.065975 1.066055 1.065830 1.066055 50
In [109]:
start = timeit.default_timer()
dfCh = mongoWriteDict(data, dbname, "finance")
stop = timeit.default_timer()
mongo_write_execution_time = stop - start
In [110]:
print(mongo_write_execution_time)
54.566006569999445
In [111]:
start = timeit.default_timer()
dfMongo = mongoRead()
stop = timeit.default_timer()
mongo_read_execution_time = stop - start
In [112]:
print(mongo_read_execution_time)
34.66374846499821

DuckDB

In [119]:
# https://duckdb.org/2022/07/27/art-storage.html


def duckdbConnect():
    cursor = duckdb.connect()
    return cursor


def duckdbLoadCsv(csvFile="out.csv"):
    data = pd.read_csv(csvFile)
    return data


# write
def duckdbWrite(file, dataframe, tableName):
    conn = duckdb.connect(file)
    # conn.execute("DROP TABLE EURUSDtest")
    # conn.sql('CREATE TABLE EURUSDtest(i INTEGER)')
    conn.register("tempTable", dataframe)
    conn.execute("CREATE TABLE {} AS SELECT * FROM tempTable".format(tableName))
    conn.close()
    return 0


def duckdbRead(dbfile):
    conn = duckdb.connect(dbfile)
    df = conn.execute("SELECT * FROM {}".format(dbname)).fetchdf()
    conn.close()
    return df
In [120]:
data = duckdbLoadCsv()
In [ ]:
conn = duckdbConnect()
conn.execute("DROP TABLE EURUSDtest")
In [74]:
conn.register("EURUSDtest", data)
Out[74]:
<duckdb.DuckDBPyConnection at 0x7f925fb29b30>
In [84]:
conn = duckdb.connect("file.db")
# conn.execute("DROP TABLE EURUSDtest")
# conn.sql("CREATE TABLE EURUSDtest(i INTEGER)")
display(conn.execute("SHOW TABLES").df())
name
0 EURUSDtest
In [ ]:
# conn = duckdbConnect()
conn.execute("SELECT * FROM EURUSDtest")
conn.fetchall()
In [121]:
start = timeit.default_timer()
dfDuckdb = duckdbWrite("file.db", data, dbname)
stop = timeit.default_timer()
duckdb_write_execution_time = stop - start
In [122]:
print(duckdb_write_execution_time)
2.02947735300404
In [123]:
start = timeit.default_timer()
dfDuckdb = duckdbRead("file.db")
stop = timeit.default_timer()
duckdb_read_execution_time = stop - start
In [124]:
print(duckdb_read_execution_time)
0.5635835029970622

Kdb+

In [31]:
# NumPy >= 1.24 removed the np.bool alias; restore it because qpython still references it
np.bool = np.bool_
In [25]:
# functions
# cd ~ && q/l64/q -p 5001


def kdbConnect():
    q = qconnection.QConnection(host="localhost", port=5001)
    q.open()
    return q


def kdbLoadCsv(file="out.csv"):
    data = pd.read_csv(file)
    return data


def kdbWrite():
    q = kdbConnect()
    data = kdbLoadCsv()
    q.sendSync("{t::x}", data)
    q.sendSync("`:/home/sandman/q/tab1 set t")
    q.close()
    return 0


def kdbRead():
    q = kdbConnect()
    # df2 = q.sendSync("tab2: get `:/home/sandman/q/tab1")
    # df2 = q.sendSync("tab2")
    df = pd.DataFrame(q("t"))  # , pandas=True))
    # df3 = q.sendSync("select from t")
    # review all of these load variants
    q.close()
    return df
In [28]:
# write
start = timeit.default_timer()
dfKdb = kdbWrite()
stop = timeit.default_timer()
kdb_write_execution_time = stop - start
In [29]:
# print write time
print(kdb_write_execution_time)
2.8739770100000896
In [30]:
# read
start = timeit.default_timer()
dfKdb = kdbRead()
stop = timeit.default_timer()
kdb_read_execution_time = stop - start
In [ ]:
# print read time
print(kdb_read_execution_time)
4.153738381999574

Table

In [209]:
# duckdb_write_execution_time
s = " sec"
data = [
    [
        "{:.2f}".format(cHouse_write_execution_time) + s,
        "{:.2f}".format(cHouse_read_execution_time) + s,
        "{:.2f}".format(cHouse_write_execution_time + cHouse_read_execution_time) + s,
    ],
    [
        "{:.2f}".format(influxdb_write_execution_time) + s,
        "{:.2f}".format(influxdb_read_execution_time) + s,
        "{:.2f}".format(influxdb_write_execution_time + influxdb_read_execution_time)
        + s,
    ],
    [
        "{:.2f}".format(psql_write_execution_time) + s,
        "{:.2f}".format(psql_read_execution_time) + s,
        "{:.2f}".format(psql_write_execution_time + psql_read_execution_time) + s,
    ],
    [
        "{:.2f}".format(s3_write_execution_time) + s,
        "{:.2f}".format(s3_read_execution_time) + s,
        "{:.2f}".format(s3_write_execution_time + s3_read_execution_time) + s,
    ],
    [
        "{:.2f}".format(mongo_write_execution_time) + s,
        "{:.2f}".format(mongo_read_execution_time) + s,
        "{:.2f}".format(mongo_write_execution_time + mongo_read_execution_time) + s,
    ],
    [
        "{:.2f}".format(duckdb_write_execution_time) + s,
        "{:.2f}".format(duckdb_read_execution_time) + s,
        "{:.2f}".format(duckdb_write_execution_time + duckdb_read_execution_time) + s,
    ],
    [
        "{:.2f}".format(kdb_write_execution_time) + s,
        "{:.2f}".format(kdb_read_execution_time) + s,
        "{:.2f}".format(kdb_write_execution_time + kdb_read_execution_time) + s,
    ],
]

index_labels = [
    "Click House",
    "InfluxDb",
    "Postgresql",
    "S3",
    "MongoDb",
    "DuckDb",
    "Kdb+",
]
dff = pd.DataFrame(
    data, columns=["Write Time", "Read Time", "Total Time"], index=index_labels
)
dff
Out[209]:
Write Time Read Time Total Time
Click House 5.80 sec 7.57 sec 13.38 sec
InfluxDb 87.55 sec 69.67 sec 157.22 sec
Postgresql 24.82 sec 18.28 sec 43.10 sec
S3 4.16 sec 0.51 sec 4.67 sec
MongoDb 54.57 sec 34.66 sec 89.23 sec
DuckDb 2.03 sec 0.56 sec 2.59 sec
Kdb+ 2.87 sec 4.15 sec 7.03 sec

Graph

In [221]:
def plot(dark):
    mpl_style(dark)
    # plt.subplots(figsize=(16, 8))
    plt.figure().set_figwidth(14)
    x = np.arange(7)  # change here
    width = 0.40
    y1 = [
        cHouse_read_execution_time,
        influxdb_read_execution_time,
        psql_read_execution_time,
        s3_read_execution_time,
        mongo_read_execution_time,
        duckdb_read_execution_time,
        kdb_read_execution_time,
    ]  # change here
    y2 = [
        cHouse_write_execution_time,
        influxdb_write_execution_time,
        psql_write_execution_time,
        s3_write_execution_time,
        mongo_write_execution_time,
        duckdb_write_execution_time,
        kdb_write_execution_time,
    ]  # change here
    plt.bar(x - 0.2, y2, width)
    plt.bar(x + 0.2, y1, width)
    plt.xticks(
        x,
        [
            "Click House",
            "InfluxDb",
            "Postgresql",
            "S3 Parquet",
            "MongoDb",
            "DuckDb",
            "Kdb+",
        ],
    )
    plt.xlabel("Databases")
    plt.ylabel("Seconds")
    plt.legend(["Write", "Read"])  # ver
    plt.rcParams.update({"font.size": 18})
    plt.title("Fastest Databases  Comparation (Write/Read)")
    plt.show()


plot(dark=True)

Graph-(bests)

In [220]:
def plot(dark):
    mpl_style(dark)
    plt.figure(figsize=(15, 10))
    x = np.arange(4)  # change here
    width = 0.40
    y1 = [
        cHouse_read_execution_time,
        s3_read_execution_time,
        duckdb_read_execution_time,
        kdb_read_execution_time,
    ]  # change here
    y2 = [
        cHouse_write_execution_time,
        s3_write_execution_time,
        duckdb_write_execution_time,
        kdb_write_execution_time,
    ]  # change here
    plt.bar(x - 0.2, y2, width)
    plt.bar(x + 0.2, y1, width)
    plt.xticks(
        x,
        [
            "Click House",
            "S3 Parquet",
            "DuckDb",
            "Kdb+",
        ],
    )
    plt.xlabel("Databases")
    plt.ylabel("Seconds")
    plt.legend(["Write", "Read"])  # ver
    plt.rcParams.update({"font.size": 8})
    plt.style.use("dark_background")
    plt.title("Top 4 fastest Databases on Benchmark -  Graph Comparation")
    plt.show()


plot(dark=True)

Size

In [225]:
originalCsv = 106.8
clickHouse_size_on_disk = 20
influxDb_size_on_disk = 84
postgresql_size_on_disk = 128
S3_size_size_on_disk = 31.8
mongoDb_size_on_disk = 174.5
DuckDb_size_on_disk = 36.5
Kdb_size_on_disk = 99.2


def plot(dark):
    mpl_style(dark)
    plt.figure(figsize=(12, 10))
    x = np.arange(8)  # change here
    width = 0.50
    y1 = [
        originalCsv,
        clickHouse_size_on_disk,
        influxDb_size_on_disk,
        postgresql_size_on_disk,
        S3_size_size_on_disk,
        mongoDb_size_on_disk,
        DuckDb_size_on_disk,
        Kdb_size_on_disk,
    ]  # change here

    # plt.bar(x - 0.2, y2, width)
    plt.bar(x + 0.0, y1, width, color=(0.2, 0.4, 0.6, 0.6))
    plt.xticks(
        x,
        [
            "Original CSV",
            "Click House",
            "InfluxDb",
            "Postgresql",
            "S3 Parquet",
            "MongoDb",
            "DuckDb",
            "Kdb+",
        ],
    )
    plt.xlabel("Databases")
    plt.ylabel("Megabytes")
    plt.legend(["Size Ocupied on Disk In MB"])  # ver
    plt.rcParams.update({"font.size": 8})
    plt.style.use("dark_background")
    plt.title("Size On Disk Occupied with the Same Archive on different Databases")
    plt.show()


plot(dark=True)