Embedding serialization benchmark
👍
2
import json
import base64
import msgpack
import numpy as np
# Benchmark fixture: one random float32 vector with EMB_DIM components.
EMB_DIM = 1536
emb = np.random.rand(EMB_DIM).astype(np.float32)
# Plain-Python copy of the vector, used by the "JSON list of floats" variant.
emb_list = emb.tolist()

# Sanity check: ndarray -> raw bytes -> base64 text -> JSON -> float32 ndarray.
b64_payload = json.dumps({'emb': base64.b64encode(emb.tobytes()).decode('utf-8')})
resp_emb = np.frombuffer(base64.b64decode(json.loads(b64_payload)['emb']), dtype=np.float32)
# np.array_equal compares shape and every element in one vectorized call,
# instead of summing element-wise booleans in Python against a repeated
# hard-coded length.
assert np.array_equal(emb, resp_emb)

# Sanity check: list of floats -> JSON -> list of floats.
resp_emb_list = json.loads(json.dumps({'emb': emb_list}))['emb']
assert resp_emb_list == emb_list
# Sanity check: ndarray -> raw bytes -> msgpack -> raw bytes -> float32 ndarray.
msg_emb = np.frombuffer(msgpack.unpackb(msgpack.packb({'emb': emb.tobytes()}))['emb'], dtype=np.float32)
# np.array_equal compares shape and every element in one vectorized call,
# instead of summing element-wise booleans in Python against a hard-coded length.
assert np.array_equal(emb, msg_emb)
# Size table: encoded length of the same vector under each wire format.
# NOTE(review): the "msgapck" label is misspelled; it is left as-is so the
# printed table stays byte-identical.
size_rows = (
    ("np bytes:", emb.tobytes()),
    ("msgapck:", msgpack.packb({'emb': emb.tobytes()})),
    ("json+base64:", json.dumps({'emb': base64.b64encode(emb.tobytes()).decode()})),
)
print("type\t\tsize")
for label, payload in size_rows:
    print(f"{label}\t{len(payload)}")
# Round-trip timings via IPython's %timeit magic; get_ipython() only exists
# when the script is executed by IPython.
# NOTE(review): the three statements do not measure equivalent work. The first
# starts from the prebuilt emb_list and ends with a Python list, while the
# other two start from the ndarray (tobytes) and end with np.frombuffer.
timed_statements = (
    # JSON with a plain list of floats.
    "json.loads(json.dumps({'emb': emb_list}))['emb']",
    # JSON with the raw float32 bytes base64-encoded.
    "np.frombuffer(base64.b64decode(json.loads(json.dumps({'emb': base64.b64encode(emb.tobytes()).decode('utf-8')}))['emb']), dtype=np.float32)",
    # msgpack with the raw float32 bytes.
    "np.frombuffer(msgpack.unpackb(msgpack.packb({'emb': emb.tobytes()}))['emb'], dtype=np.float32)",
)
ipython_shell = get_ipython()
for statement in timed_statements:
    ipython_shell.run_line_magic('timeit', statement)
Run it with `ipython bench.py`:
type size
np bytes: 6144
msgapck: 6152
json+base64: 8203
489 µs ± 4.68 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
22.8 µs ± 52 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
782 ns ± 1.04 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)