move seeds dir

This commit is contained in:
2025-04-03 21:08:59 -03:00
parent 34085af4a8
commit 1358088a48
41 changed files with 1950 additions and 108 deletions

View File

@@ -1,13 +1,14 @@
from typing import Any, Generator
import layercake.jsonl as jsonl
import boto3
import jsonlines
from elasticsearch import Elasticsearch
from layercake.dynamodb import deserialize
from tqdm import tqdm
from boto3clients import dynamodb_client
elastic_client = Elasticsearch('http://127.0.0.1:9200')
dynamodb_client = boto3.client('dynamodb', endpoint_url='http://127.0.0.1:8000')
jsonl_files = (
'test-orders.jsonl',
'test-users.jsonl',
@@ -51,6 +52,7 @@ class Elastic:
def index_item(
self,
/,
id: str,
index: str,
doc: dict,
@@ -58,7 +60,7 @@ class Elastic:
return self.client.index(
index=index,
id=id,
document=_serialize_python_type(doc),
document=_serialize_to_basic_types(doc),
)
def delete_index(self, index: str) -> bool:
@@ -70,15 +72,15 @@ class Elastic:
return True
def _serialize_python_type(value: Any) -> Any:
def _serialize_to_basic_types(value: Any) -> Any:
if isinstance(value, dict):
return {k: _serialize_python_type(v) for k, v in value.items()}
return {k: _serialize_to_basic_types(v) for k, v in value.items()}
if isinstance(value, set):
return list(value)
if isinstance(value, list):
return [_serialize_python_type(v) for v in value]
return [_serialize_to_basic_types(v) for v in value]
return value
@@ -86,18 +88,20 @@ def _serialize_python_type(value: Any) -> Any:
if __name__ == '__main__':
elastic = Elastic(elastic_client)
# Populate DynamoDB tables with data from JSONL files
for file in tqdm(jsonl_files, desc='Processing files'):
with jsonl.readlines(f'seeds/{file}') as lines:
with jsonlines.open(f'seeds/{file}') as lines:
table_name = file.removesuffix('.jsonl')
for line in tqdm(lines, desc=f'Processing lines in {file}'):
put_item(line, table_name, dynamodb_client)
# Scan DynamoDB tables and index the data into Elasticsearch
for file in tqdm(jsonl_files, desc='Scanning tables'):
table_name = file.removesuffix('.jsonl')
elastic.delete_index(table_name)
for record in tqdm(
for doc in tqdm(
scan_table(
table_name,
dynamodb_client,
@@ -106,4 +110,4 @@ if __name__ == '__main__':
),
desc=f'Indexing {table_name}',
):
elastic.index_item(id=record['id'], index=table_name, doc=record)
elastic.index_item(id=doc['id'], index=table_name, doc=doc)