add batch

This commit is contained in:
2025-05-21 15:48:59 -03:00
parent 7f4fec6e1e
commit 249116cc76
20 changed files with 786 additions and 627 deletions

View File

@@ -1,20 +0,0 @@
import boto3
from aws_lambda_powertools.utilities.data_classes import (
EventBridgeEvent,
event_source,
)
from aws_lambda_powertools.utilities.typing import LambdaContext
from csv_utils import byte_ranges
# Chunk size handed to csv_utils.byte_ranges as its second argument.
# NOTE(review): the unit (lines vs. bytes) is defined by byte_ranges, not here — confirm.
CHUNK_SIZE = 50
# Created once at module import and kept as a module-level S3 client.
s3_client = boto3.client('s3')
@event_source(data_class=EventBridgeEvent)
def lambda_handler(event: EventBridgeEvent, context: LambdaContext) -> bool:
    """Handle an EventBridge event that carries the S3 URI of a CSV file.

    Reads ``detail['new_image']['csv_s3uri']`` from the event and passes it,
    together with ``CHUNK_SIZE``, to ``byte_ranges``. The computed ranges are
    not used any further by this handler.

    Returns:
        Always ``True``.
    """
    csv_uri = event.detail['new_image']['csv_s3uri']
    # The result is intentionally discarded; only the call itself is kept.
    byte_ranges(csv_uri, CHUNK_SIZE)
    return True

View File

@@ -1,47 +0,0 @@
import csv
from io import StringIO
import boto3
from aws_lambda_powertools.utilities.data_classes import (
EventBridgeEvent,
event_source,
)
from aws_lambda_powertools.utilities.typing import LambdaContext
from csv_utils import byte_ranges
# Chunk size handed to csv_utils.byte_ranges as its second argument.
# NOTE(review): the unit (lines vs. bytes) is defined by byte_ranges, not here — confirm.
CHUNK_SIZE = 50
# Created once at module import; passed to get_object_range by the handler.
s3_client = boto3.client('s3')
@event_source(data_class=EventBridgeEvent)
def lambda_handler(event: EventBridgeEvent, context: LambdaContext) -> bool:
    """Print the CSV rows found in the last byte range of the referenced file.

    The S3 URI is read from ``detail['new_image']['csv_s3uri']``. Only the
    final pair produced by ``byte_ranges`` is fetched, parsed with
    ``csv.reader`` and printed row by row.

    Returns:
        Always ``True``.
    """
    s3_uri = event.detail['new_image']['csv_s3uri']
    # Star-unpacking keeps only the last pair and raises ValueError when
    # byte_ranges yields nothing.
    *_, last_range = byte_ranges(s3_uri, CHUNK_SIZE)
    first_byte = last_range[0]
    final_byte = last_range[1]
    body = get_object_range(s3_uri, first_byte, final_byte, s3_client=s3_client)
    for row in csv.reader(body):
        print(row)
    return True
def get_object_range(
    s3_uri: str,
    start_byte: int,
    end_byte: int = -1,
    *,
    s3_client,
) -> StringIO:
    """Fetch a byte range of an S3 object and return it as decoded text.

    Args:
        s3_uri: Object location in the form ``s3://bucket/key``.
        start_byte: Offset of the first byte to read.
        end_byte: Offset of the last byte to read (inclusive, as in an HTTP
            ``Range`` header). A negative value (the default ``-1``) or
            ``None`` reads from ``start_byte`` to the end of the object.
        s3_client: Object exposing ``get_object(Bucket=, Key=, Range=)``,
            e.g. a boto3 S3 client.

    Returns:
        A ``StringIO`` holding the UTF-8 decoded bytes of the requested range.

    Raises:
        ValueError: If ``s3_uri`` has no ``/`` separating bucket and key.
    """
    # Strip the scheme only when it is the prefix; a plain replace() would
    # also eat an 's3://' occurring inside the key.
    bucket, sep, key = s3_uri.removeprefix('s3://').partition('/')
    if not sep:
        raise ValueError(f'S3 URI has no object key: {s3_uri!r}')

    # The sentinel is "negative", not "falsy": -1 must yield an open-ended
    # range (not the malformed 'bytes=N--1'), and 0 is a valid last byte.
    if end_byte is None or end_byte < 0:
        range_ = f'bytes={start_byte}-'
    else:
        range_ = f'bytes={start_byte}-{end_byte}'

    response = s3_client.get_object(
        Bucket=bucket,
        Key=key,
        Range=range_,
    )
    return StringIO(response['Body'].read().decode('utf-8'))

View File

@@ -8,7 +8,6 @@ dependencies = ["layercake"]
[dependency-groups]
dev = [
"duckdb>=1.2.2",
"pytest>=8.3.4",
"pytest-cov>=6.0.0",
"ruff>=0.9.1",

View File

@@ -1,13 +1,13 @@
import events.csv_chunks as app
import events.chunk_csv as app
def test_csv_chunks(lambda_context):
def test_chunk_csv(lambda_context):
event = {
'detail': {
'new_image': {
'csv_s3uri': 's3://saladeaula.digital/samples/large_users.csv',
's3uri': 's3://saladeaula.digital/samples/large_users.csv',
},
},
}
app.lambda_handler(event, lambda_context)
app.lambda_handler(event, lambda_context) # type: ignore

View File

@@ -12,6 +12,8 @@ def test_byte_ranges():
*_, pair = ranges
start_byte, end_byte = pair
assert ranges == [(0, 808), (809, 1655), (1656, 2303)]
expected = """,RICARDO GALLES BONET,ricardo.bonet@fanucamerica.com,424.430.528-93,NR-10 (RECICLAGEM)
,RULIO SIEFERT SERA,rulio.sera@fanucamerica.com,063.916.859-08,NR-10 (RECICLAGEM)
,MACIEL FERREIRA BOMFIM,maciel.bomfim@fanucamerica.com,334.547.088-85,NR-10 (RECICLAGEM)

18
batch-jobs/uv.lock generated
View File

@@ -90,7 +90,6 @@ dependencies = [
[package.dev-dependencies]
dev = [
{ name = "duckdb" },
{ name = "pytest" },
{ name = "pytest-cov" },
{ name = "ruff" },
@@ -101,7 +100,6 @@ requires-dist = [{ name = "layercake", directory = "../layercake" }]
[package.metadata.requires-dev]
dev = [
{ name = "duckdb", specifier = ">=1.2.2" },
{ name = "pytest", specifier = ">=8.3.4" },
{ name = "pytest-cov", specifier = ">=6.0.0" },
{ name = "ruff", specifier = ">=0.9.1" },
@@ -343,22 +341,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/68/1b/e0a87d256e40e8c888847551b20a017a6b98139178505dc7ffb96f04e954/dnspython-2.7.0-py3-none-any.whl", hash = "sha256:b4c34b7d10b51bcc3a5071e7b8dee77939f1e878477eeecc965e9835f63c6c86", size = 313632, upload-time = "2024-10-05T20:14:57.687Z" },
]
[[package]]
name = "duckdb"
version = "1.2.2"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/28/b8/0f86278684fb7a1fac7c0c869fc6d68ed005cdc91c963eb4373e0551bc0a/duckdb-1.2.2.tar.gz", hash = "sha256:1e53555dece49201df08645dbfa4510c86440339889667702f936b7d28d39e43", size = 11595514, upload-time = "2025-04-08T08:47:20.234Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/41/31/5e2f68cbd000137f6ed52092ad83a8e9c09eca70c59e0b4c5eb679709997/duckdb-1.2.2-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:fb9a2c77236fae079185a990434cb9d8432902488ba990235c702fc2692d2dcd", size = 15272507, upload-time = "2025-04-08T08:46:15.605Z" },
{ url = "https://files.pythonhosted.org/packages/d2/15/aa9078fc897e744e077c0c1510e34db4c809de1d51ddb5cb62e1f9c61312/duckdb-1.2.2-cp313-cp313-macosx_12_0_universal2.whl", hash = "sha256:d8bb89e580cb9a3aaf42e4555bf265d3db9446abfb118e32150e1a5dfa4b5b15", size = 31965548, upload-time = "2025-04-08T08:46:18.593Z" },
{ url = "https://files.pythonhosted.org/packages/9f/28/943773d44fd97055c59b58dde9182733661c2b6e3b3549f15dc26b2e139e/duckdb-1.2.2-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:88916d7f0532dc926bed84b50408c00dcbe6d2097d0de93c3ff647d8d57b4f83", size = 16800600, upload-time = "2025-04-08T08:46:21.51Z" },
{ url = "https://files.pythonhosted.org/packages/39/51/2caf01e7791e490290798c8c155d4d702ed61d69e815915b42e72b3e7473/duckdb-1.2.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:30bece4f58a6c7bb0944a02dd1dc6de435a9daf8668fa31a9fe3a9923b20bd65", size = 18735886, upload-time = "2025-04-08T08:46:24.26Z" },
{ url = "https://files.pythonhosted.org/packages/87/0c/48ae1d485725af3a452303af409a9022d751ecab260cb9ca2f8c9fb670bc/duckdb-1.2.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2bd2c6373b8b54474724c2119f6939c4568c428e1d0be5bcb1f4e3d7f1b7c8bb", size = 20210481, upload-time = "2025-04-08T08:46:26.717Z" },
{ url = "https://files.pythonhosted.org/packages/69/c7/95fcd7bde0f754ea6700208d36b845379cbd2b28779c0eff4dd4a7396369/duckdb-1.2.2-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:72f688a8b0df7030c5a28ca6072817c1f090979e08d28ee5912dee37c26a7d0c", size = 18756619, upload-time = "2025-04-08T08:46:29.035Z" },
{ url = "https://files.pythonhosted.org/packages/ad/1b/c9eab9e84d4a70dd5f7e2a93dd6e9d7b4d868d3df755cd58b572d82d6c5d/duckdb-1.2.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:26e9c349f56f7c99341b5c79bbaff5ba12a5414af0261e79bf1a6a2693f152f6", size = 22294667, upload-time = "2025-04-08T08:46:31.295Z" },
{ url = "https://files.pythonhosted.org/packages/3f/3d/ce68db53084746a4a62695a4cb064e44ce04123f8582bb3afbf6ee944e16/duckdb-1.2.2-cp313-cp313-win_amd64.whl", hash = "sha256:e1aec7102670e59d83512cf47d32a6c77a79df9df0294c5e4d16b6259851e2e9", size = 11370206, upload-time = "2025-04-08T08:46:33.472Z" },
]
[[package]]
name = "elastic-transport"
version = "8.17.1"