# Source listing header: Python, 57 lines, 1.4 KiB
# /// script
# dependencies = [
#   "cloudflare",
# ]
# ///
"""Ask a Cloudflare Workers AI model which CSV columns hold name, CPF and email.

The script sends a system message, plus a user prompt embedding a small CSV
sample, to the ``@cf/meta/llama-3-8b-instruct`` model through the Cloudflare
SDK and prints whatever ``client.ai.run`` returns.

Required environment variables:
    CLOUDFLARE_ACCOUNT_ID: account the model is run under.
    CLOUDFLARE_API_TOKEN: API token used to authenticate the client.
"""

import os

from cloudflare import Cloudflare


def _require_env(name: str) -> str:
    """Return the value of environment variable *name*, or exit if it is unset/empty."""
    value = os.environ.get(name)
    if not value:
        raise SystemExit(f'Missing required environment variable: {name}')
    return value


# Credentials come from the environment so that secrets never live in source
# control; a token embedded in a file must be treated as leaked and rotated.
CLOUDFLARE_ACCOUNT_ID = _require_env('CLOUDFLARE_ACCOUNT_ID')
CLOUDFLARE_API_TOKEN = _require_env('CLOUDFLARE_API_TOKEN')


client = Cloudflare(api_token=CLOUDFLARE_API_TOKEN)

# System message: restricts the model to the three data types of interest.
assistant = """
You are a data analysis assistant specialized in identifying Brazilian
personal data from CSV files.

These CSV files may or may not include headers.

Your task is to analyze the content and identify only three possible
data types: 'name', 'cpf', and 'email'.

Ignore all other fields.
"""

# Headerless sample; the CPF column appears both unformatted and with
# punctuation.
# NOTE(review): these rows look like real personal data — consider replacing
# them with synthetic values before sharing this script.
csv_content = """
Sérgio Rafael de Siqueira,10,07879819908,osergiosiqueria@gmail.com,cipa
Tiago Maciel,12,086.790.049-01,tiago@somosbeta.com.br,nr 10
"""

# User message: embeds the sample and pins the expected answer format.
prompt = f"""
Here is a CSV sample:

{csv_content}

Your task is to:
- Detect which columns most likely contain "name", "cpf", or "email".
- Skip any category that is not present in the data.
- Return ONLY a valid Python list of tuples, like:
[('name', index), ('cpf', index), ('email', index)]
- Use the column index that most likely matches each data type,
based on frequency and data format.
- Don't include explanations, code, or any additional text.
"""

r = client.ai.run(
    model_name='@cf/meta/llama-3-8b-instruct',
    account_id=CLOUDFLARE_ACCOUNT_ID,
    messages=[
        {'role': 'system', 'content': assistant},
        {'role': 'user', 'content': prompt},
    ],
)

print(r)