WIP
Cool package: instructor
Manual indexing was beaten out of me at my first job, so when I saw this online I had to check it out.
It looks like a cool package, too.
#| eval: false
from dotenv import load_dotenv
# Load settings from the .env file one directory above this document.
# NOTE(review): presumably this is where the OpenAI API key lives - confirm.
load_dotenv('../.env')
#| eval: false
from io import StringIO
from typing import Annotated, Any, Iterable

import instructor
import pandas as pd
from openai import OpenAI
from pydantic import (
    BaseModel,
    BeforeValidator,
    PlainSerializer,
    InstanceOf,
    WithJsonSchema,
)
# Patch the OpenAI client with instructor so that `chat.completions.create`
# accepts the `response_model` argument used by `extract_table`.
client = instructor.patch(OpenAI(), mode=instructor.function_calls.Mode.MD_JSON)
def to_markdown(df: pd.DataFrame) -> str:
    """Serialize *df* to its Markdown table representation.

    Thin wrapper around ``DataFrame.to_markdown`` so it can be handed to
    pydantic as a plain serializer callable.
    """
    return df.to_markdown()
def md_to_df(data: Any) -> Any:
    """Parse a Markdown table string into a DataFrame.

    Args:
        data: Either a pipe-delimited Markdown table (as a string) or any
            other value.

    Returns:
        A ``pandas.DataFrame`` when *data* is a string; otherwise *data*
        itself, unchanged, so already-built DataFrames pass straight through.
    """
    if not isinstance(data, str):
        return data

    def _strip_cell(cell: Any) -> Any:
        # Only strings carry padding; NaN and other non-string cells are
        # left alone instead of raising AttributeError on `.strip()`.
        return cell.strip() if isinstance(cell, str) else cell

    return (
        # With leading/trailing pipes, field 0 is empty, so column 1 is the
        # first visible column of the table and becomes the index.
        pd.read_csv(StringIO(data), sep="|", index_col=1)
        # Drop the all-empty columns produced by the outer pipes.
        .dropna(axis=1, how="all")
        # Skip the first parsed row: the Markdown "|---|---|" separator.
        .iloc[1:]
        # Get rid of whitespaces around every cell. Series.map per column is
        # used so this does not depend on DataFrame.map (pandas >= 2.1).
        .apply(lambda column: column.map(_strip_cell))
    )
# A DataFrame field type for pydantic models:
#   - validated as a pandas DataFrame instance,
#   - string input is first converted with `md_to_df`,
#   - serialized back to Markdown with `to_markdown`,
#   - advertised in the JSON schema as a plain string.
MarkdownDataFrame = Annotated[
    InstanceOf[pd.DataFrame],
    BeforeValidator(md_to_df),
    PlainSerializer(to_markdown),
    WithJsonSchema(
        {
            "type": "string",
            "description": """
The markdown representation of the table,
each one should be tidy, do not try to join tables
that should be seperate""",
        }
    ),
]
class Table(BaseModel):
    """One extracted table: a caption plus its contents."""

    # Caption text for the table.
    caption: str
    # Table contents; Markdown strings are converted by MarkdownDataFrame.
    dataframe: MarkdownDataFrame
def extract_table(url: str) -> Iterable[Table]:
    """Ask the vision model to extract the tables found in an image.

    Args:
        url: Value passed as the ``image_url`` of the request.

    Returns:
        Whatever the patched client returns for
        ``response_model=Iterable[Table]``.
    """
    return client.chat.completions.create(
        model="gpt-4-vision-preview",
        response_model=Iterable[Table],
        max_tokens=1800,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": """Extract the table from the image, and describe it.
Each table should be tidy, do not try to join tables that
should be seperately described.""",
                    },
                    {
                        "type": "image_url",
                        "image_url": {"url": url},
                    },
                ],
            }
        ],
    )
#| eval: false
# Example run: extract every table from a sample image and print each one.
url = "https://a.storyblok.com/f/47007/2400x2000/bf383abc3c/231031_uk-ireland-in-three-charts_table_v01_b.png"
tables = extract_table(url)
for tbl in tables:
    print(tbl.caption, end="\n")
    print(tbl.dataframe)