Dataframely is a Python package to validate the schema and content of Polars data frames.
Its purpose is to make data pipelines more robust by ensuring that data meets expectations,
and more readable by adding schema information to data frame type hints.
You can install dataframely using your favorite package manager, e.g., pixi or pip:
pixi add dataframely
pip install dataframely
import dataframely as dy
import polars as pl
class HouseSchema(dy.Schema):
    """Schema describing a data frame of house listings.

    Declares four non-nullable columns and two validation rules: one
    evaluated on the bathroom/bedroom ratio, and one declared with
    ``group_by=["zip_code"]``.
    """

    # Column definitions: none of the columns may contain nulls.
    zip_code = dy.String(nullable=False, min_length=3)  # at least 3 characters
    num_bedrooms = dy.UInt8(nullable=False)
    num_bathrooms = dy.UInt8(nullable=False)
    price = dy.Float64(nullable=False)

    @dy.rule()
    def reasonable_bathroom_to_bedrooom_ratio() -> pl.Expr:
        """Require the bathroom-to-bedroom ratio to lie within [1/3, 3]."""
        ratio = pl.col("num_bathrooms") / pl.col("num_bedrooms")
        return (ratio >= 1 / 3) & (ratio <= 3)

    @dy.rule(group_by=["zip_code"])
    def minimum_zip_code_count() -> pl.Expr:
        """Require a length of at least 2; the rule is grouped by ``zip_code``."""
        return pl.len() >= 2
import polars as pl
# Example listings, one list per column. Some entries conflict with the
# HouseSchema constraints (a one-character zip code, missing bedroom counts).
house_data = {
    "zip_code": ["01234", "01234", "1", "213", "123", "213"],
    "num_bedrooms": [2, 2, 1, None, None, 2],
    "num_bathrooms": [1, 2, 1, 1, 0, 8],
    "price": [100_000, 110_000, 50_000, 80_000, 60_000, 160_000],
}
df = pl.DataFrame(house_data)

# Check the frame against the schema; cast=True requests casting the columns
# to the types the schema expects.
validated_df: dy.DataFrame[HouseSchema] = HouseSchema.validate(df, cast=True)
See more advanced usage examples in the documentation.