Subclassing schemas
Subclassing schemas is a useful pattern for pipelines in which each successive function adds a few columns.
[1]:
from typedspark import Column, Schema, DataSet
from pyspark.sql.types import LongType, StringType
from pyspark.sql.functions import lit
class Person(Schema):
    # Base schema for the pipeline: two typed columns, declared in order.
    id: Column[LongType]      # long-typed identifier column
    name: Column[StringType]  # string-typed name column
class PersonWithAge(Person):
    # Subclass of ``Person``: it declares only the one extra column below and
    # relies on inheritance for the columns declared on ``Person``.
    age: Column[LongType]  # long-typed age column
def foo(df: DataSet[Person]) -> DataSet[PersonWithAge]:
    """Return ``df`` extended with the ``age`` column of ``PersonWithAge``.

    The new column is referenced through ``PersonWithAge.age`` and is filled
    with the literal value 42. The resulting frame is then wrapped in
    ``DataSet[PersonWithAge]``.
    """
    with_age = df.withColumn(PersonWithAge.age, lit(42))
    return DataSet[PersonWithAge](with_age)
Similarly, you can use this pattern when merging (or joining or concatenating) two datasets together.
[2]:
class PersonA(Schema):
    # First input schema of the join example: an id plus a name.
    id: Column[LongType]      # long-typed identifier column
    name: Column[StringType]  # string-typed name column
class PersonB(Schema):
    # Second input schema of the join example: an id plus an age.
    id: Column[LongType]   # long-typed identifier column
    # The data type is spelled out explicitly, like every other column in
    # these examples (and matching ``PersonWithAge.age``), instead of being
    # left as a bare ``Column``.
    age: Column[LongType]  # long-typed age column
class PersonAB(PersonA, PersonB):
    # Combined schema built purely through multiple inheritance from
    # ``PersonA`` and ``PersonB``; it declares no columns of its own.
    ...
def foo(df_a: DataSet[PersonA], df_b: DataSet[PersonB]) -> DataSet[PersonAB]:
    """Join ``df_a`` with ``df_b`` and return the result as ``DataSet[PersonAB]``.

    The join column is referenced through ``PersonAB.id``. No join type is
    passed, so ``join`` uses its default.
    """
    joined = df_a.join(df_b, PersonAB.id)
    return DataSet[PersonAB](joined)