Skip to content

Feature View

Entities

Entities are used to identify the primary key on which feature values are stored and retrieved. They are used as keys during the lookup of feature values from the Online Store and during the join process in point-in-time joins. Entities are generally recognizable concepts, either concrete or abstract, such as a customer email, a document number or a surrogate key.

It is possible to define multiple entities for a single feature view, and it is also possible to have zero entities.

A dataset with a customer_id entity:

event_timestamp customer_id sum_amount_spent_last_30d
2022-04-04T01:00:00 1 1000
2022-04-04T01:00:00 2 5500
2022-04-04T01:00:00 3 2000
2022-04-05T01:00:00 1 2000

A dataset with multiple entities, customer_id and company_id:

event_timestamp customer_id company_id sum_amount_spent_last_30d
2022-04-04T01:00:00 1 a 400
2022-04-04T01:00:00 1 b 600
2022-04-04T01:00:00 2 a 5000
2022-04-04T01:00:00 2 b 500
2022-04-04T01:00:00 3 a 2000
2022-04-05T01:00:00 1 a 2000

Features

A feature is an individual measurable property. It is typically a property observed on a specific entity, but does not have to be associated with an entity.

A feature of a customer entity could be the number of transactions they have made in
a month, count_transactions_last_30d:

event_timestamp customer_id count_transactions_last_30d
2022-04-01T01:00:00 1 10
2022-04-01T01:00:00 2 3
2022-04-01T01:00:00 3 5
2022-05-01T01:00:00 1 10
2022-05-01T01:00:00 2 20
2022-05-01T01:00:00 3 30

A feature unrelated to an entity could be the number of transactions made by all customers in the last month, count_all_transactions_last_30d:

event_timestamp count_all_transactions_last_30d
2022-04-01T01:00:00 18
2022-05-01T01:00:00 60

Feature View

A Feature View aggregates entities, features and a data source, allowing the Feature Store to consistently manage feature data across time.

Info

Read more in Feast's documentation.

On Amora, defining a Feature View from an AmoraModel is done by decorating the model with amora.feature_store.decorators.feature_view and implementing the protocol FeatureViewProtocol.

E.g: StepCountBySource is a data model that exposes the features value_avg, value_sum and value_count of each source_name entity on a given event_timestamp.

from datetime import datetime
from typing import Optional

import humanize
from pydantic import NameEmail
from sqlalchemy import TIMESTAMP, Float, Integer, String, func, literal, select

from amora.feature_store.decorators import feature_view
from amora.models import AmoraModel, Field, Label, MaterializationTypes, ModelConfig
from amora.protocols import Compilable
from amora.questions import question
from amora.transformations import datetime_trunc_hour
from amora.visualization import BigNumber, LineChart, PieChart
from examples.amora_project.models.steps import Steps


@feature_view
class StepCountBySource(AmoraModel):
    # Step count aggregates (average, sum and sample count) computed from
    # `Steps`, keyed by `source_name` and an hourly `event_timestamp`.
    # Decorated with `feature_view`, so it also implements the
    # `feature_view_*` classmethods below.

    __depends_on__ = [Steps]
    __model_config__ = ModelConfig(
        materialized=MaterializationTypes.table,
        labels={
            Label("quality", "golden"),
            Label("upstream", "apple_health"),
            Label("domain", "health"),
        },
        owner=NameEmail(
            name="Diogo Magalhães Machado", email="diogo.martins@stone.com.br"
        ),
        description="Step count measurements aggregated by hour",
    )

    # Aggregated feature columns.
    value_avg: float = Field(Float, doc="Average step count of the hour")
    value_sum: float = Field(Float, doc="Sum of the step counts of the hour")
    value_count: int = Field(Integer, doc="Count of step count samples of the hour")

    # Composite primary key: one row per (source_name, event_timestamp).
    source_name: str = Field(String, primary_key=True, doc="Source of the metric")
    event_timestamp: datetime = Field(
        TIMESTAMP,
        doc="Moment in time at which those features were observed",
        primary_key=True,
    )

    @classmethod
    def source(cls) -> Optional[Compilable]:
        """Build the query that populates this model from ``Steps``.

        Returns:
            A select statement that groups ``Steps`` rows by source name and by
            the timestamp produced by ``datetime_trunc_hour`` on
            ``Steps.creationDate``, emitting the average, sum and count of
            ``Steps.value`` for each group. Output columns are labelled with
            this model's own column keys.
        """
        columns = cls.__table__.columns

        # The grouping expressions are built once and reused in both the
        # column list and the GROUP BY clause so they cannot drift apart.
        source_name = Steps.sourceName.label(columns.source_name.key)
        event_timestamp = func.timestamp(
            datetime_trunc_hour(Steps.creationDate)
        ).label(columns.event_timestamp.key)

        # Positional `select(...)` matches the style used by the other
        # queries in this file (the list form is the legacy calling style).
        return select(
            func.avg(Steps.value).label(columns.value_avg.key),
            func.sum(Steps.value).label(columns.value_sum.key),
            func.count(Steps.value).label(columns.value_count.key),
            source_name,
            event_timestamp,
        ).group_by(source_name, event_timestamp)

    @classmethod
    def feature_view_entities(cls):
        """Return the columns used as entities of the feature view."""
        return [cls.source_name]

    @classmethod
    def feature_view_features(cls):
        """Return the columns exposed as features of the feature view."""
        return [
            cls.value_avg,
            cls.value_sum,
            cls.value_count,
        ]

    @classmethod
    def feature_view_event_timestamp(cls):
        """Return the column holding the event timestamp of each observation."""
        return cls.event_timestamp

    @classmethod
    def feature_view_fa_icon(cls) -> str:
        """Return the icon name for this feature view.

        NOTE(review): the ``fa-`` prefix suggests a Font Awesome icon class;
        confirm against the consumer of this value.
        """
        return "fa-person-running"


@question(view_config=BigNumber())
def how_many_data_points_where_acquired():
    # Adds up the per-row sample counts into a single "total" column.
    total_samples = func.sum(StepCountBySource.value_count)
    return select(total_samples.label("total"))


@question()
def what_are_the_available_data_sources():
    # One row per distinct source_name present in the model.
    sources_query = select(StepCountBySource.source_name)
    return sources_query.distinct()


@question(
    view_config=BigNumber(
        value_func=lambda data: humanize.naturaldate(data["event_timestamp"][0])
    )
)
def what_is_the_observation_starting_point():
    # Earliest event_timestamp in the model; the view formats the first
    # returned value with humanize.naturaldate.
    earliest_observation = func.min(StepCountBySource.event_timestamp)
    return select(earliest_observation.label("event_timestamp"))


@question(
    view_config=BigNumber(
        value_func=lambda data: humanize.naturaldate(data["event_timestamp"][0])
    )
)
def what_is_the_latest_data_point():
    # Latest event_timestamp in the model; the view formats the first
    # returned value with humanize.naturaldate. The view configuration is
    # passed by keyword (`view_config=`), like every other question in this
    # file, instead of relying on argument position.
    latest_observation = func.max(StepCountBySource.event_timestamp)
    return select(latest_observation.label("event_timestamp"))


@question(view_config=PieChart(values="total", names="source_name"))
def what_is_the_total_step_count_to_date():
    """
    What is the total number of steps taken to date?
    """
    # One slice per source: "total" feeds the pie values and "source_name"
    # feeds the slice names configured in the decorator above.
    source_name = StepCountBySource.source_name
    total_steps = func.sum(StepCountBySource.value_sum).label("total")
    return select(total_steps, source_name).group_by(source_name)


@question(
    view_config=BigNumber(
        value_func=lambda data: humanize.intword(data["total_in_kilometers"][0])
        + " Kilometers"
    )
)
def what_is_the_current_estimated_walked_distance():
    # Estimates the walked distance by multiplying the summed step count by a
    # fixed step length of 79 cm, then exposes it in three units.
    # NOTE(review): the query returns one row per source_name, while the
    # BigNumber formatter above reads only row 0 — confirm this is intended.
    centimeters_per_meter = 100
    centimeters_per_kilometer = 100000

    step_length_cm = literal(79, type_=Integer)
    total_steps = func.sum(StepCountBySource.value_sum)
    distance_cm = total_steps * step_length_cm

    distance_columns = (
        distance_cm.label("total_in_centimeters"),
        (distance_cm / centimeters_per_meter).label("total_in_meters"),
        (distance_cm / centimeters_per_kilometer).label("total_in_kilometers"),
    )
    return select(
        *distance_columns,
        StepCountBySource.source_name,
    ).group_by(StepCountBySource.source_name)


@question(
    view_config=LineChart(
        x_func=lambda data: data["event_timestamp"],
        y_func=lambda data: data["value_sum"],
    )
)
def what_are_the_values_observed_on_the_iphone():
    # Time series of value_sum restricted to the "iPhone" source, ordered by
    # event_timestamp so the line chart is drawn chronologically.
    observed_at = StepCountBySource.event_timestamp
    is_iphone = StepCountBySource.source_name == "iPhone"

    iphone_values = select(StepCountBySource.value_sum, observed_at)
    iphone_values = iphone_values.where(is_iphone)
    return iphone_values.order_by(observed_at)

Last update: 2023-11-23