publish

Publisher API.

Publisher

Publisher(publisher_id, url=None)

Publish timeseries data to Arrakis.

Parameters:

Name Type Description Default
publisher_id str

Publisher ID string.

required
url str | None

Initial Flight URL to connect to. Will be automatically determined if not specified.

None
Source code in arrakis/publish.py
def __init__(self, publisher_id: str, url: str | None = None):
    """Initialize Publisher.

    Parameters
    ----------
    publisher_id : str
        Publisher ID string.
    url : str | None
        Initial Flight URL to connect to.  Will be automatically
        determined if not specified.

    """
    if not HAS_KAFKA:
        msg = (
            "Publishing requires confluent-kafka to be installed."
            "This is provided by the 'publish' extra or it can be "
            "installed manually through pip or conda."
        )
        raise ImportError(msg)

    self.publisher_id = publisher_id
    self.initial_url = parse_url(url)

    self.channels: dict[str, Channel] = {}
    self.stride: int | None = None

    self._producer: Producer
    self._validator = RequestValidator()

close

close()

Exit publication context manager.

Source code in arrakis/publish.py
def close(self) -> None:
    """Exit publication context manager."""
    logger.info("closing kafka producer...")
    with contextlib.suppress(Exception):
        self._producer.flush()

enter

enter()

Enter publication context manager.

Source code in arrakis/publish.py
def enter(self) -> None:
    """Enter publication context manager"""
    # get connection properties
    producer_info: dict[str, str] = {}
    descriptor = create_descriptor(
        RequestType.Publish,
        publisher_id=self.publisher_id,
        validator=self._validator,
    )
    with connect(self.initial_url) as client:
        flight_info = client.get_flight_info(descriptor)
        with MultiFlightReader(flight_info.endpoints, client) as stream:
            for data in stream.unpack():
                kv_pairs = data["properties"]
                producer_info.update(dict(kv_pairs))
    logger.info("producer info: %s", producer_info)

    # set up producer
    self._producer = Producer(
        {
            "message.max.bytes": 10_000_000,  # 10 MB
            "enable.idempotence": True,
            **producer_info,
        }
    )
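
A sketch of the explicit setup/teardown flow suggested by these methods; the ordering (register, enter, publish, close) and the publisher ID are assumptions, and register() is documented below:

pub = Publisher("my-publisher")   # hypothetical publisher ID
pub.register()   # look up this publisher's channels on the server
pub.enter()      # fetch Kafka connection properties and create the producer
try:
    ...          # call pub.publish(block) for each data block
finally:
    pub.close()  # flush the Kafka producer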

publish

publish(block, timeout=constants.DEFAULT_TIMEOUT)

Publish timeseries data.

Parameters:

Name Type Description Default
block SeriesBlock

A data block with all channels to publish.

required
timeout timedelta

The maximum time to wait to publish before timing out. Default is 2 seconds.

DEFAULT_TIMEOUT
Source code in arrakis/publish.py
def publish(
    self,
    block: SeriesBlock,
    timeout: timedelta = constants.DEFAULT_TIMEOUT,
) -> None:
    """Publish timeseries data

    Parameters
    ----------
    block : SeriesBlock
        A data block with all channels to publish.
    timeout : timedelta, optional
        The maximum time to wait to publish before timing out.
        Default is 2 seconds.

    """
    if not hasattr(self, "_producer") or not self._producer:
        msg = (
            "publication interface not initialized, "
            "please use context manager when publishing."
        )
        raise RuntimeError(msg)

    # check for attempt to publish invalid channels
    for name, channel in block.channels.items():
        if channel != self.channels[name]:
            msg = f"invalid channel for this publisher: {channel}"
            raise ValueError(msg)
    # FIXME: check for invalid block duration
    # FIXME: check for block time in the future
    # FIXME: warning for missing channels

    # publish data for each data type
    for partition_id, batch in block.to_row_batches(self.channels):
        topic = f"arrakis-{partition_id}"
        logger.debug("publishing to topic %s: %s", topic, batch)
        self._producer.produce(topic=topic, value=serialize_batch(batch))
        self._producer.flush()
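
A hedged publishing-loop sketch; generate_blocks() is a hypothetical generator yielding SeriesBlock instances (their construction depends on the arrakis block API and is not shown), and the 5-second timeout is an arbitrary example value:

from datetime import timedelta

for block in generate_blocks():  # hypothetical source of SeriesBlock objects
    # each block should contain only channels registered with this publisher
    pub.publish(block, timeout=timedelta(seconds=5))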

register

register(channels=None)

Register channels for publication.

For most publishers, channels are not specified when registering and this method will query the server for the allowable channels for this publisher and register them internally.

For publishers allowed to register their own channels, all channels they expect to publish should be provided as an argument; they will be registered with the server, which will in turn provide the necessary partition information for the channels to be published into Kafka.

Source code in arrakis/publish.py
def register(self, channels: Iterable[Channel] | None = None):
    """register channels for publication

    For most publishers, channels are not specified when
    registering and this method will query the server for the
    allowable channels for this publisher and register them
    internally.

    For publishers allowed to register their own channels, all
    channels they expect to publish should be provided as an
    argument; they will be registered with the server, which
    will in turn provide the necessary partition information for
    the channels to be published into Kafka.

    """

    if channels:
        msg = "Channel self-register is not currently supported."
        raise NotImplementedError(msg)

        for name, id_, index in self._partition_channels(channels):
            self.channels[name] = replace(
                self.channels[name],
                partition_id=id_,
                partition_index=index,
            )

        # check that all requested channels have been registered
        for channel in channels:
            if channel.name not in self.channels:
                msg = f"channel {channel.name} was not properly registered."
                raise ValueError(msg)

    # This find call should raise an error if the publisher_id
    # is not known to the server.
    for channel in Client(self.initial_url).find(publisher=self.publisher_id):
        if self.stride:
            assert channel.stride == self.stride, (
                "channels specify inconsistent stride"
            )
        else:
            self.stride = channel.stride
        self.channels[channel.name] = channel

    if not self.channels:
        # FIXME: more informative error message here
        msg = f"unknown publisher ID '{self.publisher_id}'."
        raise ValueError(msg)

    for channel in self.channels.values():
        assert channel.partition_id is not None, (
            f"Channel {channel} is missing partition_id."
        )
        assert channel.partition_index is not None, (
            f"Channel {channel} is missing partition_index."
        )

    return self
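
For the common case where no channels are passed, a sketch of what registration yields, again with a hypothetical publisher ID:

pub = Publisher("my-publisher")
pub.register()     # server-side lookup of this publisher's channels
print(pub.stride)  # common stride shared by all registered channels
for name, channel in pub.channels.items():
    # each registered channel now carries its partition assignment
    print(name, channel.partition_id, channel.partition_index)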

channel_to_dtype_name

channel_to_dtype_name(channel)

Given a channel, return the data type's name.

Source code in arrakis/publish.py
def channel_to_dtype_name(channel: Channel) -> str:
    """Given a channel, return the data type's name."""
    assert isinstance(channel.data_type, numpy.dtype)
    return channel.data_type.name
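
The returned string is simply numpy's canonical name for the channel's data_type; for reference:

import numpy

numpy.dtype(numpy.float32).name  # -> "float32"
numpy.dtype("int16").name        # -> "int16"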

serialize_batch

serialize_batch(batch)

Serialize a record batch to bytes.

Parameters:

Name Type Description Default
batch RecordBatch

The batch to serialize.

required

Returns:

Type Description
bytes

The serialized buffer.

Source code in arrakis/publish.py
def serialize_batch(batch: pyarrow.RecordBatch):
    """Serialize a record batch to bytes.

    Parameters
    ----------
    batch : pyarrow.RecordBatch
        The batch to serialize.

    Returns
    -------
    bytes
        The serialized buffer.

    """
    sink = pyarrow.BufferOutputStream()
    with pyarrow.ipc.new_stream(sink, batch.schema) as writer:
        writer.write_batch(batch)
    return sink.getvalue()
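
As a round-trip check (not part of the arrakis API), the serialized buffer can be read back with pyarrow's IPC stream reader; the example batch below is illustrative only:

import pyarrow

batch = pyarrow.RecordBatch.from_pydict(
    {"time": [0, 1, 2], "value": [0.1, 0.2, 0.3]}
)
buf = serialize_batch(batch)

reader = pyarrow.ipc.open_stream(buf)   # accepts the serialized buffer
restored = reader.read_all()            # pyarrow.Table with the original schema
assert restored.to_batches()[0].equals(batch)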