Auto-Tagging and Querying Articles
This example package collects plain text files and stores them in Steamship. It tags them by running a zero-shot classifier with a set of labels provided by the package user. The results can then be queried by tag with a configurable threshold.
"""
This package accepts text documents, stores them, labels them with tags,
and retrieves them based on those tags.
"""
from typing import Any, Dict, List, Optional, Type

from steamship import Block, File, Steamship, Tag
from steamship.invocable import Config, create_handler, post, PackageService
class ArticleTaggerPackage(PackageService):
    """Store plain-text articles, tag them with a zero-shot classifier, and query them by tag.

    The set of candidate labels is supplied by the package user through
    ``ArticleTaggerConfig.labels``.
    """

    class ArticleTaggerConfig(Config):
        """Configuration required to instantiate this package."""

        labels: str  # A comma-separated list of tags to apply to articles

    def __init__(self, client: Steamship, config: Optional[Dict[str, Any]] = None):
        """Set up the package and the zero-shot classifier plugin instance it uses.

        Args:
            client: Steamship client used for all file and plugin operations.
            config: Raw package configuration; must provide ``labels``.
        """
        super().__init__(client, config)
        # Instantiate a zero-shot classifier plugin
        self.classifier_instance = client.use_plugin(
            plugin_handle="zero-shot-tagger-default",
            instance_handle="my-classifier",
            config={
                "tag_kind": "tags",  # The tag.kind we want in the output
                "labels": self.config.labels,  # The labels we want to apply
                "multi_label": True,  # multi-label: several labels may apply to one article
            },
        )

    # The config_cls method allows your package to return a class
    # that defines its required configuration.
    # See Developer Reference -> Accepting Configuration
    # for more details. This package requires the user to supply the
    # labels to tag with, so we return our own ArticleTaggerConfig class.
    def config_cls(self) -> Type[Config]:
        """Return our specific config type."""
        return self.ArticleTaggerConfig

    # This method defines the package user's endpoint for adding content
    # The @post annotation automatically makes the method available as
    # an HTTP Post request. The name in the annotation defines the HTTP
    # route suffix, see Packages -> Package Project Structure.
    @post("add_document")
    def add_document(self, content: str, url: str) -> str:
        """Accept a new plain-text document and start tagging it.

        Args:
            content: The plain-text body of the document.
            url: The document's URL; stored as a tag so queries can return it.

        Returns:
            The handle of the newly created file.
        """
        # Upload the content of the file into Steamship.
        # Put the content directly into a Block, since we assume it is plaintext.
        # Create a tag with the URL so we can get it back later.
        file = File.create(
            self.client,
            blocks=[Block.CreateRequest(text=content)],
            tags=[Tag.CreateRequest(kind="url", name=url)],
        )

        # Tag the file with the zero-shot classifier plugin.
        # Using a plugin is an asynchronous call within Steamship. The
        # operation may not be complete when this method completes,
        # but that's ok. The other methods will query over whatever is
        # currently available.
        file.tag(self.classifier_instance.handle)
        return file.handle

    @staticmethod
    def _find_url(file: File) -> Optional[str]:
        """Return the name of the first ``url`` tag on ``file``, or None if it has none."""
        # Guard against a file whose tag list is unset rather than empty.
        for tag in file.tags or []:
            if tag.kind == "url":
                return tag.name
        return None

    @post("documents_by_tag")
    def documents_by_tag(self, tag: str, threshold: float = 0.7) -> List[str]:
        """Query the stored documents for articles carrying a given tag.

        Args:
            tag: The label to search for.
            threshold: Only tags whose ``score`` value is strictly greater
                than this number are matched.

        Returns:
            The URLs of the matching documents. Files without a ``url`` tag
            are omitted.
        """
        # Coerce the threshold so that only a number can reach the query text.
        # NOTE(review): `tag` is interpolated as-is; a value containing a double
        # quote would alter the query. Confirm the query language's escaping
        # rules and sanitize accordingly.
        query = f'kind "tags" and name "{tag}" and value("score") > {float(threshold)}'
        matching_files = File.query(self.client, query).files

        # Drop files for which no URL was recorded so the result holds only strings.
        urls = (self._find_url(file) for file in matching_files)
        return [url for url in urls if url is not None]
# This line connects our Package implementation class to the surrounding
# Steamship handler code: create_handler is given the ArticleTaggerPackage
# class and its result is bound to the module-level name `handler`.
handler = create_handler(ArticleTaggerPackage)