Common Libraries#

Schemas#

ID#

alias of str

Hash#

alias of str

class Status(value)[source]#

An enumeration.

WAITING = 'Waiting'#
STARTED = 'Started'#
RUNNING = 'Running'#
FAILED = 'Failed'#
FINISHED = 'Finished'#
generate_id()[source]#

Generates an ID for a job that needs to be performed.

get_hash(string)[source]#

Image Models#

class ImageSize(*args, **kwargs)[source]#
height = Ellipsis#
width = Ellipsis#
class Config(*args, **kwargs)[source]#
class ImageFullModelConfig(*args, **kwargs)[source]#
property invariant_json#
class ImageModelInfo(*args, **kwargs)[source]#
image_size = None#
class Config(*args, **kwargs)[source]#
title = 'Information about the image model'#
class ReshapeModelConfig(*args, **kwargs)[source]#
reshape_image_size = Ellipsis#
class Config(*args, **kwargs)[source]#
title = 'Reshape Model'#
class TFFullImageModelConfig(*args, **kwargs)[source]#
tf_image_model_url = Ellipsis#
id = '0'#
output_key = None#
required_image_size = Ellipsis#
class Config(*args, **kwargs)[source]#
title = 'TensorFlow Image Model'#
class FinetunedTFFullImageModelConfig(*args, **kwargs)[source]#
tf_image_model_url = Ellipsis#
base_model = Ellipsis#
train_readers = Ellipsis#
lr = Ellipsis#
epochs = Ellipsis#
output_key = None#
required_image_size = Ellipsis#
class Config(*args, **kwargs)[source]#
title = 'TensorFlow Image Model'#
class TorchvisionInternalModelConfig(*args, **kwargs)[source]#
id_name = Ellipsis#
layer_extractor = Ellipsis#
required_image_size = Ellipsis#
class Config(*args, **kwargs)[source]#
class ImageNoOpModelConfig(*args, **kwargs)[source]#
noop_target_image_size = Ellipsis#
class Config(*args, **kwargs)[source]#
title = 'Image Store (no-op) Model'#
class ImageKerasLayerConfig(*args, **kwargs)[source]#
required_image_size = Ellipsis#
class Config(*args, **kwargs)[source]#
title = 'Image Keras Layer'#
class HFImageModelConfig(*args, **kwargs)[source]#
hf_name = Ellipsis#
required_image_size = Ellipsis#
class Config(*args, **kwargs)[source]#
title = 'Huggingface Image Model'#

Text Models#

class TextFullModelConfig[source]#
property invariant_json#

Gets an invariant JSON from which a model config can be constructed. The invariance here means that two models that produce the same output for a given input should have the same JSON.

Returns:

Invariant JSON representation.

Return type:

str

class TextModelInfo(*, batch_size=None, date_added=datetime.date(2022, 8, 15), num_params=None, token_length=None)[source]#
token_length#
class Config[source]#
title = 'Information about the text model'#
class TFFullTextModelConfig(*, tf_text_model_url, output_key=None)[source]#
tf_text_model_url#
output_key#
class Config[source]#
title = 'TensorFlow Text Model'#
class TextNoOpModelConfig(*, noop_text)[source]#
noop_text#
property invariant_json#

Gets an invariant JSON from which a model config can be constructed. The invariance here means that two models that produce the same output for a given input should have the same JSON.

Returns:

Invariant JSON representation.

Return type:

str

class Config[source]#
title = 'Text Store (no-op) Model'#
class TextKerasLayerConfig(*, text_layer_path)[source]#
text_layer_path#
class Config[source]#
title = 'Text Keras Layer'#
class HFTextModelConfig(*, hf_name, max_length, pooled_output, tokenizer_params=None)[source]#
hf_name#
max_length#
pooled_output#
tokenizer_params#
classmethod pooled_output_available(values)[source]#
class Config[source]#
title = 'HuggingFace Model'#

Pipeline#

class DataType(value)[source]#

Specifies the type of the data.

IMAGE = 'image'#
TEXT = 'text'#
OTHER = 'other'#
UNKNOWN = 'unknown'#
property is_embed_type#

Specifies whether the selected data type is suitable to be embedded.

Returns:

True if the data type is suitable to be embedded or if the data type is unknown, False otherwise.

Return type:

bool

class Device(value)[source]#

Device to use (whether to use hardware acceleration or not).

GPU = 'GPU'#
CPU = 'CPU'#

Model#

class PreprocessingSpecs[source]#

Specifies how the data should be preprocessed by the reader such that it is suitable to be used by a model for inference.

property needs_disabled_multithreading#

Specifies whether the preprocessing should be run in a single thread.

Returns:

True if the preprocessing should not be run in multiple threads, False otherwise.

Return type:

bool

abstract get_tf_preprocessing_fn()[source]#

Returns the function that should be used for preprocessing in TensorFlow datasets.

Returns:

TensorFlow dataset preprocessing function.

Return type:

Callable, optional

abstract get_pt_preprocessing_fn()[source]#

Returns the function that should be used for preprocessing in PyTorch datasets.

Returns:

PyTorch dataset preprocessing function.

Return type:

Callable, optional

class NullPreprocessingSpecs[source]#

Used in places where preprocessing is not needed.

get_tf_preprocessing_fn()[source]#

Returns the function that should be used for preprocessing in TensorFlow datasets.

Returns:

TensorFlow dataset preprocessing function.

Return type:

Callable, optional

get_pt_preprocessing_fn()[source]#

Returns the function that should be used for preprocessing in PyTorch datasets.

Returns:

PyTorch dataset preprocessing function.

Return type:

Callable, optional

class Model[source]#

Interface that should be implemented by all models.

abstract get_preprocessing_specs()[source]#

Returns the preprocessing specification that preprocesses the data in a way that is suitable to be passed to the model.

Returns:

Preprocessing specification.

Return type:

PreprocessingSpecs

abstract property data_type#

Specifies what kind of data the model can embed.

Returns:

Type of data the model can embed.

Return type:

DataType

abstract apply_embedding(features)[source]#

Embeds the specified data.

Parameters:

features (np.ndarray) – Data to be embedded.

Returns:

Embedded data.

Return type:

np.ndarray

class NullModel[source]#

Used in places where the model is not needed.

get_preprocessing_specs()[source]#

Returns the preprocessing specification that preprocesses the data in a way that is suitable to be passed to the model.

Returns:

Preprocessing specification.

Return type:

PreprocessingSpecs

property data_type#

Specifies what kind of data the model can embed.

Returns:

Type of data the model can embed.

Return type:

DataType

apply_embedding(features)[source]#

Embeds the specified data.

Parameters:

features (np.ndarray) – Data to be embedded.

Returns:

Embedded data.

Return type:

np.ndarray

class ModelFactory[source]#

Instantiates model instances given the specified parameters.

abstract static get_model(config, device)[source]#

Instantiates a model given the specified parameters.

Parameters:
  • config (FullModelConfig) – Model configuration.

  • device (Device) – Target device, on which the model should be loaded.

Returns:

Model instance.

Return type:

Model

Reader#

class Reader[source]#

Interface that should be implemented by all readers.

abstract __iter__()[source]#

Returns the iterator which can be iterated to get the data.

abstract __next__()[source]#

Returns next subset of the data (e.g. according to the specified batch size). All NumPy arrays should have the same length (= first dimension).

Returns:

Next subset of the data. Each key is a different part of the data (e.g. image, label, description, …).

Return type:

Dict[str, np.ndarray]

abstract property data_type#

Specifies the data type of the feature with key ‘READER_EMBED_FEATURE_NAME’ even if this feature is not present in the dataset.

Returns:

Type of the feature with key ‘READER_EMBED_FEATURE_NAME’.

Return type:

DataType

class ReaderFactory[source]#

Responsible for instantiating reader instances given the specified parameters.

abstract static get_reader(reader_config, batch_size, specs)[source]#

Instantiates a reader given the specified parameters.

Parameters:
  • reader_config (ReaderConfig) – Reader configuration.

  • specs (PreprocessingSpecs) – Preprocessing specification.

  • batch_size (int, optional) – Batch size; if not specified, maximal possible batch size (whole dataset) should be used.

Returns:

Reader instance.

Return type:

Reader

DB Tools#

Redis#

class StatusMap(pool=None)[source]#

Stores status of jobs. If Redis is used, each thread/process should use its own instance. If Redis is not used, a single instance should be used by one thread/process.

Parameters:

pool (BlockingConnectionPool, optional) – Connection pool needed to use Redis. The default value None means that Redis is not used. Instead, regular Python data structures are used.

__getitem__(job_id)[source]#

Returns the status for the job/request with the specified ID.

Parameters:

job_id (ID) – Job/request ID.

Returns:

Status of the requested job.

Return type:

StatusResponse

__setitem__(job_id, status)[source]#

Sets the status for the job/request with the specified ID.

Parameters:
  • job_id (ID) – Job/request ID.

  • status (StatusResponse) – Status of the job.

__contains__(job_id)[source]#

Checks whether the map contains status for a job/request.

Parameters:

job_id (ID) – Job/request ID.

Returns:

True if the status of the job is stored in the map, False otherwise.

Return type:

bool

class InferenceQueue(pool=None)[source]#

Stores inference requests waiting for execution. If Redis is used, each thread/process should use its own instance. If Redis is not used, a single instance should be used by one thread/process.

Parameters:

pool (BlockingConnectionPool, optional) – Connection pool needed to use Redis. The default value None means that Redis is not used. Instead, regular Python data structures are used.

empty()[source]#

Checks whether the queue is empty.

Returns:

True if the queue is empty, False otherwise.

Return type:

bool

put(value)[source]#

Puts the request into the queue.

Parameters:

value (InferenceRequest) – Request to put into queue.

get()[source]#

Returns the first element from the queue. Before this call, it should be checked with another method that the queue is not empty. Note that it is not safe to call this method from multiple threads/processes as the element could have already been returned to another thread/process.

Returns:

First element from the queue.

Return type:

InferenceRequest

class ClassifierDeps(pool=None)[source]#

Stores classifier requests waiting for execution and tracks their dependencies. If Redis is used, each thread/process should use its own instance. If Redis is not used, a single instance should be used by one thread/process.

Parameters:
  • pool (BlockingConnectionPool, optional) – Connection pool needed to use Redis. The default value None means that Redis is not used. Instead, regular Python data structures are used.

add_request(request)[source]#

Stores a classifier request and starts to track its dependencies.

Parameters:

request (ClassifierRequest) – Classifier request.

update_dependencies(status_map)[source]#

Updates dependencies of all classifier jobs to determine which jobs are ready for execution and which jobs cannot be executed, because one of their dependencies failed.

Parameters:

status_map (StatusMap) – Statuses of all jobs used to check statuses of classifier jobs dependencies.

Returns:

Pairs (classifier job ID, dependency ID). Each pair means that a dependency failed and because of that the classifier job cannot be executed, so it should be marked as failed.

Return type:

Sequence[Tuple[ID, ID]]

any_request_ready()[source]#

Checks whether any classifier job/request is ready for execution, because all of its dependencies are ready.

Returns:

True if any request is ready for execution, False otherwise.

Return type:

bool

get_ready_request()[source]#

Returns a request that has all of its dependencies ready. Before this call, it should be checked with another method that such a request actually exists. Note that it is not safe to call this method from multiple threads/processes as the request could have already been returned to another thread/process.

Returns:

Request ready for execution.

Return type:

ClassifierRequest

class RedisData(host, port)[source]#

Provides a simple way of instantiating Redis data structures.

Parameters:
  • host (str) – Redis host.

  • port (str) – Redis port.

get_status_map()[source]#
get_task2vec_queue()[source]#
get_inference_queue()[source]#
get_classifier_deps()[source]#
get_hyperband_queue()[source]#
get_finetune_queue()[source]#
flush()[source]#
get_all()[source]#

Returns all Redis data structures.

Returns:

All data structures.

Return type:

Tuple[InferenceQueue, StatusMap, ClassifierDeps]

class Task2vecQueue(pool=None)[source]#

Stores task2vec requests waiting for execution. If Redis is used, each thread/process should use its own instance. If Redis is not used, a single instance should be used by one thread/process.

Parameters:

pool (BlockingConnectionPool, optional) – Connection pool needed to use Redis. The default value None means that Redis is not used. Instead, regular Python data structures are used.

empty()[source]#

Checks whether the queue is empty.

Returns:

True if the queue is empty, False otherwise.

Return type:

bool

put(value)[source]#

Puts the request into the queue.

Parameters:

value (Task2VecRequest) – Request to put into the queue.

get()[source]#

Returns the first element from the queue. Before this call, it should be checked with another method that the queue is not empty. Note that it is not safe to call this method from multiple threads/processes as the element could have already been returned to another thread/process.

Returns:

First element from the queue.

Return type:

Task2VecRequest

class HyperbandQueue(pool)[source]#
empty()[source]#

Checks whether the queue is empty.

Returns:

True if the queue is empty, False otherwise.

Return type:

bool

put(value)[source]#

Puts the request into the queue.

Parameters:

value (Hyperband Request) – Request to put into queue.

get()[source]#

Returns the first element from the queue. Before this call, it should be checked with another method that the queue is not empty. Note that it is not safe to call this method from multiple threads/processes as the element could have already been returned to another thread/process.

Returns:

First element from the queue.

Return type:

HyperbandStatus

PostgreSQL#

class JobsDBInterface[source]#
abstract store_inference_job(req)[source]#

Stores inference job/request to the database.

Parameters:

req (InferenceRequest) – Inference request to store.

abstract store_nearest_neighbor_job(classifier_job_hash, nn_result)[source]#

Stores the nearest neighbor job result.

Parameters:
  • classifier_job_hash (Hash) – Identifier of a specific nearest neighbor job.

  • nn_result (NearestNeighborResult) – Result from which a new result can be computed if only labels get changed.

abstract store_known_result(job_hash, nn_result, classifier_request)[source]#

Store Known Results

abstract query_model_by_tags()[source]#

Query model by tags

abstract store_task2vec_job(req)[source]#
abstract store_finetune_job(req)[source]#

Store finetune job

abstract get_reader_by_json(json_reader)[source]#
abstract get_known_result_by_params(classifier_type, model_json, train_reader_json, test_reader_json)[source]#
abstract purge()[source]#

Purge saved results in database

abstract get_nn_result(nn_job_hash)[source]#

Retrieves the nearest neighbor result from the database.

Parameters:

nn_job_hash (Hash) – Identifier of a specific nearest neighbor job.

Returns:

Result from which a new result can be computed if only labels get changed. The value None means that the result for the specific job (identified by its hash) has not been stored yet.

Return type:

NearestNeighborResult, optional

abstract get_successful_inference_request_hashes()[source]#

Retrieves hashes of inference jobs that successfully completed in the past.

Returns:

Hashes of successful inference jobs.

Return type:

Set[Hash]

abstract get_successful_classifier_request_hashes_and_errors()[source]#

Retrieves hashes of classifier jobs that successfully completed in the past and the error they achieved.

Returns:

Mapping from successful classifier job hashes to their errors.

Return type:

Dict[Hash, float]

abstract get_successful_task2vec_request_hashes()[source]#

Retrieves hashes of task2vec jobs that successfully completed in the past.

Returns:

Hashes of successful task2vec jobs.

Return type:

Set[Hash]

abstract get_successful_finetune_request_hashes()[source]#

Retrieves hashes of finetune jobs that successfully completed in the past.

Returns:

Hashes of successful finetune jobs

Return type:

Set[Hash]

abstract store_linear_job(h, linear_result)[source]#

Store linear classification job

abstract store_hyperband_job(h)[source]#

Store hyperband job

abstract check_hyperband_job(h)[source]#

Check hyperband job

class BaseDBInterface[source]#
abstract populate_model_databases()[source]#

Populates model databases with predefined models.

abstract populate_tasks_database()[source]#

Populates tasks database with predefined tasks.

abstract register_image_model(request)[source]#

Stores an image model together with information about it in the database.

Parameters:

request (ImageModelRegistrationRequest) – Model and information about it.

abstract register_text_model(request)[source]#

Stores a text model together with information about it in the database.

Parameters:

request (TextModelRegistrationRequest) – Model and information about it.

abstract register_reader(request)[source]#
abstract get_batch_size(model)[source]#

Retrieves the batch size that was specified for a model.

Parameters:

model (Union[ImageFullModelConfig, TextFullModelConfig]) – Model for which the batch size will be returned.

Returns:

The batch size. The value None means that batch size for the model was not specified in the past.

Return type:

int, optional

abstract get_readers_used_with_a_model(request)[source]#

Retrieves readers that were used in the past together with the specified model.

Parameters:

request (ReadersUsedWithAModelRequest) – Model for which the corresponding readers will be returned.

Returns:

Readers that were used with the specified model and the hash identifying the job in which they were used together.

Return type:

ReadersUsedWithAModelResponse

abstract get_models_used_with_a_reader(request)[source]#

Retrieves models that were used in the past together with the specified reader.

Parameters:

request (ModelsUsedWithAReaderRequest) – Reader for which the corresponding models will be returned.

Returns:

Models that were used with the specified reader and the hash identifying the job in which they were used together.

Return type:

ModelsUsedWithAReaderResponse

abstract get_image_models(request)[source]#

Retrieves image models that satisfy the specified criteria.

Parameters:

request (ImageModelInfoRequest) – Image model criteria.

Returns:

Image models satisfying criteria.

Return type:

MatchingImageModelsResponse

abstract get_text_models(request)[source]#

Retrieves text models that satisfy the specified criteria.

Parameters:

request (TextModelInfoRequest) – Text model criteria.

Returns:

Text models satisfying criteria.

Return type:

MatchingTextModelResponse

class JobsDB(dbname, user, password, host, port)[source]#
populate_model_databases()[source]#

Populates model databases with predefined models.

populate_tasks_database()[source]#

Populates tasks database with predefined tasks.

register_image_model(request)[source]#

Stores an image model together with information about it in the database.

Parameters:

request (ImageModelRegistrationRequest) – Model and information about it.

register_text_model(request)[source]#

Stores a text model together with information about it in the database.

Parameters:

request (TextModelRegistrationRequest) – Model and information about it.

register_reader(request)[source]#
get_batch_size(model)[source]#

Retrieves the batch size that was specified for a model.

Parameters:

model (Union[ImageFullModelConfig, TextFullModelConfig]) – Model for which the batch size will be returned.

Returns:

The batch size. The value None means that batch size for the model was not specified in the past.

Return type:

int, optional

get_model_info(model, info_name)[source]#
get_readers_used_with_a_model(request)[source]#

Retrieves readers that were used in the past together with the specified model.

Parameters:

request (ReadersUsedWithAModelRequest) – Model for which the corresponding readers will be returned.

Returns:

Readers that were used with the specified model and the hash identifying the job in which they were used together.

Return type:

ReadersUsedWithAModelResponse

get_reader_size(json_reader)[source]#
get_reader_by_json(json_reader)[source]#
get_all_registered_readers()[source]#
get_registered_readers_with_name(name)[source]#
get_models_used_with_a_reader(request)[source]#

Retrieves models that were used in the past together with the specified reader.

Parameters:

request (ModelsUsedWithAReaderRequest) – Reader for which the corresponding models will be returned.

Returns:

Models that were used with the specified reader and the hash identifying the job in which they were used together.

Return type:

ModelsUsedWithAReaderResponse

get_image_models(request)[source]#

Retrieves image models that satisfy the specified criteria.

Parameters:

request (ImageModelInfoRequest) – Image model criteria.

Returns:

Image models satisfying criteria.

Return type:

MatchingImageModelsResponse

get_text_models(request)[source]#

Retrieves text models that satisfy the specified criteria.

Parameters:

request (TextModelInfoRequest) – Text model criteria.

Returns:

Text models satisfying criteria.

Return type:

MatchingTextModelResponse

store_inference_job(req)[source]#

Stores inference job/request to the database.

Parameters:

req (InferenceRequest) – Inference request to store.

store_task2vec_job(req)[source]#
store_nearest_neighbor_job(classifier_job_hash, nn_result)[source]#

Stores the nearest neighbor job result.

Parameters:
  • classifier_job_hash (Hash) – Identifier of a specific nearest neighbor job.

  • nn_result (NearestNeighborResult) – Result from which a new result can be computed if only labels get changed.

store_linear_job(classifier_job_hash, linear_result)[source]#

Store linear classification job

get_info_with_inference_hash(inference_job_hash)[source]#
get_known_result(classifier_hash)[source]#
get_known_result_by_params(classifier_type, model_json, train_reader_json, test_reader_json)[source]#
store_known_result(job_hash, nn_result, classifier_request)[source]#

Store Known Results

get_nn_result(nn_job_hash)[source]#

Retrieves the nearest neighbor result from the database.

Parameters:

nn_job_hash (Hash) – Identifier of a specific nearest neighbor job.

Returns:

Result from which a new result can be computed if only labels get changed. The value None means that the result for the specific job (identified by its hash) has not been stored yet.

Return type:

NearestNeighborResult, optional

get_linear_result(lc_job_hash)[source]#
get_dataset(dataset_name)[source]#
get_all_readers()[source]#
get_successful_inference_request_hashes()[source]#

Retrieves hashes of inference jobs that successfully completed in the past.

Returns:

Hashes of successful inference jobs.

Return type:

Set[Hash]

get_successful_classifier_request_hashes_and_errors()[source]#

Retrieves hashes of classifier jobs that successfully completed in the past and the error they achieved.

Returns:

Mapping from successful classifier job hashes to their errors.

Return type:

Dict[Hash, float]

get_successful_task2vec_request_hashes()[source]#

Retrieves hashes of task2vec jobs that successfully completed in the past.

Returns:

Hashes of successful task2vec jobs.

Return type:

Set[Hash]

store_hyperband_job(h)[source]#

Store hyperband job

store_finetune_job(req)[source]#

Store finetune job

get_successful_finetune_request_hashes()[source]#

Retrieves hashes of finetune jobs that successfully completed in the past.

Returns:

Hashes of successful finetune jobs

Return type:

Set[Hash]

check_hyperband_job(h)[source]#

Check hyperband job

purge()[source]#

Purge saved results in database

query_model_by_tags(request)[source]#

Query model by tags

Extensions#

class Settings(_env_file='<object object>', _env_file_encoding=None, _env_nested_delimiter=None, _secrets_dir=None, *, redis_host, redis_port, postgres_host, postgres_port, postgres_user, postgres_database)[source]#
redis_host#
redis_port#
postgres_host#
postgres_port#
postgres_user#
postgres_database#
class Secrets(_env_file='<object object>', _env_file_encoding=None, _env_nested_delimiter=None, _secrets_dir=None, *, postgres_password)[source]#
postgres_password#
class ShiftExtension(name)[source]#