Import Text Regions to Text Files

A text region object label is added to a text file by giving the character range it covers: pass a Range with the start and end character positions in the range argument of TextCoordinates.


# Import dependencies
from typing import List
from encord import EncordUserClient
from encord.objects import LabelRowV2, Object, ObjectInstance, OntologyStructure
from encord.objects.frames import Range
from encord.objects.coordinates import TextCoordinates


# Path to the SSH private key file used to authenticate with Encord.
SSH_PATH = "<file-path-to-your-ssh-key>"
# Unique ID of the project that contains the text files to label.
PROJECT_ID = "<project-unique-id>"

# Authenticate using the private key stored at SSH_PATH
user_client = EncordUserClient.create_with_ssh_private_key(
    ssh_private_key_path=SSH_PATH,
)

# Specify the project
project = user_client.get_project(PROJECT_ID)

# Example mapping of data unit titles to their respective text ranges.
# Every Range becomes one text region label on the matching data unit.
data_units_to_update = {
    "<my-text-file-01>": [Range(start=5000, end=5050)],
    "<your-text-file-01>": [Range(start=3000, end=3050), Range(start=14000, end=14050)],
    "<their-text-file-01>": [Range(start=13100, end=13250), Range(start=24100, end=24550)],
}

# Find the ontology object for your Text Region
text_object: Object = project.ontology_structure.get_child_by_title(
    title="<name-of-your-text-region>", type_=Object
)

# Defensive guard so a missing ontology object fails with a clear message.
# NOTE(review): get_child_by_title may raise on its own instead of returning
# None - confirm against the SDK version in use.
if text_object is None:
    raise ValueError("Ontology object for 'Text Region' not found.")

# Titles for which no label row was found; reported in the final summary so
# the script never claims success for data units it did not touch.
skipped_titles = []

# Iterate over each data unit and apply updates
for data_title, text_ranges in data_units_to_update.items():
    # Get the corresponding label row
    label_rows = project.list_label_rows_v2(data_title_eq=data_title)

    if not label_rows:
        print(f"Skipping: No label row found for {data_title}")
        skipped_titles.append(data_title)
        continue

    # Only the first matching label row is updated.
    label_row = label_rows[0]
    # Download the existing labels before adding new instances.
    label_row.initialise_labels()

    # Add one object instance per text range
    for text_range in text_ranges:
        text_object_instance: ObjectInstance = text_object.create_instance()
        text_object_instance.set_for_frames(
            frames=0,  # Indicates a single data unit (text file)
            coordinates=TextCoordinates(range=[text_range]),
        )
        label_row.add_object_instance(text_object_instance)
        print(f"Added text range {text_range} to {data_title}")

    # Save the label row once, after adding all instances
    label_row.save()
    print(f"Saved label row for {data_title}")

if skipped_titles:
    print(
        f"Updates applied, but {len(skipped_titles)} data unit(s) were skipped "
        f"because no label row was found: {skipped_titles}"
    )
else:
    print("Updates applied to all specified data units!")


Import Classifications to Text Files

The Classification example uses an Ontology with nested attributes, structured as follows:

  • Accurate?
    • Yes
    • No
      • Correction (text field to provide edits for the correction)

Note: create_instance must be called with range_only=True for text documents, including HTML documents.

# Import dependencies
from typing import List
from pathlib import Path
from encord import EncordUserClient, Project
from encord.objects.frames import Range
from encord.objects import LabelRowV2, Classification, Option, OntologyStructure

# Path to the SSH private key file used to authenticate with Encord.
SSH_PATH = "<file-path-to-ssh-private-key>"
# Unique ID of the project that contains the text file to classify.
PROJECT_ID = "<project-unique-id>"
# Title of the data unit (text / HTML file) the classification is applied to.
DATA_UNIT_NAME = "<file-name-for-text-file>.html"

# Create user client using ssh key (the key content is read from SSH_PATH)
user_client: EncordUserClient = EncordUserClient.create_with_ssh_private_key(
    Path(SSH_PATH).read_text()
)

# Get project for which predictions are to be added
project: Project = user_client.get_project(PROJECT_ID)

# Specify the data unit to apply classification.
# Fail with a clear message instead of an IndexError when nothing matches.
label_rows = project.list_label_rows_v2(data_title_eq=DATA_UNIT_NAME)
if not label_rows:
    raise ValueError(f"No label row found for {DATA_UNIT_NAME}")
label_row = label_rows[0]

# Download the existing labels
label_row.initialise_labels()

# Get the Ontology structure
ontology_structure: OntologyStructure = label_row.ontology_structure

# Assume that the following radio button classification exists in the Ontology.
radio_ontology_classification: Classification = (
    ontology_structure.get_child_by_title(
        title="<classification-name>", type_=Classification
    )
)

# Look up the option (answer) to select for the classification
radio_classification_option = radio_ontology_classification.get_child_by_title(
    title="<option-name>",
    type_=Option,
)

# Create classification instance. `range_only=True` is required for HTML documents
radio_classification_instance = radio_ontology_classification.create_instance(range_only=True)

# Set the answer of the classification instance
radio_classification_instance.set_answer(radio_classification_option)

# Select the frames where the classification instance is present
radio_classification_instance.set_for_frames(frames=0)

# Add it to the label row
label_row.add_classification_instance(radio_classification_instance)

# Save labels
label_row.save()

Export Labels for Text Files


# Import dependencies
from encord import EncordUserClient
import json

# Path to the SSH private key file used to authenticate with Encord.
SSH_PATH = "<file-path-to-ssh-private-key>"
# Unique ID of the project to export labels from.
PROJECT_ID = "<project-unique-id>"
# Title of the data unit (text / HTML file) to export labels for.
DATA_UNIT_NAME = "<file-name-of-html-file>"

# Instantiate client. Set SSH_PATH to the path of the file containing your private key.
user_client = EncordUserClient.create_with_ssh_private_key(
    ssh_private_key_path=SSH_PATH
)

# Specify Project. Set PROJECT_ID to the ID of the Project you want to export labels for.
project = user_client.get_project(PROJECT_ID)

# Specify the data unit you want to export labels for via DATA_UNIT_NAME.
# Fail with a clear message instead of an IndexError when nothing matches.
matching_label_rows = project.list_label_rows_v2(
    data_title_eq=DATA_UNIT_NAME
)
if not matching_label_rows:
    raise ValueError(f"No label row found for {DATA_UNIT_NAME}")
specific_label_row = matching_label_rows[0]

# Download label information for the specific data unit
specific_label_row.initialise_labels()

# Print the labels as JSON
print(json.dumps(specific_label_row.to_encord_dict()))

Remove Labels from Text Files


from encord import EncordUserClient
import json

# Path to the SSH private key file used to authenticate with Encord.
SSH_PATH = "<file-path-to-ssh-private-key>"
# Unique ID of the project that contains the label to remove.
PROJECT_ID = "<project-unique-id>"
# Title of the data unit (text / HTML file) the label belongs to.
DATA_UNIT_NAME = "<file-name-of-html-file>"
# Object hash of the label (object instance) to remove.
OBJECT_HASH = "<label-unique-id>"

# Instantiate client. Set SSH_PATH to the path of the file containing your private key.
user_client = EncordUserClient.create_with_ssh_private_key(
    ssh_private_key_path=SSH_PATH
)

# Specify Project. Set PROJECT_ID to the ID of the Project you want to remove labels from.
project = user_client.get_project(PROJECT_ID)

# Specify the data unit you want to remove labels from via DATA_UNIT_NAME.
# Fail with a clear message instead of an IndexError when nothing matches.
matching_label_rows = project.list_label_rows_v2(
    data_title_eq=DATA_UNIT_NAME
)
if not matching_label_rows:
    raise ValueError(f"No label row found for {DATA_UNIT_NAME}")
specific_label_row = matching_label_rows[0]

# Download the existing labels so the object instances can be inspected
specific_label_row.initialise_labels()

# Find the object instance whose hash matches OBJECT_HASH (None if absent)
object_to_remove = next(
    (
        object_instance
        for object_instance in specific_label_row.get_object_instances()
        if object_instance.object_hash == OBJECT_HASH
    ),
    None,
)

# Never pass None to remove_object: stop with a clear message instead
if object_to_remove is None:
    raise ValueError(
        f"No object instance with hash {OBJECT_HASH} found in {DATA_UNIT_NAME}"
    )

specific_label_row.remove_object(object_to_remove)

# Persist the removal
specific_label_row.save()