Workflow: Upload, Classify and Standardize

Upload and classify a document, and then standardize it for certain classes, all in a single POST request using workflows.

This guide demonstrates how to use DocuPanda's workflow feature to classify and standardize documents. We'll walk through the process of defining a workflow, uploading a document, and retrieving the classification and standardization results. This example is in Python, but the same concept applies to other programming languages.

Prerequisites

Before you begin, make sure you have:

  1. A DocuPanda API key
  2. Python 3 installed

Authentication

Every request to DocuPanda needs to include an API key. You can obtain your API key by signing up and going to your account settings page.

Step 1: Define a Workflow

First, we'll define a workflow that includes a classification and standardization step. This workflow will be applied to the documents we upload.

import requests

# Placeholders -- replace each one with your own value before running.
API_KEY = "YOUR_API_KEY"
APP_URL = "https://app.docupanda.io"
CLASS_ID = "YOUR_CLASS_ID"
SCHEMA_ID = "YOUR_SCHEMA_ID"

# Headers sent with every request; the API key travels in "X-API-Key".
HEADERS = {
    "accept": "application/json",
    "content-type": "application/json",
    "X-API-Key": API_KEY,
}

def post_workflow():
    """Create an on-submit-document workflow and return its ID.

    The workflow contains a single classify-and-standardize step that maps
    CLASS_ID to SCHEMA_ID.

    Returns:
        The "workflowId" value from the JSON response.

    Raises:
        RuntimeError: If the API answers with any status other than 200.
    """
    url = f"{APP_URL}/workflow/on-submit-document"
    payload = {
        "classifyStandardizeStep": {
            "classToSchema": {CLASS_ID: SCHEMA_ID},
            "stdReleaseVersion": 2,  # optional
        }
    }
    response = requests.post(url, json=payload, headers=HEADERS)
    # Raise explicitly: an assert would be stripped when Python runs with -O.
    if response.status_code != 200:
        raise RuntimeError(
            f"Workflow creation failed: HTTP {response.status_code}: {response.text}"
        )
    return response.json()["workflowId"]

# Create the workflow and keep its ID for the upload request.
workflow_id = post_workflow()
print(f"Workflow ID: {workflow_id}")

Replace "YOUR_API_KEY" with your actual API key, "YOUR_CLASS_ID" with the ID of the class you want documents to be classified into, and "YOUR_SCHEMA_ID" with the ID of the schema you want to use for standardization. By default, standardization is done with the latest version, but you can optionally set the version with the "stdReleaseVersion" field.

Step 2: Upload a Document

Next, we'll upload a document and apply the workflow we just created.

import base64

# Path of the local file to upload.
DOC_PATH = "/path/to/your/doc.pdf"
# Sent as the "dataset" field of the upload request.
DATASET_NAME = "YOUR_DATASET_NAME"

def post_doc(file_dict, workflow_id):
    """Upload a document and apply the given workflow to it.

    Args:
        file_dict: Dict holding the base64-encoded "contents" and the
            "filename" of the document.
        workflow_id: ID of the workflow to run on the uploaded document.

    Returns:
        Dict with "upload_job_id", "cls_job_id", "std_id" and "std_job_id";
        the two standardization values are the entries for CLASS_ID.

    Raises:
        RuntimeError: If the API answers with any status other than 200.
    """
    url = f"{APP_URL}/document"
    payload = {
        "document": {
            "file": file_dict,
        },
        "dataset": DATASET_NAME,
        "workflowId": workflow_id,
    }
    response = requests.post(url, json=payload, headers=HEADERS)
    # Raise explicitly: an assert would be stripped when Python runs with -O.
    if response.status_code != 200:
        raise RuntimeError(
            f"Document upload failed: HTTP {response.status_code}: {response.text}"
        )
    res_json = response.json()
    # All workflow outputs live under the same nested step object.
    step = res_json["workflowResponse"]["classifyStandardizeStep"]
    return {
        "upload_job_id": res_json["jobId"],
        "cls_job_id": step["classificationJobId"],
        "std_id": step["classToStandardizationIds"][CLASS_ID],
        "std_job_id": step["classToStandardizationJobIds"][CLASS_ID],
    }

# Read the document inside a with-block so the file handle is always closed.
with open(DOC_PATH, "rb") as doc_file:
    encoded_contents = base64.b64encode(doc_file.read()).decode()

file_dict = {
    "contents": encoded_contents,
    "filename": DOC_PATH.split("/")[-1],
}
response = post_doc(file_dict=file_dict, workflow_id=workflow_id)
print(f"Upload Job ID: {response['upload_job_id']}")
print(f"Classification Job ID: {response['cls_job_id']}")
print(f"Standardization ID: {response['std_id']}")
print(f"Standardization Job ID: {response['std_job_id']}")

Replace "/path/to/your/doc.pdf" with the actual path to your document and "YOUR_DATASET_NAME" with the name of the dataset you want to assign your document to.

Step 3: Check Job Status

DocuPanda processes documents asynchronously. We can check the status of each job by polling with its ID; the helpers below check every 2 seconds, up to 20 times, before giving up.

import time

def is_job_done(job_id, max_attempts=20, poll_interval=2):
    """Poll a job until it completes, fails, or the attempts run out.

    Args:
        job_id: ID of the job to poll.
        max_attempts: Maximum number of status requests to make.
        poll_interval: Seconds to wait between status requests.

    Returns:
        True if the job reported "completed"; False if it reported "error"
        or was still unfinished after max_attempts requests.

    Raises:
        RuntimeError: If the API answers with any status other than 200.
    """
    url = f"{APP_URL}/job/{job_id}"
    for _ in range(max_attempts):
        response = requests.get(url, headers=HEADERS)
        # Raise explicitly: an assert would be stripped when Python runs with -O.
        if response.status_code != 200:
            raise RuntimeError(
                f"Job status request failed: HTTP {response.status_code}: {response.text}"
            )
        status = response.json()["status"]
        if status == "completed":
            return True
        if status == "error":
            return False
        time.sleep(poll_interval)
    return False

def cls_job_outcome(cls_job_id, max_attempts=20, poll_interval=2):
    """Poll a classification job and report whether CLASS_ID was assigned.

    Args:
        cls_job_id: ID of the classification job to poll.
        max_attempts: Maximum number of status requests to make.
        poll_interval: Seconds to wait between status requests.

    Returns:
        Dict with "done" (True only if the job reported "completed") and
        "assigned_desired_class" (True only if CLASS_ID is among the
        "assignedClassIds" of a completed job).

    Raises:
        RuntimeError: If the API answers with any status other than 200.
    """
    url = f"{APP_URL}/classify/{cls_job_id}"
    for _ in range(max_attempts):
        response = requests.get(url, headers=HEADERS)
        # Raise explicitly: an assert would be stripped when Python runs with -O.
        if response.status_code != 200:
            raise RuntimeError(
                f"Classification status request failed: HTTP {response.status_code}: {response.text}"
            )
        res_json = response.json()
        status = res_json["status"]
        if status == "completed":
            return {
                "done": True,
                "assigned_desired_class": CLASS_ID in res_json["assignedClassIds"],
            }
        if status == "error":
            # A failed job will not complete; stop polling, as is_job_done does.
            break
        time.sleep(poll_interval)
    return {"done": False, "assigned_desired_class": False}

# Poll the three jobs one after another; each call blocks until its job
# finishes or the polling loop runs out of attempts.
upload_done = is_job_done(response["upload_job_id"])
cls_result = cls_job_outcome(response["cls_job_id"])
std_done = is_job_done(response["std_job_id"])

print(f"Upload completed: {upload_done}")
print(f"Classification completed: {cls_result['done']}")
print(f"Assigned desired class: {cls_result['assigned_desired_class']}")
print(f"Standardization completed: {std_done}")

Step 4: Retrieve Standardization Results

Once the jobs are complete and the document is classified into the desired class, we can retrieve the standardization results.

def get_std(std_id):
    """Fetch a standardization and return its "data" field.

    Args:
        std_id: ID of the standardization to retrieve.

    Returns:
        The "data" value from the JSON response.

    Raises:
        RuntimeError: If the API answers with any status other than 200.
    """
    url = f"{APP_URL}/standardization/{std_id}"
    response = requests.get(url, headers=HEADERS)
    # Raise explicitly: an assert would be stripped when Python runs with -O.
    if response.status_code != 200:
        raise RuntimeError(
            f"Standardization request failed: HTTP {response.status_code}: {response.text}"
        )
    return response.json()["data"]

# Fetch the result only when every job finished and the document was
# assigned the desired class.
if upload_done and cls_result["done"] and cls_result["assigned_desired_class"] and std_done:
    std_result = get_std(response["std_id"])
    print("Standardization Result:")
    print(std_result)
else:
    print("Document was not classified as the desired class or jobs did not complete successfully")

Complete Example

Here's a complete example that puts all these steps together:

import time
import base64
import requests

# Placeholders -- replace each one with your own value before running.
API_KEY = "YOUR_API_KEY"
APP_URL = "https://app.docupanda.io"
DOC_PATH = "/path/to/your/doc.pdf"
DATASET_NAME = "YOUR_DATASET_NAME"
CLASS_ID = "YOUR_CLASS_ID"
SCHEMA_ID = "YOUR_SCHEMA_ID"

# Headers sent with every request; the API key travels in "X-API-Key".
HEADERS = {
    "accept": "application/json",
    "content-type": "application/json",
    "X-API-Key": API_KEY,
}

def post_workflow():
    """Create an on-submit-document workflow and return its ID.

    The workflow contains a single classify-and-standardize step that maps
    CLASS_ID to SCHEMA_ID.

    Returns:
        The "workflowId" value from the JSON response.

    Raises:
        RuntimeError: If the API answers with any status other than 200.
    """
    url = f"{APP_URL}/workflow/on-submit-document"
    payload = {
        "classifyStandardizeStep": {
            "classToSchema": {CLASS_ID: SCHEMA_ID},
        }
    }
    response = requests.post(url, json=payload, headers=HEADERS)
    # Raise explicitly: an assert would be stripped when Python runs with -O.
    if response.status_code != 200:
        raise RuntimeError(
            f"Workflow creation failed: HTTP {response.status_code}: {response.text}"
        )
    return response.json()["workflowId"]

def post_doc(file_dict, workflow_id):
    """Upload a document and apply the given workflow to it.

    Args:
        file_dict: Dict holding the base64-encoded "contents" and the
            "filename" of the document.
        workflow_id: ID of the workflow to run on the uploaded document.

    Returns:
        Dict with "upload_job_id", "cls_job_id", "std_id" and "std_job_id";
        the two standardization values are the entries for CLASS_ID.

    Raises:
        RuntimeError: If the API answers with any status other than 200.
    """
    url = f"{APP_URL}/document"
    payload = {
        "document": {"file": file_dict},
        "dataset": DATASET_NAME,
        "workflowId": workflow_id,
    }
    response = requests.post(url, json=payload, headers=HEADERS)
    # Raise explicitly: an assert would be stripped when Python runs with -O.
    if response.status_code != 200:
        raise RuntimeError(
            f"Document upload failed: HTTP {response.status_code}: {response.text}"
        )
    res_json = response.json()
    # All workflow outputs live under the same nested step object.
    step = res_json["workflowResponse"]["classifyStandardizeStep"]
    return {
        "upload_job_id": res_json["jobId"],
        "cls_job_id": step["classificationJobId"],
        "std_id": step["classToStandardizationIds"][CLASS_ID],
        "std_job_id": step["classToStandardizationJobIds"][CLASS_ID],
    }

def is_job_done(job_id, max_attempts=20, poll_interval=2):
    """Poll a job until it completes, fails, or the attempts run out.

    Args:
        job_id: ID of the job to poll.
        max_attempts: Maximum number of status requests to make.
        poll_interval: Seconds to wait between status requests.

    Returns:
        True if the job reported "completed"; False if it reported "error"
        or was still unfinished after max_attempts requests.

    Raises:
        RuntimeError: If the API answers with any status other than 200.
    """
    url = f"{APP_URL}/job/{job_id}"
    for _ in range(max_attempts):
        response = requests.get(url, headers=HEADERS)
        # Raise explicitly: an assert would be stripped when Python runs with -O.
        if response.status_code != 200:
            raise RuntimeError(
                f"Job status request failed: HTTP {response.status_code}: {response.text}"
            )
        status = response.json()["status"]
        if status == "completed":
            return True
        if status == "error":
            return False
        time.sleep(poll_interval)
    return False

def cls_job_outcome(cls_job_id, max_attempts=20, poll_interval=2):
    """Poll a classification job and report whether CLASS_ID was assigned.

    Args:
        cls_job_id: ID of the classification job to poll.
        max_attempts: Maximum number of status requests to make.
        poll_interval: Seconds to wait between status requests.

    Returns:
        Dict with "done" (True only if the job reported "completed") and
        "assigned_desired_class" (True only if CLASS_ID is among the
        "assignedClassIds" of a completed job).

    Raises:
        RuntimeError: If the API answers with any status other than 200.
    """
    url = f"{APP_URL}/classify/{cls_job_id}"
    for _ in range(max_attempts):
        response = requests.get(url, headers=HEADERS)
        # Raise explicitly: an assert would be stripped when Python runs with -O.
        if response.status_code != 200:
            raise RuntimeError(
                f"Classification status request failed: HTTP {response.status_code}: {response.text}"
            )
        res_json = response.json()
        status = res_json["status"]
        if status == "completed":
            return {
                "done": True,
                "assigned_desired_class": CLASS_ID in res_json["assignedClassIds"],
            }
        if status == "error":
            # A failed job will not complete; stop polling, as is_job_done does.
            break
        time.sleep(poll_interval)
    return {"done": False, "assigned_desired_class": False}

def get_std(std_id):
    """Fetch a standardization and return its "data" field.

    Args:
        std_id: ID of the standardization to retrieve.

    Returns:
        The "data" value from the JSON response.

    Raises:
        RuntimeError: If the API answers with any status other than 200.
    """
    url = f"{APP_URL}/standardization/{std_id}"
    response = requests.get(url, headers=HEADERS)
    # Raise explicitly: an assert would be stripped when Python runs with -O.
    if response.status_code != 200:
        raise RuntimeError(
            f"Standardization request failed: HTTP {response.status_code}: {response.text}"
        )
    return response.json()["data"]

def main():
    """Run the full flow: create a workflow, upload, poll, print the result.

    Prints the IDs returned by the API, the outcome of each polled job and,
    when every job completed and the document was assigned CLASS_ID, the
    standardization data.
    """
    workflow_id = post_workflow()
    print(f"Workflow ID: {workflow_id}")

    # Read the document inside a with-block so the file handle is always closed.
    with open(DOC_PATH, "rb") as doc_file:
        encoded_contents = base64.b64encode(doc_file.read()).decode()
    file_dict = {
        "contents": encoded_contents,
        "filename": DOC_PATH.split("/")[-1],
    }
    response = post_doc(file_dict=file_dict, workflow_id=workflow_id)
    print(f"Upload Job ID: {response['upload_job_id']}")
    print(f"Classification Job ID: {response['cls_job_id']}")
    print(f"Standardization ID: {response['std_id']}")
    print(f"Standardization Job ID: {response['std_job_id']}")

    # Poll the three jobs one after another; each call blocks until its job
    # finishes or the polling loop runs out of attempts.
    upload_done = is_job_done(response["upload_job_id"])
    cls_result = cls_job_outcome(response["cls_job_id"])
    std_done = is_job_done(response["std_job_id"])

    print(f"Upload completed: {upload_done}")
    print(f"Classification completed: {cls_result['done']}")
    print(f"Assigned desired class: {cls_result['assigned_desired_class']}")
    print(f"Standardization completed: {std_done}")

    if upload_done and cls_result["done"] and cls_result["assigned_desired_class"] and std_done:
        std_result = get_std(response["std_id"])
        print("Standardization Result:")
        print(std_result)
    else:
        print("Document was not classified as the desired class or jobs did not complete successfully")

# Run the example only when this file is executed as a script.
if __name__ == '__main__':
    main()

Remember to replace "YOUR_API_KEY", "/path/to/your/doc.pdf", "YOUR_DATASET_NAME", "YOUR_CLASS_ID", and "YOUR_SCHEMA_ID" with your actual values.

This example demonstrates how to use DocuPanda's workflow feature to classify and standardize documents. It covers creating a workflow, uploading a document, checking job status, and retrieving classification and standardization results. You can customize this process further by modifying the workflow definition or adding error handling as needed for your specific use case.