Workflow: Upload and Standardize

This guide demonstrates how to use DocuPanda's workflow feature to standardize documents. We'll walk through the process of defining a workflow, uploading a document, and retrieving the standardization results. This example is in Python, but the same concepts apply to other programming languages.

Prerequisites

Before you begin, make sure you have:

A DocuPanda API key
Python 3 installed

Authentication

Every request to DocuPanda needs to include an API key. You can obtain your API key by signing up and going to your account settings page.

Step 1: Define a Workflow

First, we'll define a workflow that includes a standardization step. This workflow will be applied to the documents we upload.

import requests

# Credentials and base URL used by every request in this guide.
API_KEY = "YOUR_API_KEY"
APP_URL = "https://app.docupanda.io"

# Headers sent with each call: JSON request/response bodies, plus the API key
# in the X-API-Key header for authentication.
HEADERS = {
    "accept": "application/json",
    "content-type": "application/json",
    "X-API-Key": API_KEY,
}

def post_workflow():
    """Create a workflow that standardizes every submitted document.

    Posts a workflow definition containing a single standardization step to
    the ``/workflow/on-submit-document`` endpoint.

    Returns:
        The ``workflowId`` value from the JSON response.

    Raises:
        AssertionError: If the response status code is not 200.
    """
    url = f"{APP_URL}/workflow/on-submit-document"
    payload = {
        "standardizeStep": {
            "schemaIds": ["YOUR_SCHEMA_ID"],
            # Optional: sets the standardization version; omit this key to
            # use the latest version.
            "stdReleaseVersion": 2,
        }
    }
    response = requests.post(url, json=payload, headers=HEADERS)
    assert response.status_code == 200
    return response.json()["workflowId"]


# Keep the returned ID: it is passed along with each document upload.
workflow_id = post_workflow()
print(f"Workflow ID: {workflow_id}")

Replace "YOUR_API_KEY" with your actual API key and "YOUR_SCHEMA_ID" with the ID of the schema you want to use for standardization. By default, standardization is done with the latest version; to use a specific version instead, set the optional "stdReleaseVersion" field in the standardization step.

Step 2: Upload a Document

Next, we'll upload a document and apply the workflow we just created.

import base64

# Local path of the document to upload.
DOC_PATH = "/path/to/your/doc.pdf"
# Dataset name sent with the upload request.
DATASET_NAME = "YOUR_DATASET_NAME"

def post_doc(file_dict, workflow_id):
    """Upload a document and apply a workflow to it.

    Args:
        file_dict: File description placed under ``document.file`` in the
            request body (this guide passes base64 ``contents`` and a
            ``filename``).
        workflow_id: ID of the workflow to run on the uploaded document.

    Returns:
        A dict with ``upload_job_id`` (the upload job), ``std_id`` (the first
        standardization ID) and ``std_job_id`` (the first standardization
        job ID) taken from the response.

    Raises:
        AssertionError: If the response status code is not 200.
    """
    body = {
        "document": {"file": file_dict},
        "dataset": DATASET_NAME,
        "workflowId": workflow_id,
    }
    reply = requests.post(f"{APP_URL}/document", json=body, headers=HEADERS)
    assert reply.status_code == 200

    parsed = reply.json()
    upload_job_id = parsed["jobId"]
    # The workflow's standardization step reports its IDs as lists; this
    # guide uses a single schema, so only the first entry is kept.
    std_step = parsed["workflowResponse"]["standardizeStep"]
    return {
        "upload_job_id": upload_job_id,
        "std_id": std_step["standardizationIds"][0],
        "std_job_id": std_step["standardizationJobIds"][0],
    }

# Read the document inside a context manager so the file handle is closed
# as soon as the bytes are loaded, then base64-encode them for the JSON body.
with open(DOC_PATH, "rb") as doc_file:
    encoded_contents = base64.b64encode(doc_file.read()).decode()

file_dict = {
    "contents": encoded_contents,
    "filename": DOC_PATH.split("/")[-1],
}
response = post_doc(file_dict=file_dict, workflow_id=workflow_id)
print(f"Upload Job ID: {response['upload_job_id']}")
print(f"Standardization ID: {response['std_id']}")
print(f"Standardization Job ID: {response['std_job_id']}")

Replace "/path/to/your/doc.pdf" with the actual path to your document and "YOUR_DATASET_NAME" with the name of the dataset you want to assign your document to.

Step 3: Check Job Status

DocuPanda processes documents asynchronously. We can check the status of a job using its ID. To be thorough, we can check the status of each interim step; alternatively, we can skip that and poll directly for the completed standardization. Note that instead of polling for job status, you can use Webhooks to react as soon as results become available.

import time
import requests


def is_job_done(job_id):
    """Poll a job until it completes, fails, or the wait budget runs out.

    The job status is polled with exponential backoff (2s, 4s, 8s, ...).
    The total time slept never exceeds 500 seconds; once that budget is
    spent, one last status check is made before giving up.

    Args:
        job_id: ID of the job to poll.

    Returns:
        True if the job status became "completed"; False if it became
        "error" or the job did not complete within the wait budget.

    Raises:
        AssertionError: If a status request does not return HTTP 200.
    """
    url = f"{APP_URL}/job/{job_id}"
    max_cumulative_delay = 500  # maximum cumulative wait time in seconds
    delay = 2  # initial delay time in seconds
    total_wait_time = 0  # total time waited

    while True:
        response = requests.get(url, headers=HEADERS)
        assert response.status_code == 200
        status = response.json()["status"]

        if status == "completed":
            return True
        if status == "error":
            return False

        # Give up once the budget is spent. Without this check the clamped
        # delay would drop to 0 and the loop would re-poll the API
        # back-to-back with no pause between requests.
        remaining = max_cumulative_delay - total_wait_time
        if remaining <= 0:
            return False  # job didn't complete in allowed cumulative time

        # Never sleep past the end of the budget.
        wait = min(delay, remaining)
        time.sleep(wait)
        total_wait_time += wait

        # Exponentially increase the delay for the next poll.
        delay *= 2

# Wait for the upload job first, then for the standardization job.
upload_done = is_job_done(job_id=response["upload_job_id"])
std_done = is_job_done(job_id=response["std_job_id"])

# Each flag is True only if the corresponding job reached "completed".
print(f"Upload completed: {upload_done}")
print(f"Standardization completed: {std_done}")

Step 4: Retrieve Standardization Results

Once the jobs are complete, we can retrieve the standardization results. Alternatively, you can skip the job-status checks and poll the standardization endpoint directly; if the standardization does not exist yet, the request returns a 404 status code.

def get_std(std_id):
    """Fetch a standardization and return its ``data`` field.

    Args:
        std_id: ID of the standardization to retrieve.

    Returns:
        The value stored under ``data`` in the JSON response.

    Raises:
        AssertionError: If the response status code is not 200.
    """
    reply = requests.get(f"{APP_URL}/standardization/{std_id}", headers=HEADERS)
    assert reply.status_code == 200
    body = reply.json()
    return body["data"]

# Only fetch the result when both jobs finished successfully; otherwise the
# standardization may not exist and the request would fail.
if upload_done and std_done:
    std_result = get_std(response["std_id"])
    print("Standardization Result:")
    print(std_result)
else:
    print("Jobs did not complete successfully")

Complete Example

Here's a complete example that puts all these steps together:

import time
import base64
import requests

# Values to replace with your own before running the example.
API_KEY = "YOUR_API_KEY"
APP_URL = "https://app.docupanda.io"
DOC_PATH = "/path/to/your/doc.pdf"
DATASET_NAME = "YOUR_DATASET_NAME"
SCHEMA_ID = "YOUR_SCHEMA_ID"

# Headers sent with each call: JSON request/response bodies, plus the API key
# in the X-API-Key header for authentication.
HEADERS = {
    "accept": "application/json",
    "content-type": "application/json",
    "X-API-Key": API_KEY,
}

def post_workflow():
    """Create a workflow whose only step standardizes with ``SCHEMA_ID``.

    Returns:
        The ``workflowId`` value from the JSON response.

    Raises:
        AssertionError: If the response status code is not 200.
    """
    endpoint = f"{APP_URL}/workflow/on-submit-document"
    standardize_step = {"schemaIds": [SCHEMA_ID]}
    reply = requests.post(
        endpoint,
        json={"standardizeStep": standardize_step},
        headers=HEADERS,
    )
    assert reply.status_code == 200
    return reply.json()["workflowId"]

def post_doc(file_dict, workflow_id):
    """Upload a document and apply a workflow to it.

    Args:
        file_dict: File description placed under ``document.file`` in the
            request body (this example passes base64 ``contents`` and a
            ``filename``).
        workflow_id: ID of the workflow to run on the uploaded document.

    Returns:
        A dict with ``upload_job_id``, ``std_id`` and ``std_job_id`` taken
        from the response; the latter two are the first entries of the
        standardization step's ID lists.

    Raises:
        AssertionError: If the response status code is not 200.
    """
    request_body = {
        "document": {"file": file_dict},
        "dataset": DATASET_NAME,
        "workflowId": workflow_id,
    }
    reply = requests.post(
        f"{APP_URL}/document", json=request_body, headers=HEADERS
    )
    assert reply.status_code == 200

    parsed = reply.json()
    upload_job_id = parsed["jobId"]
    std_step = parsed["workflowResponse"]["standardizeStep"]
    return {
        "upload_job_id": upload_job_id,
        "std_id": std_step["standardizationIds"][0],
        "std_job_id": std_step["standardizationJobIds"][0],
    }


def is_job_done(job_id):
    """Poll a job until it completes, fails, or the wait budget runs out.

    The job status is polled with exponential backoff (2s, 4s, 8s, ...).
    The total time slept never exceeds 500 seconds; once that budget is
    spent, one last status check is made before giving up.

    Args:
        job_id: ID of the job to poll.

    Returns:
        True if the job status became "completed"; False if it became
        "error" or the job did not complete within the wait budget.

    Raises:
        AssertionError: If a status request does not return HTTP 200.
    """
    url = f"{APP_URL}/job/{job_id}"
    max_cumulative_delay = 500  # maximum cumulative wait time in seconds
    delay = 2  # initial delay time in seconds
    total_wait_time = 0  # total time waited

    while True:
        response = requests.get(url, headers=HEADERS)
        assert response.status_code == 200
        status = response.json()["status"]

        if status == "completed":
            return True
        if status == "error":
            return False

        # Give up once the budget is spent. Without this check the clamped
        # delay would drop to 0 and the loop would re-poll the API
        # back-to-back with no pause between requests.
        remaining = max_cumulative_delay - total_wait_time
        if remaining <= 0:
            return False  # job didn't complete in allowed cumulative time

        # Never sleep past the end of the budget.
        wait = min(delay, remaining)
        time.sleep(wait)
        total_wait_time += wait

        # Exponentially increase the delay for the next poll.
        delay *= 2


def get_std(std_id):
    """Fetch a standardization and return its ``data`` field.

    Args:
        std_id: ID of the standardization to retrieve.

    Returns:
        The value stored under ``data`` in the JSON response.

    Raises:
        AssertionError: If the response status code is not 200.
    """
    reply = requests.get(f"{APP_URL}/standardization/{std_id}", headers=HEADERS)
    assert reply.status_code == 200
    body = reply.json()
    return body["data"]

def main():
    """Run the full flow: create a workflow, upload, poll, fetch the result.

    Prints the IDs returned by each step. The standardization result is
    fetched and printed only if both the upload job and the standardization
    job completed; otherwise a failure message is printed.
    """
    workflow_id = post_workflow()  # only run this once, a workflow can be reused in subsequent runs
    print(f"Workflow ID: {workflow_id}")

    # Read the document inside a context manager so the file handle is closed
    # as soon as the bytes are loaded, then base64-encode them for the JSON body.
    with open(DOC_PATH, "rb") as doc_file:
        encoded_contents = base64.b64encode(doc_file.read()).decode()

    file_dict = {
        "contents": encoded_contents,
        "filename": DOC_PATH.split("/")[-1],
    }
    response = post_doc(file_dict=file_dict, workflow_id=workflow_id)
    print(f"Upload Job ID: {response['upload_job_id']}")
    print(f"Standardization ID: {response['std_id']}")
    print(f"Standardization Job ID: {response['std_job_id']}")

    upload_done = is_job_done(response["upload_job_id"])
    std_done = is_job_done(response["std_job_id"])
    print(f"Upload completed: {upload_done}")
    print(f"Standardization completed: {std_done}")

    if upload_done and std_done:
        std_result = get_std(response["std_id"])
        print("Standardization Result:")
        print(std_result)
    else:
        print("Jobs did not complete successfully")


if __name__ == '__main__':
    main()

Remember to replace "YOUR_API_KEY", "/path/to/your/doc.pdf", "YOUR_DATASET_NAME", and "YOUR_SCHEMA_ID" with your actual values.

The workflow creation step only needs to run once; you can then post many documents using the same workflow.

This example demonstrates how to use DocuPanda's workflow feature to standardize documents. It covers creating a workflow, uploading a document, checking job status, and retrieving standardization results. You can customize this process further by modifying the workflow definition or adding error handling as needed for your specific use case.