Upload and classify a document, and then standardize for certain classes, all in a single POST request using workflows.
This guide demonstrates how to use DocuPanda's workflow feature to classify and standardize documents. We'll walk through the process of defining a workflow, uploading a document, and retrieving the classification and standardization results. This example is in Python, but the same concept applies to other programming languages.
Prerequisites
Before you begin, make sure you have:
- A DocuPanda API key
- Python 3 installed
Authentication
Every request to DocuPanda needs to include an API key. You can obtain your API key by signing up and going to your account settings page.
Step 1: Define a Workflow
First, we'll define a workflow that includes a classification and standardization step. This workflow will be applied to the documents we upload.
import requests
# Credentials and base URL used by every request in this guide.
API_KEY = "YOUR_API_KEY"
APP_URL = "https://app.docupanda.io"

# Class to look for, and the schema used to standardize documents of that class.
CLASS_ID = "YOUR_CLASS_ID"
SCHEMA_ID = "YOUR_SCHEMA_ID"

# Headers sent with each request; the API key travels in "X-API-Key".
HEADERS = {
    "accept": "application/json",
    "content-type": "application/json",
    "X-API-Key": API_KEY,
}
def post_workflow():
    """Create an on-submit-document workflow and return its ID.

    The workflow contains one classify-and-standardize step that maps
    ``CLASS_ID`` to ``SCHEMA_ID``.

    Returns:
        The ``workflowId`` value from the JSON response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
    """
    url = f"{APP_URL}/workflow/on-submit-document"
    payload = {
        "classifyStandardizeStep": {
            "classToSchema": {CLASS_ID: SCHEMA_ID},
            "stdReleaseVersion": 2,  # optional
        }
    }
    response = requests.post(url, json=payload, headers=HEADERS)
    # Raise explicitly rather than assert: asserts are removed under `python -O`,
    # which would let a failed request fall through to the JSON lookup below.
    response.raise_for_status()
    return response.json()["workflowId"]
# Create the workflow once and keep its ID; the upload step passes it along.
workflow_id = post_workflow()
print(f"Workflow ID: {workflow_id}")
Replace "YOUR_API_KEY" with your actual API key, "YOUR_CLASS_ID" with the ID of the class you want to classify documents into, and "YOUR_SCHEMA_ID" with the ID of the schema you want to use for standardization. By default, standardization is done with the latest version, but you can optionally set the version with the "stdReleaseVersion" field.
Step 2: Upload a Document
Next, we'll upload a document and apply the workflow we just created.
import base64
# Local path of the document that will be read and uploaded.
DOC_PATH = "/path/to/your/doc.pdf"
# Dataset name sent in the "dataset" field of the upload request.
DATASET_NAME = "YOUR_DATASET_NAME"
def post_doc(file_dict, workflow_id):
    """Upload a document and apply the given workflow to it.

    Args:
        file_dict: File payload placed under ``document.file`` in the request
            (this guide passes base64 ``contents`` plus a ``filename``).
        workflow_id: ID of the workflow to run on the submitted document.

    Returns:
        A dict with ``upload_job_id``, ``cls_job_id``, ``std_id`` and
        ``std_job_id``. The last two are the response entries for
        ``CLASS_ID``.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
    """
    url = f"{APP_URL}/document"
    payload = {
        "document": {
            "file": file_dict,
        },
        "dataset": DATASET_NAME,
        "workflowId": workflow_id,
    }
    response = requests.post(url, json=payload, headers=HEADERS)
    # Raise explicitly rather than assert: asserts are removed under `python -O`.
    response.raise_for_status()
    res_json = response.json()
    # All workflow-related IDs live under the same nested step object.
    step = res_json["workflowResponse"]["classifyStandardizeStep"]
    return {
        "upload_job_id": res_json["jobId"],
        "cls_job_id": step["classificationJobId"],
        "std_id": step["classToStandardizationIds"][CLASS_ID],
        "std_job_id": step["classToStandardizationJobIds"][CLASS_ID],
    }
# Read the document inside a context manager so the file handle is closed
# as soon as the bytes are in memory.
with open(DOC_PATH, 'rb') as doc_file:
    encoded_contents = base64.b64encode(doc_file.read()).decode()

file_dict = {
    "contents": encoded_contents,
    "filename": DOC_PATH.split("/")[-1],
}

# Upload the document with the workflow attached and report the returned IDs.
response = post_doc(file_dict=file_dict, workflow_id=workflow_id)
print(f"Upload Job ID: {response['upload_job_id']}")
print(f"Classification Job ID: {response['cls_job_id']}")
print(f"Standardization ID: {response['std_id']}")
print(f"Standardization Job ID: {response['std_job_id']}")
Replace "/path/to/your/doc.pdf" with the actual path to your document and "YOUR_DATASET_NAME" with the name of the dataset you want to assign to your document.
Step 3: Check Job Status
DocuPanda processes documents asynchronously. We can check the status of jobs using their IDs.
import time
def is_job_done(job_id, max_attempts=20, poll_interval=2):
    """Poll a job until it completes, fails, or the attempts run out.

    Args:
        job_id: ID of the job to poll.
        max_attempts: Maximum number of status requests to make.
        poll_interval: Seconds to sleep between status requests.

    Returns:
        True if the job reported "completed"; False if it reported "error"
        or did not complete within ``max_attempts`` requests.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
    """
    url = f"{APP_URL}/job/{job_id}"
    for _ in range(max_attempts):
        response = requests.get(url, headers=HEADERS)
        # Raise explicitly rather than assert: asserts are removed under `python -O`.
        response.raise_for_status()
        status = response.json()["status"]
        if status == "completed":
            return True
        if status == "error":
            return False
        time.sleep(poll_interval)
    return False
def cls_job_outcome(cls_job_id, max_attempts=20, poll_interval=2):
    """Poll a classification job and report whether ``CLASS_ID`` was assigned.

    Args:
        cls_job_id: ID of the classification job to poll.
        max_attempts: Maximum number of status requests to make.
        poll_interval: Seconds to sleep between status requests.

    Returns:
        A dict with ``done`` (True only when the job reported "completed")
        and ``assigned_desired_class`` (True only when ``CLASS_ID`` is among
        the response's ``assignedClassIds``).

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
    """
    url = f"{APP_URL}/classify/{cls_job_id}"
    for _ in range(max_attempts):
        response = requests.get(url, headers=HEADERS)
        # Raise explicitly rather than assert: asserts are removed under `python -O`.
        response.raise_for_status()
        res_json = response.json()
        status = res_json["status"]
        if status == "completed":
            return {
                "done": True,
                "assigned_desired_class": CLASS_ID in res_json["assignedClassIds"],
            }
        if status == "error":
            # Stop early like is_job_done does instead of polling a failed job.
            # NOTE(review): assumes this endpoint reports failures as "error",
            # the same value is_job_done checks for; confirm against the API.
            break
        time.sleep(poll_interval)
    return {"done": False, "assigned_desired_class": False}
# Poll the three jobs one after another; each call returns once its job
# finishes, reports an error (upload/standardization), or polling gives up.
upload_done = is_job_done(response["upload_job_id"])
cls_result = cls_job_outcome(response["cls_job_id"])
std_done = is_job_done(response["std_job_id"])
# Report the outcome of each stage.
print(f"Upload completed: {upload_done}")
print(f"Classification completed: {cls_result['done']}")
print(f"Assigned desired class: {cls_result['assigned_desired_class']}")
print(f"Standardization completed: {std_done}")
Step 4: Retrieve Standardization Results
Once the jobs are complete and the document is classified into the desired class, we can retrieve the standardization results.
def get_std(std_id):
    """Fetch a standardization and return its ``data`` field.

    Args:
        std_id: ID of the standardization to retrieve.

    Returns:
        The value stored under ``data`` in the JSON response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
    """
    url = f"{APP_URL}/standardization/{std_id}"
    response = requests.get(url, headers=HEADERS)
    # Raise explicitly rather than assert: asserts are removed under `python -O`.
    response.raise_for_status()
    return response.json()["data"]
# The result is only fetched when every job finished and the document was
# assigned the desired class.
pipeline_ok = (
    upload_done
    and cls_result["done"]
    and cls_result["assigned_desired_class"]
    and std_done
)
if not pipeline_ok:
    print("Document was not classified as the desired class or jobs did not complete successfully")
else:
    std_result = get_std(response["std_id"])
    print("Standardization Result:")
    print(std_result)
Complete Example
Here's a complete example that puts all these steps together:
import time
import base64
import requests
# Credentials and base URL used by every request.
API_KEY = "YOUR_API_KEY"
APP_URL = "https://app.docupanda.io"

# Document to upload and the dataset name sent with it.
DOC_PATH = "/path/to/your/doc.pdf"
DATASET_NAME = "YOUR_DATASET_NAME"

# Class to look for, and the schema used to standardize documents of that class.
CLASS_ID = "YOUR_CLASS_ID"
SCHEMA_ID = "YOUR_SCHEMA_ID"

# Headers sent with each request; the API key travels in "X-API-Key".
HEADERS = {
    "accept": "application/json",
    "content-type": "application/json",
    "X-API-Key": API_KEY,
}
def post_workflow():
    """Create an on-submit-document workflow and return its ID.

    The workflow contains one classify-and-standardize step that maps
    ``CLASS_ID`` to ``SCHEMA_ID``.

    Returns:
        The ``workflowId`` value from the JSON response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
    """
    url = f"{APP_URL}/workflow/on-submit-document"
    payload = {
        "classifyStandardizeStep": {
            "classToSchema": {CLASS_ID: SCHEMA_ID},
        }
    }
    response = requests.post(url, json=payload, headers=HEADERS)
    # Raise explicitly rather than assert: asserts are removed under `python -O`,
    # which would let a failed request fall through to the JSON lookup below.
    response.raise_for_status()
    return response.json()["workflowId"]
def post_doc(file_dict, workflow_id):
    """Upload a document and apply the given workflow to it.

    Args:
        file_dict: File payload placed under ``document.file`` in the request
            (``main`` passes base64 ``contents`` plus a ``filename``).
        workflow_id: ID of the workflow to run on the submitted document.

    Returns:
        A dict with ``upload_job_id``, ``cls_job_id``, ``std_id`` and
        ``std_job_id``. The last two are the response entries for
        ``CLASS_ID``.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
    """
    url = f"{APP_URL}/document"
    payload = {
        "document": {"file": file_dict},
        "dataset": DATASET_NAME,
        "workflowId": workflow_id,
    }
    response = requests.post(url, json=payload, headers=HEADERS)
    # Raise explicitly rather than assert: asserts are removed under `python -O`.
    response.raise_for_status()
    res_json = response.json()
    # All workflow-related IDs live under the same nested step object.
    step = res_json["workflowResponse"]["classifyStandardizeStep"]
    return {
        "upload_job_id": res_json["jobId"],
        "cls_job_id": step["classificationJobId"],
        "std_id": step["classToStandardizationIds"][CLASS_ID],
        "std_job_id": step["classToStandardizationJobIds"][CLASS_ID],
    }
def is_job_done(job_id, max_attempts=20, poll_interval=2):
    """Poll a job until it completes, fails, or the attempts run out.

    Args:
        job_id: ID of the job to poll.
        max_attempts: Maximum number of status requests to make.
        poll_interval: Seconds to sleep between status requests.

    Returns:
        True if the job reported "completed"; False if it reported "error"
        or did not complete within ``max_attempts`` requests.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
    """
    url = f"{APP_URL}/job/{job_id}"
    for _ in range(max_attempts):
        response = requests.get(url, headers=HEADERS)
        # Raise explicitly rather than assert: asserts are removed under `python -O`.
        response.raise_for_status()
        status = response.json()["status"]
        if status == "completed":
            return True
        if status == "error":
            return False
        time.sleep(poll_interval)
    return False
def cls_job_outcome(cls_job_id, max_attempts=20, poll_interval=2):
    """Poll a classification job and report whether ``CLASS_ID`` was assigned.

    Args:
        cls_job_id: ID of the classification job to poll.
        max_attempts: Maximum number of status requests to make.
        poll_interval: Seconds to sleep between status requests.

    Returns:
        A dict with ``done`` (True only when the job reported "completed")
        and ``assigned_desired_class`` (True only when ``CLASS_ID`` is among
        the response's ``assignedClassIds``).

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
    """
    url = f"{APP_URL}/classify/{cls_job_id}"
    for _ in range(max_attempts):
        response = requests.get(url, headers=HEADERS)
        # Raise explicitly rather than assert: asserts are removed under `python -O`.
        response.raise_for_status()
        res_json = response.json()
        status = res_json["status"]
        if status == "completed":
            return {
                "done": True,
                "assigned_desired_class": CLASS_ID in res_json["assignedClassIds"],
            }
        if status == "error":
            # Stop early like is_job_done does instead of polling a failed job.
            # NOTE(review): assumes this endpoint reports failures as "error",
            # the same value is_job_done checks for; confirm against the API.
            break
        time.sleep(poll_interval)
    return {"done": False, "assigned_desired_class": False}
def get_std(std_id):
    """Fetch a standardization and return its ``data`` field.

    Args:
        std_id: ID of the standardization to retrieve.

    Returns:
        The value stored under ``data`` in the JSON response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
    """
    url = f"{APP_URL}/standardization/{std_id}"
    response = requests.get(url, headers=HEADERS)
    # Raise explicitly rather than assert: asserts are removed under `python -O`.
    response.raise_for_status()
    return response.json()["data"]
def main():
    """Run the whole flow: create workflow, upload, poll jobs, print result.

    Creates the workflow, uploads ``DOC_PATH`` with it, polls the upload,
    classification and standardization jobs in turn, and prints the
    standardization result only if every job completed and the document was
    assigned ``CLASS_ID``.
    """
    workflow_id = post_workflow()
    print(f"Workflow ID: {workflow_id}")

    # Read the document inside a context manager so the file handle is closed
    # as soon as the bytes are in memory.
    with open(DOC_PATH, 'rb') as doc_file:
        encoded_contents = base64.b64encode(doc_file.read()).decode()
    file_dict = {
        "contents": encoded_contents,
        "filename": DOC_PATH.split("/")[-1],
    }

    response = post_doc(file_dict=file_dict, workflow_id=workflow_id)
    print(f"Upload Job ID: {response['upload_job_id']}")
    print(f"Classification Job ID: {response['cls_job_id']}")
    print(f"Standardization ID: {response['std_id']}")
    print(f"Standardization Job ID: {response['std_job_id']}")

    # Poll the jobs one after another; each call returns once its job
    # finishes or polling gives up.
    upload_done = is_job_done(response["upload_job_id"])
    cls_result = cls_job_outcome(response["cls_job_id"])
    std_done = is_job_done(response["std_job_id"])
    print(f"Upload completed: {upload_done}")
    print(f"Classification completed: {cls_result['done']}")
    print(f"Assigned desired class: {cls_result['assigned_desired_class']}")
    print(f"Standardization completed: {std_done}")

    if upload_done and cls_result["done"] and cls_result["assigned_desired_class"] and std_done:
        std_result = get_std(response["std_id"])
        print("Standardization Result:")
        print(std_result)
    else:
        print("Document was not classified as the desired class or jobs did not complete successfully")


# Run the example only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Remember to replace "YOUR_API_KEY", "/path/to/your/doc.pdf", "YOUR_DATASET_NAME", "YOUR_CLASS_ID", and "YOUR_SCHEMA_ID" with your actual values.
This example demonstrates how to use DocuPanda's workflow feature to classify and standardize documents. It covers creating a workflow, uploading a document, checking job status, and retrieving classification and standardization results. You can customize this process further by modifying the workflow definition or adding error handling as needed for your specific use case.