Upload and standardize a document in a single POST request using workflows.
This guide demonstrates how to use DocuPanda's workflow feature to standardize documents. We'll walk through the process of defining a workflow, uploading a document, and retrieving the standardization results. This example is in Python, but the same concept applies to other programming languages.
Prerequisites
Before you begin, make sure you have:
- A DocuPanda API key
- Python 3 installed
Authentication
Every request to DocuPanda needs to include an API key. You can obtain your API key by signing up and going to your account settings page.
Step 1: Define a Workflow
First, we'll define a workflow that includes a standardization step. This workflow will be applied to the documents we upload.
import requests
# Credentials and base URL shared by every request in this guide.
API_KEY = "YOUR_API_KEY"
APP_URL = "https://app.docupanda.io"

# Headers sent with each call: JSON request/response bodies plus the API key.
HEADERS = {
    "accept": "application/json",
    "content-type": "application/json",
    "X-API-Key": API_KEY,
}
def post_workflow():
    """Create a workflow containing a single standardization step.

    Posts the workflow definition to the ``/workflow/on-submit-document``
    endpoint.

    Returns:
        The ``workflowId`` value from the JSON response.

    Raises:
        requests.HTTPError: If the API responds with any status other than 200.
    """
    url = f"{APP_URL}/workflow/on-submit-document"
    payload = {
        "standardizeStep": {
            "schemaIds": ["YOUR_SCHEMA_ID"],
            "stdReleaseVersion": 2,  # optional
        }
    }
    response = requests.post(url, json=payload, headers=HEADERS)
    # Raise explicitly instead of using ``assert``: assertions are stripped
    # when Python runs with -O, which would let a failed request slip through.
    if response.status_code != 200:
        raise requests.HTTPError(
            f"Workflow creation failed with status {response.status_code}: {response.text}",
            response=response,
        )
    return response.json()["workflowId"]
# Create the workflow and keep its ID; the upload step passes it to post_doc().
workflow_id = post_workflow()
print(f"Workflow ID: {workflow_id}")
Replace "YOUR_API_KEY" with your actual API key and "YOUR_SCHEMA_ID" with the ID of the schema you want to use for standardization. By default, standardization is done with the latest version, but you can optionally set the version with "stdReleaseVersion".
Step 2: Upload a Document
Next, we'll upload a document and apply the workflow we just created.
import base64
# Local file to upload; its last "/"-separated component is sent as the filename.
DOC_PATH = "/path/to/your/doc.pdf"
# Value sent in the "dataset" field of the upload payload.
DATASET_NAME = "YOUR_DATASET_NAME"
def post_doc(file_dict, workflow_id):
    """Upload a document and run the given workflow on it in one request.

    Args:
        file_dict: Dict with the base64-encoded file under ``"contents"`` and
            its name under ``"filename"``.
        workflow_id: ID of the workflow to apply to the uploaded document.

    Returns:
        A dict with three keys: ``upload_job_id`` (the upload job),
        ``std_id`` (the first standardization ID) and ``std_job_id`` (the
        first standardization job ID) taken from the response.

    Raises:
        requests.HTTPError: If the API responds with any status other than 200.
    """
    url = f"{APP_URL}/document"
    payload = {
        "document": {
            "file": file_dict,
        },
        "dataset": DATASET_NAME,
        "workflowId": workflow_id,
    }
    response = requests.post(url, json=payload, headers=HEADERS)
    # Raise explicitly instead of using ``assert``: assertions are stripped
    # when Python runs with -O, which would let a failed upload slip through.
    if response.status_code != 200:
        raise requests.HTTPError(
            f"Document upload failed with status {response.status_code}: {response.text}",
            response=response,
        )
    res_json = response.json()
    standardize_step = res_json["workflowResponse"]["standardizeStep"]
    return {
        "upload_job_id": res_json["jobId"],
        "std_id": standardize_step["standardizationIds"][0],
        "std_job_id": standardize_step["standardizationJobIds"][0],
    }
# Read the document inside a context manager so the file handle is closed
# as soon as the contents are encoded, rather than left open.
with open(DOC_PATH, "rb") as doc_file:
    encoded_contents = base64.b64encode(doc_file.read()).decode()

file_dict = {
    "contents": encoded_contents,
    "filename": DOC_PATH.split("/")[-1],
}

# Upload the document and trigger the workflow in the same request.
response = post_doc(file_dict=file_dict, workflow_id=workflow_id)
print(f"Upload Job ID: {response['upload_job_id']}")
print(f"Standardization ID: {response['std_id']}")
print(f"Standardization Job ID: {response['std_job_id']}")
Replace "/path/to/your/doc.pdf" with the actual path to your document and "YOUR_DATASET_NAME" with the name of the dataset you want to assign to your document.
Step 3: Check Job Status
DocuPanda processes documents asynchronously. We can check the status of a job using its ID. To be thorough, we can check the status of every interim step; alternatively, we can skip that and poll directly for the completed standardization. Note that instead of polling you could use Webhooks to react immediately as results become available.
import time
import requests
def is_job_done(job_id):
    """Poll a job's status with exponential backoff until it finishes.

    Args:
        job_id: ID of the job to poll at ``/job/{job_id}``.

    Returns:
        True if the job reports status ``"completed"``. False if it reports
        ``"error"``, if the cumulative wait budget (500 seconds) is spent, or
        if 100 polls pass without completion.

    Raises:
        requests.HTTPError: If a status request returns anything other than 200.
    """
    url = f"{APP_URL}/job/{job_id}"
    max_cumulative_delay = 500  # maximum cumulative wait time in seconds
    delay = 2  # initial delay time in seconds
    total_wait_time = 0  # total time waited

    for _ in range(100):
        response = requests.get(url, headers=HEADERS)
        # Raise explicitly instead of using ``assert``: assertions are
        # stripped when Python runs with -O.
        if response.status_code != 200:
            raise requests.HTTPError(
                f"Job status check failed with status {response.status_code}: {response.text}",
                response=response,
            )
        status = response.json()["status"]
        if status == "completed":
            return True
        if status == "error":
            return False

        # Stop once the wait budget is spent. Without this check the delay
        # collapses to zero and the remaining iterations would poll the API
        # back-to-back with no pause.
        remaining = max_cumulative_delay - total_wait_time
        if remaining <= 0:
            return False  # job didn't complete in allowed cumulative time

        # Never sleep past the budget; the last wait is shortened to fit.
        delay = min(delay, remaining)
        time.sleep(delay)
        total_wait_time += delay
        # Exponentially increase the delay for the next round
        delay *= 2

    return False
# Wait for the upload job first, then for the standardization job; each call
# blocks until that job completes, errors out, or the polling gives up.
upload_done = is_job_done(response["upload_job_id"])
std_done = is_job_done(response["std_job_id"])
print(f"Upload completed: {upload_done}")
print(f"Standardization completed: {std_done}")
Step 4: Retrieve Standardization Results
Once the jobs are complete, we can retrieve the standardization results. If you would rather not write the job-status checks, you can poll for the standardization directly; note that you will get a 404 status code until the standardization exists.
def get_std(std_id):
    """Fetch the result of a standardization.

    Args:
        std_id: ID of the standardization to retrieve.

    Returns:
        The ``"data"`` field of the JSON response.

    Raises:
        requests.HTTPError: If the API responds with any status other than 200.
    """
    url = f"{APP_URL}/standardization/{std_id}"
    response = requests.get(url, headers=HEADERS)
    # Raise explicitly instead of using ``assert``: assertions are stripped
    # when Python runs with -O, which would let a failed request slip through.
    if response.status_code != 200:
        raise requests.HTTPError(
            f"Standardization retrieval failed with status {response.status_code}: {response.text}",
            response=response,
        )
    return response.json()["data"]
# Only fetch the result when both jobs finished; otherwise the standardization
# may not exist yet and the request would fail.
if upload_done and std_done:
    std_result = get_std(response["std_id"])
    print("Standardization Result:")
    print(std_result)
else:
    print("Jobs did not complete successfully")
Complete Example
Here's a complete example that puts all these steps together:
import time
import base64
import requests
# Values to fill in before running the example.
API_KEY = "YOUR_API_KEY"
APP_URL = "https://app.docupanda.io"
DOC_PATH = "/path/to/your/doc.pdf"
DATASET_NAME = "YOUR_DATASET_NAME"
SCHEMA_ID = "YOUR_SCHEMA_ID"

# Headers sent with each call: JSON request/response bodies plus the API key.
HEADERS = {
    "accept": "application/json",
    "content-type": "application/json",
    "X-API-Key": API_KEY,
}
def post_workflow():
    """Create a workflow that standardizes documents with ``SCHEMA_ID``.

    Posts the workflow definition to the ``/workflow/on-submit-document``
    endpoint.

    Returns:
        The ``workflowId`` value from the JSON response.

    Raises:
        requests.HTTPError: If the API responds with any status other than 200.
    """
    url = f"{APP_URL}/workflow/on-submit-document"
    payload = {"standardizeStep": {"schemaIds": [SCHEMA_ID]}}
    response = requests.post(url, json=payload, headers=HEADERS)
    # Raise explicitly instead of using ``assert``: assertions are stripped
    # when Python runs with -O, which would let a failed request slip through.
    if response.status_code != 200:
        raise requests.HTTPError(
            f"Workflow creation failed with status {response.status_code}: {response.text}",
            response=response,
        )
    return response.json()["workflowId"]
def post_doc(file_dict, workflow_id):
    """Upload a document and run the given workflow on it in one request.

    Args:
        file_dict: Dict with the base64-encoded file under ``"contents"`` and
            its name under ``"filename"``.
        workflow_id: ID of the workflow to apply to the uploaded document.

    Returns:
        A dict with three keys: ``upload_job_id`` (the upload job),
        ``std_id`` (the first standardization ID) and ``std_job_id`` (the
        first standardization job ID) taken from the response.

    Raises:
        requests.HTTPError: If the API responds with any status other than 200.
    """
    url = f"{APP_URL}/document"
    payload = {
        "document": {"file": file_dict},
        "dataset": DATASET_NAME,
        "workflowId": workflow_id,
    }
    response = requests.post(url, json=payload, headers=HEADERS)
    # Raise explicitly instead of using ``assert``: assertions are stripped
    # when Python runs with -O, which would let a failed upload slip through.
    if response.status_code != 200:
        raise requests.HTTPError(
            f"Document upload failed with status {response.status_code}: {response.text}",
            response=response,
        )
    res_json = response.json()
    standardize_step = res_json["workflowResponse"]["standardizeStep"]
    return {
        "upload_job_id": res_json["jobId"],
        "std_id": standardize_step["standardizationIds"][0],
        "std_job_id": standardize_step["standardizationJobIds"][0],
    }
def is_job_done(job_id):
    """Poll a job's status with exponential backoff until it finishes.

    Args:
        job_id: ID of the job to poll at ``/job/{job_id}``.

    Returns:
        True if the job reports status ``"completed"``. False if it reports
        ``"error"``, if the cumulative wait budget (500 seconds) is spent, or
        if 100 polls pass without completion.

    Raises:
        requests.HTTPError: If a status request returns anything other than 200.
    """
    url = f"{APP_URL}/job/{job_id}"
    max_cumulative_delay = 500  # maximum cumulative wait time in seconds
    delay = 2  # initial delay time in seconds
    total_wait_time = 0  # total time waited

    for _ in range(100):
        response = requests.get(url, headers=HEADERS)
        # Raise explicitly instead of using ``assert``: assertions are
        # stripped when Python runs with -O.
        if response.status_code != 200:
            raise requests.HTTPError(
                f"Job status check failed with status {response.status_code}: {response.text}",
                response=response,
            )
        status = response.json()["status"]
        if status == "completed":
            return True
        if status == "error":
            return False

        # Stop once the wait budget is spent. Without this check the delay
        # collapses to zero and the remaining iterations would poll the API
        # back-to-back with no pause.
        remaining = max_cumulative_delay - total_wait_time
        if remaining <= 0:
            return False  # job didn't complete in allowed cumulative time

        # Never sleep past the budget; the last wait is shortened to fit.
        delay = min(delay, remaining)
        time.sleep(delay)
        total_wait_time += delay
        # Exponentially increase the delay for the next round
        delay *= 2

    return False
def get_std(std_id):
    """Fetch the result of a standardization.

    Args:
        std_id: ID of the standardization to retrieve.

    Returns:
        The ``"data"`` field of the JSON response.

    Raises:
        requests.HTTPError: If the API responds with any status other than 200.
    """
    url = f"{APP_URL}/standardization/{std_id}"
    response = requests.get(url, headers=HEADERS)
    # Raise explicitly instead of using ``assert``: assertions are stripped
    # when Python runs with -O, which would let a failed request slip through.
    if response.status_code != 200:
        raise requests.HTTPError(
            f"Standardization retrieval failed with status {response.status_code}: {response.text}",
            response=response,
        )
    return response.json()["data"]
def main():
    """Run the whole flow: create a workflow, upload a document, wait, fetch.

    Prints the IDs returned by each step, then the standardization result if
    both the upload job and the standardization job completed; otherwise
    prints a failure message.
    """
    workflow_id = post_workflow()  # only run this once, a workflow can be reused in subsequent runs
    print(f"Workflow ID: {workflow_id}")

    # Read the document inside a context manager so the file handle is closed
    # as soon as the contents are encoded, rather than left open.
    with open(DOC_PATH, "rb") as doc_file:
        encoded_contents = base64.b64encode(doc_file.read()).decode()
    file_dict = {
        "contents": encoded_contents,
        "filename": DOC_PATH.split("/")[-1],
    }

    response = post_doc(file_dict=file_dict, workflow_id=workflow_id)
    print(f"Upload Job ID: {response['upload_job_id']}")
    print(f"Standardization ID: {response['std_id']}")
    print(f"Standardization Job ID: {response['std_job_id']}")

    # Poll the upload job first, then the standardization job.
    upload_done = is_job_done(response["upload_job_id"])
    std_done = is_job_done(response["std_job_id"])
    print(f"Upload completed: {upload_done}")
    print(f"Standardization completed: {std_done}")

    # Only fetch the result when both jobs reported completion.
    if upload_done and std_done:
        std_result = get_std(response["std_id"])
        print("Standardization Result:")
        print(std_result)
    else:
        print("Jobs did not complete successfully")


if __name__ == '__main__':
    main()
Remember to replace "YOUR_API_KEY", "/path/to/your/doc.pdf", "YOUR_DATASET_NAME", and "YOUR_SCHEMA_ID" with your actual values.
The workflow creation step only needs to run once; you can then post many documents that all run through the same workflow.
This example demonstrates how to use DocuPanda's workflow feature to standardize documents. It covers creating a workflow, uploading a document, checking job status, and retrieving standardization results. You can customize this process further by modifying the workflow definition or adding error handling as needed for your specific use case.