Upload

Upload a document to DocuPanda and retrieve the parsed results

This guide demonstrates how to use DocuPanda's document parsing to extract text and structural information from documents. We'll walk through uploading a document, monitoring the parsing job, and retrieving the parsed results. The example is in Python, but the same concepts apply in any language that can make HTTP requests.

Prerequisites

Before you begin, make sure you have:

  1. A DocuPanda API key
  2. Python 3 installed
  3. The requests library (pip install requests)

Authentication

Every request to DocuPanda needs to include an API key. You can obtain your API key by signing up and going to your account settings page.
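For example, the key goes into an X-API-Key header that you attach to every request (a minimal sketch; the key below is a placeholder, so substitute your own from the account settings page):

```python
API_KEY = "YOUR_API_KEY"  # placeholder: copy the real key from your account settings

# DocuPanda reads the key from the X-API-Key header on every request.
HEADERS = {
    "accept": "application/json",
    "content-type": "application/json",
    "X-API-Key": API_KEY,
}

# Pass headers=HEADERS to each requests.get / requests.post call, e.g.:
# requests.get("https://app.docupanda.io/document/<doc_id>", headers=HEADERS)
```

The same HEADERS dictionary is reused throughout the rest of this guide.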

Step 1: Upload a Document

First, we'll upload a document to DocuPanda for parsing. You can upload a document either by providing a file or a URL.

import base64
import requests

API_KEY = "YOUR_API_KEY"
APP_URL = "https://app.docupanda.io"
DOC_PATH = "/path/to/your/doc"
DATASET_NAME = "YOUR_DATASET_NAME"
HEADERS = {"accept": "application/json", "content-type": "application/json", "X-API-Key": API_KEY}

def post_doc():
    url = f"{APP_URL}/document"
    # Read and base64-encode the file, closing it promptly with a context manager
    with open(DOC_PATH, "rb") as f:
        contents = base64.b64encode(f.read()).decode()
    payload = {
        "document": {
            "file": {
                "contents": contents,
                "filename": "my_filename"  # optional
            },
            # Alternatively, you can use a URL instead of a file:
            # "url": "INSERT_URL_HERE",
        },
        "dataset": DATASET_NAME
    }
    response = requests.post(url, json=payload, headers=HEADERS)
    assert response.status_code == 200
    res_json = response.json()
    return {"job_id": res_json["jobId"], "doc_id": res_json["documentId"]}

response = post_doc()
print(f"Job ID: {response['job_id']}")
print(f"Document ID: {response['doc_id']}")

Replace "YOUR_API_KEY" with your actual API key, "/path/to/your/doc" with the path to your document, and "YOUR_DATASET_NAME" with a dataset name of your choice (datasets are optional and exist purely to group related documents together).
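If your document is already hosted somewhere, you can pass a URL instead of base64-encoded file contents, as hinted by the commented-out "url" field above. A minimal sketch of that variant (the URL and dataset name here are placeholders, and the helper names are our own):

```python
import requests

API_KEY = "YOUR_API_KEY"
APP_URL = "https://app.docupanda.io"
HEADERS = {"accept": "application/json", "content-type": "application/json", "X-API-Key": API_KEY}

def build_url_payload(doc_url, dataset=None):
    # The "url" field replaces the "file" field used for local uploads.
    payload = {"document": {"url": doc_url}}
    if dataset is not None:
        payload["dataset"] = dataset
    return payload

def post_doc_from_url(doc_url, dataset=None):
    # Same endpoint and response shape as the file-based upload above.
    response = requests.post(f"{APP_URL}/document",
                             json=build_url_payload(doc_url, dataset),
                             headers=HEADERS)
    assert response.status_code == 200
    res_json = response.json()
    return {"job_id": res_json["jobId"], "doc_id": res_json["documentId"]}
```

The returned job and document IDs feed into the status check and retrieval steps below, exactly as with a file upload.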

Step 2: Check Job Status

DocuPanda processes documents asynchronously, so we poll the job's status using its ID. The status becomes "completed" when parsing has finished, or "error" if it failed.

import time

def is_job_done(job_id):
    url = f"{APP_URL}/job/{job_id}"
    for _ in range(60):  # poll for up to ~3 minutes
        response = requests.get(url, headers=HEADERS)
        assert response.status_code == 200
        status = response.json()["status"]
        
        if status == "completed":
            return True
        elif status == "error":
            return False
            
        time.sleep(3)
    return False

success = is_job_done(response["job_id"])
print(f"Parsing completed: {success}")
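The fixed 3-second polling interval above is the simplest approach. If you expect a mix of small and large documents, you might prefer an exponential backoff schedule instead; this is our own suggestion, not something DocuPanda requires. A sketch of the wait schedule in isolation:

```python
def backoff_delays(initial=1.0, factor=2.0, cap=30.0, max_total=180.0):
    # Yield sleep durations that double up to a cap, stopping once the
    # cumulative wait would exceed max_total seconds.
    delay, total = initial, 0.0
    while total + delay <= max_total:
        yield delay
        total += delay
        delay = min(delay * factor, cap)

print(list(backoff_delays(max_total=15)))  # → [1.0, 2.0, 4.0, 8.0]
```

To use it, replace the fixed `time.sleep(3)` in the polling loop with `time.sleep(delay)` while iterating over `backoff_delays()` instead of `range(60)`.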

Step 3: Retrieve Parsing Results

Once the job is complete, we can retrieve the parsed document results. The results include the full text of the document, as well as more granular information: the text, bounding box, and type of each section, broken down by page.

def get_doc(doc_id):
    url = f"{APP_URL}/document/{doc_id}"
    response = requests.get(url, headers=HEADERS)
    assert response.status_code == 200
    return response.json()

if success:
    doc = get_doc(response["doc_id"])
    print(f"Full text:\n{doc['result']['text']}")
    
    # Access individual pages and sections
    for page in doc["result"]["pages"]:
        print(f"Page {page['pageNum']}")
        for section in page["sections"]:
            print(f"Section at bounding box {section['bbox']}:\n{section['text']}")

Complete Example

Here's a complete example that puts all these steps together:

import time
import base64
import requests

API_KEY = "YOUR_API_KEY"
APP_URL = "https://app.docupanda.io"
DOC_PATH = "/path/to/your/doc"
DATASET_NAME = "YOUR_DATASET_NAME"
HEADERS = {"accept": "application/json", "content-type": "application/json", "X-API-Key": API_KEY}

def post_doc():
    url = f"{APP_URL}/document"
    # Read and base64-encode the file, closing it promptly with a context manager
    with open(DOC_PATH, "rb") as f:
        contents = base64.b64encode(f.read()).decode()
    payload = {
        "document": {
            "file": {
                "contents": contents,
                "filename": DOC_PATH.split("/")[-1]
            },
        },
        "dataset": DATASET_NAME
    }
    response = requests.post(url, json=payload, headers=HEADERS)
    assert response.status_code == 200
    res_json = response.json()
    return {"job_id": res_json["jobId"], "doc_id": res_json["documentId"]}

def is_job_done(job_id):
    url = f"{APP_URL}/job/{job_id}"
    for _ in range(60):  # poll for up to ~3 minutes
        response = requests.get(url, headers=HEADERS)
        assert response.status_code == 200
        status = response.json()["status"]
        if status == "completed":
            return True
        elif status == "error":
            return False
        time.sleep(3)
    return False

def get_doc(doc_id):
    url = f"{APP_URL}/document/{doc_id}"
    response = requests.get(url, headers=HEADERS)
    assert response.status_code == 200
    return response.json()

def main():
    response = post_doc()
    print(f"Job ID: {response['job_id']}")
    print(f"Document ID: {response['doc_id']}")
    
    success = is_job_done(job_id=response["job_id"])
    print(f"Parsing completed: {success}")
    
    if success:
        doc = get_doc(doc_id=response["doc_id"])
        print("Document parsing completed successfully")
        print(f"Full text:\n{doc['result']['text']}")
        for page in doc["result"]["pages"]:
            print(f"Page {page['pageNum']}")
            for section in page["sections"]:
                print(f"Section at bounding box {section['bbox']}:\n{section['text']}")
    else:
        print(f"Parsing failed or timed out for {DOC_PATH}")

if __name__ == '__main__':
    main()

Remember to replace "YOUR_API_KEY", "/path/to/your/doc", and "YOUR_DATASET_NAME" with your actual values.

This example demonstrates how to use DocuPanda's document parsing feature to extract text and structural information from documents. The parsed results include both the full text of the document and detailed information about the location of text within pages and sections. You can use this structural information for more advanced document analysis or to maintain the original document layout in your applications.
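As one illustration of using that structural information, the section bounding boxes can order sections into reading order. This is a sketch under an assumption: we treat each bbox as [left, top, right, bottom], so verify the exact coordinate format against an actual DocuPanda response before relying on it. The helper name and the synthetic page below are our own.

```python
def sections_in_reading_order(page):
    # Sort a page's sections top-to-bottom, then left-to-right, by their
    # bounding boxes. Assumes each bbox is [left, top, right, bottom];
    # check a real response for the exact format DocuPanda uses.
    return sorted(page["sections"], key=lambda s: (s["bbox"][1], s["bbox"][0]))

# Synthetic page in the same shape as doc["result"]["pages"][i]:
page = {
    "pageNum": 1,
    "sections": [
        {"bbox": [0.1, 0.5, 0.9, 0.6], "text": "body"},
        {"bbox": [0.1, 0.1, 0.9, 0.2], "text": "title"},
    ],
}
ordered = [s["text"] for s in sections_in_reading_order(page)]
print(ordered)  # → ['title', 'body']
```

The same pattern extends to filtering sections by their type field or to grouping them into columns before sorting.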