Interface UnstructuredLoaderOptions

/**
 * Options accepted by the Unstructured loader.
 *
 * Per the Hierarchy section of this page, the members come from `SDKOptions`
 * and `Omit<PartitionParameters, "files" | "strategy">`, shown here flattened.
 * Every member is optional. Members without a comment have no description in
 * the generated documentation.
 */
interface UnstructuredLoaderOptions {
    apiKey?: string;
    /**
     * Strategy used to chunk the returned elements after partitioning. When
     * not specified, no chunking is performed and any other chunking
     * parameters are ignored. Supported strategies: 'basic', 'by_page',
     * 'by_similarity', or 'by_title'.
     */
    chunkingStrategy?: null | ChunkingStrategyOpen;
    client?: UnstructuredClient;
    /**
     * If chunking strategy is set, combine elements until a section reaches a
     * length of n chars. Default: 500
     */
    combineUnderNChars?: null | number;
    /** If true, return coordinates for each element extracted via OCR. Default: false */
    coordinates?: boolean;
    /**
     * Controls the messages the Unstructured SDK logs via console.info at
     * request time. Passing true logs them; with false, console.info is
     * overwritten so that it does not log.
     *
     * @default false
     */
    enableLogs?: boolean;
    /** The encoding method used to decode the text input. Default: utf-8 */
    encoding?: null | string;
    /**
     * The types of elements to extract, for use in extracting image blocks as
     * base64 encoded data stored in metadata fields.
     */
    extractImageBlockTypes?: string[];
    /** If file is gzipped, use this content type after unzipping. */
    gzUncompressedContentType?: null | string;
    /** The name of the inference model used when strategy is hi_res. */
    hiResModelName?: null | string;
    httpClient?: HTTPClient;
    /**
     * When a chunking strategy is specified, each returned chunk will include
     * the elements consolidated to form that chunk as
     * `.metadata.orig_elements`. Default: true.
     */
    includeOrigElements?: null | boolean;
    /**
     * If true, the output will include page breaks if the filetype supports
     * it. Default: false
     */
    includePageBreaks?: boolean;
    /**
     * The languages present in the document, for use in partitioning and/or
     * OCR. See the Tesseract documentation for a full list of languages.
     */
    languages?: string[];
    /**
     * If chunking strategy is set, cut off new sections after reaching a
     * length of n chars (hard max). Default: 500
     */
    maxCharacters?: null | number;
    /**
     * If chunking strategy is set, determines whether sections can span
     * multiple pages (the generated description reads "multiple sections";
     * presumably pages, given the property name). Default: true
     */
    multipageSections?: boolean;
    /**
     * If chunking strategy is set, cut off new sections after reaching a
     * length of n chars (soft max). Default: 1500
     */
    newAfterNChars?: null | number;
    /**
     * The languages present in the document, for use in partitioning and/or
     * OCR.
     *
     * @deprecated Marked "Deprecated!" in the generated documentation.
     */
    ocrLanguages?: string[];
    /**
     * The format of the response. Supported formats are application/json and
     * text/csv. Default: application/json.
     */
    outputFormat?: OutputFormatOpen;
    /**
     * Length of a string ('tail') to be drawn from each chunk and prefixed to
     * the next chunk as a context-preserving mechanism. By default, this only
     * applies to split-chunks where an oversized element is divided into
     * multiple chunks by text-splitting. Default: 0
     */
    overlap?: number;
    /**
     * When true, apply overlap between 'normal' chunks formed from whole
     * elements and not subject to text-splitting. Use with caution, as it
     * entails a certain level of 'pollution' of otherwise clean semantic chunk
     * boundaries. Default: false
     */
    overlapAll?: boolean;
    partitionViaApi?: boolean;
    /**
     * If false and strategy=hi_res, no Table Elements will be extracted from
     * pdf files regardless of skip_infer_table_types contents.
     *
     * @deprecated Use skip_infer_table_types to opt out of table extraction
     * for any file type.
     */
    pdfInferTableStructure?: boolean;
    postProcessors?: ((str: string) => string)[];
    /** Allows overriding the default retry config used by the SDK. */
    retryConfig?: RetryConfig;
    /** The security details required to authenticate the SDK. */
    security?: Security | (() => Promise<Security>);
    /** Allows overriding the default server used by the SDK. */
    server?: "free-api" | "development";
    /** Allows overriding the default server URL used by the SDK. */
    serverURL?: string;
    /**
     * A value between 0.0 and 1.0 describing the minimum similarity two
     * elements must have to be included in the same chunk. Similar elements
     * may still be separated to meet chunk-size criteria; this value only
     * guarantees that two elements with similarity below the threshold will
     * appear in separate chunks.
     */
    similarityThreshold?: null | number;
    /** The document types for which table extraction is skipped. Default: [] */
    skipInferTableTypes?: string[];
    /**
     * Maximum number of concurrent requests made when splitting a PDF.
     * Ignored on backend.
     */
    splitPdfConcurrencyLevel?: number;
    /** Whether the PDF file should be split at the client. Ignored on backend. */
    splitPdfPage?: boolean;
    /**
     * When a PDF is split into pages before being sent to the API, providing
     * this value allows the page number to be assigned correctly. Introduced
     * in 1.0.27.
     */
    startingPageNumber?: null | number;
    strategy?: UnstructuredLoaderStrategy;
    timeoutMs?: number;
    /**
     * When true, assign UUIDs to element IDs, which guarantees their
     * uniqueness (useful when using them as primary keys in a database).
     * Otherwise a SHA-256 of element text is used. Default: false
     */
    uniqueElementIds?: boolean;
    /**
     * If true, will retain the XML tags in the output. Otherwise it will
     * simply extract the text from within the tags. Only applies to XML
     * documents.
     */
    xmlKeepTags?: boolean;
}

Hierarchy

SDKOptions
Omit<PartitionParameters, "files" | "strategy">
- UnstructuredLoaderOptions

Properties

`Optional`apiKey

apiKey?: string

`Optional`chunkingStrategy

chunkingStrategy?: null | ChunkingStrategyOpen

Use one of the supported strategies to chunk the returned elements after partitioning. When 'chunking_strategy' is not specified, no chunking is performed and any other chunking parameters provided are ignored. Supported strategies: 'basic', 'by_page', 'by_similarity', or 'by_title'

`Optional`client

client?: UnstructuredClient

`Optional`combineUnderNChars

combineUnderNChars?: null | number

If chunking strategy is set, combine elements until a section reaches a length of n chars. Default: 500

`Optional`coordinates

coordinates?: boolean

If True, return coordinates for each element extracted via OCR. Default: False

`Optional`enableLogs

enableLogs?: boolean

The Unstructured SDK calls console.info to log messages at request time. Passing true will log these messages. The default of false will overwrite the console.info function so that it does not log.

Default

false

`Optional`encoding

encoding?: null | string

The encoding method used to decode the text input. Default: utf-8

`Optional`extractImageBlockTypes

extractImageBlockTypes?: string[]

The types of elements to extract, for use in extracting image blocks as base64 encoded data stored in metadata fields.

`Optional`gzUncompressedContentType

gzUncompressedContentType?: null | string

If file is gzipped, use this content type after unzipping.

`Optional`hiResModelName

hiResModelName?: null | string

The name of the inference model used when strategy is hi_res

`Optional`httpClient

httpClient?: HTTPClient

`Optional`includeOrigElements

includeOrigElements?: null | boolean

When a chunking strategy is specified, each returned chunk will include the elements consolidated to form that chunk as .metadata.orig_elements. Default: true.

`Optional`includePageBreaks

includePageBreaks?: boolean

If true, the output will include page breaks if the filetype supports it. Default: false

`Optional`languages

languages?: string[]

The languages present in the document, for use in partitioning and/or OCR. See the Tesseract documentation for a full list of languages.

`Optional`maxCharacters

maxCharacters?: null | number

If chunking strategy is set, cut off new sections after reaching a length of n chars (hard max). Default: 500

`Optional`multipageSections

multipageSections?: boolean

If chunking strategy is set, determines if sections can span multiple pages. Default: true

`Optional`newAfterNChars

newAfterNChars?: null | number

If chunking strategy is set, cut off new sections after reaching a length of n chars (soft max). Default: 1500

`Optional`ocrLanguages

ocrLanguages?: string[]

Deprecated! The languages present in the document, for use in partitioning and/or OCR

`Optional`outputFormat

outputFormat?: OutputFormatOpen

The format of the response. Supported formats are application/json and text/csv. Default: application/json.

`Optional`overlap

overlap?: number

Specifies the length of a string ('tail') to be drawn from each chunk and prefixed to the next chunk as a context-preserving mechanism. By default, this only applies to split-chunks where an oversized element is divided into multiple chunks by text-splitting. Default: 0

`Optional`overlapAll

overlapAll?: boolean

When True, apply overlap between 'normal' chunks formed from whole elements and not subject to text-splitting. Use this with caution as it entails a certain level of 'pollution' of otherwise clean semantic chunk boundaries. Default: False

`Optional`partitionViaApi

partitionViaApi?: boolean

`Optional`pdfInferTableStructure

pdfInferTableStructure?: boolean

Deprecated! Use skip_infer_table_types to opt out of table extraction for any file type. If False and strategy=hi_res, no Table Elements will be extracted from pdf files regardless of skip_infer_table_types contents.

`Optional`postProcessors

postProcessors?: ((str: string) => string)[]

`Optional`retryConfig

retryConfig?: RetryConfig

Allows overriding the default retry config used by the SDK

`Optional`security

security?: Security | (() => Promise<Security>)

The security details required to authenticate the SDK

`Optional`server

server?: "free-api" | "development"

Allows overriding the default server used by the SDK

`Optional`serverURL

serverURL?: string

Allows overriding the default server URL used by the SDK

`Optional`similarityThreshold

similarityThreshold?: null | number

A value between 0.0 and 1.0 describing the minimum similarity two elements must have to be included in the same chunk. Note that similar elements may be separated to meet chunk-size criteria; this value can only guarantee that two elements with similarity below the threshold will appear in separate chunks.

`Optional`skipInferTableTypes

skipInferTableTypes?: string[]

The document types for which you want to skip table extraction. Default: []

`Optional`splitPdfConcurrencyLevel

splitPdfConcurrencyLevel?: number

Maximum number of concurrent requests made when splitting a PDF. Ignored on backend.

`Optional`splitPdfPage

splitPdfPage?: boolean

Whether the PDF file should be split at the client. Ignored on backend.

`Optional`startingPageNumber

startingPageNumber?: null | number

When a PDF is split into pages before being sent to the API, providing this information will allow the page number to be assigned correctly. Introduced in 1.0.27.

`Optional`strategy

strategy?: UnstructuredLoaderStrategy

`Optional`timeoutMs

timeoutMs?: number

`Optional`uniqueElementIds

uniqueElementIds?: boolean

When True, assign UUIDs to element IDs, which guarantees their uniqueness (useful when using them as primary keys in a database). Otherwise a SHA-256 of element text is used. Default: False

`Optional`xmlKeepTags

xmlKeepTags?: boolean

If True, will retain the XML tags in the output. Otherwise it will simply extract the text from within the tags. Only applies to XML documents.

Interface UnstructuredLoaderOptions

Hierarchy

Index

Properties

Properties

OptionalapiKey

OptionalchunkingStrategy

Optionalclient

OptionalcombineUnderNChars

Optionalcoordinates

OptionalenableLogs

Default

Optionalencoding

OptionalextractImageBlockTypes

OptionalgzUncompressedContentType

OptionalhiResModelName

OptionalhttpClient

OptionalincludeOrigElements

OptionalincludePageBreaks

Optionallanguages

OptionalmaxCharacters

OptionalmultipageSections

OptionalnewAfterNChars

OptionalocrLanguages

OptionaloutputFormat

Optionaloverlap

OptionaloverlapAll

OptionalpartitionViaApi

OptionalpdfInferTableStructure

OptionalpostProcessors

OptionalretryConfig

Optionalsecurity

Optionalserver

OptionalserverURL

OptionalsimilarityThreshold

OptionalskipInferTableTypes

OptionalsplitPdfConcurrencyLevel

OptionalsplitPdfPage

OptionalstartingPageNumber

Optionalstrategy

OptionaltimeoutMs

OptionaluniqueElementIds

OptionalxmlKeepTags

Settings

On This Page

`Optional`apiKey

`Optional`chunkingStrategy

`Optional`client

`Optional`combineUnderNChars

`Optional`coordinates

`Optional`enableLogs

`Optional`encoding

`Optional`extractImageBlockTypes

`Optional`gzUncompressedContentType

`Optional`hiResModelName

`Optional`httpClient

`Optional`includeOrigElements

`Optional`includePageBreaks

`Optional`languages

`Optional`maxCharacters

`Optional`multipageSections

`Optional`newAfterNChars

`Optional`ocrLanguages

`Optional`outputFormat

`Optional`overlap

`Optional`overlapAll

`Optional`partitionViaApi

`Optional`pdfInferTableStructure

`Optional`postProcessors

`Optional`retryConfig

`Optional`security

`Optional`server

`Optional`serverURL

`Optional`similarityThreshold

`Optional`skipInferTableTypes

`Optional`splitPdfConcurrencyLevel

`Optional`splitPdfPage

`Optional`startingPageNumber

`Optional`strategy

`Optional`timeoutMs

`Optional`uniqueElementIds

`Optional`xmlKeepTags