Skip to content

Core API

The core API provides the main functions for processing PMC articles.

Primary Processing Function

process_single_pmc

pmcgrab.application.processing.process_single_pmc

process_single_pmc(
    pmc_id: str | int,
    *,
    download: bool = False,
    timeout: int = NCBI_TIMEOUT,
    metadata_only: bool = False,
    schema_version: int | None = None,
    output_style: str | None = None,
) -> ArticleOutput | None

Download and parse a single PMC article into normalized dictionary format.

Application-layer function that handles the complete processing pipeline for a single PMC article: fetching XML, parsing content, extracting structured data, and normalizing for JSON serialization. Includes thread-safe timeout protection and robust error handling.

Parameters:

Name Type Description Default

pmc_id

str | int

PMC ID as int or string, with or without the PMC prefix.

required

download

bool

If True, cache raw XML locally in data/ directory for reuse.

False

timeout

int

Maximum seconds to wait for network/parsing (default: 60).

NCBI_TIMEOUT

metadata_only

bool

If True, allow metadata-only output without body sections.

False

schema_version

int | None

Full-output schema version. Passing a schema version without output_style selects the full output for compatibility.

None

output_style

str | None

"paper" for clean paper JSON (default), or "full" for V2/V3/V4 metadata-rich output.

None

Returns:

Type Description
ArticleOutput | None

Normalized article dictionary. The default clean paper output includes identifiers, paper (title, abstract, body), and assets (images and tables). Pass output_style="full" for the metadata-rich V4/V3/V2 contracts. Returns None if processing fails or the article has no usable content.

Examples:

>>> article_data = process_single_pmc("7181753")
>>> if article_data:
...     print(f"Title: {article_data['paper']['title']}")
...     sections = article_data["paper"]["body"]
...     print(f"Sections: {[section['title'] for section in sections]}")
Source code in src/pmcgrab/application/processing.py
def process_single_pmc(
    pmc_id: str | int,
    *,
    download: bool = False,
    timeout: int = _TIMEOUT_SECONDS,
    metadata_only: bool = False,
    schema_version: int | None = None,
    output_style: str | None = None,
) -> ArticleOutput | None:
    """Fetch one PMC article and convert it into a normalized dictionary.

    Runs the whole single-article pipeline: the ID is normalized, the paper
    is built via ``build_paper_from_pmc`` under a timeout guard, and the
    result is handed to ``_extract_paper_dict`` for normalization.

    Output options are validated *before* the error-handling guard, so
    whatever ``_validate_output_options`` raises propagates to the caller.
    Every other failure is logged and reported as ``None``.

    Args:
        pmc_id: PMC ID as int or string, with or without the ``PMC`` prefix.
        download: If True, cache raw XML locally in data/ directory for reuse.
        timeout: Maximum seconds to wait for network/parsing. Defaults to
            ``_TIMEOUT_SECONDS``.
        metadata_only: If True, allow metadata-only output without body
            sections.
        schema_version: Full-output schema version. Passing a schema version
            without ``output_style`` selects the full output for
            compatibility.
        output_style: ``"paper"`` for clean paper JSON (default), or
            ``"full"`` for V2/V3/V4 metadata-rich output.

    Returns:
        Normalized article dictionary. The default clean paper output
        includes ``identifiers``, ``paper`` (title, abstract, body), and
        ``assets`` (images and tables). Pass ``output_style="full"`` for the
        metadata-rich V4/V3/V2 contracts.
        ``None`` when the ID is not a positive integer, the build times out,
        no paper is returned, or any other exception occurs.

    Examples:
        >>> article_data = process_single_pmc("7181753")
        >>> if article_data:
        ...     print(f"Title: {article_data['paper']['title']}")
        ...     sections = article_data["paper"]["body"]
        ...     print(f"Sections: {[section['title'] for section in sections]}")
    """
    _validate_output_options(output_style, schema_version)
    try:
        # Integers are stringified directly; anything else goes through
        # normalize_id after being coerced to str.
        if isinstance(pmc_id, int):
            id_text = str(pmc_id)
        else:
            id_text = normalize_id(str(pmc_id))

        numeric_id = int(id_text)
        if numeric_id <= 0:
            _logger.warning("Invalid PMC ID (must be positive): %s", pmc_id)
            return None

        # Each call takes the next address from the shared email rotation.
        contact_email = next_email()
        try:
            parsed_paper = _run_with_timeout(
                build_paper_from_pmc,
                numeric_id,
                email=contact_email,
                download=download,
                validate=False,
                suppress_warnings=True,
                timeout=timeout,
            )
        except TimeoutException:
            _logger.warning("Timeout processing PMCID %s after %ds", pmc_id, timeout)
            return None

        if parsed_paper is not None:
            return _extract_paper_dict(
                parsed_paper,
                numeric_id,
                metadata_only=metadata_only,
                _source="ncbi_entrez",
                schema_version=schema_version,
                output_style=output_style,
            )

        _logger.info("No data returned for PMCID %s", pmc_id)
        return None

    except Exception:
        # Top-level boundary: log with traceback and signal failure via None.
        _logger.exception("Error processing PMCID %s", pmc_id)
        return None

options: show_source: true show_root_heading: true show_root_toc_entry: false show_object_full_path: false show_category_heading: false show_signature_annotations: true heading_level: 3

Email Management

next_email

pmcgrab.infrastructure.settings.next_email

next_email() -> str

Return the next email address in round-robin rotation.

Thread-safe via a lock-protected index counter.

Returns:

Name Type Description
str str

Next email address from the configured pool

Source code in src/pmcgrab/infrastructure/settings.py
def next_email() -> str:
    """Hand out the next address from ``EMAIL_POOL`` in round-robin order.

    The shared ``_email_index`` counter is read and advanced while holding
    ``_email_lock``, so concurrent callers each get their own slot.

    Returns:
        The pool entry at the current counter position (modulo pool size).
    """
    global _email_index
    with _email_lock:
        # Look up first, then advance, so the counter only moves on success.
        slot = _email_index % len(EMAIL_POOL)
        selected = EMAIL_POOL[slot]
        _email_index = _email_index + 1
        return selected

options: show_source: true show_root_heading: true show_root_toc_entry: false show_object_full_path: false show_category_heading: false show_signature_annotations: true heading_level: 3

Example Usage

from pmcgrab.application.processing import process_single_pmc
from pmcgrab.infrastructure.settings import next_email

# Optional: peek at the address the rotation hands out next.
# process_single_pmc() takes no email argument and calls next_email() itself,
# so this step is purely illustrative and is not required before processing.
email = next_email()
print(f"Next contact email in rotation: {email}")

# Process a single PMC article
data = process_single_pmc("7114487")

# process_single_pmc() returns None when processing fails, so guard the access.
if data:
    print(f"Title: {data['paper']['title']}")
    print(f"PMCID: {data['identifiers']['pmcid']}")
    print(f"Sections: {[section['title'] for section in data['paper']['body']]}")