Configuration¶
PMCGrab provides simple configuration options optimized for the process_single_pmc
workflow.
Email Management¶
PMCGrab automatically manages email rotation for NCBI API compliance:
from pmcgrab.infrastructure.settings import next_email
# Get the next email in rotation
email = next_email()
print(f"Using email: {email}")
The system automatically rotates through available email addresses to ensure proper rate limiting and compliance with NCBI guidelines.
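To see the rotation in action, call next_email() repeatedly; a minimal sketch (the size and contents of the address pool depend on how your PMCGrab installation is configured):

from pmcgrab.infrastructure.settings import next_email

# Successive calls cycle through the configured address pool
emails = [next_email() for _ in range(5)]
print(emails)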
Basic Usage Pattern¶
The recommended configuration-free approach:
from pmcgrab.application.processing import process_single_pmc
from pmcgrab.infrastructure.settings import next_email

# Process a single article
email = next_email()  # Automatic email rotation
data = process_single_pmc("7114487")

if data:
    print(f"Successfully processed: {data['title']}")
else:
    print("Processing failed")
Batch Processing Configuration¶
For processing multiple articles, use the standard pattern:
# ─── Recommended Batch Processing Pattern ────────────────────────────────────
import json
from pathlib import Path

from pmcgrab.application.processing import process_single_pmc
from pmcgrab.infrastructure.settings import next_email

# The PMC IDs we want to process
PMC_IDS = ["7114487", "3084273", "7690653", "5707528", "7979870"]

OUT_DIR = Path("pmc_output")
OUT_DIR.mkdir(exist_ok=True)

for pmcid in PMC_IDS:
    email = next_email()
    print(f"• Fetching PMC{pmcid} using email {email} …")
    data = process_single_pmc(pmcid)
    if data is None:
        print(f"  ↳ FAILED to parse PMC{pmcid}")
        continue

    # Pretty-print a few key fields
    print(
        f"  Title   : {data['title'][:80]}{'…' if len(data['title']) > 80 else ''}\n"
        f"  Abstract: {data['abstract'][:120]}{'…' if len(data['abstract']) > 120 else ''}\n"
        f"  Authors : {len(data['authors']) if data['authors'] else 0}"
    )

    # Persist the full JSON record
    dest = OUT_DIR / f"PMC{pmcid}.json"
    with dest.open("w", encoding="utf-8") as fh:
        json.dump(data, fh, indent=2, ensure_ascii=False)
    print(f"  ↳ JSON saved to {dest}\n")
Error Handling Configuration¶
process_single_pmc returns None when an article cannot be parsed, so batch runs can degrade gracefully; wrapping the call also guards against unexpected exceptions:
from pmcgrab.application.processing import process_single_pmc
from pmcgrab.infrastructure.settings import next_email

def robust_processing(pmcids):
    """Process PMC IDs with robust error handling."""
    successful = []
    failed = []
    for pmcid in pmcids:
        email = next_email()
        try:
            data = process_single_pmc(pmcid)
            if data is not None:
                successful.append((pmcid, data))
                print(f"Success PMC{pmcid}: {data['title'][:50]}...")
            else:
                failed.append(pmcid)
                print(f"Error PMC{pmcid}: No data returned")
        except Exception as e:
            failed.append(pmcid)
            print(f"Error PMC{pmcid}: {e}")
    return successful, failed

# Usage
pmcids = ["7114487", "3084273", "invalid_id", "7690653"]
successful, failed = robust_processing(pmcids)
print(f"Processed: {len(successful)}, Failed: {len(failed)}")
Performance Configuration¶
Memory-Efficient Processing¶
For large datasets, process in chunks to manage memory:
import gc
import json
from pathlib import Path

from pmcgrab.application.processing import process_single_pmc
from pmcgrab.infrastructure.settings import next_email

def memory_efficient_processing(pmcids, output_dir="results", batch_size=10):
    """Process large datasets with memory management."""
    output_path = Path(output_dir)
    output_path.mkdir(exist_ok=True)
    for i in range(0, len(pmcids), batch_size):
        batch = pmcids[i:i + batch_size]
        print(f"Processing batch {i // batch_size + 1}: {len(batch)} articles")
        for pmcid in batch:
            email = next_email()
            data = process_single_pmc(pmcid)
            if data is not None:
                output_file = output_path / f"PMC{pmcid}.json"
                with output_file.open("w", encoding="utf-8") as f:
                    json.dump(data, f, indent=2, ensure_ascii=False)
                print(f"  Saved PMC{pmcid}")
                del data  # Release the parsed article before the next one
            else:
                print(f"  Failed PMC{pmcid}")
        # Force garbage collection after each batch
        gc.collect()

# Usage for large datasets
large_pmcid_list = [str(i) for i in range(7000000, 7000100)]
memory_efficient_processing(large_pmcid_list, batch_size=20)
Progress Tracking Configuration¶
Add progress tracking for long-running processes:
import json
from pathlib import Path

from tqdm import tqdm

from pmcgrab.application.processing import process_single_pmc
from pmcgrab.infrastructure.settings import next_email

def process_with_progress(pmcids, output_dir="results"):
    """Process with progress tracking."""
    successful = 0
    for pmcid in tqdm(pmcids, desc="Processing papers"):
        email = next_email()
        data = process_single_pmc(pmcid)
        if data is not None:
            # Save and count success
            output_file = Path(output_dir) / f"PMC{pmcid}.json"
            output_file.parent.mkdir(exist_ok=True)
            with output_file.open("w", encoding="utf-8") as f:
                json.dump(data, f, indent=2, ensure_ascii=False)
            successful += 1
            tqdm.write(f"Success {data['title'][:40]}...")
        else:
            tqdm.write(f"Error PMC{pmcid}: Failed")
    print(f"Completed: {successful}/{len(pmcids)} papers")

# Usage
pmcids = ["7114487", "3084273", "7690653", "5707528"]
process_with_progress(pmcids)
Output Configuration¶
Custom Output Directories¶
Organize output with custom directory structures:
from datetime import datetime
from pathlib import Path

# Create timestamped directories
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = Path(f"pmc_batch_{timestamp}")
output_dir.mkdir(exist_ok=True)

# Or organize by topic
topic_dir = Path("cancer_research_papers")
topic_dir.mkdir(exist_ok=True)
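Putting this together, a short sketch that saves each article into the timestamped directory created above (it reuses output_dir from the previous snippet):

import json

from pmcgrab.application.processing import process_single_pmc

for pmcid in ["7114487", "3084273"]:
    data = process_single_pmc(pmcid)
    if data is not None:
        dest = output_dir / f"PMC{pmcid}.json"
        with dest.open("w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, ensure_ascii=False)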
JSON Formatting Options¶
Control JSON output formatting:
import json
from pmcgrab.application.processing import process_single_pmc

data = process_single_pmc("7114487")
if data:
    # Compact JSON (smaller files)
    with open("compact.json", "w", encoding="utf-8") as f:
        json.dump(data, f, separators=(",", ":"), ensure_ascii=False)

    # Pretty JSON (human readable)
    with open("pretty.json", "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

    # ensure_ascii=False together with an explicit UTF-8 encoding preserves
    # non-ASCII characters (author names, Greek symbols, etc.) verbatim
    with open("unicode.json", "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
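To quantify the trade-off, compare the sizes of the files written above; compact output is usually noticeably smaller for long articles:

from pathlib import Path

for name in ("compact.json", "pretty.json"):
    size_kb = Path(name).stat().st_size / 1024
    print(f"{name}: {size_kb:.1f} KiB")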
Command Line Configuration¶
For command-line usage, PMCGrab provides several configuration options:
# Basic usage
uv run python -m pmcgrab PMC7114487

# With custom settings
uv run python -m pmcgrab \
    --output-dir ./results \
    --workers 4 \
    --batch-size 10 \
    --email researcher@university.edu \
    PMC7114487 PMC3084273

# From file
uv run python -m pmcgrab --input-file pmcids.txt --output-dir results/
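The input file is assumed here to hold one PMC ID per line (check python -m pmcgrab --help for the exact format your version expects). It is easy to generate from Python:

from pathlib import Path

# Assumption: the CLI reads one ID per line from --input-file
pmcids = ["PMC7114487", "PMC3084273", "PMC7690653"]
Path("pmcids.txt").write_text("\n".join(pmcids) + "\n", encoding="utf-8")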
Best Practices¶
Production Configuration¶
import json
import logging
from datetime import datetime
from pathlib import Path

from pmcgrab.application.processing import process_single_pmc
from pmcgrab.infrastructure.settings import next_email

def production_processing(pmcids, base_output_dir="production"):
    """Production-ready processing with logging and organization."""
    # Set up logging
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    log_file = f"pmcgrab_{timestamp}.log"
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s",
        handlers=[
            logging.FileHandler(log_file),
            logging.StreamHandler(),
        ],
    )
    logger = logging.getLogger(__name__)
    logger.info(f"Starting processing of {len(pmcids)} articles")

    # Create organized output structure
    output_dir = Path(base_output_dir) / f"batch_{timestamp}"
    output_dir.mkdir(parents=True, exist_ok=True)

    stats = {"successful": 0, "failed": 0, "failed_ids": []}
    for pmcid in pmcids:
        email = next_email()
        logger.info(f"Processing PMC{pmcid}")
        data = process_single_pmc(pmcid)
        if data is not None:
            output_file = output_dir / f"PMC{pmcid}.json"
            with output_file.open("w", encoding="utf-8") as f:
                json.dump(data, f, indent=2, ensure_ascii=False)
            stats["successful"] += 1
            logger.info(f"Success: {data['title'][:50]}...")
        else:
            stats["failed"] += 1
            stats["failed_ids"].append(pmcid)
            logger.warning(f"Failed: PMC{pmcid}")

    # Save summary
    summary_file = output_dir / "summary.json"
    with summary_file.open("w", encoding="utf-8") as f:
        json.dump(stats, f, indent=2)

    logger.info(f"Processing complete: {stats['successful']}/{len(pmcids)} successful")
    return stats

# Usage
pmcids = ["7114487", "3084273", "7690653"]
results = production_processing(pmcids)
This configuration approach provides robust, scalable processing while maintaining simplicity and reliability.