API Reference

`cholla_chem`

cholla_chem initialization.

`CIRpyNameResolver`

Bases: ChemicalNameResolver

Resolver using Chemical Identity Resolver via CIRPy.

Source code in cholla_chem/main.py

class CIRpyNameResolver(ChemicalNameResolver):
    """
    Name-to-SMILES resolver that delegates lookups to ``name_to_smiles_cirpy``.

    The resolver registers itself with the base class under the resolver
    type ``"cirpy"`` and is flagged as requiring internet access.
    """

    def __init__(
        self,
        resolver_name: str,
        resolver_weight: float = 1,
        rate_limit_time: float = 10,
    ):
        """
        Set up the resolver.

        Args:
            resolver_name: Name forwarded to the base class.
            resolver_weight: Weight forwarded to the base class.
            rate_limit_time: Rate-limit value forwarded to the base class.
        """
        base_options = {
            "requires_internet": True,
            "rate_limit_time": rate_limit_time,
        }
        super().__init__("cirpy", resolver_name, resolver_weight, **base_options)

    def name_to_smiles(
        self, compound_name_list: List[str]
    ) -> Tuple[Dict[str, str], Dict[str, str]]:
        """
        Resolve the given chemical names with cirpy.

        Args:
            compound_name_list: List of chemical names.

        Returns:
            Tuple of the ``name_to_smiles_cirpy`` result and an
            always-empty second dict.
        """
        return name_to_smiles_cirpy(compound_name_list), {}

`name_to_smiles(compound_name_list)`

Convert chemical names to SMILES using cirpy.

Source code in cholla_chem/main.py

def name_to_smiles(
    self, compound_name_list: List[str]
) -> Tuple[Dict[str, str], Dict[str, str]]:
    """
    Resolve the given chemical names with cirpy.

    Args:
        compound_name_list: List of chemical names.

    Returns:
        Tuple of the ``name_to_smiles_cirpy`` result and an always-empty
        second dict.
    """
    smiles_by_name = name_to_smiles_cirpy(compound_name_list)
    no_errors: Dict[str, str] = {}
    return smiles_by_name, no_errors

`ChemNameCorrector`

Main class for correcting OCR errors in chemical names.

This class orchestrates the correction process by:

1. Applying configured correction strategies
2. Generating candidate corrections
3. Scoring candidates
4. Optionally validating with external tools
5. Returning ranked results

Example

corrector = ChemNameCorrector()
results = corrector.correct("2-ch1oropropanoic acid")
print(results[0].name)  # 2-chloropropanoic acid

With custom configuration

config = CorrectorConfig(max_candidates=50)
corrector = ChemNameCorrector(config)

With external validation

config = CorrectorConfig(enable_external_validation=True)
corrector = ChemNameCorrector(config)
results = corrector.correct("asprin", use_validator=True)

Attributes:

Name	Type	Description
`config`		Configuration for the corrector
`strategies`		List of active correction strategies
`scorer`		Scoring instance for ranking candidates

Source code in cholla_chem/name_manipulation/name_correction/name_corrector.py

class ChemNameCorrector:
    """
    Main class for correcting OCR errors in chemical names.

    This class orchestrates the correction process by:
    1. Applying configured correction strategies
    2. Generating candidate corrections
    3. Scoring candidates
    4. Optionally validating with external tools
    5. Returning ranked results

    Example:
        >>> corrector = ChemNameCorrector()
        >>> results = corrector.correct("2-ch1oropropanoic acid")
        >>> print(results[0].name)
        2-chloropropanoic acid

        >>> # With custom configuration
        >>> config = CorrectorConfig(max_candidates=50)
        >>> corrector = ChemNameCorrector(config)

        >>> # With external validation (the validator is created in
        >>> # __init__ when config.enable_external_validation is set)
        >>> config = CorrectorConfig(enable_external_validation=True)
        >>> corrector = ChemNameCorrector(config)
        >>> results = corrector.correct("asprin", use_validator=True)

    Attributes:
        config: Configuration for the corrector
        strategies: List of active correction strategies
        scorer: Scoring instance for ranking candidates
        validator: External validator instance, or None when external
            validation is disabled in the config
    """

    def __init__(
        self,
        config: Optional[CorrectorConfig] = None,
        strategies: Optional[List[CorrectionStrategy]] = None,
    ):
        """
        Initialize the chemical name corrector.

        Args:
            config: Configuration object (uses defaults if None)
            strategies: Custom list of strategies (uses defaults if None)
        """
        self.config = config or CorrectorConfig()
        self.scorer = ChemicalNameScorer(self.config)

        if strategies is not None:
            self.strategies = strategies
        else:
            self.strategies = self._create_default_strategies()

        # The validator only exists when the config asks for external validation.
        self.validator = None
        if self.config.enable_external_validation:
            self.validator = OPSINValidator()

    def _create_default_strategies(self) -> List[CorrectionStrategy]:
        """Create the default set of correction strategies based on config."""
        strategies: List[CorrectionStrategy] = []

        if self.config.enable_locant_correction:
            strategies.append(LocantCorrectionStrategy())

        if self.config.enable_character_substitution:
            char_strategy = CharacterSubstitutionStrategy(
                max_edits=self.config.max_character_substitution_edits_per_morpheme
            )
            strategies.append(char_strategy)

        if self.config.enable_character_insertion:
            char_insertion_strategy = CharacterInsertionStrategy(
                max_edits=self.config.max_character_insertion_edits_per_morpheme
            )
            strategies.append(char_insertion_strategy)

        if self.config.enable_character_deletion:
            char_deletion_strategy = CharacterDeletionStrategy(
                max_edits=self.config.max_character_deletion_edits_per_morpheme
            )
            strategies.append(char_deletion_strategy)

        if self.config.enable_transposition:
            char_transposition_strategy = CharacterTranspositionStrategy(
                max_edits=self.config.max_transposition_edits_per_morpheme
            )
            strategies.append(char_transposition_strategy)

        if self.config.enable_punctuation_restoration:
            strategies.append(PunctuationRestorationStrategy())

        if self.config.enable_bracket_balancing:
            strategies.append(BracketBalancingStrategy())

        return strategies

    def add_strategy(self, strategy: CorrectionStrategy) -> None:
        """
        Add a custom correction strategy.

        The strategy is appended after the already registered ones.

        Args:
            strategy: The strategy to add
        """
        self.strategies.append(strategy)

    def remove_strategy(self, strategy_name: str) -> bool:
        """
        Remove a strategy by name.

        Only the first strategy whose ``name`` matches is removed.

        Args:
            strategy_name: Name of the strategy to remove

        Returns:
            True if strategy was found and removed, False otherwise
        """
        for i, strategy in enumerate(self.strategies):
            if strategy.name == strategy_name:
                self.strategies.pop(i)
                return True
        return False

    def correct(
        self, name: str, use_validator: bool = True, validate_all: bool = False
    ) -> List[CorrectionCandidate]:
        """
        Correct a chemical name and return ranked candidates.

        Args:
            name: The chemical name to correct
            use_validator: Whether to use external validator
            validate_all: Forwarded to the validation step, which currently
                does not consult it

        Returns:
            List of CorrectionCandidate objects, sorted by score (descending)
        """
        # Generate all candidates
        candidates = self._generate_all_candidates(name)

        # Remove duplicates while preserving best corrections
        unique_candidates = self._deduplicate_candidates(candidates)

        # Score all candidates
        scored_candidates = [
            self.scorer.score(candidate) for candidate in unique_candidates
        ]

        # Filter by minimum score threshold
        filtered_candidates = [
            c for c in scored_candidates if c.score >= self.config.min_score_threshold
        ]

        # Sort by score (descending)
        sorted_candidates = sorted(
            filtered_candidates, key=lambda c: c.score, reverse=True
        )

        # Limit to max candidates
        limited_candidates = sorted_candidates[: self.config.max_candidates]

        if use_validator:
            self._validate_candidates_batch(
                {name: limited_candidates}, self.validator, validate_all
            )

            # Validation adjusts scores in place, so the ranking is recomputed.
            limited_candidates = sorted(
                limited_candidates, key=lambda c: c.score, reverse=True
            )

        return limited_candidates

    def correct_batch(
        self, names: List[str], use_validator: bool = True, validate_all: bool = False
    ) -> Dict[str, List[CorrectionCandidate]]:
        """
        Correct multiple chemical names.

        Candidates for every name are generated first without validation, so
        that the validator is invoked once for the whole batch.

        Args:
            names: List of chemical names to correct
            use_validator: Whether to use external validator
            validate_all: Forwarded to the validation step, which currently
                does not consult it

        Returns:
            Dictionary mapping original names to their candidates
        """
        results = {}
        for name in names:
            results[name] = self.correct(name, use_validator=False)

        if use_validator:
            self._validate_candidates_batch(results, self.validator, validate_all)

        for name in names:
            results[name] = sorted(results[name], key=lambda c: c.score, reverse=True)

        return results

    def _generate_all_candidates(self, name: str) -> List[CorrectionCandidate]:
        """
        Generate candidates from all strategies.

        Every accepted candidate text is also queued as input for further
        correction, unless it was queued before or the queue already holds
        ``config.max_candidates`` entries.
        """
        candidates: List[CorrectionCandidate] = []

        # Queue of (text, number of corrections already applied to it).
        names_to_process = [(name, 0)]
        # Texts already queued. The queue stores tuples, so membership has to
        # be tracked separately for the duplicate check to be effective.
        queued_names = {name}
        for strategy in self.strategies:
            # The queue grows while it is iterated on purpose: texts produced
            # here are fed back to this strategy and to the following ones.
            for name_to_process, num_corrections in names_to_process:
                for new_text, new_corrections in strategy.generate_candidates(
                    name_to_process, num_corrections, self.config
                ):
                    if (
                        len(new_corrections)
                        > self.config.max_corrections_per_candidate
                    ):
                        continue

                    candidate = CorrectionCandidate(
                        name=new_text,
                        original_name=name,
                        corrections=new_corrections,
                    )
                    candidates.append(candidate)

                    if new_text in queued_names:
                        continue
                    if len(names_to_process) >= self.config.max_candidates:
                        continue
                    queued_names.add(new_text)
                    names_to_process.append((new_text, len(new_corrections)))

        return candidates

    def _deduplicate_candidates(
        self, candidates: List[CorrectionCandidate]
    ) -> List[CorrectionCandidate]:
        """Remove duplicate candidates, keeping the one with fewer corrections."""
        seen: Dict[str, CorrectionCandidate] = {}

        for candidate in candidates:
            if candidate.name not in seen:
                seen[candidate.name] = candidate
            else:
                # Keep the one with fewer corrections
                if candidate.num_corrections < seen[candidate.name].num_corrections:
                    seen[candidate.name] = candidate

        return list(seen.values())

    def _validate_candidates_batch(
        self,
        candidates: Dict[str, List[CorrectionCandidate]],
        validator: Optional[Validator],
        validate_all: bool,
    ) -> None:
        """
        Validate candidates using external validator.

        Candidates are updated in place: they are marked as validated, get
        the validator's result attached, and their score is raised by 0.3
        (capped at 1.0) when valid or lowered by 0.2 (floored at 0.0) when
        invalid.

        Args:
            candidates: Mapping of original name to its candidates
            validator: Validator to use; nothing happens when it is falsy
            validate_all: Accepted for interface compatibility; not consulted
                by this implementation
        """
        if not validator:
            return

        # Different original names can yield the same candidate text, so each
        # text maps to every candidate object that carries it.
        candidates_by_name: Dict[str, List[CorrectionCandidate]] = {}
        for candidates_list in candidates.values():
            for candidate in candidates_list:
                candidates_by_name.setdefault(candidate.name, []).append(candidate)

        if not candidates_by_name:
            # Nothing to validate; skip the external call.
            return

        validator_outputs = validator.batch_validate(list(candidates_by_name))

        for candidate_name, (is_valid, result) in validator_outputs.items():
            # Names the validator reports that were never submitted are ignored.
            for candidate in candidates_by_name.get(candidate_name, ()):
                candidate.validated = True
                candidate.validation_result = result

                if is_valid:
                    # Boost score for valid candidates
                    candidate.score = min(1.0, candidate.score + 0.3)
                else:
                    # Lower score for invalid candidates
                    candidate.score = max(0.0, candidate.score - 0.2)

    def get_best_candidate(
        self, name: str, use_validator: bool = True
    ) -> Optional[CorrectionCandidate]:
        """
        Get the single best correction candidate.

        Args:
            name: Chemical name to correct
            use_validator: Whether to use external validator

        Returns:
            Best candidate, or None if no candidates found
        """
        candidates = self.correct(name, use_validator=use_validator)
        return candidates[0] if candidates else None

    def explain_corrections(self, candidate: CorrectionCandidate) -> str:
        """
        Generate a human-readable explanation of corrections.

        Args:
            candidate: The candidate to explain

        Returns:
            Multi-line string explaining all corrections
        """
        lines = [
            f"Original: {candidate.original_name}",
            f"Corrected: {candidate.name}",
            f"Score: {candidate.score:.3f}",
            f"Number of corrections: {candidate.num_corrections}",
            "",
            "Score components:",
        ]

        for component, value in candidate.score_components.items():
            lines.append(f"  - {component}: {value:.3f}")

        if candidate.corrections:
            lines.append("")
            lines.append("Corrections applied:")
            for i, correction in enumerate(candidate.corrections, 1):
                lines.append(
                    f"  {i}. [{correction.correction_type.name}] "
                    f"'{correction.original}' → '{correction.replacement}'"
                )
                if correction.description:
                    lines.append(f"     {correction.description}")

        if candidate.validated:
            lines.append("")
            lines.append(f"Validated: {candidate.validation_result or 'No result'}")

        return "\n".join(lines)

`__init__(config=None, strategies=None)`

Initialize the chemical name corrector.

Parameters:

Name	Type	Description	Default
`config`	`Optional[CorrectorConfig]`	Configuration object (uses defaults if None)	`None`
`strategies`	`Optional[List[CorrectionStrategy]]`	Custom list of strategies (uses defaults if None)	`None`

Source code in cholla_chem/name_manipulation/name_correction/name_corrector.py

def __init__(
    self,
    config: Optional[CorrectorConfig] = None,
    strategies: Optional[List[CorrectionStrategy]] = None,
):
    """
    Build a corrector from an optional config and strategy list.

    Args:
        config: Configuration object; a default ``CorrectorConfig`` is
            created when none (or a falsy value) is given
        strategies: Strategies to use as-is; when None, the default set
            derived from the config is created
    """
    self.config = config if config else CorrectorConfig()
    self.scorer = ChemicalNameScorer(self.config)

    self.strategies = (
        self._create_default_strategies() if strategies is None else strategies
    )

    # The validator only exists when the config asks for external validation.
    self.validator = (
        OPSINValidator() if self.config.enable_external_validation else None
    )

`add_strategy(strategy)`

Add a custom correction strategy.

Parameters:

Name	Type	Description	Default
`strategy`	`CorrectionStrategy`	The strategy to add	required

Source code in cholla_chem/name_manipulation/name_correction/name_corrector.py

def add_strategy(self, strategy: CorrectionStrategy) -> None:
    """
    Register an additional correction strategy.

    The strategy is appended to the end of ``self.strategies``.

    Args:
        strategy: The strategy to add
    """
    active_strategies = self.strategies
    active_strategies.append(strategy)

`correct(name, use_validator=True, validate_all=False)`

Correct a chemical name and return ranked candidates.

Parameters:

Name	Type	Description	Default
`name`	`str`	The chemical name to correct	required
`use_validator`	`bool`	Whether to use external validator	`True`
`validate_all`	`bool`	Whether to validate all candidates or just the top ones	`False`

Returns:

Type	Description
`List[CorrectionCandidate]`	List of CorrectionCandidate objects, sorted by score (descending)

Source code in cholla_chem/name_manipulation/name_correction/name_corrector.py

def correct(
    self, name: str, use_validator: bool = True, validate_all: bool = False
) -> List[CorrectionCandidate]:
    """
    Produce ranked correction candidates for one chemical name.

    The pipeline generates candidates, deduplicates and scores them, drops
    those below ``config.min_score_threshold``, keeps at most
    ``config.max_candidates`` of the highest scoring ones, and optionally
    runs them through the validation step.

    Args:
        name: The chemical name to correct
        use_validator: Whether to run the validation step
        validate_all: Passed through to the validation step

    Returns:
        List of CorrectionCandidate objects, sorted by score (descending)
    """

    def by_score(candidate):
        return candidate.score

    generated = self._generate_all_candidates(name)
    scored = [
        self.scorer.score(candidate)
        for candidate in self._deduplicate_candidates(generated)
    ]

    # Threshold filter, then best-first ordering, then the size cap.
    ranked = sorted(
        (c for c in scored if c.score >= self.config.min_score_threshold),
        key=by_score,
        reverse=True,
    )
    shortlist = ranked[: self.config.max_candidates]

    if not use_validator:
        return shortlist

    self._validate_candidates_batch({name: shortlist}, self.validator, validate_all)
    # Scores may have changed during validation, so rank once more.
    return sorted(shortlist, key=by_score, reverse=True)

`correct_batch(names, use_validator=True, validate_all=False)`

Correct multiple chemical names.

Parameters:

Name	Type	Description	Default
`names`	`List[str]`	List of chemical names to correct	required
`use_validator`	`bool`	Whether to use external validator	`True`
`validate_all`	`bool`	Whether to validate all candidates or just the top ones	`False`

Returns:

Type	Description
`Dict[str, List[CorrectionCandidate]]`	Dictionary mapping original names to their candidates

Source code in cholla_chem/name_manipulation/name_correction/name_corrector.py

def correct_batch(
    self, names: List[str], use_validator: bool = True, validate_all: bool = False
) -> Dict[str, List[CorrectionCandidate]]:
    """
    Produce ranked correction candidates for several chemical names.

    Each name is first corrected without validation; the validation step
    then runs once over the combined results, and every candidate list is
    re-ranked afterwards.

    Args:
        names: List of chemical names to correct
        use_validator: Whether to run the validation step
        validate_all: Passed through to the validation step

    Returns:
        Dictionary mapping original names to their candidates
    """
    per_name = {
        original: self.correct(original, use_validator=False) for original in names
    }

    if use_validator:
        self._validate_candidates_batch(per_name, self.validator, validate_all)

    # Rank best-first; validation may have changed the scores.
    for original, found in per_name.items():
        per_name[original] = sorted(found, key=lambda c: c.score, reverse=True)

    return per_name

`explain_corrections(candidate)`

Generate a human-readable explanation of corrections.

Parameters:

Name	Type	Description	Default
`candidate`	`CorrectionCandidate`	The candidate to explain	required

Returns:

Type	Description
`str`	Multi-line string explaining all corrections

Source code in cholla_chem/name_manipulation/name_correction/name_corrector.py

def explain_corrections(self, candidate: CorrectionCandidate) -> str:
    """
    Render a candidate as a human-readable, multi-line report.

    The report lists the original and corrected names, the score and its
    components, every applied correction (with its description when one is
    set) and, for validated candidates, the validation result.

    Args:
        candidate: The candidate to explain

    Returns:
        Multi-line string explaining all corrections
    """
    report = [
        f"Original: {candidate.original_name}",
        f"Corrected: {candidate.name}",
        f"Score: {candidate.score:.3f}",
        f"Number of corrections: {candidate.num_corrections}",
        "",
        "Score components:",
    ]
    report.extend(
        f"  - {component}: {value:.3f}"
        for component, value in candidate.score_components.items()
    )

    if candidate.corrections:
        report += ["", "Corrections applied:"]
        for index, correction in enumerate(candidate.corrections, start=1):
            kind = correction.correction_type.name
            report.append(
                f"  {index}. [{kind}] "
                f"'{correction.original}' → '{correction.replacement}'"
            )
            if correction.description:
                report.append(f"     {correction.description}")

    if candidate.validated:
        report += ["", f"Validated: {candidate.validation_result or 'No result'}"]

    return "\n".join(report)

`get_best_candidate(name, use_validator=True)`

Get the single best correction candidate.

Parameters:

Name	Type	Description	Default
`name`	`str`	Chemical name to correct	required
`use_validator`	`bool`	Whether to use external validator	`True`

Returns:

Type	Description
`Optional[CorrectionCandidate]`	Best candidate, or None if no candidates found

Source code in cholla_chem/name_manipulation/name_correction/name_corrector.py

def get_best_candidate(
    self, name: str, use_validator: bool = True
) -> Optional[CorrectionCandidate]:
    """
    Get the single best correction candidate.

    Delegates to ``correct`` and returns the first (highest ranked) entry
    of its result.

    Args:
        name: Chemical name to correct
        use_validator: Whether to use external validator

    Returns:
        Best candidate, or None if no candidates found
    """
    candidates = self.correct(name, use_validator=use_validator)
    return candidates[0] if candidates else None

`remove_strategy(strategy_name)`

Remove a strategy by name.

Parameters:

Name	Type	Description	Default
`strategy_name`	`str`	Name of the strategy to remove	required

Returns:

Type	Description
`bool`	True if strategy was found and removed, False otherwise

Source code in cholla_chem/name_manipulation/name_correction/name_corrector.py

def remove_strategy(self, strategy_name: str) -> bool:
    """
    Drop the first registered strategy with the given name.

    Args:
        strategy_name: Name of the strategy to remove

    Returns:
        True if strategy was found and removed, False otherwise
    """
    position = next(
        (
            index
            for index, registered in enumerate(self.strategies)
            if registered.name == strategy_name
        ),
        None,
    )
    if position is None:
        return False

    self.strategies.pop(position)
    return True

`ChemSpiPyResolver`

Bases: ChemicalNameResolver

Resolver using chemspipy.

Source code in cholla_chem/main.py

class ChemSpiPyResolver(ChemicalNameResolver):
    """
    Name-to-SMILES resolver that delegates lookups to
    ``name_to_smiles_chemspipy``.

    The resolver registers itself with the base class under the resolver
    type ``"chemspipy"`` and is flagged as requiring internet access.
    """

    def __init__(
        self,
        resolver_name: str,
        chemspider_api_key: str,
        resolver_weight: float = 3,
        rate_limit_time: float = 10,
    ):
        """
        Set up the resolver.

        Args:
            resolver_name: Name forwarded to the base class.
            chemspider_api_key: API key handed to ``name_to_smiles_chemspipy``
                on every lookup. A falsy value is stored without a type check.
            resolver_weight: Weight forwarded to the base class.
            rate_limit_time: Rate-limit value forwarded to the base class.

        Raises:
            TypeError: If a truthy ``chemspider_api_key`` is not a string.
        """
        base_options = {
            "requires_internet": True,
            "rate_limit_time": rate_limit_time,
        }
        super().__init__("chemspipy", resolver_name, resolver_weight, **base_options)

        if chemspider_api_key and not isinstance(chemspider_api_key, str):
            raise TypeError("Invalid input: chemspider_api_key must be a string.")

        self._requires_internet = True
        self._chemspider_api_key = chemspider_api_key

    def name_to_smiles(
        self,
        compound_name_list: List[str],
    ) -> Tuple[Dict[str, str], Dict[str, str]]:
        """
        Resolve the given chemical names with ChemSpiPy.

        Args:
            compound_name_list: List of chemical names.

        Returns:
            Tuple of the ``name_to_smiles_chemspipy`` result and an
            always-empty second dict.
        """
        api_key = self._chemspider_api_key
        return name_to_smiles_chemspipy(compound_name_list, api_key), {}

`name_to_smiles(compound_name_list)`

Convert chemical names to SMILES using ChemSpiPy.

Source code in cholla_chem/main.py

def name_to_smiles(
    self,
    compound_name_list: List[str],
) -> Tuple[Dict[str, str], Dict[str, str]]:
    """
    Resolve the given chemical names with ChemSpiPy.

    Args:
        compound_name_list: List of chemical names.

    Returns:
        Tuple of the ``name_to_smiles_chemspipy`` result (called with the
        stored API key) and an always-empty second dict.
    """
    api_key = self._chemspider_api_key
    return name_to_smiles_chemspipy(compound_name_list, api_key), {}

`ChemicalNameResolver`

Bases: ABC

Abstract base class for chemical name-to-SMILES resolvers.

Subclasses must implement the `name_to_smiles` method.

Source code in cholla_chem/main.py

class ChemicalNameResolver(ABC):
    """
    Abstract base class for chemical name-to-SMILES resolvers.

    Subclasses must implement the `name_to_smiles` method.
    """

    def __init__(
        self,
        resolver_type: str,
        resolver_name: str,
        resolver_weight: float,
        requires_internet: bool = False,
        rate_limit_time: Optional[float] = None,
    ):
        """
        Validate and store the resolver settings.

        Args:
            resolver_type: Identifier of the resolver implementation.
            resolver_name: Name of this resolver instance.
            resolver_weight: Number in the range 0-1000; stored as a float.
            requires_internet: Whether the resolver needs internet access.
            rate_limit_time: Optional rate-limit value.

        Raises:
            TypeError: If ``resolver_type`` or ``resolver_name`` is not a
                string, or ``resolver_weight`` is not an int or float.
            ValueError: If ``resolver_weight`` is NaN or outside 0-1000.
        """
        if not isinstance(resolver_type, str):
            raise TypeError("Invalid input: resolver_type must be a string.")
        self._resolver_type: str = resolver_type
        if not isinstance(resolver_name, str):
            raise TypeError("Invalid input: resolver_name must be a string.")
        self._resolver_name: str = resolver_name
        if not isinstance(resolver_weight, (int, float)):
            raise TypeError(
                "Invalid input: resolver_weight must be a number between 0-1000."
            )
        # Written as a positive range test so that NaN, for which every
        # comparison is False, is rejected as well.
        if not 0 <= resolver_weight <= 1000:
            raise ValueError(
                "Invalid input: resolver_weight must be a number between 0-1000."
            )
        self._resolver_weight: float = float(resolver_weight)
        self._requires_internet: bool = requires_internet
        self._rate_limit_time: Optional[float] = rate_limit_time

    @property
    def resolver_name(self) -> str:
        """Return resolver_name."""
        return self._resolver_name

    @property
    def resolver_weight(self) -> float:
        """Return resolver_weight."""
        return self._resolver_weight

    @property
    def requires_internet(self) -> bool:
        """Return requires_internet."""
        return self._requires_internet

    @property
    def rate_limit_time(self) -> Optional[float]:
        """Return rate_limit_time."""
        return self._rate_limit_time

    @abstractmethod
    def name_to_smiles(
        self, compound_name_list: List[str]
    ) -> Tuple[Dict[str, str], Dict[str, str]]:
        """
        Convert chemical names to SMILES strings.

        Args:
            compound_name_list: List of chemical names.

        Returns:
            Tuple of:
                - Dict mapping successful names to SMILES.
                - Dict mapping failed names to error messages.
        """
        pass

`rate_limit_time` `property`

Return rate_limit_time.

`requires_internet` `property`

Return requires_internet.

`resolver_name` `property`

Return resolver_name.

`resolver_weight` `property`

Return resolver_weight.

`name_to_smiles(compound_name_list)` `abstractmethod`

Convert chemical names to SMILES strings.

Parameters:

Name	Type	Description	Default
`compound_name_list`	`List[str]`	List of chemical names.	required

Returns:

Type	Description
`Tuple[Dict[str, str], Dict[str, str]]`	Tuple of: - Dict mapping successful names to SMILES. - Dict mapping failed names to error messages.

Source code in cholla_chem/main.py

@abstractmethod
def name_to_smiles(
    self, compound_name_list: List[str]
) -> Tuple[Dict[str, str], Dict[str, str]]:
    """
    Resolve chemical names to SMILES strings (to be provided by subclasses).

    Args:
        compound_name_list: List of chemical names.

    Returns:
        A pair of dicts: the first maps each successfully resolved name to
        its SMILES, the second maps each failed name to an error message.
    """

`CorrectorConfig` `dataclass`

Configuration for the ChemNameCorrector.

Attributes:

Name	Type	Description
`max_candidates`	`int`	Maximum number of candidates to generate
`max_corrections_per_candidate`	`int`	Maximum corrections per candidate
`min_score_threshold`	`float`	Minimum score to include candidate in results
`enable_character_substitution`	`bool`	Enable OCR character correction
`max_character_substitution_edits_per_morpheme`	`int`	Max number of substitution edits per morpheme
`enable_punctuation_restoration`	`bool`	Enable missing punctuation detection
`enable_bracket_balancing`	`bool`	Enable bracket matching correction
`custom_substitutions`	`Dict[str, List[str]]`	Additional user-defined substitution rules
`custom_rules`	`List[CorrectionRule]`	Additional user-defined correction rules
`enable_external_validation`	`bool`	Enable external validation of candidates

Source code in cholla_chem/name_manipulation/name_correction/dataclasses.py

@dataclass
class CorrectorConfig:
    """
    Configuration for the ChemNameCorrector.

    Attributes:
        max_candidates: Maximum number of candidates to generate
        max_corrections_per_candidate: Maximum corrections per candidate
        min_score_threshold: Minimum score to include candidate in results
        enable_locant_correction: Enable locant correction
        enable_character_substitution: Enable OCR character correction
        max_character_substitution_edits_per_morpheme: Max number of
            substitution edits per morpheme
        enable_character_insertion: Enable character insertion correction
        max_character_insertion_edits_per_morpheme: Max number of insertion
            edits per morpheme
        enable_character_deletion: Enable character deletion correction
        max_character_deletion_edits_per_morpheme: Max number of deletion
            edits per morpheme
        enable_transposition: Enable character transposition correction
        max_transposition_edits_per_morpheme: Max number of transposition
            edits per morpheme
        enable_punctuation_restoration: Enable missing punctuation detection
        enable_bracket_balancing: Enable bracket matching correction
        custom_substitutions: Additional user-defined substitution rules
        custom_rules: Additional user-defined correction rules
        enable_external_validation: Enable external validation of candidates
    """

    max_candidates: int = 100
    max_corrections_per_candidate: int = 3
    min_score_threshold: float = 0.1
    enable_locant_correction: bool = True

    enable_character_substitution: bool = True
    max_character_substitution_edits_per_morpheme: int = 1

    enable_character_insertion: bool = True
    max_character_insertion_edits_per_morpheme: int = 1

    enable_character_deletion: bool = True
    max_character_deletion_edits_per_morpheme: int = 1

    enable_transposition: bool = True
    max_transposition_edits_per_morpheme: int = 1

    enable_punctuation_restoration: bool = False
    enable_bracket_balancing: bool = False
    # default_factory gives every config instance its own dict / list.
    custom_substitutions: Dict[str, List[str]] = field(default_factory=dict)
    custom_rules: List[CorrectionRule] = field(default_factory=list)
    enable_external_validation: bool = True

`InorganicShorthandNameResolver`

Bases: ChemicalNameResolver

Resolver using inorganic shorthand (e.g. [Cp*RhCl2]2).

Source code in cholla_chem/main.py

class InorganicShorthandNameResolver(ChemicalNameResolver):
    """
    Name-to-SMILES resolver for inorganic shorthand (e.g. [Cp*RhCl2]2).

    Lookups are delegated to ``name_to_smiles_inorganic_shorthand``. The
    resolver registers itself under the resolver type
    ``"inorganic_shorthand"``, without an internet requirement or rate limit.
    """

    def __init__(
        self,
        resolver_name: str,
        resolver_weight: float = 2,
    ):
        """
        Set up the resolver.

        Args:
            resolver_name: Name forwarded to the base class.
            resolver_weight: Weight forwarded to the base class.
        """
        base_options = {"requires_internet": False, "rate_limit_time": None}
        super().__init__(
            "inorganic_shorthand", resolver_name, resolver_weight, **base_options
        )
        self._requires_internet = False

    def name_to_smiles(
        self, compound_name_list: List[str]
    ) -> Tuple[Dict[str, str], Dict[str, str]]:
        """
        Resolve the given names with the inorganic shorthand converter.

        Args:
            compound_name_list: List of chemical names.

        Returns:
            Tuple of the ``name_to_smiles_inorganic_shorthand`` result and
            an always-empty second dict.
        """
        return name_to_smiles_inorganic_shorthand(compound_name_list), {}

`name_to_smiles(compound_name_list)`

Convert chemical names to SMILES using inorganic shorthand converter.

Source code in cholla_chem/main.py

def name_to_smiles(
    self, compound_name_list: List[str]
) -> Tuple[Dict[str, str], Dict[str, str]]:
    """
    Resolve the given names with the inorganic shorthand converter.

    Args:
        compound_name_list: List of chemical names.

    Returns:
        Tuple of the ``name_to_smiles_inorganic_shorthand`` result and an
        always-empty second dict.
    """
    smiles_by_name = name_to_smiles_inorganic_shorthand(compound_name_list)
    no_errors: Dict[str, str] = {}
    return smiles_by_name, no_errors

`ManualNameResolver`

Bases: ChemicalNameResolver

Resolver using manually curated names and corresponding SMILES.

Source code in cholla_chem/main.py

class ManualNameResolver(ChemicalNameResolver):
    """
    Resolver using manually curated names and corresponding SMILES.

    An optional name -> SMILES dictionary can be supplied at construction time;
    it is used by name_to_smiles whenever no per-call dictionary is given.
    """

    def __init__(
        self,
        resolver_name: str,
        provided_name_dict: dict | None = None,
        resolver_weight: float = 10,
    ):
        """
        Set up the resolver and validate the optional name dictionary.

        Args:
            resolver_name (str): Name forwarded to ChemicalNameResolver.
            provided_name_dict (dict | None, optional): Mapping of compound names to SMILES
                strings. Defaults to None.
            resolver_weight (float, optional): Weight forwarded to ChemicalNameResolver.
                Defaults to 10.

        Raises:
            TypeError: If provided_name_dict is given but is not a dictionary.
            ValueError: If any key or value in provided_name_dict is not a string.
        """
        super().__init__(
            "manual",
            resolver_name,
            resolver_weight,
            requires_internet=False,
            rate_limit_time=None,
        )
        # Validate whenever a value was supplied. Comparing against None (rather than
        # relying on truthiness) ensures falsy non-dict inputs such as [] or "" are
        # rejected instead of being stored silently.
        if provided_name_dict is not None:
            if not isinstance(provided_name_dict, dict):
                raise TypeError(
                    "Invalid input: provided_name_dict must be a dictionary."
                )
            for k, v in provided_name_dict.items():
                if not isinstance(k, str) or not isinstance(v, str):
                    raise ValueError(
                        "Invalid input: keys and values in provided_name_dict must be strings."
                    )

        self._provided_name_dict = provided_name_dict
        self._requires_internet = False

    def name_to_smiles(
        self,
        compound_name_list: List[str],
        provided_name_dict: Dict[str, str] | None = None,
    ) -> Tuple[Dict[str, str], Dict[str, str]]:
        """
        Convert chemical names to SMILES using manual name database.

        Args:
            compound_name_list (List[str]): Names to look up.
            provided_name_dict (Dict[str, str] | None, optional): Per-call name dictionary.
                When None, the dictionary given to the constructor (possibly None) is used.

        Returns:
            Tuple[Dict[str, str], Dict[str, str]]: The result of name_to_smiles_manual and
            an empty dict (the second slot is always empty for this resolver).
        """
        if provided_name_dict is None:
            provided_name_dict = self._provided_name_dict
        resolved_names = name_to_smiles_manual(compound_name_list, provided_name_dict)
        return resolved_names, {}

`name_to_smiles(compound_name_list, provided_name_dict=None)`

Convert chemical names to SMILES using manual name database.

Source code in cholla_chem/main.py

def name_to_smiles(
    self,
    compound_name_list: List[str],
    provided_name_dict: Dict[str, str] | None = None,
) -> Tuple[Dict[str, str], Dict[str, str]]:
    """
    Look up chemical names in the manual name database and return their SMILES.

    Args:
        compound_name_list (List[str]): Names to look up.
        provided_name_dict (Dict[str, str] | None, optional): Per-call name dictionary.
            When None, the dictionary stored on the instance is used instead.

    Returns:
        Tuple[Dict[str, str], Dict[str, str]]: The result of name_to_smiles_manual and
        an empty dict (the second slot is always empty for this resolver).
    """
    # A per-call dictionary takes precedence over the one stored on the instance.
    name_dict = (
        self._provided_name_dict if provided_name_dict is None else provided_name_dict
    )
    return name_to_smiles_manual(compound_name_list, name_dict), {}

`OpsinNameResolver`

Bases: ChemicalNameResolver

Resolver using OPSIN via py2opsin.

Source code in cholla_chem/main.py

class OpsinNameResolver(ChemicalNameResolver):
    """
    Resolver backed by OPSIN (through py2opsin).

    The OPSIN options given to the constructor are stored on the instance and
    forwarded to name_to_smiles_opsin on every name_to_smiles call.
    """

    def __init__(
        self,
        resolver_name: str,
        resolver_weight: float = 3,
        allow_acid: bool = False,
        allow_radicals: bool = True,
        allow_bad_stereo: bool = False,
        wildcard_radicals: bool = False,
        jar_fpath: str = "opsin-cli.jar",
    ):
        """
        Set up the resolver and remember the OPSIN options.

        Args:
            resolver_name (str): Name forwarded to ChemicalNameResolver.
            resolver_weight (float, optional): Weight forwarded to ChemicalNameResolver.
                Defaults to 3.
            allow_acid (bool, optional): Forwarded to name_to_smiles_opsin. Defaults to False.
            allow_radicals (bool, optional): Forwarded to name_to_smiles_opsin. Defaults to True.
            allow_bad_stereo (bool, optional): Forwarded to name_to_smiles_opsin. Defaults to False.
            wildcard_radicals (bool, optional): Forwarded to name_to_smiles_opsin. Defaults to False.
            jar_fpath (str, optional): Stored on the instance. Defaults to "opsin-cli.jar".
        """
        # NOTE(review): requires_internet is not passed here, unlike the other
        # resolvers in this file, so the base-class default applies — confirm
        # that default is the intended value for OPSIN.
        super().__init__(
            "opsin",
            resolver_name,
            resolver_weight,
            rate_limit_time=None,
        )
        # NOTE(review): _jar_fpath is stored but not forwarded by name_to_smiles
        # below — confirm whether name_to_smiles_opsin should receive it.
        self._jar_fpath = jar_fpath
        self._wildcard_radicals = wildcard_radicals
        self._allow_bad_stereo = allow_bad_stereo
        self._allow_radicals = allow_radicals
        self._allow_acid = allow_acid

    def name_to_smiles(
        self, compound_name_list: List[str]
    ) -> Tuple[Dict[str, str], Dict[str, str]]:
        """
        Resolve chemical names to SMILES with OPSIN.

        Args:
            compound_name_list (List[str]): Names to hand to name_to_smiles_opsin.

        Returns:
            Tuple[Dict[str, str], Dict[str, str]]: The resolved names and the failure
            message dict, both as returned by name_to_smiles_opsin.
        """
        opsin_options = {
            "allow_acid": self._allow_acid,
            "allow_radicals": self._allow_radicals,
            "allow_bad_stereo": self._allow_bad_stereo,
            "wildcard_radicals": self._wildcard_radicals,
        }
        smiles_by_name, failures_by_name = name_to_smiles_opsin(
            compound_name_list, **opsin_options
        )
        return smiles_by_name, failures_by_name

`name_to_smiles(compound_name_list)`

Convert chemical names to SMILES using OPSIN.

Source code in cholla_chem/main.py

def name_to_smiles(
    self, compound_name_list: List[str]
) -> Tuple[Dict[str, str], Dict[str, str]]:
    """
    Resolve chemical names to SMILES with OPSIN.

    Args:
        compound_name_list (List[str]): Names to hand to name_to_smiles_opsin.

    Returns:
        Tuple[Dict[str, str], Dict[str, str]]: The resolved names and the failure
        message dict, both as returned by name_to_smiles_opsin.
    """
    opsin_options = {
        "allow_acid": self._allow_acid,
        "allow_radicals": self._allow_radicals,
        "allow_bad_stereo": self._allow_bad_stereo,
        "wildcard_radicals": self._wildcard_radicals,
    }
    smiles_by_name, failures_by_name = name_to_smiles_opsin(
        compound_name_list, **opsin_options
    )
    return smiles_by_name, failures_by_name

`PubChemNameResolver`

Bases: ChemicalNameResolver

Resolver using PubChem via PubChemPy.

Source code in cholla_chem/main.py

class PubChemNameResolver(ChemicalNameResolver):
    """
    Resolver backed by PubChem (through PubChemPy).

    Passes the identifier "pubchem" to the base class and is flagged as
    requiring an internet connection.
    """

    def __init__(
        self,
        resolver_name: str,
        resolver_weight: float = 2,
        rate_limit_time: float = 10,
    ):
        """
        Set up the resolver.

        Args:
            resolver_name (str): Name forwarded to ChemicalNameResolver.
            resolver_weight (float, optional): Weight forwarded to ChemicalNameResolver.
                Defaults to 2.
            rate_limit_time (float, optional): Rate limit value forwarded to
                ChemicalNameResolver. Defaults to 10.
        """
        super().__init__(
            "pubchem",
            resolver_name,
            resolver_weight,
            rate_limit_time=rate_limit_time,
            requires_internet=True,
        )

    def name_to_smiles(
        self, compound_name_list: List[str]
    ) -> Tuple[Dict[str, str], Dict[str, str]]:
        """
        Resolve chemical names to SMILES with PubChem.

        Args:
            compound_name_list (List[str]): Names to hand to name_to_smiles_pubchem.

        Returns:
            Tuple[Dict[str, str], Dict[str, str]]: The result of name_to_smiles_pubchem and
            an empty dict (the second slot is always empty for this resolver).
        """
        return name_to_smiles_pubchem(compound_name_list), {}

`name_to_smiles(compound_name_list)`

Convert chemical names to SMILES using pubchem.

Source code in cholla_chem/main.py

def name_to_smiles(
    self, compound_name_list: List[str]
) -> Tuple[Dict[str, str], Dict[str, str]]:
    """
    Resolve chemical names to SMILES with PubChem.

    Args:
        compound_name_list (List[str]): Names to hand to name_to_smiles_pubchem.

    Returns:
        Tuple[Dict[str, str], Dict[str, str]]: The result of name_to_smiles_pubchem and
        an empty dict (the second slot is always empty for this resolver).
    """
    return name_to_smiles_pubchem(compound_name_list), {}

`StructuralFormulaNameResolver`

Bases: ChemicalNameResolver

Resolver using structural chemical formula (e.g. CH3CH2CH2COOH).

Source code in cholla_chem/main.py

class StructuralFormulaNameResolver(ChemicalNameResolver):
    """
    Resolver for structural chemical formulas (e.g. CH3CH2CH2COOH).

    Passes the identifier "structural_formula" to the base class and is
    flagged as not requiring an internet connection.
    """

    def __init__(
        self,
        resolver_name: str,
        resolver_weight: float = 2,
    ):
        """
        Set up the resolver.

        Args:
            resolver_name (str): Name forwarded to ChemicalNameResolver.
            resolver_weight (float, optional): Weight forwarded to ChemicalNameResolver.
                Defaults to 2.
        """
        super().__init__(
            "structural_formula",
            resolver_name,
            resolver_weight,
            rate_limit_time=None,
            requires_internet=False,
        )
        # Assigned directly in addition to the keyword passed to the base class above.
        self._requires_internet = False

    def name_to_smiles(
        self, compound_name_list: List[str]
    ) -> Tuple[Dict[str, str], Dict[str, str]]:
        """
        Resolve structural formulas to SMILES.

        Args:
            compound_name_list (List[str]): Names to hand to the structural formula converter.

        Returns:
            Tuple[Dict[str, str], Dict[str, str]]: The converter's result and an
            empty dict (the second slot is always empty for this resolver).
        """
        return name_to_smiles_structural_formula(compound_name_list), {}

`name_to_smiles(compound_name_list)`

Convert chemical names to SMILES using structural formula converter.

Source code in cholla_chem/main.py

def name_to_smiles(
    self, compound_name_list: List[str]
) -> Tuple[Dict[str, str], Dict[str, str]]:
    """
    Resolve structural formulas to SMILES.

    Args:
        compound_name_list (List[str]): Names to hand to the structural formula converter.

    Returns:
        Tuple[Dict[str, str], Dict[str, str]]: The converter's result and an
        empty dict (the second slot is always empty for this resolver).
    """
    return name_to_smiles_structural_formula(compound_name_list), {}

`resolve_compounds_to_smiles(compounds_list, resolvers_list=[], smiles_selection_mode='weighted', detailed_name_dict=False, batch_size=500, normalize_unicode=True, split_names_to_solve=True, resolve_peptide_shorthand=True, attempt_name_correction=True, internet_connection_available=True, name_correction_config=None)`

Resolve a list of compound names to their SMILES representations.

Parameters:

Name	Type	Description	Default
`compounds_list`	`List[str]`	A list of compound names.	required
`resolvers_list`	`List[ChemicalNameResolver]`	A list of ChemicalNameResolver instances. Defaults to [].	`[]`
`smiles_selection_mode`	`str`	The method to select the SMILES representation from multiple resolvers. Defaults to 'weighted'.	`'weighted'`
`detailed_name_dict`	`bool`	If True, returns a dictionary with detailed information about each compound. Defaults to False.	`False`
`batch_size`	`int`	The number of compounds to process in each batch. Defaults to 500.	`500`
`normalize_unicode`	`bool`	Whether to normalize Unicode characters in compound names. Defaults to True.	`True`
`split_names_to_solve`	`bool`	Whether to split compound names on common delimiters to solve them as separate compounds. Can be used to solve otherwise unresolvable compound names such as BH3•THF. Defaults to True.	`True`
`resolve_peptide_shorthand`	`bool`	Whether to resolve peptide shorthand notation. Defaults to True.	`True`
`attempt_name_correction`	`bool`	Whether to attempt to correct compound names that are misspelled or contain typos. Defaults to True.	`True`
`internet_connection_available`	`bool`	Whether an internet connection is available to resolve compound names. Defaults to True.	`True`
`name_correction_config`	`CorrectorConfig`	Configuration for name correction. Defaults to None.	`None`

Returns:

Type	Description
`Dict[str, CompoundResolutionEntry] \| Dict[str, CompoundResolutionEntryWithNameCorrection] \| Dict[str, str]`	Dict[str, Dict[str, Dict[str, List[str]]]] \| Dict[str, str]: A dictionary mapping each compound to its SMILES representation and resolvers, or a simple dictionary mapping each compound to its selected SMILES representation.

Source code in cholla_chem/main.py

def resolve_compounds_to_smiles(
    compounds_list: List[str],
    resolvers_list: List[ChemicalNameResolver] = [],
    smiles_selection_mode: str = "weighted",
    detailed_name_dict: bool = False,
    batch_size: int = 500,
    normalize_unicode: bool = True,
    split_names_to_solve: bool = True,
    resolve_peptide_shorthand: bool = True,
    attempt_name_correction: bool = True,
    internet_connection_available: bool = True,
    name_correction_config: Optional[CorrectorConfig] = None,
) -> (
    Dict[str, CompoundResolutionEntry]
    | Dict[str, CompoundResolutionEntryWithNameCorrection]
    | Dict[str, str]
):
    """
    Resolve a list of compound names to their SMILES representations.

    Duplicate names are removed (keeping the order of first occurrence) and empty
    names are dropped before resolution.

    Args:
        compounds_list (List[str]): A list of compound names. A single string is also accepted
            and treated as a one-element list.
        resolvers_list (List[ChemicalNameResolver], optional): A list of ChemicalNameResolver instances.
            Defaults to []. When empty, a default set of PubChem, OPSIN, manual, structural formula
            and inorganic shorthand resolvers is used.
        smiles_selection_mode (str, optional): The method to select the SMILES representation from multiple resolvers.
            A callable is also accepted. Defaults to 'weighted'.
        detailed_name_dict (bool, optional): If True, returns a dictionary with detailed information about each compound.
            Defaults to False.
        batch_size (int, optional): The number of compounds to process in each batch (1-1000). Defaults to 500.
        normalize_unicode (bool, optional): Whether to normalize Unicode characters in compound names. Defaults to True.
        split_names_to_solve (bool, optional): Whether to split compound names on common delimiters to solve them as separate compounds.
            Can be used to solve otherwise unresolvable compound names such as BH3•THF. Defaults to True.
        resolve_peptide_shorthand (bool, optional): Whether to resolve peptide shorthand notation. Defaults to True.
        attempt_name_correction (bool, optional): Whether to attempt to correct compound names that are misspelled or contain typos.
            Defaults to True.
        internet_connection_available (bool, optional): Whether an internet connection is available to resolve compound names.
            When False, resolvers that require internet are filtered out. Defaults to True.
        name_correction_config (CorrectorConfig, optional): Configuration for name correction. Defaults to None.

    Returns:
        Dict[str, CompoundResolutionEntry] | Dict[str, CompoundResolutionEntryWithNameCorrection] | Dict[str, str]:
            When detailed_name_dict is True, a dictionary mapping each compound to its detailed resolution entry;
            otherwise a simple dictionary mapping each compound to its selected SMILES representation
            (an empty string when the entry has no "SMILES" key).

    Raises:
        ValueError: If compounds_list, resolvers_list, smiles_selection_mode or one of the validated
            boolean flags is invalid, if two resolvers share a name, or if batch_size is outside 1-1000.
        TypeError: If batch_size is not an integer.
    """
    # NOTE: the `[]` default is only read or rebound below, never mutated in place,
    # so sharing the default list object across calls is harmless here.
    if not resolvers_list:
        resolvers_list = [
            PubChemNameResolver("pubchem_default"),
            OpsinNameResolver("opsin_default"),
            ManualNameResolver("manual_default"),
            StructuralFormulaNameResolver("structural_formula_default"),
            InorganicShorthandNameResolver("inorganic_shorthand_default"),
        ]

    if isinstance(compounds_list, str):
        compounds_list = [compounds_list]
    if (
        not isinstance(compounds_list, list)
        or len(compounds_list) == 0
        or not all(isinstance(compound, str) for compound in compounds_list)
    ):
        raise ValueError(
            "Invalid input: compounds_list must be a string or a non-empty list of strings."
        )

    # dict.fromkeys de-duplicates while keeping first-occurrence order, so the
    # result does not depend on string hash ordering (unlike list(set(...))).
    unique_compounds_list = list(dict.fromkeys(compounds_list))
    if len(unique_compounds_list) != len(compounds_list):
        logger.info("Removing duplicate compound names from compounds_list.")
        compounds_list = unique_compounds_list

    non_empty_compounds_list = [string for string in compounds_list if string]
    if len(non_empty_compounds_list) != len(compounds_list):
        logger.info("Removing empty compound names from compounds_list.")
        compounds_list = non_empty_compounds_list

    if not isinstance(resolvers_list, list) or len(resolvers_list) == 0:
        raise ValueError(
            "Invalid input: resolvers_list must be a non-empty list of ChemicalNameResolver instances."
        )

    seen_resolvers = set()
    for resolver in resolvers_list:
        if not isinstance(resolver, ChemicalNameResolver):
            raise ValueError(
                f"Invalid resolver: {resolver} is not an instance of ChemicalNameResolver."
            )
        if resolver.resolver_name in seen_resolvers:
            raise ValueError(f"Duplicate resolver name: {resolver.resolver_name}.")
        seen_resolvers.add(resolver.resolver_name)

    if not (isinstance(smiles_selection_mode, str) or callable(smiles_selection_mode)):
        raise ValueError(
            "Invalid input: smiles_selection_mode must be a string or function."
        )

    if not isinstance(detailed_name_dict, bool):
        raise ValueError("Invalid input: detailed_name_dict must be a bool.")

    if not isinstance(batch_size, int):
        raise TypeError("Invalid input: batch_size must be an integer.")
    if batch_size <= 0 or batch_size > 1000:
        raise ValueError("Invalid input: batch_size must be an integer between 1-1000.")

    if not isinstance(split_names_to_solve, bool):
        raise ValueError("Invalid input: split_names_to_solve must be a bool.")

    if not isinstance(normalize_unicode, bool):
        raise ValueError("Invalid input: normalize_unicode must be a bool.")

    if not isinstance(internet_connection_available, bool):
        raise ValueError("Invalid input: internet_connection_available must be a bool.")

    if not internet_connection_available:
        logger.info(
            "Internet connection not available, filtering out internet-dependent resolvers."
        )
        resolvers_list = [
            resolver for resolver in resolvers_list if not resolver.requires_internet
        ]

    if normalize_unicode:
        logger.info("Normalizing unicode in compound names.")
        # Clean compound names (strip, remove/replace forbidden characters, etc.) and return a mapping dict
        cleaned_compounds_list, cleaned_compounds_dict = (
            normalize_unicode_and_return_mapping(compounds_list)
        )
    else:
        cleaned_compounds_list = compounds_list
        cleaned_compounds_dict = {compound: compound for compound in compounds_list}

    if split_names_to_solve:
        # Split compound names on delimiters, add split parts to compounds list
        # Return mapping between original compound names and split parts
        # Necessary to resolve names like H₂O•THF
        cleaned_compounds_list, delimiter_split_dict = (
            split_compounds_on_delimiters_and_return_mapping(cleaned_compounds_list)
        )

    # Resolve compounds and split compound names with resolvers
    resolvers_out_dict = resolve_compounds_using_resolvers(
        cleaned_compounds_list, resolvers_list, batch_size
    )

    # Assemble the resolution dictionary
    compounds_out_dict = assemble_compounds_resolution_dict(
        compounds_list, resolvers_out_dict, cleaned_compounds_dict
    )

    if split_names_to_solve:
        # Resolve compounds that were split with split_compounds_on_delimiters_and_return_mapping
        compounds_out_dict = assemble_split_compounds_resolution_dict(
            compounds_out_dict,
            compounds_list,
            resolvers_out_dict,
            cleaned_compounds_dict,
            delimiter_split_dict,
        )

    # Get the resolvers weight dict - needed for SMILESSelector
    resolvers_weight_dict = get_resolvers_weight_dict(resolvers_list)
    resolvers_priority_order = [resolver.resolver_name for resolver in resolvers_list]

    # Select "best" SMILES according to some criteria, add to resolution dict
    compounds_out_dict = select_smiles_with_criteria(
        compounds_out_dict,
        resolvers_weight_dict,
        resolvers_priority_order,
        smiles_selection_mode,
    )

    if attempt_name_correction:
        # Attempt to correct compound names and then attempt to resolve using the corrected names.
        corrected_names_dict = correct_names(
            compounds_out_dict, name_correction_config, resolve_peptide_shorthand
        )
        if corrected_names_dict:
            # (original name, corrected name) pairs for every usable correction.
            corrected_pairs: list[tuple[str, str]] = []
            for original_name, info in corrected_names_dict.items():
                selected = info.get("selected_name")
                if isinstance(selected, str) and selected:
                    corrected_pairs.append((original_name, selected))

            if corrected_pairs:
                selected_names = [selected for _, selected in corrected_pairs]

                # Annotated as Dict[str, Any] so the entries can be copied and
                # extended below without type-checker complaints.
                # The connectivity flag is forwarded so that, if the offline filter
                # left resolvers_list empty, the default resolvers created by the
                # nested call are filtered as well instead of assuming internet access.
                corrected_compounds_out_dict: Dict[str, Any] = (
                    resolve_compounds_to_smiles(
                        compounds_list=selected_names,
                        resolvers_list=resolvers_list,
                        smiles_selection_mode=smiles_selection_mode,
                        detailed_name_dict=True,
                        batch_size=batch_size,
                        normalize_unicode=normalize_unicode,
                        split_names_to_solve=split_names_to_solve,
                        resolve_peptide_shorthand=False,
                        attempt_name_correction=False,
                        internet_connection_available=internet_connection_available,
                    )
                )

                # ugliness to get rid of mypy error.
                copy_compounds_out_dict: Dict[str, Any] = compounds_out_dict.copy()

                for original_name, selected_name in corrected_pairs:
                    resolved = corrected_compounds_out_dict.get(selected_name)
                    if resolved:
                        # Shallow-copy the entry: several original names may be
                        # corrected to the same name, and each must keep its own
                        # "name_correction_info" rather than share one dict.
                        entry = dict(resolved)
                        entry["name_correction_info"] = corrected_names_dict[
                            original_name
                        ]
                        copy_compounds_out_dict[original_name] = entry

                compounds_out_dict = copy_compounds_out_dict

    if not detailed_name_dict:
        logger.info("Returning simplified SMILES dictionary.")
        return {k: v.get("SMILES", "") for k, v in compounds_out_dict.items()}

    return compounds_out_dict

API Reference

cholla_chem

CIRpyNameResolver

name_to_smiles(compound_name_list)

ChemNameCorrector

With custom configuration

With external validation

__init__(config=None, strategies=None)

add_strategy(strategy)

correct(name, use_validator=True, validate_all=False)

correct_batch(names, use_validator=True, validate_all=False)

explain_corrections(candidate)

get_best_candidate(name, use_validator=True)

remove_strategy(strategy_name)

ChemSpiPyResolver

name_to_smiles(compound_name_list)

ChemicalNameResolver

rate_limit_time property

requires_internet property

resolver_name property

resolver_weight property

name_to_smiles(compound_name_list) abstractmethod

CorrectorConfig dataclass

InorganicShorthandNameResolver

name_to_smiles(compound_name_list)

ManualNameResolver

name_to_smiles(compound_name_list, provided_name_dict=None)

OpsinNameResolver

name_to_smiles(compound_name_list)

PubChemNameResolver

name_to_smiles(compound_name_list)

StructuralFormulaNameResolver

name_to_smiles(compound_name_list)

`cholla_chem`

`CIRpyNameResolver`

`name_to_smiles(compound_name_list)`

`ChemNameCorrector`

`__init__(config=None, strategies=None)`

`add_strategy(strategy)`

`correct(name, use_validator=True, validate_all=False)`

`correct_batch(names, use_validator=True, validate_all=False)`

`explain_corrections(candidate)`

`get_best_candidate(name, use_validator=True)`

`remove_strategy(strategy_name)`

`ChemSpiPyResolver`

`name_to_smiles(compound_name_list)`

`ChemicalNameResolver`

`rate_limit_time` `property`

`requires_internet` `property`

`resolver_name` `property`

`resolver_weight` `property`

`name_to_smiles(compound_name_list)` `abstractmethod`

`CorrectorConfig` `dataclass`

`InorganicShorthandNameResolver`

`name_to_smiles(compound_name_list)`

`ManualNameResolver`

`name_to_smiles(compound_name_list, provided_name_dict=None)`

`OpsinNameResolver`

`name_to_smiles(compound_name_list)`

`PubChemNameResolver`

`name_to_smiles(compound_name_list)`

`StructuralFormulaNameResolver`

`name_to_smiles(compound_name_list)`