Skip to content

API Reference

cholla_chem

cholla_chem initialization.

CIRpyNameResolver

Bases: ChemicalNameResolver

Resolver using Chemical Identity Resolver via CIRPy.

Source code in cholla_chem/main.py
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
class CIRpyNameResolver(ChemicalNameResolver):
    """
    Name-to-SMILES resolver backed by the Chemical Identity Resolver,
    accessed through CIRPy.
    """

    def __init__(
        self,
        resolver_name: str,
        resolver_weight: float = 1,
        rate_limit_time: float = 10,
    ):
        """
        Register this resolver with the base class under the "cirpy" type.

        Args:
            resolver_name: Name identifying this resolver instance.
            resolver_weight: Weight forwarded to the base class.
            rate_limit_time: Rate-limit value forwarded to the base class.
        """
        super().__init__(
            resolver_type="cirpy",
            resolver_name=resolver_name,
            resolver_weight=resolver_weight,
            requires_internet=True,
            rate_limit_time=rate_limit_time,
        )

    def name_to_smiles(
        self, compound_name_list: List[str]
    ) -> Tuple[Dict[str, str], Dict[str, str]]:
        """
        Resolve chemical names to SMILES via cirpy.

        Args:
            compound_name_list: Chemical names to resolve.

        Returns:
            A tuple of the mapping produced by ``name_to_smiles_cirpy`` and
            an always-empty failure mapping.
        """
        smiles_by_name = name_to_smiles_cirpy(compound_name_list)
        failures: Dict[str, str] = {}
        return smiles_by_name, failures

name_to_smiles(compound_name_list)

Convert chemical names to SMILES using cirpy.

Source code in cholla_chem/main.py
210
211
212
213
214
215
216
217
def name_to_smiles(
    self, compound_name_list: List[str]
) -> Tuple[Dict[str, str], Dict[str, str]]:
    """
    Resolve chemical names to SMILES via cirpy.

    Args:
        compound_name_list: Chemical names to resolve.

    Returns:
        A tuple of the mapping produced by ``name_to_smiles_cirpy`` and an
        always-empty failure mapping.
    """
    smiles_by_name = name_to_smiles_cirpy(compound_name_list)
    failures: Dict[str, str] = {}
    return smiles_by_name, failures

ChemNameCorrector

Main class for correcting OCR errors in chemical names.

This class orchestrates the correction process by:

1. Applying configured correction strategies
2. Generating candidate corrections
3. Scoring candidates
4. Optionally validating with external tools
5. Returning ranked results

Example

>>> corrector = ChemNameCorrector()
>>> results = corrector.correct("2-ch1oropropanoic acid")
>>> print(results[0].name)
2-chloropropanoic acid

With custom configuration

>>> config = CorrectorConfig(max_candidates=50)
>>> corrector = ChemNameCorrector(config)

With external validation

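>>> # correct() takes no validator argument; the validator is created from the configuration.
>>> corrector = ChemNameCorrector(CorrectorConfig(enable_external_validation=True))
>>> results = corrector.correct("asprin", use_validator=True)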

Attributes:

Name Type Description
config

Configuration for the corrector

strategies

List of active correction strategies

scorer

Scoring instance for ranking candidates

Source code in cholla_chem/name_manipulation/name_correction/name_corrector.py
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
class ChemNameCorrector:
    """
    Main class for correcting OCR errors in chemical names.

    This class orchestrates the correction process by:
    1. Applying configured correction strategies
    2. Generating candidate corrections
    3. Scoring candidates
    4. Optionally validating with external tools
    5. Returning ranked results

    Example:
        >>> corrector = ChemNameCorrector()
        >>> results = corrector.correct("2-ch1oropropanoic acid")
        >>> print(results[0].name)
        2-chloropropanoic acid

        >>> # With custom configuration
        >>> config = CorrectorConfig(max_candidates=50)
        >>> corrector = ChemNameCorrector(config)

        >>> # Skipping external validation for a single call
        >>> results = corrector.correct("asprin", use_validator=False)

    Attributes:
        config: Configuration for the corrector
        strategies: List of active correction strategies
        scorer: Scoring instance for ranking candidates
        validator: External validator, or None when
            ``config.enable_external_validation`` is false
    """

    def __init__(
        self,
        config: Optional[CorrectorConfig] = None,
        strategies: Optional[List[CorrectionStrategy]] = None,
    ):
        """
        Initialize the chemical name corrector.

        Args:
            config: Configuration object (uses defaults if None)
            strategies: Custom list of strategies (uses defaults if None)
        """
        self.config = config or CorrectorConfig()
        self.scorer = ChemicalNameScorer(self.config)

        if strategies is not None:
            self.strategies = strategies
        else:
            self.strategies = self._create_default_strategies()

        self.validator = None
        if self.config.enable_external_validation:
            self.validator = OPSINValidator()

    def _create_default_strategies(self) -> List[CorrectionStrategy]:
        """Create the default set of correction strategies based on config."""
        strategies: List[CorrectionStrategy] = []

        if self.config.enable_locant_correction:
            strategies.append(LocantCorrectionStrategy())

        if self.config.enable_character_substitution:
            char_strategy = CharacterSubstitutionStrategy(
                max_edits=self.config.max_character_substitution_edits_per_morpheme
            )
            strategies.append(char_strategy)

        if self.config.enable_character_insertion:
            char_insertion_strategy = CharacterInsertionStrategy(
                max_edits=self.config.max_character_insertion_edits_per_morpheme
            )
            strategies.append(char_insertion_strategy)

        if self.config.enable_character_deletion:
            char_deletion_strategy = CharacterDeletionStrategy(
                max_edits=self.config.max_character_deletion_edits_per_morpheme
            )
            strategies.append(char_deletion_strategy)

        if self.config.enable_transposition:
            char_transposition_strategy = CharacterTranspositionStrategy(
                max_edits=self.config.max_transposition_edits_per_morpheme
            )
            strategies.append(char_transposition_strategy)

        if self.config.enable_punctuation_restoration:
            strategies.append(PunctuationRestorationStrategy())

        if self.config.enable_bracket_balancing:
            strategies.append(BracketBalancingStrategy())

        return strategies

    def add_strategy(self, strategy: CorrectionStrategy) -> None:
        """
        Add a custom correction strategy.

        Args:
            strategy: The strategy to add
        """
        self.strategies.append(strategy)

    def remove_strategy(self, strategy_name: str) -> bool:
        """
        Remove a strategy by name.

        Only the first strategy whose ``name`` matches is removed.

        Args:
            strategy_name: Name of the strategy to remove

        Returns:
            True if strategy was found and removed, False otherwise
        """
        for i, strategy in enumerate(self.strategies):
            if strategy.name == strategy_name:
                self.strategies.pop(i)
                return True
        return False

    def correct(
        self, name: str, use_validator: bool = True, validate_all: bool = False
    ) -> List[CorrectionCandidate]:
        """
        Correct a chemical name and return ranked candidates.

        Args:
            name: The chemical name to correct
            use_validator: Whether to use external validator
            validate_all: Whether to validate all candidates or just the top
                ones (forwarded to the validation step)

        Returns:
            List of CorrectionCandidate objects, sorted by score (descending)
        """
        # Generate all candidates
        candidates = self._generate_all_candidates(name)

        # Remove duplicates while preserving best corrections
        unique_candidates = self._deduplicate_candidates(candidates)

        # Score all candidates
        scored_candidates = [
            self.scorer.score(candidate) for candidate in unique_candidates
        ]

        # Filter by minimum score threshold
        filtered_candidates = [
            c for c in scored_candidates if c.score >= self.config.min_score_threshold
        ]

        # Sort by score (descending)
        sorted_candidates = sorted(
            filtered_candidates, key=lambda c: c.score, reverse=True
        )

        # Limit to max candidates
        limited_candidates = sorted_candidates[: self.config.max_candidates]

        if use_validator:
            self._validate_candidates_batch(
                {name: limited_candidates}, self.validator, validate_all
            )

            # Validation adjusts scores in place, so the ranking is rebuilt.
            limited_candidates = sorted(
                limited_candidates, key=lambda c: c.score, reverse=True
            )

        return limited_candidates

    def correct_batch(
        self, names: List[str], use_validator: bool = True, validate_all: bool = False
    ) -> Dict[str, List[CorrectionCandidate]]:
        """
        Correct multiple chemical names.

        Candidates for every name are generated first and then validated in
        a single validator call.

        Args:
            names: List of chemical names to correct
            use_validator: Whether to use external validator
            validate_all: Whether to validate all candidates or just the top
                ones (forwarded to the validation step)

        Returns:
            Dictionary mapping original names to their candidates
        """
        results = {}
        for name in names:
            results[name] = self.correct(name, use_validator=False)

        if use_validator:
            self._validate_candidates_batch(results, self.validator, validate_all)

        for name in names:
            results[name] = sorted(results[name], key=lambda c: c.score, reverse=True)

        return results

    def _generate_all_candidates(self, name: str) -> List[CorrectionCandidate]:
        """
        Generate candidates from all strategies.

        Each strategy is applied to the original name and to every text
        queued so far, so corrections from different strategies can stack.

        Args:
            name: The chemical name to generate candidates for

        Returns:
            All generated candidates (may contain duplicate names)
        """
        candidates: List[CorrectionCandidate] = []
        max_corrections = self.config.max_corrections_per_candidate
        max_queue_size = self.config.max_candidates

        names_to_process = [(name, 0)]
        # Texts already queued. The queue holds (text, count) tuples, so a
        # separate set of plain strings is needed for the membership test.
        queued_names = {name}

        for strategy in self.strategies:
            # names_to_process grows while it is iterated; entries appended
            # below are picked up by this same loop.
            for name_to_process, num_corrections in names_to_process:
                for new_text, new_corrections in strategy.generate_candidates(
                    name_to_process, num_corrections, self.config
                ):
                    if len(new_corrections) > max_corrections:
                        continue

                    candidates.append(
                        CorrectionCandidate(
                            name=new_text,
                            original_name=name,
                            corrections=new_corrections,
                        )
                    )

                    if new_text in queued_names:
                        continue
                    if len(names_to_process) >= max_queue_size:
                        continue
                    queued_names.add(new_text)
                    names_to_process.append((new_text, len(new_corrections)))

        return candidates

    def _deduplicate_candidates(
        self, candidates: List[CorrectionCandidate]
    ) -> List[CorrectionCandidate]:
        """Remove duplicate candidates, keeping the one with fewer corrections."""
        seen: Dict[str, CorrectionCandidate] = {}

        for candidate in candidates:
            kept = seen.get(candidate.name)
            # On a tie the earlier candidate is kept.
            if kept is None or candidate.num_corrections < kept.num_corrections:
                seen[candidate.name] = candidate

        return list(seen.values())

    def _validate_candidates_batch(
        self,
        candidates: Dict[str, List[CorrectionCandidate]],
        validator: Optional[Validator],
        validate_all: bool,
    ) -> None:
        """
        Validate candidates using external validator.

        Candidates are updated in place: ``validated`` and
        ``validation_result`` are set, and ``score`` is raised by 0.3
        (capped at 1.0) when valid or lowered by 0.2 (floored at 0.0)
        when invalid.

        Args:
            candidates: Mapping of original name to its candidates
            validator: Validator to use; nothing happens when it is falsy
            validate_all: Currently not consulted here; every supplied
                candidate is sent to the validator
        """
        if not validator:
            return

        # Several original names can yield the same corrected name, so each
        # name maps to every candidate object that carries it.
        candidates_by_name: Dict[str, List[CorrectionCandidate]] = {}
        for candidates_list in candidates.values():
            for candidate in candidates_list:
                candidates_by_name.setdefault(candidate.name, []).append(candidate)

        if not candidates_by_name:
            return

        validator_outputs = validator.batch_validate(list(candidates_by_name))

        for candidate_name, (is_valid, result) in validator_outputs.items():
            # Names the validator reports that were never submitted are ignored.
            for candidate in candidates_by_name.get(candidate_name, ()):
                candidate.validated = True
                candidate.validation_result = result

                if is_valid:
                    # Boost score for valid candidates
                    candidate.score = min(1.0, candidate.score + 0.3)
                else:
                    # Lower score for invalid candidates
                    candidate.score = max(0.0, candidate.score - 0.2)

    def get_best_candidate(
        self, name: str, use_validator: bool = True
    ) -> Optional[CorrectionCandidate]:
        """
        Get the single best correction candidate.

        Args:
            name: Chemical name to correct
            use_validator: Whether to use external validator

        Returns:
            Best candidate, or None if no candidates found
        """
        candidates = self.correct(name, use_validator)
        return candidates[0] if candidates else None

    def explain_corrections(self, candidate: CorrectionCandidate) -> str:
        """
        Generate a human-readable explanation of corrections.

        Args:
            candidate: The candidate to explain

        Returns:
            Multi-line string explaining all corrections
        """
        lines = [
            f"Original: {candidate.original_name}",
            f"Corrected: {candidate.name}",
            f"Score: {candidate.score:.3f}",
            f"Number of corrections: {candidate.num_corrections}",
            "",
            "Score components:",
        ]

        for component, value in candidate.score_components.items():
            lines.append(f"  - {component}: {value:.3f}")

        if candidate.corrections:
            lines.append("")
            lines.append("Corrections applied:")
            for i, correction in enumerate(candidate.corrections, 1):
                lines.append(
                    f"  {i}. [{correction.correction_type.name}] "
                    f"'{correction.original}' → '{correction.replacement}'"
                )
                if correction.description:
                    lines.append(f"     {correction.description}")

        if candidate.validated:
            lines.append("")
            lines.append(f"Validated: {candidate.validation_result or 'No result'}")

        return "\n".join(lines)

__init__(config=None, strategies=None)

Initialize the chemical name corrector.

Parameters:

Name Type Description Default
config Optional[CorrectorConfig]

Configuration object (uses defaults if None)

None
strategies Optional[List[CorrectionStrategy]]

Custom list of strategies (uses defaults if None)

None
Source code in cholla_chem/name_manipulation/name_correction/name_corrector.py
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
def __init__(
    self,
    config: Optional[CorrectorConfig] = None,
    strategies: Optional[List[CorrectionStrategy]] = None,
):
    """
    Set up the corrector's configuration, scorer, strategies and validator.

    Args:
        config: Configuration object; a default CorrectorConfig is built
            when this is falsy
        strategies: Strategies to use as given; when None the defaults are
            derived from the configuration
    """
    self.config = config or CorrectorConfig()
    self.scorer = ChemicalNameScorer(self.config)

    # An explicitly passed list (even an empty one) overrides the defaults.
    self.strategies = (
        self._create_default_strategies() if strategies is None else strategies
    )

    # The validator exists only when external validation is enabled.
    self.validator = (
        OPSINValidator() if self.config.enable_external_validation else None
    )

add_strategy(strategy)

Add a custom correction strategy.

Parameters:

Name Type Description Default
strategy CorrectionStrategy

The strategy to add

required
Source code in cholla_chem/name_manipulation/name_correction/name_corrector.py
122
123
124
125
126
127
128
129
def add_strategy(self, strategy: CorrectionStrategy) -> None:
    """
    Register an additional correction strategy after the existing ones.

    Args:
        strategy: The strategy to append to ``self.strategies``
    """
    active_strategies = self.strategies
    active_strategies.append(strategy)

correct(name, use_validator=True, validate_all=False)

Correct a chemical name and return ranked candidates.

Parameters:

Name Type Description Default
name str

The chemical name to correct

required
use_validator bool

Whether to use external validator

True
validate_all bool

Whether to validate all candidates or just the top ones

False

Returns:

Type Description
List[CorrectionCandidate]

List of CorrectionCandidate objects, sorted by score (descending)

Source code in cholla_chem/name_manipulation/name_correction/name_corrector.py
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
def correct(
    self, name: str, use_validator: bool = True, validate_all: bool = False
) -> List[CorrectionCandidate]:
    """
    Produce ranked correction candidates for one chemical name.

    The pipeline is: generate, deduplicate, score, drop candidates below
    ``config.min_score_threshold``, rank, truncate to
    ``config.max_candidates``, then optionally validate and re-rank.

    Args:
        name: The chemical name to correct
        use_validator: Whether to run the validation step
        validate_all: Forwarded to the validation step

    Returns:
        List of CorrectionCandidate objects, highest score first
    """

    def by_score(candidate):
        return candidate.score

    generated = self._generate_all_candidates(name)
    deduplicated = self._deduplicate_candidates(generated)

    threshold_passed = []
    for raw_candidate in deduplicated:
        scored = self.scorer.score(raw_candidate)
        if scored.score >= self.config.min_score_threshold:
            threshold_passed.append(scored)

    ranked = sorted(threshold_passed, key=by_score, reverse=True)
    top_candidates = ranked[: self.config.max_candidates]

    if not use_validator:
        return top_candidates

    self._validate_candidates_batch(
        {name: top_candidates}, self.validator, validate_all
    )
    # Validation may have changed scores in place, so rank again.
    return sorted(top_candidates, key=by_score, reverse=True)

correct_batch(names, use_validator=True, validate_all=False)

Correct multiple chemical names.

Parameters:

Name Type Description Default
names List[str]

List of chemical names to correct

required
use_validator bool

Whether to use external validator

True
validate_all bool

Whether to validate all candidates or just the top ones

False

Returns:

Type Description
Dict[str, List[CorrectionCandidate]]

Dictionary mapping original names to their candidates

Source code in cholla_chem/name_manipulation/name_correction/name_corrector.py
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
def correct_batch(
    self, names: List[str], use_validator: bool = True, validate_all: bool = False
) -> Dict[str, List[CorrectionCandidate]]:
    """
    Correct multiple chemical names.

    Args:
        names: List of chemical names to correct
        use_validator: Whether to use external validator
        validate_all: Whether to validate all candidates or just the top ones

    Returns:
        Dictionary mapping original names to their candidates
    """
    results = {}
    for name in names:
        results[name] = self.correct(name, use_validator=False)

    if use_validator:
        self._validate_candidates_batch(results, self.validator, validate_all)

    for name in names:
        results[name] = sorted(results[name], key=lambda c: c.score, reverse=True)

    return results

explain_corrections(candidate)

Generate a human-readable explanation of corrections.

Parameters:

Name Type Description Default
candidate CorrectionCandidate

The candidate to explain

required

Returns:

Type Description
str

Multi-line string explaining all corrections

Source code in cholla_chem/name_manipulation/name_correction/name_corrector.py
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
def explain_corrections(self, candidate: CorrectionCandidate) -> str:
    """
    Build a plain-text report describing a correction candidate.

    The report lists the original and corrected names, the score and its
    components, each applied correction (when there are any), and the
    validation result (when the candidate has been validated).

    Args:
        candidate: The candidate to explain

    Returns:
        The report as a newline-joined string
    """
    report = [
        f"Original: {candidate.original_name}",
        f"Corrected: {candidate.name}",
        f"Score: {candidate.score:.3f}",
        f"Number of corrections: {candidate.num_corrections}",
        "",
        "Score components:",
    ]
    report.extend(
        f"  - {label}: {amount:.3f}"
        for label, amount in candidate.score_components.items()
    )

    if candidate.corrections:
        report += ["", "Corrections applied:"]
        for position, fix in enumerate(candidate.corrections, start=1):
            report.append(
                f"  {position}. [{fix.correction_type.name}] "
                f"'{fix.original}' → '{fix.replacement}'"
            )
            if fix.description:
                report.append(f"     {fix.description}")

    if candidate.validated:
        outcome = candidate.validation_result or "No result"
        report += ["", f"Validated: {outcome}"]

    return "\n".join(report)

get_best_candidate(name, use_validator=True)

Get the single best correction candidate.

Parameters:

Name Type Description Default
name str

Chemical name to correct

required
use_validator bool

Whether to use external validator

True

Returns:

Type Description
Optional[CorrectionCandidate]

Best candidate, or None if no candidates found

Source code in cholla_chem/name_manipulation/name_correction/name_corrector.py
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
def get_best_candidate(
    self, name: str, use_validator: bool = True
) -> Optional[CorrectionCandidate]:
    """
    Return the top-ranked correction candidate for a name.

    Args:
        name: Chemical name to correct
        use_validator: Whether to use external validator; passed on to
            ``correct``

    Returns:
        The first candidate returned by ``correct``, or None when it
        returns no candidates
    """
    ranked = self.correct(name, use_validator)
    if not ranked:
        return None
    return ranked[0]

remove_strategy(strategy_name)

Remove a strategy by name.

Parameters:

Name Type Description Default
strategy_name str

Name of the strategy to remove

required

Returns:

Type Description
bool

True if strategy was found and removed, False otherwise

Source code in cholla_chem/name_manipulation/name_correction/name_corrector.py
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
def remove_strategy(self, strategy_name: str) -> bool:
    """
    Drop the first registered strategy with the given name.

    Args:
        strategy_name: Name of the strategy to remove

    Returns:
        True if a matching strategy was found and removed, False otherwise
    """
    match_index = next(
        (
            index
            for index, registered in enumerate(self.strategies)
            if registered.name == strategy_name
        ),
        None,
    )
    if match_index is None:
        return False

    del self.strategies[match_index]
    return True

ChemSpiPyResolver

Bases: ChemicalNameResolver

Resolver using chemspipy.

Source code in cholla_chem/main.py
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
class ChemSpiPyResolver(ChemicalNameResolver):
    """
    Name-to-SMILES resolver backed by chemspipy.
    """

    def __init__(
        self,
        resolver_name: str,
        chemspider_api_key: str,
        resolver_weight: float = 3,
        rate_limit_time: float = 10,
    ):
        """
        Register this resolver under the "chemspipy" type and store the key.

        Args:
            resolver_name: Name identifying this resolver instance.
            chemspider_api_key: API key handed to ``name_to_smiles_chemspipy``.
            resolver_weight: Weight forwarded to the base class.
            rate_limit_time: Rate-limit value forwarded to the base class.

        Raises:
            TypeError: If a truthy ``chemspider_api_key`` is not a string.
        """
        super().__init__(
            resolver_type="chemspipy",
            resolver_name=resolver_name,
            resolver_weight=resolver_weight,
            requires_internet=True,
            rate_limit_time=rate_limit_time,
        )
        # Only truthy keys are type-checked; a falsy key is stored as given.
        if chemspider_api_key and not isinstance(chemspider_api_key, str):
            raise TypeError("Invalid input: chemspider_api_key must be a string.")
        self._chemspider_api_key = chemspider_api_key
        self._requires_internet = True

    def name_to_smiles(
        self,
        compound_name_list: List[str],
    ) -> Tuple[Dict[str, str], Dict[str, str]]:
        """
        Resolve chemical names to SMILES via ChemSpiPy.

        Args:
            compound_name_list: Chemical names to resolve.

        Returns:
            A tuple of the mapping produced by ``name_to_smiles_chemspipy``
            and an always-empty failure mapping.
        """
        api_key = self._chemspider_api_key
        smiles_by_name = name_to_smiles_chemspipy(compound_name_list, api_key)
        failures: Dict[str, str] = {}
        return smiles_by_name, failures

name_to_smiles(compound_name_list)

Convert chemical names to SMILES using ChemSpiPy.

Source code in cholla_chem/main.py
245
246
247
248
249
250
251
252
253
254
255
def name_to_smiles(
    self,
    compound_name_list: List[str],
) -> Tuple[Dict[str, str], Dict[str, str]]:
    """
    Resolve chemical names to SMILES via ChemSpiPy.

    Args:
        compound_name_list: Chemical names to resolve.

    Returns:
        A tuple of the mapping produced by ``name_to_smiles_chemspipy`` and
        an always-empty failure mapping.
    """
    api_key = self._chemspider_api_key
    smiles_by_name = name_to_smiles_chemspipy(compound_name_list, api_key)
    failures: Dict[str, str] = {}
    return smiles_by_name, failures

ChemicalNameResolver

Bases: ABC

Abstract base class for chemical name-to-SMILES resolvers.

Subclasses must implement the name_to_smiles method.

Source code in cholla_chem/main.py
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
class ChemicalNameResolver(ABC):
    """
    Abstract base class for chemical name-to-SMILES resolvers.

    Subclasses must implement the `name_to_smiles` method.
    """

    def __init__(
        self,
        resolver_type: str,
        resolver_name: str,
        resolver_weight: float,
        requires_internet: bool = False,
        rate_limit_time: Optional[float] = None,
    ):
        """
        Validate and store the resolver's settings.

        Args:
            resolver_type: Identifier of the resolver implementation.
            resolver_name: Name of this resolver instance.
            resolver_weight: Number in the range 0-1000; stored as a float.
            requires_internet: Whether the resolver needs network access.
            rate_limit_time: Optional rate-limit value, stored as given.

        Raises:
            TypeError: If ``resolver_type`` or ``resolver_name`` is not a
                string, or ``resolver_weight`` is not an int or float.
            ValueError: If ``resolver_weight`` is outside 0-1000 or is NaN.
        """
        if not isinstance(resolver_type, str):
            raise TypeError("Invalid input: resolver_type must be a string.")
        self._resolver_type: str = resolver_type
        if not isinstance(resolver_name, str):
            raise TypeError("Invalid input: resolver_name must be a string.")
        self._resolver_name: str = resolver_name
        if not isinstance(resolver_weight, (int, float)):
            raise TypeError(
                "Invalid input: resolver_weight must be a number between 0-1000."
            )
        # Negated chained comparison: NaN compares false to everything, so
        # it is rejected here instead of slipping past two separate checks.
        if not 0 <= resolver_weight <= 1000:
            raise ValueError(
                "Invalid input: resolver_weight must be a number between 0-1000."
            )
        self._resolver_weight: float = float(resolver_weight)
        self._requires_internet: bool = requires_internet
        self._rate_limit_time: Optional[float] = rate_limit_time

    @property
    def resolver_type(self) -> str:
        """Return resolver_type."""
        return self._resolver_type

    @property
    def resolver_name(self) -> str:
        """Return resolver_name."""
        return self._resolver_name

    @property
    def resolver_weight(self) -> float:
        """Return resolver_weight."""
        return self._resolver_weight

    @property
    def requires_internet(self) -> bool:
        """Return requires_internet."""
        return self._requires_internet

    @property
    def rate_limit_time(self) -> Optional[float]:
        """Return rate_limit_time."""
        return self._rate_limit_time

    @abstractmethod
    def name_to_smiles(
        self, compound_name_list: List[str]
    ) -> Tuple[Dict[str, str], Dict[str, str]]:
        """
        Convert chemical names to SMILES strings.

        Args:
            compound_name_list: List of chemical names.

        Returns:
            Tuple of:
                - Dict mapping successful names to SMILES.
                - Dict mapping failed names to error messages.
        """
        pass

rate_limit_time property

Return rate_limit_time.

requires_internet property

Return requires_internet.

resolver_name property

Return resolver_name.

resolver_weight property

Return resolver_weight.

name_to_smiles(compound_name_list) abstractmethod

Convert chemical names to SMILES strings.

Parameters:

Name Type Description Default
compound_name_list List[str]

List of chemical names.

required

Returns:

Type Description
Tuple[Dict[str, str], Dict[str, str]]

Tuple of:

- Dict mapping successful names to SMILES.
- Dict mapping failed names to error messages.

Source code in cholla_chem/main.py
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
@abstractmethod
def name_to_smiles(
    self, compound_name_list: List[str]
) -> Tuple[Dict[str, str], Dict[str, str]]:
    """
    Resolve chemical names to SMILES strings (implemented by subclasses).

    Args:
        compound_name_list: List of chemical names.

    Returns:
        Tuple of:
            - Dict mapping successful names to SMILES.
            - Dict mapping failed names to error messages.
    """

CorrectorConfig dataclass

Configuration for the ChemNameCorrector.

Attributes:

Name Type Description
max_candidates int

Maximum number of candidates to generate

max_corrections_per_candidate int

Maximum corrections per candidate

min_score_threshold float

Minimum score to include candidate in results

enable_character_substitution bool

Enable OCR character correction

max_character_substitution_edits_per_morpheme int

Max number of substitution edits

enable_punctuation_restoration bool

Enable missing punctuation detection

enable_bracket_balancing bool

Enable bracket matching correction

custom_substitutions Dict[str, List[str]]

Additional user-defined substitution rules

custom_rules List[CorrectionRule]

Additional user-defined correction rules

enable_external_validation bool

Enable external validation of candidates

Source code in cholla_chem/name_manipulation/name_correction/dataclasses.py
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
@dataclass
class CorrectorConfig:
    """
    Configuration for the ChemNameCorrector.

    Attributes:
        max_candidates: Maximum number of candidates to generate
        max_corrections_per_candidate: Maximum corrections per candidate
        min_score_threshold: Minimum score to include candidate in results
        enable_locant_correction: Enable locant correction
        enable_character_substitution: Enable OCR character correction
        max_character_substitution_edits_per_morpheme: Max number of
            substitution edits per morpheme
        enable_character_insertion: Enable character insertion correction
        max_character_insertion_edits_per_morpheme: Max number of insertion
            edits per morpheme
        enable_character_deletion: Enable character deletion correction
        max_character_deletion_edits_per_morpheme: Max number of deletion
            edits per morpheme
        enable_transposition: Enable character transposition correction
        max_transposition_edits_per_morpheme: Max number of transposition
            edits per morpheme
        enable_punctuation_restoration: Enable missing punctuation detection
        enable_bracket_balancing: Enable bracket matching correction
        custom_substitutions: Additional user-defined substitution rules
        custom_rules: Additional user-defined correction rules
        enable_external_validation: Enable external validation of candidates
    """

    # General limits and thresholds.
    max_candidates: int = 100
    max_corrections_per_candidate: int = 3
    min_score_threshold: float = 0.1
    enable_locant_correction: bool = True

    # Character-level strategies: each has an on/off switch and an edit limit.
    enable_character_substitution: bool = True
    max_character_substitution_edits_per_morpheme: int = 1

    enable_character_insertion: bool = True
    max_character_insertion_edits_per_morpheme: int = 1

    enable_character_deletion: bool = True
    max_character_deletion_edits_per_morpheme: int = 1

    enable_transposition: bool = True
    max_transposition_edits_per_morpheme: int = 1

    # These two strategies are off by default.
    enable_punctuation_restoration: bool = False
    enable_bracket_balancing: bool = False
    # default_factory gives every instance its own container.
    custom_substitutions: Dict[str, List[str]] = field(default_factory=dict)
    custom_rules: List[CorrectionRule] = field(default_factory=list)
    enable_external_validation: bool = True

InorganicShorthandNameResolver

Bases: ChemicalNameResolver

Resolver using inorganic shorthand (e.g. [Cp*RhCl2]2).

Source code in cholla_chem/main.py
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
class InorganicShorthandNameResolver(ChemicalNameResolver):
    """
    Name resolver that interprets inorganic shorthand (e.g. [Cp*RhCl2]2).
    """

    def __init__(
        self,
        resolver_name: str,
        resolver_weight: float = 2,
    ):
        # Registered as not requiring internet and without a rate limit.
        super().__init__(
            "inorganic_shorthand",
            resolver_name,
            resolver_weight,
            requires_internet=False,
            rate_limit_time=None,
        )
        self._requires_internet = False

    def name_to_smiles(
        self, compound_name_list: List[str]
    ) -> Tuple[Dict[str, str], Dict[str, str]]:
        """
        Resolve inorganic shorthand names to SMILES.

        Returns the converter's result paired with an empty failure dict.
        """
        return name_to_smiles_inorganic_shorthand(compound_name_list), {}

name_to_smiles(compound_name_list)

Convert chemical names to SMILES using inorganic shorthand converter.

Source code in cholla_chem/main.py
352
353
354
355
356
357
358
359
def name_to_smiles(
    self, compound_name_list: List[str]
) -> Tuple[Dict[str, str], Dict[str, str]]:
    """
    Resolve inorganic shorthand names to SMILES.

    Returns the converter's result paired with an empty failure dict.
    """
    return name_to_smiles_inorganic_shorthand(compound_name_list), {}

ManualNameResolver

Bases: ChemicalNameResolver

Resolver using manually curated names and corresponding SMILES.

Source code in cholla_chem/main.py
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
class ManualNameResolver(ChemicalNameResolver):
    """
    Resolver using manually curated names and corresponding SMILES.
    """

    def __init__(
        self,
        resolver_name: str,
        provided_name_dict: dict | None = None,
        resolver_weight: float = 10,
    ):
        """
        Create a manual resolver, optionally with a user-supplied dictionary.

        Args:
            resolver_name: Name identifying this resolver instance.
            provided_name_dict: Optional dictionary with string keys and
                string values (presumably compound name -> SMILES; confirm
                against name_to_smiles_manual). None means no dictionary is
                supplied.
            resolver_weight: Weight forwarded to the base class.

        Raises:
            TypeError: If provided_name_dict is neither None nor a dict.
            ValueError: If any key or value of provided_name_dict is not a
                string.
        """
        super().__init__(
            "manual",
            resolver_name,
            resolver_weight,
            requires_internet=False,
            rate_limit_time=None,
        )
        # Compare against None rather than relying on truthiness: a falsy
        # non-dict argument (e.g. [] or "") must be rejected, not stored.
        if provided_name_dict is not None:
            if not isinstance(provided_name_dict, dict):
                raise TypeError(
                    "Invalid input: provided_name_dict must be a dictionary."
                )
            for k, v in provided_name_dict.items():
                if not isinstance(k, str) or not isinstance(v, str):
                    raise ValueError(
                        "Invalid input: keys and values in provided_name_dict must be strings."
                    )

        self._provided_name_dict = provided_name_dict
        self._requires_internet = False

    def name_to_smiles(
        self,
        compound_name_list: List[str],
        provided_name_dict: Dict[str, str] | None = None,
    ) -> Tuple[Dict[str, str], Dict[str, str]]:
        """
        Convert chemical names to SMILES using manual name database.

        Args:
            compound_name_list: Chemical names to resolve.
            provided_name_dict: Optional per-call dictionary. When None, the
                dictionary given at construction time is used instead.

        Returns:
            Tuple of the resolved-names dict and an empty failure dict.
        """
        if provided_name_dict is None:
            provided_name_dict = self._provided_name_dict
        resolved_names = name_to_smiles_manual(compound_name_list, provided_name_dict)
        return resolved_names, {}

name_to_smiles(compound_name_list, provided_name_dict=None)

Convert chemical names to SMILES using manual name database.

Source code in cholla_chem/main.py
290
291
292
293
294
295
296
297
298
299
300
301
def name_to_smiles(
    self,
    compound_name_list: List[str],
    provided_name_dict: Dict[str, str] | None = None,
) -> Tuple[Dict[str, str], Dict[str, str]]:
    """
    Resolve names to SMILES through the manual name database.

    A dictionary passed to this call takes precedence; when none is given,
    the one stored on the instance is used. The second element of the
    returned tuple is always an empty failure dict.
    """
    name_dict = (
        provided_name_dict
        if provided_name_dict is not None
        else self._provided_name_dict
    )
    return name_to_smiles_manual(compound_name_list, name_dict), {}

OpsinNameResolver

Bases: ChemicalNameResolver

Resolver using OPSIN via py2opsin.

Source code in cholla_chem/main.py
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
class OpsinNameResolver(ChemicalNameResolver):
    """
    Name resolver backed by OPSIN (through py2opsin).
    """

    def __init__(
        self,
        resolver_name: str,
        resolver_weight: float = 3,
        allow_acid: bool = False,
        allow_radicals: bool = True,
        allow_bad_stereo: bool = False,
        wildcard_radicals: bool = False,
        jar_fpath: str = "opsin-cli.jar",
    ):
        super().__init__(
            "opsin",
            resolver_name,
            resolver_weight,
            rate_limit_time=None,
        )
        # OPSIN options, kept so name_to_smiles can forward them on each call.
        self._allow_acid = allow_acid
        self._allow_radicals = allow_radicals
        self._allow_bad_stereo = allow_bad_stereo
        self._wildcard_radicals = wildcard_radicals
        # NOTE(review): stored but not forwarded by name_to_smiles in this
        # class - confirm whether that is intended.
        self._jar_fpath = jar_fpath

    def name_to_smiles(
        self, compound_name_list: List[str]
    ) -> Tuple[Dict[str, str], Dict[str, str]]:
        """
        Resolve chemical names to SMILES with OPSIN.

        Returns the resolved-names dict and the failure-message dict
        produced by name_to_smiles_opsin.
        """
        opsin_options = {
            "allow_acid": self._allow_acid,
            "allow_radicals": self._allow_radicals,
            "allow_bad_stereo": self._allow_bad_stereo,
            "wildcard_radicals": self._wildcard_radicals,
        }
        smiles_by_name, failures_by_name = name_to_smiles_opsin(
            compound_name_list, **opsin_options
        )
        return smiles_by_name, failures_by_name

name_to_smiles(compound_name_list)

Convert chemical names to SMILES using OPSIN.

Source code in cholla_chem/main.py
146
147
148
149
150
151
152
153
154
155
156
157
158
159
def name_to_smiles(
    self, compound_name_list: List[str]
) -> Tuple[Dict[str, str], Dict[str, str]]:
    """
    Resolve chemical names to SMILES with OPSIN.

    Returns the resolved-names dict and the failure-message dict produced
    by name_to_smiles_opsin.
    """
    opsin_options = {
        "allow_acid": self._allow_acid,
        "allow_radicals": self._allow_radicals,
        "allow_bad_stereo": self._allow_bad_stereo,
        "wildcard_radicals": self._wildcard_radicals,
    }
    smiles_by_name, failures_by_name = name_to_smiles_opsin(
        compound_name_list, **opsin_options
    )
    return smiles_by_name, failures_by_name

PubChemNameResolver

Bases: ChemicalNameResolver

Resolver using PubChem via PubChemPy.

Source code in cholla_chem/main.py
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
class PubChemNameResolver(ChemicalNameResolver):
    """
    Name resolver backed by PubChem (through PubChemPy).
    """

    def __init__(
        self,
        resolver_name: str,
        resolver_weight: float = 2,
        rate_limit_time: float = 10,
    ):
        # Registered as internet-dependent; the rate limit is passed through.
        super().__init__(
            "pubchem",
            resolver_name,
            resolver_weight,
            requires_internet=True,
            rate_limit_time=rate_limit_time,
        )

    def name_to_smiles(
        self, compound_name_list: List[str]
    ) -> Tuple[Dict[str, str], Dict[str, str]]:
        """
        Resolve chemical names to SMILES via PubChem.

        Returns the lookup result paired with an empty failure dict.
        """
        return name_to_smiles_pubchem(compound_name_list), {}

name_to_smiles(compound_name_list)

Convert chemical names to SMILES using pubchem.

Source code in cholla_chem/main.py
181
182
183
184
185
186
187
188
def name_to_smiles(
    self, compound_name_list: List[str]
) -> Tuple[Dict[str, str], Dict[str, str]]:
    """
    Resolve chemical names to SMILES via PubChem.

    Returns the lookup result paired with an empty failure dict.
    """
    return name_to_smiles_pubchem(compound_name_list), {}

StructuralFormulaNameResolver

Bases: ChemicalNameResolver

Resolver using structural chemical formula (e.g. CH3CH2CH2COOH).

Source code in cholla_chem/main.py
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
class StructuralFormulaNameResolver(ChemicalNameResolver):
    """
    Name resolver that interprets structural formulas (e.g. CH3CH2CH2COOH).
    """

    def __init__(
        self,
        resolver_name: str,
        resolver_weight: float = 2,
    ):
        # Registered as not requiring internet and without a rate limit.
        super().__init__(
            "structural_formula",
            resolver_name,
            resolver_weight,
            requires_internet=False,
            rate_limit_time=None,
        )
        self._requires_internet = False

    def name_to_smiles(
        self, compound_name_list: List[str]
    ) -> Tuple[Dict[str, str], Dict[str, str]]:
        """
        Resolve structural-formula names to SMILES.

        Returns the converter's result paired with an empty failure dict.
        """
        return name_to_smiles_structural_formula(compound_name_list), {}

name_to_smiles(compound_name_list)

Convert chemical names to SMILES using structural formula converter.

Source code in cholla_chem/main.py
323
324
325
326
327
328
329
330
def name_to_smiles(
    self, compound_name_list: List[str]
) -> Tuple[Dict[str, str], Dict[str, str]]:
    """
    Resolve structural-formula names to SMILES.

    Returns the converter's result paired with an empty failure dict.
    """
    return name_to_smiles_structural_formula(compound_name_list), {}

resolve_compounds_to_smiles(compounds_list, resolvers_list=[], smiles_selection_mode='weighted', detailed_name_dict=False, batch_size=500, normalize_unicode=True, split_names_to_solve=True, resolve_peptide_shorthand=True, attempt_name_correction=True, internet_connection_available=True, name_correction_config=None)

Resolve a list of compound names to their SMILES representations.

Parameters:

Name Type Description Default
compounds_list List[str]

A list of compound names.

required
resolvers_list List[ChemicalNameResolver]

A list of ChemicalNameResolver instances. Defaults to [].

[]
smiles_selection_mode str

The method to select the SMILES representation from multiple resolvers. Defaults to 'weighted'.

'weighted'
detailed_name_dict bool

If True, returns a dictionary with detailed information about each compound. Defaults to False.

False
batch_size int

The number of compounds to process in each batch. Defaults to 500.

500
normalize_unicode bool

Whether to normalize Unicode characters in compound names. Defaults to True.

True
split_names_to_solve bool

Whether to split compound names on common delimiters to solve them as separate compounds. Can be used to solve otherwise unresolvable compound names such as BH3•THF. Defaults to True.

True
resolve_peptide_shorthand bool

Whether to resolve peptide shorthand notation. Defaults to True.

True
attempt_name_correction bool

Whether to attempt to correct compound names that are misspelled or contain typos. Defaults to True.

True
internet_connection_available bool

Whether an internet connection is available to resolve compound names. Defaults to True.

True
name_correction_config CorrectorConfig

Configuration for name correction. Defaults to None.

None

Returns:

Type Description
Dict[str, CompoundResolutionEntry] | Dict[str, CompoundResolutionEntryWithNameCorrection] | Dict[str, str]

Dict[str, Dict[str, Dict[str, List[str]]]] | Dict[str, str]: A dictionary mapping each compound to its SMILES representation and resolvers, or a simple dictionary mapping each compound to its selected SMILES representation.

Source code in cholla_chem/main.py
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
def resolve_compounds_to_smiles(
    compounds_list: List[str],
    resolvers_list: List[ChemicalNameResolver] = [],
    smiles_selection_mode: str = "weighted",
    detailed_name_dict: bool = False,
    batch_size: int = 500,
    normalize_unicode: bool = True,
    split_names_to_solve: bool = True,
    resolve_peptide_shorthand: bool = True,
    attempt_name_correction: bool = True,
    internet_connection_available: bool = True,
    name_correction_config: Optional[CorrectorConfig] = None,
) -> (
    Dict[str, CompoundResolutionEntry]
    | Dict[str, CompoundResolutionEntryWithNameCorrection]
    | Dict[str, str]
):
    """
    Resolve a list of compound names to their SMILES representations.

    Duplicate and empty names are removed from the input before resolution;
    the first-seen order of the remaining names is preserved.

    Args:
        compounds_list (List[str]): A list of compound names. A single string is also accepted
            and is treated as a one-element list.
        resolvers_list (List[ChemicalNameResolver], optional): A list of ChemicalNameResolver instances.
            Defaults to []; when empty, a default set of PubChem, OPSIN, manual, structural formula
            and inorganic shorthand resolvers is used.
        smiles_selection_mode (str, optional): The method to select the SMILES representation from multiple resolvers.
            A callable is also accepted. Defaults to 'weighted'.
        detailed_name_dict (bool, optional): If True, returns a dictionary with detailed information about each compound.
            Defaults to False.
        batch_size (int, optional): The number of compounds to process in each batch (1-1000). Defaults to 500.
        normalize_unicode (bool, optional): Whether to normalize Unicode characters in compound names. Defaults to True.
        split_names_to_solve (bool, optional): Whether to split compound names on common delimiters to solve them as separate compounds.
            Can be used to solve otherwise unresolvable compound names such as BH3•THF. Defaults to True.
        resolve_peptide_shorthand (bool, optional): Whether to resolve peptide shorthand notation. Defaults to True.
        attempt_name_correction (bool, optional): Whether to attempt to correct compound names that are misspelled or contain typos.
            Defaults to True.
        internet_connection_available (bool, optional): Whether an internet connection is available to resolve compound names.
            When False, resolvers that require internet are filtered out. Defaults to True.
        name_correction_config (CorrectorConfig, optional): Configuration for name correction. Defaults to None.

    Returns:
        Dict[str, Dict[str, Dict[str, List[str]]]] | Dict[str, str]: A dictionary mapping each compound to its SMILES representation and resolvers, or a simple dictionary mapping each compound to its selected SMILES representation.

    Raises:
        ValueError: If compounds_list, resolvers_list, smiles_selection_mode or one of the validated
            boolean flags is invalid, if two resolvers share a name, or if batch_size is outside 1-1000.
        TypeError: If batch_size is not an integer.
    """
    # The default list is only ever rebound here, never mutated in place.
    if not resolvers_list:
        resolvers_list = [
            PubChemNameResolver("pubchem_default"),
            OpsinNameResolver("opsin_default"),
            ManualNameResolver("manual_default"),
            StructuralFormulaNameResolver("structural_formula_default"),
            InorganicShorthandNameResolver("inorganic_shorthand_default"),
        ]

    if isinstance(compounds_list, str):
        compounds_list = [compounds_list]
    if not isinstance(compounds_list, list):
        raise ValueError(
            "Invalid input: compounds_list must be a string or a non-empty list of strings."
        )
    if isinstance(compounds_list, list):
        if len(compounds_list) == 0:
            raise ValueError(
                "Invalid input: compounds_list must be a string or a non-empty list of strings."
            )
        for compound in compounds_list:
            if not isinstance(compound, str):
                raise ValueError(
                    "Invalid input: compounds_list must be a string or a non-empty list of strings."
                )
    if len(compounds_list) != len(set(compounds_list)):
        logger.info("Removing duplicate compound names from compounds_list.")
        # dict.fromkeys keeps first-seen order, so results are deterministic
        # (a plain set() would reorder names arbitrarily).
        compounds_list = list(dict.fromkeys(compounds_list))

    non_empty_compounds_list = [string for string in compounds_list if string]
    if len(non_empty_compounds_list) != len(compounds_list):
        logger.info("Removing empty compound names from compounds_list.")
        compounds_list = non_empty_compounds_list

    if not isinstance(resolvers_list, list) or len(resolvers_list) == 0:
        raise ValueError(
            "Invalid input: resolvers_list must be a non-empty list of ChemicalNameResolver instances."
        )

    seen_resolvers = []
    for resolver in resolvers_list:
        if not isinstance(resolver, ChemicalNameResolver):
            raise ValueError(
                f"Invalid resolver: {resolver} is not an instance of ChemicalNameResolver."
            )
        if resolver.resolver_name in seen_resolvers:
            raise ValueError(f"Duplicate resolver name: {resolver.resolver_name}.")
        seen_resolvers.append(resolver.resolver_name)

    if not (isinstance(smiles_selection_mode, str) or callable(smiles_selection_mode)):
        raise ValueError(
            "Invalid input: smiles_selection_mode must be a string or function."
        )

    if not isinstance(detailed_name_dict, bool):
        raise ValueError("Invalid input: detailed_name_dict must be a bool.")

    if not isinstance(batch_size, int):
        raise TypeError("Invalid input: batch_size must be an integer.")
    if batch_size <= 0 or batch_size > 1000:
        raise ValueError("Invalid input: batch_size must be an integer between 1-1000.")

    if not isinstance(split_names_to_solve, bool):
        raise ValueError("Invalid input: split_names_to_solve must be a bool.")

    if not isinstance(normalize_unicode, bool):
        raise ValueError("Invalid input: normalize_unicode must be a bool.")

    if not isinstance(internet_connection_available, bool):
        raise ValueError("Invalid input: internet_connection_available must be a bool.")

    if not internet_connection_available:
        logger.info(
            "Internet connection not available, filtering out internet-dependent resolvers."
        )
        resolvers_list = [
            resolver for resolver in resolvers_list if not resolver.requires_internet
        ]

    if normalize_unicode:
        logger.info("Normalizing unicode in compound names.")
        # Clean compound names (strip, remove/replace forbidden characters, etc.) and return a mapping dict
        cleaned_compounds_list, cleaned_compounds_dict = (
            normalize_unicode_and_return_mapping(compounds_list)
        )
    else:
        cleaned_compounds_list = compounds_list
        cleaned_compounds_dict = {compound: compound for compound in compounds_list}

    if split_names_to_solve:
        # Split compound names on delimiters, add split parts to compounds list
        # Return mapping between original compound names and split parts
        # Necessary to resolve names like H₂O•THF
        cleaned_compounds_list, delimiter_split_dict = (
            split_compounds_on_delimiters_and_return_mapping(cleaned_compounds_list)
        )

    # Resolve compounds and split compound names with resolvers
    resolvers_out_dict = resolve_compounds_using_resolvers(
        cleaned_compounds_list, resolvers_list, batch_size
    )

    # Assemble the resolution dictionary
    compounds_out_dict = assemble_compounds_resolution_dict(
        compounds_list, resolvers_out_dict, cleaned_compounds_dict
    )

    if split_names_to_solve:
        # Resolve compounds that were split with split_compounds_on_delimiters_and_return_mapping
        compounds_out_dict = assemble_split_compounds_resolution_dict(
            compounds_out_dict,
            compounds_list,
            resolvers_out_dict,
            cleaned_compounds_dict,
            delimiter_split_dict,
        )

    # Get the resolvers weight dict - needed for SMILESSelector
    resolvers_weight_dict = get_resolvers_weight_dict(resolvers_list)
    resolvers_priority_order = [resolver.resolver_name for resolver in resolvers_list]

    # Select "best" SMILES according to some criteria, add to resolution dict
    compounds_out_dict = select_smiles_with_criteria(
        compounds_out_dict,
        resolvers_weight_dict,
        resolvers_priority_order,
        smiles_selection_mode,
    )

    if attempt_name_correction:
        # Attempt to correct compound names and then attempt to resolve using the corrected names.
        corrected_names_dict = correct_names(
            compounds_out_dict, name_correction_config, resolve_peptide_shorthand
        )
        if corrected_names_dict:
            corrected_pairs: list[tuple[str, str]] = []
            for original_name, info in corrected_names_dict.items():
                selected = info.get("selected_name")
                if isinstance(selected, str) and selected:
                    corrected_pairs.append((original_name, selected))

            if corrected_pairs:
                selected_names = [selected for _, selected in corrected_pairs]

                # Forward internet_connection_available: without it the nested
                # call assumes internet access and, if the filtered resolver
                # list is empty, would fall back to defaults that include
                # internet-dependent resolvers.
                corrected_compounds_out_dict: Dict[str, Any] = (
                    resolve_compounds_to_smiles(
                        compounds_list=selected_names,
                        resolvers_list=resolvers_list,
                        smiles_selection_mode=smiles_selection_mode,
                        detailed_name_dict=True,
                        batch_size=batch_size,
                        normalize_unicode=normalize_unicode,
                        split_names_to_solve=split_names_to_solve,
                        resolve_peptide_shorthand=False,
                        attempt_name_correction=False,
                        internet_connection_available=internet_connection_available,
                    )
                )

                # ugliness to get rid of mypy error.
                copy_compounds_out_dict: Dict[str, Any] = compounds_out_dict.copy()

                for original_name, selected_name in corrected_pairs:
                    resolved = corrected_compounds_out_dict.get(selected_name)
                    if resolved:
                        # Copy the entry: several original names may be
                        # corrected to the same name, and each needs its own
                        # "name_correction_info" rather than a shared dict.
                        entry: Dict[str, Any] = dict(resolved)
                        entry["name_correction_info"] = corrected_names_dict[
                            original_name
                        ]
                        copy_compounds_out_dict[original_name] = entry

                compounds_out_dict = copy_compounds_out_dict

    if not detailed_name_dict:
        logger.info("Returning simplified SMILES dictionary.")
        return {k: v.get("SMILES", "") for k, v in compounds_out_dict.items()}

    return compounds_out_dict