Source code for moldenViz.parser

"""Read and parse a molden file."""

import logging
from dataclasses import dataclass
from pathlib import Path
from typing import Any

import numpy as np
from numpy.typing import NDArray
from scipy.special import gamma

logger = logging.getLogger(__name__)


@dataclass
class _Atom:
    """Represents an atom with its properties and associated shells.

    Parameters
    ----------
    label : str
        The atomic label (e.g., 'C', 'O', 'H').
    atomic_number : int
        The atomic number of the element.
    position : NDArray[np.floating]
        The 3D position coordinates of the atom.
    shells : list[_Shell]
        List of electron shells associated with this atom.
    """

    label: str
    atomic_number: int
    position: NDArray[np.floating]
    shells: list['_Shell']


@dataclass
class _MolecularOrbital:
    """Represents a molecular orbital with its properties.

    Parameters
    ----------
    sym : str
        The symmetry label of the molecular orbital.
    energy : float
        The energy of the molecular orbital.
    spin : str
        The spin state of the molecular orbital ('Alpha' or 'Beta').
    occ : int
        The occupation number of the molecular orbital.
    """

    sym: str
    energy: float
    spin: str
    occ: int


class _GTO:
    """Represents a Gaussian-type orbital (GTO) primitive.

    Parameters
    ----------
    exp : float
        The exponent of the Gaussian function.
    coeff : float
        The coefficient of the Gaussian function.
    """

    def __init__(self, exp: float, coeff: float) -> None:
        """Initialize a GTO primitive with exponent and coefficient."""
        self.exp = exp
        self.coeff = coeff

        self.norm = 0.0

    def normalize(self, l: int) -> None:
        """Normalize the GTO primitive.

        Parameters
        ----------
        l : int
            The angular momentum quantum number.

        Notes
        -----
        Uses the normalization factor from Jiyun Kuang and C D Lin 1997
        J. Phys. B: At. Mol. Opt. Phys. 30 2529, page 2532.
        """
        # See (Jiyun Kuang and C D Lin 1997 J. Phys. B: At. Mol. Opt. Phys. 30 2529)
        # page 2532 for the normalization factor
        self.norm = np.sqrt(2 * (2 * self.exp) ** (l + 1.5) / gamma(l + 1.5))


class _Shell:
    """Represents an electron shell containing multiple GTO primitives.

    Parameters
    ----------
    l : int
        The angular momentum quantum number of the shell.
    gtos : list[_GTO]
        List of GTO primitives that compose this shell.
    """

    def __init__(self, l: int, gtos: list[_GTO]) -> None:
        """Initialize a shell with angular momentum and GTO primitives."""
        self.l = l
        self.gtos = gtos

        self.norm = 0.0
        self.gto_norms = np.empty(len(gtos), dtype=float)
        self.gto_exps = np.array([gto.exp for gto in gtos], dtype=float)
        self.gto_coeffs = np.array([gto.coeff for gto in gtos], dtype=float)
        self.prefactor = np.empty(len(gtos), dtype=float)

    def normalize(self) -> None:
        """Normalize the shell.

        Notes
        -----
        Uses the normalization factor from Jiyun Kuang and C D Lin 1997
        J. Phys. B: At. Mol. Opt. Phys. 30 2529, equations 18 and 20.
        """
        for idx, gto in enumerate(self.gtos):
            gto.normalize(self.l)
            self.gto_norms[idx] = gto.norm

        overlap = 0.0
        for i_gto in self.gtos:
            for j_gto in self.gtos:
                overlap += (
                    i_gto.coeff
                    * j_gto.coeff
                    * (2 * np.sqrt(i_gto.exp * j_gto.exp) / (i_gto.exp + j_gto.exp)) ** (self.l + 1.5)
                )

        self.norm = 1 / np.sqrt(overlap)
        self.prefactor = self.norm * self.gto_norms * self.gto_coeffs


[docs] class Parser: """Parser for molden files. Parameters ---------- source : str | list[str] The path to the molden file, or the lines from the file. only_molecule : bool, optional Only parse the atoms and skip molecular orbitals. Default is `False`. Attributes ---------- atoms : list[_Atom] A list of Atom objects containing the label, atomic number, and position for each atom. shells : list[_Shell] A list of `_Shell` objects containing the atom, angular momentum quantum number (l), and GTOs for each shell. mos : list[_MolecularOrbital] A list of MolecularOrbital objects containing the symmetry, energy, and coefficients for each MO. mo_coeffs : NDArray[np.floating] A 2D array containing all molecular orbital coefficients, where each row represents the coefficients for one molecular orbital. Raises ------ TypeError If the source is not a valid molden file path, or molden file lines. """ ANGSTROM_TO_BOHR = 1.8897259886 def __init__( self, source: str | list[str] | Any, only_molecule: bool = False, ) -> None: """Initialize the Parser with either a filename or molden lines.""" if isinstance(source, str): with Path(source).open('r') as file: self.molden_lines = file.readlines() elif isinstance(source, list): self.molden_lines = source else: raise TypeError('Source must be a filename (str) or list of lines (list[str]).') # Remove leading/trailing whitespace and newline characters self.molden_lines = [line.strip() for line in self.molden_lines] self.check_molden_format() self._atom_ind, self._gto_ind, self._mo_ind = self.divide_molden_lines() self.atoms = self.get_atoms() if only_molecule: return self.shells = self.get_shells() self.mos, self.mo_coeffs = self.get_mos()
[docs] def check_molden_format(self) -> None: """Check if the provided molden lines conform to the expected format. Raises ------ ValueError If the molden lines do not contain the required sections or if they are in an unsupported format. """ logger.info('Checking molden format...') if not self.molden_lines: raise ValueError('The provided molden lines are empty.') if not any('[Atoms]' in line for line in self.molden_lines): raise ValueError("No '[Atoms]' section found in the molden file.") if not any('[GTO]' in line for line in self.molden_lines): raise ValueError("No '[GTO]' section found in the molden file.") if not any('[MO]' in line for line in self.molden_lines): raise ValueError("No '[MO]' section found in the molden file.") if not any(orbs in line for orbs in ['5D', '9G'] for line in self.molden_lines): raise ValueError('Cartesian orbitals functions are not currently supported.') logger.info('Molden format check passed.')
[docs] def divide_molden_lines(self) -> tuple[int, int, int]: """Divide the molden lines into sections for atoms, GTOs, and MOs. Returns ------- tuple[int, int, int] Indices of the '[Atoms]', '[GTO]', and '[MO]' lines. Raises ------ ValueError If the molden lines do not contain the required sections. """ logger.info('Dividing molden lines into sections...') if '[Atoms] AU' in self.molden_lines: atom_ind = self.molden_lines.index('[Atoms] AU') elif '[Atoms] Angs' in self.molden_lines: atom_ind = self.molden_lines.index('[Atoms] Angs') else: raise ValueError('No (AU/Angs) in [Atoms] section found in the molden file.') gto_ind = self.molden_lines.index('[GTO]') mo_ind = self.molden_lines.index('[MO]') logger.info('Finished dividing molden lines.') return atom_ind, gto_ind, mo_ind
[docs] def get_atoms(self) -> list[_Atom]: """Parse the atoms from the molden file. Returns ------- list[_Atom] A list of Atom objects containing the label, atomic number, and position for each atom. """ logger.info('Parsing atoms...') angs = 'Angs' in self.molden_lines[self._atom_ind] atoms = [] for line in self.molden_lines[self._atom_ind + 1 : self._gto_ind]: label, _, atomic_number, *coords = line.split() position = np.array([float(coord) for coord in coords], dtype=float) if angs: position *= self.ANGSTROM_TO_BOHR atoms.append(_Atom(label, int(atomic_number), position, [])) logger.info('Parsed %s atoms.', len(atoms)) return atoms
[docs] def get_shells(self) -> list[_Shell]: """Parse the Gaussian-type orbitals (GTOs) from the molden file. Returns ------- list[_Shell] A list of `_Shell` objects containing the atom, angular momentum quantum number (l), and GTOs for each shell. Raises ------ ValueError If the shell label is not supported or if the GTOs are not formatted correctly in the molden file. """ logger.info('Parsing GTO lines...') shell_labels = ['s', 'p', 'd', 'f', 'g'] lines = iter(self.molden_lines[self._gto_ind + 1 : self._mo_ind]) shells = [] for atom in self.atoms: logger.debug('Parsing GTOs for atom: %s', atom.label) _ = next(lines) # Skip atom index # Read shells until a blank line while True: line = next(lines) if not line: break shell_label, num_gtos, _ = line.split() if shell_label not in shell_labels: raise ValueError(f"Shell label '{shell_label}' is currently not supported.") gtos = [] for _ in range(int(num_gtos)): exp, coeff = next(lines).split() gtos.append(_GTO(float(exp), float(coeff))) shell = _Shell(shell_labels.index(shell_label), gtos) shell.normalize() atom.shells.append(shell) shells.append(shell) logger.info('Parsed %s GTOs.', len(shells)) return shells
[docs] def get_mos(self, sort: bool = True) -> tuple[list[_MolecularOrbital], NDArray[np.floating]]: """Parse the molecular orbitals (MOs) from the molden file. Parameters ---------- sort : bool, optional If true (default), returns the MOs sorted by energy. If false, returns the MOs in the order given in the molden file. Returns ------- tuple[list[_MolecularOrbital], NDArray[np.floating]] Two-item tuple: the first element contains the parsed molecular orbitals (symmetry, energy, spin, occupation), and the second is a 2D NumPy array of orbital coefficients shaped ``(num_mos, num_basis_functions)``. """ logger.info('Parsing MO coefficients...') num_total_gtos = sum(2 * gto.l + 1 for gto in self.shells) order = self._gto_order() lines = self.molden_lines[self._mo_ind + 1 :] total_num_mos = sum('Sym=' in line for line in lines) lines = iter(lines) mos = [] mo_coeffs = np.empty((total_num_mos, num_total_gtos), dtype=float) for mo_ind in range(total_num_mos): _, sym = next(lines).split() energy_line = next(lines) energy = float(energy_line.split()[1]) _, spin = next(lines).split() occ_line = next(lines) occ = int(float(occ_line.split()[1])) coeffs = [] for _ in range(num_total_gtos): _, coeff = next(lines).split() coeffs.append(coeff) # Store coefficients in shared array mo_coeffs[mo_ind] = np.array(coeffs, dtype=float)[order] mo = _MolecularOrbital( sym=sym, energy=energy, spin=spin, occ=occ, ) mos.append(mo) logger.info('Parsed MO coefficients.') if sort: # Sort MOs and reorder mo_coeffs to match sorted_indices = sorted(range(len(mos)), key=lambda i: mos[i].energy) mos = [mos[i] for i in sorted_indices] mo_coeffs = mo_coeffs[sorted_indices] return mos, mo_coeffs
def _gto_order(self) -> list[int]: """Return the order of the GTOs in the molden file. Molden defines the order of the orbitals as m = 0, 1, -1, 2, -2, ... We want it to be m = -l, -l + 1, ..., l - 1, l. Note: For l = 1, the order is 1, -1, 0, which is different from the general pattern. This is handled separately. Returns ------- list[int] The order of the atomic orbitals. """ order = [] ind = 0 for shell in self.shells: l = shell.l if l == 1: order.extend([ind + 1, ind + 2, ind]) else: order.extend([ind + i for i in range(2 * l, -1, -2)]) order.extend([ind + i for i in range(1, 2 * l, 2)]) ind += 2 * l + 1 return order