Walkthrough 1 — Sequence operations¶

molforge.sequence covers the everyday sequence-level operations: alignment, mutations, composition statistics. Pure NumPy — no Biopython dependency.

Topics:

Composition and properties — molecular weight, GRAVY, aromaticity.
Pairwise alignment — Needleman-Wunsch (global) and Smith-Waterman (local).
Substitution matrices — BLOSUM62 and PAM250.
Mutations — A123V, H:K42N, slash-delimited multi-mutants.
Mutating a Protein — sequence-level changes on a 3-D structure.

In [1]:

Copied!

import molforge as mf
print(f"molforge {mf.__version__}")
import molforge as mf
print(f"molforge {mf.__version__}")

molforge 0.0.1

1. Composition and properties¶

In [1]:

Copied!





from molforge.sequence import composition, length, molecular_weight, gravy, aromaticity

# Human ubiquitin (PDB 1UBQ)
seq = "MQIFVKTLTGKTITLEVEPSDTIENVKAKIQDKEGIPPDQQRLIFAGKQLEDGRTLSDYNIQKESTLHLVLRLRGG"
print(f"Length:               {length(seq)} residues")
print(f"Molecular weight:     {molecular_weight(seq):.1f} Da")
print(f"GRAVY:                {gravy(seq):+.3f}")
print(f"Aromatic fraction:    {aromaticity(seq):.1%}")
from molforge.sequence import composition, length, molecular_weight, gravy, aromaticity

# Human ubiquitin (PDB 1UBQ)
seq = "MQIFVKTLTGKTITLEVEPSDTIENVKAKIQDKEGIPPDQQRLIFAGKQLEDGRTLSDYNIQKESTLHLVLRLRGG"
print(f"Length:               {length(seq)} residues")
print(f"Molecular weight:     {molecular_weight(seq):.1f} Da")
print(f"GRAVY:                {gravy(seq):+.3f}")
print(f"Aromatic fraction:    {aromaticity(seq):.1%}")

Length:               76 residues
Molecular weight:     8564.8 Da
GRAVY:                -0.489
Aromatic fraction:    3.9%

In [1]:

Copied!





# Per-residue composition counts or fractions
comp = composition(seq, as_fraction=True)
top5 = sorted(comp.items(), key=lambda kv: -kv[1])[:5]
print("Top 5 amino acids by frequency:")
for aa, frac in top5:
    print(f"  {aa}: {frac:.1%}")
# Per-residue composition counts or fractions
comp = composition(seq, as_fraction=True)
top5 = sorted(comp.items(), key=lambda kv: -kv[1])[:5]
print("Top 5 amino acids by frequency:")
for aa, frac in top5:
    print(f"  {aa}: {frac:.1%}")

Top 5 amino acids by frequency:
  L: 11.8%
  I: 9.2%
  K: 9.2%
  T: 9.2%
  E: 7.9%

2. Pairwise alignment¶

Two sequences from the same family share local stretches but differ in length and content. The right algorithm picks itself: Needleman-Wunsch for global (full-length) alignment, Smith-Waterman for local (best subsequence) alignment.

In [1]:

Copied!





from molforge.sequence import align, needleman_wunsch, smith_waterman

# Two short fragments of ubiquitin with one substitution and one indel
a = "MQIFVKTLTGKTITLEV"
b = "MQILVKTLTGKTITLEV"  # F4L

result = needleman_wunsch(a, b)
print("=== Global alignment (Needleman-Wunsch) ===")
print(f"Identity: {result.identity:.1%}")
print(f"Score:    {result.score:.0f}")
print()
print(result.format(width=80))
from molforge.sequence import align, needleman_wunsch, smith_waterman

# Two short fragments of ubiquitin with one substitution and one indel
a = "MQIFVKTLTGKTITLEV"
b = "MQILVKTLTGKTITLEV"  # F4L

result = needleman_wunsch(a, b)
print("=== Global alignment (Needleman-Wunsch) ===")
print(f"Identity: {result.identity:.1%}")
print(f"Score:    {result.score:.0f}")
print()
print(result.format(width=80))

=== Global alignment (Needleman-Wunsch) ===
Identity: 94.1%
Score:    75

MQIFVKTLTGKTITLEV
||| |||||||||||||
MQILVKTLTGKTITLEV

In [1]:

Copied!





# Find a shared region between two divergent sequences
a = "ZZZZZMQIFVKTLTGKTZZZZZ"
b = "QQQQMQIFVKTLTGKTQQQ"
result = smith_waterman(a, b)
print("=== Local alignment (Smith-Waterman) ===")
print(f"Identity:    {result.identity:.1%}")
print(f"Score:       {result.score:.0f}")
print(f"a coverage:  {result.coverage_a:.1%}")
print(f"b coverage:  {result.coverage_b:.1%}")
print(f"a region:    a[{result.start_a}:{result.end_a}]")
print(f"b region:    b[{result.start_b}:{result.end_b}]")
print()
print(result.aligned_a)
print(result.aligned_b)
# Find a shared region between two divergent sequences
a = "ZZZZZMQIFVKTLTGKTZZZZZ"
b = "QQQQMQIFVKTLTGKTQQQ"
result = smith_waterman(a, b)
print("=== Local alignment (Smith-Waterman) ===")
print(f"Identity:    {result.identity:.1%}")
print(f"Score:       {result.score:.0f}")
print(f"a coverage:  {result.coverage_a:.1%}")
print(f"b coverage:  {result.coverage_b:.1%}")
print(f"a region:    a[{result.start_a}:{result.end_a}]")
print(f"b region:    b[{result.start_b}:{result.end_b}]")
print()
print(result.aligned_a)
print(result.aligned_b)

=== Local alignment (Smith-Waterman) ===
Identity:    63.2%
Score:       80
a coverage:  86.4%
b coverage:  100.0%
a region:    a[1:20]
b region:    b[0:19]

ZZZZMQIFVKTLTGKTZZZ
QQQQMQIFVKTLTGKTQQQ

3. Substitution matrices¶

Both NW and SW default to BLOSUM62 — the standard. PAM250 and match/mismatch are also available.

In [1]:

Copied!





from molforge.sequence import BLOSUM62, PAM250, get_matrix, available_matrices

print(f"Available matrices: {available_matrices()}")
mat, idx = get_matrix("BLOSUM62")
print(f"BLOSUM62 shape: {mat.shape}")  # 24x24 (20 AAs + B/Z/X/*)
print(f"index A -> {idx['A']}, R -> {idx['R']}, X -> {idx['X']}")
print()
# A swap between two similar residues (L vs I) scores high in BLOSUM62
print(f"BLOSUM62[L, I] = {mat[idx['L'], idx['I']]}  (similar)")
print(f"BLOSUM62[L, D] = {mat[idx['L'], idx['D']]}  (dissimilar)")
from molforge.sequence import BLOSUM62, PAM250, get_matrix, available_matrices

print(f"Available matrices: {available_matrices()}")
mat, idx = get_matrix("BLOSUM62")
print(f"BLOSUM62 shape: {mat.shape}")  # 24x24 (20 AAs + B/Z/X/*)
print(f"index A -> {idx['A']}, R -> {idx['R']}, X -> {idx['X']}")
print()
# A swap between two similar residues (L vs I) scores high in BLOSUM62
print(f"BLOSUM62[L, I] = {mat[idx['L'], idx['I']]}  (similar)")
print(f"BLOSUM62[L, D] = {mat[idx['L'], idx['D']]}  (dissimilar)")

Available matrices: ['BLOSUM62', 'PAM250']
BLOSUM62 shape: (24, 24)
index A -> 0, R -> 1, X -> 22

BLOSUM62[L, I] = 2  (similar)
BLOSUM62[L, D] = -4  (dissimilar)

In [1]:

Copied!





# Compare matrices on the same alignment
result_blosum = align("MKTV", "MRTV", matrix="BLOSUM62")
result_pam250 = align("MKTV", "MRTV", matrix="PAM250")
result_match = align("MKTV", "MRTV", matrix=None, match=2, mismatch=-1)
print(f"BLOSUM62 score: {result_blosum.score}")
print(f"PAM250   score: {result_pam250.score}")
print(f"Simple   score: {result_match.score}")
# Compare matrices on the same alignment
result_blosum = align("MKTV", "MRTV", matrix="BLOSUM62")
result_pam250 = align("MKTV", "MRTV", matrix="PAM250")
result_match = align("MKTV", "MRTV", matrix=None, match=2, mismatch=-1)
print(f"BLOSUM62 score: {result_blosum.score}")
print(f"PAM250   score: {result_pam250.score}")
print(f"Simple   score: {result_match.score}")

BLOSUM62 score: 16
PAM250   score: 16
Simple   score: 5

4. Mutations¶

Point mutations follow the standard protein-engineering notation:

A123V — wild-type A at position 123 → V
H:K42N — chain-prefixed
A1V/T56K/L99M — slash- (or comma-) delimited multi-mutants

In [1]:

Copied!





from molforge.sequence import Mutation, parse_mutations, apply_mutation, apply_mutations

# Parse a single mutation
m = Mutation.parse("A123V")
print(f"Parsed: {m}")
print(f"  wild_type={m.wild_type}, position={m.position}, mutant={m.mutant}")
print()

# Parse a multi-mutant string
muts = parse_mutations("A1V/T56K/L99M")
print(f"Multi-mutant: {[str(m) for m in muts]}")
from molforge.sequence import Mutation, parse_mutations, apply_mutation, apply_mutations

# Parse a single mutation
m = Mutation.parse("A123V")
print(f"Parsed: {m}")
print(f"  wild_type={m.wild_type}, position={m.position}, mutant={m.mutant}")
print()

# Parse a multi-mutant string
muts = parse_mutations("A1V/T56K/L99M")
print(f"Multi-mutant: {[str(m) for m in muts]}")

Parsed: A123V
  wild_type=A, position=123, mutant=V

Multi-mutant: ['A1V', 'T56K', 'L99M']

In [1]:

Copied!





# Apply a single mutation to a sequence string
ub = "MQIFVKTLTGKTITLEVEPSDTIENVKAKIQDKEGIPPDQQRLIFAGKQLEDGRTLSDYNIQKESTLHLVLRLRGG"
print(f"Wild-type position 48:  {ub[47]}")
mutant = apply_mutation(ub, "K48R")
print(f"Mutant position 48:     {mutant[47]}")
diff = [i for i, (a, b) in enumerate(zip(ub, mutant)) if a != b]
print(f"Differs at positions:   {diff}")
print()

# Apply multiple mutations at once
double = apply_mutations(ub, "K48R/L67P")
diffs = [(i+1, ub[i], double[i]) for i in range(len(ub)) if ub[i] != double[i]]
print(f"Double mutant changes:  {diffs}")
# Apply a single mutation to a sequence string
ub = "MQIFVKTLTGKTITLEVEPSDTIENVKAKIQDKEGIPPDQQRLIFAGKQLEDGRTLSDYNIQKESTLHLVLRLRGG"
print(f"Wild-type position 48:  {ub[47]}")
mutant = apply_mutation(ub, "K48R")
print(f"Mutant position 48:     {mutant[47]}")
diff = [i for i, (a, b) in enumerate(zip(ub, mutant)) if a != b]
print(f"Differs at positions:   {diff}")
print()

# Apply multiple mutations at once
double = apply_mutations(ub, "K48R/L67P")
diffs = [(i+1, ub[i], double[i]) for i in range(len(ub)) if ub[i] != double[i]]
print(f"Double mutant changes:  {diffs}")

Wild-type position 48:  K
Mutant position 48:     R
Differs at positions:   [47]

Double mutant changes:  [(48, 'K', 'R'), (67, 'L', 'P')]

Validation built in¶

apply_mutation checks that the declared wild-type matches what's actually at that position. This catches a class of bugs where you think you're mutating one residue but are actually pointing at another.

In [1]:

Copied!





# This will fail — there's no L at position 48 in ubiquitin
try:
    apply_mutation(ub, "L48R")
except ValueError as e:
    print(f"Caught error: {e}")
# This will fail — there's no L at position 48 in ubiquitin
try:
    apply_mutation(ub, "L48R")
except ValueError as e:
    print(f"Caught error: {e}")

Caught error: wild-type mismatch: mutation L48R expects L at position 48, but found K

5. Mutating a Protein¶

mutate_protein applies a mutation at the structural level — it updates the residue label on the canonical AtomArray. Atom coordinates stay put: it's a sequence-only mutation. For side-chain rebuilding or energy minimization after substitution, plug into Rosetta or OpenMM downstream.

In [1]:

Copied!





from pathlib import Path
from molforge.io import read_pdb
from molforge.sequence import mutate_protein

protein = read_pdb(Path("../../tests/fixtures/pdb/tripeptide.pdb"))
print(f"Original sequence: {protein.sequence}")
print(f"Residue 1 name:    {protein['A'][1].name}")
print()

# Apply mutation — get back a new Protein, original is unchanged
mutant = mutate_protein(protein, "A1C", chain_id="A")
print(f"Mutant sequence:   {mutant.sequence}")
print(f"Residue 1 name:    {mutant['A'][1].name}")
print(f"Original still:    {protein['A'][1].name}")
print()

# Coordinates are unchanged (it's a sequence-only operation)
import numpy as np
print(f"Coords identical:  {np.allclose(mutant.atom_array.coords, protein.atom_array.coords)}")
from pathlib import Path
from molforge.io import read_pdb
from molforge.sequence import mutate_protein

protein = read_pdb(Path("../../tests/fixtures/pdb/tripeptide.pdb"))
print(f"Original sequence: {protein.sequence}")
print(f"Residue 1 name:    {protein['A'][1].name}")
print()

# Apply mutation — get back a new Protein, original is unchanged
mutant = mutate_protein(protein, "A1C", chain_id="A")
print(f"Mutant sequence:   {mutant.sequence}")
print(f"Residue 1 name:    {mutant['A'][1].name}")
print(f"Original still:    {protein['A'][1].name}")
print()

# Coordinates are unchanged (it's a sequence-only operation)
import numpy as np
print(f"Coords identical:  {np.allclose(mutant.atom_array.coords, protein.atom_array.coords)}")

Original sequence: AGV
Residue 1 name:    ALA

Mutant sequence:   CGV
Residue 1 name:    CYS
Original still:    ALA

Coords identical:  True

What's next¶

Walkthrough 2 — structural analysis on the resulting structure (RMSD against wild-type, DSSP, SASA, dihedrals).
Walkthrough 5 — turn sequences and structures into ML-ready tensors (one-hot, BLOSUM embeddings, ESM-2 language model embeddings).
End-to-end example (notebooks/examples/end_to_end_design.ipynb): full design loop that combines sequence operations, folding, and structural comparison.