# morphkit/parse_word_block
# SPDX-License-Identifier: CC-BY-4.0
# Copyright (c) 2025 Tony Jurg
__version__ = "0.0.1"
# import required packages
from typing import Dict, Any, List, Tuple
import beta_code
import re
import unicodedata
# bring in other sibling modules
from .analyse_pos import analyse_pos
from .analyse_morph_tag import analyse_morph_tag
# Note: The first part of this file deals with the conversion from betacode to unicode.
# Function parse_word_block() is in the second part.
# NORMALIZING UNICODE FOR USE WITH N1904-TF
# See also https://centerblc.github.io/N1904/characterencoding.html.
# Check unicode at https://unicode-explorer.com/search/
# The nine monotonic tonos chars (acute accent) to polytonic oxia chars
# See https://github.com/EzerIT/BibleOL/blob/master/techdoc/techdoc.pdf page 34
OXIA_MAP = {
"\u03AC": "\u1F71", # ά (alpha with oxia)
"\u03AD": "\u1F73", # έ (epsilon with oxia)
"\u03AE": "\u1F75", # ή (eta with oxia)
"\u03AF": "\u1F77", # ί (iota with oxia)
"\u03CC": "\u1F79", # ό (omicron with oxia)
"\u03CD": "\u1F7B", # ύ (upsilon with oxia)
"\u03CE": "\u1F7D", # ώ (omega with oxia)
"\u0390": "\u1FD3", # ΐ (iota dial. with oxia)
"\u03B0": "\u1FE3", # ΰ (UPSILON with oxia)
}
def _apply_oxia_map(text: str) -> str:
"""Swap out any of the nine tonos chars for their oxia counterparts."""
return "".join(OXIA_MAP.get(ch, ch) for ch in text)
def betacode_to_polytonic(betacode_str: str, debug: bool = False) -> str:
"""
Convert a Betacode string into Greek Unicode using precomposed polytonic characters.
The function will:
1. turn Betacode into monotonic Greek,
2. NFC-normalize the result,
3. remap the nine tonos characters to their oxia counterparts using the predefined table.
"""
# perform the betacode to unicode conversion and deal with the caret (^), see comment below.
greek = beta_code.beta_code_to_greek(betacode_str).replace('^', chr(0x302))
# normalize so that any decomposed sequences get composed
norm = unicodedata.normalize("NFC", greek)
# apply the custom table to map tonos into oxia.
oxia = _apply_oxia_map(norm)
if debug:
print (f"[parse_word_block:betacode_to_polytonic] {betacode_str=} {greek=} {oxia=}")
return oxia
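# Sketch of the pipeline above (output shape is an assumption; the exact
# codepoints produced by the beta_code package may vary between versions):
# betacode_to_polytonic("lo/gos") should yield "λόγος" in which the accented
# omicron is the precomposed oxia form U+1F79 rather than the tonos U+03CC.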
'''
Note: an issue that still needs to be fully sorted out:
When converting certain betacode words containing a caret (^), the caret is not converted into Unicode Greek.
The Betacode manual, https://stephanus.tlg.uci.edu/encoding/BCM.pdf (page 31), suggests that ^ should be
translated into a circumflex accent (caret). This can be done after the betacode conversion by appending
the method .replace('^', chr(0x302)), which places the caret above the previous letter.
If precomposed Unicode is required, we also need to convert the resulting Unicode to precomposed form.
The caret originates from the stem files used by Morpheus, for instance:
https://github.com/perseids-tools/morpheus-perseids/blob/01db3bb72ff405e94917884f5cc89f1052c49ff4/stemlib/Greek/stemsrc/nom.irreg#L1307
Another option might be to remove the caret before converting to Unicode.
'''
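# A small demonstration of the normalization issue described above, based on
# the Unicode composition rules: U+0302 (COMBINING CIRCUMFLEX ACCENT) does
# not compose with Greek base letters under NFC, because the precomposed
# Greek circumflex forms (e.g. U+1FB6 for alpha) canonically decompose to
# U+0342 (COMBINING GREEK PERISPOMENI) instead. Hence:
#
#   unicodedata.normalize("NFC", "\u03B1\u0302")  # stays 'α' + U+0302 (decomposed)
#   unicodedata.normalize("NFC", "\u03B1\u0342")  # composes to "\u1FB6" (ᾶ)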
def parse_word_block(block: List[str], language: str = 'greek', debug: bool = False) -> Tuple[str, List[Dict[str, Any]]]:
"""
Parse a single Morpheus output block of Beta-code lines into structured morphological data.
Each block corresponds to all analyses for one Greek form. Lines are labeled with prefixes
like `:raw`, `:lem`, `:stem`, `:end`, etc. This function walks through those labels,
extracts fields, and assembles a dictionary with morphological features.
Args:
-----
:block (List[str]): A list of lines (strings) from Morpheus output, each starting with a label
like `:raw`, `:lem`, `:stem`, or `:end`, followed by tab-separated fields.
Usually this is the output generated by the function :py:func:`~morphkit.get_word_blocks()`.
:language (str): Optional argument. Defaults to `greek`. The other option is `latin`.
:debug (bool): Optional argument. Defaults to `False`. If set to `True` the function prints some debug information.
Returns:
--------
:Tuple[str, List[Dict[str, Any]]]: A pair (raw_beta, parses)
The pair consists of:
:raw_beta (str): raw Beta-code form as returned by Morpheus (from the last `:raw` line).
:parses (List[Dict]): a list of parse dictionaries, one per analysis block. Each parse dictionary may contain keys such as:
- `"raw_bc"`: the original betacode word.
- `"workw_bc"`: the segment Morpheus analysed in betacode.
- `"lem_full_bc"`: the full lemma form (incl homonym or pl suffix) in betacode.
- grammatical features: `"case"`, `"number"`, `"gender"`, `"tense"`, `"mood"`, `"voice"`, `"person"`, `"degree"`.
- lists: `"morph_codes"`, `"morph_flags"`, `"dialects"`.
- computed fields such as `"pos"` (part of speech) and `"morph"` (SP tag).
Raises:
-------
:ValueError: If the language parameter is invalid (only 'greek' and 'latin' are allowed).
General notes:
--------------
.. _Bible Online Learner: https://learner.bible/
- Entries holding a Unicode representation of the betacode use precomposed characters (with oxia) in order to match their presentation in the N1904-TF dataset, which uses the same scheme as the `Bible Online Learner`_.
- Details on decoding the Morpheus output can be found at https://github.com/tonyjurg/Create_morpheus_TF_dataset/blob/main/Decoding_Morpheus_output/decode_output.md
"""
raw_beta = "unknown" # default raw beta code if not set
parses = [] # list to collect parse dictionaries
current = None # placeholder for the current parse entry
# a single map from normalized token → (key, value)
# This mapping ensures conformance to the Morpheus values
TOK_MAP = {
# tenses
"pres": ("tense","pres"),
"present": ("tense","pres"),
"imperf": ("tense","imperf"),
"imperfect": ("tense","imperf"),
"fut": ("tense","fut"),
"future": ("tense","fut"),
"aor": ("tense","aor"),
"aorist": ("tense","aor"),
"perf": ("tense","perf"),
"perfect": ("tense","perf"),
"plup": ("tense","plup"),
"pluperfect": ("tense","plup"),
# moods
"ind": ("mood","ind"),
"indicative": ("mood","ind"),
"subj": ("mood","subj"),
"subjunctive": ("mood","subj"),
"opt": ("mood","opt"),
"optative": ("mood","opt"),
"imperat": ("mood","imperat"),
"imperative": ("mood","imperat"),
"inf": ("mood","inf"),
"infinitive": ("mood","inf"),
"part": ("mood","part"),
"participle": ("mood","part"),
# voices
"act": ("voice","act"),
"active": ("voice","act"),
"mid": ("voice","mid"),
"middle": ("voice","mid"),
"pass": ("voice","pass"),
"passive": ("voice","pass"),
"mp": ("voice","mp"), # midlePassive
# numbers
"sg": ("number","sg"),
"pl": ("number","pl"),
"dual": ("number","dual"),
# degrees
"comp": ("degree","comp"),
"sup": ("degree","sup"),
# morphological particularities
"contr": ("morphpart","contr"),
"nu_movable": ("morphpart","nu_movable"),
"short_subj": ("morphpart","short_subj"),
"indeclform": ("morphpart","indeclform"),
"impersonal": ("morphpart","impersonal"),
# genders
"masc": ("gender", "masc"),
"fem": ("gender", "fem"),
"neut": ("gender", "neut"),
# cases
"nom": ("case", "nom"),
"acc": ("case", "acc"),
"gen": ("case", "gen"),
"dat": ("case", "dat"),
"abl": ("case", "abl"),
"voc": ("case", "voc"),
# person
"1st": ("person", "1st"),
"2nd": ("person", "2nd"),
"3rd": ("person", "3rd"),
}
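# Example lookups, directly from the table above:
#   TOK_MAP["aor"]  -> ("tense", "aor")
#   TOK_MAP["ind"]  -> ("mood", "ind")
#   TOK_MAP["masc"] -> ("gender", "masc")
# so a token stream like "aor ind act 3rd sg" fans out over five features.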
PRON_MAP = { # pronoun type
"art_adj" : ("pron_type", "personal"),
"pron_adj1" : ("pron_type", "demonstrative"),
"demonstr" : ("pron_type", "demonstrative"),
"relative" : ("pron_type", "relative"),
"pron1" : ("pron_type", "personal"),
"pron2" : ("pron_type", "personal"),
"pron3" : ("pron_type", "personal"),
"indef" : ("pron_type", "indefinite"),
"interrog" : ("pron_type", "interrogative"),
}
SECOND_TENSE_MAP = {
"aor2": "A",
"aor2_pass": "AP",
# so far no other second-tense values have been seen; if found, they can be added here
# A/AP are used so that any newly found second-tense values are easier to distinguish
}
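# Example: an analysis whose end_codes include "aor2_pass" will later get
# sec_tense = "AP" (see the next() lookup in the ':end' handling below).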
if debug:
import inspect
frame = inspect.currentframe().f_back
#print(f"[parse_word_block] called with: {frame.f_locals}") #provides huge (full) amount of details
# Tailor the output to language
if language not in ('greek','latin'):
raise ValueError(
f"[parse_word_block] Unknown language format {language!r}. "
"Choose from {'greek', 'latin'}."
)
if language == 'greek':
bc_ind='_bc'
uc_itm=True
else: # can only be latin
bc_ind=''
uc_itm=False
for line in block:
# Only process lines that begin with a colon (labels)
if not line.startswith(':'):
continue
# Extract label (e.g., 'raw', 'lem', etc.) and fields after the label
label = line.split()[0][1:]
fields = line[len(label)+2:].split('\t')
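# Example of the split above (hypothetical line shapes; real Morpheus output
# may differ): line = ":raw lo/gos" yields label = "raw" and
# fields = ["lo/gos"], while a tab-separated line ":end os\tmasc nom sg"
# yields label = "end" and fields = ["os", "masc nom sg"].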
"""
================================ Label :raw ================================
This item always contains data that typically reproduces the token as supplied in betacode.
The raw form of the word, as it was input. This may include elision (indicated with ').
Crane gives the example ἐπέμπετ᾽, which could stand for ἐπέμπετε ("you [pl] were sending")
or ἐπέμπετο ("s/he was being sent").
"""
if label == 'raw':
# Start a new parse: 'raw' gives the base word in beta code
raw_beta = fields[0].strip()
if uc_itm:
current = {'raw_bc': raw_beta}
current["raw_uc"] = betacode_to_polytonic(raw_beta)
else:
current = {'raw': raw_beta}
parses.append(current)
elif current is None:
# Skip analysis lines until a 'raw' label initializes a parse
continue
"""
================================ Label :workw ==============================
This item always contains data which represents the working token after basic normalisation.
In many instances, the raw and work words are identical.
"""
elif label == 'workw':
# Record the word form in context and its Unicode conversion
workw_token = fields[0].strip()
current[f"workw{bc_ind}"] = workw_token
if uc_itm: current["workw_uc"] = betacode_to_polytonic(workw_token)
"""
================================ Label :lem ================================
This item always contains data that represents the lemma associated with the rest of the grammatical
data in the analytic block.
"""
elif label == "lem":
# Full lemma entry which may include items like homonym number and '-pl' suffix
lemma_field = fields[0].strip()
current[f"lem_full{bc_ind}"] = lemma_field
# if debug is set, also print the lemma in betacode, unicode and the TF adaptations
if uc_itm: current["lem_full_uc"] = betacode_to_polytonic(lemma_field,debug)
# Check for and remove a '-pl' suffix
if lemma_field.endswith('-pl'):
current["lem_pl_suff"] = '1'
lemma_field = lemma_field[:-3] # remove the '-pl'
# Try to split lemma into base and homonym number (e.g. "logos2" → base: "logos", homonym: 2)
m = re.match(r"^(.*?)(\d+)$", lemma_field)
if m:
current[f"lem_base{bc_ind}"] = m.group(1) # Everything before the digits
current["lem_homonym"] = int(m.group(2)) # The digits (homonym number)
else:
current[f"lem_base{bc_ind}"] = lemma_field
# Convert the stripped lemma base from Betacode to Unicode Greek
if uc_itm: current["lem_base_uc"] = betacode_to_polytonic(current["lem_base_bc"])
"""
================================ Label :prvb ================================
This item is filled with details about an attached preposition (e.g., ἐν- or meta-) when applicable.
Furthermore, it may contain details about dialect and morphological flags (possibly related to the preposition?).
It can contain zero, one or two prepositions. E.g., διακατηλέγχετο ('diakathle/gxeto'; Acts 18:28) has two: dia/,kata/.
"""
elif label == "prvb":
# beta code and Unicode conversion
prvb_segment = fields[0].strip()
if prvb_segment:
# Split on commas, strip whitespace, drop any empties
preverbs = [p.strip() for p in re.split(r'\s*,\s*', prvb_segment) if p.strip()]
# Store the betacode **list**
current[f"prvb{bc_ind}"] = preverbs
# Convert each to Unicode (and normalize the circumflex)
if uc_itm: current["prvb_uc"] = [
betacode_to_polytonic(p)
for p in preverbs
]
# This part may need to be expanded!
# There is info available like ":prvb a)na/ apocope poetic "
# or ":prvb a)na/ root_preverb " or
# :prvb a)po/,a)na/ apocope poetic unasp_preverb
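# Example of the preverb split above (betacode taken from the comment in the
# block description): prvb_segment = "dia/,kata/" yields
# preverbs = ["dia/", "kata/"], each of which is then converted to Unicode.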
"""
================================ Label :aug1 ================================
This item is filled with details about augment / reduplication, dialect, and morph-flags
when applicable
"""
elif label == "aug1":
# beta code and Unicode conversion
aug1_segment = fields[0].strip()
if aug1_segment:
current[f"aug1{bc_ind}"] = aug1_segment
if uc_itm: current["aug1_uc"] = betacode_to_polytonic(aug1_segment)
# Maybe we need to expand this!
"""
================================ Label :stem ================================
This section provides the stem as determined by Morpheus.
"""
elif label == "stem":
# Stem analysis: beta code and Unicode conversion
stem_segment = fields[0].strip()
if stem_segment:
current[f"stem{bc_ind}"] = stem_segment
if uc_itm: current["stem_uc"] = betacode_to_polytonic(stem_segment)
# Grammatical tokens in second field: gender, number, case, or extra morph flags
if len(fields) > 1 and fields[1]:
for tok in fields[1].split():
if tok in {"masc", "fem", "neut"}:
current["stem_gender"] = TOK_MAP[tok][1] # TOK_MAP = key, val
elif tok in {"sg", "pl", "dual"}:
current["stem_number"] = TOK_MAP[tok][1]
elif tok in {"nom", "acc", "gen", "dat", "abl", "voc"}:
current["stem_case"] = TOK_MAP[tok][1]
else:
current.setdefault("stem_tok", []).append(tok)
# Additional dialect markers (3rd field) and flags (4th field)
if len(fields) > 2 and fields[2].strip():
current.setdefault("dialects", []).extend(fields[2].strip().split())
if len(fields) > 3 and fields[3].strip():
current.setdefault("stem_flags", []).extend(fields[3].strip().split())
# Stem codes list (5th field, comma-separated)
if len(fields) > 4 and fields[4].strip():
codes = [c.strip() for c in fields[4].split(",") if c.strip()]
current.setdefault("stem_codes", []).extend(codes)
"""
================================ Label :suff ================================
Note: for all words in the Greek New Testament, this line contained no data.
Hence the code is not looking for any other information on this line.
"""
elif label == "suff":
# suff analysis: beta code and Unicode conversion
suff_segment = fields[0].strip()
if suff_segment:
current[f"suff{bc_ind}"] = suff_segment
if uc_itm: current["suff_uc"] = betacode_to_polytonic(suff_segment)
"""
================================ Label :end ================================
This line is of particular interest as it contains most of the morphological data.
"""
elif label == "end":
# Ending segment and its Unicode conversion
if fields[0]:
end_seg = fields[0].strip()
current[f"end{bc_ind}"] = end_seg
if uc_itm: current["end_uc"] = betacode_to_polytonic(end_seg)
# Tense, mood, voice, person, number, gender, case, degree, etc., from second field
if len(fields) > 1:
# Iterate over morphological tokens in the end field
if fields[1]:
for tok in fields[1].split():
tl = tok.lower() # Normalize token for consistent matching
# 1) Direct match in TOK_MAP: assign feature value
if tl in TOK_MAP:
key, val = TOK_MAP[tl]
# Special handling for keys that may have multiple values, like case/gender
if key in {"case", "gender"}:
if key in current:
# If already a list, append new value
if isinstance(current[key], list):
current[key].append(val)
else:
# Convert existing scalar to list with new value
current[key] = [current[key], val]
else:
# First occurrence: assign as scalar
current[key] = val
else:
# Regular case: assign directly
current[key] = val
# 2) Handle compound gender/case values (e.g., "masc/fem", "nom/voc")
elif "/" in tl:
parts = tl.split("/")
try:
# Try to map all parts using TOK_MAP
key_vals = [TOK_MAP[p] for p in parts]
# Ensure all mapped keys are the same (either all gender or all case)
keys = set(k for k, _ in key_vals)
if len(keys) == 1:
key = keys.pop()
values = [v for _, v in key_vals]
current.setdefault(key, []).extend(values)
else:
# Mixed keys are not valid; keep as unmatched
current.setdefault("other_end_tokens", []).append(tok)
except KeyError:
# One or more parts were unrecognized
current.setdefault("other_end_tokens", []).append(tok)
# 3) If the token is a person indicator (1st, 2nd, or 3rd)
elif re.fullmatch(r"[123](?:st|nd|rd)", tl):
current["person"] = tl
# 4) Anything else goes into a generic catch-all bucket
else:
current.setdefault("other_end_tokens", []).append(tok)
# Collect any trailing dialects and morphological flags/codes
if len(fields) > 2 and fields[2].strip():
current.setdefault("dialects", []).extend(fields[2].strip().split())
if len(fields) > 3 and fields[3].strip():
current.setdefault("end_flags", []).extend(fields[3].strip().split())
if len(fields) > 4 and fields[4].strip():
codes = [c.strip() for c in fields[4].split(",") if c.strip()]
current.setdefault("end_codes", []).extend(codes)
# populate pronoun type if applicable
for code in codes:
if code in PRON_MAP:
key, val = PRON_MAP[code]
current[key] = val
# lookup the first matching sec_tense value (if any)
sec_val = next(
(SECOND_TENSE_MAP[code]
for code in current.get("end_codes", [])
if code in SECOND_TENSE_MAP),
None
)
# only set the key if we found something
if sec_val is not None:
current["sec_tense"] = sec_val
"""
============================== post processing ====================================
A first level of post-processing and interpretation of the data is done in this section.
"""
# final processing
for p in parses:
# Remove duplicate entries for each parse, preserving insertion order
# Deduplicate dialect entries
if 'dialects' in p:
p['dialects'] = list(dict.fromkeys(p['dialects']))
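# dict.fromkeys keeps first-seen order, e.g.
# list(dict.fromkeys(["attic", "epic", "attic"])) -> ["attic", "epic"]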
# Handle ordinal markers like '1st','2nd','3rd' by setting person and removing them from other_end_tokens
other_tokens = p.get('other_end_tokens', [])
new_tokens = []
for tok in other_tokens:
m_ord = re.match(r'^([123])(?:st|nd|rd|th)$', tok.lower())
if m_ord:
# Assign person if not already set
if 'person' not in p:
p['person'] = {"1": "1st", "2": "2nd", "3": "3rd"}[m_ord.group(1)] # map the digit back to the canonical ordinal (TOK_MAP keys are '1st','2nd','3rd', not bare digits)
else:
new_tokens.append(tok)
# Update other_end_tokens with the filtered list
if new_tokens:
p['other_end_tokens'] = new_tokens
else:
p.pop('other_end_tokens', None)
return raw_beta, parses
# End of function parse_word_block().
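# Minimal usage sketch (the block below is a hypothetical, simplified
# Morpheus-style output; real blocks contain more lines and fields):
#
#   block = [
#       ":raw lo/gos",
#       ":workw lo/gos",
#       ":lem lo/gos",
#       ":stem lo/g\tmasc",
#       ":end os\tmasc nom sg",
#   ]
#   raw, parses = parse_word_block(block)
#   # raw == "lo/gos"; parses[0] then holds keys such as raw_bc, raw_uc,
#   # lem_base_bc, stem_gender, gender, case and number.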