# morphkit/parse_word_block
# SPDX-License-Identifier: CC-BY-4.0
# Copyright (c) 2025 Tony Jurg
__version__ = "0.0.1"
# import required packages
from typing import Dict, Any, List, Tuple
import beta_code
import re
import unicodedata
# bring in other sibling modules
from .analyse_pos import analyse_pos
from .analyse_morph_tag import analyse_morph_tag
# Note: The first part of this file deals with the conversion from betacode to unicode.
# Function parse_word_block() is in the second part.
# NORMALIZING UNICODE FOR USE WITH N1904-TF
# See also https://centerblc.github.io/N1904/characterencoding.html.
# Check unicode at https://unicode-explorer.com/search/
# The nine monotonic tonos chars (acute accent) to polytonic oxia chars
# See https://github.com/EzerIT/BibleOL/blob/master/techdoc/techdoc.pdf page 34
OXIA_MAP = {
"\u03AC": "\u1F71", # ά (alpha with oxia)
"\u03AD": "\u1F73", # έ (epsilon with oxia)
"\u03AE": "\u1F75", # ή (eta with oxia)
"\u03AF": "\u1F77", # ί (iota with oxia)
"\u03CC": "\u1F79", # ό (omicron with oxia)
"\u03CD": "\u1F7B", # ύ (upsilon with oxia)
"\u03CE": "\u1F7D", # ώ (omega with oxia)
"\u0390": "\u1FD3", # ΐ (iota dial. with oxia)
"\u03B0": "\u1FE3", # ΰ (UPSILON with oxia)
}
def _apply_oxia_map(text: str) -> str:
"""Swap out any of the nine tonos chars for their oxia counterparts."""
return "".join(OXIA_MAP.get(ch, ch) for ch in text)
def betacode_to_polytonic(betacode_str: str, debug: bool = False) -> str:
"""
Convert a Betacode string into Greek Unicode using precomposed polytonic characters.
The function will:
1. turn Betacode into monotonic Greek,
2. NFC-normalize the result,
3. remap the nine tonos characters to their oxia counterparts using the predefined table.
"""
# perform the betacode to unicode conversion and deal with the caret (^), see comment below.
greek = beta_code.beta_code_to_greek(betacode_str).replace('^', chr(0x302))
# normalize so that any decomposed sequences get composed
norm = unicodedata.normalize("NFC", greek)
# apply the custom table to map tonos into oxia.
oxia = _apply_oxia_map(norm)
if debug:
print (f"[parse_word_block:betacode_to_polytonic] {betacode_str=} {greek=} {oxia=}")
return oxia
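# Sketch of the pipeline above (output shape is an assumption; the exact
# codepoints produced by the beta_code package may vary between versions):
# betacode_to_polytonic("lo/gos") should yield "λόγος" in which the accented
# omicron is the precomposed oxia form U+1F79 rather than the tonos U+03CC.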
'''
Note: an issue that still needs to be fully sorted out:
When converting certain betacode words containing a caret (^), the caret is not converted into Unicode Greek.
The Betacode manual, https://stephanus.tlg.uci.edu/encoding/BCM.pdf (page 31), suggests that ^ should be
translated into a circumflex accent (caret). This can be done after the betacode conversion by appending
the method .replace('^', chr(0x302)), which places the caret above the previous letter.
If precomposed Unicode is required, we also need to convert the resulting Unicode to precomposed form.
The caret originates from the stem files used by Morpheus, for instance:
https://github.com/perseids-tools/morpheus-perseids/blob/01db3bb72ff405e94917884f5cc89f1052c49ff4/stemlib/Greek/stemsrc/nom.irreg#L1307
Another option might be to remove the caret before converting to Unicode.
'''
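# A small demonstration of the normalization issue described above, based on
# the Unicode composition rules: U+0302 (COMBINING CIRCUMFLEX ACCENT) does
# not compose with Greek base letters under NFC, because the precomposed
# Greek circumflex forms (e.g. U+1FB6 for alpha) canonically decompose to
# U+0342 (COMBINING GREEK PERISPOMENI) instead. Hence:
#
#   unicodedata.normalize("NFC", "\u03B1\u0302")  # stays 'α' + U+0302 (decomposed)
#   unicodedata.normalize("NFC", "\u03B1\u0342")  # composes to "\u1FB6" (ᾶ)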
def parse_word_block(block: List[str], language: str = 'greek', debug: bool = False) -> Tuple[str, List[Dict[str, Any]]]:
"""
Parse a single Morpheus output block of Beta-code lines into structured morphological data.
Each block corresponds to all analyses for one Greek form. Lines are labeled with prefixes
like `:raw`, `:lem`, `:stem`, `:end`, etc. This function walks through those labels,
extracts fields, and assembles a dictionary with morphological features.
Args:
-----
:block (List[str]): A list of lines (strings) from Morpheus output, each starting with a label
like `:raw`, `:lem`, `:stem`, or `:end`, followed by tab-separated fields.
Usually this is the output generated by the function :py:func:`~morphkit.get_word_blocks()`.
:language (str): Optional argument. Defaults to `greek`. The other option is `latin`.
:debug (bool): Optional argument. Defaults to `False`. If set to `True` the function prints some debug information.
Returns:
--------
:Tuple[str, List[Dict[str, Any]]]: A pair (raw_beta, parses)
The pair consists of:
:raw_beta (str): raw Beta-code form as returned by Morpheus (from the last `:raw` line).
:parses (List[Dict]): a list of parse dictionaries, one per analysis block. Each parse dictionary may contain keys such as:
- `"raw_bc"`: the original betacode word.
- `"workw_bc"`: the segment Morpheus analysed in betacode.
- `"lem_full_bc"`: the full lemma form (incl homonym or pl suffix) in betacode.
- grammatical features: `"case"`, `"number"`, `"gender"`, `"tense"`, `"mood"`, `"voice"`, `"person"`, `"degree"`.
- lists: `"morph_codes"`, `"morph_flags"`, `"dialects"`.
- computed fields such as `"pos"` (part of speech) and `"morph"` (SP tag).
Raises:
-------
:ValueError: If the language parameter is invalid (only 'greek' and 'latin' are allowed).
General notes:
--------------
.. _Bible Online Learner: https://learner.bible/
- Entries holding a Unicode representation of the betacode use precomposed characters (with oxia) in order to match their presentation in the N1904-TF dataset, which uses the same scheme as the `Bible Online Learner`_.
- Details on decoding the Morpheus output can be found at https://github.com/tonyjurg/Create_morpheus_TF_dataset/blob/main/Decoding_Morpheus_output/decode_output.md
"""
raw_beta = "unknown" # default raw beta code if not set
parses = [] # list to collect parse dictionaries
current = None # placeholder for the current parse entry
# a single map from normalized token → (key, value)
# This mapping ensures conformance to the Morpheus values
TOK_MAP = {
# tenses
"pres": ("tense","pres"),
"present": ("tense","pres"),
"imperf": ("tense","imperf"),
"imperfect": ("tense","imperf"),
"fut": ("tense","fut"),
"future": ("tense","fut"),
"aor": ("tense","aor"),
"aorist": ("tense","aor"),
"perf": ("tense","perf"),
"perfect": ("tense","perf"),
"plup": ("tense","plup"),
"pluperfect": ("tense","plup"),
# moods
"ind": ("mood","ind"),
"indicative": ("mood","ind"),
"subj": ("mood","subj"),
"subjunctive": ("mood","subj"),
"opt": ("mood","opt"),
"optative": ("mood","opt"),
"imperat": ("mood","imperat"),
"imperative": ("mood","imperat"),
"inf": ("mood","inf"),
"infinitive": ("mood","inf"),
"part": ("mood","part"),
"participle": ("mood","part"),
# voices
"act": ("voice","act"),
"active": ("voice","act"),
"mid": ("voice","mid"),
"middle": ("voice","mid"),
"pass": ("voice","pass"),
"passive": ("voice","pass"),
"mp": ("voice","mp"), # midlePassive
# numbers
"sg": ("number","sg"),
"pl": ("number","pl"),
"dual": ("number","dual"),
# degrees
"comp": ("degree","comp"),
"sup": ("degree","sup"),
# morphological particularities
"contr": ("morphpart","contr"),
"nu_movable": ("morphpart","nu_movable"),
"short_subj": ("morphpart","short_subj"),
"indeclform": ("morphpart","indeclform"),
"impersonal": ("morphpart","impersonal"),
# genders
"masc": ("gender", "masc"),
"fem": ("gender", "fem"),
"neut": ("gender", "neut"),
# cases
"nom": ("case", "nom"),
"acc": ("case", "acc"),
"gen": ("case", "gen"),
"dat": ("case", "dat"),
"abl": ("case", "abl"),
"voc": ("case", "voc"),
# person
"1st": ("person", "1st"),
"2nd": ("person", "2nd"),
"3rd": ("person", "3rd"),
}
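# Example lookups, directly from the table above:
#   TOK_MAP["aor"]  -> ("tense", "aor")
#   TOK_MAP["ind"]  -> ("mood", "ind")
#   TOK_MAP["masc"] -> ("gender", "masc")
# so a token stream like "aor ind act 3rd sg" fans out over five features.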
PRON_MAP = { # pronoun type
"art_adj" : ("pron_type", "personal"),
"pron_adj1" : ("pron_type", "demonstrative"),
"demonstr" : ("pron_type", "demonstrative"),
"relative" : ("pron_type", "relative"),
"pron1" : ("pron_type", "personal"),
"pron2" : ("pron_type", "personal"),
"pron3" : ("pron_type", "personal"),
"indef" : ("pron_type", "indefinite"),
"interrog" : ("pron_type", "interrogative"),
}
SECOND_TENSE_MAP = {
"aor2": "A",
"aor2_pass": "AP",
# so far no other second-tense values have been seen; if found, they can be added here
# A/AP are used so that any newly found second-tense values are easier to distinguish
}
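# Example: an analysis whose end_codes include "aor2_pass" will later get
# sec_tense = "AP" (see the next() lookup in the ':end' handling below).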
if debug:
import inspect
frame = inspect.currentframe().f_back
#print(f"[parse_word_block] called with: {frame.f_locals}") #provides huge (full) amount of details
# Tailor the output to language
if language not in ('greek','latin'):
raise ValueError(
f"[parse_word_block] Unknown language format {language!r}. "
"Choose from {'greek', 'latin'}."
)
if language == 'greek':
bc_ind='_bc'
uc_itm=True
else: # can only be latin
bc_ind=''
uc_itm=False
for line in block:
# Only process lines that begin with a colon (labels)
if not line.startswith(':'):
continue
# Extract label (e.g., 'raw', 'lem', etc.) and fields after the label
label = line.split()[0][1:]
fields = line[len(label)+2:].split('\t')
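# Example of the split above (hypothetical line shapes; real Morpheus output
# may differ): line = ":raw lo/gos" yields label = "raw" and
# fields = ["lo/gos"], while a tab-separated line ":end os\tmasc nom sg"
# yields label = "end" and fields = ["os", "masc nom sg"].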
"""
================================ Label :raw ================================
This item always contains data that typically reproduces the token as supplied in betacode.
The raw form of the word, as it was input. This may include elision (indicated with ').
Crane gives the example ἐπέμπετ᾽, which could stand for ἐπέμπετε ("you [pl] were sending")
or ἐπέμπετο ("s/he was being sent").
"""
if label == 'raw':
# Start a new parse: 'raw' gives the base word in beta code
raw_beta = fields[0].strip()
if uc_itm:
current = {'raw_bc': raw_beta}
current["raw_uc"] = betacode_to_polytonic(raw_beta)
else:
current = {'raw': raw_beta}
parses.append(current)
elif current is None:
# Skip analysis lines until a 'raw' label initializes a parse
continue
"""
================================ Label :workw ==============================
This item always contains data which represents the working token after basic normalisation.
In many instances, the raw and work words are identical.
"""
elif label == 'workw':
# Record the word form in context and its Unicode conversion
workw_token = fields[0].strip()
current[f"workw{bc_ind}"] = workw_token
if uc_itm: current["workw_uc"] = betacode_to_polytonic(workw_token)
"""
================================ Label :lem ================================
This item always contains data that represents the lemma associated with the rest of the grammatical
data in the analytic block.
"""
elif label == "lem":
# Full lemma entry which may include items like homonym number and '-pl' suffix
lemma_field = fields[0].strip()
current[f"lem_full{bc_ind}"] = lemma_field
# if debug is set, also print the lemma in betacode, unicode and the TF adaptations
if uc_itm: current["lem_full_uc"] = betacode_to_polytonic(lemma_field,debug)
# Check for and remove a '-pl' suffix
if lemma_field.endswith('-pl'):
current["lem_pl_suff"] = '1'
lemma_field = lemma_field[:-3] # remove the '-pl'
# Try to split lemma into base and homonym number (e.g. "logos2" → base: "logos", homonym: 2)
m = re.match(r"^(.*?)(\d+)$", lemma_field)
if m:
current[f"lem_base{bc_ind}"] = m.group(1) # Everything before the digits
current["lem_homonym"] = int(m.group(2)) # The digits (homonym number)
else:
current[f"lem_base{bc_ind}"] = lemma_field
# Convert the stripped lemma base from Betacode to Unicode Greek
if uc_itm: current["lem_base_uc"] = betacode_to_polytonic(current["lem_base_bc"])
"""
================================ Label :prvb ================================
This item is filled with details about an attached preposition (e.g., ἐν- or meta-) when applicable.
Furthermore, it may contain details about dialect and morphological flags (possibly related to the preposition?).
It can contain zero, one or two prepositions. E.g., διακατηλέγχετο ('diakathle/gxeto'; Acts 18:28) has two: dia/,kata/.
"""
elif label == "prvb":
# beta code and Unicode conversion
prvb_segment = fields[0].strip()
if prvb_segment:
# Split on commas, strip whitespace, drop any empties
preverbs = [p.strip() for p in re.split(r'\s*,\s*', prvb_segment) if p.strip()]
# Store the betacode **list**
current[f"prvb{bc_ind}"] = preverbs
# Convert each to Unicode (and normalize the circumflex)
if uc_itm: current["prvb_uc"] = [
betacode_to_polytonic(p)
for p in preverbs
]
# This part may need to be expanded!
# There is info available like ":prvb a)na/ apocope poetic "
# or ":prvb a)na/ root_preverb " or
# :prvb a)po/,a)na/ apocope poetic unasp_preverb
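# Example of the preverb split above (betacode taken from the comment in the
# block description): prvb_segment = "dia/,kata/" yields
# preverbs = ["dia/", "kata/"], each of which is then converted to Unicode.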
"""
================================ Label :aug1 ================================
This item is filled with details about augment / reduplication, dialect, and morph-flags
when applicable
"""
elif label == "aug1":
# beta code and Unicode conversion
aug1_segment = fields[0].strip()
if aug1_segment:
current[f"aug1{bc_ind}"] = aug1_segment
if uc_itm: current["aug1_uc"] = betacode_to_polytonic(aug1_segment)
# Maybe we need to expand this!
"""
================================ Label :stem ================================
This section provides the stem as determined by Morpheus.
"""
elif label == "stem":
# Stem analysis: beta code and Unicode conversion
stem_segment = fields[0].strip()
if stem_segment:
current[f"stem{bc_ind}"] = stem_segment
if uc_itm: current["stem_uc"] = betacode_to_polytonic(stem_segment)
# Grammatical tokens in second field: gender, number, case, or extra morph flags
if len(fields) > 1 and fields[1]:
for tok in fields[1].split():
if tok in {"masc", "fem", "neut"}:
current["stem_gender"] = TOK_MAP[tok][1] # TOK_MAP = key, val
elif tok in {"sg", "pl", "dual"}:
current["stem_number"] = TOK_MAP[tok][1]
elif tok in {"nom", "acc", "gen", "dat", "abl", "voc"}:
current["stem_case"] = TOK_MAP[tok][1]
else:
current.setdefault("stem_tok", []).append(tok)
# Additional dialect markers (3rd field) and flags (4th field)
if len(fields) > 2 and fields[2].strip():
current.setdefault("dialects", []).extend(fields[2].strip().split())
if len(fields) > 3 and fields[3].strip():
current.setdefault("stem_flags", []).extend(fields[3].strip().split())
# Stem codes list (5th field, comma-separated)
if len(fields) > 4 and fields[4].strip():
codes = [c.strip() for c in fields[4].split(",") if c.strip()]
current.setdefault("stem_codes", []).extend(codes)
"""
================================ Label :suff ================================
Note: for all words in the Greek New Testament, this line contained no data.
Hence the code is not looking for any other information on this line.
"""
elif label == "suff":
# suff analysis: beta code and Unicode conversion
suff_segment = fields[0].strip()
if suff_segment:
current[f"suff{bc_ind}"] = suff_segment
if uc_itm: current["suff_uc"] = betacode_to_polytonic(suff_segment)
"""
================================ Label :end ================================
This line is of particular interest as it contains most of the morphological data.
"""
elif label == "end":
# Ending segment and its Unicode conversion
if fields[0]:
end_seg = fields[0].strip()
current[f"end{bc_ind}"] = end_seg
if uc_itm: current["end_uc"] = betacode_to_polytonic(end_seg)
# Tense, mood, voice, person, number, gender, case, degree, etc., from second field
if len(fields) > 1:
# Iterate over morphological tokens in the end field
if fields[1]:
for tok in fields[1].split():
tl = tok.lower() # Normalize token for consistent matching
# 1) Direct match in TOK_MAP: assign feature value
if tl in TOK_MAP:
key, val = TOK_MAP[tl]
# Special handling for keys that may have multiple values, like case/gender
if key in {"case", "gender"}:
if key in current:
# If already a list, append new value
if isinstance(current[key], list):
current[key].append(val)
else:
# Convert existing scalar to list with new value
current[key] = [current[key], val]
else:
# First occurrence: assign as scalar
current[key] = val
else:
# Regular case: assign directly
current[key] = val
# 2) Handle compound gender/case values (e.g., "masc/fem", "nom/voc")
elif "/" in tl:
parts = tl.split("/")
try:
# Try to map all parts using TOK_MAP
key_vals = [TOK_MAP[p] for p in parts]
# Ensure all mapped keys are the same (either all gender or all case)
keys = set(k for k, _ in key_vals)
if len(keys) == 1:
key = keys.pop()
values = [v for _, v in key_vals]
current.setdefault(key, []).extend(values)
else:
# Mixed keys are not valid; keep as unmatched
current.setdefault("other_end_tokens", []).append(tok)
except KeyError:
# One or more parts were unrecognized
current.setdefault("other_end_tokens", []).append(tok)
# 3) If the token is a person indicator (1st, 2nd, or 3rd)
elif re.fullmatch(r"[123](?:st|nd|rd)", tl):
current["person"] = tl
# 4) Anything else goes into a generic catch-all bucket
else:
current.setdefault("other_end_tokens", []).append(tok)
# Collect any trailing dialects and morphological flags/codes
if len(fields) > 2 and fields[2].strip():
current.setdefault("dialects", []).extend(fields[2].strip().split())
if len(fields) > 3 and fields[3].strip():
current.setdefault("end_flags", []).extend(fields[3].strip().split())
if len(fields) > 4 and fields[4].strip():
codes = [c.strip() for c in fields[4].split(",") if c.strip()]
current.setdefault("end_codes", []).extend(codes)
# populate pronoun type if applicable
for code in codes:
if code in PRON_MAP:
key, val = PRON_MAP[code]
current[key] = val
# lookup the first matching sec_tense value (if any)
sec_val = next(
(SECOND_TENSE_MAP[code]
for code in current.get("end_codes", [])
if code in SECOND_TENSE_MAP),
None
)
# only set the key if we found something
if sec_val is not None:
current["sec_tense"] = sec_val
"""
============================== post processing ====================================
A first level of post-processing and interpretation of the data is done in this section.
"""
# final processing
for p in parses:
# Remove duplicate entries for each parse, preserving insertion order
# Deduplicate dialect entries
if 'dialects' in p:
p['dialects'] = list(dict.fromkeys(p['dialects']))
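# dict.fromkeys keeps first-seen order, e.g.
# list(dict.fromkeys(["attic", "epic", "attic"])) -> ["attic", "epic"]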
# Handle ordinal markers like '1st','2nd','3rd' by setting person and removing them from other_end_tokens
other_tokens = p.get('other_end_tokens', [])
new_tokens = []
for tok in other_tokens:
m_ord = re.match(r'^([123])(?:st|nd|rd|th)$', tok.lower())
if m_ord:
# Assign person if not already set
if 'person' not in p:
p['person'] = {"1": "1st", "2": "2nd", "3": "3rd"}[m_ord.group(1)] # map the digit back to the canonical ordinal (TOK_MAP keys are '1st','2nd','3rd', not bare digits)
else:
new_tokens.append(tok)
# Update other_end_tokens with the filtered list
if new_tokens:
p['other_end_tokens'] = new_tokens
else:
p.pop('other_end_tokens', None)
return raw_beta, parses
# End of function parse_word_block().
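# Minimal usage sketch (the block below is a hypothetical, simplified
# Morpheus-style output; real blocks contain more lines and fields):
#
#   block = [
#       ":raw lo/gos",
#       ":workw lo/gos",
#       ":lem lo/gos",
#       ":stem lo/g\tmasc",
#       ":end os\tmasc nom sg",
#   ]
#   raw, parses = parse_word_block(block)
#   # raw == "lo/gos"; parses[0] then holds keys such as raw_bc, raw_uc,
#   # lem_base_bc, stem_gender, gender, case and number.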