# Prepare Text-Critical Greek New Testament data

https://ebible.org/details.php?id=grctcgnt

The Text-Critical Greek New Testament is based upon The New Testament in the Original Greek: Byzantine Textform 2018, compiled and arranged by Maurice A. Robinson and William G. Pierpont.

## Data preparation

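This script preprocesses the data for the Gospel of John and generates three output files:
- `TCGNT-John.txt`: a text file containing the complete normalized text as one continuous string without line breaks.
- `TCGNT-John-tagged.txt`: a text file where each line represents a single verse, consisting of its reference tag, a tab, and the verse content.
- `TCGNT-John.json`: a JSON file containing a list of objects, one per verse, each with a `tag` and a `text` field.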

In [1]:
import os
import unicodedata
import json
from itertools import islice  # Import islice to skip lines efficiently

def normalize(string, chars_to_remove=None):
    """
    Return a lowercase, diacritic-free version of the input string.

    Both apostrophe variants handled here are mapped to GREEK KORONIS
    (code point 8125), the string is decomposed with NFD, and every
    non-spacing mark (Unicode category 'Mn') is dropped. Characters listed
    in ``chars_to_remove`` are dropped as well.

    Args:
        string (str): The input string to normalize
        chars_to_remove (list, optional): Characters to strip from the
            result; any container supporting ``in`` works (a plain string
            of characters is accepted too)

    Returns:
        str: The normalized string
    """
    # One translation table covers both apostrophe variants in a single pass.
    apostrophe_map = str.maketrans({"’": "᾽", "ʼ": "᾽"})
    lowered = string.lower().translate(apostrophe_map)

    # NFD splits each accented letter into base letter + combining marks,
    # so the marks can be filtered out by category below.
    decomposed = unicodedata.normalize('NFD', lowered)

    # An empty tuple stands in for "nothing extra to remove".
    unwanted = () if chars_to_remove is None else chars_to_remove
    kept = [
        ch for ch in decomposed
        if unicodedata.category(ch) != 'Mn' and ch not in unwanted
    ]
    return ''.join(kept)
    
# Directory holding the source files, and the filename prefix that selects
# the files to process (one file is treated as one chapter below).
directory = r'source'
prefix = 'grctcgnt_073_JHN'

# Regular files in the directory whose names start with the prefix, sorted
# alphabetically. Chapter numbers are assigned from this sort order, so the
# filenames must sort in chapter order.
file_list = sorted(
    f for f in os.listdir(directory)
    if f.startswith(prefix) and os.path.isfile(os.path.join(directory, f))
)

# Accumulators: plain verse texts, "tag<TAB>text" lines, and JSON records.
all_line_parts = []
tagged_line_parts = []
json_items = []

# Process each file; the chapter number is the file's 1-based position.
for chapter, filename in enumerate(file_list, start=1):
    file_path = os.path.join(directory, filename)
    with open(file_path, 'r', encoding='utf-8') as file:
        # Skip the first two lines of the file; every remaining line is
        # counted as one verse, numbered from 1.
        for verse, line in enumerate(islice(file, 2, None), start=1):
            # Normalize the line, strip any byte-order mark (U+FEFF, written
            # as an escape so the character is visible in the source), and
            # drop the trailing newline.
            line_content = normalize(line, '\ufeff').rstrip('\n')

            # Collect the cleaned line.
            all_line_parts.append(line_content)

            # Tag layout: fixed prefix "43" (presumably the book number of
            # John - confirm), then 3-digit chapter and 3-digit verse.
            tag = f"43{chapter:03}{verse:03}"
            tagged_line_parts.append(f"{tag}\t{line_content}\n")

            # Matching JSON record for this verse.
            json_items.append({
                "tag": tag,
                "text": line_content
            })

# Concatenate the verse texts into one continuous string. No separator is
# inserted; word separation across verses relies on the whitespace already
# present at the end of each source line.
all_lines = ''.join(all_line_parts)

# Concatenate the tagged lines; each already ends with its own newline.
tagged_lines = ''.join(tagged_line_parts)

# Write the output files. Both values are already single strings, so they
# are written directly.
with open('TCGNT-John.txt', 'w', encoding='utf-8') as f:
    f.write(all_lines)

with open('TCGNT-John-tagged.txt', 'w', encoding='utf-8') as f:
    f.write(tagged_lines)

# Write the JSON data with indentation for readability; ensure_ascii=False
# keeps the Greek text as literal characters rather than \u escapes.
with open('TCGNT-John.json', 'w', encoding='utf-8') as f:
    json.dump(json_items, f, ensure_ascii=False, indent=4)

# Checking the results

In [2]:
# Show the first 300 characters of the continuous text. As the last
# expression in the cell, the slice is displayed as a quoted string.
all_lines[:300]

'εν αρχη ην ο λογος, και ο λογος ην προς τον θεον, και θεος ην ο λογος. ουτος ην εν αρχη προς τον θεον. παντα δι᾽ αυτου εγενετο, και χωρις αυτου εγενετο ουδε εν ο γεγονεν. εν αυτω ζωη ην, και η ζωη ην το φως των ανθρωπων, και το φως εν τη σκοτια φαινει, και η σκοτια αυτο ου κατελαβεν. εγενετο ανθρωπο'

In [3]:
# Print the code point and Unicode name of every character in a sample phrase
import unicodedata

chars = "εν αρχη δι᾽"
for char in chars:
    print(f"{ord(char)} {unicodedata.name(char)}")

949 GREEK SMALL LETTER EPSILON
957 GREEK SMALL LETTER NU
32 SPACE
945 GREEK SMALL LETTER ALPHA
961 GREEK SMALL LETTER RHO
967 GREEK SMALL LETTER CHI
951 GREEK SMALL LETTER ETA
32 SPACE
948 GREEK SMALL LETTER DELTA
953 GREEK SMALL LETTER IOTA
8125 GREEK KORONIS


In [4]:
# Preview the tagged text: its first 300 characters, one verse per line
print(tagged_lines[0:300])

43001001	εν αρχη ην ο λογος, και ο λογος ην προς τον θεον, και θεος ην ο λογος. 
43001002	ουτος ην εν αρχη προς τον θεον. 
43001003	παντα δι᾽ αυτου εγενετο, και χωρις αυτου εγενετο ουδε εν ο γεγονεν. 
43001004	εν αυτω ζωη ην, και η ζωη ην το φως των ανθρωπων, 
43001005	και το φως εν τη σκοτια φαινει


In [5]:
# Pretty-print the first two JSON records, keeping Greek characters unescaped
print(
    json.dumps(
        json_items[0:2],
        indent=4,
        ensure_ascii=False,
    )
)

[
    {
        "tag": "43001001",
        "text": "εν αρχη ην ο λογος, και ο λογος ην προς τον θεον, και θεος ην ο λογος. "
    },
    {
        "tag": "43001002",
        "text": "ουτος ην εν αρχη προς τον θεον. "
    }
]


# 5 - Notebook version details<a class="anchor" id="bullet5"></a>
##### [Back to ToC](#TOC)

<div style="float: left;">
  <table>
    <tr>
      <td><strong>Author</strong></td>
      <td>Tony Jurg</td>
    </tr>
    <tr>
      <td><strong>Version</strong></td>
      <td>1.0</td>
    </tr>
    <tr>
      <td><strong>Date</strong></td>
      <td>25 February 2025</td>
    </tr>
  </table>
</div>