# elemes/load-test/content_parser.py
#
# 307 lines
# 11 KiB
# Python

#!/usr/bin/env python3
"""
Content Parser for Locust E2E Test Generation.
Scans the content/ directory, parses lesson markdown files,
extracts test-relevant data, and writes test_data.json.
Also injects a LOCUST_TEST token into tokens_siswa.csv if not present.
Usage (from elemes/load-test/):
python content_parser.py
python content_parser.py --content-dir ../../content --tokens-file ../../tokens_siswa.csv --num-tokens 50
"""
import argparse
import csv
import json
import os
import re
import sys
import uuid
# ---------------------------------------------------------------------------
# Marker extraction (mirrors lesson_service.py logic)
# ---------------------------------------------------------------------------
def extract_section(content: str, start_marker: str, end_marker: str) -> tuple[str, str]:
    """Extract the text delimited by start_marker and end_marker.

    Returns (extracted, remaining): `extracted` is the stripped text between
    the markers, `remaining` is `content` with the whole marker-delimited
    span removed.  If either marker is missing, or the end marker never
    occurs after the start marker, returns ("", content) unchanged.
    """
    start_idx = content.find(start_marker)
    if start_idx == -1:
        return "", content
    body_start = start_idx + len(start_marker)
    # Search for the end marker *after* the start marker so that a stray
    # occurrence of end_marker earlier in the document cannot shadow the
    # real section (the previous version scanned from position 0 and would
    # reject such content outright).
    end_idx = content.find(end_marker, body_start)
    if end_idx == -1:
        return "", content
    extracted = content[body_start:end_idx].strip()
    remaining = content[:start_idx] + content[end_idx + len(end_marker):]
    return extracted, remaining
def detect_lesson_type(content: str) -> str:
    """Classify a lesson by which editor/quiz markers its markdown contains.

    Precedence: arduino (explicit marker, or a Velxio circuit with neither
    C nor Python code), then quiz, then hybrid (C + circuit), then
    circuit-only, then Python-only, falling back to 'c'.
    """
    found = {
        'arduino': '---INITIAL_CODE_ARDUINO---' in content,
        'c': '---INITIAL_CODE---' in content,
        'python': '---INITIAL_PYTHON---' in content,
        'circuit': '---INITIAL_CIRCUIT---' in content,
        'quiz': '---INITIAL_QUIZ---' in content,
        'velxio': '---VELXIO_CIRCUIT---' in content,
    }
    if found['arduino']:
        return 'arduino'
    if found['velxio'] and not (found['c'] or found['python']):
        return 'arduino'
    if found['quiz']:
        return 'quiz'
    if found['c'] and found['circuit']:
        return 'hybrid'
    if found['circuit'] and not (found['c'] or found['python']):
        return 'circuit'
    if found['python'] and not found['c']:
        return 'python'
    return 'c'
def parse_lesson(filepath: str) -> dict:
    """Parse one lesson markdown file into a dict of test-relevant fields.

    Always emits the identity/flag fields; marker-delimited sections are
    added only when non-empty, in a fixed order.
    """
    with open(filepath, 'r', encoding='utf-8') as fh:
        raw = fh.read()

    # (field name, start marker, end marker) for every extractable section.
    section_spec = [
        ('initial_code_c', '---INITIAL_CODE---', '---END_INITIAL_CODE---'),
        ('initial_python', '---INITIAL_PYTHON---', '---END_INITIAL_PYTHON---'),
        ('initial_code_arduino', '---INITIAL_CODE_ARDUINO---', '---END_INITIAL_CODE_ARDUINO---'),
        ('velxio_circuit', '---VELXIO_CIRCUIT---', '---END_VELXIO_CIRCUIT---'),
        ('expected_output', '---EXPECTED_OUTPUT---', '---END_EXPECTED_OUTPUT---'),
        ('expected_output_python', '---EXPECTED_OUTPUT_PYTHON---', '---END_EXPECTED_OUTPUT_PYTHON---'),
        ('expected_serial', '---EXPECTED_SERIAL_OUTPUT---', '---END_EXPECTED_SERIAL_OUTPUT---'),
        ('expected_wiring', '---EXPECTED_WIRING---', '---END_EXPECTED_WIRING---'),
        ('key_text', '---KEY_TEXT---', '---END_KEY_TEXT---'),
        ('solution_code', '---SOLUTION_CODE---', '---END_SOLUTION_CODE---'),
        ('solution_python', '---SOLUTION_PYTHON---', '---END_SOLUTION_PYTHON---'),
    ]
    # Each section is extracted independently from the full content,
    # mirroring the original behavior (the "remaining" text is discarded).
    sections = {
        name: extract_section(raw, start, end)[0]
        for name, start, end in section_spec
    }

    data = {
        'slug': os.path.basename(filepath).replace('.md', ''),
        'type': detect_lesson_type(raw),
        'has_c': bool(sections['initial_code_c']),
        'has_python': bool(sections['initial_python']),
        'has_circuit': '---INITIAL_CIRCUIT---' in raw,
        'has_arduino': bool(sections['initial_code_arduino']),
        'has_velxio': bool(sections['velxio_circuit']),
        # A lesson is compilable if it has solution code or Arduino initial code.
        'compilable': bool(
            sections['solution_code']
            or sections['solution_python']
            or sections['initial_code_arduino']
        ),
        'key_text': sections['key_text'],
    }

    # Type-specific fields: emitted only when non-empty, in this fixed order.
    optional_order = (
        'initial_code_c', 'solution_code', 'initial_python', 'solution_python',
        'expected_output', 'expected_output_python', 'initial_code_arduino',
        'velxio_circuit', 'expected_serial', 'expected_wiring',
    )
    for name in optional_order:
        if sections[name]:
            data[name] = sections[name]
    return data
def get_ordered_slugs(content_dir: str) -> list[str]:
    """Return lesson slugs in the order listed in home.md.

    Looks for an 'Available_Lessons' separator (4-dash form first, then the
    3-dash fallback) and collects markdown link targets from the text after
    it.  Returns [] when home.md is missing or has no such section.
    """
    home_path = os.path.join(content_dir, 'home.md')
    if not os.path.exists(home_path):
        return []
    with open(home_path, 'r', encoding='utf-8') as fh:
        text = fh.read()
    for separator in ('----Available_Lessons----', '---Available_Lessons---'):
        pieces = text.split(separator)
        if len(pieces) > 1:
            # Deliberately only the text up to a possible second separator,
            # matching split(...)[1] semantics.
            lesson_section = pieces[1]
            break
    else:
        return []
    # Markdown links, optionally prefixed with 'lesson/': [title](lesson/file.md)
    matches = re.findall(r'\[([^\]]+)\]\((?:lesson/)?([^\)]+)\)', lesson_section)
    return [target.replace('.md', '') for _title, target in matches]
# ---------------------------------------------------------------------------
# Token injection
# ---------------------------------------------------------------------------
def ensure_test_tokens(tokens_file: str, lesson_slugs: list[str], num_tokens: int) -> list[str]:
    """
    Guarantee that num_tokens LOCUST_TEST_* tokens exist in the tokens CSV.

    Existing test tokens are reused first; any shortfall is appended as new
    rows (creating the file if needed) with every lesson column set to
    'not_started'.  Returns the list of token strings to use.
    """
    rows: list[dict] = []
    fieldnames: list[str] = []
    if os.path.exists(tokens_file):
        with open(tokens_file, 'r', newline='', encoding='utf-8') as fh:
            reader = csv.DictReader(fh, delimiter=';')
            fieldnames = list(reader.fieldnames or [])
            rows = list(reader)
    existing = [r['token'] for r in rows
                if r.get('token', '').startswith('LOCUST_TEST_')]
    if not fieldnames:
        # Fresh file: seed the header with one column per lesson.
        fieldnames = ['token', 'nama_siswa'] + lesson_slugs

    selected = existing[:num_tokens]
    shortfall = num_tokens - len(selected)
    if shortfall <= 0:
        print(f" ♻ Reusing {num_tokens} existing test tokens.")
        return selected

    fresh: list[str] = []
    for offset in range(shortfall):
        token = f"LOCUST_TEST_{uuid.uuid4().hex[:8]}"
        row = {'token': token, 'nama_siswa': f"Locust Bot {len(selected) + offset + 1}"}
        for slug in lesson_slugs:
            # Grow the header on the fly for lessons the CSV has never seen.
            if slug not in fieldnames:
                fieldnames.append(slug)
            row[slug] = 'not_started'
        fresh.append(token)
        rows.append(row)

    # Rewrite the whole file so old rows pick up any new lesson columns
    # (DictWriter fills missing cells with its default restval '').
    with open(tokens_file, 'w', newline='', encoding='utf-8') as fh:
        writer = csv.DictWriter(fh, fieldnames=fieldnames, delimiter=';')
        writer.writeheader()
        writer.writerows(rows)
    print(f" ✚ Injected {len(fresh)} new test tokens.")
    return selected + fresh
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def _get_teacher_token(tokens_file: str) -> str:
"""Get the teacher token (first data row)."""
if not os.path.exists(tokens_file):
return ""
with open(tokens_file, 'r', newline='', encoding='utf-8') as f:
reader = csv.DictReader(f, delimiter=';')
for row in reader:
return row.get('token', '')
return ""
def main():
    """Command-line entry point.

    Discovers lessons, parses each markdown file, ensures enough
    LOCUST_TEST_* tokens exist in the CSV, and writes the aggregated
    test data to a JSON file.
    """
    cli = argparse.ArgumentParser(description='Parse content/ for Locust test generation')
    cli.add_argument('--content-dir', default='../../content',
                     help='Path to content directory (default: ../../content)')
    cli.add_argument('--tokens-file', default='../../tokens_siswa.csv',
                     help='Path to tokens CSV (default: ../../tokens_siswa.csv)')
    cli.add_argument('--num-tokens', type=int, default=50,
                     help='Number of test tokens to generate (default: 50)')
    cli.add_argument('--output', default='test_data.json',
                     help='Output JSON file (default: test_data.json)')
    args = cli.parse_args()

    content_dir = os.path.abspath(args.content_dir)
    tokens_file = os.path.abspath(args.tokens_file)

    banner = '=' * 60
    print(f"\n{banner}")
    print(f" Elemes Content Parser for Locust E2E Testing")
    print(f"{banner}")
    print(f" Content dir : {content_dir}")
    print(f" Tokens file : {tokens_file}")
    print(f" Num tokens : {args.num_tokens}")
    print(f" Output : {args.output}")
    print()

    # 1. Lesson order comes from home.md; fall back to a directory scan.
    ordered_slugs = get_ordered_slugs(content_dir)
    if not ordered_slugs:
        ordered_slugs = [
            name.replace('.md', '')
            for name in sorted(os.listdir(content_dir))
            if name.endswith('.md') and name != 'home.md'
        ]
    print(f" 📚 Found {len(ordered_slugs)} lessons:")

    # 2. Parse every lesson file that actually exists on disk.
    icons = {
        'c': '🔧', 'python': '🐍', 'hybrid': '🔀',
        'circuit': '', 'arduino': '🤖', 'quiz': ''
    }
    lessons = []
    for slug in ordered_slugs:
        md_path = os.path.join(content_dir, f'{slug}.md')
        if not os.path.exists(md_path):
            print(f"{slug}.md not found, skipping")
            continue
        lesson = parse_lesson(md_path)
        lessons.append(lesson)
        icon = icons.get(lesson['type'], '📄')
        compilable = '✓ compile' if lesson.get('compilable') else '✗ compile'
        print(f" {icon} {slug} [{lesson['type']}] {compilable}")

    # 3. Inject/reuse test tokens in the CSV.
    print()
    tokens = ensure_test_tokens(tokens_file, ordered_slugs, args.num_tokens)

    # 4. Build the output payload; stats key order is kept stable so the
    #    emitted JSON matches the previous layout byte-for-byte.
    stats = {'total': len(lessons)}
    for lesson_type in ('c', 'python', 'hybrid', 'circuit', 'arduino', 'quiz'):
        stats[lesson_type] = sum(1 for l in lessons if l['type'] == lesson_type)
    stats['compilable'] = sum(1 for l in lessons if l.get('compilable'))
    test_data = {
        'generated_by': 'content_parser.py',
        'tokens': tokens,
        'teacher_token': _get_teacher_token(tokens_file),
        'lessons': lessons,
        'stats': stats,
    }

    # 5. Persist and summarize.
    with open(args.output, 'w', encoding='utf-8') as fh:
        json.dump(test_data, fh, indent=2, ensure_ascii=False)
    print(f"\n ✅ Wrote {args.output}")
    print(f" {test_data['stats']['total']} lessons "
          f"({test_data['stats']['compilable']} compilable)")
    print(f"\n Next: locust -f locustfile.py")
    print(f"{banner}\n")
# Run the parser only when executed as a script, not on import.
if __name__ == '__main__':
    main()