Skill

Arabic NLP


name: arabic-nlp-skill
description: Egyptian Arabic dialect AI patterns for speech and language processing. Use when implementing (1) Arabic text preprocessing and normalization, (2) Egyptian dialect handling vs MSA, (3) Arabic tokenization strategies, (4) RTL text processing, (5) Arabic-optimized embeddings, (6) diacritics handling, or (7) code-switching detection. Triggers on Arabic NLP, Egyptian dialect, Arabic AI, Whisper Arabic, Arabic TTS, Arabic LLM.

Arabic NLP Patterns for Egyptian Dialect

Arabic Text Challenges

| Challenge | Example | Solution |
|---|---|---|
| Diacritics (تشكيل) | كَتَبَ vs كتب | Normalize or preserve based on task |
| Letter variants | ى vs ي, ة vs ه | Standardize to canonical form |
| Numbers | ٠١٢ vs 012 | Support both Arabic-Indic and Western |
| Hamza forms | أ إ آ ء | Normalize based on context |
| Elongation | مرحبااااا | Remove repeated characters |
| Dialectal spelling | ازيك vs كيف حالك | Map to canonical form or keep |

Text Normalization

import re
import unicodedata

def normalize_arabic(text: str, remove_diacritics: bool = True) -> str:
    """Return *text* in a canonical Arabic spelling suitable for NLP.

    Steps, in order:
      1. Optionally strip tashkeel (U+064B-U+065F and U+0670).
      2. Fold the hamza/madda alef variants (إ أ آ) into bare alef (ا).
      3. Rewrite teh marbuta (ة) as heh (ه) and alef maksura (ى) as yeh (ي).
      4. Drop tatweel / kashida (ـ).
      5. Collapse every whitespace run to one space and trim both ends.

    Args:
        text: Input string.
        remove_diacritics: When True (default) step 1 is applied; when False
            diacritics are left in place.

    Returns:
        The normalized string.
    """
    if remove_diacritics:
        text = re.sub(r'[\u064B-\u065F\u0670]', '', text)

    # The letter rules never feed into each other, so they can be applied
    # together in one translate() pass.
    letter_table = {
        0x0625: '\u0627',  # إ → ا
        0x0623: '\u0627',  # أ → ا
        0x0622: '\u0627',  # آ → ا
        0x0629: '\u0647',  # ة → ه
        0x0649: '\u064A',  # ى → ي
        0x0640: None,      # ـ (tatweel) removed
    }
    text = text.translate(letter_table)

    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

def normalize_numbers(text: str, to_western: bool = True) -> str:
    """Swap digits between Arabic-Indic (٠-٩) and Western (0-9) forms.

    Args:
        text: Input string; non-digit characters pass through untouched.
        to_western: True (default) rewrites Arabic-Indic digits as Western
            digits; False rewrites Western digits as Arabic-Indic digits.

    Returns:
        The string with digits converted in the requested direction.
    """
    digit_pairs = list(zip('٠١٢٣٤٥٦٧٨٩', '0123456789'))

    if to_western:
        table = {ord(indic): latin for indic, latin in digit_pairs}
    else:
        table = {ord(latin): indic for indic, latin in digit_pairs}

    return text.translate(table)

def remove_elongation(text: str, max_repeat: int = 2) -> str:
    """Cap runs of a repeated character at ``max_repeat`` occurrences.

    With the default ``max_repeat=2``: مرحبااااا → مرحباا.
    With ``max_repeat=1``: مرحبااااا → مرحبا.

    Digits (Western and Arabic-Indic) are never collapsed, so numbers such as
    ``10000`` are not corrupted. Runs do not span line breaks.

    Args:
        text: Input string.
        max_repeat: Maximum number of consecutive identical characters to
            keep. Must be at least 1.

    Returns:
        The text with over-long character runs shortened.

    Raises:
        ValueError: If ``max_repeat`` is less than 1 (0 would otherwise delete
            every character on each line).
    """
    if max_repeat < 1:
        raise ValueError(f"max_repeat must be >= 1, got {max_repeat}")

    # [^\d\n]: any character except digits (a repeated digit is data, not
    # emphasis) and newline (the original '.' did not match newline either).
    run = r'([^\d\n])\1{' + str(max_repeat) + r',}'
    return re.sub(run, lambda m: m.group(1) * max_repeat, text)

Egyptian Dialect Mapping

# Common Egyptian → MSA mappings.
#
# egyptian_to_msa() looks words up by their normalize_arabic() form (no
# diacritics, bare alef, ة → ه, ى → ي), so a key only matches if it is spelled
# in that normalized form. Values are the MSA replacements and may keep
# hamza/diacritics.
EGYPTIAN_TO_MSA = {
    # Pronouns
    'انت': 'أنت',
    'انتي': 'أنتِ',
    'احنا': 'نحن',
    'هم': 'هم',
    'ده': 'هذا',
    'دي': 'هذه',
    'دول': 'هؤلاء',
    
    # Greetings
    'ازيك': 'كيف حالك',
    'ازيكو': 'كيف حالكم',
    'اهلا': 'أهلاً',
    # Phrase-level key: egyptian_to_msa() looks up one whitespace-separated
    # word at a time, so this entry is only usable for direct lookups.
    'مساء الخير': 'مساء الخير',
    
    # Common verbs
    'عايز': 'أريد',
    'عايزه': 'أريد',
    'مش': 'ليس',
    'كده': 'هكذا',
    'خلاص': 'انتهى',
    'يعني': 'أي',
    
    # Question words
    'ايه': 'ما',
    'فين': 'أين',
    'ازاي': 'كيف',
    'ليه': 'لماذا',
    # Spelled with ى (U+0649); kept for direct lookups, but normalization
    # rewrites the final letter to ي, so the next entry is the one that
    # normalized input actually hits.
    'امتى': 'متى',
    'امتي': 'متى',
    'مين': 'من',
}

def egyptian_to_msa(text: str) -> str:
    """Replace known Egyptian dialect words with their MSA equivalents.

    The text is split on whitespace and each word is looked up in
    EGYPTIAN_TO_MSA by its normalize_arabic() form. Words without a mapping
    are kept exactly as written. Whitespace runs in the input are collapsed
    to single spaces in the output.

    Args:
        text: Input text, possibly containing Egyptian dialect words.

    Returns:
        The text with mapped words replaced, joined by single spaces.
    """
    # Words are normalized before lookup (ى → ي, ة → ه, hamza and diacritics
    # stripped), so the table keys must go through the same normalization;
    # otherwise a key such as 'امتى' could never match its own normalized
    # spelling. Rebuilt per call so edits to the table are picked up.
    lookup = {
        normalize_arabic(dialect): msa
        for dialect, msa in EGYPTIAN_TO_MSA.items()
    }
    return ' '.join(
        lookup.get(normalize_arabic(word), word) for word in text.split()
    )

Arabic Tokenization

from transformers import AutoTokenizer

# Arabic-specific tokenizers
# Maps a short alias (the `model_type` accepted by get_arabic_tokenizer) to the
# checkpoint id that is passed to AutoTokenizer.from_pretrained.
TOKENIZERS = {
    'arabert': 'aubmindlab/bert-base-arabertv02',
    'camelbert': 'CAMeL-Lab/bert-base-arabic-camelbert-mix',
    # NOTE(review): unlike the two BERT entries this id names a chat model;
    # confirm AutoTokenizer can load it without extra arguments.
    'jais': 'core42/jais-13b-chat',  # Arabic-optimized
}

def get_arabic_tokenizer(model_type: str = 'arabert'):
    """Return the pretrained tokenizer registered under *model_type*.

    Args:
        model_type: A key of TOKENIZERS (default ``'arabert'``).

    Returns:
        The object produced by ``AutoTokenizer.from_pretrained`` for the
        matching checkpoint id.

    Raises:
        KeyError: If *model_type* is not a key of TOKENIZERS.
    """
    checkpoint = TOKENIZERS[model_type]
    return AutoTokenizer.from_pretrained(checkpoint)

# For Egyptian dialect, CAMeLBERT trained on dialectal data works better
# NOTE: this statement runs when the module is executed, so the tokenizer is
# loaded via AutoTokenizer.from_pretrained at import time.
tokenizer = get_arabic_tokenizer('camelbert')

Whisper Arabic Configuration

import whisper

def transcribe_egyptian(audio_path: str, model_size: str = "large-v3") -> dict:
    """Run Whisper on an audio file with settings aimed at Egyptian Arabic.

    The model is loaded with ``whisper.load_model`` on every call.

    Args:
        audio_path: Path of the audio file handed to ``model.transcribe``.
        model_size: Whisper model name passed to ``whisper.load_model``.

    Returns:
        A dict with the ``"text"``, ``"segments"`` and ``"language"`` entries
        of the Whisper result.
    """
    decode_options = dict(
        language="ar",  # Arabic
        task="transcribe",
        # An Egyptian-dialect sentence as the initial prompt, to prime the
        # decoder toward dialectal output.
        initial_prompt="مرحبا، انا بتكلم عربي مصري",
        # Quality settings
        temperature=0.0,  # Deterministic
        compression_ratio_threshold=2.4,
        logprob_threshold=-1.0,
        no_speech_threshold=0.6,
        # Output settings
        word_timestamps=True,
        verbose=False,
    )

    whisper_model = whisper.load_model(model_size)
    transcription = whisper_model.transcribe(audio_path, **decode_options)

    wanted_fields = ("text", "segments", "language")
    return {field: transcription[field] for field in wanted_fields}

Arabic TTS (XTTS2)

from TTS.api import TTS

def synthesize_egyptian(
    text: str,
    reference_audio: str = None,
    output_path: str = "output.wav"
) -> str:
    """Write Arabic speech for *text* to *output_path* using XTTS v2.

    The text is first passed through ``normalize_arabic`` (diacritics kept)
    and ``remove_elongation``.

    Args:
        text: Text to speak.
        reference_audio: Optional path of a speaker sample; when truthy it is
            passed as ``speaker_wav`` for voice cloning.
        output_path: Destination file for the generated audio.

    Returns:
        *output_path*, unchanged.
    """
    # Diacritics carry pronunciation, so they are kept for TTS.
    cleaned = remove_elongation(normalize_arabic(text, remove_diacritics=False))

    engine = TTS("tts_models/multilingual/multi-dataset/xtts_v2")

    synth_kwargs = {
        "text": cleaned,
        "language": "ar",
        "file_path": output_path,
    }
    if reference_audio:
        # Voice cloning from the supplied sample.
        synth_kwargs["speaker_wav"] = reference_audio
    # NOTE(review): without reference_audio no speaker is specified at all;
    # confirm the installed TTS version accepts that for this model.
    engine.tts_to_file(**synth_kwargs)

    return output_path

Arabic Embeddings

from sentence_transformers import SentenceTransformer

# Arabic-optimized embedding models
# Maps the `model_name` alias accepted by get_arabic_embeddings to the model id
# passed to SentenceTransformer. An alias containing 'e5' makes
# get_arabic_embeddings add the E5 instruction prefix.
EMBEDDING_MODELS = {
    'multilingual-e5': 'intfloat/multilingual-e5-large',  # Best for Arabic
    'labse': 'sentence-transformers/LaBSE',  # Good for cross-lingual
    # NOTE(review): same checkpoint id as TOKENIZERS['camelbert']; confirm it
    # is suitable as a sentence-embedding model despite the 'sbert' alias.
    'arabic-sbert': 'CAMeL-Lab/bert-base-arabic-camelbert-mix',
}

def get_arabic_embeddings(
    texts: list[str],
    model_name: str = 'multilingual-e5',
    input_type: str = 'query',
) -> list:
    """Generate L2-normalized embeddings for Arabic text.

    Every text is passed through ``normalize_arabic`` before encoding. The
    model is loaded with ``SentenceTransformer`` on each call.

    Args:
        texts: Strings to embed.
        model_name: A key of EMBEDDING_MODELS.
        input_type: ``'query'`` (default, the previous behavior) or
            ``'passage'``. Only used for E5 models, which expect each input
            to start with ``"query: "`` or ``"passage: "``; use ``'passage'``
            when embedding documents for retrieval.

    Returns:
        One embedding (a list of floats) per input text, in input order.
        An empty input yields an empty list without loading the model.

    Raises:
        KeyError: If *model_name* is not a key of EMBEDDING_MODELS.
        ValueError: If *input_type* is not ``'query'`` or ``'passage'``.
    """
    if input_type not in ('query', 'passage'):
        raise ValueError(
            f"input_type must be 'query' or 'passage', got {input_type!r}"
        )

    # Resolve first so an unknown model name still fails on empty input.
    model_id = EMBEDDING_MODELS[model_name]

    if not texts:
        return []

    # Normalize texts first
    normalized = [normalize_arabic(t) for t in texts]

    # For E5 models, add the instruction prefix the model was trained with.
    if 'e5' in model_name:
        normalized = [f"{input_type}: {t}" for t in normalized]

    model = SentenceTransformer(model_id)
    embeddings = model.encode(normalized, normalize_embeddings=True)
    return embeddings.tolist()

Arabic LLM Prompting

# System prompts for Egyptian Arabic
# Keyed by the `style` argument of format_arabic_prompt. The prompt text is
# sent to the model as-is, so whitespace inside the strings is significant.
EGYPTIAN_SYSTEM_PROMPTS = {
    # Egyptian dialect: "You are a smart assistant who speaks Egyptian Arabic.
    # Answer questions in a friendly, simple way. Use the Egyptian dialect in
    # your replies."
    'assistant': """أنت مساعد ذكي بتتكلم عربي مصري. 
رد على الأسئلة بطريقة ودية وبسيطة.
استخدم اللهجة المصرية في الردود.""",

    # MSA: "You are a professional assistant who uses Modern Standard Arabic.
    # Give accurate and detailed answers."
    'formal': """أنت مساعد محترف يستخدم اللغة العربية الفصحى.
قدم إجابات دقيقة ومفصلة.""",

    # Egyptian dialect: "You are a customer-service agent who speaks Egyptian.
    # Help the customer kindly and respectfully. If you do not understand the
    # question, ask for clarification."
    'customer_service': """انت موظف خدمة عملاء بتتكلم مصري.
ساعد العميل بطريقة لطيفة ومحترمة.
لو مش فاهم السؤال، اطلب توضيح.""",
}

def format_arabic_prompt(
    user_message: str,
    system_prompt: str = None,
    style: str = 'assistant'
) -> list[dict]:
    """Build a two-message chat payload (system + user) for an Arabic LLM.

    Args:
        user_message: Content of the user turn.
        system_prompt: Explicit system prompt. When falsy (``None`` or an
            empty string) the prompt is taken from EGYPTIAN_SYSTEM_PROMPTS.
        style: Key into EGYPTIAN_SYSTEM_PROMPTS used when no explicit prompt
            is given; unknown styles fall back to the ``'assistant'`` prompt.

    Returns:
        ``[{"role": "system", ...}, {"role": "user", ...}]``.
    """
    if system_prompt:
        system = system_prompt
    else:
        system = EGYPTIAN_SYSTEM_PROMPTS.get(
            style, EGYPTIAN_SYSTEM_PROMPTS['assistant']
        )

    system_turn = {"role": "system", "content": system}
    user_turn = {"role": "user", "content": user_message}
    return [system_turn, user_turn]

Code-Switching Detection

import re

def detect_language_mix(text: str) -> dict:
    """Detect Arabic/English code-switching.

    Counts runs of Arabic-block characters (U+0600-U+06FF, U+0750-U+077F)
    that contain at least one letter, and runs of ASCII Latin letters.
    Arabic runs made only of punctuation, digits or diacritics (for example a
    lone ``،`` or ``؟``) are ignored, matching how Western digits and
    punctuation are not counted as Latin words.

    Args:
        text: Input text.

    Returns:
        A dict that always has the same four keys:
          - ``arabic_ratio`` / ``latin_ratio``: share of counted words in each
            script, rounded to 2 decimals (both 0 when nothing is counted).
          - ``mixed``: True when each script exceeds 10% of the words.
          - ``primary_language``: ``"ar"`` if Arabic words strictly outnumber
            Latin ones, otherwise ``"en"``; ``None`` when nothing is counted.
    """
    # Arabic Unicode range
    arabic_pattern = re.compile(r'[\u0600-\u06FF\u0750-\u077F]+')
    # Latin characters
    latin_pattern = re.compile(r'[a-zA-Z]+')

    # The Arabic block also holds punctuation, digits and combining marks;
    # only runs with a real letter count as words.
    arabic_words = [
        run for run in arabic_pattern.findall(text)
        if any(ch.isalpha() for ch in run)
    ]
    latin_words = latin_pattern.findall(text)

    total_words = len(arabic_words) + len(latin_words)

    if total_words == 0:
        return {
            "arabic_ratio": 0,
            "latin_ratio": 0,
            "mixed": False,
            "primary_language": None,
        }

    arabic_ratio = len(arabic_words) / total_words
    latin_ratio = len(latin_words) / total_words

    # Mixed if both languages present significantly
    mixed = arabic_ratio > 0.1 and latin_ratio > 0.1

    return {
        "arabic_ratio": round(arabic_ratio, 2),
        "latin_ratio": round(latin_ratio, 2),
        "mixed": mixed,
        "primary_language": "ar" if arabic_ratio > latin_ratio else "en",
    }

RTL Text Handling

def prepare_rtl_display(text: str) -> str:
    """Prefix each line that contains Arabic with a right-to-left mark.

    Lines are split on ``'\\n'``. A line gets U+200F (RIGHT-TO-LEFT MARK)
    prepended when it contains a character in the Arabic block
    (U+0600-U+06FF) or the Arabic Supplement block (U+0750-U+077F), the same
    ranges ``detect_language_mix`` treats as Arabic. Lines that already start
    with the mark are left alone, so calling the function repeatedly does not
    stack marks. Other lines are returned unchanged.

    Args:
        text: Possibly multi-line text.

    Returns:
        The text with the mark added where needed, lines re-joined by
        ``'\\n'``.
    """
    RTL_MARK = '\u200F'

    def _has_arabic(line: str) -> bool:
        return any(
            '\u0600' <= c <= '\u06FF' or '\u0750' <= c <= '\u077F'
            for c in line
        )

    return '\n'.join(
        RTL_MARK + line
        if _has_arabic(line) and not line.startswith(RTL_MARK)
        else line
        for line in text.split('\n')
    )

Production Checklist

  • Text normalization pipeline configured
  • Egyptian ↔ MSA mapping for search/retrieval
  • Arabic-optimized tokenizer selected
  • Whisper primed with Egyptian prompt
  • TTS preserves diacritics when needed
  • Embedding model supports Arabic well (E5/LaBSE)
  • LLM system prompts in Egyptian dialect
  • Code-switching handled gracefully
  • RTL display working in frontend
  • Numbers support both formats

ProYaro AI Infrastructure Documentation • Version 1.2