Skill
Arabic NLP
name: arabic-nlp-skill
description: Egyptian Arabic dialect AI patterns for speech and language processing. Use when implementing (1) Arabic text preprocessing and normalization, (2) Egyptian dialect handling vs MSA, (3) Arabic tokenization strategies, (4) RTL text processing, (5) Arabic-optimized embeddings, (6) diacritics handling, or (7) code-switching detection. Triggers on Arabic NLP, Egyptian dialect, Arabic AI, Whisper Arabic, Arabic TTS, Arabic LLM.
Arabic NLP Patterns for Egyptian Dialect
Arabic Text Challenges
| Challenge | Example | Solution |
|---|---|---|
| Diacritics (تشكيل) | كَتَبَ vs كتب | Normalize or preserve based on task |
| Letter variants | ى vs ي, ة vs ه | Standardize to canonical form |
| Numbers | ٠١٢ vs 012 | Support both Arabic-Indic and Western |
| Hamza forms | أ إ آ ء | Normalize based on context |
| Elongation | مرحبااااا | Collapse runs of repeated characters |
| Dialectal vocabulary | ازيك vs كيف حالك | Map to the MSA equivalent or keep as-is, depending on the task |
Text Normalization
import re
import unicodedata
def normalize_arabic(text: str, remove_diacritics: bool = True) -> str:
    """Return *text* with common Arabic orthographic variants unified.

    Args:
        text: Input string.
        remove_diacritics: When true, strip tashkeel marks (U+064B-U+065F
            and the superscript alef U+0670) before the other rules run.

    Returns:
        The text with alef variants folded to bare alef, teh marbuta
        rewritten as heh, alef maqsura rewritten as yeh, tatweel removed,
        whitespace runs collapsed to one space, and both ends trimmed.
    """
    if remove_diacritics:
        text = re.sub(r'[\u064B-\u065F\u0670]', '', text)

    # (pattern, replacement) pairs, applied in this order.
    substitutions = (
        (r'[إأآا]', 'ا'),  # alef variants -> bare alef
        (r'ة', 'ه'),  # teh marbuta -> heh
        (r'ى', 'ي'),  # alef maqsura -> yeh
        (r'ـ', ''),  # drop tatweel (kashida)
        (r'\s+', ' '),  # collapse whitespace runs
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()
def normalize_numbers(text: str, to_western: bool = True) -> str:
    """Translate digits in *text* between Arabic-Indic and Western forms.

    Args:
        text: Input string; non-digit characters pass through untouched.
        to_western: True maps Arabic-Indic digits to 0-9; False maps 0-9
            to Arabic-Indic digits.

    Returns:
        The text with digits rewritten in the requested direction.
    """
    eastern_digits = '٠١٢٣٤٥٦٧٨٩'
    ascii_digits = '0123456789'
    source, target = (
        (eastern_digits, ascii_digits)
        if to_western
        else (ascii_digits, eastern_digits)
    )
    return text.translate(str.maketrans(source, target))
def remove_elongation(text: str, max_repeat: int = 2) -> str:
    """Collapse runs of a repeated character down to ``max_repeat`` copies.

    With the default ``max_repeat=2``: مرحبااااا → مرحباا.  Pass
    ``max_repeat=1`` to reduce every run to a single character
    (مرحبااااا → مرحبا); note that this also collapses legitimately
    doubled letters.

    Digits (Western and Arabic-Indic) are never collapsed, so numeric
    values such as "1000" survive unchanged.  Newlines are left alone as
    well.

    Args:
        text: Input string.
        max_repeat: Maximum number of consecutive identical characters to
            keep. Must be at least 1.

    Returns:
        The text with over-long character runs shortened.

    Raises:
        ValueError: If ``max_repeat`` is smaller than 1 (a value of 0
            would otherwise delete the entire text).
    """
    if max_repeat < 1:
        raise ValueError("max_repeat must be at least 1")
    # [^\d\n]: any character except a decimal digit or a newline.
    pattern = r'([^\d\n])\1{' + str(max_repeat) + r',}'
    return re.sub(pattern, lambda run: run.group(1) * max_repeat, text)
Egyptian Dialect Mapping
# Lookup table: Egyptian dialect form (key) -> Modern Standard Arabic (value).
EGYPTIAN_TO_MSA = {
    # --- Pronouns and demonstratives ---
    "انت": "أنت",
    "انتي": "أنتِ",
    "احنا": "نحن",
    "هم": "هم",  # maps to itself
    "ده": "هذا",
    "دي": "هذه",
    "دول": "هؤلاء",
    # --- Greetings ---
    "ازيك": "كيف حالك",
    "ازيكو": "كيف حالكم",
    "اهلا": "أهلاً",
    # NOTE(review): multi-word key that maps to itself; a lookup done one
    # whitespace-separated word at a time cannot hit it.
    "مساء الخير": "مساء الخير",
    # --- Common verbs and particles ---
    "عايز": "أريد",
    "عايزه": "أريد",
    "مش": "ليس",
    "كده": "هكذا",
    "خلاص": "انتهى",
    "يعني": "أي",
    # --- Question words ---
    "ايه": "ما",
    "فين": "أين",
    "ازاي": "كيف",
    "ليه": "لماذا",
    # NOTE(review): this key ends in alef maqsura (ى), which
    # normalize_arabic rewrites to yeh (ي).
    "امتى": "متى",
    "مين": "من",
}
def egyptian_to_msa(text: str) -> str:
    """Replace known Egyptian dialect words in *text* with MSA equivalents.

    The text is split on whitespace and each token is looked up in
    ``EGYPTIAN_TO_MSA`` after passing through ``normalize_arabic``.
    Tokens without an entry are emitted unchanged (in their original,
    un-normalized spelling).

    Limitations: matching is per whitespace-separated token, so multi-word
    dictionary keys and tokens with attached punctuation are not replaced.

    Args:
        text: Input string.

    Returns:
        The converted tokens joined by single spaces.
    """
    # Normalize the dictionary keys the same way as the input tokens.
    # Otherwise a key that normalize_arabic would rewrite (e.g. one ending
    # in alef maqsura, which becomes yeh) could never be matched.
    lookup = {
        normalize_arabic(dialect): msa
        for dialect, msa in EGYPTIAN_TO_MSA.items()
    }
    return ' '.join(
        lookup.get(normalize_arabic(word), word) for word in text.split()
    )
Arabic Tokenization
from transformers import AutoTokenizer
# Short alias -> Hugging Face model ID used to load an Arabic tokenizer.
TOKENIZERS = {
    "arabert": "aubmindlab/bert-base-arabertv02",
    "camelbert": "CAMeL-Lab/bert-base-arabic-camelbert-mix",
    "jais": "core42/jais-13b-chat",  # Arabic-optimized
}
def get_arabic_tokenizer(model_type: str = 'arabert'):
    """Load the tokenizer registered under *model_type* in ``TOKENIZERS``.

    Args:
        model_type: A key of ``TOKENIZERS``.

    Returns:
        Whatever ``AutoTokenizer.from_pretrained`` returns for the
        registered model ID.

    Raises:
        KeyError: If *model_type* is not a key of ``TOKENIZERS``.
    """
    model_id = TOKENIZERS[model_type]
    return AutoTokenizer.from_pretrained(model_id)
# Example: for Egyptian dialect input, use the CAMeLBERT "mix" tokenizer
# (recommended here because it was trained on dialectal data).
tokenizer = get_arabic_tokenizer(model_type='camelbert')
Whisper Arabic Configuration
import whisper
def transcribe_egyptian(audio_path: str, model_size: str = "large-v3") -> dict:
    """Transcribe an Egyptian Arabic recording with Whisper.

    Args:
        audio_path: Path of the audio file handed to ``model.transcribe``.
        model_size: Whisper model name passed to ``whisper.load_model``.

    Returns:
        A dict with the ``"text"``, ``"segments"`` and ``"language"``
        entries of Whisper's transcription result.
    """
    decode_options = {
        "language": "ar",  # Arabic
        "task": "transcribe",
        # Egyptian-dialect sentence used to prime the decoder.
        "initial_prompt": "مرحبا، انا بتكلم عربي مصري",
        # Quality settings
        "temperature": 0.0,  # deterministic decoding
        "compression_ratio_threshold": 2.4,
        "logprob_threshold": -1.0,
        "no_speech_threshold": 0.6,
        # Output settings
        "word_timestamps": True,
        "verbose": False,
    }
    # The model is loaded on every call.
    model = whisper.load_model(model_size)
    result = model.transcribe(audio_path, **decode_options)
    return {key: result[key] for key in ("text", "segments", "language")}
Arabic TTS (XTTS2)
from TTS.api import TTS
def synthesize_egyptian(
    text: str,
    reference_audio: str = None,
    output_path: str = "output.wav",
    speaker: str = "Ana Florence",
) -> str:
    """Synthesize Egyptian Arabic speech with XTTS v2 and write it to a file.

    Args:
        text: Text to speak. It is passed through ``normalize_arabic``
            (diacritics kept) and ``remove_elongation`` first.
        reference_audio: Optional path to a reference recording; when
            given, the voice is cloned from it.
        output_path: Destination audio file.
        speaker: Name of a built-in XTTS speaker, used only when no
            ``reference_audio`` is supplied.

    Returns:
        ``output_path``, after the audio has been written.
    """
    # Keep diacritics: they carry pronunciation information for TTS.
    text = normalize_arabic(text, remove_diacritics=False)
    text = remove_elongation(text)

    # The model is loaded on every call.
    tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2")

    # XTTS v2 is a multi-speaker model, so every request must identify a
    # voice: either a clip to clone from or a built-in speaker name.
    # Calling it with neither is rejected by the TTS API.
    if reference_audio:
        voice = {"speaker_wav": reference_audio}
    else:
        voice = {"speaker": speaker}

    tts.tts_to_file(text=text, language="ar", file_path=output_path, **voice)
    return output_path
Arabic Embeddings
from sentence_transformers import SentenceTransformer
# Short alias -> model ID handed to SentenceTransformer.
EMBEDDING_MODELS = {
    "multilingual-e5": "intfloat/multilingual-e5-large",  # Best for Arabic
    "labse": "sentence-transformers/LaBSE",  # Good for cross-lingual
    # NOTE(review): same CAMeLBERT checkpoint as in TOKENIZERS; it does not
    # look like a sentence-embedding-specific model - confirm this is intended.
    "arabic-sbert": "CAMeL-Lab/bert-base-arabic-camelbert-mix",
}
def get_arabic_embeddings(
    texts: list[str],
    model_name: str = 'multilingual-e5',
    input_type: str = 'query',
) -> list:
    """Generate L2-normalized embeddings for Arabic text.

    Each text is passed through ``normalize_arabic`` before encoding.

    Args:
        texts: Strings to embed.
        model_name: A key of ``EMBEDDING_MODELS``.
        input_type: ``'query'`` or ``'passage'``. E5 models expect every
            input to carry a matching ``"query: "`` / ``"passage: "``
            prefix; use ``'passage'`` when embedding documents for
            retrieval. Ignored for non-E5 models.

    Returns:
        One embedding (a list of floats) per input text.

    Raises:
        ValueError: If *input_type* is neither ``'query'`` nor ``'passage'``.
        KeyError: If *model_name* is not a key of ``EMBEDDING_MODELS``.
    """
    if input_type not in ('query', 'passage'):
        raise ValueError("input_type must be 'query' or 'passage'")

    # Normalize texts first
    normalized = [normalize_arabic(t) for t in texts]

    # The model is loaded on every call.
    model = SentenceTransformer(EMBEDDING_MODELS[model_name])

    # For E5 models, add the instruction prefix that matches the input role.
    if 'e5' in model_name:
        normalized = [f"{input_type}: {t}" for t in normalized]

    embeddings = model.encode(normalized, normalize_embeddings=True)
    return embeddings.tolist()
Arabic LLM Prompting
# Ready-made system prompts, keyed by conversation style.
EGYPTIAN_SYSTEM_PROMPTS = {
    # Friendly general-purpose assistant answering in Egyptian dialect.
    "assistant": (
        "أنت مساعد ذكي بتتكلم عربي مصري.\n"
        "رد على الأسئلة بطريقة ودية وبسيطة.\n"
        "استخدم اللهجة المصرية في الردود."
    ),
    # Professional assistant answering in Modern Standard Arabic.
    "formal": (
        "أنت مساعد محترف يستخدم اللغة العربية الفصحى.\n"
        "قدم إجابات دقيقة ومفصلة."
    ),
    # Customer-service agent answering in Egyptian dialect.
    "customer_service": (
        "انت موظف خدمة عملاء بتتكلم مصري.\n"
        "ساعد العميل بطريقة لطيفة ومحترمة.\n"
        "لو مش فاهم السؤال، اطلب توضيح."
    ),
}
def format_arabic_prompt(
    user_message: str,
    system_prompt: str = None,
    style: str = 'assistant'
) -> list[dict]:
    """Build a two-message chat prompt (system + user) for an Arabic LLM.

    Args:
        user_message: Content of the user turn.
        system_prompt: Explicit system prompt. When it is ``None`` or an
            empty string, a prompt from ``EGYPTIAN_SYSTEM_PROMPTS`` is
            used instead.
        style: Key into ``EGYPTIAN_SYSTEM_PROMPTS``; unknown styles fall
            back to the ``'assistant'`` prompt.

    Returns:
        ``[{"role": "system", ...}, {"role": "user", ...}]``.
    """
    if system_prompt:
        system = system_prompt
    else:
        fallback = EGYPTIAN_SYSTEM_PROMPTS['assistant']
        system = EGYPTIAN_SYSTEM_PROMPTS.get(style, fallback)

    messages = [{"role": "system", "content": system}]
    messages.append({"role": "user", "content": user_message})
    return messages
Code-Switching Detection
import re
def detect_language_mix(text: str) -> dict:
    """Detect Arabic/English code-switching by counting script runs.

    A "word" is a maximal run of Arabic-script letters (including
    combining marks) or of ASCII Latin letters. Arabic punctuation and
    Arabic-Indic digits are not counted as Arabic words.

    Args:
        text: Input string.

    Returns:
        A dict with:
        - ``"arabic_ratio"`` / ``"latin_ratio"``: share of each script
          among the counted words, rounded to two decimals.
        - ``"mixed"``: True when both unrounded ratios exceed 0.1.
        - ``"primary_language"``: ``"ar"`` when Arabic words outnumber
          Latin ones, ``"en"`` otherwise (including ties), and ``None``
          when the text contains no countable words.
    """
    # Arabic letters and marks only. Skips U+0600-U+061F (signs and
    # punctuation such as the Arabic comma and question mark),
    # U+0660-U+066D (digits and numeric punctuation), U+06D4 (full stop)
    # and U+06F0-U+06F9 (extended digits).
    arabic_pattern = re.compile(
        r'[\u0620-\u065F\u066E-\u06D3\u06D5-\u06EF\u06FA-\u06FF\u0750-\u077F]+'
    )
    # Latin characters
    latin_pattern = re.compile(r'[a-zA-Z]+')

    arabic_count = len(arabic_pattern.findall(text))
    latin_count = len(latin_pattern.findall(text))
    total_words = arabic_count + latin_count

    if total_words == 0:
        # Same keys as the non-empty result so callers can index safely.
        return {
            "arabic_ratio": 0.0,
            "latin_ratio": 0.0,
            "mixed": False,
            "primary_language": None,
        }

    arabic_ratio = arabic_count / total_words
    latin_ratio = latin_count / total_words

    return {
        "arabic_ratio": round(arabic_ratio, 2),
        "latin_ratio": round(latin_ratio, 2),
        # Mixed if both languages are present significantly.
        "mixed": arabic_ratio > 0.1 and latin_ratio > 0.1,
        "primary_language": "ar" if arabic_ratio > latin_ratio else "en",
    }
RTL Text Handling
def prepare_rtl_display(text: str) -> str:
    """Prefix every line that contains Arabic with a right-to-left mark.

    Lines are separated on ``'\\n'``. A line gets U+200F (RIGHT-TO-LEFT
    MARK) prepended when it contains at least one Arabic-script character
    and does not already start with that mark, so calling the function
    repeatedly does not pile up marks. Other lines are left unchanged.

    Args:
        text: Input string, possibly multi-line and mixed-script.

    Returns:
        The text with the same line structure and the marks added.
    """
    RTL_MARK = '\u200F'
    # Inclusive code-point ranges treated as Arabic script.
    arabic_ranges = (
        ('\u0600', '\u06FF'),  # Arabic
        ('\u0750', '\u077F'),  # Arabic Supplement
        ('\u08A0', '\u08FF'),  # Arabic Extended-A
        ('\uFB50', '\uFDFF'),  # Arabic Presentation Forms-A
        ('\uFE70', '\uFEFC'),  # Arabic Presentation Forms-B
    )

    def has_arabic(line: str) -> bool:
        return any(
            low <= char <= high
            for char in line
            for low, high in arabic_ranges
        )

    result = []
    for line in text.split('\n'):
        if has_arabic(line) and not line.startswith(RTL_MARK):
            result.append(RTL_MARK + line)
        else:
            result.append(line)
    return '\n'.join(result)
Production Checklist
- Text normalization pipeline configured
- Egyptian ↔ MSA mapping for search/retrieval
- Arabic-optimized tokenizer selected
- Whisper primed with Egyptian prompt
- TTS preserves diacritics when needed
- Embedding model supports Arabic well (E5/LaBSE)
- LLM system prompts in Egyptian dialect
- Code-switching handled gracefully
- RTL display working in frontend
- Numbers support both formats
ProYaro AI Infrastructure Documentation • Version 1.2