Skill: MLX Chat (Text Generation)

MLX Chat Integration Skill

Overview

This skill provides ready-to-use code for integrating MLX text generation into your applications.

Service: MLX FastAPI (Mac Mini)
Endpoint: http://localhost:8004/v1/chat/completions (internal) or http://10.0.0.188:8004/v1/chat/completions (network)
Protocol: HTTP POST
Response Time: 5-20 seconds (depending on max_tokens)
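
For reference, a minimal raw request against the endpoint looks like the sketch below; the request and response fields (prompt, max_tokens, text, conversation_id) are the same ones used throughout this skill.

// Minimal smoke test against the chat endpoint
const res = await fetch('http://localhost:8004/v1/chat/completions', {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: JSON.stringify({ prompt: 'Hello', max_tokens: 50 }),
});
const { text, conversation_id } = await res.json();
console.log(text, conversation_id);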


TypeScript/JavaScript Implementation

Basic Chat Client

interface MLXChatRequest {
  prompt: string;
  max_tokens?: number;
  temp?: number;
  top_p?: number;
  repetition_penalty?: number;
  system_prompt?: string;
  conversation_id?: string;
}

interface MLXChatResponse {
  text: string;
  conversation_id: string;
}

class MLXClient {
  private baseURL: string;
  private conversationId: string | null = null;

  constructor(baseURL: string = 'http://localhost:8004') {
    this.baseURL = baseURL;
  }

  async chat(options: MLXChatRequest): Promise<MLXChatResponse> {
    const payload: MLXChatRequest = {
      prompt: options.prompt,
      max_tokens: options.max_tokens ?? 256,
      temp: options.temp ?? 0.7,
      top_p: options.top_p ?? 0.9,
      repetition_penalty: options.repetition_penalty ?? 1.1,
      system_prompt: options.system_prompt || 'You are a helpful assistant.',
    };

    // Use existing conversation if available
    if (this.conversationId) {
      payload.conversation_id = this.conversationId;
    }

    const response = await fetch(`${this.baseURL}/v1/chat/completions`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify(payload),
    });

    if (!response.ok) {
      throw new Error(`MLX API error: ${response.status} ${response.statusText}`);
    }

    const data: MLXChatResponse = await response.json();

    // Store conversation ID for continuity
    this.conversationId = data.conversation_id;

    return data;
  }

  newConversation(): void {
    this.conversationId = null;
  }

  getCurrentConversationId(): string | null {
    return this.conversationId;
  }
}

// Usage Example
const mlx = new MLXClient();

// Simple chat
const response1 = await mlx.chat({
  prompt: 'ما هي عاصمة مصر؟', // "What is the capital of Egypt?"
  max_tokens: 100,
});
console.log(response1.text); // "عاصمة مصر هي القاهرة." ("The capital of Egypt is Cairo.")

// Continue conversation
const response2 = await mlx.chat({
  prompt: 'وما هو عدد سكانها؟', // "And what is its population?"
  max_tokens: 150,
});
console.log(response2.text); // "يبلغ عدد سكان القاهرة..." ("The population of Cairo is...")

// Start new conversation
mlx.newConversation();
const response3 = await mlx.chat({
  prompt: 'Hello, how are you?',
  system_prompt: 'You are a friendly English assistant.',
});

React Hook

import { useState, useCallback } from 'react';

interface UseMLXChatOptions {
  baseURL?: string;
  systemPrompt?: string;
  maxTokens?: number;
  temperature?: number;
}

interface Message {
  role: 'user' | 'assistant';
  content: string;
}

export function useMLXChat(options: UseMLXChatOptions = {}) {
  const [messages, setMessages] = useState<Message[]>([]);
  const [loading, setLoading] = useState(false);
  const [conversationId, setConversationId] = useState<string | null>(null);

  const sendMessage = useCallback(async (userMessage: string) => {
    // Add user message
    setMessages(prev => [...prev, { role: 'user', content: userMessage }]);
    setLoading(true);

    try {
      const response = await fetch(`${options.baseURL || 'http://localhost:8004'}/v1/chat/completions`, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({
          prompt: userMessage,
          max_tokens: options.maxTokens || 500,
          temp: options.temperature || 0.7,
          system_prompt: options.systemPrompt || 'You are a helpful assistant.',
          conversation_id: conversationId,
        }),
      });

      if (!response.ok) {
        throw new Error('MLX API request failed');
      }

      const data = await response.json();

      // Update conversation ID
      setConversationId(data.conversation_id);

      // Add assistant message
      setMessages(prev => [...prev, { role: 'assistant', content: data.text }]);

      return data.text;
    } catch (error) {
      console.error('MLX chat error:', error);
      throw error;
    } finally {
      setLoading(false);
    }
  }, [conversationId, options]);

  const clearHistory = useCallback(() => {
    setMessages([]);
    setConversationId(null);
  }, []);

  return {
    messages,
    loading,
    sendMessage,
    clearHistory,
  };
}

// Usage in Component
function ChatComponent() {
  const { messages, loading, sendMessage, clearHistory } = useMLXChat({
    systemPrompt: 'You are an expert Egyptian marketing assistant.',
    maxTokens: 500,
  });

  const handleSubmit = async (e: React.FormEvent) => {
    e.preventDefault();
    const input = e.currentTarget.querySelector<HTMLInputElement>('input');
    if (!input || !input.value.trim()) return;

    await sendMessage(input.value);
    input.value = '';
  };

  return (
    <div>
      <div className="messages">
        {messages.map((msg, i) => (
          <div key={i} className={msg.role}>
            {msg.content}
          </div>
        ))}
      </div>

      <form onSubmit={handleSubmit}>
        <input type="text" disabled={loading} />
        <button type="submit" disabled={loading}>
          {loading ? 'Sending...' : 'Send'}
        </button>
      </form>

      <button onClick={clearHistory}>Clear Chat</button>
    </div>
  );
}

Python Implementation

import requests
from typing import Optional, Dict, Any

class MLXClient:
    """Client for MLX FastAPI text generation service"""

    def __init__(self, base_url: str = "http://localhost:8004"):
        self.base_url = base_url
        self.conversation_id: Optional[str] = None

    def chat(
        self,
        prompt: str,
        max_tokens: int = 256,
        temperature: float = 0.7,
        top_p: float = 0.9,
        repetition_penalty: float = 1.1,
        system_prompt: str = "You are a helpful assistant.",
    ) -> Dict[str, Any]:
        """
        Send a chat message to MLX service

        Args:
            prompt: User's message
            max_tokens: Maximum tokens to generate
            temperature: Sampling temperature (0.0-2.0)
            top_p: Nucleus sampling parameter
            repetition_penalty: Penalty for repetition
            system_prompt: System instruction for the model

        Returns:
            dict with 'text' and 'conversation_id'
        """
        payload = {
            "prompt": prompt,
            "max_tokens": max_tokens,
            "temp": temperature,
            "top_p": top_p,
            "repetition_penalty": repetition_penalty,
            "system_prompt": system_prompt,
        }

        # Continue conversation if available
        if self.conversation_id:
            payload["conversation_id"] = self.conversation_id

        response = requests.post(
            f"{self.base_url}/v1/chat/completions",
            json=payload,
            timeout=120  # 2 minutes timeout
        )
        response.raise_for_status()

        data = response.json()
        self.conversation_id = data["conversation_id"]

        return data

    def new_conversation(self):
        """Start a new conversation"""
        self.conversation_id = None

    def get_conversation_id(self) -> Optional[str]:
        """Get current conversation ID"""
        return self.conversation_id


# Usage Example
if __name__ == "__main__":
    mlx = MLXClient()

    # First message
    response = mlx.chat(
        prompt="ما هي عاصمة مصر؟",
        max_tokens=100,
    )
    print(f"Assistant: {response['text']}")

    # Continue conversation
    response = mlx.chat(
        prompt="وما هو عدد سكانها تقريباً؟",
        max_tokens=150,
    )
    print(f"Assistant: {response['text']}")

    # Start new conversation
    mlx.new_conversation()
    response = mlx.chat(
        prompt="Write a product description for a moisturizer",
        system_prompt="You are an expert marketing copywriter.",
        temperature=0.8,  # More creative
    )
    print(f"Assistant: {response['text']}")

Advanced Features

Model Switching

async function switchMLXModel(modelName: string): Promise<void> {
  const response = await fetch(`http://localhost:8004/v1/models/switch/${modelName}`, {
    method: 'POST',
  });

  if (!response.ok) {
    throw new Error(`Failed to switch model: ${response.statusText}`);
  }

  const data = await response.json();
  console.log(data.message); // "Successfully switched to model: ..."

  // Note: Model switching takes 10-30 seconds
  // Consider showing loading state
}

// List available models
async function listMLXModels(): Promise<any> {
  const response = await fetch('http://localhost:8004/v1/models');
  return response.json();
}
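
Since a switch takes 10-30 seconds, it helps to wrap the call in a simple loading flag. The sketch below is illustrative; the flag and wrapper name are not part of the API.

let modelSwitching = false;

async function switchModelWithLoading(modelName: string): Promise<void> {
  modelSwitching = true; // drive a spinner or disable inputs while the model loads
  try {
    await switchMLXModel(modelName); // typically 10-30 seconds
  } finally {
    modelSwitching = false;
  }
}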

Conversation Management

// Get all conversations
async function listConversations(): Promise<any[]> {
  const response = await fetch('http://localhost:8004/v1/conversations');
  return response.json();
}

// Get specific conversation history
async function getConversation(conversationId: string): Promise<any> {
  const response = await fetch(`http://localhost:8004/v1/conversations/${conversationId}`);
  return response.json();
}

// Delete conversation
async function deleteConversation(conversationId: string): Promise<void> {
  await fetch(`http://localhost:8004/v1/conversations/${conversationId}`, {
    method: 'DELETE',
  });
}

// Clear conversation messages (keep conversation)
async function clearConversation(conversationId: string): Promise<void> {
  await fetch(`http://localhost:8004/v1/conversations/${conversationId}/clear`, {
    method: 'POST',
  });
}
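
A small end-to-end example tying these helpers to the MLXClient above; it relies only on the conversation_id returned by chat(), not on any particular shape of the listing responses.

const mlx = new MLXClient();
const { conversation_id } = await mlx.chat({ prompt: 'Hello', max_tokens: 20 });

// Inspect the stored history for this conversation
const history = await getConversation(conversation_id);
console.log(history);

// Remove it when done and reset the client
await deleteConversation(conversation_id);
mlx.newConversation();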

Error Handling

class MLXError extends Error {
  constructor(
    message: string,
    public statusCode?: number,
    public details?: any
  ) {
    super(message);
    this.name = 'MLXError';
  }
}

async function safeMLXChat(
  client: MLXClient,
  prompt: string,
  options: Partial<MLXChatRequest> = {}
): Promise<string> {
  try {
    const response = await client.chat({ prompt, ...options });
    return response.text;
  } catch (error) {
    // fetch throws a TypeError when the service is unreachable
    if (error instanceof TypeError && error.message.includes('fetch')) {
      throw new MLXError(
        'MLX service is not accessible. Is it running?',
        503
      );
    }

    // MLXClient.chat throws a plain Error containing the HTTP status code
    if (error instanceof Error && error.message.includes('503')) {
      throw new MLXError(
        'MLX model is not loaded. Check service health.',
        503
      );
    }

    throw new MLXError(
      'MLX chat request failed',
      undefined,
      error instanceof Error ? error.message : error
    );
  }
}
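
Typical usage, surfacing a friendly message when the service is down or the model is not loaded:

const mlx = new MLXClient();

try {
  const text = await safeMLXChat(mlx, 'Summarize our return policy in one sentence.');
  console.log(text);
} catch (err) {
  if (err instanceof MLXError && err.statusCode === 503) {
    console.error('MLX is unavailable:', err.message);
  } else {
    throw err;
  }
}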

Best Practices

1. Parameter Tuning

// For factual content (documentation, answers)
const factualSettings = {
  temp: 0.2,
  top_p: 0.9,
  repetition_penalty: 1.2,
};

// For creative content (marketing, stories)
const creativeSettings = {
  temp: 0.9,
  top_p: 0.95,
  repetition_penalty: 1.1,
};

// For balanced use
const balancedSettings = {
  temp: 0.7,
  top_p: 0.9,
  repetition_penalty: 1.15,
};
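
Because the preset keys match MLXChatRequest, they spread directly into a request (the prompts below are only examples):

// Factual answer: low temperature, stronger repetition penalty
const answer = await mlx.chat({
  prompt: 'List the payment methods we support.',
  max_tokens: 150,
  ...factualSettings,
});

// Creative copy: higher temperature and top_p
const ad = await mlx.chat({
  prompt: 'Write a playful slogan for a summer sale.',
  max_tokens: 80,
  ...creativeSettings,
});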

2. Token Management

// Estimate tokens (rough approximation)
function estimateTokens(text: string): number {
  // English: ~4 chars per token
  // Arabic: ~2-3 chars per token
  return Math.ceil(text.length / 3.5);
}

// Adjust max_tokens based on use case
const settings = {
  shortAnswer: 100,      // Quick responses
  paragraph: 300,        // Detailed explanation
  article: 1000,         // Long-form content
  conversation: 500,     // Chat responses
};
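
A quick sketch combining the estimator and presets above; the counts are rough, so treat them as sanity checks rather than exact budgets.

const prompt = 'Write a detailed article about skincare routines for dry weather.';
console.log(`~${estimateTokens(prompt)} prompt tokens (rough estimate)`);

const article = await mlx.chat({
  prompt,
  max_tokens: settings.article, // long-form content
});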

3. System Prompts

const systemPrompts = {
  egyptianMarketing: `You are an expert Egyptian marketing copywriter.
    Write in Modern Standard Arabic or Egyptian dialect as appropriate.
    Use persuasive language and emotional appeals.`,

  technicalWriter: `You are a technical documentation writer.
    Be precise, clear, and concise.
    Use proper terminology and examples.`,

  customerSupport: `You are a helpful customer support agent.
    Be polite, empathetic, and solution-focused.
    Ask clarifying questions when needed.`,
};
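
Pass any of these as system_prompt on a request:

const copy = await mlx.chat({
  prompt: 'Write a short product description for a vitamin C serum.',
  system_prompt: systemPrompts.egyptianMarketing,
  max_tokens: 300,
});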

Performance Tips

  1. Reuse conversation IDs for multi-turn chats (more efficient)
  2. Set appropriate max_tokens - don't over-generate
  3. Use lower temperature for faster, more deterministic responses
  4. Monitor response times - adjust timeout based on max_tokens
  5. Cache frequently used responses when possible (see the caching sketch after this list)
  6. Avoid frequent model switching - it's slow (10-30s)
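
A minimal in-memory cache for repeated prompts, as referenced in tip 5; the key scheme and token limit are illustrative assumptions.

const responseCache = new Map<string, string>();

async function cachedChat(client: MLXClient, prompt: string): Promise<string> {
  const cached = responseCache.get(prompt);
  if (cached !== undefined) return cached;

  client.newConversation(); // cached prompts are standalone, not conversational
  const { text } = await client.chat({ prompt, max_tokens: 300 });
  responseCache.set(prompt, text);
  return text;
}

Only cache standalone prompts; replies that depend on conversation history should not be reused.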

Testing

// Health check before using
async function isMLXHealthy(): Promise<boolean> {
  try {
    const response = await fetch('http://localhost:8004/health', {
      signal: AbortSignal.timeout(5000), // fetch has no timeout option; use an abort signal
    });
    return response.ok;
  } catch {
    return false;
  }
}

// Test implementation
async function testMLXIntegration() {
  const mlx = new MLXClient();

  // Test health
  console.log('Testing MLX health...');
  const healthy = await isMLXHealthy();
  console.assert(healthy, 'MLX service should be healthy');

  // Test chat
  console.log('Testing chat...');
  const response = await mlx.chat({
    prompt: 'Say "Hello"',
    max_tokens: 10,
  });
  console.assert(response.text.length > 0, 'Should return text');
  console.assert(response.conversation_id, 'Should return conversation ID');

  // Test conversation continuity
  console.log('Testing conversation...');
  const response2 = await mlx.chat({
    prompt: 'What did I just say?',
    max_tokens: 20,
  });
  console.assert(
    response2.conversation_id === response.conversation_id,
    'Should maintain same conversation ID'
  );

  console.log('All tests passed!');
}

Skill Version: 1.0
Last Updated: 2025-01-01

ProYaro AI Infrastructure Documentation • Version 1.2