# MLX Chat Integration Skill

**Skill:** MLX Chat (Text Generation)
## Overview
This skill provides ready-to-use code for integrating MLX text generation into your applications.
- **Service:** MLX FastAPI (Mac Mini)
- **Endpoint:** `http://localhost:8004/v1/chat/completions` (internal) or `http://10.0.0.188:8004/v1/chat/completions` (network)
- **Protocol:** HTTP POST
- **Response Time:** 5-20 seconds (depending on `max_tokens`)
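Since a completion can take up to 20 seconds, even a quick smoke test should carry an explicit timeout. Below is a minimal raw request using the Fetch API; the request and response fields mirror the schema used by the clients in this skill, and the 30-second budget is an arbitrary choice, not a documented service limit.

```typescript
// Minimal raw request against the chat endpoint.
// AbortSignal.timeout aborts the fetch if the service takes too long;
// scale the budget with max_tokens (rough heuristic, not a guarantee).
const res = await fetch('http://localhost:8004/v1/chat/completions', {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: JSON.stringify({ prompt: 'Hello', max_tokens: 64, temp: 0.7 }),
  signal: AbortSignal.timeout(30_000),
});
const { text, conversation_id } = await res.json();
console.log(text, conversation_id);
```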
## TypeScript/JavaScript Implementation
### Basic Chat Client

```typescript
interface MLXChatRequest {
  prompt: string;
  max_tokens?: number;
  temp?: number;
  top_p?: number;
  repetition_penalty?: number;
  system_prompt?: string;
  conversation_id?: string;
}

interface MLXChatResponse {
  text: string;
  conversation_id: string;
}

class MLXClient {
  private baseURL: string;
  private conversationId: string | null = null;

  constructor(baseURL: string = 'http://localhost:8004') {
    this.baseURL = baseURL;
  }

  async chat(options: MLXChatRequest): Promise<MLXChatResponse> {
    const payload: MLXChatRequest = {
      prompt: options.prompt,
      // ?? (not ||) so explicit zero values like temp: 0 are respected
      max_tokens: options.max_tokens ?? 256,
      temp: options.temp ?? 0.7,
      top_p: options.top_p ?? 0.9,
      repetition_penalty: options.repetition_penalty ?? 1.1,
      system_prompt: options.system_prompt ?? 'You are a helpful assistant.',
    };

    // Use existing conversation if available
    if (this.conversationId) {
      payload.conversation_id = this.conversationId;
    }

    const response = await fetch(`${this.baseURL}/v1/chat/completions`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify(payload),
    });

    if (!response.ok) {
      throw new Error(`MLX API error: ${response.statusText}`);
    }

    const data: MLXChatResponse = await response.json();

    // Store conversation ID for continuity
    this.conversationId = data.conversation_id;
    return data;
  }

  newConversation(): void {
    this.conversationId = null;
  }

  getCurrentConversationId(): string | null {
    return this.conversationId;
  }
}

// Usage Example
const mlx = new MLXClient();

// Simple chat
const response1 = await mlx.chat({
  prompt: 'ما هي عاصمة مصر؟', // "What is the capital of Egypt?"
  max_tokens: 100,
});
console.log(response1.text); // "عاصمة مصر هي القاهرة." ("The capital of Egypt is Cairo.")

// Continue conversation (reuses the stored conversation_id)
const response2 = await mlx.chat({
  prompt: 'وما هو عدد سكانها؟', // "And what is its population?"
  max_tokens: 150,
});
console.log(response2.text); // "يبلغ عدد سكان القاهرة..." ("The population of Cairo is...")

// Start new conversation
mlx.newConversation();
const response3 = await mlx.chat({
  prompt: 'Hello, how are you?',
  system_prompt: 'You are a friendly English assistant.',
});
```
### React Hook

```tsx
import { useState, useCallback } from 'react';

interface UseMLXChatOptions {
  baseURL?: string;
  systemPrompt?: string;
  maxTokens?: number;
  temperature?: number;
}

interface Message {
  role: 'user' | 'assistant';
  content: string;
}

export function useMLXChat(options: UseMLXChatOptions = {}) {
  const [messages, setMessages] = useState<Message[]>([]);
  const [loading, setLoading] = useState(false);
  const [conversationId, setConversationId] = useState<string | null>(null);

  const sendMessage = useCallback(async (userMessage: string) => {
    // Add user message
    setMessages(prev => [...prev, { role: 'user', content: userMessage }]);
    setLoading(true);

    try {
      const response = await fetch(
        `${options.baseURL ?? 'http://localhost:8004'}/v1/chat/completions`,
        {
          method: 'POST',
          headers: { 'Content-Type': 'application/json' },
          body: JSON.stringify({
            prompt: userMessage,
            max_tokens: options.maxTokens ?? 500,
            temp: options.temperature ?? 0.7,
            system_prompt: options.systemPrompt ?? 'You are a helpful assistant.',
            conversation_id: conversationId,
          }),
        }
      );

      if (!response.ok) {
        throw new Error('MLX API request failed');
      }

      const data = await response.json();

      // Update conversation ID
      setConversationId(data.conversation_id);

      // Add assistant message
      setMessages(prev => [...prev, { role: 'assistant', content: data.text }]);
      return data.text;
    } catch (error) {
      console.error('MLX chat error:', error);
      throw error;
    } finally {
      setLoading(false);
    }
  }, [conversationId, options]);

  const clearHistory = useCallback(() => {
    setMessages([]);
    setConversationId(null);
  }, []);

  return {
    messages,
    loading,
    sendMessage,
    clearHistory,
  };
}

// Usage in Component
function ChatComponent() {
  const { messages, loading, sendMessage, clearHistory } = useMLXChat({
    systemPrompt: 'You are an expert Egyptian marketing assistant.',
    maxTokens: 500,
  });

  const handleSubmit = async (e: React.FormEvent<HTMLFormElement>) => {
    e.preventDefault();
    const input = e.currentTarget.querySelector<HTMLInputElement>('input');
    if (!input || !input.value.trim()) return;
    await sendMessage(input.value);
    input.value = '';
  };

  return (
    <div>
      <div className="messages">
        {messages.map((msg, i) => (
          <div key={i} className={msg.role}>
            {msg.content}
          </div>
        ))}
      </div>
      <form onSubmit={handleSubmit}>
        <input type="text" disabled={loading} />
        <button type="submit" disabled={loading}>
          {loading ? 'Sending...' : 'Send'}
        </button>
      </form>
      <button onClick={clearHistory}>Clear Chat</button>
    </div>
  );
}
```
## Python Implementation

```python
import requests
from typing import Optional, Dict, Any


class MLXClient:
    """Client for the MLX FastAPI text generation service."""

    def __init__(self, base_url: str = "http://localhost:8004"):
        self.base_url = base_url
        self.conversation_id: Optional[str] = None

    def chat(
        self,
        prompt: str,
        max_tokens: int = 256,
        temperature: float = 0.7,
        top_p: float = 0.9,
        repetition_penalty: float = 1.1,
        system_prompt: str = "You are a helpful assistant.",
    ) -> Dict[str, Any]:
        """
        Send a chat message to the MLX service.

        Args:
            prompt: User's message
            max_tokens: Maximum tokens to generate
            temperature: Sampling temperature (0.0-2.0)
            top_p: Nucleus sampling parameter
            repetition_penalty: Penalty for repetition
            system_prompt: System instruction for the model

        Returns:
            dict with 'text' and 'conversation_id'
        """
        payload = {
            "prompt": prompt,
            "max_tokens": max_tokens,
            "temp": temperature,
            "top_p": top_p,
            "repetition_penalty": repetition_penalty,
            "system_prompt": system_prompt,
        }

        # Continue conversation if available
        if self.conversation_id:
            payload["conversation_id"] = self.conversation_id

        response = requests.post(
            f"{self.base_url}/v1/chat/completions",
            json=payload,
            timeout=120,  # 2-minute timeout
        )
        response.raise_for_status()

        data = response.json()
        self.conversation_id = data["conversation_id"]
        return data

    def new_conversation(self):
        """Start a new conversation."""
        self.conversation_id = None

    def get_conversation_id(self) -> Optional[str]:
        """Get the current conversation ID."""
        return self.conversation_id


# Usage Example
if __name__ == "__main__":
    mlx = MLXClient()

    # First message
    response = mlx.chat(
        prompt="ما هي عاصمة مصر؟",  # "What is the capital of Egypt?"
        max_tokens=100,
    )
    print(f"Assistant: {response['text']}")

    # Continue conversation
    response = mlx.chat(
        prompt="وما هو عدد سكانها تقريباً؟",  # "And roughly what is its population?"
        max_tokens=150,
    )
    print(f"Assistant: {response['text']}")

    # Start new conversation
    mlx.new_conversation()
    response = mlx.chat(
        prompt="Write a product description for a moisturizer",
        system_prompt="You are an expert marketing copywriter.",
        temperature=0.8,  # More creative
    )
    print(f"Assistant: {response['text']}")
```
## Advanced Features
### Model Switching

```typescript
async function switchMLXModel(modelName: string): Promise<void> {
  const response = await fetch(`http://localhost:8004/v1/models/switch/${modelName}`, {
    method: 'POST',
  });

  if (!response.ok) {
    throw new Error(`Failed to switch model: ${response.statusText}`);
  }

  const data = await response.json();
  console.log(data.message); // "Successfully switched to model: ..."

  // Note: model switching takes 10-30 seconds.
  // Consider showing a loading state.
}

// List available models
async function listMLXModels(): Promise<any> {
  const response = await fetch('http://localhost:8004/v1/models');
  return response.json();
}
```
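Because a switch can take 10-30 seconds, it may be useful to poll the `/health` endpoint until the service responds again before sending the next prompt. A sketch building on `switchMLXModel` above; the polling interval and attempt cap are arbitrary choices, not service requirements:

```typescript
// Switch models, then poll /health until the service answers again.
async function switchModelAndWait(modelName: string, maxAttempts = 15): Promise<void> {
  await switchMLXModel(modelName);
  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    try {
      const res = await fetch('http://localhost:8004/health', {
        signal: AbortSignal.timeout(5000),
      });
      if (res.ok) return; // service is back up with the new model
    } catch {
      // not ready yet; fall through to the delay below
    }
    await new Promise(resolve => setTimeout(resolve, 2000)); // wait 2s between polls
  }
  throw new Error(`Model switch to ${modelName} did not become healthy in time`);
}
```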
### Conversation Management

```typescript
// Get all conversations
async function listConversations(): Promise<any[]> {
  const response = await fetch('http://localhost:8004/v1/conversations');
  return response.json();
}

// Get a specific conversation's history
async function getConversation(conversationId: string): Promise<any> {
  const response = await fetch(`http://localhost:8004/v1/conversations/${conversationId}`);
  return response.json();
}

// Delete a conversation
async function deleteConversation(conversationId: string): Promise<void> {
  await fetch(`http://localhost:8004/v1/conversations/${conversationId}`, {
    method: 'DELETE',
  });
}

// Clear a conversation's messages (keep the conversation)
async function clearConversation(conversationId: string): Promise<void> {
  await fetch(`http://localhost:8004/v1/conversations/${conversationId}/clear`, {
    method: 'POST',
  });
}
```
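These helpers compose into housekeeping routines. Below is a sketch that deletes every conversation except a set you want to keep; it assumes each item returned by `listConversations` carries an `id` field, which this skill does not confirm, so verify the actual response shape first:

```typescript
// Delete all conversations except those explicitly kept.
// ASSUMPTION: each conversation object has an `id` field.
async function pruneConversations(keep: Set<string>): Promise<void> {
  const conversations = await listConversations();
  for (const conversation of conversations) {
    if (!keep.has(conversation.id)) {
      await deleteConversation(conversation.id);
    }
  }
}

// Example: keep only the client's current conversation
// const current = mlx.getCurrentConversationId();
// await pruneConversations(new Set(current ? [current] : []));
```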
### Error Handling

```typescript
class MLXError extends Error {
  constructor(
    message: string,
    public statusCode?: number,
    public details?: any
  ) {
    super(message);
    this.name = 'MLXError';
  }
}

async function safeMLXChat(
  client: MLXClient,
  prompt: string,
  options: Partial<MLXChatRequest> = {}
): Promise<string> {
  try {
    const response = await client.chat({ prompt, ...options });
    return response.text;
  } catch (error) {
    // fetch rejects with a TypeError when the service is unreachable
    if (error instanceof TypeError) {
      throw new MLXError('MLX service is not accessible. Is it running?', 503);
    }
    // MLXClient.chat throws a plain Error carrying the HTTP status text
    if (error instanceof Error && error.message.includes('Service Unavailable')) {
      throw new MLXError('MLX model is not loaded. Check service health.', 503);
    }
    throw new MLXError('MLX chat request failed', undefined, error);
  }
}
```
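Transient failures (a model mid-load, a brief network blip) can often be absorbed with a bounded retry. A sketch layered on `safeMLXChat`; the retry count and backoff schedule are illustrative defaults, not tuned values:

```typescript
// Retry with exponential backoff on 503-style failures.
async function chatWithRetry(
  client: MLXClient,
  prompt: string,
  options: Partial<MLXChatRequest> = {},
  maxRetries = 3
): Promise<string> {
  for (let attempt = 0; ; attempt++) {
    try {
      return await safeMLXChat(client, prompt, options);
    } catch (error) {
      const retriable = error instanceof MLXError && error.statusCode === 503;
      if (!retriable || attempt >= maxRetries) throw error;
      const delayMs = 1000 * 2 ** attempt; // 1s, 2s, 4s, ...
      await new Promise(resolve => setTimeout(resolve, delayMs));
    }
  }
}
```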
## Best Practices
### 1. Parameter Tuning

```typescript
// For factual content (documentation, answers)
const factualSettings = {
  temp: 0.2,
  top_p: 0.9,
  repetition_penalty: 1.2,
};

// For creative content (marketing, stories)
const creativeSettings = {
  temp: 0.9,
  top_p: 0.95,
  repetition_penalty: 1.1,
};

// For balanced use
const balancedSettings = {
  temp: 0.7,
  top_p: 0.9,
  repetition_penalty: 1.15,
};
```
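Because the preset keys match `MLXChatRequest`, a preset spreads directly into a chat call (this assumes the `mlx` client instance from the Basic Chat Client example):

```typescript
// Apply the creative preset by spreading it into the request
const tagline = await mlx.chat({
  prompt: 'Write a tagline for a skincare brand',
  max_tokens: 60,
  ...creativeSettings,
});
console.log(tagline.text);
```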
### 2. Token Management

```typescript
// Estimate tokens (rough approximation)
function estimateTokens(text: string): number {
  // English: ~4 chars per token
  // Arabic: ~2-3 chars per token
  return Math.ceil(text.length / 3.5);
}

// Adjust max_tokens based on use case
const settings = {
  shortAnswer: 100,   // Quick responses
  paragraph: 300,     // Detailed explanation
  article: 1000,      // Long-form content
  conversation: 500,  // Chat responses
};
```
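One way to use the estimate is as a pre-flight check before sending a long prompt. The 4,000-token budget below is an illustrative assumption, not a documented limit of the service; the real ceiling depends on the loaded model:

```typescript
// Pre-flight check: warn when the prompt alone may eat most of the budget.
// ASSUMPTION: a ~4000-token context window; verify against the loaded model.
function checkPromptBudget(prompt: string, maxTokens: number, contextWindow = 4000): void {
  const promptTokens = estimateTokens(prompt);
  if (promptTokens + maxTokens > contextWindow) {
    console.warn(
      `Prompt (~${promptTokens} tokens) + max_tokens (${maxTokens}) may exceed ` +
      `the assumed ${contextWindow}-token context window; consider trimming.`
    );
  }
}
```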
### 3. System Prompts

```typescript
const systemPrompts = {
  egyptianMarketing: `You are an expert Egyptian marketing copywriter.
Write in Modern Standard Arabic or Egyptian dialect as appropriate.
Use persuasive language and emotional appeals.`,

  technicalWriter: `You are a technical documentation writer.
Be precise, clear, and concise.
Use proper terminology and examples.`,

  customerSupport: `You are a helpful customer support agent.
Be polite, empathetic, and solution-focused.
Ask clarifying questions when needed.`,
};
```
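Each entry plugs straight into the `system_prompt` field (again assuming the `mlx` client from earlier):

```typescript
// Select a persona per request
const reply = await mlx.chat({
  prompt: 'Write a short product ad for a moisturizer',
  system_prompt: systemPrompts.egyptianMarketing,
});
```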
## Performance Tips
- Reuse conversation IDs for multi-turn chats (more efficient)
- Set appropriate max_tokens - don't over-generate
- Use lower temperature for faster, more deterministic responses
- Monitor response times - adjust timeout based on max_tokens
- Cache frequently used responses when possible (a sketch follows this list)
- Avoid frequent model switching - it's slow (10-30s)
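A minimal sketch of the caching tip: an in-memory Map keyed by the prompt plus the parameters that affect output. It resets conversation state before each call, so it only suits stateless one-shot prompts; deterministic, low-temperature calls benefit most:

```typescript
// In-memory cache for stateless, repeatable prompts.
const responseCache = new Map<string, string>();

async function cachedChat(client: MLXClient, request: MLXChatRequest): Promise<string> {
  // Key on everything that can change the output
  const key = JSON.stringify([
    request.prompt,
    request.system_prompt,
    request.temp,
    request.max_tokens,
  ]);
  const hit = responseCache.get(key);
  if (hit !== undefined) return hit;

  client.newConversation(); // keep cached calls independent of prior context
  const response = await client.chat(request);
  responseCache.set(key, response.text);
  return response.text;
}
```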
## Testing

```typescript
// Health check before using
async function isMLXHealthy(): Promise<boolean> {
  try {
    // fetch has no `timeout` option; use an AbortSignal instead
    const response = await fetch('http://localhost:8004/health', {
      signal: AbortSignal.timeout(5000),
    });
    return response.ok;
  } catch {
    return false;
  }
}

// Test implementation
async function testMLXIntegration() {
  const mlx = new MLXClient();

  // Test health
  console.log('Testing MLX health...');
  const healthy = await isMLXHealthy();
  console.assert(healthy, 'MLX service should be healthy');

  // Test chat
  console.log('Testing chat...');
  const response = await mlx.chat({
    prompt: 'Say "Hello"',
    max_tokens: 10,
  });
  console.assert(response.text.length > 0, 'Should return text');
  console.assert(response.conversation_id, 'Should return conversation ID');

  // Test conversation continuity
  console.log('Testing conversation...');
  const response2 = await mlx.chat({
    prompt: 'What did I just say?',
    max_tokens: 20,
  });
  console.assert(
    response2.conversation_id === response.conversation_id,
    'Should maintain same conversation ID'
  );

  console.log('All tests passed!');
}
```
Skill Version: 1.0 • Last Updated: 2025-01-01
ProYaro AI Infrastructure Documentation • Version 1.2