# Performance Tips

This guide provides strategies to optimize EasilyAI performance, reduce latency, manage costs, and improve overall efficiency.
## Model Selection

### Choose the Right Model for Your Task

Different models have different performance characteristics:
```python
from easilyai import create_app

# Fast models for simple tasks
fast_models = {
    "openai": "gpt-3.5-turbo",
    "anthropic": "claude-3-haiku-20240307",
    "gemini": "gemini-1.5-flash"
}

# Powerful models for complex tasks
powerful_models = {
    "openai": "gpt-4",
    "anthropic": "claude-3-opus-20240229",
    "gemini": "gemini-1.5-pro"
}

# Example: Use a fast model for simple Q&A
simple_app = create_app("FastApp", "openai", "your-key", "gpt-3.5-turbo")
response = simple_app.request("What is 2+2?")

# Use a powerful model for complex analysis
complex_app = create_app("PowerfulApp", "openai", "your-key", "gpt-4")
analysis = complex_app.request("Analyze the economic implications of AI automation")
```
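If you want to automate this choice, a minimal sketch of a complexity-based router is shown below. The `pick_model` helper and its word-count threshold are illustrative assumptions, not part of the EasilyAI API; in practice you would substitute your own notion of task complexity.

```python
from easilyai import create_app

# Hypothetical helper: route a prompt to a fast or powerful model using a
# crude length heuristic. Tune the threshold (or replace the heuristic
# entirely) for your workload.
def pick_model(prompt: str) -> tuple:
    if len(prompt.split()) < 20:          # short, simple prompts
        return "openai", "gpt-3.5-turbo"
    return "openai", "gpt-4"              # longer, more complex prompts

prompt = "Summarize the key drivers of cloud infrastructure costs"
service, model = pick_model(prompt)
app = create_app("RoutedApp", service, "your-key", model)
response = app.request(prompt)
```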
### Model Performance Comparison
```python
import time
from easilyai import create_app

def benchmark_models():
    models = [
        ("openai", "gpt-3.5-turbo"),
        ("openai", "gpt-4"),
        ("anthropic", "claude-3-haiku-20240307"),
        ("anthropic", "claude-3-sonnet-20240229")
    ]

    prompt = "Explain quantum computing in simple terms"
    results = []

    for service, model in models:
        try:
            app = create_app("BenchmarkApp", service, "your-key", model)

            start_time = time.time()
            response = app.request(prompt)
            end_time = time.time()

            results.append({
                "service": service,
                "model": model,
                "duration": end_time - start_time,
                "response_length": len(response)
            })
        except Exception as e:
            print(f"Error with {service} {model}: {e}")

    # Sort by duration
    results.sort(key=lambda x: x["duration"])

    print("Performance Benchmark Results:")
    print("-" * 50)
    for result in results:
        print(f"{result['service']} {result['model']}:")
        print(f"  Duration: {result['duration']:.2f}s")
        print(f"  Response Length: {result['response_length']} chars")
        print()

# Run the benchmark
benchmark_models()
```
## Caching Strategies

### Response Caching

Implement caching to avoid repeated API calls:
```python
import hashlib
import json
import time
from pathlib import Path

class ResponseCache:
    def __init__(self, cache_dir: str = "cache", ttl: int = 3600):
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(exist_ok=True)
        self.ttl = ttl  # Time to live in seconds

    def _cache_key(self, prompt: str, service: str, model: str, **kwargs) -> str:
        """Generate a unique cache key"""
        data = {
            "prompt": prompt,
            "service": service,
            "model": model,
            **kwargs
        }
        # Create a hash of the parameters
        data_str = json.dumps(data, sort_keys=True)
        return hashlib.md5(data_str.encode()).hexdigest()

    def get(self, prompt: str, service: str, model: str, **kwargs):
        """Get a cached response if available and not expired"""
        cache_key = self._cache_key(prompt, service, model, **kwargs)
        cache_file = self.cache_dir / f"{cache_key}.json"

        if cache_file.exists():
            with open(cache_file, 'r') as f:
                cached_data = json.load(f)

            # Check if the cache entry is still valid
            if time.time() - cached_data["timestamp"] < self.ttl:
                return cached_data["response"]

        return None

    def set(self, prompt: str, service: str, model: str, response: str, **kwargs):
        """Cache a response"""
        cache_key = self._cache_key(prompt, service, model, **kwargs)
        cache_file = self.cache_dir / f"{cache_key}.json"

        cache_data = {
            "response": response,
            "timestamp": time.time(),
            "prompt": prompt,
            "service": service,
            "model": model,
            "kwargs": kwargs
        }

        with open(cache_file, 'w') as f:
            json.dump(cache_data, f)

# Usage
from easilyai import create_app

class CachedAI:
    def __init__(self, cache_ttl: int = 3600):
        self.cache = ResponseCache(ttl=cache_ttl)

    def request(self, prompt: str, service: str, model: str, api_key: str, **kwargs):
        # Check the cache first
        cached_response = self.cache.get(prompt, service, model, **kwargs)
        if cached_response:
            print("Cache hit!")
            return cached_response

        # Make the API request
        print("Cache miss - making API request")
        app = create_app("CachedApp", service, api_key, model)
        response = app.request(prompt, **kwargs)

        # Cache the response
        self.cache.set(prompt, service, model, response, **kwargs)

        return response

# Usage
cached_ai = CachedAI(cache_ttl=1800)  # 30 minutes

# First request - will hit the API
response1 = cached_ai.request("What is Python?", "openai", "gpt-3.5-turbo", "your-key")

# Second request - will use the cache
response2 = cached_ai.request("What is Python?", "openai", "gpt-3.5-turbo", "your-key")
```
### In-Memory Caching

For faster access, use in-memory caching:
```python
from functools import lru_cache

class InMemoryCachedAI:
    def __init__(self, cache_size: int = 1000):
        self.cache_size = cache_size  # informational; lru_cache's maxsize is fixed at class definition
        self.cache_stats = {"hits": 0, "misses": 0}

    # lru_cache only accepts hashable arguments, so kwargs are passed in as a
    # sorted tuple of items rather than as **kwargs.
    @lru_cache(maxsize=1000)
    def _cached_request(self, prompt: str, service: str, model: str, api_key: str, kwargs_tuple: tuple):
        """Internal cached request method"""
        from easilyai import create_app
        app = create_app("InMemoryApp", service, api_key, model)
        return app.request(prompt, **dict(kwargs_tuple))

    def request(self, prompt: str, service: str, model: str, api_key: str, **kwargs):
        # Convert kwargs to a hashable format for caching
        kwargs_tuple = tuple(sorted(kwargs.items()))

        try:
            hits_before = self._cached_request.cache_info().hits
            response = self._cached_request(prompt, service, model, api_key, kwargs_tuple)
            # Use lru_cache's own counters to tell a hit from a miss
            if self._cached_request.cache_info().hits > hits_before:
                self.cache_stats["hits"] += 1
            else:
                self.cache_stats["misses"] += 1
            return response
        except TypeError:
            # If caching fails due to unhashable kwargs values, make a direct request
            from easilyai import create_app
            app = create_app("DirectApp", service, api_key, model)
            response = app.request(prompt, **kwargs)
            self.cache_stats["misses"] += 1
            return response

    def get_cache_stats(self):
        total = self.cache_stats["hits"] + self.cache_stats["misses"]
        hit_rate = (self.cache_stats["hits"] / total * 100) if total > 0 else 0
        return {
            "hits": self.cache_stats["hits"],
            "misses": self.cache_stats["misses"],
            "hit_rate": hit_rate
        }

# Usage
cached_ai = InMemoryCachedAI()

# Make several requests
for i in range(5):
    response = cached_ai.request("What is AI?", "openai", "gpt-3.5-turbo", "your-key")
    print(f"Request {i+1}: {response[:50]}...")

# Check cache performance
stats = cached_ai.get_cache_stats()
print(f"Cache hit rate: {stats['hit_rate']:.1f}%")
```
## Batch Processing

### Parallel Processing

Process multiple requests concurrently:
```python
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

from easilyai import create_app

class BatchProcessor:
    def __init__(self, max_workers: int = 5):
        self.max_workers = max_workers

    def process_batch_parallel(self, requests: list, delay: float = 0.1):
        """Process requests in parallel with rate limiting"""
        results = []

        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            # Submit all requests
            future_to_request = {}
            for i, request in enumerate(requests):
                future = executor.submit(self._process_single_request, request)
                future_to_request[future] = i

                # Add a delay between submissions for rate limiting
                if i > 0:
                    time.sleep(delay)

            # Collect results as they complete
            for future in as_completed(future_to_request):
                request_index = future_to_request[future]
                try:
                    result = future.result()
                    results.append((request_index, result))
                except Exception as e:
                    results.append((request_index, {
                        "success": False,
                        "error": str(e),
                        "prompt": requests[request_index]["prompt"]
                    }))

        # Sort results back into the original order
        results.sort(key=lambda x: x[0])
        return [result for _, result in results]

    def _process_single_request(self, request: dict):
        """Process a single request"""
        try:
            app = create_app(
                "BatchApp",
                request["service"],
                request["api_key"],
                request["model"]
            )
            response = app.request(request["prompt"], **request.get("kwargs", {}))
            return {
                "success": True,
                "response": response,
                "prompt": request["prompt"]
            }
        except Exception as e:
            return {
                "success": False,
                "error": str(e),
                "prompt": request["prompt"]
            }

# Usage
processor = BatchProcessor(max_workers=3)

requests = [
    {"prompt": "What is AI?", "service": "openai", "api_key": "your-key", "model": "gpt-3.5-turbo"},
    {"prompt": "What is ML?", "service": "openai", "api_key": "your-key", "model": "gpt-3.5-turbo"},
    {"prompt": "What is DL?", "service": "anthropic", "api_key": "your-key", "model": "claude-3-haiku-20240307"},
]

start_time = time.time()
results = processor.process_batch_parallel(requests)
end_time = time.time()

print(f"Processed {len(requests)} requests in {end_time - start_time:.2f} seconds")
for result in results:
    if result["success"]:
        print(f"✓ {result['prompt']}: {result['response'][:50]}...")
    else:
        print(f"✗ {result['prompt']}: {result['error']}")
```
## Rate Limiting

### Smart Rate Limiting

Implement intelligent rate limiting:
```python
import time
from collections import defaultdict, deque

class RateLimiter:
    def __init__(self):
        # Example per-service limits; adjust to match your actual API tier
        self.limits = {
            "openai": {"requests_per_minute": 60, "tokens_per_minute": 90000},
            "anthropic": {"requests_per_minute": 50, "tokens_per_minute": 100000},
            "gemini": {"requests_per_minute": 60, "tokens_per_minute": 120000}
        }
        self.request_times = defaultdict(deque)
        self.token_counts = defaultdict(deque)

    def can_make_request(self, service: str, estimated_tokens: int = 1000) -> bool:
        """Check if we can make a request without hitting rate limits"""
        if service not in self.limits:
            return True

        now = time.time()
        limits = self.limits[service]

        # Drop entries older than one minute
        minute_ago = now - 60

        while self.request_times[service] and self.request_times[service][0] < minute_ago:
            self.request_times[service].popleft()

        while self.token_counts[service] and self.token_counts[service][0][0] < minute_ago:
            self.token_counts[service].popleft()

        # Check the request rate limit
        if len(self.request_times[service]) >= limits["requests_per_minute"]:
            return False

        # Check the token rate limit
        current_tokens = sum(count for _, count in self.token_counts[service])
        if current_tokens + estimated_tokens > limits["tokens_per_minute"]:
            return False

        return True

    def record_request(self, service: str, tokens_used: int = 1000):
        """Record a request and its token usage"""
        now = time.time()
        self.request_times[service].append(now)
        self.token_counts[service].append((now, tokens_used))

    def wait_time(self, service: str) -> float:
        """Calculate how long to wait before making another request"""
        if service not in self.limits:
            return 0

        if not self.request_times[service]:
            return 0

        # Time until the oldest recorded request falls outside the window
        oldest_request = self.request_times[service][0]
        wait_time = 60 - (time.time() - oldest_request)
        return max(0, wait_time)

class RateLimitedAI:
    def __init__(self):
        self.rate_limiter = RateLimiter()

    def request(self, prompt: str, service: str, model: str, api_key: str, **kwargs):
        # Estimate tokens (rough approximation)
        estimated_tokens = len(prompt.split()) * 1.3

        # Wait if necessary
        if not self.rate_limiter.can_make_request(service, int(estimated_tokens)):
            wait_time = self.rate_limiter.wait_time(service)
            if wait_time > 0:
                print(f"Rate limit approached, waiting {wait_time:.1f} seconds...")
                time.sleep(wait_time)

        # Make the request
        from easilyai import create_app
        app = create_app("RateLimitedApp", service, api_key, model)
        response = app.request(prompt, **kwargs)

        # Record the request
        response_tokens = len(response.split()) * 1.3
        total_tokens = estimated_tokens + response_tokens
        self.rate_limiter.record_request(service, int(total_tokens))

        return response

# Usage
rate_limited_ai = RateLimitedAI()

# Make many requests - rate limiting is applied automatically
for i in range(10):
    response = rate_limited_ai.request(
        f"Tell me a fact about number {i}",
        "openai",
        "gpt-3.5-turbo",
        "your-key"
    )
    print(f"Response {i}: {response[:50]}...")
```
## Prompt Optimization

### Efficient Prompting

Optimize prompts for better performance:
```python
import re

class PromptOptimizer:
    def __init__(self):
        self.optimization_rules = [
            self._remove_redundancy,
            self._add_specificity,
            self._optimize_structure,
            self._add_output_format
        ]

    def optimize_prompt(self, prompt: str, task_type: str = "general") -> str:
        """Apply optimization rules to a prompt"""
        optimized = prompt
        for rule in self.optimization_rules:
            optimized = rule(optimized, task_type)
        return optimized

    def _remove_redundancy(self, prompt: str, task_type: str) -> str:
        """Remove redundant words and phrases"""
        redundant_phrases = [
            "please", "can you", "I would like you to", "could you",
            "if possible", "thank you"
        ]

        optimized = prompt
        for phrase in redundant_phrases:
            # Case-insensitive removal so "Please" and "please" are both caught
            optimized = re.sub(re.escape(phrase), "", optimized, flags=re.IGNORECASE)

        # Clean up extra spaces and stray punctuation left by the removals
        optimized = " ".join(optimized.split())
        optimized = re.sub(r"\s+([,.!?])", r"\1", optimized).strip(" ,")
        return optimized

    def _add_specificity(self, prompt: str, task_type: str) -> str:
        """Add specific instructions based on the task type"""
        if task_type == "code":
            return f"Generate code: {prompt}. Include comments and error handling."
        elif task_type == "creative":
            return f"Creative writing: {prompt}. Be vivid and engaging."
        elif task_type == "analysis":
            return f"Analyze and provide insights: {prompt}. Include specific examples."
        else:
            return prompt

    def _optimize_structure(self, prompt: str, task_type: str) -> str:
        """Optimize prompt structure"""
        # Add clear structure for long prompts
        if len(prompt) > 100:
            return f"Task: {prompt}\n\nFormat your response clearly with numbered points."
        return prompt

    def _add_output_format(self, prompt: str, task_type: str) -> str:
        """Add output format specifications"""
        if task_type == "list":
            return f"{prompt}\n\nProvide the answer as a numbered list."
        elif task_type == "brief":
            return f"{prompt}\n\nKeep the response concise (under 100 words)."
        elif task_type == "detailed":
            return f"{prompt}\n\nProvide a comprehensive, detailed response."
        else:
            return prompt

# Usage
optimizer = PromptOptimizer()

# Optimize different types of prompts
original_prompt = "Please can you tell me about Python programming if possible, thank you"
optimized_prompt = optimizer.optimize_prompt(original_prompt, "code")

print(f"Original: {original_prompt}")
print(f"Optimized: {optimized_prompt}")

# Use the optimized prompt
from easilyai import create_app
app = create_app("OptimizedApp", "openai", "your-key", "gpt-3.5-turbo")
response = app.request(optimized_prompt)
```
## Memory Management

### Efficient Memory Usage

Manage memory efficiently for large-scale applications:
```python
import gc
import weakref

class MemoryEfficientAI:
    def __init__(self):
        # Weakly referenced cache: app objects are dropped automatically once
        # nothing else holds a strong reference to them
        self._app_cache = weakref.WeakValueDictionary()
        self.max_cache_size = 100
        self.request_count = 0

    def get_app(self, service: str, api_key: str, model: str):
        """Get or create an app with weak-reference caching"""
        cache_key = f"{service}_{model}"

        if cache_key in self._app_cache:
            return self._app_cache[cache_key]

        from easilyai import create_app
        app = create_app(f"MemoryApp_{cache_key}", service, api_key, model)
        self._app_cache[cache_key] = app
        return app

    def request(self, prompt: str, service: str, model: str, api_key: str, **kwargs):
        """Make a request with automatic memory management"""
        app = self.get_app(service, api_key, model)
        response = app.request(prompt, **kwargs)

        # Periodic garbage collection
        self.request_count += 1
        if self.request_count % 50 == 0:
            self._cleanup_memory()

        return response

    def _cleanup_memory(self):
        """Perform memory cleanup"""
        # Force garbage collection
        gc.collect()

        # Clear the weak-reference cache if it gets too large
        if len(self._app_cache) > self.max_cache_size:
            self._app_cache.clear()

        print(f"Memory cleanup performed. Cache size: {len(self._app_cache)}")

# Usage
memory_efficient_ai = MemoryEfficientAI()

# Make many requests
for i in range(100):
    response = memory_efficient_ai.request(
        f"Question {i}: What is {i} squared?",
        "openai",
        "gpt-3.5-turbo",
        "your-key"
    )
    print(f"Response {i}: {response}")
```
## Performance Monitoring

### Performance Metrics

Track request latency and success rates per service and model:
```python
import time
import statistics
from collections import defaultdict
from typing import Any, Dict

class PerformanceMonitor:
    def __init__(self):
        self.metrics = defaultdict(list)
        self.start_times = {}

    def start_request(self, request_id: str):
        """Start timing a request"""
        self.start_times[request_id] = time.time()

    def end_request(self, request_id: str, service: str, model: str, success: bool = True):
        """End timing a request and record metrics"""
        if request_id not in self.start_times:
            return

        duration = time.time() - self.start_times[request_id]
        del self.start_times[request_id]

        metric_key = f"{service}_{model}"
        self.metrics[metric_key].append({
            "duration": duration,
            "success": success,
            "timestamp": time.time()
        })

    def get_performance_stats(self, service: str, model: str) -> Dict[str, Any]:
        """Get performance statistics for a service/model combination"""
        metric_key = f"{service}_{model}"
        data = self.metrics[metric_key]

        if not data:
            return {}

        durations = [d["duration"] for d in data]
        successes = [d["success"] for d in data]

        return {
            "total_requests": len(data),
            "success_rate": sum(successes) / len(successes) * 100,
            "avg_duration": statistics.mean(durations),
            "median_duration": statistics.median(durations),
            "min_duration": min(durations),
            "max_duration": max(durations),
            "std_duration": statistics.stdev(durations) if len(durations) > 1 else 0
        }

    def print_performance_report(self):
        """Print a comprehensive performance report"""
        print("Performance Report")
        print("=" * 50)

        for metric_key, data in self.metrics.items():
            service, model = metric_key.split("_", 1)
            stats = self.get_performance_stats(service, model)

            if stats:
                print(f"\n{service.upper()} {model}:")
                print(f"  Total Requests: {stats['total_requests']}")
                print(f"  Success Rate: {stats['success_rate']:.1f}%")
                print(f"  Average Duration: {stats['avg_duration']:.2f}s")
                print(f"  Median Duration: {stats['median_duration']:.2f}s")
                print(f"  Duration Range: {stats['min_duration']:.2f}s - {stats['max_duration']:.2f}s")
                print(f"  Standard Deviation: {stats['std_duration']:.2f}s")

class MonitoredAI:
    def __init__(self):
        self.monitor = PerformanceMonitor()
        self.request_counter = 0

    def request(self, prompt: str, service: str, model: str, api_key: str, **kwargs):
        """Make a monitored request"""
        request_id = f"req_{self.request_counter}"
        self.request_counter += 1

        self.monitor.start_request(request_id)

        try:
            from easilyai import create_app
            app = create_app("MonitoredApp", service, api_key, model)
            response = app.request(prompt, **kwargs)

            self.monitor.end_request(request_id, service, model, success=True)
            return response
        except Exception:
            self.monitor.end_request(request_id, service, model, success=False)
            raise

    def get_report(self):
        """Print the performance report"""
        self.monitor.print_performance_report()

# Usage
monitored_ai = MonitoredAI()

# Make various requests
models = [
    ("openai", "gpt-3.5-turbo"),
    ("openai", "gpt-4"),
    ("anthropic", "claude-3-haiku-20240307")
]

for service, model in models:
    for i in range(5):
        try:
            response = monitored_ai.request(
                f"Tell me about topic {i}",
                service,
                model,
                "your-key"
            )
            print(f"✓ {service} {model}: {response[:50]}...")
        except Exception as e:
            print(f"✗ {service} {model}: {e}")

# Generate the performance report
monitored_ai.get_report()
```
## Cost Optimization

### Cost Tracking

Track and optimize API costs:
```python
import time
from collections import defaultdict
from typing import Any, Dict

class CostTracker:
    def __init__(self):
        # Approximate costs per 1K tokens (as of 2024)
        self.costs = {
            "openai": {
                "gpt-3.5-turbo": {"input": 0.001, "output": 0.002},
                "gpt-4": {"input": 0.03, "output": 0.06},
                "gpt-4-turbo": {"input": 0.01, "output": 0.03}
            },
            "anthropic": {
                "claude-3-haiku-20240307": {"input": 0.00025, "output": 0.00125},
                "claude-3-sonnet-20240229": {"input": 0.003, "output": 0.015},
                "claude-3-opus-20240229": {"input": 0.015, "output": 0.075}
            }
        }
        self.usage_log = []

    def estimate_tokens(self, text: str) -> int:
        """Rough token estimation"""
        return int(len(text.split()) * 1.3)

    def calculate_cost(self, service: str, model: str, input_tokens: int, output_tokens: int) -> float:
        """Calculate the cost of a request"""
        if service not in self.costs or model not in self.costs[service]:
            return 0.0

        model_costs = self.costs[service][model]
        input_cost = (input_tokens / 1000) * model_costs["input"]
        output_cost = (output_tokens / 1000) * model_costs["output"]

        return input_cost + output_cost

    def log_request(self, service: str, model: str, prompt: str, response: str):
        """Log a request for cost tracking"""
        input_tokens = self.estimate_tokens(prompt)
        output_tokens = self.estimate_tokens(response)
        cost = self.calculate_cost(service, model, input_tokens, output_tokens)

        self.usage_log.append({
            "service": service,
            "model": model,
            "input_tokens": input_tokens,
            "output_tokens": output_tokens,
            "cost": cost,
            "timestamp": time.time()
        })

    def get_cost_summary(self) -> Dict[str, Any]:
        """Get a cost summary"""
        if not self.usage_log:
            return {}

        total_cost = sum(entry["cost"] for entry in self.usage_log)
        total_tokens = sum(entry["input_tokens"] + entry["output_tokens"] for entry in self.usage_log)

        # Group by service/model
        by_model = defaultdict(lambda: {"requests": 0, "tokens": 0, "cost": 0})
        for entry in self.usage_log:
            key = f"{entry['service']}_{entry['model']}"
            by_model[key]["requests"] += 1
            by_model[key]["tokens"] += entry["input_tokens"] + entry["output_tokens"]
            by_model[key]["cost"] += entry["cost"]

        return {
            "total_cost": total_cost,
            "total_tokens": total_tokens,
            "total_requests": len(self.usage_log),
            "by_model": dict(by_model)
        }

    def print_cost_report(self):
        """Print a cost report"""
        summary = self.get_cost_summary()

        if not summary:
            print("No usage data available")
            return

        print("Cost Report")
        print("=" * 40)
        print(f"Total Cost: ${summary['total_cost']:.4f}")
        print(f"Total Tokens: {summary['total_tokens']:,}")
        print(f"Total Requests: {summary['total_requests']}")
        print(f"Average Cost per Request: ${summary['total_cost']/summary['total_requests']:.4f}")
        print()

        print("By Model:")
        for model, data in summary["by_model"].items():
            print(f"  {model}:")
            print(f"    Requests: {data['requests']}")
            print(f"    Tokens: {data['tokens']:,}")
            print(f"    Cost: ${data['cost']:.4f}")
            print(f"    Avg Cost/Request: ${data['cost']/data['requests']:.4f}")
            print()

class CostOptimizedAI:
    def __init__(self, budget_limit: float = 10.0):
        self.cost_tracker = CostTracker()
        self.budget_limit = budget_limit

    def request(self, prompt: str, service: str, model: str, api_key: str, **kwargs):
        """Make a cost-tracked request"""
        # Check the budget
        current_cost = self.cost_tracker.get_cost_summary().get("total_cost", 0)
        if current_cost >= self.budget_limit:
            raise Exception(f"Budget limit of ${self.budget_limit} exceeded!")

        # Make the request
        from easilyai import create_app
        app = create_app("CostOptimizedApp", service, api_key, model)
        response = app.request(prompt, **kwargs)

        # Log for cost tracking
        self.cost_tracker.log_request(service, model, prompt, response)

        return response

    def get_remaining_budget(self) -> float:
        """Get the remaining budget"""
        current_cost = self.cost_tracker.get_cost_summary().get("total_cost", 0)
        return max(0, self.budget_limit - current_cost)

    def suggest_cheaper_alternative(self, service: str, model: str):
        """Suggest a cheaper alternative model"""
        alternatives = {
            ("openai", "gpt-4"): ("openai", "gpt-3.5-turbo"),
            ("anthropic", "claude-3-opus-20240229"): ("anthropic", "claude-3-haiku-20240307")
        }
        return alternatives.get((service, model), (service, model))

# Usage
cost_optimized_ai = CostOptimizedAI(budget_limit=5.0)

# Make requests with cost tracking
for i in range(10):
    try:
        response = cost_optimized_ai.request(
            f"Tell me about topic {i}",
            "openai",
            "gpt-3.5-turbo",
            "your-key"
        )
        print(f"Request {i}: {response[:50]}...")
        print(f"Remaining budget: ${cost_optimized_ai.get_remaining_budget():.4f}")
    except Exception as e:
        print(f"Error on request {i}: {e}")
        break

# Print the cost report
cost_optimized_ai.cost_tracker.print_cost_report()
```
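`suggest_cheaper_alternative` is defined above but not exercised in the usage snippet. A minimal sketch of wiring it in when the budget runs low might look like the following; the $0.10 threshold is an illustrative assumption:

```python
# Switch to a cheaper model once the remaining budget drops below a threshold
service, model = "openai", "gpt-4"
if cost_optimized_ai.get_remaining_budget() < 0.10:  # illustrative threshold
    service, model = cost_optimized_ai.suggest_cheaper_alternative(service, model)

response = cost_optimized_ai.request("Give me a one-line status summary", service, model, "your-key")
```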
## Best Practices Summary
- Choose appropriate models: Use fast models for simple tasks, powerful models for complex ones
- Implement caching: Avoid repeated API calls for the same requests
- Use batch processing: Process multiple requests concurrently when possible
- Apply rate limiting: Respect API limits to avoid throttling
- Optimize prompts: Use clear, specific prompts to get better results faster
- Monitor performance: Track metrics to identify bottlenecks
- Manage costs: Track usage and optimize for cost efficiency
- Handle errors gracefully: Implement retry logic and fallbacks (see the sketch after this list)
- Use efficient data structures: Manage memory effectively for large-scale applications
- Regular cleanup: Perform periodic memory cleanup and cache management
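The error-handling item above is the only one without an example elsewhere in this guide, so here is a minimal sketch of retries with exponential backoff plus a cheaper fallback model. The `request_with_retry` helper, its retry count and delays, and the fallback pair are illustrative assumptions, not part of the EasilyAI API.

```python
import time

from easilyai import create_app

# Hypothetical helper: retry with exponential backoff, then fall back to a
# cheaper model if the primary model keeps failing. Adjust the retry count,
# delays, and fallback model to suit your workload.
def request_with_retry(prompt, service, api_key, model,
                       fallback_model="gpt-3.5-turbo", retries=3):
    delay = 1.0
    for attempt in range(retries):
        try:
            app = create_app("RetryApp", service, api_key, model)
            return app.request(prompt)
        except Exception as e:
            print(f"Attempt {attempt + 1} with {model} failed: {e}")
            time.sleep(delay)
            delay *= 2  # exponential backoff

    # Final attempt on the fallback model; any error here propagates to the caller
    app = create_app("FallbackApp", service, api_key, fallback_model)
    return app.request(prompt)

response = request_with_retry("Summarize this week's release notes", "openai", "your-key", "gpt-4")
```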
These performance optimization techniques will help you build efficient, scalable, and cost-effective AI applications with EasilyAI.