Rate limits help ensure fair usage and maintain service quality for all users. This guide explains HelpingAI's rate limiting system and how to work within these limits effectively.

**Requests per minute:** Limits the number of API calls you can make per minute, regardless of request size.

```python
# Each of these counts as 1 request
client.chat.completions.create(
    model="Dhanishtha-2.0-preview",
    messages=[{"role": "user", "content": "Hi"}]
)

client.chat.completions.create(
    model="Dhanishtha-2.0-preview",
    messages=[{"role": "user", "content": "This is a much longer message..."}]
)
```

**Tokens per minute:** Limits the total number of tokens (input + output) processed per minute.

```python
# This uses ~10 tokens (input + output)
response = client.chat.completions.create(
    model="Dhanishtha-2.0-preview",
    messages=[{"role": "user", "content": "Hello!"}]  # ~2 tokens
)
# Response: "Hello! How can I help you today?"  # ~8 tokens
# Total: ~10 tokens
```

**Concurrent requests:** Limits how many requests can be processed simultaneously.
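
One way to stay under a concurrency cap is to gate calls on your side, for example with `asyncio.Semaphore`. A minimal sketch, assuming an async-capable client and an illustrative cap of 5 (check your plan for the real limit):

```python
import asyncio

# Assumed cap of 5 concurrent requests; substitute your plan's actual limit
semaphore = asyncio.Semaphore(5)

async def limited_request(client, messages):
    # Only a handful of calls are in flight at once; the rest wait for a free slot
    async with semaphore:
        return await client.chat.completions.create(
            model="Dhanishtha-2.0-preview",
            messages=messages,
        )
```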
Every API response includes rate limit information in the headers:

```http
HTTP/1.1 200 OK
X-RateLimit-Limit-Requests: 60
X-RateLimit-Remaining-Requests: 59
X-RateLimit-Reset-Requests: 1640995200
X-RateLimit-Limit-Tokens: 10000
X-RateLimit-Remaining-Tokens: 9950
X-RateLimit-Reset-Tokens: 1640995200
```

| Header | Description |
|---|---|
| X-RateLimit-Limit-Requests | Maximum requests per minute |
| X-RateLimit-Remaining-Requests | Remaining requests in current window |
| X-RateLimit-Reset-Requests | Unix timestamp when request limit resets |
| X-RateLimit-Limit-Tokens | Maximum tokens per minute |
| X-RateLimit-Remaining-Tokens | Remaining tokens in current window |
| X-RateLimit-Reset-Tokens | Unix timestamp when token limit resets |
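
As a small sketch, here is one way to turn those headers into a wait time before your next attempt. It only assumes you already have the response headers available as a dict keyed by the names in the table above:

```python
import time

def seconds_until_request_reset(headers):
    """How long to wait until the request window resets, per the headers above."""
    # Depending on your HTTP client, header lookups may need to be case-insensitive
    reset_ts = headers.get("X-RateLimit-Reset-Requests")
    if reset_ts is None:
        return 0.0
    # The header carries a Unix timestamp; clamp so we never return a negative wait
    return max(0.0, int(reset_ts) - time.time())
```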
When you exceed rate limits, you'll receive a 429 status code:

```json
{
  "error": {
    "message": "Rate limit exceeded. Please try again in 30 seconds.",
    "type": "rate_limit_error",
    "code": "rate_limit_exceeded"
  }
}
```

Handle these errors by retrying with exponential backoff:

```python
import time
from helpingai import HelpingAI
from helpingai.exceptions import RateLimitError
client = HelpingAI(api_key="your-api-key")

def make_request_with_retry(messages, max_retries=3):
    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model="Dhanishtha-2.0-preview",
                messages=messages
            )
            return response
        except RateLimitError as e:
            if attempt < max_retries - 1:
                # Extract wait time from the error message or use exponential backoff
                wait_time = 2 ** attempt
                print(f"Rate limited. Waiting {wait_time} seconds...")
                time.sleep(wait_time)
            else:
                raise e
    return None

# Usage
response = make_request_with_retry([
    {"role": "user", "content": "Hello!"}
])
```

The same retry pattern in JavaScript:

```javascript
import { HelpingAI } from 'helpingai';

const client = new HelpingAI({
  apiKey: 'your-api-key'
});

async function makeRequestWithRetry(messages, maxRetries = 3) {
  for (let attempt = 0; attempt < maxRetries; attempt++) {
    try {
      const response = await client.chat.completions.create({
        model: 'Dhanishtha-2.0-preview',
        messages: messages
      });
      return response;
    } catch (error) {
      if (error.status === 429 && attempt < maxRetries - 1) {
        const waitTime = Math.pow(2, attempt) * 1000; // Exponential backoff
        console.log(`Rate limited. Waiting ${waitTime}ms...`);
        await new Promise(resolve => setTimeout(resolve, waitTime));
      } else {
        throw error;
      }
    }
  }
}

// Usage
const response = await makeRequestWithRetry([
  {role: 'user', content: 'Hello!'}
]);
```

Implement a queue to manage requests within rate limits:

```python
import asyncio
from collections import deque
import time

class RateLimitedClient:
    def __init__(self, client, requests_per_minute=60):
        self.client = client
        self.requests_per_minute = requests_per_minute
        self.request_times = deque()

    async def make_request(self, **kwargs):
        await self._wait_if_needed()
        response = await self.client.chat.completions.create(**kwargs)
        self.request_times.append(time.time())
        return response

    async def _wait_if_needed(self):
        now = time.time()
        # Remove requests older than 1 minute from the sliding window
        while self.request_times and now - self.request_times[0] > 60:
            self.request_times.popleft()
        # If we're at the limit, wait until the oldest request ages out
        if len(self.request_times) >= self.requests_per_minute:
            wait_time = 60 - (now - self.request_times[0])
            if wait_time > 0:
                await asyncio.sleep(wait_time)

# Usage
rate_limited_client = RateLimitedClient(client, requests_per_minute=60)
response = await rate_limited_client.make_request(
    model="Dhanishtha-2.0-preview",
    messages=[{"role": "user", "content": "Hello!"}]
)
```

Track token usage to stay within limits:

```python
from collections import deque
import time

class TokenBudgetManager:
    def __init__(self, tokens_per_minute=10000):
        self.tokens_per_minute = tokens_per_minute
        self.token_usage = deque()

    def estimate_tokens(self, text):
        # Rough estimation: ~4 characters per token
        return len(text) // 4

    def can_make_request(self, messages, max_tokens=150):
        # Estimate input tokens plus the maximum possible output tokens
        input_tokens = sum(self.estimate_tokens(msg['content']) for msg in messages)
        estimated_total = input_tokens + max_tokens

        # Check usage recorded in the last minute
        now = time.time()
        current_usage = sum(
            tokens for timestamp, tokens in self.token_usage
            if now - timestamp < 60
        )
        return current_usage + estimated_total <= self.tokens_per_minute

    def record_usage(self, usage):
        self.token_usage.append((time.time(), usage.total_tokens))
        # Clean out entries older than one minute
        now = time.time()
        self.token_usage = deque([
            (timestamp, tokens) for timestamp, tokens in self.token_usage
            if now - timestamp < 60
        ])

# Usage
budget_manager = TokenBudgetManager(tokens_per_minute=10000)
messages = [{"role": "user", "content": "Hello!"}]

if budget_manager.can_make_request(messages):
    response = client.chat.completions.create(
        model="Dhanishtha-2.0-preview",
        messages=messages
    )
    budget_manager.record_usage(response.usage)
else:
    print("Would exceed token budget, waiting...")
```

Process multiple requests efficiently:

```python
import asyncio

async def batch_process(requests, batch_size=10, delay=1.0):
    results = []
    for i in range(0, len(requests), batch_size):
        batch = requests[i:i + batch_size]

        # Process the batch concurrently
        tasks = [
            client.chat.completions.create(**request)
            for request in batch
        ]
        batch_results = await asyncio.gather(*tasks, return_exceptions=True)
        results.extend(batch_results)

        # Delay between batches to respect rate limits
        if i + batch_size < len(requests):
            await asyncio.sleep(delay)

    return results

# Usage
requests = [
    {
        "model": "Dhanishtha-2.0-preview",
        "messages": [{"role": "user", "content": f"Request {i}"}]
    }
    for i in range(100)
]
results = await batch_process(requests, batch_size=10, delay=6.0)
```

To monitor your limits programmatically, read the rate limit headers from each response:

```python
from datetime import datetime

def monitor_rate_limits(response):
"""Extract and display rate limit information"""
if hasattr(response, '_headers'):
headers = response._headers
print("Rate Limit Status:")
print(f"Requests: {headers.get('x-ratelimit-remaining-requests', 'N/A')}/{headers.get('x-ratelimit-limit-requests', 'N/A')}")
print(f"Tokens: {headers.get('x-ratelimit-remaining-tokens', 'N/A')}/{headers.get('x-ratelimit-limit-tokens', 'N/A')}")
reset_time = headers.get('x-ratelimit-reset-requests')
if reset_time:
reset_datetime = datetime.fromtimestamp(int(reset_time))
print(f"Resets at: {reset_datetime}")
# Usage {#usage}
response = client.chat.completions.create(
model="Dhanishtha-2.0-preview",
messages=[{"role": "user", "content": "Hello!"}]
)
monitor_rate_limits(response)Monitor your usage through the HelpingAI dashboard:

When retrying, use exponential backoff with jitter so that concurrent clients don't all retry at the same moment:

```python
import random

def exponential_backoff(attempt, base_delay=1, max_delay=60):
    """Calculate a retry delay with jitter"""
    delay = min(base_delay * (2 ** attempt), max_delay)
    jitter = random.uniform(0, delay * 0.1)  # Add up to 10% jitter
    return delay + jitter
```
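
For example, this jittered delay can stand in for the fixed `2 ** attempt` wait used in the earlier retry loop. A sketch that reuses `client` and `exponential_backoff()` from the examples above:

```python
import time

from helpingai.exceptions import RateLimitError

# Reuses `client` and exponential_backoff() defined earlier on this page
max_retries = 3
for attempt in range(max_retries):
    try:
        response = client.chat.completions.create(
            model="Dhanishtha-2.0-preview",
            messages=[{"role": "user", "content": "Hello!"}]
        )
        break
    except RateLimitError:
        if attempt == max_retries - 1:
            raise
        # Sleep for a jittered, exponentially growing delay before the next attempt
        time.sleep(exponential_backoff(attempt))
```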

Streaming doesn't reduce token usage, but it provides a better user experience:

```python
stream = client.chat.completions.create(
    model="Dhanishtha-2.0-preview",
    messages=[{"role": "user", "content": "Write a long story"}],
    stream=True,
    max_tokens=1000
)

for chunk in stream:
    if chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="")
```

Other request-level settings worth tuning (both shown in the sketch below):

- Set sensible max_tokens limits
- Use a lower temperature for factual responses
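
For instance, both settings go directly on the request; the values here are illustrative, not recommendations:

```python
response = client.chat.completions.create(
    model="Dhanishtha-2.0-preview",
    messages=[{"role": "user", "content": "List three facts about rate limiting."}],
    max_tokens=150,   # caps output tokens, which count against your per-minute token budget
    temperature=0.2,  # lower temperature for more deterministic, factual answers
)
```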

Cache responses for repeated queries:

```python
import hashlib
import json

class ResponseCache:
    def __init__(self):
        self.cache = {}

    def get_cache_key(self, messages, **kwargs):
        # Create a hash of the request parameters so identical requests map to the same key
        request_data = {
            'messages': messages,
            **kwargs
        }
        return hashlib.md5(json.dumps(request_data, sort_keys=True).encode()).hexdigest()

    def get(self, messages, **kwargs):
        key = self.get_cache_key(messages, **kwargs)
        return self.cache.get(key)

    def set(self, messages, response, **kwargs):
        key = self.get_cache_key(messages, **kwargs)
        self.cache[key] = response

# Usage
cache = ResponseCache()

def cached_request(messages, **kwargs):
    # Check the cache first
    cached_response = cache.get(messages, **kwargs)
    if cached_response:
        return cached_response

    # Make the request if it isn't cached
    response = client.chat.completions.create(
        model="Dhanishtha-2.0-preview",
        messages=messages,
        **kwargs
    )

    # Cache the response
    cache.set(messages, response, **kwargs)
    return response
```

Consider upgrading your plan when you regularly run into these limits.
Enterprise customers can request custom limits.
"Rate limit exceeded" errors:
Inconsistent rate limiting:
Unexpected token usage:
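
If reasoning tokens are inflating your usage, a request might look like the sketch below. Treating hideThink as a per-request parameter (and its Python spelling) is an assumption based on the tip above, so check the API reference for the exact usage:

```python
# Assumption: hideThink is accepted as a request parameter to suppress reasoning tokens
response = client.chat.completions.create(
    model="Dhanishtha-2.0-preview",
    messages=[{"role": "user", "content": "Summarize this support ticket."}],
    hideThink=True,
)
```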

Need higher limits? Upgrade your plan or contact enterprise sales for custom solutions.