Python SDK
The official Ujeebu SDK for Python. Built with the popular requests library, it provides a Pythonic interface perfect for data science, web scraping, and automation.
Installation
pip install ujeebu-python
poetry add ujeebu-python
pipenv install ujeebu-python
Requirements:
- Python 3.7 or higher
- `requests` library (automatically installed)
Quick Start
from ujeebu_python import UjeebuClient
# Initialize with your API key
client = UjeebuClient('your-api-key')
# Scrape a website
response = client.scrape(
'https://example.com',
params={'js': True, 'response_type': 'html'}
)
print(response.text)
Authentication
The SDK requires an API key for authentication. Get yours from the Ujeebu Dashboard.
Using Environment Variables (Recommended)
import os
from ujeebu_python import UjeebuClient
client = UjeebuClient(os.environ['UJEEBU_API_KEY'])
from dotenv import load_dotenv
import os
from ujeebu_python import UjeebuClient
load_dotenv()
client = UjeebuClient(os.getenv('UJEEBU_API_KEY'))
WARNING — Security
Never hardcode API keys in your source code or commit them to version control.
Core Methods
scrape()
Scrape web pages with various rendering and extraction options.
response = client.scrape('https://example.com')
print(response.text)
response = client.scrape(
'https://example.com',
params={
'js': True,
'js_timeout': 5000,
'wait_for': '.dynamic-content'
}
)
print(response.text)
response = client.scrape(
'https://example.com',
params={
'extract_rules': {
'title': 'h1',
'articles': {
'selector': '.article',
'type': 'list',
'data': {
'headline': 'h2',
'author': '.author'
}
}
}
}
)
data = response.json()
print(data['result'])
extract()
Extract clean article content from web pages.
response = client.extract('https://example.com/article')
data = response.json()
print(data['article']['title'])
print(data['article']['author'])
print(data['article']['text'])
print(data['article']['pub_date'])
response = client.extract(
'https://example.com/article',
params={
'strip_tags': 'script,style,nav',
'images': True
}
)
serp()
Get structured search engine results.
response = client.serp(params={
'search': 'artificial intelligence',
'search_type': 'search',
'lang': 'en',
'results_count': 20
})
data = response.json()
print(data['organic_results'])
print(data['knowledge_graph'])
response = client.serp(params={
'search': 'latest technology news',
'search_type': 'news',
'lang': 'en',
'results_count': 10
})
data = response.json()
for article in data['news']:
    print(article['title'])
response = client.serp(params={
'search': 'beautiful landscapes',
'search_type': 'images',
'results_count': 50
})
data = response.json()
for image in data['images']:
    print(image['link'])
preview()
Generate preview cards for URLs (similar to social media link previews).
response = client.preview('https://example.com/article')
data = response.json()
print(data['title'])
print(data['description'])
print(data['image'])
print(data['author'])
print(data['site_name'])
ai_scrape()
Extract structured data using AI-powered natural language prompts.
response = client.ai_scrape(
'https://example.com/product',
'Extract the product name, price, and rating'
)
data = response.json()
print(data['data'])
response = client.ai_scrape(
'https://example.com/product',
'Extract product details',
params={
'schema': {
'type': 'object',
'properties': {
'name': {'type': 'string'},
'price': {'type': 'number'},
'rating': {'type': 'number'}
},
'required': ['name', 'price']
}
}
)
data = response.json()
print(data['data'])
auto_extract()
Automatically extract structured data from any web page without writing prompts or selectors.
response = client.auto_extract('https://example.com/article')
data = response.json()
print(f"Page type: {data['page_type']}")
print(data['data'])
response = client.auto_extract(
'https://example.com/article',
params={'html': existing_html} # Skip fetching
)
data = response.json()
print(data['data'])
response = client.auto_extract(
'https://example.com/product',
params={
'proxy_type': 'premium',
'auto_proxy': False,
'js': True,
'scroll_down': True,
'wait_for': '.product-details',
'wait_for_timeout': 5000,
'timeout': 60000,
'force_refresh': True,
'auto_captcha_solve': True,
'provider': 'openai',
'model': 'gpt-4o'
}
)
data = response.json()
print(f"Page type: {data['page_type']}")
print(data['data'])
print(f"Credits used: {response.headers.get('ujb-credits')}")
markdown()
Convert web pages to clean, LLM-optimized markdown.
response = client.markdown('https://example.com/article')
data = response.json()
print(data['markdown'])
response = client.markdown(
'https://docs.example.com/guide',
params={
'filter': 'bm25',
'query': 'installation instructions'
}
)
data = response.json()
print(data['markdown'])
print(data['references'])
response = client.markdown(
'https://example.com/spa-page',
params={
'filter': 'fit',
'citations': True,
'js': True,
'wait': 3000,
'wait_for_selector': '.content-loaded',
'timeout': 120,
'proxy_type': 'premium'
}
)
data = response.json()
print(data['markdown'])
print(data['fit_markdown'])
print(data['markdown_with_citations'])
print(data['references'])
print(f"Credits used: {response.headers.get('ujb-credits')}")
Convenience Methods
get_pdf()
Generate a PDF of a web page.
response = client.get_pdf(
'https://example.com',
params={'js': True, 'wait_for': 2000}
)
data = response.json()
# data['pdf'] contains base64-encoded PDF
get_screenshot()
Capture a screenshot of a web page.
response = client.get_screenshot(
'https://example.com',
params={'js': True, 'screenshot_fullpage': True}
)
data = response.json()
# data['screenshot'] contains base64-encoded image
response = client.get_screenshot(
'https://example.com',
params={'screenshot_partial': '.hero-section'}
)
data = response.json()
# data['screenshot'] contains base64-encoded image
get_html()
Get clean HTML content.
response = client.get_html(
'https://example.com',
params={'js': True, 'strip_tags': 'script,style'}
)
data = response.json()
print(data['html'])
Scrape Parameters
INFO — Calling convention
The Python SDK uses a positional `url` and a `params` dict: `client.scrape('https://example.com', params={'js': True})`. All responses are `requests.Response` objects — use `.json()` for JSON responses or `.text` for HTML.
| Parameter | Type | Required | Default | Description |
|---|---|---|---|---|
| `url` | string | Yes | | The URL to scrape (positional argument). |
| `js` | boolean | No | False | Enable JavaScript rendering. |
| `response_type` | string | No | html | Output format: 'html', 'screenshot', 'pdf', 'raw'. |
| `json` | boolean | No | False | When true, returns a JSON response instead of raw content. |
| `timeout` | int | No | 60 | Maximum number of seconds before request timeout. |
| `wait_for` | `str \| int` | No | None | |
| `wait_for_timeout` | int | No | None | Timeout in milliseconds for the wait_for parameter. |
| `js_timeout` | int | No | 30000 | Timeout for JavaScript execution in milliseconds. |
| `device` | string | No | desktop | Device to emulate: 'desktop', 'mobile', or specific device name. |
| `extract_rules` | dict | No | None | Rules for structured data extraction using CSS selectors. |
| `proxy_type` | string | No | rotating | Proxy type: 'rotating', 'advanced', 'premium', 'residential', 'mobile', 'custom'. |
| `proxy_country` | string | No | US | Country ISO code when using premium proxy. |
| `auto_proxy` | boolean | No | False | Automatically try different proxies until one succeeds. |
| `proxy_session` | string | No | None | Alphanumeric identifier to route requests through the same proxy instance. |
| `auto_captcha_solve` | boolean | No | False | Enable automatic CAPTCHA detection and solving. |
| `auto_captcha_solve_timeout` | int | No | 120000 | Timeout in milliseconds for CAPTCHA solving. |
AI Scraper Parameters
| Parameter | Type | Required | Default | Description |
|---|---|---|---|---|
| `url` | string | Yes | | The URL to scrape. |
| `prompt` | string | No | | Natural language instruction describing what data to extract. Required unless schema is provided. |
| `schema` | dict | No | | JSON schema defining the expected structure of extracted data. When provided, prompt becomes optional. |
| `temperature` | float | No | 0.0 | LLM temperature (0.0-1.0). Lower = more deterministic. |
| `js` | bool | No | True | Enable JavaScript rendering. |
| `proxy_type` | string | No | auto | Proxy type: 'rotating', 'advanced', 'premium', 'residential'. Auto-selects if not specified. |
| `proxy_country` | string | No | None | ISO country code for proxy location. |
| `timeout` | int | No | 120 | Request timeout in seconds. |
| `wait_for` | `str \| int` | No | None | |
Markdown Parameters
| Parameter | Type | Required | Default | Description |
|---|---|---|---|---|
| `url` | string | Yes | | The URL to convert to markdown. |
| `filter` | string | No | fit | Content filter: 'raw' (full page), 'fit' (main content), 'bm25' (relevance-ranked with query). |
| `query` | string | No | None | Search query for BM25 relevance filtering. Required when filter is 'bm25'. |
| `citations` | bool | No | True (GET) / False (POST) | Include citation references in the markdown output. Default is True for GET requests, False for POST requests. |
| `js` | bool | No | True | Enable JavaScript rendering for dynamic pages. |
| `wait` | int | No | None | Milliseconds to wait after page load before conversion. |
| `wait_for_selector` | string | No | None | CSS selector to wait for before conversion. |
| `timeout` | int | No | 60 | Request timeout in seconds. |
| `proxy` | str | No | None | Custom proxy URL to use for the request. |
| `proxy_type` | str | No | "" (auto_proxy) | Proxy type: 'rotating', 'advanced', 'premium', 'residential', 'residential_us', 'residential_geo'. If not set, auto_proxy selects the best proxy automatically. |
| `auto_captcha_solve` | bool | No | True | Enable automatic CAPTCHA detection and solving. |
| `auto_captcha_solve_timeout` | int | No | 0 | Timeout for CAPTCHA solving in milliseconds. |
AutoExtract Parameters
| Parameter | Type | Required | Default | Description |
|---|---|---|---|---|
| `url` | string | Yes | | The URL to extract data from. |
| `html` | string | No | None | Pre-fetched HTML content. Skips fetching step when provided. |
| `proxy_type` | string | No | "" (auto_proxy) | Proxy type: 'rotating', 'advanced', 'premium', 'residential', 'residential_us', 'residential_geo'. |
| `auto_proxy` | bool | No | True | Automatic proxy selection for best results. |
| `js` | bool | No | True | Enable JavaScript rendering. |
| `scroll_down` | bool | No | False | Scroll down the page before extraction. |
| `wait_for` | string | No | None | CSS selector to wait for before extraction. |
| `wait_for_timeout` | int | No | 30000 | Timeout in milliseconds for the wait_for selector. |
| `timeout` | int | No | 120000 | Overall request timeout in milliseconds. |
| `force_refresh` | bool | No | False | Force regeneration of extraction rules even if cached. |
| `auto_captcha_solve` | bool | No | True | Enable automatic CAPTCHA detection and solving. |
| `auto_captcha_solve_timeout` | int | No | 0 | Timeout in milliseconds for CAPTCHA solving. |
| `provider` | string | No | None | LLM provider for rule generation (e.g., 'openai', 'anthropic', 'google'). |
| `model` | string | No | None | LLM model to use for rule generation. |
Error Handling
from requests.exceptions import HTTPError, ConnectionError, Timeout
try:
    response = client.scrape('https://example.com')
    response.raise_for_status()
    print(response.text)
except HTTPError as e:
    print(f'HTTP Error: {e.response.status_code}')
    print(f'Message: {e.response.json().get("message")}')
except ConnectionError:
    print('Network connection error')
except Timeout:
    print('Request timed out')
except Exception as e:
    print(f'Error: {e}')
Integration Examples
With Pandas
import os
import pandas as pd
from ujeebu_python import UjeebuClient
client = UjeebuClient(os.environ['UJEEBU_API_KEY'])
# Scrape product data
response = client.scrape(
'https://example.com/products',
params={
'extract_rules': {
'products': {
'selector': '.product',
'type': 'list',
'data': {
'name': '.title',
'price': '.price',
'rating': '.rating'
}
}
}
}
)
data = response.json()
df = pd.DataFrame(data['result']['products'])
print(df.head())
Async with asyncio
import asyncio
import os
from concurrent.futures import ThreadPoolExecutor
from ujeebu_python import UjeebuClient
client = UjeebuClient(os.environ['UJEEBU_API_KEY'])
urls = ['https://example1.com', 'https://example2.com', 'https://example3.com']
async def scrape_all(urls):
    loop = asyncio.get_event_loop()
    with ThreadPoolExecutor() as executor:
        tasks = [
            loop.run_in_executor(executor, client.scrape, url)
            for url in urls
        ]
        return await asyncio.gather(*tasks)
results = asyncio.run(scrape_all(urls))
for result in results:
    print(len(result.text))
Spin up an API key in 60 seconds
Free tier: 5,000 credits, no card, full access to every endpoint on this page.