Article Extractor API
export const extractFullPath =
Article Extractor API
Convert any news or blog article into clean, structured JSON data. Extract text, author, publish date, images, and embedded media with a single API call.
Authentication
All API requests require authentication using an API key. Include your API key in the request header.
INFO — Get your API key
Sign up for a free account to receive your API key instantly at ujeebu.com/signup.
Header Format
ApiKey: YOUR_API_KEY
Basic Request
Make an extraction request by sending a GET request to the endpoint with the target article URL.
Endpoint
GET https://api.ujeebu.com/extract
Code Examples
curl -X GET 'https://api.ujeebu.com/extract?url=https://example.com/article' \
-H "ApiKey: YOUR_API_KEY"const response = await fetch('https://api.ujeebu.com/extract?url=https://example.com/article', {
headers: {
'ApiKey': 'YOUR_API_KEY'
}
});
const data = await response.json();
console.log(data);import { UjeebuClient } from '@ujeebu-org/ujeebu-sdk';
const client = new UjeebuClient(process.env.UJEEBU_API_KEY);
const response = await client.extract('https://example.com/article');
console.log(response.data.article);import requests
response = requests.get(
'https://api.ujeebu.com/extract',
params={'url': 'https://example.com/article'},
headers={'ApiKey': 'YOUR_API_KEY'}
)
print(response.json())from ujeebu_python import UjeebuClient
ujeebu = UjeebuClient(api_key='YOUR_API_KEY')
response = ujeebu.extract(
url='https://example.com/article'
)
data = response.json()
print(data['article'])OkHttpClient client = new OkHttpClient();
Request request = new Request.Builder()
.url("https://api.ujeebu.com/extract?url=https://example.com/article")
.addHeader("ApiKey", "YOUR_API_KEY")
.build();
Response response = client.newCall(request).execute();
System.out.println(response.body().string());<?php
$curl = curl_init();
curl_setopt_array($curl, [
CURLOPT_URL => 'https://api.ujeebu.com/extract?url=https://example.com/article',
CURLOPT_RETURNTRANSFER => true,
CURLOPT_HTTPHEADER => [
'ApiKey: YOUR_API_KEY'
],
]);
$response = curl_exec($curl);
curl_close($curl);
echo $response;package main
import (
"fmt"
"io/ioutil"
"net/http"
)
func main() {
client := &http.Client{}
req, _ := http.NewRequest("GET", "https://api.ujeebu.com/extract?url=https://example.com/article", nil)
req.Header.Add("ApiKey", "YOUR_API_KEY")
res, err := client.Do(req)
if err != nil {
panic(err)
}
defer res.Body.Close()
body, _ := ioutil.ReadAll(res.Body)
fmt.Println(string(body))
}package main
import (
"fmt"
"github.com/ujeebu/ujeebu-go"
)
func main() {
client, err := ujeebu.NewClient("YOUR_API_KEY")
if err != nil {
panic(err)
}
article, _, err := client.Extract(ujeebu.ExtractParams{
URL: "https://example.com/article",
})
if err != nil {
panic(err)
}
fmt.Println(article)
}
Request Parameters
| Parameter | Type | Required | Default | Description |
|---|---|---|---|---|
url |
string |
Yes | - |
URL of article to be extracted. |
raw_html |
string |
No | null |
HTML of article to be extracted. When this is passed, article extraction is carried out on the value of this parameter (i.e. without fetching article from url), however the extractor still relies on url to resolve relative links and relatively referenced assets in the provided html. |
js |
boolean |
No | false |
Indicates whether to execute JavaScript or not. Set to 'auto' to let the extractor decide. |
text |
boolean |
No | true |
Indicates whether API should return extracted text. |
html |
boolean |
No | true |
Indicates whether API should extract html. |
media |
boolean |
No | false |
Indicates whether API should extract media. |
feeds |
boolean |
No | false |
Indicates whether API should extract RSS feeds. |
images |
boolean |
No | true |
Indicates whether API should extract all images present in HTML. |
author |
boolean |
No | true |
Indicates whether API should extract article's author. |
pub_date |
boolean |
No | true |
Indicates whether API should extract article's publish date. |
partial |
number |
No | 0 |
Number of characters, or percentage (if a percent sign is present), of the text/html to be returned. 0 means all. |
is_article |
boolean |
No | true |
When true returns the probability [0-1] of URL being an article. Anything scoring 0.5 and above should be an article, but this may slightly vary from one site to another. |
quick_mode |
boolean |
No | false |
When true, does a quick analysis of the content instead of the normal advanced parsing. Usually cuts down response time by about 30% to 60%. |
strip_tags |
csv-string |
No | form |
Indicates which tags to strip from the extracted article HTML. Expects a comma separated list of tag names/css selectors. |
timeout |
number |
No | 60 |
Maximum number of seconds before request timeout. |
js_timeout |
number |
No | timeout/2 |
When js is enabled, indicates how many seconds the API should wait for the JS engine to render the supplied URL. |
scroll_down |
boolean |
No | false |
Indicates whether to scroll down the page or not, this applies only when js is enabled. |
scroll_wait |
number |
No | 100 |
Wait time in milliseconds between scroll actions when scroll_down is enabled. |
scroll_percent |
number |
No | null |
Percentage of the page to scroll (0-100). Used when scroll_down is enabled. |
progressive_scroll |
boolean |
No | false |
Enable progressive scrolling behavior for better dynamic content loading. |
scroll_callback |
string |
No | null |
JavaScript callback function to execute during scroll events. |
scroll_to_selector |
string |
No | null |
CSS selector to scroll to a specific element on the page. |
wait_until |
string |
No | load |
Controls when page load is considered complete. Possible values: 'load' (wait for load event), 'domcontentloaded' (wait for DOMContentLoaded), 'networkidle' (wait for network to be idle), 'commit' (wait for initial HTML commit). |
image_analysis |
boolean |
No | true |
Indicates whether API should analyse images for minimum width and height (see parameters min_image_width and min_image_height for more details). |
min_image_width |
number |
No | 200 |
Minimum width of the images kept in the HTML (if image_analysis is false this parameter has no effect). |
min_image_height |
number |
No | 100 |
Minimum height of the images kept in the HTML (if image_analysis is false this parameter has no effect). |
image_timeout |
number |
No | 2 |
Image fetching timeout in seconds. |
return_only_enclosed_text_images |
boolean |
No | true |
Indicates whether to return only images that are enclosed within extracted article HTML. |
main_image_in_html |
boolean |
No | false |
Include the main image in the extracted HTML body. |
publisher_country |
boolean |
No | false |
Extract publisher country information from the article. |
publisher_tz |
boolean |
No | false |
Extract publisher timezone information from the article. |
heavy_mode |
boolean |
No | false |
Enable more thorough extraction mode with deeper content analysis. |
text_length |
string |
No | priority |
Text selection mode for extraction algorithm. Possible values: 'conservative', 'auto', 'optimistic', 'priority'. |
proxy_type |
string |
No | rotating |
Indicates type of proxy to use. Possible values: 'rotating', 'advanced', 'premium', 'residential', 'mobile', 'custom'. When using 'residential' with a non-US proxy_country, requests are automatically routed through residential_geo proxies. |
proxy_country |
string |
No | US |
Country ISO 3166-1 alpha-2 code to proxy from. Valid only when premium proxy type is chosen. |
custom_proxy |
string |
No | null |
URI for your custom proxy in the following format: scheme://user:pass@host:port. Applicable and required only if proxy_type=custom. |
auto_proxy |
boolean |
No | false |
Enable a more advanced proxy by default when rotating proxy is not working. It will move to the next proxy option until it gets the content and will only stop when content is available or none of the options worked. Please note that you are billed only on the top option attempted. |
auto_premium_proxy |
boolean |
No | false |
Automatically use premium proxy on failures with rotating proxy. |
custom_proxy_username |
string |
No | null |
Username for custom proxy authentication when using custom_proxy. |
custom_proxy_password |
string |
No | null |
Password for custom proxy authentication when using custom_proxy. |
session_id |
alphanumeric |
No | null |
Alphanumeric identifier with a length between 1 and 16 characters, used to route multiple requests from the same proxy instance. Sessions remain active for 30 minutes. |
pagination |
boolean |
No | true |
Extract and concatenate multiple-page articles. |
pagination_max_pages |
number |
No | 30 |
Indicates the number of pages to extract when pagination is enabled. |
cookies |
string |
No | null |
Cookie string to send with the request. Can be a JSON object mapping cookie names to values or a cookie string. |
js_use |
string |
No | null |
JavaScript engine to use for rendering. Possible values: 'browserless', 'extract_browserless'. |
html_timeout |
number |
No | timeout |
Timeout in seconds specifically for HTML fetching operations. |
block_ads |
boolean |
No | false |
Block advertisements and trackers during page scraping. |
no_html_cache |
boolean |
No | false |
Disable HTML caching for this request. |
UJB-headerName |
string |
No | null |
Indicates which headers to send to the target URL. This can be useful when, for example, the article is behind a paywall and you need to pass your authentication cookies. |
auto_captcha_solve |
boolean |
No | false |
Enable automatic CAPTCHA detection and solving using external services. Supports reCAPTCHA v2/v3, hCaptcha, Cloudflare Turnstile, FunCaptcha, GeeTest, and image CAPTCHAs. When enabled, requests automatically use super mode with JavaScript rendering. |
auto_captcha_solve_timeout |
number |
No | 120000 |
Timeout in milliseconds for CAPTCHA solving. CAPTCHAs typically take 20-60 seconds to solve. |
Response Format
| Status | Meaning | Description | Schema |
|---|---|---|---|
| 200 | OK | successful operation | SuccessResponse |
| 400 | Bad Request | Invalid parameter value | APIResponseError |
Article Schema
{
"url": "string",
"canonical_url": "string",
"title": "string",
"text": "string",
"html": "string",
"summary": "string",
"image": "string",
"images": ["string"],
"media": ["string"],
"feeds": ["string"],
"language": "string",
"author": "string",
"pub_date": "string",
"modified_date": "string",
"site_name": "string",
"favicon": "string",
"encoding": "string",
"is_article": 0.0,
"next_page": "string",
"publisher_country": [{"name": "string"}],
"publisher_tz": [{"name": "string"}]
}
Properties
| Name | Type | Description |
|---|---|---|
| url | string | the URL parameter. |
| canonical_url | string | the final (resolved) URL. |
| title | string | the title of the article. |
| text | string | the extracted text. |
| html | string | the extracted html. |
| summary | string | summary (if available) of the article text. |
| image | string | main image of the article. |
| images | [string] | all images present in article. |
| media | [string] | all media present in article. |
| feeds | [string] | RSS feeds found on the page. |
| language | string | language code of article text. |
| author | string | author of article. |
| pub_date | string | publication date of article. |
| modified_date | string | last modified date of article. |
| site_name | string | name of site hosting article. |
| favicon | string | favicon of site hosting article. |
| encoding | string | character encoding of article text. |
| is_article | number | probability [0-1] of URL being an article. |
| next_page | string | URL of the next page (when pagination is detected). |
| publisher_country | [object] | publisher country info (when publisher_country is true). |
| publisher_tz | [object] | publisher timezone info (when publisher_tz is true). |
Success Response example
{
"article": {
"text": "I began learning German at the age of 13, and I\u2019m still trying to explain to myself why it was love at first sound. The answer must surely be: the excellence of my teacher. At an English public school not famed for its cultural generosity, Mr King was that rare thing: a kindly and intelligent man who, in the thick of the second world war, determinedly loved the Germany that he knew was still there somewhere.\nRather than join the chorus of anti-German propaganda, he preferred, doggedly, to inspire his little class with the beauty of the language, and of its literature and culture. One day, he used to say, the real Germany will come back. And he was right. Because now it has.\nWhy was it love at first sound for me? Well...",
"html": "<p><span>I<\/span> began learning German at the age of 13, and I’m still trying to explain to myself why it was love at first sound. The answer must surely be: the excellence of my teacher. At an English public school not famed for its cultural generosity, Mr King was that rare thing: a kindly and intelligent man who, in the thick of the second world war, determinedly loved the Germany that he knew was still there somewhere.<\/p><p>Rather than join the chorus of anti-German propaganda, he preferred, doggedly, to inspire his little class with the beauty of the language, and of its literature and culture. One day, he used to say, the real Germany will come back. And he was right. Because now it has....",
"media": [],
"images": [],
"author": "John le Carr\u00e9",
"pub_date": "2017-07-01 23:05:12",
"is_article": 1,
"url": "https:\/\/www.theguardian.com\/education\/2017\/jul\/02\/why-we-should-learn-german-john-le-carre",
"canonical_url": "https:\/\/www.theguardian.com\/education\/2017\/jul\/02\/why-we-should-learn-german-john-le-carre",
"title": "Why we should learn German | John le Carr\u00e9",
"language": "en",
"image": "https:\/\/i.guim.co.uk\/img\/media\/f19eff6f7e1751d88b38e725cfbe6687084d5f64\/0_235_9010_5405\/master\/9010.jpg?width=1200&height=630&quality=85&auto=format&fit=crop&overlay-align=bottom%2Cleft&overlay-width=100p&overlay-base64=L2ltZy9zdGF0aWMvb3ZlcmxheXMvdG8tb3BpbmlvbnMtYWdlLTIwMTcucG5n&enable=upscale&s=efeec857dffdb94cd84c4b652b4e287f",
"summary": "To help make the European debate decent and civilised, it is now more important than ever to value the skills of the linguist",
"modified_date": "2017-12-02 03:00:56",
"site_name": "the Guardian",
"favicon": "https:\/\/static.guim.co.uk\/images\/favicon-32x32.ico",
"encoding": "utf-8"
},
"time": 0.85
}
Error Response Schema
{
"url": "string",
"message": "string",
"error_code": 400,
"errors": ["string"]
}
Properties
| Name | Type | Description |
|---|---|---|
| url | string | Given URL |
| message | string | Error message |
| error_code | number | HTTP status code |
| errors | [string] | List of all errors |
Response Codes
| Code | Billed | Meaning | Suggestion |
|---|---|---|---|
| 200 | YES | Successful request | - |
| 400 | NO | Some required parameter is missing (URL) | Set the url parameter |
| 401 | NO | Missing API-KEY | Provide API-KEY |
| 404 | YES | Provided URL not found | Provide a valid URL |
| 408 | YES | Request timeout | Increase timeout parameter, use premium proxy or force JS |
| 429 | NO | Too many requests | Upgrade your plan |
| 500 | NO | Internal error | Retry the request or contact us |
Examples
TIP — Quick Tip
Use the searchable parameter table above to quickly find the parameters you need for your use case.
Extract with JavaScript Rendering
Enable JavaScript execution to extract content from pages that rely on client-side rendering.
curl -X GET 'https://api.ujeebu.com/extract?url=https://example.com/article&js=true&js_timeout=10000&wait_until=networkidle' \
-H "ApiKey: YOUR_API_KEY"const response = await fetch('https://api.ujeebu.com/extract?url=https://example.com/article&js=true&js_timeout=10000&wait_until=networkidle', {
headers: {
'ApiKey': 'YOUR_API_KEY'
}
});
const data = await response.json();
console.log(data.article.title, data.article.text);const response = await client.extract('https://example.com/article', {
js: true,
js_timeout: 10000,
wait_until: 'networkidle'
});
console.log(response.data.article.title, response.data.article.text);import requests
response = requests.get(
'https://api.ujeebu.com/extract',
params={
'url': 'https://example.com/article',
'js': 'true',
'js_timeout': 10000,
'wait_until': 'networkidle'
},
headers={'ApiKey': 'YOUR_API_KEY'}
)
data = response.json()
print(data['article']['title'], data['article']['text'])response = ujeebu.extract(
'https://example.com/article',
params={
'js': True,
'js_timeout': 10000,
'wait_until': 'networkidle'
}
)
data = response.json()
print(data['article']['title'], data['article']['text'])OkHttpClient client = new OkHttpClient();
Request request = new Request.Builder()
.url("https://api.ujeebu.com/extract?url=https://example.com/article&js=true&js_timeout=10000&wait_until=networkidle")
.addHeader("ApiKey", "YOUR_API_KEY")
.build();
Response response = client.newCall(request).execute();
System.out.println(response.body().string());<?php
$curl = curl_init();
curl_setopt_array($curl, [
CURLOPT_URL => 'https://api.ujeebu.com/extract?url=https://example.com/article&js=true&js_timeout=10000&wait_until=networkidle',
CURLOPT_RETURNTRANSFER => true,
CURLOPT_HTTPHEADER => [
'ApiKey: YOUR_API_KEY'
],
]);
$response = curl_exec($curl);
curl_close($curl);
echo $response;package main
import (
"fmt"
"io/ioutil"
"net/http"
)
func main() {
client := &http.Client{}
req, _ := http.NewRequest("GET", "https://api.ujeebu.com/extract?url=https://example.com/article&js=true&js_timeout=10000&wait_until=networkidle", nil)
req.Header.Add("ApiKey", "YOUR_API_KEY")
res, _ := client.Do(req)
defer res.Body.Close()
body, _ := ioutil.ReadAll(res.Body)
fmt.Println(string(body))
}article, _, err := client.Extract(ujeebu.ExtractParams{
URL: "https://example.com/article",
JS: true,
JSTimeout: 10000,
WaitUntil: "networkidle",
})
if err != nil {
panic(err)
}
fmt.Println(article.Title, article.Text)
Extract with Custom Headers
Pass custom headers to access content behind authentication or paywalls.
curl -X GET 'https://api.ujeebu.com/extract?url=https://example.com/article' \
-H "ApiKey: YOUR_API_KEY" \
-H "UJB-Authorization: Bearer your-token" \
-H "UJB-Cookie: session=abc123"const response = await fetch('https://api.ujeebu.com/extract?url=https://example.com/article', {
headers: {
'ApiKey': 'YOUR_API_KEY',
'UJB-Authorization': 'Bearer your-token',
'UJB-Cookie': 'session=abc123'
}
});
const data = await response.json();
console.log(data.article);const response = await client.extract('https://example.com/article', {}, {
'Authorization': 'Bearer your-token',
'Cookie': 'session=abc123'
});
console.log(response.data.article);import requests
response = requests.get(
'https://api.ujeebu.com/extract',
params={'url': 'https://example.com/article'},
headers={
'ApiKey': 'YOUR_API_KEY',
'UJB-Authorization': 'Bearer your-token',
'UJB-Cookie': 'session=abc123'
}
)
print(response.json()['article'])response = ujeebu.extract(
'https://example.com/article',
headers={
'Authorization': 'Bearer your-token',
'Cookie': 'session=abc123'
}
)
data = response.json()
print(data['article'])OkHttpClient client = new OkHttpClient();
Request request = new Request.Builder()
.url("https://api.ujeebu.com/extract?url=https://example.com/article")
.addHeader("ApiKey", "YOUR_API_KEY")
.addHeader("UJB-Authorization", "Bearer your-token")
.addHeader("UJB-Cookie", "session=abc123")
.build();
Response response = client.newCall(request).execute();
System.out.println(response.body().string());<?php
$curl = curl_init();
curl_setopt_array($curl, [
CURLOPT_URL => 'https://api.ujeebu.com/extract?url=https://example.com/article',
CURLOPT_RETURNTRANSFER => true,
CURLOPT_HTTPHEADER => [
'ApiKey: YOUR_API_KEY',
'UJB-Authorization: Bearer your-token',
'UJB-Cookie: session=abc123'
],
]);
$response = curl_exec($curl);
curl_close($curl);
echo $response;package main
import (
"fmt"
"io/ioutil"
"net/http"
)
func main() {
client := &http.Client{}
req, _ := http.NewRequest("GET", "https://api.ujeebu.com/extract?url=https://example.com/article", nil)
req.Header.Add("ApiKey", "YOUR_API_KEY")
req.Header.Add("UJB-Authorization", "Bearer your-token")
req.Header.Add("UJB-Cookie", "session=abc123")
res, _ := client.Do(req)
defer res.Body.Close()
body, _ := ioutil.ReadAll(res.Body)
fmt.Println(string(body))
}article, _, err := client.Extract(ujeebu.ExtractParams{
URL: "https://example.com/article",
CustomHeaders: map[string]string{
"Authorization": "Bearer your-token",
"Cookie": "session=abc123",
},
})
if err != nil {
panic(err)
}
fmt.Println(article)
Extract with Premium Proxy
Use premium proxies for better reliability and geographic targeting.
curl -X GET 'https://api.ujeebu.com/extract?url=https://example.com/article&proxy_type=premium&proxy_country=US' \
-H "ApiKey: YOUR_API_KEY"const response = await fetch('https://api.ujeebu.com/extract?url=https://example.com/article&proxy_type=premium&proxy_country=US', {
headers: {
'ApiKey': 'YOUR_API_KEY'
}
});
const data = await response.json();
console.log(data.article);const response = await client.extract('https://example.com/article', {
proxy_type: 'premium',
proxy_country: 'US'
});
console.log(response.data.article);import requests
response = requests.get(
'https://api.ujeebu.com/extract',
params={
'url': 'https://example.com/article',
'proxy_type': 'premium',
'proxy_country': 'US'
},
headers={'ApiKey': 'YOUR_API_KEY'}
)
print(response.json()['article'])response = ujeebu.extract(
'https://example.com/article',
params={
'proxy_type': 'premium',
'proxy_country': 'US'
}
)
data = response.json()
print(data['article'])OkHttpClient client = new OkHttpClient();
Request request = new Request.Builder()
.url("https://api.ujeebu.com/extract?url=https://example.com/article&proxy_type=premium&proxy_country=US")
.addHeader("ApiKey", "YOUR_API_KEY")
.build();
Response response = client.newCall(request).execute();
System.out.println(response.body().string());<?php
$curl = curl_init();
curl_setopt_array($curl, [
CURLOPT_URL => 'https://api.ujeebu.com/extract?url=https://example.com/article&proxy_type=premium&proxy_country=US',
CURLOPT_RETURNTRANSFER => true,
CURLOPT_HTTPHEADER => [
'ApiKey: YOUR_API_KEY'
],
]);
$response = curl_exec($curl);
curl_close($curl);
echo $response;package main
import (
"fmt"
"io/ioutil"
"net/http"
)
func main() {
client := &http.Client{}
req, _ := http.NewRequest("GET", "https://api.ujeebu.com/extract?url=https://example.com/article&proxy_type=premium&proxy_country=US", nil)
req.Header.Add("ApiKey", "YOUR_API_KEY")
res, _ := client.Do(req)
defer res.Body.Close()
body, _ := ioutil.ReadAll(res.Body)
fmt.Println(string(body))
}article, _, err := client.Extract(ujeebu.ExtractParams{
URL: "https://example.com/article",
ProxyType: "premium",
ProxyCountry: "US",
})
if err != nil {
panic(err)
}
fmt.Println(article)
Extract with Pagination
Automatically extract multi-page articles by following pagination links.
curl -X GET 'https://api.ujeebu.com/extract?url=https://example.com/article&pagination=true&pagination_max_pages=5' \
-H "ApiKey: YOUR_API_KEY"const response = await fetch('https://api.ujeebu.com/extract?url=https://example.com/article&pagination=true&pagination_max_pages=5', {
headers: {
'ApiKey': 'YOUR_API_KEY'
}
});
const data = await response.json();
// Article text will include content from all pages
console.log(data.article.text);const response = await client.extract('https://example.com/article', {
pagination: true,
pagination_max_pages: 5
});
// Article text will include content from all pages
console.log(response.data.article.text);import requests
response = requests.get(
'https://api.ujeebu.com/extract',
params={
'url': 'https://example.com/article',
'pagination': 'true',
'pagination_max_pages': 5
},
headers={'ApiKey': 'YOUR_API_KEY'}
)
data = response.json()
# Article text will include content from all pages
print(data['article']['text'])response = ujeebu.extract(
'https://example.com/article',
params={
'pagination': True,
'pagination_max_pages': 5
}
)
data = response.json()
# Article text will include content from all pages
print(data['article']['text'])OkHttpClient client = new OkHttpClient();
Request request = new Request.Builder()
.url("https://api.ujeebu.com/extract?url=https://example.com/article&pagination=true&pagination_max_pages=5")
.addHeader("ApiKey", "YOUR_API_KEY")
.build();
Response response = client.newCall(request).execute();
System.out.println(response.body().string());<?php
$curl = curl_init();
curl_setopt_array($curl, [
CURLOPT_URL => 'https://api.ujeebu.com/extract?url=https://example.com/article&pagination=true&pagination_max_pages=5',
CURLOPT_RETURNTRANSFER => true,
CURLOPT_HTTPHEADER => [
'ApiKey: YOUR_API_KEY'
],
]);
$response = curl_exec($curl);
curl_close($curl);
echo $response;package main
import (
"fmt"
"io/ioutil"
"net/http"
)
func main() {
client := &http.Client{}
req, _ := http.NewRequest("GET", "https://api.ujeebu.com/extract?url=https://example.com/article&pagination=true&pagination_max_pages=5", nil)
req.Header.Add("ApiKey", "YOUR_API_KEY")
res, _ := client.Do(req)
defer res.Body.Close()
body, _ := ioutil.ReadAll(res.Body)
fmt.Println(string(body))
}article, _, err := client.Extract(ujeebu.ExtractParams{
URL: "https://example.com/article",
Pagination: true,
PaginationMaxPages: 5,
})
if err != nil {
panic(err)
}
// Article text will include content from all pages
fmt.Println(article.Text)
Quick Mode for Faster Extraction
Use quick mode to reduce extraction time by 30-60% with simplified processing.
curl -X GET 'https://api.ujeebu.com/extract?url=https://example.com/article&quick_mode=true' \
-H "ApiKey: YOUR_API_KEY"const response = await fetch('https://api.ujeebu.com/extract?url=https://example.com/article&quick_mode=true', {
headers: {
'ApiKey': 'YOUR_API_KEY'
}
});
const data = await response.json();
console.log(data.article);const response = await client.extract('https://example.com/article', {
quick_mode: true
});
console.log(response.data.article);import requests
response = requests.get(
'https://api.ujeebu.com/extract',
params={
'url': 'https://example.com/article',
'quick_mode': 'true'
},
headers={'ApiKey': 'YOUR_API_KEY'}
)
print(response.json()['article'])response = ujeebu.extract(
'https://example.com/article',
params={'quick_mode': True}
)
data = response.json()
print(data['article'])OkHttpClient client = new OkHttpClient();
Request request = new Request.Builder()
.url("https://api.ujeebu.com/extract?url=https://example.com/article&quick_mode=true")
.addHeader("ApiKey", "YOUR_API_KEY")
.build();
Response response = client.newCall(request).execute();
System.out.println(response.body().string());<?php
$curl = curl_init();
curl_setopt_array($curl, [
CURLOPT_URL => 'https://api.ujeebu.com/extract?url=https://example.com/article&quick_mode=true',
CURLOPT_RETURNTRANSFER => true,
CURLOPT_HTTPHEADER => [
'ApiKey: YOUR_API_KEY'
],
]);
$response = curl_exec($curl);
curl_close($curl);
echo $response;package main
import (
"fmt"
"io/ioutil"
"net/http"
)
func main() {
client := &http.Client{}
req, _ := http.NewRequest("GET", "https://api.ujeebu.com/extract?url=https://example.com/article&quick_mode=true", nil)
req.Header.Add("ApiKey", "YOUR_API_KEY")
res, _ := client.Do(req)
defer res.Body.Close()
body, _ := ioutil.ReadAll(res.Body)
fmt.Println(string(body))
}article, _, err := client.Extract(ujeebu.ExtractParams{
URL: "https://example.com/article",
QuickMode: true,
})
if err != nil {
panic(err)
}
fmt.Println(article)
Stripping tags
If you want to delete some html element(s) before the extraction is carried out, use parameter strip_tags to pass a comma-separated list of css selectors of elements to delete.
INFO
The example below will remove any meta, form and input tags as well as any element with class hidden (i.e. strip_tags=meta,form,input,.hidden).
Passing custom headers
curl -i \
-H 'UJB-Username: username' \
-H 'UJB-Authorization: Basic dXNlcm5hbWU6cGFzc3dvcmQ=' \
-H 'ApiKey: <API Key>' \
-X GET \
https://api.ujeebu.com/extract?url=https://ujeebu.com/blog/how-to-extract-clean-text-from-html
The code above will return the following response:
{
"article": {
"author": "Sam",
"pub_date": "2019-08-09 12:42:25",
"is_article": 1,
"url": "https://ujeebu.com/blog/how-to-extract-clean-text-from-html",
"canonical_url": "https://ujeebu.com/blog/how-to-extract-clean-text-from-html/",
"title": "Extracting clean data from blog and news articles",
"site_name": "Ujeebu blog",
"favicon": "https://ujeebu.com/blog/favicon.png",
"encoding": "utf-8",
"pages": ["https://ujeebu.com/blog/how-to-extract-clean-text-from-html/"]
},
"time": 6.366053104400635,
"js": false,
"pagination": false
}
Using Proxies
Using Your Own Proxy
Auto Proxy
When dealing with difficult websites that block requests or have aggressive anti-bot measures, use the auto_proxy parameter to automatically try different proxies until one succeeds.
TIP — How Auto Proxy Works
When auto_proxy=true, Ujeebu automatically cycles through available proxy types in sequence. If one proxy fails (network error, timeout, or status ≥ 400 other than 404), it immediately retries with the next proxy. This continues until a successful response is received or all proxies have been tried.
Retry Flow:
- Select proxy → Make request → Check result
- On failure (error, timeout, status ≥ 400 except 404) → Try next proxy
- On success (status 200-399) → Return response
- Final fallback → Direct connection (no proxy)
curl -X GET 'https://api.ujeebu.com/extract?url=https://difficult-site.com/article&auto_proxy=true&js=true' \
-H "ApiKey: YOUR_API_KEY"const response = await fetch('https://api.ujeebu.com/extract?url=https://difficult-site.com/article&auto_proxy=true&js=true', {
headers: {
'ApiKey': 'YOUR_API_KEY'
}
});
const data = await response.json();
console.log(data.article.title, data.article.text);const response = await client.extract('https://difficult-site.com/article', {
auto_proxy: true,
js: true // Combine with JS rendering if needed
});
console.log(response.data.article.title, response.data.article.text);import requests
response = requests.get(
'https://api.ujeebu.com/extract',
params={
'url': 'https://difficult-site.com/article',
'auto_proxy': 'true',
'js': 'true'
},
headers={'ApiKey': 'YOUR_API_KEY'}
)
data = response.json()
print(data['article']['title'], data['article']['text'])response = ujeebu.extract(
'https://difficult-site.com/article',
params={
'auto_proxy': True,
'js': True # Combine with JS rendering if needed
}
)
data = response.json()
print(data['article']['title'], data['article']['text'])OkHttpClient client = new OkHttpClient();
Request request = new Request.Builder()
.url("https://api.ujeebu.com/extract?url=https://difficult-site.com/article&auto_proxy=true&js=true")
.addHeader("ApiKey", "YOUR_API_KEY")
.build();
Response response = client.newCall(request).execute();
System.out.println(response.body().string());<?php
$curl = curl_init();
curl_setopt_array($curl, [
CURLOPT_URL => 'https://api.ujeebu.com/extract?url=https://difficult-site.com/article&auto_proxy=true&js=true',
CURLOPT_RETURNTRANSFER => true,
CURLOPT_HTTPHEADER => [
'ApiKey: YOUR_API_KEY'
],
]);
$response = curl_exec($curl);
curl_close($curl);
echo $response;package main
import (
"fmt"
"io/ioutil"
"net/http"
)
func main() {
client := &http.Client{}
req, _ := http.NewRequest("GET", "https://api.ujeebu.com/extract?url=https://difficult-site.com/article&auto_proxy=true&js=true", nil)
req.Header.Add("ApiKey", "YOUR_API_KEY")
res, _ := client.Do(req)
defer res.Body.Close()
body, _ := ioutil.ReadAll(res.Body)
fmt.Println(string(body))
}article, _, err := client.Extract(ujeebu.ExtractParams{
URL: "https://difficult-site.com/article",
AutoProxy: true,
JS: true, // Combine with JS rendering if needed
})
if err != nil {
panic(err)
}
fmt.Println(article.Title, article.Text)
When to use Auto Proxy:
- ✅ Extracting articles from sites known to block requests
- ✅ Handling geo-restricted content
- ✅ Improving success rates for critical extraction tasks
- ✅ Scraping news sites with aggressive anti-bot measures
Important notes:
- You are billed only for the proxy option that succeeds
- Each failed attempt still contributes to processing time
- A 404 response does NOT trigger a retry (it's a valid "not found")
For detailed documentation, see the Auto Proxy Guide.
CAPTCHA Solving
The Extract API can automatically detect and solve CAPTCHAs on web pages using external solving services. This is an opt-in feature that supports:
- reCAPTCHA v2/v3 (including invisible and enterprise versions)
- hCaptcha
- Cloudflare Turnstile
- FunCaptcha (Arkose Labs)
- GeeTest
- Image CAPTCHAs
INFO — How it works
When auto_captcha_solve is enabled, the API automatically detects CAPTCHAs on the page, sends them to a solving service, and injects the solution token back into the page. This happens transparently before extracting the article content.
Code Example
curl -X GET 'https://api.ujeebu.com/extract?url=https://example.com/protected-article&js=true&auto_captcha_solve=true&auto_captcha_solve_timeout=120000' \
-H "ApiKey: YOUR_API_KEY"const response = await fetch('https://api.ujeebu.com/extract?url=https://example.com/protected-article&js=true&auto_captcha_solve=true&auto_captcha_solve_timeout=120000', {
headers: {
'ApiKey': 'YOUR_API_KEY'
}
});
const data = await response.json();
console.log(data.article);const response = await client.extract('https://example.com/protected-article', {
js: true,
auto_captcha_solve: true,
auto_captcha_solve_timeout: 120000 // 2 minutes
});
console.log(response.data.article);import requests
response = requests.get(
'https://api.ujeebu.com/extract',
params={
'url': 'https://example.com/protected-article',
'js': 'true',
'auto_captcha_solve': 'true',
'auto_captcha_solve_timeout': 120000
},
headers={'ApiKey': 'YOUR_API_KEY'}
)
print(response.json()['article'])response = ujeebu.extract(
'https://example.com/protected-article',
params={
'js': True,
'auto_captcha_solve': True,
'auto_captcha_solve_timeout': 120000 # 2 minutes
}
)
data = response.json()
print(data['article'])package main
import (
"fmt"
"io/ioutil"
"net/http"
)
func main() {
client := &http.Client{}
req, _ := http.NewRequest("GET", "https://api.ujeebu.com/extract?url=https://example.com/protected-article&js=true&auto_captcha_solve=true&auto_captcha_solve_timeout=120000", nil)
req.Header.Add("ApiKey", "YOUR_API_KEY")
res, _ := client.Do(req)
defer res.Body.Close()
body, _ := ioutil.ReadAll(res.Body)
fmt.Println(string(body))
}article, _, err := client.Extract(ujeebu.ExtractParams{
URL: "https://example.com/protected-article",
JS: true,
AutoCaptchaSolve: true,
AutoCaptchaSolveTimeout: 120000, // 2 minutes
})
if err != nil {
panic(err)
}
fmt.Println(article)
WARNING — Additional credits
CAPTCHA solving uses external services and incurs +5 credits on top of the base request cost when a CAPTCHA is detected and solved.
Rate Limits & Credits
Usage Tracking
To track credit usage programmatically, call the /account endpoint. See Account API for the full reference, response shape, and rate limit (10 calls/minute).
Spin up an API key in 60 seconds
Free tier: 5,000 credits, no card, full access to every endpoint on this page.