Article Extractor API

Convert any news or blog article into clean, structured JSON data. Extract text, author, publish date, images, and embedded media with a single API call.

Authentication

All API requests require authentication using an API key. Include your API key in the `ApiKey` request header.

INFO - Get your API key

Sign up for a free account to receive your API key instantly at ujeebu.com/signup.

Header Format

ApiKey: YOUR_API_KEY

Basic Request

Make an extraction request by sending a GET request to the endpoint with the target article URL.

Endpoint

GET https://api.ujeebu.com/extract

Code Examples

curl -X GET 'https://api.ujeebu.com/extract?url=https://example.com/article' \
  -H "ApiKey: YOUR_API_KEY"

const response = await fetch('https://api.ujeebu.com/extract?url=https://example.com/article', {
  headers: {
    'ApiKey': 'YOUR_API_KEY'
  }
});

const data = await response.json();
console.log(data);

import { UjeebuClient } from '@ujeebu-org/ujeebu-sdk';

const client = new UjeebuClient(process.env.UJEEBU_API_KEY);

const response = await client.extract('https://example.com/article');

console.log(response.data.article);

import requests

response = requests.get(
    'https://api.ujeebu.com/extract',
    params={'url': 'https://example.com/article'},
    headers={'ApiKey': 'YOUR_API_KEY'}
)

print(response.json())

from ujeebu_python import UjeebuClient

ujeebu = UjeebuClient(api_key='YOUR_API_KEY')

response = ujeebu.extract(
    url='https://example.com/article'
)

data = response.json()
print(data['article'])

OkHttpClient client = new OkHttpClient();

Request request = new Request.Builder()
  .url("https://api.ujeebu.com/extract?url=https://example.com/article")
  .addHeader("ApiKey", "YOUR_API_KEY")
  .build();

Response response = client.newCall(request).execute();
System.out.println(response.body().string());

<?php

$curl = curl_init();

curl_setopt_array($curl, [
  CURLOPT_URL => 'https://api.ujeebu.com/extract?url=https://example.com/article',
  CURLOPT_RETURNTRANSFER => true,
  CURLOPT_HTTPHEADER => [
    'ApiKey: YOUR_API_KEY'
  ],
]);

$response = curl_exec($curl);
curl_close($curl);

echo $response;

package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
)

// main sends a GET request to the extract endpoint for a single article URL,
// authenticating with the ApiKey header, and prints the raw response body.
// Any failure (building the request, sending it, or reading the body) aborts
// the program with a panic.
func main() {
	client := &http.Client{}
	req, err := http.NewRequest("GET", "https://api.ujeebu.com/extract?url=https://example.com/article", nil)
	if err != nil {
		panic(err)
	}
	req.Header.Add("ApiKey", "YOUR_API_KEY")

	res, err := client.Do(req)
	if err != nil {
		panic(err)
	}
	defer res.Body.Close()

	// A failed read would otherwise print a truncated body as if it were complete.
	body, err := ioutil.ReadAll(res.Body)
	if err != nil {
		panic(err)
	}
	fmt.Println(string(body))
}

package main

import (
	"fmt"
	"github.com/ujeebu/ujeebu-go"
)

// main creates an SDK client from an API key, extracts one article by URL and
// prints the returned article value. The second value returned by Extract is
// ignored; any error aborts the program with a panic.
func main() {
	params := ujeebu.ExtractParams{
		URL: "https://example.com/article",
	}

	client, err := ujeebu.NewClient("YOUR_API_KEY")
	if err != nil {
		panic(err)
	}

	article, _, err := client.Extract(params)
	if err != nil {
		panic(err)
	}

	fmt.Println(article)
}

Request Parameters

Parameter	Type	Required	Default	Description
`url`	`string`	Yes	`-`	URL of article to be extracted.
`raw_html`	`string`	No	`null`	HTML of article to be extracted. When this is passed, article extraction is carried out on the value of this parameter (i.e. without fetching article from url), however the extractor still relies on url to resolve relative links and relatively referenced assets in the provided html.
`js`	`boolean`	No	`false`	Indicates whether to execute JavaScript or not. Set to 'auto' to let the extractor decide.
`text`	`boolean`	No	`true`	Indicates whether API should return extracted text.
`html`	`boolean`	No	`true`	Indicates whether API should extract html.
`media`	`boolean`	No	`false`	Indicates whether API should extract media.
`feeds`	`boolean`	No	`false`	Indicates whether API should extract RSS feeds.
`images`	`boolean`	No	`true`	Indicates whether API should extract all images present in HTML.
`author`	`boolean`	No	`true`	Indicates whether API should extract article's author.
`pub_date`	`boolean`	No	`true`	Indicates whether API should extract article's publish date.
`partial`	`number`	No	`0`	Number of characters, or percentage of the text when a percent sign is present, of text/html to be returned. 0 means all.
`is_article`	`boolean`	No	`true`	When true returns the probability [0-1] of URL being an article. Anything scoring 0.5 and above should be an article, but this may slightly vary from one site to another.
`quick_mode`	`boolean`	No	`false`	When true, does a quick analysis of the content instead of the normal advanced parsing. Usually cuts down response time by about 30% to 60%.
`strip_tags`	`csv-string`	No	`form`	Indicates which tags to strip from the extracted article HTML. Expects a comma separated list of tag names/css selectors.
`timeout`	`number`	No	`60`	Maximum number of seconds before request timeout.
`js_timeout`	`number`	No	`timeout/2`	When js is enabled, indicates how many seconds the API should wait for the JS engine to render the supplied URL.
`scroll_down`	`boolean`	No	`false`	Indicates whether to scroll down the page or not, this applies only when js is enabled.
`scroll_wait`	`number`	No	`100`	Wait time in milliseconds between scroll actions when scroll_down is enabled.
`scroll_percent`	`number`	No	`null`	Percentage of the page to scroll (0-100). Used when scroll_down is enabled.
`progressive_scroll`	`boolean`	No	`false`	Enable progressive scrolling behavior for better dynamic content loading.
`scroll_callback`	`string`	No	`null`	JavaScript callback function to execute during scroll events.
`scroll_to_selector`	`string`	No	`null`	CSS selector to scroll to a specific element on the page.
`wait_until`	`string`	No	`load`	Controls when page load is considered complete. Possible values: 'load' (wait for load event), 'domcontentloaded' (wait for DOMContentLoaded), 'networkidle' (wait for network to be idle), 'commit' (wait for initial HTML commit).
`image_analysis`	`boolean`	No	`true`	Indicates whether API should analyse images for minimum width and height (see parameters min_image_width and min_image_height for more details).
`min_image_width`	`number`	No	`200`	Minimum width of the images kept in the HTML (if image_analysis is false this parameter has no effect).
`min_image_height`	`number`	No	`100`	Minimum height of the images kept in the HTML (if image_analysis is false this parameter has no effect).
`image_timeout`	`number`	No	`2`	Image fetching timeout in seconds.
`return_only_enclosed_text_images`	`boolean`	No	`true`	Indicates whether to return only images that are enclosed within extracted article HTML.
`main_image_in_html`	`boolean`	No	`false`	Include the main image in the extracted HTML body.
`publisher_country`	`boolean`	No	`false`	Extract publisher country information from the article.
`publisher_tz`	`boolean`	No	`false`	Extract publisher timezone information from the article.
`heavy_mode`	`boolean`	No	`false`	Enable more thorough extraction mode with deeper content analysis.
`text_length`	`string`	No	`priority`	Text selection mode for extraction algorithm. Possible values: 'conservative', 'auto', 'optimistic', 'priority'.
`proxy_type`	`string`	No	`rotating`	Indicates type of proxy to use. Possible values: 'rotating', 'advanced', 'premium', 'residential', 'mobile', 'custom'. When using 'residential' with a non-US proxy_country, requests are automatically routed through residential_geo proxies.
`proxy_country`	`string`	No	`US`	Country ISO 3166-1 alpha-2 code to proxy from. Valid only when premium proxy type is chosen.
`custom_proxy`	`string`	No	`null`	URI for your custom proxy in the following format: scheme://user:pass@host:port. Applicable and required only if proxy_type=custom.
`auto_proxy`	`boolean`	No	`false`	Enable a more advanced proxy by default when rotating proxy is not working. It will move to the next proxy option until it gets the content and will only stop when content is available or none of the options worked. Please note that you are billed only on the top option attempted.
`auto_premium_proxy`	`boolean`	No	`false`	Automatically use premium proxy on failures with rotating proxy.
`custom_proxy_username`	`string`	No	`null`	Username for custom proxy authentication when using custom_proxy.
`custom_proxy_password`	`string`	No	`null`	Password for custom proxy authentication when using custom_proxy.
`session_id`	`alphanumeric`	No	`null`	Alphanumeric identifier with a length between 1 and 16 characters, used to route multiple requests from the same proxy instance. Sessions remain active for 30 minutes.
`pagination`	`boolean`	No	`true`	Extract and concatenate multiple-page articles.
`pagination_max_pages`	`number`	No	`30`	Indicates the number of pages to extract when pagination is enabled.
`cookies`	`string`	No	`null`	Cookie string to send with the request. Can be a JSON object mapping cookie names to values or a cookie string.
`js_use`	`string`	No	`null`	JavaScript engine to use for rendering. Possible values: 'browserless', 'extract_browserless'.
`html_timeout`	`number`	No	`timeout`	Timeout in seconds specifically for HTML fetching operations.
`block_ads`	`boolean`	No	`false`	Block advertisements and trackers during page scraping.
`no_html_cache`	`boolean`	No	`false`	Disable HTML caching for this request.
`UJB-headerName`	`string`	No	`null`	Indicates which headers to send to target URL. This can be useful when article is behind a paywall for example, and that you need to pass your authentication cookies.
`auto_captcha_solve`	`boolean`	No	`false`	Enable automatic CAPTCHA detection and solving using external services. Supports reCAPTCHA v2/v3, hCaptcha, Cloudflare Turnstile, FunCaptcha, GeeTest, and image CAPTCHAs. When enabled, requests automatically use super mode with JavaScript rendering.
`auto_captcha_solve_timeout`	`number`	No	`120000`	Timeout in milliseconds for CAPTCHA solving. CAPTCHAs typically take 20-60 seconds to solve.

Response Format

Status	Meaning	Description	Schema
200	OK	successful operation	SuccessResponse
400	Bad Request	Invalid parameter value	APIResponseError

Article Schema

{
  "url": "string",
  "canonical_url": "string",
  "title": "string",
  "text": "string",
  "html": "string",
  "summary": "string",
  "image": "string",
  "images": ["string"],
  "media": ["string"],
  "feeds": ["string"],
  "language": "string",
  "author": "string",
  "pub_date": "string",
  "modified_date": "string",
  "site_name": "string",
  "favicon": "string",
  "encoding": "string",
  "is_article": 0.0,
  "next_page": "string",
  "publisher_country": [{"name": "string"}],
  "publisher_tz": [{"name": "string"}]
}

Properties

Name	Type	Description
url	string	the URL parameter.
canonical_url	string	the final (resolved) URL.
title	string	the title of the article.
text	string	the extracted text.
html	string	the extracted html.
summary	string	summary (if available) of the article text.
image	string	main image of the article.
images	[string]	all images present in article.
media	[string]	all media present in article.
feeds	[string]	RSS feeds found on the page.
language	string	language code of article text.
author	string	author of article.
pub_date	string	publication date of article.
modified_date	string	last modified date of article.
site_name	string	name of site hosting article.
favicon	string	favicon of site hosting article.
encoding	string	character encoding of article text.
is_article	number	probability [0-1] of URL being an article.
next_page	string	URL of the next page (when pagination is detected).
publisher_country	[object]	publisher country info (when publisher_country is true).
publisher_tz	[object]	publisher timezone info (when publisher_tz is true).

Success Response example

{
    "article": {
        "text": "I began learning German at the age of 13, and I\u2019m still trying to explain to myself why it was love at first sound. The answer must surely be: the excellence of my teacher. At an English public school not famed for its cultural generosity, Mr King was that rare thing: a kindly and intelligent man who, in the thick of the second world war, determinedly loved the Germany that he knew was still there somewhere.\nRather than join the chorus of anti-German propaganda, he preferred, doggedly, to inspire his little class with the beauty of the language, and of its literature and culture. One day, he used to say, the real Germany will come back. And he was right. Because now it has.\nWhy was it love at first sound for me? Well...",
        "html": "<p><span>I<\/span> began learning German at the age of 13, and I’m still trying to explain to myself why it was love at first sound. The answer must surely be: the excellence of my teacher. At an English public school not famed for its cultural generosity, Mr King was that rare thing: a kindly and intelligent man who, in the thick of the second world war, determinedly loved the Germany that he knew was still there somewhere.<\/p><p>Rather than join the chorus of anti-German propaganda, he preferred, doggedly, to inspire his little class with the beauty of the language, and of its literature and culture. One day, he used to say, the real Germany will come back. And he was right. Because now it has....",
        "media": [],
        "images": [],
        "author": "John le Carr\u00e9",
        "pub_date": "2017-07-01 23:05:12",
        "is_article": 1,
        "url": "https:\/\/www.theguardian.com\/education\/2017\/jul\/02\/why-we-should-learn-german-john-le-carre",
        "canonical_url": "https:\/\/www.theguardian.com\/education\/2017\/jul\/02\/why-we-should-learn-german-john-le-carre",
        "title": "Why we should learn German | John le Carr\u00e9",
        "language": "en",
        "image": "https:\/\/i.guim.co.uk\/img\/media\/f19eff6f7e1751d88b38e725cfbe6687084d5f64\/0_235_9010_5405\/master\/9010.jpg?width=1200&height=630&quality=85&auto=format&fit=crop&overlay-align=bottom%2Cleft&overlay-width=100p&overlay-base64=L2ltZy9zdGF0aWMvb3ZlcmxheXMvdG8tb3BpbmlvbnMtYWdlLTIwMTcucG5n&enable=upscale&s=efeec857dffdb94cd84c4b652b4e287f",
        "summary": "To help make the European debate decent and civilised, it is now more important than ever to value the skills of the linguist",
        "modified_date": "2017-12-02 03:00:56",
        "site_name": "the Guardian",
        "favicon": "https:\/\/static.guim.co.uk\/images\/favicon-32x32.ico",
        "encoding": "utf-8"
    },
    "time": 0.85
}

Error Response Schema

{
  "url": "string",
  "message": "string",
  "error_code": 400,
  "errors": ["string"]
}

Properties

Name	Type	Description
url	string	Given URL
message	string	Error message
error_code	number	HTTP status code
errors	[string]	List of all errors

Response Codes

Code	Billed	Meaning	Suggestion
200	Yes	Successful request	-
400	No	Some required parameter is missing (URL)	Set the missing parameter (url)
401	No	Missing API-KEY	Provide API-KEY
404	Yes	Provided URL not found	Provide a valid URL
408	Yes	Request timeout	Increase timeout parameter, use premium proxy or force JS
429	No	Too many requests	Upgrade your plan
500	No	Internal error	Retry the request or contact us

Examples

TIP - Quick Tip

Use the searchable parameter table above to quickly find the parameters you need for your use case.

Extract with JavaScript Rendering

Enable JavaScript execution to extract content from pages that rely on client-side rendering.

curl -X GET 'https://api.ujeebu.com/extract?url=https://example.com/article&js=true&js_timeout=10000&wait_until=networkidle' \
  -H "ApiKey: YOUR_API_KEY"

const response = await fetch('https://api.ujeebu.com/extract?url=https://example.com/article&js=true&js_timeout=10000&wait_until=networkidle', {
  headers: {
    'ApiKey': 'YOUR_API_KEY'
  }
});

const data = await response.json();
console.log(data.article.title, data.article.text);

const response = await client.extract('https://example.com/article', {
  js: true,
  js_timeout: 10000,
  wait_until: 'networkidle'
});

console.log(response.data.article.title, response.data.article.text);

import requests

response = requests.get(
    'https://api.ujeebu.com/extract',
    params={
        'url': 'https://example.com/article',
        'js': 'true',
        'js_timeout': 10000,
        'wait_until': 'networkidle'
    },
    headers={'ApiKey': 'YOUR_API_KEY'}
)

data = response.json()
print(data['article']['title'], data['article']['text'])

response = ujeebu.extract(
    'https://example.com/article',
    params={
        'js': True,
        'js_timeout': 10000,
        'wait_until': 'networkidle'
    }
)

data = response.json()
print(data['article']['title'], data['article']['text'])

OkHttpClient client = new OkHttpClient();

Request request = new Request.Builder()
  .url("https://api.ujeebu.com/extract?url=https://example.com/article&js=true&js_timeout=10000&wait_until=networkidle")
  .addHeader("ApiKey", "YOUR_API_KEY")
  .build();

Response response = client.newCall(request).execute();
System.out.println(response.body().string());

<?php

$curl = curl_init();

curl_setopt_array($curl, [
  CURLOPT_URL => 'https://api.ujeebu.com/extract?url=https://example.com/article&js=true&js_timeout=10000&wait_until=networkidle',
  CURLOPT_RETURNTRANSFER => true,
  CURLOPT_HTTPHEADER => [
    'ApiKey: YOUR_API_KEY'
  ],
]);

$response = curl_exec($curl);
curl_close($curl);

echo $response;

package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
)

// main sends a GET request to the extract endpoint with JavaScript rendering
// enabled (js, js_timeout and wait_until are passed as query parameters) and
// prints the raw response body. Any failure aborts the program with a panic.
func main() {
	client := &http.Client{}
	// NOTE(review): the parameter table documents js_timeout in seconds; confirm
	// that 10000 is the intended value here.
	req, err := http.NewRequest("GET", "https://api.ujeebu.com/extract?url=https://example.com/article&js=true&js_timeout=10000&wait_until=networkidle", nil)
	if err != nil {
		panic(err)
	}
	req.Header.Add("ApiKey", "YOUR_API_KEY")

	// res may be nil when Do fails, so check the error before touching res.Body.
	res, err := client.Do(req)
	if err != nil {
		panic(err)
	}
	defer res.Body.Close()

	body, err := ioutil.ReadAll(res.Body)
	if err != nil {
		panic(err)
	}
	fmt.Println(string(body))
}

article, _, err := client.Extract(ujeebu.ExtractParams{
	URL:       "https://example.com/article",
	JS:        true,
	JSTimeout: 10000,
	WaitUntil: "networkidle",
})
if err != nil {
	panic(err)
}

fmt.Println(article.Title, article.Text)

Extract with Custom Headers

Pass custom headers to access content behind authentication or paywalls.

curl -X GET 'https://api.ujeebu.com/extract?url=https://example.com/article' \
  -H "ApiKey: YOUR_API_KEY" \
  -H "UJB-Authorization: Bearer your-token" \
  -H "UJB-Cookie: session=abc123"

const response = await fetch('https://api.ujeebu.com/extract?url=https://example.com/article', {
  headers: {
    'ApiKey': 'YOUR_API_KEY',
    'UJB-Authorization': 'Bearer your-token',
    'UJB-Cookie': 'session=abc123'
  }
});

const data = await response.json();
console.log(data.article);

const response = await client.extract('https://example.com/article', {}, {
  'Authorization': 'Bearer your-token',
  'Cookie': 'session=abc123'
});

console.log(response.data.article);

import requests

response = requests.get(
    'https://api.ujeebu.com/extract',
    params={'url': 'https://example.com/article'},
    headers={
        'ApiKey': 'YOUR_API_KEY',
        'UJB-Authorization': 'Bearer your-token',
        'UJB-Cookie': 'session=abc123'
    }
)

print(response.json()['article'])

response = ujeebu.extract(
    'https://example.com/article',
    headers={
        'Authorization': 'Bearer your-token',
        'Cookie': 'session=abc123'
    }
)

data = response.json()
print(data['article'])

OkHttpClient client = new OkHttpClient();

Request request = new Request.Builder()
  .url("https://api.ujeebu.com/extract?url=https://example.com/article")
  .addHeader("ApiKey", "YOUR_API_KEY")
  .addHeader("UJB-Authorization", "Bearer your-token")
  .addHeader("UJB-Cookie", "session=abc123")
  .build();

Response response = client.newCall(request).execute();
System.out.println(response.body().string());

<?php

$curl = curl_init();

curl_setopt_array($curl, [
  CURLOPT_URL => 'https://api.ujeebu.com/extract?url=https://example.com/article',
  CURLOPT_RETURNTRANSFER => true,
  CURLOPT_HTTPHEADER => [
    'ApiKey: YOUR_API_KEY',
    'UJB-Authorization: Bearer your-token',
    'UJB-Cookie: session=abc123'
  ],
]);

$response = curl_exec($curl);
curl_close($curl);

echo $response;

package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
)

// main sends a GET request to the extract endpoint with two extra UJB-prefixed
// headers (UJB-Authorization and UJB-Cookie) alongside the ApiKey header, and
// prints the raw response body. Any failure aborts the program with a panic.
func main() {
	client := &http.Client{}
	req, err := http.NewRequest("GET", "https://api.ujeebu.com/extract?url=https://example.com/article", nil)
	if err != nil {
		panic(err)
	}
	req.Header.Add("ApiKey", "YOUR_API_KEY")
	req.Header.Add("UJB-Authorization", "Bearer your-token")
	req.Header.Add("UJB-Cookie", "session=abc123")

	// res may be nil when Do fails, so check the error before touching res.Body.
	res, err := client.Do(req)
	if err != nil {
		panic(err)
	}
	defer res.Body.Close()

	body, err := ioutil.ReadAll(res.Body)
	if err != nil {
		panic(err)
	}
	fmt.Println(string(body))
}

article, _, err := client.Extract(ujeebu.ExtractParams{
	URL: "https://example.com/article",
	CustomHeaders: map[string]string{
		"Authorization": "Bearer your-token",
		"Cookie":        "session=abc123",
	},
})
if err != nil {
	panic(err)
}

fmt.Println(article)

Extract with Premium Proxy

Use premium proxies for better reliability and geographic targeting.

curl -X GET 'https://api.ujeebu.com/extract?url=https://example.com/article&proxy_type=premium&proxy_country=US' \
  -H "ApiKey: YOUR_API_KEY"

const response = await fetch('https://api.ujeebu.com/extract?url=https://example.com/article&proxy_type=premium&proxy_country=US', {
  headers: {
    'ApiKey': 'YOUR_API_KEY'
  }
});

const data = await response.json();
console.log(data.article);

const response = await client.extract('https://example.com/article', {
  proxy_type: 'premium',
  proxy_country: 'US'
});

console.log(response.data.article);

import requests

response = requests.get(
    'https://api.ujeebu.com/extract',
    params={
        'url': 'https://example.com/article',
        'proxy_type': 'premium',
        'proxy_country': 'US'
    },
    headers={'ApiKey': 'YOUR_API_KEY'}
)

print(response.json()['article'])

response = ujeebu.extract(
    'https://example.com/article',
    params={
        'proxy_type': 'premium',
        'proxy_country': 'US'
    }
)

data = response.json()
print(data['article'])

OkHttpClient client = new OkHttpClient();

Request request = new Request.Builder()
  .url("https://api.ujeebu.com/extract?url=https://example.com/article&proxy_type=premium&proxy_country=US")
  .addHeader("ApiKey", "YOUR_API_KEY")
  .build();

Response response = client.newCall(request).execute();
System.out.println(response.body().string());

<?php

$curl = curl_init();

curl_setopt_array($curl, [
  CURLOPT_URL => 'https://api.ujeebu.com/extract?url=https://example.com/article&proxy_type=premium&proxy_country=US',
  CURLOPT_RETURNTRANSFER => true,
  CURLOPT_HTTPHEADER => [
    'ApiKey: YOUR_API_KEY'
  ],
]);

$response = curl_exec($curl);
curl_close($curl);

echo $response;

package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
)

// main sends a GET request to the extract endpoint with proxy_type=premium and
// proxy_country=US as query parameters, and prints the raw response body.
// Any failure aborts the program with a panic.
func main() {
	client := &http.Client{}
	req, err := http.NewRequest("GET", "https://api.ujeebu.com/extract?url=https://example.com/article&proxy_type=premium&proxy_country=US", nil)
	if err != nil {
		panic(err)
	}
	req.Header.Add("ApiKey", "YOUR_API_KEY")

	// res may be nil when Do fails, so check the error before touching res.Body.
	res, err := client.Do(req)
	if err != nil {
		panic(err)
	}
	defer res.Body.Close()

	body, err := ioutil.ReadAll(res.Body)
	if err != nil {
		panic(err)
	}
	fmt.Println(string(body))
}

article, _, err := client.Extract(ujeebu.ExtractParams{
	URL:          "https://example.com/article",
	ProxyType:    "premium",
	ProxyCountry: "US",
})
if err != nil {
	panic(err)
}

fmt.Println(article)

Extract with Pagination

Automatically extract multi-page articles by following pagination links.

curl -X GET 'https://api.ujeebu.com/extract?url=https://example.com/article&pagination=true&pagination_max_pages=5' \
  -H "ApiKey: YOUR_API_KEY"

const response = await fetch('https://api.ujeebu.com/extract?url=https://example.com/article&pagination=true&pagination_max_pages=5', {
  headers: {
    'ApiKey': 'YOUR_API_KEY'
  }
});

const data = await response.json();
// Article text will include content from all pages
console.log(data.article.text);

const response = await client.extract('https://example.com/article', {
  pagination: true,
  pagination_max_pages: 5
});

// Article text will include content from all pages
console.log(response.data.article.text);

import requests

response = requests.get(
    'https://api.ujeebu.com/extract',
    params={
        'url': 'https://example.com/article',
        'pagination': 'true',
        'pagination_max_pages': 5
    },
    headers={'ApiKey': 'YOUR_API_KEY'}
)

data = response.json()
# Article text will include content from all pages
print(data['article']['text'])

response = ujeebu.extract(
    'https://example.com/article',
    params={
        'pagination': True,
        'pagination_max_pages': 5
    }
)

data = response.json()
# Article text will include content from all pages
print(data['article']['text'])

OkHttpClient client = new OkHttpClient();

Request request = new Request.Builder()
  .url("https://api.ujeebu.com/extract?url=https://example.com/article&pagination=true&pagination_max_pages=5")
  .addHeader("ApiKey", "YOUR_API_KEY")
  .build();

Response response = client.newCall(request).execute();
System.out.println(response.body().string());

<?php

$curl = curl_init();

curl_setopt_array($curl, [
  CURLOPT_URL => 'https://api.ujeebu.com/extract?url=https://example.com/article&pagination=true&pagination_max_pages=5',
  CURLOPT_RETURNTRANSFER => true,
  CURLOPT_HTTPHEADER => [
    'ApiKey: YOUR_API_KEY'
  ],
]);

$response = curl_exec($curl);
curl_close($curl);

echo $response;

package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
)

// main sends a GET request to the extract endpoint with pagination=true and
// pagination_max_pages=5 as query parameters, and prints the raw response
// body. Any failure aborts the program with a panic.
func main() {
	client := &http.Client{}
	req, err := http.NewRequest("GET", "https://api.ujeebu.com/extract?url=https://example.com/article&pagination=true&pagination_max_pages=5", nil)
	if err != nil {
		panic(err)
	}
	req.Header.Add("ApiKey", "YOUR_API_KEY")

	// res may be nil when Do fails, so check the error before touching res.Body.
	res, err := client.Do(req)
	if err != nil {
		panic(err)
	}
	defer res.Body.Close()

	body, err := ioutil.ReadAll(res.Body)
	if err != nil {
		panic(err)
	}
	fmt.Println(string(body))
}

article, _, err := client.Extract(ujeebu.ExtractParams{
	URL:                "https://example.com/article",
	Pagination:         true,
	PaginationMaxPages: 5,
})
if err != nil {
	panic(err)
}

// Article text will include content from all pages
fmt.Println(article.Text)

Quick Mode for Faster Extraction

Use quick mode to reduce extraction time by 30-60% with simplified processing.

curl -X GET 'https://api.ujeebu.com/extract?url=https://example.com/article&quick_mode=true' \
  -H "ApiKey: YOUR_API_KEY"

const response = await fetch('https://api.ujeebu.com/extract?url=https://example.com/article&quick_mode=true', {
  headers: {
    'ApiKey': 'YOUR_API_KEY'
  }
});

const data = await response.json();
console.log(data.article);

const response = await client.extract('https://example.com/article', {
  quick_mode: true
});

console.log(response.data.article);

import requests

response = requests.get(
    'https://api.ujeebu.com/extract',
    params={
        'url': 'https://example.com/article',
        'quick_mode': 'true'
    },
    headers={'ApiKey': 'YOUR_API_KEY'}
)

print(response.json()['article'])

response = ujeebu.extract(
    'https://example.com/article',
    params={'quick_mode': True}
)

data = response.json()
print(data['article'])

OkHttpClient client = new OkHttpClient();

Request request = new Request.Builder()
  .url("https://api.ujeebu.com/extract?url=https://example.com/article&quick_mode=true")
  .addHeader("ApiKey", "YOUR_API_KEY")
  .build();

Response response = client.newCall(request).execute();
System.out.println(response.body().string());

<?php

$curl = curl_init();

curl_setopt_array($curl, [
  CURLOPT_URL => 'https://api.ujeebu.com/extract?url=https://example.com/article&quick_mode=true',
  CURLOPT_RETURNTRANSFER => true,
  CURLOPT_HTTPHEADER => [
    'ApiKey: YOUR_API_KEY'
  ],
]);

$response = curl_exec($curl);
curl_close($curl);

echo $response;

package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
)

// main sends a GET request to the extract endpoint with quick_mode=true as a
// query parameter and prints the raw response body. Any failure aborts the
// program with a panic.
func main() {
	client := &http.Client{}
	req, err := http.NewRequest("GET", "https://api.ujeebu.com/extract?url=https://example.com/article&quick_mode=true", nil)
	if err != nil {
		panic(err)
	}
	req.Header.Add("ApiKey", "YOUR_API_KEY")

	// res may be nil when Do fails, so check the error before touching res.Body.
	res, err := client.Do(req)
	if err != nil {
		panic(err)
	}
	defer res.Body.Close()

	body, err := ioutil.ReadAll(res.Body)
	if err != nil {
		panic(err)
	}
	fmt.Println(string(body))
}

article, _, err := client.Extract(ujeebu.ExtractParams{
	URL:       "https://example.com/article",
	QuickMode: true,
})
if err != nil {
	panic(err)
}

fmt.Println(article)

Stripping tags

If you want to delete some html element(s) before the extraction is carried out, use parameter strip_tags to pass a comma-separated list of css selectors of elements to delete.

INFO

For example, passing strip_tags=meta,form,input,.hidden will remove any meta, form and input tags as well as any element with class hidden.

Passing custom headers

The extract endpoint will forward any headers with the `UJB-` prefix to the target URL.

# Headers carrying the UJB- prefix are forwarded to the target URL.
# The URL is quoted so the shell does not interpret the '?' it contains.
curl -i \
  -H 'UJB-Username: username' \
  -H 'UJB-Authorization: Basic dXNlcm5hbWU6cGFzc3dvcmQ=' \
  -H 'ApiKey: <API Key>' \
  -X GET \
  'https://api.ujeebu.com/extract?url=https://ujeebu.com/blog/how-to-extract-clean-text-from-html'

The code above will return the following response:

{
  "article": {

    "author": "Sam",
    "pub_date": "2019-08-09 12:42:25",
    "is_article": 1,
    "url": "https://ujeebu.com/blog/how-to-extract-clean-text-from-html",
    "canonical_url": "https://ujeebu.com/blog/how-to-extract-clean-text-from-html/",
    "title": "Extracting clean data from blog and news articles",
    "site_name": "Ujeebu blog",
    "favicon": "https://ujeebu.com/blog/favicon.png",
    "encoding": "utf-8",
    "pages": ["https://ujeebu.com/blog/how-to-extract-clean-text-from-html/"]
  },
  "time": 6.366053104400635,
  "js": false,
  "pagination": false
}

Using Proxies

Using Your Own Proxy

Auto Proxy

When dealing with difficult websites that block requests or have aggressive anti-bot measures, use the auto_proxy parameter to automatically try different proxies until one succeeds.

TIP - How Auto Proxy Works

When auto_proxy=true, Ujeebu automatically cycles through available proxy types in sequence. If one proxy fails (network error, timeout, or status ≥ 400 other than 404), it immediately retries with the next proxy. This continues until a successful response is received or all proxies have been tried.

Retry Flow:

Select proxy → Make request → Check result
On failure (error, timeout, status ≥ 400 except 404) → Try next proxy
On success (status 200-399) → Return response
Final fallback → Direct connection (no proxy)

curl -X GET 'https://api.ujeebu.com/extract?url=https://difficult-site.com/article&auto_proxy=true&js=true' \
  -H "ApiKey: YOUR_API_KEY"

const response = await fetch('https://api.ujeebu.com/extract?url=https://difficult-site.com/article&auto_proxy=true&js=true', {
  headers: {
    'ApiKey': 'YOUR_API_KEY'
  }
});

const data = await response.json();
console.log(data.article.title, data.article.text);

const response = await client.extract('https://difficult-site.com/article', {
  auto_proxy: true,
  js: true  // Combine with JS rendering if needed
});

console.log(response.data.article.title, response.data.article.text);

import requests

response = requests.get(
    'https://api.ujeebu.com/extract',
    params={
        'url': 'https://difficult-site.com/article',
        'auto_proxy': 'true',
        'js': 'true'
    },
    headers={'ApiKey': 'YOUR_API_KEY'}
)

data = response.json()
print(data['article']['title'], data['article']['text'])

response = ujeebu.extract(
    'https://difficult-site.com/article',
    params={
        'auto_proxy': True,
        'js': True  # Combine with JS rendering if needed
    }
)

data = response.json()
print(data['article']['title'], data['article']['text'])

OkHttpClient client = new OkHttpClient();

Request request = new Request.Builder()
  .url("https://api.ujeebu.com/extract?url=https://difficult-site.com/article&auto_proxy=true&js=true")
  .addHeader("ApiKey", "YOUR_API_KEY")
  .build();

Response response = client.newCall(request).execute();
System.out.println(response.body().string());

<?php

$curl = curl_init();

curl_setopt_array($curl, [
  CURLOPT_URL => 'https://api.ujeebu.com/extract?url=https://difficult-site.com/article&auto_proxy=true&js=true',
  CURLOPT_RETURNTRANSFER => true,
  CURLOPT_HTTPHEADER => [
    'ApiKey: YOUR_API_KEY'
  ],
]);

$response = curl_exec($curl);
curl_close($curl);

echo $response;

package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
)

// main requests an article through the Extract API with auto_proxy and js
// enabled and prints the response body to stdout.
//
// Every step that can fail is checked: a failed request used to leave res nil
// and crash on res.Body.Close() with a nil dereference instead of surfacing
// the underlying error.
func main() {
	client := &http.Client{}
	req, err := http.NewRequest("GET", "https://api.ujeebu.com/extract?url=https://difficult-site.com/article&auto_proxy=true&js=true", nil)
	if err != nil {
		panic(err)
	}
	req.Header.Add("ApiKey", "YOUR_API_KEY")

	res, err := client.Do(req)
	if err != nil {
		// res is nil when Do returns an error, so stop before touching res.Body.
		panic(err)
	}
	defer res.Body.Close()

	body, err := ioutil.ReadAll(res.Body)
	if err != nil {
		panic(err)
	}
	fmt.Println(string(body))
}

// Extract the article with automatic proxy selection enabled.
article, _, err := client.Extract(ujeebu.ExtractParams{
	URL:       "https://difficult-site.com/article",
	AutoProxy: true,
	JS:        true, // Combine with JS rendering if needed
})
if err != nil {
	panic(err)
}

fmt.Println(article.Title, article.Text)

When to use Auto Proxy:

✅ Extracting articles from sites known to block requests
✅ Handling geo-restricted content
✅ Improving success rates for critical extraction tasks
✅ Scraping news sites with aggressive anti-bot measures

Important notes:

You are billed only for the proxy option that succeeds
Each failed attempt still contributes to processing time
A 404 response does NOT trigger a retry (it's a valid "not found")

For detailed documentation, see the Auto Proxy Guide.

CAPTCHA Solving

The Extract API can automatically detect and solve CAPTCHAs on web pages using external solving services. This is an opt-in feature that supports:

reCAPTCHA v2/v3 (including invisible and enterprise versions)
hCaptcha
Cloudflare Turnstile
FunCaptcha (Arkose Labs)
GeeTest
Image CAPTCHAs

INFO - How it works

When auto_captcha_solve is enabled, the API automatically detects CAPTCHAs on the page, sends them to a solving service, and injects the solution token back into the page. This happens transparently before extracting the article content.

Code Example

curl -X GET 'https://api.ujeebu.com/extract?url=https://example.com/protected-article&js=true&auto_captcha_solve=true&auto_captcha_solve_timeout=120000' \
  -H "ApiKey: YOUR_API_KEY"

const response = await fetch('https://api.ujeebu.com/extract?url=https://example.com/protected-article&js=true&auto_captcha_solve=true&auto_captcha_solve_timeout=120000', {
  headers: {
    'ApiKey': 'YOUR_API_KEY'
  }
});

const data = await response.json();
console.log(data.article);

const response = await client.extract('https://example.com/protected-article', {
  js: true,
  auto_captcha_solve: true,
  auto_captcha_solve_timeout: 120000 // 2 minutes
});

console.log(response.data.article);

import requests

response = requests.get(
    'https://api.ujeebu.com/extract',
    params={
        'url': 'https://example.com/protected-article',
        'js': 'true',
        'auto_captcha_solve': 'true',
        'auto_captcha_solve_timeout': 120000
    },
    headers={'ApiKey': 'YOUR_API_KEY'}
)

print(response.json()['article'])

response = ujeebu.extract(
    'https://example.com/protected-article',
    params={
        'js': True,
        'auto_captcha_solve': True,
        'auto_captcha_solve_timeout': 120000  # 2 minutes
    }
)

data = response.json()
print(data['article'])

package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
)

// main requests a CAPTCHA-protected article through the Extract API with js
// and auto_captcha_solve enabled (auto_captcha_solve_timeout=120000) and
// prints the response body to stdout.
//
// Every step that can fail is checked: a failed request used to leave res nil
// and crash on res.Body.Close() with a nil dereference instead of surfacing
// the underlying error.
func main() {
	client := &http.Client{}
	req, err := http.NewRequest("GET", "https://api.ujeebu.com/extract?url=https://example.com/protected-article&js=true&auto_captcha_solve=true&auto_captcha_solve_timeout=120000", nil)
	if err != nil {
		panic(err)
	}
	req.Header.Add("ApiKey", "YOUR_API_KEY")

	res, err := client.Do(req)
	if err != nil {
		// res is nil when Do returns an error, so stop before touching res.Body.
		panic(err)
	}
	defer res.Body.Close()

	body, err := ioutil.ReadAll(res.Body)
	if err != nil {
		panic(err)
	}
	fmt.Println(string(body))
}

// Extract the article with JS rendering and automatic CAPTCHA solving enabled.
article, _, err := client.Extract(ujeebu.ExtractParams{
	URL:                     "https://example.com/protected-article",
	JS:                      true,
	AutoCaptchaSolve:        true,
	AutoCaptchaSolveTimeout: 120000, // 2 minutes
})
if err != nil {
	panic(err)
}

fmt.Println(article)

WARNING - Additional credits

CAPTCHA solving relies on external services and costs an additional 5 credits on top of the base request cost when a CAPTCHA is detected and solved.

Rate Limits & Credits

Usage Tracking

To track credit usage programmatically, call the /account endpoint. See Account API for the full reference, response shape, and rate limit (10 calls/minute).

Ready to build?

Spin up an API key in 60 seconds

Free tier: 5,000 credits, no card, full access to every endpoint on this page.

Get free API key or try the playground →

← Back to

Extract API

Article Preview API