Playground Sign in Start free

Article Extractor API

export const extractFullPath =

Article Extractor API

Convert any news or blog article into clean, structured JSON data. Extract text, author, publish date, images, and embedded media with a single API call.

Authentication

All API requests require authentication using an API key. Include your API key in the request header.

INFO — Get your API key

Sign up for a free account to receive your API key instantly at ujeebu.com/signup.

Header Format

ApiKey: YOUR_API_KEY

Basic Request

Make an extraction request by sending a GET request to the endpoint with the target article URL.

Endpoint

GET https://api.ujeebu.com/extract

Code Examples

curl -X GET 'https://api.ujeebu.com/extract?url=https://example.com/article' \
  -H "ApiKey: YOUR_API_KEY"
const response = await fetch('https://api.ujeebu.com/extract?url=https://example.com/article', {
  headers: {
    'ApiKey': 'YOUR_API_KEY'
  }
});

const data = await response.json();
console.log(data);
import { UjeebuClient } from '@ujeebu-org/ujeebu-sdk';

const client = new UjeebuClient(process.env.UJEEBU_API_KEY);

const response = await client.extract('https://example.com/article');

console.log(response.data.article);
import requests

response = requests.get(
    'https://api.ujeebu.com/extract',
    params={'url': 'https://example.com/article'},
    headers={'ApiKey': 'YOUR_API_KEY'}
)

print(response.json())
from ujeebu_python import UjeebuClient

ujeebu = UjeebuClient(api_key='YOUR_API_KEY')

response = ujeebu.extract(
    url='https://example.com/article'
)

data = response.json()
print(data['article'])
OkHttpClient client = new OkHttpClient();

Request request = new Request.Builder()
  .url("https://api.ujeebu.com/extract?url=https://example.com/article")
  .addHeader("ApiKey", "YOUR_API_KEY")
  .build();

Response response = client.newCall(request).execute();
System.out.println(response.body().string());
<?php

$curl = curl_init();

curl_setopt_array($curl, [
  CURLOPT_URL => 'https://api.ujeebu.com/extract?url=https://example.com/article',
  CURLOPT_RETURNTRANSFER => true,
  CURLOPT_HTTPHEADER => [
    'ApiKey: YOUR_API_KEY'
  ],
]);

$response = curl_exec($curl);
curl_close($curl);

echo $response;
package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
)

func main() {
	client := &http.Client{}
	req, _ := http.NewRequest("GET", "https://api.ujeebu.com/extract?url=https://example.com/article", nil)
	req.Header.Add("ApiKey", "YOUR_API_KEY")
	
	res, err := client.Do(req)
	if err != nil {
		panic(err)
	}
	defer res.Body.Close()
	
	body, _ := ioutil.ReadAll(res.Body)
	fmt.Println(string(body))
}
package main

import (
	"fmt"
	"github.com/ujeebu/ujeebu-go"
)

func main() {
	client, err := ujeebu.NewClient("YOUR_API_KEY")
	if err != nil {
		panic(err)
	}
	
	article, _, err := client.Extract(ujeebu.ExtractParams{
		URL: "https://example.com/article",
	})
	if err != nil {
		panic(err)
	}
	
	fmt.Println(article)
}

Request Parameters

Parameter Type Required Default Description
url string Yes - URL of article to be extracted.
raw_html string No null HTML of article to be extracted. When this is passed, article extraction is carried out on the value of this parameter (i.e. without fetching article from url), however the extractor still relies on url to resolve relative links and relatively referenced assets in the provided html.
js boolean No false Indicates whether to execute JavaScript or not. Set to 'auto' to let the extractor decide.
text boolean No true Indicates whether API should return extracted text.
html boolean No true Indicates whether API should extract html.
media boolean No false Indicates whether API should extract media.
feeds boolean No false Indicates whether API should extract RSS feeds.
images boolean No true Indicates whether API should extract all images present in HTML.
author boolean No true Indicates whether API should extract article's author.
pub_date boolean No true Indicates whether API should extract article's publish date.
partial number No 0 Number of characters, or percentage (if a percent sign is present), of the text/html to be returned. 0 means all.
is_article boolean No true When true returns the probability [0-1] of URL being an article. Anything scoring 0.5 and above should be an article, but this may slightly vary from one site to another.
quick_mode boolean No false When true, does a quick analysis of the content instead of the normal advanced parsing. Usually cuts down response time by about 30% to 60%.
strip_tags csv-string No form Indicates which tags to strip from the extracted article HTML. Expects a comma separated list of tag names/css selectors.
timeout number No 60 Maximum number of seconds before request timeout.
js_timeout number No timeout/2 When js is enabled, indicates how many seconds the API should wait for the JS engine to render the supplied URL.
scroll_down boolean No false Indicates whether to scroll down the page or not, this applies only when js is enabled.
scroll_wait number No 100 Wait time in milliseconds between scroll actions when scroll_down is enabled.
scroll_percent number No null Percentage of the page to scroll (0-100). Used when scroll_down is enabled.
progressive_scroll boolean No false Enable progressive scrolling behavior for better dynamic content loading.
scroll_callback string No null JavaScript callback function to execute during scroll events.
scroll_to_selector string No null CSS selector to scroll to a specific element on the page.
wait_until string No load Controls when page load is considered complete. Possible values: 'load' (wait for load event), 'domcontentloaded' (wait for DOMContentLoaded), 'networkidle' (wait for network to be idle), 'commit' (wait for initial HTML commit).
image_analysis boolean No true Indicates whether API should analyse images for minimum width and height (see parameters min_image_width and min_image_height for more details).
min_image_width number No 200 Minimum width of the images kept in the HTML (if image_analysis is false this parameter has no effect).
min_image_height number No 100 Minimum height of the images kept in the HTML (if image_analysis is false this parameter has no effect).
image_timeout number No 2 Image fetching timeout in seconds.
return_only_enclosed_text_images boolean No true Indicates whether to return only images that are enclosed within extracted article HTML.
main_image_in_html boolean No false Include the main image in the extracted HTML body.
publisher_country boolean No false Extract publisher country information from the article.
publisher_tz boolean No false Extract publisher timezone information from the article.
heavy_mode boolean No false Enable more thorough extraction mode with deeper content analysis.
text_length string No priority Text selection mode for extraction algorithm. Possible values: 'conservative', 'auto', 'optimistic', 'priority'.
proxy_type string No rotating Indicates type of proxy to use. Possible values: 'rotating', 'advanced', 'premium', 'residential', 'mobile', 'custom'. When using 'residential' with a non-US proxy_country, requests are automatically routed through residential_geo proxies.
proxy_country string No US Country ISO 3166-1 alpha-2 code to proxy from. Valid only when premium proxy type is chosen.
custom_proxy string No null URI for your custom proxy in the following format: scheme://user:pass@host:port. Applicable and required only if proxy_type=custom.
auto_proxy boolean No false Enable a more advanced proxy by default when rotating proxy is not working. It will move to the next proxy option until it gets the content and will only stop when content is available or none of the options worked. Please note that you are billed only on the top option attempted.
auto_premium_proxy boolean No false Automatically use premium proxy on failures with rotating proxy.
custom_proxy_username string No null Username for custom proxy authentication when using custom_proxy.
custom_proxy_password string No null Password for custom proxy authentication when using custom_proxy.
session_id alphanumeric No null Alphanumeric identifier with a length between 1 and 16 characters, used to route multiple requests from the same proxy instance. Sessions remain active for 30 minutes.
pagination boolean No true Extract and concatenate multiple-page articles.
pagination_max_pages number No 30 Indicates the number of pages to extract when pagination is enabled.
cookies string No null Cookie string to send with the request. Can be a JSON object mapping cookie names to values or a cookie string.
js_use string No null JavaScript engine to use for rendering. Possible values: 'browserless', 'extract_browserless'.
html_timeout number No timeout Timeout in seconds specifically for HTML fetching operations.
block_ads boolean No false Block advertisements and trackers during page scraping.
no_html_cache boolean No false Disable HTML caching for this request.
UJB-headerName string No null Indicates which headers to send to the target URL. This can be useful when, for example, the article is behind a paywall and you need to pass your authentication cookies.
auto_captcha_solve boolean No false Enable automatic CAPTCHA detection and solving using external services. Supports reCAPTCHA v2/v3, hCaptcha, Cloudflare Turnstile, FunCaptcha, GeeTest, and image CAPTCHAs. When enabled, requests automatically use super mode with JavaScript rendering.
auto_captcha_solve_timeout number No 120000 Timeout in milliseconds for CAPTCHA solving. CAPTCHAs typically take 20-60 seconds to solve.

Response Format

Status Meaning Description Schema
200 OK successful operation SuccessResponse
400 Bad Request Invalid parameter value APIResponseError

Article Schema

{
  "url": "string",
  "canonical_url": "string",
  "title": "string",
  "text": "string",
  "html": "string",
  "summary": "string",
  "image": "string",
  "images": ["string"],
  "media": ["string"],
  "feeds": ["string"],
  "language": "string",
  "author": "string",
  "pub_date": "string",
  "modified_date": "string",
  "site_name": "string",
  "favicon": "string",
  "encoding": "string",
  "is_article": 0.0,
  "next_page": "string",
  "publisher_country": [{"name": "string"}],
  "publisher_tz": [{"name": "string"}]
}

Properties

Name Type Description
url string the URL parameter.
canonical_url string the final (resolved) URL.
title string the title of the article.
text string the extracted text.
html string the extracted html.
summary string summary (if available) of the article text.
image string main image of the article.
images [string] all images present in article.
media [string] all media present in article.
feeds [string] RSS feeds found on the page.
language string language code of article text.
author string author of article.
pub_date string publication date of article.
modified_date string last modified date of article.
site_name string name of site hosting article.
favicon string favicon of site hosting article.
encoding string character encoding of article text.
is_article number probability [0-1] of URL being an article.
next_page string URL of the next page (when pagination is detected).
publisher_country [object] publisher country info (when publisher_country is true).
publisher_tz [object] publisher timezone info (when publisher_tz is true).

Success Response example

{
    "article": {
        "text": "I began learning German at the age of 13, and I\u2019m still trying to explain to myself why it was love at first sound. The answer must surely be: the excellence of my teacher. At an English public school not famed for its cultural generosity, Mr King was that rare thing: a kindly and intelligent man who, in the thick of the second world war, determinedly loved the Germany that he knew was still there somewhere.\nRather than join the chorus of anti-German propaganda, he preferred, doggedly, to inspire his little class with the beauty of the language, and of its literature and culture. One day, he used to say, the real Germany will come back. And he was right. Because now it has.\nWhy was it love at first sound for me? Well...",
        "html": "<p><span>I<\/span> began learning German at the age of 13, and I’m still trying to explain to myself why it was love at first sound. The answer must surely be: the excellence of my teacher. At an English public school not famed for its cultural generosity, Mr King was that rare thing: a kindly and intelligent man who, in the thick of the second world war, determinedly loved the Germany that he knew was still there somewhere.<\/p><p>Rather than join the chorus of anti-German propaganda, he preferred, doggedly, to inspire his little class with the beauty of the language, and of its literature and culture. One day, he used to say, the real Germany will come back. And he was right. Because now it has....",
        "media": [],
        "images": [],
        "author": "John le Carr\u00e9",
        "pub_date": "2017-07-01 23:05:12",
        "is_article": 1,
        "url": "https:\/\/www.theguardian.com\/education\/2017\/jul\/02\/why-we-should-learn-german-john-le-carre",
        "canonical_url": "https:\/\/www.theguardian.com\/education\/2017\/jul\/02\/why-we-should-learn-german-john-le-carre",
        "title": "Why we should learn German | John le Carr\u00e9",
        "language": "en",
        "image": "https:\/\/i.guim.co.uk\/img\/media\/f19eff6f7e1751d88b38e725cfbe6687084d5f64\/0_235_9010_5405\/master\/9010.jpg?width=1200&height=630&quality=85&auto=format&fit=crop&overlay-align=bottom%2Cleft&overlay-width=100p&overlay-base64=L2ltZy9zdGF0aWMvb3ZlcmxheXMvdG8tb3BpbmlvbnMtYWdlLTIwMTcucG5n&enable=upscale&s=efeec857dffdb94cd84c4b652b4e287f",
        "summary": "To help make the European debate decent and civilised, it is now more important than ever to value the skills of the linguist",
        "modified_date": "2017-12-02 03:00:56",
        "site_name": "the Guardian",
        "favicon": "https:\/\/static.guim.co.uk\/images\/favicon-32x32.ico",
        "encoding": "utf-8"
    },
    "time": 0.85
}

Error Response Schema

{
  "url": "string",
  "message": "string",
  "error_code": 400,
  "errors": ["string"]
}

Properties

Name Type Description
url string Given URL
message string Error message
error_code number HTTP status code
errors [string] List of all errors

Response Codes

Code Billed Meaning Suggestion
200 Yes Successful request -
400 No Some required parameter is missing (URL) Set the missing parameter (e.g. url)
401 No Missing API-KEY Provide API-KEY
404 Yes Provided URL not found Provide a valid URL
408 Yes Request timeout Increase timeout parameter, use premium proxy or force JS
429 No Too many requests Upgrade your plan
500 No Internal error Retry request or contact us

Examples

TIP — Quick Tip

Use the searchable parameter table above to quickly find the parameters you need for your use case.

Extract with JavaScript Rendering

Enable JavaScript execution to extract content from pages that rely on client-side rendering.

curl -X GET 'https://api.ujeebu.com/extract?url=https://example.com/article&js=true&js_timeout=10000&wait_until=networkidle' \
  -H "ApiKey: YOUR_API_KEY"
const response = await fetch('https://api.ujeebu.com/extract?url=https://example.com/article&js=true&js_timeout=10000&wait_until=networkidle', {
  headers: {
    'ApiKey': 'YOUR_API_KEY'
  }
});

const data = await response.json();
console.log(data.article.title, data.article.text);
const response = await client.extract('https://example.com/article', {
  js: true,
  js_timeout: 10000,
  wait_until: 'networkidle'
});

console.log(response.data.article.title, response.data.article.text);
import requests

response = requests.get(
    'https://api.ujeebu.com/extract',
    params={
        'url': 'https://example.com/article',
        'js': 'true',
        'js_timeout': 10000,
        'wait_until': 'networkidle'
    },
    headers={'ApiKey': 'YOUR_API_KEY'}
)

data = response.json()
print(data['article']['title'], data['article']['text'])
response = ujeebu.extract(
    'https://example.com/article',
    params={
        'js': True,
        'js_timeout': 10000,
        'wait_until': 'networkidle'
    }
)

data = response.json()
print(data['article']['title'], data['article']['text'])
OkHttpClient client = new OkHttpClient();

Request request = new Request.Builder()
  .url("https://api.ujeebu.com/extract?url=https://example.com/article&js=true&js_timeout=10000&wait_until=networkidle")
  .addHeader("ApiKey", "YOUR_API_KEY")
  .build();

Response response = client.newCall(request).execute();
System.out.println(response.body().string());
<?php

$curl = curl_init();

curl_setopt_array($curl, [
  CURLOPT_URL => 'https://api.ujeebu.com/extract?url=https://example.com/article&js=true&js_timeout=10000&wait_until=networkidle',
  CURLOPT_RETURNTRANSFER => true,
  CURLOPT_HTTPHEADER => [
    'ApiKey: YOUR_API_KEY'
  ],
]);

$response = curl_exec($curl);
curl_close($curl);

echo $response;
package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
)

func main() {
	client := &http.Client{}
	req, _ := http.NewRequest("GET", "https://api.ujeebu.com/extract?url=https://example.com/article&js=true&js_timeout=10000&wait_until=networkidle", nil)
	req.Header.Add("ApiKey", "YOUR_API_KEY")
	
	res, _ := client.Do(req)
	defer res.Body.Close()
	body, _ := ioutil.ReadAll(res.Body)
	fmt.Println(string(body))
}
article, _, err := client.Extract(ujeebu.ExtractParams{
	URL:       "https://example.com/article",
	JS:        true,
	JSTimeout: 10000,
	WaitUntil: "networkidle",
})
if err != nil {
	panic(err)
}

fmt.Println(article.Title, article.Text)

Extract with Custom Headers

Pass custom headers to access content behind authentication or paywalls.

curl -X GET 'https://api.ujeebu.com/extract?url=https://example.com/article' \
  -H "ApiKey: YOUR_API_KEY" \
  -H "UJB-Authorization: Bearer your-token" \
  -H "UJB-Cookie: session=abc123"
const response = await fetch('https://api.ujeebu.com/extract?url=https://example.com/article', {
  headers: {
    'ApiKey': 'YOUR_API_KEY',
    'UJB-Authorization': 'Bearer your-token',
    'UJB-Cookie': 'session=abc123'
  }
});

const data = await response.json();
console.log(data.article);
const response = await client.extract('https://example.com/article', {}, {
  'Authorization': 'Bearer your-token',
  'Cookie': 'session=abc123'
});

console.log(response.data.article);
import requests

response = requests.get(
    'https://api.ujeebu.com/extract',
    params={'url': 'https://example.com/article'},
    headers={
        'ApiKey': 'YOUR_API_KEY',
        'UJB-Authorization': 'Bearer your-token',
        'UJB-Cookie': 'session=abc123'
    }
)

print(response.json()['article'])
response = ujeebu.extract(
    'https://example.com/article',
    headers={
        'Authorization': 'Bearer your-token',
        'Cookie': 'session=abc123'
    }
)

data = response.json()
print(data['article'])
OkHttpClient client = new OkHttpClient();

Request request = new Request.Builder()
  .url("https://api.ujeebu.com/extract?url=https://example.com/article")
  .addHeader("ApiKey", "YOUR_API_KEY")
  .addHeader("UJB-Authorization", "Bearer your-token")
  .addHeader("UJB-Cookie", "session=abc123")
  .build();

Response response = client.newCall(request).execute();
System.out.println(response.body().string());
<?php

$curl = curl_init();

curl_setopt_array($curl, [
  CURLOPT_URL => 'https://api.ujeebu.com/extract?url=https://example.com/article',
  CURLOPT_RETURNTRANSFER => true,
  CURLOPT_HTTPHEADER => [
    'ApiKey: YOUR_API_KEY',
    'UJB-Authorization: Bearer your-token',
    'UJB-Cookie: session=abc123'
  ],
]);

$response = curl_exec($curl);
curl_close($curl);

echo $response;
package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
)

func main() {
	client := &http.Client{}
	req, _ := http.NewRequest("GET", "https://api.ujeebu.com/extract?url=https://example.com/article", nil)
	req.Header.Add("ApiKey", "YOUR_API_KEY")
	req.Header.Add("UJB-Authorization", "Bearer your-token")
	req.Header.Add("UJB-Cookie", "session=abc123")
	
	res, _ := client.Do(req)
	defer res.Body.Close()
	body, _ := ioutil.ReadAll(res.Body)
	fmt.Println(string(body))
}
article, _, err := client.Extract(ujeebu.ExtractParams{
	URL: "https://example.com/article",
	CustomHeaders: map[string]string{
		"Authorization": "Bearer your-token",
		"Cookie":        "session=abc123",
	},
})
if err != nil {
	panic(err)
}

fmt.Println(article)

Extract with Premium Proxy

Use premium proxies for better reliability and geographic targeting.

curl -X GET 'https://api.ujeebu.com/extract?url=https://example.com/article&proxy_type=premium&proxy_country=US' \
  -H "ApiKey: YOUR_API_KEY"
const response = await fetch('https://api.ujeebu.com/extract?url=https://example.com/article&proxy_type=premium&proxy_country=US', {
  headers: {
    'ApiKey': 'YOUR_API_KEY'
  }
});

const data = await response.json();
console.log(data.article);
const response = await client.extract('https://example.com/article', {
  proxy_type: 'premium',
  proxy_country: 'US'
});

console.log(response.data.article);
import requests

response = requests.get(
    'https://api.ujeebu.com/extract',
    params={
        'url': 'https://example.com/article',
        'proxy_type': 'premium',
        'proxy_country': 'US'
    },
    headers={'ApiKey': 'YOUR_API_KEY'}
)

print(response.json()['article'])
response = ujeebu.extract(
    'https://example.com/article',
    params={
        'proxy_type': 'premium',
        'proxy_country': 'US'
    }
)

data = response.json()
print(data['article'])
OkHttpClient client = new OkHttpClient();

Request request = new Request.Builder()
  .url("https://api.ujeebu.com/extract?url=https://example.com/article&proxy_type=premium&proxy_country=US")
  .addHeader("ApiKey", "YOUR_API_KEY")
  .build();

Response response = client.newCall(request).execute();
System.out.println(response.body().string());
<?php

$curl = curl_init();

curl_setopt_array($curl, [
  CURLOPT_URL => 'https://api.ujeebu.com/extract?url=https://example.com/article&proxy_type=premium&proxy_country=US',
  CURLOPT_RETURNTRANSFER => true,
  CURLOPT_HTTPHEADER => [
    'ApiKey: YOUR_API_KEY'
  ],
]);

$response = curl_exec($curl);
curl_close($curl);

echo $response;
package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
)

func main() {
	client := &http.Client{}
	req, _ := http.NewRequest("GET", "https://api.ujeebu.com/extract?url=https://example.com/article&proxy_type=premium&proxy_country=US", nil)
	req.Header.Add("ApiKey", "YOUR_API_KEY")
	
	res, _ := client.Do(req)
	defer res.Body.Close()
	body, _ := ioutil.ReadAll(res.Body)
	fmt.Println(string(body))
}
article, _, err := client.Extract(ujeebu.ExtractParams{
	URL:          "https://example.com/article",
	ProxyType:    "premium",
	ProxyCountry: "US",
})
if err != nil {
	panic(err)
}

fmt.Println(article)

Extract with Pagination

Automatically extract multi-page articles by following pagination links.

curl -X GET 'https://api.ujeebu.com/extract?url=https://example.com/article&pagination=true&pagination_max_pages=5' \
  -H "ApiKey: YOUR_API_KEY"
const response = await fetch('https://api.ujeebu.com/extract?url=https://example.com/article&pagination=true&pagination_max_pages=5', {
  headers: {
    'ApiKey': 'YOUR_API_KEY'
  }
});

const data = await response.json();
// Article text will include content from all pages
console.log(data.article.text);
const response = await client.extract('https://example.com/article', {
  pagination: true,
  pagination_max_pages: 5
});

// Article text will include content from all pages
console.log(response.data.article.text);
import requests

response = requests.get(
    'https://api.ujeebu.com/extract',
    params={
        'url': 'https://example.com/article',
        'pagination': 'true',
        'pagination_max_pages': 5
    },
    headers={'ApiKey': 'YOUR_API_KEY'}
)

data = response.json()
# Article text will include content from all pages
print(data['article']['text'])
response = ujeebu.extract(
    'https://example.com/article',
    params={
        'pagination': True,
        'pagination_max_pages': 5
    }
)

data = response.json()
# Article text will include content from all pages
print(data['article']['text'])
OkHttpClient client = new OkHttpClient();

Request request = new Request.Builder()
  .url("https://api.ujeebu.com/extract?url=https://example.com/article&pagination=true&pagination_max_pages=5")
  .addHeader("ApiKey", "YOUR_API_KEY")
  .build();

Response response = client.newCall(request).execute();
System.out.println(response.body().string());
<?php

$curl = curl_init();

curl_setopt_array($curl, [
  CURLOPT_URL => 'https://api.ujeebu.com/extract?url=https://example.com/article&pagination=true&pagination_max_pages=5',
  CURLOPT_RETURNTRANSFER => true,
  CURLOPT_HTTPHEADER => [
    'ApiKey: YOUR_API_KEY'
  ],
]);

$response = curl_exec($curl);
curl_close($curl);

echo $response;
package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
)

func main() {
	client := &http.Client{}
	req, _ := http.NewRequest("GET", "https://api.ujeebu.com/extract?url=https://example.com/article&pagination=true&pagination_max_pages=5", nil)
	req.Header.Add("ApiKey", "YOUR_API_KEY")
	
	res, _ := client.Do(req)
	defer res.Body.Close()
	body, _ := ioutil.ReadAll(res.Body)
	fmt.Println(string(body))
}
article, _, err := client.Extract(ujeebu.ExtractParams{
	URL:                "https://example.com/article",
	Pagination:         true,
	PaginationMaxPages: 5,
})
if err != nil {
	panic(err)
}

// Article text will include content from all pages
fmt.Println(article.Text)

Quick Mode for Faster Extraction

Use quick mode to reduce extraction time by 30-60% with simplified processing.

curl -X GET 'https://api.ujeebu.com/extract?url=https://example.com/article&quick_mode=true' \
  -H "ApiKey: YOUR_API_KEY"
const response = await fetch('https://api.ujeebu.com/extract?url=https://example.com/article&quick_mode=true', {
  headers: {
    'ApiKey': 'YOUR_API_KEY'
  }
});

const data = await response.json();
console.log(data.article);
const response = await client.extract('https://example.com/article', {
  quick_mode: true
});

console.log(response.data.article);
import requests

response = requests.get(
    'https://api.ujeebu.com/extract',
    params={
        'url': 'https://example.com/article',
        'quick_mode': 'true'
    },
    headers={'ApiKey': 'YOUR_API_KEY'}
)

print(response.json()['article'])
response = ujeebu.extract(
    'https://example.com/article',
    params={'quick_mode': True}
)

data = response.json()
print(data['article'])
OkHttpClient client = new OkHttpClient();

Request request = new Request.Builder()
  .url("https://api.ujeebu.com/extract?url=https://example.com/article&quick_mode=true")
  .addHeader("ApiKey", "YOUR_API_KEY")
  .build();

Response response = client.newCall(request).execute();
System.out.println(response.body().string());
<?php

$curl = curl_init();

curl_setopt_array($curl, [
  CURLOPT_URL => 'https://api.ujeebu.com/extract?url=https://example.com/article&quick_mode=true',
  CURLOPT_RETURNTRANSFER => true,
  CURLOPT_HTTPHEADER => [
    'ApiKey: YOUR_API_KEY'
  ],
]);

$response = curl_exec($curl);
curl_close($curl);

echo $response;
package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
)

func main() {
	client := &http.Client{}
	req, _ := http.NewRequest("GET", "https://api.ujeebu.com/extract?url=https://example.com/article&quick_mode=true", nil)
	req.Header.Add("ApiKey", "YOUR_API_KEY")
	
	res, _ := client.Do(req)
	defer res.Body.Close()
	body, _ := ioutil.ReadAll(res.Body)
	fmt.Println(string(body))
}
article, _, err := client.Extract(ujeebu.ExtractParams{
	URL:       "https://example.com/article",
	QuickMode: true,
})
if err != nil {
	panic(err)
}

fmt.Println(article)

Stripping tags

If you want to delete some html element(s) before the extraction is carried out, use parameter strip_tags to pass a comma-separated list of css selectors of elements to delete.

INFO

For example, passing strip_tags=meta,form,input,.hidden will remove any meta, form and input tags as well as any element with class hidden.

Passing custom headers

The extract endpoint will forward any headers with the `UJB-` prefix to the target URL:
curl -i \
-H 'UJB-Username: username' \
-H 'UJB-Authorization: Basic dXNlcm5hbWU6cGFzc3dvcmQ=' \
-H 'ApiKey: <API Key>' \
-X GET \
https://api.ujeebu.com/extract?url=https://ujeebu.com/blog/how-to-extract-clean-text-from-html

The code above will return the following response:

{
  "article": {

    "author": "Sam",
    "pub_date": "2019-08-09 12:42:25",
    "is_article": 1,
    "url": "https://ujeebu.com/blog/how-to-extract-clean-text-from-html",
    "canonical_url": "https://ujeebu.com/blog/how-to-extract-clean-text-from-html/",
    "title": "Extracting clean data from blog and news articles",
    "site_name": "Ujeebu blog",
    "favicon": "https://ujeebu.com/blog/favicon.png",
    "encoding": "utf-8",
    "pages": ["https://ujeebu.com/blog/how-to-extract-clean-text-from-html/"]
  },
  "time": 6.366053104400635,
  "js": false,
  "pagination": false
}

Using Proxies

Using Your Own Proxy

Auto Proxy

When dealing with difficult websites that block requests or have aggressive anti-bot measures, use the auto_proxy parameter to automatically try different proxies until one succeeds.

TIP — How Auto Proxy Works

When auto_proxy=true, Ujeebu automatically cycles through available proxy types in sequence. If one proxy fails (network error, timeout, or status ≥ 400 other than 404), it immediately retries with the next proxy. This continues until a successful response is received or all proxies have been tried.

Retry Flow:

  1. Select proxy → Make request → Check result
  2. On failure (error, timeout, status ≥ 400 except 404) → Try next proxy
  3. On success (status 200-399) → Return response
  4. Final fallback → Direct connection (no proxy)
curl -X GET 'https://api.ujeebu.com/extract?url=https://difficult-site.com/article&auto_proxy=true&js=true' \
  -H "ApiKey: YOUR_API_KEY"
const response = await fetch('https://api.ujeebu.com/extract?url=https://difficult-site.com/article&auto_proxy=true&js=true', {
  headers: {
    'ApiKey': 'YOUR_API_KEY'
  }
});

const data = await response.json();
console.log(data.article.title, data.article.text);
const response = await client.extract('https://difficult-site.com/article', {
  auto_proxy: true,
  js: true  // Combine with JS rendering if needed
});

console.log(response.data.article.title, response.data.article.text);
import requests

response = requests.get(
    'https://api.ujeebu.com/extract',
    params={
        'url': 'https://difficult-site.com/article',
        'auto_proxy': 'true',
        'js': 'true'
    },
    headers={'ApiKey': 'YOUR_API_KEY'}
)

data = response.json()
print(data['article']['title'], data['article']['text'])
response = ujeebu.extract(
    'https://difficult-site.com/article',
    params={
        'auto_proxy': True,
        'js': True  # Combine with JS rendering if needed
    }
)

data = response.json()
print(data['article']['title'], data['article']['text'])
OkHttpClient client = new OkHttpClient();

Request request = new Request.Builder()
  .url("https://api.ujeebu.com/extract?url=https://difficult-site.com/article&auto_proxy=true&js=true")
  .addHeader("ApiKey", "YOUR_API_KEY")
  .build();

Response response = client.newCall(request).execute();
System.out.println(response.body().string());
<?php

$curl = curl_init();

curl_setopt_array($curl, [
  CURLOPT_URL => 'https://api.ujeebu.com/extract?url=https://difficult-site.com/article&auto_proxy=true&js=true',
  CURLOPT_RETURNTRANSFER => true,
  CURLOPT_HTTPHEADER => [
    'ApiKey: YOUR_API_KEY'
  ],
]);

$response = curl_exec($curl);
curl_close($curl);

echo $response;
package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
)

func main() {
	client := &http.Client{}
	req, _ := http.NewRequest("GET", "https://api.ujeebu.com/extract?url=https://difficult-site.com/article&auto_proxy=true&js=true", nil)
	req.Header.Add("ApiKey", "YOUR_API_KEY")
	
	res, _ := client.Do(req)
	defer res.Body.Close()
	body, _ := ioutil.ReadAll(res.Body)
	fmt.Println(string(body))
}
article, _, err := client.Extract(ujeebu.ExtractParams{
	URL:       "https://difficult-site.com/article",
	AutoProxy: true,
	JS:        true,  // Combine with JS rendering if needed
})
if err != nil {
	panic(err)
}

fmt.Println(article.Title, article.Text)

When to use Auto Proxy:

  • ✅ Extracting articles from sites known to block requests
  • ✅ Handling geo-restricted content
  • ✅ Improving success rates for critical extraction tasks
  • ✅ Scraping news sites with aggressive anti-bot measures

Important notes:

  • You are billed only for the proxy option that succeeds
  • Each failed attempt still contributes to processing time
  • A 404 response does NOT trigger a retry (it's a valid "not found")

For detailed documentation, see the Auto Proxy Guide.

CAPTCHA Solving

The Extract API can automatically detect and solve CAPTCHAs on web pages using external solving services. This is an opt-in feature that supports:

  • reCAPTCHA v2/v3 (including invisible and enterprise versions)
  • hCaptcha
  • Cloudflare Turnstile
  • FunCaptcha (Arkose Labs)
  • GeeTest
  • Image CAPTCHAs

INFO — How it works

When auto_captcha_solve is enabled, the API automatically detects CAPTCHAs on the page, sends them to a solving service, and injects the solution token back into the page. This happens transparently before extracting the article content.

Code Example

curl -X GET 'https://api.ujeebu.com/extract?url=https://example.com/protected-article&js=true&auto_captcha_solve=true&auto_captcha_solve_timeout=120000' \
  -H "ApiKey: YOUR_API_KEY"
const response = await fetch('https://api.ujeebu.com/extract?url=https://example.com/protected-article&js=true&auto_captcha_solve=true&auto_captcha_solve_timeout=120000', {
  headers: {
    'ApiKey': 'YOUR_API_KEY'
  }
});

const data = await response.json();
console.log(data.article);
const response = await client.extract('https://example.com/protected-article', {
  js: true,
  auto_captcha_solve: true,
  auto_captcha_solve_timeout: 120000 // 2 minutes
});

console.log(response.data.article);
import requests

response = requests.get(
    'https://api.ujeebu.com/extract',
    params={
        'url': 'https://example.com/protected-article',
        'js': 'true',
        'auto_captcha_solve': 'true',
        'auto_captcha_solve_timeout': 120000
    },
    headers={'ApiKey': 'YOUR_API_KEY'}
)

print(response.json()['article'])
response = ujeebu.extract(
    'https://example.com/protected-article',
    params={
        'js': True,
        'auto_captcha_solve': True,
        'auto_captcha_solve_timeout': 120000  # 2 minutes
    }
)

data = response.json()
print(data['article'])
package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
)

func main() {
	client := &http.Client{}
	req, _ := http.NewRequest("GET", "https://api.ujeebu.com/extract?url=https://example.com/protected-article&js=true&auto_captcha_solve=true&auto_captcha_solve_timeout=120000", nil)
	req.Header.Add("ApiKey", "YOUR_API_KEY")
	
	res, _ := client.Do(req)
	defer res.Body.Close()
	body, _ := ioutil.ReadAll(res.Body)
	fmt.Println(string(body))
}
article, _, err := client.Extract(ujeebu.ExtractParams{
	URL:                    "https://example.com/protected-article",
	JS:                     true,
	AutoCaptchaSolve:       true,
	AutoCaptchaSolveTimeout: 120000, // 2 minutes
})
if err != nil {
	panic(err)
}

fmt.Println(article)

WARNING — Additional credits

CAPTCHA solving uses external services and incurs +5 credits on top of the base request cost when a CAPTCHA is detected and solved.

Rate Limits & Credits

Usage Tracking

To track credit usage programmatically, call the /account endpoint. See Account API for the full reference, response shape, and rate limit (10 calls/minute).

Ready to build?

Spin up an API key in 60 seconds

Free tier: 5,000 credits, no card, full access to every endpoint on this page.