Overview

The Extract API is specifically designed to extract the main content from article pages, blog posts, and news stories. Unlike the Scrape API, which requires you to specify selectors, the Extract API uses advanced algorithms to automatically identify and extract relevant content.

What You'll Get

Clean article title

Main article text

Featured image

Author name

Publication date

Site name & language

Why Use Extract API?

No need to write CSS selectors or understand page structure. The Extract API works across different websites with different layouts automatically.

2

Extract API vs Scrape API

Understanding when to use each API is important:

Feature	Extract API	Scrape API
Best For	Articles, blogs, news	Any structured data
Setup Required	None - automatic	CSS selectors needed
Accuracy	High for articles	Depends on selectors
Flexibility	Limited to content extraction	Fully customizable
Speed	Fast (optimized)	Depends on complexity
Maintenance	None	Update selectors as sites change

3

Make the API Request

# Call the Extract API for one article; --get moves the URL-encoded
# `url` value into the query string of the GET request.
curl --get "https://api.ujeebu.com/extract" \
  --request GET \
  --header "ApiKey: YOUR_API_KEY" \
  --data-urlencode "url=https://example.com/blog/article-title"

import requests

# Request the extraction: the API key goes in the ApiKey header and the
# article to process in the `url` query parameter.
response = requests.get(
    "https://api.ujeebu.com/extract",
    headers={"ApiKey": "YOUR_API_KEY"},
    params={"url": "https://example.com/blog/article-title"},
)

# Raise on HTTP 4xx/5xx so a failed request is reported as such instead of
# surfacing further down as a confusing JSON or KeyError.
response.raise_for_status()

data = response.json()
article = data["article"]

# Title and text are read directly; the optional fields fall back to a
# placeholder when the response omits them.
print(f"Title: {article['title']}")
print(f"Author: {article.get('author', 'Unknown')}")
print(f"Date: {article.get('pub_date', 'Unknown')}")
print(f"Language: {article.get('language', 'N/A')}")
print("\nContent preview:")
print(article['text'][:500] + "...")

const axios = require('axios');

/**
 * Fetches one article through the Extract API and prints its key fields.
 *
 * The request lives in an async function because this snippet loads axios
 * with require() (CommonJS), where a top-level `await` is a syntax error.
 *
 * @returns {Promise<void>} Resolves once the fields have been printed.
 */
async function main() {
  const response = await axios.get('https://api.ujeebu.com/extract', {
    headers: { 'ApiKey': 'YOUR_API_KEY' },
    params: {
      url: 'https://example.com/blog/article-title'
    }
  });

  const { article } = response.data;

  // `||` is intentional: empty strings also fall back to the placeholder.
  console.log(`Title: ${article.title}`);
  console.log(`Author: ${article.author || 'Unknown'}`);
  console.log(`Date: ${article.pub_date || 'Unknown'}`);
  console.log(`Language: ${article.language || 'N/A'}`);
  console.log(`\nContent preview:`);
  console.log(article.text.substring(0, 500) + '...');
}

// Report a failed request explicitly instead of leaving the promise unhandled.
main().catch((err) => {
  console.error(`Extraction failed: ${err.message}`);
  process.exitCode = 1;
});

from ujeebu_python import UjeebuClient

# Create the SDK client with the API key.
client = UjeebuClient(api_key="YOUR_API_KEY")

# Run the extraction for one article and print the decoded JSON payload.
extraction = client.extract(url="https://example.com/blog/article-title")
print(extraction.json())

const { UjeebuClient } = require('@ujeebu-org/ujeebu-sdk');

// Build the SDK client with the API key.
const client = new UjeebuClient("YOUR_API_KEY");

// Run the extraction inside an async IIFE (CommonJS has no top-level await)
// and report a failed call explicitly instead of leaving the promise unhandled.
(async () => {
  const res = await client.extract("https://example.com/blog/article-title");
  console.log(res.data);
})().catch((err) => {
  console.error(`Extraction failed: ${err.message}`);
  process.exitCode = 1;
});

<?php
// Build the request URL; http_build_query() percent-encodes the article URL
// so it does not have to be encoded by hand.
$endpoint = 'https://api.ujeebu.com/extract?' . http_build_query([
    'url' => 'https://example.com/blog/article-title',
]);

$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $endpoint);
// Make curl_exec() return the body instead of printing it directly.
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_HTTPHEADER, [
    'ApiKey: ' . 'YOUR_API_KEY',
]);
$response = curl_exec($ch);

// curl_exec() returns false on a transport failure; report the reason
// rather than echoing an empty string.
if ($response === false) {
    echo 'Request failed: ' . curl_error($ch) . "\n";
} else {
    echo $response;
}
curl_close($ch);

package main

import (
    "fmt"
    "io"
    "net/http"
)

// main sends one GET request to the Extract API and prints the raw response
// body. Every failing step panics so no error is silently dropped.
func main() {
    // The article URL is passed percent-encoded in the `url` query parameter.
    url := "https://api.ujeebu.com/extract?url=https%3A%2F%2Fexample.com%2Fblog%2Farticle-title"

    req, err := http.NewRequest("GET", url, nil)
    if err != nil {
        panic(err)
    }
    req.Header.Set("ApiKey", "YOUR_API_KEY")

    res, err := http.DefaultClient.Do(req)
    if err != nil {
        panic(err)
    }
    defer res.Body.Close()

    body, err := io.ReadAll(res.Body)
    if err != nil {
        panic(err)
    }
    fmt.Println(string(body))
}

package main

import (
    "fmt"
    "log"

    "github.com/ujeebu/ujeebu-go"
)

// main extracts a single article with the Go SDK and prints the result.
// Both the client construction and the extraction abort via log.Fatal on error.
func main() {
    client, clientErr := ujeebu.NewClient("YOUR_API_KEY")
    if clientErr != nil {
        log.Fatal(clientErr)
    }

    params := ujeebu.ExtractParams{
        URL: "https://example.com/blog/article-title",
    }

    // The second return value is not needed here and is discarded.
    article, _, extractErr := client.Extract(params)
    if extractErr != nil {
        log.Fatal(extractErr)
    }
    fmt.Println(article)
}

4

Response Format

JSON - API Response

{
  "article": {
    "url": "https://example.com/blog/article-title",
    "canonical_url": "https://example.com/blog/article-title",
    "title": "How to Build a Successful Startup",
    "text": "Starting a business is challenging but rewarding...",
    "html": "<article>Starting a business is challenging...</article>",
    "author": "John Smith",
    "pub_date": "2024-01-15 12:00:00",
    "modified_date": "2024-01-16 10:30:00",
    "image": "https://example.com/images/startup.jpg",
    "images": [
      "https://example.com/images/startup.jpg",
      "https://example.com/images/team.jpg"
    ],
    "summary": "A comprehensive guide to building your first startup...",
    "site_name": "Example Blog",
    "language": "en",
    "is_article": 0.95,
    "favicon": "https://example.com/favicon.ico",
    "encoding": "utf-8"
  },
  "time": 1.25
}

Note

The text field contains plain text, while html contains the formatted HTML with proper headings, links, and paragraphs preserved.

5

API Options

Parameter	Description	Default
`url`	The article URL to extract	Required
`js`	Enable JavaScript rendering	false
`html`	Return cleaned HTML content	true
`text`	Extract clean text content	true
`images`	Extract image URLs from the article	true
`author`	Extract author name	true
`pub_date`	Extract publication date	true
`quick_mode`	Faster extraction (30-60% faster, less thorough)	false

Bulk Extraction Example

import requests
import time

def extract_articles(urls):
    """Extract content from multiple articles.

    Requests are sent one at a time with a one-second pause between them.
    A URL is skipped (not raised) when the request fails at the transport
    level, returns a non-200 status, or carries no ``article`` object, so
    one bad URL does not discard the results already collected.

    Args:
        urls: Iterable of article URLs to extract.

    Returns:
        A list of dicts with the keys ``url``, ``title``, ``author``,
        ``pub_date``, ``text`` and ``language``; fields missing from the
        response are ``None``.
    """
    articles = []

    for index, url in enumerate(urls):
        if index:
            time.sleep(1)  # Rate limiting: pause only between requests

        try:
            response = requests.get(
                "https://api.ujeebu.com/extract",
                headers={"ApiKey": "YOUR_API_KEY"},
                params={"url": url})
        except requests.RequestException as exc:
            # Connection errors and timeouts should not abort the batch.
            print(f"Skipping {url}: {exc}")
            continue

        if response.status_code != 200:
            continue

        article = response.json().get("article")
        if not article:
            continue

        articles.append({
            'url': url,
            'title': article.get('title'),
            'author': article.get('author'),
            'pub_date': article.get('pub_date'),
            'text': article.get('text'),
            'language': article.get('language')
        })

    return articles

# Extract from multiple URLs
urls = [
    "https://blog.example.com/article-1",
    "https://blog.example.com/article-2",
    "https://news.example.com/story"
]

articles = extract_articles(urls)
print(f"Extracted {len(articles)} articles")

const axios = require('axios');

/**
 * Extracts several articles one after another through the Extract API.
 *
 * A URL is skipped when the request fails at the transport level, returns a
 * non-200 status, or carries no `article` object, so one bad URL does not
 * abort the batch. A one-second pause follows every request.
 *
 * @param {Iterable<string>} urls - Article URLs to extract.
 * @returns {Promise<Array<object>>} Records with the keys url, title, author,
 *   pub_date, text and language.
 */
async function extractArticles(urls) {
  const articles = [];

  for (const url of urls) {
    try {
      const response = await axios.get('https://api.ujeebu.com/extract', {
        headers: { 'ApiKey': 'YOUR_API_KEY' },
        params: { url },
        // Resolve on every HTTP status so failures are handled below.
        validateStatus: () => true
      });

      const article =
        response.status === 200 && response.data ? response.data.article : null;

      if (article) {
        articles.push({
          url,
          title: article.title,
          author: article.author,
          pub_date: article.pub_date,
          text: article.text,
          language: article.language
        });
      }
    } catch (err) {
      // validateStatus only covers HTTP statuses; connection-level failures
      // still reject, so they are reported here and the batch continues.
      console.error(`Skipping ${url}: ${err.message}`);
    }

    await new Promise(resolve => setTimeout(resolve, 1000)); // Rate limiting
  }

  return articles;
}

// Extract from multiple URLs
const urls = [
  'https://blog.example.com/article-1',
  'https://blog.example.com/article-2',
  'https://news.example.com/story'
];

extractArticles(urls)
  .then(articles => {
    console.log(`Extracted ${articles.length} articles`);
  })
  .catch(err => {
    // Report an aborted batch instead of leaving the rejection unhandled.
    console.error(`Extraction failed: ${err.message}`);
    process.exitCode = 1;
  });

from ujeebu_python import UjeebuClient
import time

uj = UjeebuClient(api_key="YOUR_API_KEY")

def extract_articles(urls):
    """Run an SDK extraction for each URL and collect the article fields.

    Responses whose JSON payload has no ``article`` key are skipped. A
    one-second pause follows every request.

    Args:
        urls: Iterable of article URLs to extract.

    Returns:
        A list of dicts with the keys ``url``, ``title``, ``author``,
        ``pub_date``, ``text`` and ``language``.
    """
    fields = ('title', 'author', 'pub_date', 'text', 'language')
    collected = []

    for url in urls:
        payload = uj.extract(url=url).json()
        if "article" in payload:
            article = payload["article"]
            # Start with the URL so the key order matches the field list.
            record = {'url': url}
            record.update((name, article.get(name)) for name in fields)
            collected.append(record)

        time.sleep(1)  # Rate limiting

    return collected

# Extract from multiple URLs
urls = [
    "https://blog.example.com/article-1",
    "https://blog.example.com/article-2",
    "https://news.example.com/story"
]

articles = extract_articles(urls)
print(f"Extracted {len(articles)} articles")

const { UjeebuClient } = require('@ujeebu-org/ujeebu-sdk');

const client = new UjeebuClient("YOUR_API_KEY");

/**
 * Runs an SDK extraction for each URL in turn and collects the article fields.
 *
 * Responses without an `article` object are skipped. A one-second pause
 * follows every request.
 *
 * @param {Iterable<string>} urls - Article URLs to extract.
 * @returns {Promise<Array<object>>} Records with the keys url, title, author,
 *   pub_date, text and language.
 */
async function extractArticles(urls) {
  const pause = (ms) => new Promise((done) => setTimeout(done, ms));
  const collected = [];

  for (const url of urls) {
    const { data } = await client.extract(url);

    if (data.article) {
      const { title, author, pub_date, text, language } = data.article;
      collected.push({ url, title, author, pub_date, text, language });
    }

    await pause(1000); // Rate limiting
  }

  return collected;
}

// Extract from multiple URLs
const urls = [
  'https://blog.example.com/article-1',
  'https://blog.example.com/article-2',
  'https://news.example.com/story'
];

extractArticles(urls)
  .then(articles => {
    console.log(`Extracted ${articles.length} articles`);
  })
  .catch(err => {
    // Report an aborted batch instead of leaving the rejection unhandled.
    console.error(`Extraction failed: ${err.message}`);
    process.exitCode = 1;
  });

<?php
/**
 * Extracts several articles one after another through the Extract API.
 *
 * Only responses with HTTP status 200 are recorded. A one-second pause
 * follows every request.
 *
 * @param array $urls Article URLs to extract.
 * @return array List of records with the keys url, title, author, pub_date,
 *               text and language (null when a field is missing).
 */
function extract_articles($urls) {
    $fields = ['title', 'author', 'pub_date', 'text', 'language'];
    $articles = [];

    foreach ($urls as $url) {
        $endpoint = 'https://api.ujeebu.com/extract?' . http_build_query(['url' => $url]);

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $endpoint);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_HTTPHEADER, ['ApiKey: YOUR_API_KEY']);
        $response = curl_exec($ch);
        $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        curl_close($ch);

        if ($httpCode === 200) {
            $data = json_decode($response, true);
            $article = $data['article'];

            // Start with the URL, then copy each field or null when absent.
            $record = ['url' => $url];
            foreach ($fields as $field) {
                $record[$field] = $article[$field] ?? null;
            }
            $articles[] = $record;
        }

        sleep(1); // Rate limiting
    }

    return $articles;
}

// Extract from multiple URLs
$urls = [
    'https://blog.example.com/article-1',
    'https://blog.example.com/article-2',
    'https://news.example.com/story',
];

$articles = extract_articles($urls);
echo "Extracted " . count($articles) . " articles\n";

package main

import (
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"time"
)

// Article holds the fields this example keeps from the "article" object of
// an Extract API response. The JSON tags name the response keys each field
// is decoded from; keys without a matching field are not kept.
type Article struct {
	Title    string `json:"title"`
	Author   string `json:"author"`
	PubDate  string `json:"pub_date"`
	Text     string `json:"text"`
	Language string `json:"language"`
}

// extractArticles fetches each URL through the Extract API, one request at a
// time with a one-second pause between requests, and returns the articles
// that were extracted successfully. A URL that fails is reported on standard
// output and skipped, so it never adds an empty Article to the result.
func extractArticles(urls []string) []Article {
	var articles []Article

	for i, u := range urls {
		if i > 0 {
			// Rate limiting: pause before every request after the first,
			// including the one that follows a failed request.
			time.Sleep(1 * time.Second)
		}

		article, err := fetchArticle(u)
		if err != nil {
			fmt.Printf("skipping %s: %v\n", u, err)
			continue
		}
		articles = append(articles, article)
	}

	return articles
}

// fetchArticle performs one Extract API request for u and decodes the
// "article" object of the response. It returns an error when the request
// cannot be built or sent, the body cannot be read, the status is not 200,
// or the body is not valid JSON.
func fetchArticle(u string) (Article, error) {
	endpoint := "https://api.ujeebu.com/extract?" + url.Values{"url": {u}}.Encode()

	req, err := http.NewRequest("GET", endpoint, nil)
	if err != nil {
		return Article{}, err
	}
	req.Header.Set("ApiKey", "YOUR_API_KEY")

	res, err := http.DefaultClient.Do(req)
	if err != nil {
		return Article{}, err
	}
	defer res.Body.Close()

	body, err := io.ReadAll(res.Body)
	if err != nil {
		return Article{}, err
	}

	if res.StatusCode != http.StatusOK {
		return Article{}, fmt.Errorf("unexpected status %d", res.StatusCode)
	}

	var data struct {
		Article Article `json:"article"`
	}
	if err := json.Unmarshal(body, &data); err != nil {
		return Article{}, err
	}
	return data.Article, nil
}

// main runs the bulk extraction over a fixed list of URLs and prints how
// many articles were returned.
func main() {
	// Extract from multiple URLs
	targets := []string{
		"https://blog.example.com/article-1",
		"https://blog.example.com/article-2",
		"https://news.example.com/story",
	}

	extracted := len(extractArticles(targets))
	fmt.Printf("Extracted %d articles\n", extracted)
}

package main

import (
	"fmt"
	"log"
	"time"

	"github.com/ujeebu/ujeebu-go"
)

// main extracts a fixed list of URLs with the Go SDK, one request at a time
// with a one-second pause between requests, and prints how many succeeded.
// A failed extraction is logged and skipped rather than silently dropped.
func main() {
	client, err := ujeebu.NewClient("YOUR_API_KEY")
	if err != nil {
		log.Fatal(err)
	}

	urls := []string{
		"https://blog.example.com/article-1",
		"https://blog.example.com/article-2",
		"https://news.example.com/story",
	}

	var articles []*ujeebu.Article
	for i, u := range urls {
		if i > 0 {
			// Rate limiting: pause before every request after the first,
			// including the one that follows a failed request.
			time.Sleep(1 * time.Second)
		}

		// The second return value is not needed here and is discarded.
		article, _, err := client.Extract(ujeebu.ExtractParams{URL: u})
		if err != nil {
			log.Printf("skipping %s: %v", u, err)
			continue
		}
		articles = append(articles, article)
	}

	fmt.Printf("Extracted %d articles\n", len(articles))
}

6

Best Practices

01

Use for Articles Only

The Extract API is optimized for articles. For product pages or other structured data, use the Scrape API instead.

Important

02

Enable JS When Needed

Some modern sites render their content with JavaScript. Set js=true if the content isn't extracted properly.

Recommended

03

Cache Results

Article content rarely changes. Cache extracted content to reduce API calls and improve performance.

Performance

04

Validate Output

Check that title and text are extracted. Some pages may block extraction or have unusual structures.

Best Practice

Clean Article Extraction