Extract Rules
Extract structured data from any web page using CSS selectors. Define extraction rules to scrape specific elements and get clean JSON output.
Overview
Extract rules allow you to scrape structured data from any web page using CSS selectors. Add the extract_rules parameter to your API call with a JSON object defining what data to extract.
GET https://api.ujeebu.com/scrape?extract_rules={...}
TIP — Powerful Data Extraction
Extract rules are perfect for extracting product information, article content, lists, tables, or any repeating data patterns from web pages.
Basic Format
The simplest way to use extract rules:
{
"key_name": {
"selector": "css_selector",
"type": "rule_type"
}
}
curl -X GET 'https://api.ujeebu.com/scrape?url=https://example.com&extract_rules={"title":{"selector":"h1","type":"text"}}' \
-H "ApiKey: YOUR_API_KEY"
const extractRules = {
title: { selector: 'h1', type: 'text' },
description: { selector: 'meta[name=description]', type: 'attr', attribute: 'content' }
};
const response = await fetch(
`https://api.ujeebu.com/scrape?url=https://example.com&extract_rules=${encodeURIComponent(JSON.stringify(extractRules))}`,
{ headers: { 'ApiKey': 'YOUR_API_KEY' } }
);
const data = await response.json();
console.log(data.result);
import { UjeebuClient } from '@ujeebu-org/ujeebu-sdk';
const client = new UjeebuClient(process.env.UJEEBU_API_KEY);
const extractRules = {
title: { selector: 'h1', type: 'text' },
description: { selector: 'meta[name=description]', type: 'attr', attribute: 'content' }
};
const result = await client.scrapeWithRules(
'https://example.com',
extractRules
);
console.log(result);
import requests
import json
extract_rules = {
'title': {'selector': 'h1', 'type': 'text'},
'description': {'selector': 'meta[name=description]', 'type': 'attr', 'attribute': 'content'}
}
response = requests.get(
'https://api.ujeebu.com/scrape',
params={
'url': 'https://example.com',
'extract_rules': json.dumps(extract_rules)
},
headers={'ApiKey': 'YOUR_API_KEY'}
)
print(response.json()['result'])
from ujeebu_python import UjeebuClient
ujeebu = UjeebuClient(api_key="YOUR_API_KEY")
extract_rules = {
'title': {'selector': 'h1', 'type': 'text'},
'description': {'selector': 'meta[name=description]', 'type': 'attr', 'attribute': 'content'}
}
result = ujeebu.scrape_with_rules(
url='https://example.com',
extract_rules=extract_rules
)
print(result)
import okhttp3.*;
import org.json.*;
OkHttpClient client = new OkHttpClient();
JSONObject extractRules = new JSONObject()
.put("title", new JSONObject().put("selector", "h1").put("type", "text"))
.put("description", new JSONObject()
.put("selector", "meta[name=description]")
.put("type", "attr")
.put("attribute", "content"));
String url = "https://api.ujeebu.com/scrape?url=" +
URLEncoder.encode("https://example.com", "UTF-8") +
"&extract_rules=" + URLEncoder.encode(extractRules.toString(), "UTF-8");
Request request = new Request.Builder()
.url(url)
.addHeader("ApiKey", "YOUR_API_KEY")
.build();
Response response = client.newCall(request).execute();
System.out.println(response.body().string());
<?php
$extractRules = [
'title' => ['selector' => 'h1', 'type' => 'text'],
'description' => [
'selector' => 'meta[name=description]',
'type' => 'attr',
'attribute' => 'content'
]
];
$url = 'https://api.ujeebu.com/scrape?' . http_build_query([
'url' => 'https://example.com',
'extract_rules' => json_encode($extractRules)
]);
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_HTTPHEADER, [
'ApiKey: YOUR_API_KEY'
]);
$response = curl_exec($ch);
curl_close($ch);
$data = json_decode($response, true);
print_r($data['result']);
package main
import (
"encoding/json"
"fmt"
"io"
"net/http"
"net/url"
)
func main() {
extractRules := map[string]interface{}{
"title": map[string]string{"selector": "h1", "type": "text"},
"description": map[string]string{
"selector": "meta[name=description]",
"type": "attr",
"attribute": "content",
},
}
rulesJSON, _ := json.Marshal(extractRules)
apiURL := fmt.Sprintf("https://api.ujeebu.com/scrape?url=%s&extract_rules=%s",
url.QueryEscape("https://example.com"),
url.QueryEscape(string(rulesJSON)))
req, _ := http.NewRequest("GET", apiURL, nil)
req.Header.Set("ApiKey", "YOUR_API_KEY")
client := &http.Client{}
resp, _ := client.Do(req)
defer resp.Body.Close()
body, _ := io.ReadAll(resp.Body)
fmt.Println(string(body))
}
package main
import (
"fmt"
"github.com/ujeebu/ujeebu-go"
)
func main() {
client, err := ujeebu.NewClient("YOUR-API-KEY")
if err != nil {
panic(err)
}
extractRules := map[string]interface{}{
"title": map[string]string{"selector": "h1", "type": "text"},
"description": map[string]string{
"selector": "meta[name=description]",
"type": "attr",
"attribute": "content",
},
}
response, credits, err := client.Scrape(ujeebu.ScrapeParams{
URL: "https://example.com",
ExtractRules: extractRules,
})
if err != nil {
panic(err)
}
fmt.Printf("Credits used: %d\n", credits)
fmt.Println(response)
}
Rule Types
There are 9 types of extraction rules:
| Parameter | Type | Required | Default | Description |
|---|---|---|---|---|
| `text` | rule | No | | Returns the text content of the matched element. |
| `link` | rule | No | | Returns the href attribute if the element is an `<a>` tag. |
| `image` | rule | No | | Returns the src attribute if the element is an `<img>` tag. |
| `attr` | rule | No | | Returns a specified attribute value. Requires 'attribute' property. |
| `obj` | rule | No | | Returns an object with nested rules defined in 'children'. |
| `fn` | rule | No | | Executes custom JavaScript code to extract data. Access window object, DOM API, and parent elements. |
| `table` | rule | No | | Extracts HTML tables as arrays of objects with automatic header detection. |
| `tableTranspose` | rule | No | | Extracts transposed tables (key-value format) where headers are in the first column. |
| `regex` | rule | No | | Extracts data using regular expressions directly from page content. |
Text Rule
Extract the text content of an element:
{
"product_name": {
"selector": ".product-title",
"type": "text"
}
}
Link Rule
Extract the href from anchor tags:
{
"article_url": {
"selector": "a.article-link",
"type": "link"
}
}
Image Rule
Extract the src from image tags:
{
"product_image": {
"selector": "img.product-photo",
"type": "image"
}
}
Attribute Rule
Extract any attribute value:
{
"meta_description": {
"selector": "meta[name=description]",
"type": "attr",
"attribute": "content"
}
}
Object Rule
Extract nested data structures:
{
"article_card": {
"selector": "article.card-item",
"type": "obj",
"children": {
"title": { "selector": "h1", "type": "text" },
"link": { "selector": "a", "type": "link" }
}
}
}
Function Rule
Execute custom JavaScript code to extract data. This is the most powerful extraction type, allowing you to:
- Access global JavaScript variables (window object)
- Execute custom computations
- Access browser storage (localStorage, sessionStorage)
- Parse JSON from script tags
- Use the `$parent` variable to access parent elements in nested extractions
- Perform async operations with `await`
Basic Syntax:
{
"config": {
"type": "fn",
"fn": "return window._app_config_;"
}
}
Access Window Variables:
{
"appState": {
"type": "fn",
"fn": "return window.__INITIAL_STATE__;"
}
}
Using $parent in Nested Objects:
When used within nested object extraction, the special $parent variable references the parent element:
{
"products": {
"selector": ".product",
"type": "obj",
"multiple": true,
"children": {
"name": {
"selector": ".name",
"type": "text"
},
"productId": {
"type": "fn",
"fn": "return $parent.getAttribute('data-id');"
},
"category": {
"type": "fn",
"fn": "return $parent.dataset.category;"
},
"itemCount": {
"type": "fn",
"fn": "return $parent.querySelectorAll('.item').length;"
}
}
}
}
Parse JSON from Script Tags:
{
"productData": {
"type": "fn",
"fn": "const script = document.querySelector('script[type=\"application/ld+json\"]'); return script ? JSON.parse(script.textContent) : null;"
}
}
Compute Values:
{
"totalPrice": {
"type": "fn",
"fn": "const prices = Array.from(document.querySelectorAll('.price')).map(el => parseFloat(el.textContent.replace('$', ''))); return prices.reduce((sum, p) => sum + p, 0);"
}
}
Access Browser Storage:
{
"userData": {
"type": "fn",
"fn": "return JSON.parse(localStorage.getItem('user_data') || '{}');"
}
}
Conditional Logic:
{
"availability": {
"type": "fn",
"fn": "return document.querySelector('.in-stock') ? 'available' : 'out of stock';"
}
}
INFO — Best Practices
- Use optional chaining (`?.`) to safely access nested properties
- Provide default values with nullish coalescing (`??`)
- Keep functions simple and focused
- Use `$parent` for scoped queries in nested objects
- Reserve the `fn` type for complex scenarios; use CSS selectors for simple cases
WARNING — Error Handling
If the function throws an error, the rule returns `null`. Always use safe access patterns with the optional chaining (`?.`) and nullish coalescing (`??`) operators.
Safe Access Example:
{
"safeValue": {
"type": "fn",
"fn": "return window.config?.data?.value ?? 'default';"
}
}
Table Extraction
Extract HTML tables automatically with header detection. Perfect for extracting tabular data like product specifications, pricing tables, or feature comparisons.
Basic Table Example:
{
"specifications": {
"selector": "table.product-specs",
"type": "table"
}
}
Given this HTML:
<table class="product-specs">
<thead>
<tr><th>Feature</th><th>Value</th></tr>
</thead>
<tbody>
<tr><td>Weight</td><td>2.5 kg</td></tr>
<tr><td>Dimensions</td><td>30x20x10 cm</td></tr>
<tr><td>Color</td><td>Black</td></tr>
</tbody>
</table>
Result:
{
"specifications": [
{"feature": "Weight", "value": "2.5 kg"},
{"feature": "Dimensions", "value": "30x20x10 cm"},
{"feature": "Color", "value": "Black"}
]
}
Advanced Table Options:
{
"pricing": {
"selector": "table.pricing",
"type": "table",
"headers": ["plan", "price", "features"],
"rowSelector": "tbody tr",
"normalizeHeaders": true,
"asArray": false
}
}
Table Properties:
- `headers` (array): Custom header names to use instead of auto-detection
- `headerSelector` (string): Custom CSS selector for header cells
- `rowSelector` (string): Custom CSS selector for data rows (default: `"tbody tr, tr:not(:first-child)"`)
- `normalizeHeaders` (boolean): Convert headers to snake_case (default: `true`)
- `asArray` (boolean): Return rows as arrays instead of objects (default: `false`)
Transposed Table
For tables where headers are in the first column (key-value format). Common in product detail pages and specification sheets.
Example:
{
"product_details": {
"selector": "table.specs",
"type": "tableTranspose"
}
}
Given this HTML:
<table class="specs">
<tr><th>Brand</th><td>Apple</td></tr>
<tr><th>Model</th><td>iPhone 15</td></tr>
<tr><th>Storage</th><td>256GB</td></tr>
<tr><th>Color</th><td>Blue</td></tr>
</table>
Result:
{
"product_details": {
"brand": "Apple",
"model": "iPhone 15",
"storage": "256GB",
"color": "Blue"
}
}
Regex Extraction
Extract data using regular expressions directly from page content. Useful for extracting structured data embedded in text, such as SKUs, phone numbers, emails, or prices.
Basic Regex Example:
{
"sku": {
"type": "regex",
"selector": ".product-info",
"pattern": "SKU:\\s*([A-Z0-9-]+)"
}
}
Given this HTML:
<div class="product-info">
Product details here. SKU: ABC-12345. More info...
</div>
Result:
{
"sku": "ABC-12345"
}
Extract from Entire Page:
{
"phone_numbers": {
"type": "regex",
"pattern": "\\d{3}-\\d{3}-\\d{4}",
"flags": "g"
}
}
Multiple Capture Groups:
{
"date_parts": {
"type": "regex",
"selector": ".publish-date",
"pattern": "(\\d{4})-(\\d{2})-(\\d{2})",
"allGroups": true
}
}
Result: {"date_parts": ["2024", "12", "31"]}
Regex Properties:
- `pattern` (string, required): Regular expression pattern
- `selector` (string, optional): CSS selector to scope the search (defaults to entire page)
- `source` (string): `"text"` (default) or `"html"` - search in text content or HTML
- `flags` (string): Regex flags (default: `"i"` for case-insensitive)
- `group` (number): Specific capture group to return (0 = full match)
- `allGroups` (boolean): Return all capture groups as array
- `default` (any): Default value if no match found
Extract Price from Text:
{
"price": {
"type": "regex",
"selector": ".price-text",
"pattern": "\\$([0-9,.]+)",
"default": "0.00"
}
}
Data Transformation Pipeline
Apply post-processing transformations to extracted values using the transform property. Transformations are applied in order (chained), allowing you to clean, convert, and reshape data after extraction.
Basic Transformation:
{
"price": {
"selector": ".price",
"type": "text",
"transform": ["number"]
}
}
Input: "$49.99" → Output: 49.99
Chained Transformations:
{
"price": {
"selector": ".price",
"type": "text",
"transform": [
"trim",
["replace", "\\$", ""],
["replace", "USD", ""],
"trim",
"number"
]
}
}
Input: " $1,234.56 USD " → Output: 1234.56
Available Transformations:
| Transform | Arguments | Description | Example |
|---|---|---|---|
| **String Transforms** | | | |
| `trim` | - | Remove whitespace | " text " → "text" |
| `lowercase` | - | Convert to lowercase | "TEXT" → "text" |
| `uppercase` | - | Convert to uppercase | "text" → "TEXT" |
| `capitalize` | - | Capitalize first letter | "hello" → "Hello" |
| `replace` | pattern, replacement | Regex replace | ["replace", "\\$", ""] |
| `split` | delimiter | Split into array | ["split", ","] |
| `substring` | start, end | Get substring | ["substring", 0, 5] |
| `prefix` | text | Add prefix | ["prefix", "$"] |
| `suffix` | text | Add suffix | ["suffix", " USD"] |
| **Number Transforms** | | | |
| `number` | - | Parse as float | "$49.99" → 49.99 |
| `integer` | - | Parse as integer | "42.7" → 42 |
| `currency` | - | Parse currency | "$1,234.56" → 1234.56 |
| `round` | decimals | Round to decimals | ["round", 2] |
| `floor` | - | Round down | 4.9 → 4 |
| `ceil` | - | Round up | 4.1 → 5 |
| `abs` | - | Absolute value | -5 → 5 |
| **Array Transforms** | | | |
| `first` | - | Get first element | [1,2,3] → 1 |
| `last` | - | Get last element | [1,2,3] → 3 |
| `nth` | index | Get nth element | ["nth", 1] |
| `join` | delimiter | Join to string | ["join", ", "] |
| `unique` | - | Remove duplicates | [1,1,2] → [1,2] |
| `compact` | - | Remove null/empty | [1,null,"",2] → [1,2] |
| `flatten` | - | Flatten nested arrays | [[1,2],[3]] → [1,2,3] |
| `reverse` | - | Reverse array/string | [1,2,3] → [3,2,1] |
| `sort` | order | Sort (asc/desc) | ["sort", "desc"] |
| `length` | - | Get length | "text" → 4 |
| **Regex Transforms** | | | |
| `match` | pattern, group | Extract with regex | ["match", "\\d+", 0] |
| `matchAll` | pattern | All matches | ["matchAll", "\\d+"] |
| **Type Conversion** | | | |
| `boolean` | - | Parse as boolean | "true" → true |
| `string` | - | Convert to string | 42 → "42" |
| `json` | - | Parse JSON string | '{"a":1}' → {a:1} |
| `stringify` | - | Convert to JSON | {a:1} → '{"a":1}' |
| **Date Transforms** | | | |
| `date` | format | Parse date | ["date", "iso"] |
| **HTML/Text Cleaning** | | | |
| `stripHtml` | - | Remove HTML tags | "<b>text</b>" → "text" |
| `normalizeWhitespace` | - | Collapse spaces | "a   b" → "a b" |
| `removeNewlines` | - | Remove line breaks | Multi-line → single line |
| `decode` | - | Decode HTML entities | "&amp;" → "&" |
| **Default/Fallback** | | | |
| `default` | value | Fallback if null | ["default", "N/A"] |
| `nullIf` | value | Null if equals | ["nullIf", ""] |
| `emptyToNull` | - | Convert empty to null | "" → null |
Practical Examples:
1. Clean and Parse Price:
{
"price_numeric": {
"selector": ".price",
"type": "text",
"transform": ["currency"]
}
}
Input: "Price: $1,234.56" → Output: 1234.56
2. Split Tags into Array:
{
"tags": {
"selector": ".product-tags",
"type": "text",
"transform": [
"trim",
["split", ","],
"compact",
"unique"
]
}
}
Input: "electronics, gadgets, , electronics" → Output: ["electronics", "gadgets"]
3. Extract and Round Rating:
{
"rating": {
"selector": ".rating-value",
"type": "text",
"transform": [
"number",
["round", 1]
]
}
}
Input: "4.567 stars" → Output: 4.6
4. Get First Available Image:
{
"main_image": {
"selector": "img.product-image",
"type": "image",
"multiple": true,
"transform": ["first"]
}
}
5. Parse JSON from Data Attribute:
{
"config": {
"selector": "[data-config]",
"type": "attr",
"attribute": "data-config",
"transform": ["json"]
}
}
Input: '{"theme":"dark","lang":"en"}' → Output: {"theme":"dark","lang":"en"}
6. Boolean Conversion:
{
"in_stock": {
"selector": ".stock-status",
"type": "text",
"transform": ["boolean"]
}
}
Input: "true" or "yes" or "1" → Output: true
TIP — Transform Tips
- Transformations are applied sequentially from left to right
- Use array syntax for transforms with arguments: `["replace", "pattern", "replacement"]`
- Combine multiple transforms to build complex pipelines
- The `default` transform provides fallback values for missing data
- Use `compact` to clean arrays of null/empty values
Simple Extraction
Extract a single value from a page. This example extracts the user agent from whatsmyuseragent.org:
{
"user-agent": {
"selector": ".user-agent .intro-text",
"type": "text"
}
}
Response:
{
"success": true,
"result": {
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.0 Safari/537.36"
}
}
Multiple Items
Extract multiple matching elements by adding "multiple": true. This example extracts all quotes from a quotes page:
{
"quote": {
"selector": ".quote-card .description",
"type": "text",
"multiple": true
}
}
Response:
{
"success": true,
"result": {
"quote": [
"“The world as we have created it is a process of our thinking...”",
"“It is our choices, Harry, that show what we truly are...”",
"“There are only two ways to live your life...”"
]
}
}
Nested Items
Extract complex nested data using type: "obj" with children. This example extracts quotes with their authors and tags:
{
"quotes": {
"selector": ".quote-card",
"type": "obj",
"multiple": true,
"children": {
"text": {
"selector": ".description",
"type": "text"
},
"author": {
"selector": ".author",
"type": "text"
},
"tags": {
"selector": ".tags .tag",
"type": "text",
"multiple": true
}
}
}
}
Response:
{
"success": true,
"result": {
"quotes": [
{
"text": "“The world as we have created it...”",
"author": "Albert Einstein",
"tags": ["change", "deep-thoughts", "thinking", "world"]
},
{
"text": "“It is our choices, Harry...”",
"author": "J.K. Rowling",
"tags": ["abilities", "choices"]
}
]
}
}
Real-World Examples
E-commerce Product Scraping
Extract product data from an e-commerce page:
{
"products": {
"selector": ".product-card",
"type": "obj",
"multiple": true,
"children": {
"name": { "selector": ".product-name", "type": "text" },
"price": { "selector": ".price", "type": "text" },
"image": { "selector": "img", "type": "image" },
"url": { "selector": "a", "type": "link" },
"rating": { "selector": ".rating", "type": "attr", "attribute": "data-rating" }
}
}
}
curl -G 'https://api.ujeebu.com/scrape' \
--data-urlencode 'url=https://shop.example.com/products' \
--data-urlencode 'extract_rules={"products":{"selector":".product-card","type":"obj","multiple":true,"children":{"name":{"selector":".product-name","type":"text"},"price":{"selector":".price","type":"text"}}}}' \
-H "ApiKey: YOUR_API_KEY"
const rules = {
products: {
selector: '.product-card',
type: 'obj',
multiple: true,
children: {
name: { selector: '.product-name', type: 'text' },
price: { selector: '.price', type: 'text' },
image: { selector: 'img', type: 'image' },
url: { selector: 'a', type: 'link' },
rating: { selector: '.rating', type: 'attr', attribute: 'data-rating' }
}
}
};
const response = await fetch(
`https://api.ujeebu.com/scrape?url=${encodeURIComponent('https://shop.example.com/products')}&extract_rules=${encodeURIComponent(JSON.stringify(rules))}`,
{ headers: { 'ApiKey': 'YOUR_API_KEY' } }
);
const data = await response.json();
console.log(data.result.products);
import { UjeebuClient } from '@ujeebu-org/ujeebu-sdk';
const client = new UjeebuClient(process.env.UJEEBU_API_KEY);
const rules = {
products: {
selector: '.product-card',
type: 'obj',
multiple: true,
children: {
name: { selector: '.product-name', type: 'text' },
price: { selector: '.price', type: 'text' },
image: { selector: 'img', type: 'image' },
url: { selector: 'a', type: 'link' },
rating: { selector: '.rating', type: 'attr', attribute: 'data-rating' }
}
}
};
const result = await client.scrapeWithRules(
'https://shop.example.com/products',
rules
);
console.log(result.products);
import requests
import json
rules = {
'products': {
'selector': '.product-card',
'type': 'obj',
'multiple': True,
'children': {
'name': {'selector': '.product-name', 'type': 'text'},
'price': {'selector': '.price', 'type': 'text'},
'image': {'selector': 'img', 'type': 'image'},
'url': {'selector': 'a', 'type': 'link'},
'rating': {'selector': '.rating', 'type': 'attr', 'attribute': 'data-rating'}
}
}
}
response = requests.get(
'https://api.ujeebu.com/scrape',
params={
'url': 'https://shop.example.com/products',
'extract_rules': json.dumps(rules)
},
headers={'ApiKey': 'YOUR_API_KEY'}
)
print(response.json()['result']['products'])
from ujeebu_python import UjeebuClient
ujeebu = UjeebuClient(api_key="YOUR_API_KEY")
rules = {
'products': {
'selector': '.product-card',
'type': 'obj',
'multiple': True,
'children': {
'name': {'selector': '.product-name', 'type': 'text'},
'price': {'selector': '.price', 'type': 'text'},
'image': {'selector': 'img', 'type': 'image'},
'url': {'selector': 'a', 'type': 'link'},
'rating': {'selector': '.rating', 'type': 'attr', 'attribute': 'data-rating'}
}
}
}
result = ujeebu.scrape_with_rules(
url='https://shop.example.com/products',
extract_rules=rules
)
print(result['products'])
import okhttp3.*;
import org.json.*;
import java.net.URLEncoder;
OkHttpClient client = new OkHttpClient();
JSONObject rules = new JSONObject()
.put("products", new JSONObject()
.put("selector", ".product-card")
.put("type", "obj")
.put("multiple", true)
.put("children", new JSONObject()
.put("name", new JSONObject().put("selector", ".product-name").put("type", "text"))
.put("price", new JSONObject().put("selector", ".price").put("type", "text"))
.put("image", new JSONObject().put("selector", "img").put("type", "image"))
.put("url", new JSONObject().put("selector", "a").put("type", "link"))
.put("rating", new JSONObject()
.put("selector", ".rating")
.put("type", "attr")
.put("attribute", "data-rating"))));
String url = "https://api.ujeebu.com/scrape?url=" +
URLEncoder.encode("https://shop.example.com/products", "UTF-8") +
"&extract_rules=" + URLEncoder.encode(rules.toString(), "UTF-8");
Request request = new Request.Builder()
.url(url)
.addHeader("ApiKey", "YOUR_API_KEY")
.build();
Response response = client.newCall(request).execute();
System.out.println(response.body().string());
<?php
$rules = [
'products' => [
'selector' => '.product-card',
'type' => 'obj',
'multiple' => true,
'children' => [
'name' => ['selector' => '.product-name', 'type' => 'text'],
'price' => ['selector' => '.price', 'type' => 'text'],
'image' => ['selector' => 'img', 'type' => 'image'],
'url' => ['selector' => 'a', 'type' => 'link'],
'rating' => ['selector' => '.rating', 'type' => 'attr', 'attribute' => 'data-rating']
]
]
];
$url = 'https://api.ujeebu.com/scrape?' . http_build_query([
'url' => 'https://shop.example.com/products',
'extract_rules' => json_encode($rules)
]);
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_HTTPHEADER, ['ApiKey: YOUR_API_KEY']);
$response = curl_exec($ch);
curl_close($ch);
$data = json_decode($response, true);
print_r($data['result']['products']);
package main
import (
"encoding/json"
"fmt"
"io"
"net/http"
"net/url"
)
func main() {
rules := map[string]interface{}{
"products": map[string]interface{}{
"selector": ".product-card",
"type": "obj",
"multiple": true,
"children": map[string]interface{}{
"name": map[string]string{"selector": ".product-name", "type": "text"},
"price": map[string]string{"selector": ".price", "type": "text"},
"image": map[string]string{"selector": "img", "type": "image"},
"url": map[string]string{"selector": "a", "type": "link"},
"rating": map[string]string{"selector": ".rating", "type": "attr", "attribute": "data-rating"},
},
},
}
rulesJSON, _ := json.Marshal(rules)
apiURL := fmt.Sprintf("https://api.ujeebu.com/scrape?url=%s&extract_rules=%s",
url.QueryEscape("https://shop.example.com/products"),
url.QueryEscape(string(rulesJSON)))
req, _ := http.NewRequest("GET", apiURL, nil)
req.Header.Set("ApiKey", "YOUR_API_KEY")
client := &http.Client{}
resp, _ := client.Do(req)
defer resp.Body.Close()
body, _ := io.ReadAll(resp.Body)
fmt.Println(string(body))
}
package main
import (
"fmt"
"github.com/ujeebu/ujeebu-go"
)
func main() {
client, _ := ujeebu.NewClient("YOUR-API-KEY")
rules := map[string]interface{}{
"products": map[string]interface{}{
"selector": ".product-card",
"type": "obj",
"multiple": true,
"children": map[string]interface{}{
"name": map[string]string{"selector": ".product-name", "type": "text"},
"price": map[string]string{"selector": ".price", "type": "text"},
"image": map[string]string{"selector": "img", "type": "image"},
"url": map[string]string{"selector": "a", "type": "link"},
"rating": map[string]string{"selector": ".rating", "type": "attr", "attribute": "data-rating"},
},
},
}
response, credits, _ := client.Scrape(ujeebu.ScrapeParams{
URL: "https://shop.example.com/products",
ExtractRules: rules,
})
fmt.Printf("Credits used: %d\n", credits)
fmt.Println(response)
}
News Article Scraping
Extract article data from a news site:
{
"articles": {
"selector": "article",
"type": "obj",
"multiple": true,
"children": {
"headline": { "selector": "h2", "type": "text" },
"summary": { "selector": ".summary", "type": "text" },
"author": { "selector": ".author", "type": "text" },
"date": { "selector": "time", "type": "attr", "attribute": "datetime" },
"link": { "selector": "a", "type": "link" },
"thumbnail": { "selector": "img", "type": "image" }
}
}
}
curl -G 'https://api.ujeebu.com/scrape' \
--data-urlencode 'url=https://news.example.com' \
--data-urlencode 'extract_rules={"articles":{"selector":"article","type":"obj","multiple":true,"children":{"headline":{"selector":"h2","type":"text"},"summary":{"selector":".summary","type":"text"}}}}' \
-H "ApiKey: YOUR_API_KEY"
const rules = {
articles: {
selector: 'article',
type: 'obj',
multiple: true,
children: {
headline: { selector: 'h2', type: 'text' },
summary: { selector: '.summary', type: 'text' },
author: { selector: '.author', type: 'text' },
date: { selector: 'time', type: 'attr', attribute: 'datetime' },
link: { selector: 'a', type: 'link' },
thumbnail: { selector: 'img', type: 'image' }
}
}
};
const response = await fetch(
`https://api.ujeebu.com/scrape?url=${encodeURIComponent('https://news.example.com')}&extract_rules=${encodeURIComponent(JSON.stringify(rules))}`,
{ headers: { 'ApiKey': 'YOUR_API_KEY' } }
);
const data = await response.json();
console.log(data.result.articles);
import { UjeebuClient } from '@ujeebu-org/ujeebu-sdk';
const client = new UjeebuClient(process.env.UJEEBU_API_KEY);
const rules = {
articles: {
selector: 'article',
type: 'obj',
multiple: true,
children: {
headline: { selector: 'h2', type: 'text' },
summary: { selector: '.summary', type: 'text' },
author: { selector: '.author', type: 'text' },
date: { selector: 'time', type: 'attr', attribute: 'datetime' },
link: { selector: 'a', type: 'link' },
thumbnail: { selector: 'img', type: 'image' }
}
}
};
const result = await client.scrapeWithRules(
'https://news.example.com',
rules
);
console.log(result.articles);
import requests
import json
rules = {
'articles': {
'selector': 'article',
'type': 'obj',
'multiple': True,
'children': {
'headline': {'selector': 'h2', 'type': 'text'},
'summary': {'selector': '.summary', 'type': 'text'},
'author': {'selector': '.author', 'type': 'text'},
'date': {'selector': 'time', 'type': 'attr', 'attribute': 'datetime'},
'link': {'selector': 'a', 'type': 'link'},
'thumbnail': {'selector': 'img', 'type': 'image'}
}
}
}
response = requests.get(
'https://api.ujeebu.com/scrape',
params={
'url': 'https://news.example.com',
'extract_rules': json.dumps(rules)
},
headers={'ApiKey': 'YOUR_API_KEY'}
)
print(response.json()['result']['articles'])
from ujeebu_python import UjeebuClient
ujeebu = UjeebuClient(api_key="YOUR_API_KEY")
rules = {
'articles': {
'selector': 'article',
'type': 'obj',
'multiple': True,
'children': {
'headline': {'selector': 'h2', 'type': 'text'},
'summary': {'selector': '.summary', 'type': 'text'},
'author': {'selector': '.author', 'type': 'text'},
'date': {'selector': 'time', 'type': 'attr', 'attribute': 'datetime'},
'link': {'selector': 'a', 'type': 'link'},
'thumbnail': {'selector': 'img', 'type': 'image'}
}
}
}
result = ujeebu.scrape_with_rules(
url='https://news.example.com',
extract_rules=rules
)
print(result['articles'])
import okhttp3.*;
import org.json.*;
import java.net.URLEncoder;
OkHttpClient client = new OkHttpClient();
JSONObject rules = new JSONObject()
.put("articles", new JSONObject()
.put("selector", "article")
.put("type", "obj")
.put("multiple", true)
.put("children", new JSONObject()
.put("headline", new JSONObject().put("selector", "h2").put("type", "text"))
.put("summary", new JSONObject().put("selector", ".summary").put("type", "text"))
.put("author", new JSONObject().put("selector", ".author").put("type", "text"))
.put("date", new JSONObject().put("selector", "time").put("type", "attr").put("attribute", "datetime"))
.put("link", new JSONObject().put("selector", "a").put("type", "link"))
.put("thumbnail", new JSONObject().put("selector", "img").put("type", "image"))));
String url = "https://api.ujeebu.com/scrape?url=" +
URLEncoder.encode("https://news.example.com", "UTF-8") +
"&extract_rules=" + URLEncoder.encode(rules.toString(), "UTF-8");
Request request = new Request.Builder()
.url(url)
.addHeader("ApiKey", "YOUR_API_KEY")
.build();
Response response = client.newCall(request).execute();
System.out.println(response.body().string());
<?php
$rules = [
'articles' => [
'selector' => 'article',
'type' => 'obj',
'multiple' => true,
'children' => [
'headline' => ['selector' => 'h2', 'type' => 'text'],
'summary' => ['selector' => '.summary', 'type' => 'text'],
'author' => ['selector' => '.author', 'type' => 'text'],
'date' => ['selector' => 'time', 'type' => 'attr', 'attribute' => 'datetime'],
'link' => ['selector' => 'a', 'type' => 'link'],
'thumbnail' => ['selector' => 'img', 'type' => 'image']
]
]
];
$url = 'https://api.ujeebu.com/scrape?' . http_build_query([
'url' => 'https://news.example.com',
'extract_rules' => json_encode($rules)
]);
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_HTTPHEADER, ['ApiKey: YOUR_API_KEY']);
$response = curl_exec($ch);
curl_close($ch);
$data = json_decode($response, true);
print_r($data['result']['articles']);
package main
import (
"encoding/json"
"fmt"
"io"
"net/http"
"net/url"
)
func main() {
rules := map[string]interface{}{
"articles": map[string]interface{}{
"selector": "article",
"type": "obj",
"multiple": true,
"children": map[string]interface{}{
"headline": map[string]string{"selector": "h2", "type": "text"},
"summary": map[string]string{"selector": ".summary", "type": "text"},
"author": map[string]string{"selector": ".author", "type": "text"},
"date": map[string]string{"selector": "time", "type": "attr", "attribute": "datetime"},
"link": map[string]string{"selector": "a", "type": "link"},
"thumbnail": map[string]string{"selector": "img", "type": "image"},
},
},
}
rulesJSON, _ := json.Marshal(rules)
apiURL := fmt.Sprintf("https://api.ujeebu.com/scrape?url=%s&extract_rules=%s",
url.QueryEscape("https://news.example.com"),
url.QueryEscape(string(rulesJSON)))
req, _ := http.NewRequest("GET", apiURL, nil)
req.Header.Set("ApiKey", "YOUR_API_KEY")
client := &http.Client{}
resp, _ := client.Do(req)
defer resp.Body.Close()
body, _ := io.ReadAll(resp.Body)
fmt.Println(string(body))
}
package main
import (
"fmt"
"github.com/ujeebu/ujeebu-go"
)
func main() {
client, _ := ujeebu.NewClient("YOUR-API-KEY")
rules := map[string]interface{}{
"articles": map[string]interface{}{
"selector": "article",
"type": "obj",
"multiple": true,
"children": map[string]interface{}{
"headline": map[string]string{"selector": "h2", "type": "text"},
"summary": map[string]string{"selector": ".summary", "type": "text"},
"author": map[string]string{"selector": ".author", "type": "text"},
"date": map[string]string{"selector": "time", "type": "attr", "attribute": "datetime"},
"link": map[string]string{"selector": "a", "type": "link"},
"thumbnail": map[string]string{"selector": "img", "type": "image"},
},
},
}
response, credits, _ := client.Scrape(ujeebu.ScrapeParams{
URL: "https://news.example.com",
ExtractRules: rules,
})
fmt.Printf("Credits used: %d\n", credits)
fmt.Println(response)
}Table Data Extraction
Extract data from HTML tables:
{
"rows": {
"selector": "table tbody tr",
"type": "obj",
"multiple": true,
"children": {
"column1": { "selector": "td:nth-child(1)", "type": "text" },
"column2": { "selector": "td:nth-child(2)", "type": "text" },
"column3": { "selector": "td:nth-child(3)", "type": "text" }
}
}
}curl -G 'https://api.ujeebu.com/scrape' \
--data-urlencode 'url=https://data.example.com/table' \
--data-urlencode 'extract_rules={"rows":{"selector":"table tbody tr","type":"obj","multiple":true,"children":{"column1":{"selector":"td:nth-child(1)","type":"text"},"column2":{"selector":"td:nth-child(2)","type":"text"}}}}' \
-H "ApiKey: YOUR_API_KEY"const rules = {
rows: {
selector: 'table tbody tr',
type: 'obj',
multiple: true,
children: {
column1: { selector: 'td:nth-child(1)', type: 'text' },
column2: { selector: 'td:nth-child(2)', type: 'text' },
column3: { selector: 'td:nth-child(3)', type: 'text' }
}
}
};
const response = await fetch(
`https://api.ujeebu.com/scrape?url=${encodeURIComponent('https://data.example.com/table')}&extract_rules=${encodeURIComponent(JSON.stringify(rules))}`,
{ headers: { 'ApiKey': 'YOUR_API_KEY' } }
);
const data = await response.json();
console.log(data.result.rows);import { UjeebuClient } from '@ujeebu-org/ujeebu-sdk';
const client = new UjeebuClient(process.env.UJEEBU_API_KEY);
const rules = {
rows: {
selector: 'table tbody tr',
type: 'obj',
multiple: true,
children: {
column1: { selector: 'td:nth-child(1)', type: 'text' },
column2: { selector: 'td:nth-child(2)', type: 'text' },
column3: { selector: 'td:nth-child(3)', type: 'text' }
}
}
};
const result = await client.scrapeWithRules(
'https://data.example.com/table',
rules
);
console.log(result.rows);import requests
import json
rules = {
'rows': {
'selector': 'table tbody tr',
'type': 'obj',
'multiple': True,
'children': {
'column1': {'selector': 'td:nth-child(1)', 'type': 'text'},
'column2': {'selector': 'td:nth-child(2)', 'type': 'text'},
'column3': {'selector': 'td:nth-child(3)', 'type': 'text'}
}
}
}
response = requests.get(
'https://api.ujeebu.com/scrape',
params={
'url': 'https://data.example.com/table',
'extract_rules': json.dumps(rules)
},
headers={'ApiKey': 'YOUR_API_KEY'}
)
print(response.json()['result']['rows'])from ujeebu_python import UjeebuClient
ujeebu = UjeebuClient(api_key="YOUR_API_KEY")
rules = {
'rows': {
'selector': 'table tbody tr',
'type': 'obj',
'multiple': True,
'children': {
'column1': {'selector': 'td:nth-child(1)', 'type': 'text'},
'column2': {'selector': 'td:nth-child(2)', 'type': 'text'},
'column3': {'selector': 'td:nth-child(3)', 'type': 'text'}
}
}
}
result = ujeebu.scrape_with_rules(
url='https://data.example.com/table',
extract_rules=rules
)
print(result['rows'])import okhttp3.*;
import org.json.*;
import java.net.URLEncoder;
OkHttpClient client = new OkHttpClient();
JSONObject rules = new JSONObject()
.put("rows", new JSONObject()
.put("selector", "table tbody tr")
.put("type", "obj")
.put("multiple", true)
.put("children", new JSONObject()
.put("column1", new JSONObject().put("selector", "td:nth-child(1)").put("type", "text"))
.put("column2", new JSONObject().put("selector", "td:nth-child(2)").put("type", "text"))
.put("column3", new JSONObject().put("selector", "td:nth-child(3)").put("type", "text"))));
String url = "https://api.ujeebu.com/scrape?url=" +
URLEncoder.encode("https://data.example.com/table", "UTF-8") +
"&extract_rules=" + URLEncoder.encode(rules.toString(), "UTF-8");
Request request = new Request.Builder()
.url(url)
.addHeader("ApiKey", "YOUR_API_KEY")
.build();
Response response = client.newCall(request).execute();
System.out.println(response.body().string());<?php
$rules = [
'rows' => [
'selector' => 'table tbody tr',
'type' => 'obj',
'multiple' => true,
'children' => [
'column1' => ['selector' => 'td:nth-child(1)', 'type' => 'text'],
'column2' => ['selector' => 'td:nth-child(2)', 'type' => 'text'],
'column3' => ['selector' => 'td:nth-child(3)', 'type' => 'text']
]
]
];
$url = 'https://api.ujeebu.com/scrape?' . http_build_query([
'url' => 'https://data.example.com/table',
'extract_rules' => json_encode($rules)
]);
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_HTTPHEADER, ['ApiKey: YOUR_API_KEY']);
$response = curl_exec($ch);
curl_close($ch);
$data = json_decode($response, true);
print_r($data['result']['rows']);package main
import (
"encoding/json"
"fmt"
"io"
"net/http"
"net/url"
)
func main() {
rules := map[string]interface{}{
"rows": map[string]interface{}{
"selector": "table tbody tr",
"type": "obj",
"multiple": true,
"children": map[string]interface{}{
"column1": map[string]string{"selector": "td:nth-child(1)", "type": "text"},
"column2": map[string]string{"selector": "td:nth-child(2)", "type": "text"},
"column3": map[string]string{"selector": "td:nth-child(3)", "type": "text"},
},
},
}
rulesJSON, _ := json.Marshal(rules)
apiURL := fmt.Sprintf("https://api.ujeebu.com/scrape?url=%s&extract_rules=%s",
url.QueryEscape("https://data.example.com/table"),
url.QueryEscape(string(rulesJSON)))
req, _ := http.NewRequest("GET", apiURL, nil)
req.Header.Set("ApiKey", "YOUR_API_KEY")
client := &http.Client{}
resp, _ := client.Do(req)
defer resp.Body.Close()
body, _ := io.ReadAll(resp.Body)
fmt.Println(string(body))
}package main
import (
"fmt"
"github.com/ujeebu/ujeebu-go"
)
func main() {
client, _ := ujeebu.NewClient("YOUR-API-KEY")
rules := map[string]interface{}{
"rows": map[string]interface{}{
"selector": "table tbody tr",
"type": "obj",
"multiple": true,
"children": map[string]interface{}{
"column1": map[string]string{"selector": "td:nth-child(1)", "type": "text"},
"column2": map[string]string{"selector": "td:nth-child(2)", "type": "text"},
"column3": map[string]string{"selector": "td:nth-child(3)", "type": "text"},
},
},
}
response, credits, _ := client.Scrape(ujeebu.ScrapeParams{
URL: "https://data.example.com/table",
ExtractRules: rules,
})
fmt.Printf("Credits used: %d\n", credits)
fmt.Println(response)
}Advanced: Using Function Type
Extract data using custom JavaScript to access window variables, compute values, and use $parent:
{
"products": {
"selector": ".product-item",
"type": "obj",
"multiple": true,
"children": {
"name": { "selector": ".product-name", "type": "text" },
"price": { "selector": ".price", "type": "text" },
"sku": {
"type": "fn",
"fn": "return $parent.getAttribute('data-sku');"
},
"inStock": {
"type": "fn",
"fn": "return $parent.classList.contains('in-stock');"
},
"rating": {
"type": "fn",
"fn": "const stars = $parent.querySelectorAll('.star.filled').length; return stars;"
}
}
},
"pageConfig": {
"type": "fn",
"fn": "return window._pageConfig_ || {};"
},
"totalItems": {
"type": "fn",
"fn": "return document.querySelectorAll('.product-item').length;"
}
}curl -G 'https://api.ujeebu.com/scrape' \
--data-urlencode 'url=https://shop.example.com' \
--data-urlencode 'js=true' \
--data-urlencode 'extract_rules={"products":{"selector":".product-item","type":"obj","multiple":true,"children":{"name":{"selector":".product-name","type":"text"},"sku":{"type":"fn","fn":"return $parent.getAttribute(\"data-sku\");"},"inStock":{"type":"fn","fn":"return $parent.classList.contains(\"in-stock\");"}}},"pageConfig":{"type":"fn","fn":"return window._pageConfig_ || {};"}}' \
-H "ApiKey: YOUR_API_KEY"import { UjeebuClient } from '@ujeebu-org/ujeebu-sdk';
const client = new UjeebuClient(process.env.UJEEBU_API_KEY);
const rules = {
products: {
selector: '.product-item',
type: 'obj',
multiple: true,
children: {
name: { selector: '.product-name', type: 'text' },
price: { selector: '.price', type: 'text' },
sku: {
type: 'fn',
fn: "return $parent.getAttribute('data-sku');"
},
inStock: {
type: 'fn',
fn: "return $parent.classList.contains('in-stock');"
},
rating: {
type: 'fn',
fn: "const stars = $parent.querySelectorAll('.star.filled').length; return stars;"
}
}
},
pageConfig: {
type: 'fn',
fn: "return window._pageConfig_ || {};"
},
totalItems: {
type: 'fn',
fn: "return document.querySelectorAll('.product-item').length;"
}
};
const result = await client.scrapeWithRules(
'https://shop.example.com',
rules,
{ js: true }
);
console.log(result);from ujeebu_python import UjeebuClient
ujeebu = UjeebuClient(api_key="YOUR_API_KEY")
rules = {
'products': {
'selector': '.product-item',
'type': 'obj',
'multiple': True,
'children': {
'name': {'selector': '.product-name', 'type': 'text'},
'price': {'selector': '.price', 'type': 'text'},
'sku': {
'type': 'fn',
'fn': "return $parent.getAttribute('data-sku');"
},
'inStock': {
'type': 'fn',
'fn': "return $parent.classList.contains('in-stock');"
},
'rating': {
'type': 'fn',
'fn': "const stars = $parent.querySelectorAll('.star.filled').length; return stars;"
}
}
},
'pageConfig': {
'type': 'fn',
'fn': "return window._pageConfig_ || {};"
},
'totalItems': {
'type': 'fn',
'fn': "return document.querySelectorAll('.product-item').length;"
}
}
result = ujeebu.scrape_with_rules(
url='https://shop.example.com',
extract_rules=rules,
params={'js': True}
)
print(result)package main
import (
"fmt"
"github.com/ujeebu/ujeebu-go"
)
func main() {
client, _ := ujeebu.NewClient("YOUR-API-KEY")
rules := map[string]interface{}{
"products": map[string]interface{}{
"selector": ".product-item",
"type": "obj",
"multiple": true,
"children": map[string]interface{}{
"name": map[string]string{"selector": ".product-name", "type": "text"},
"price": map[string]string{"selector": ".price", "type": "text"},
"sku": map[string]string{
"type": "fn",
"fn": "return $parent.getAttribute('data-sku');",
},
"inStock": map[string]string{
"type": "fn",
"fn": "return $parent.classList.contains('in-stock');",
},
},
},
"pageConfig": map[string]string{
"type": "fn",
"fn": "return window._pageConfig_ || {};",
},
"totalItems": map[string]string{
"type": "fn",
"fn": "return document.querySelectorAll('.product-item').length;",
},
}
response, credits, _ := client.Scrape(ujeebu.ScrapeParams{
URL: "https://shop.example.com",
JS: true,
ExtractRules: rules,
})
fmt.Printf("Credits used: %d\n", credits)
fmt.Println(response)
}Expected Response:
{
"success": true,
"result": {
"products": [
{
"name": "Premium Headphones",
"price": "$199.99",
"sku": "HD-2024-BLK",
"inStock": true,
"rating": 5
},
{
"name": "Wireless Mouse",
"price": "$49.99",
"sku": "MS-2024-GRY",
"inStock": false,
"rating": 4
}
],
"pageConfig": {
"currency": "USD",
"locale": "en-US",
"version": "2.1.0"
},
"totalItems": 24
}
}
TIP — Function Type Use Cases
The `fn` type is perfect for:
- Extracting data from JavaScript variables (e.g., `window.__NEXT_DATA__`)
- Accessing parent element attributes with `$parent`
- Computing derived values (totals, averages, etc.)
- Parsing JSON from script tags
- Checking element states (classes, visibility)
- Accessing localStorage/sessionStorage
- Complex conditional logic
Advanced: Table and Regex Extraction
Extract product specifications from a table and SKU from text using table and regex types:
{
"product_name": {
"selector": "h1.product-title",
"type": "text"
},
"sku": {
"type": "regex",
"selector": ".product-info",
"pattern": "SKU:\\s*([A-Z0-9-]+)"
},
"specifications": {
"selector": "table.specs",
"type": "table"
},
"product_details": {
"selector": "table.details",
"type": "tableTranspose"
},
"price": {
"type": "regex",
"selector": ".price-text",
"pattern": "\\$([0-9,.]+)"
}
}curl -G 'https://api.ujeebu.com/scrape' \
--data-urlencode 'url=https://shop.example.com/product/123' \
--data-urlencode 'extract_rules={"product_name":{"selector":"h1.product-title","type":"text"},"sku":{"type":"regex","selector":".product-info","pattern":"SKU:\\s*([A-Z0-9-]+)"},"specifications":{"selector":"table.specs","type":"table"},"product_details":{"selector":"table.details","type":"tableTranspose"},"price":{"type":"regex","selector":".price-text","pattern":"\\$([0-9,.]+)"}}' \
-H "ApiKey: YOUR_API_KEY"import { UjeebuClient } from '@ujeebu-org/ujeebu-sdk';
const client = new UjeebuClient(process.env.UJEEBU_API_KEY);
const rules = {
product_name: {
selector: 'h1.product-title',
type: 'text'
},
sku: {
type: 'regex',
selector: '.product-info',
pattern: 'SKU:\\s*([A-Z0-9-]+)'
},
specifications: {
selector: 'table.specs',
type: 'table'
},
product_details: {
selector: 'table.details',
type: 'tableTranspose'
},
price: {
type: 'regex',
selector: '.price-text',
pattern: '\\$([0-9,.]+)'
}
};
const result = await client.scrapeWithRules(
'https://shop.example.com/product/123',
rules
);
console.log(result);from ujeebu_python import UjeebuClient
ujeebu = UjeebuClient(api_key="YOUR_API_KEY")
rules = {
'product_name': {
'selector': 'h1.product-title',
'type': 'text'
},
'sku': {
'type': 'regex',
'selector': '.product-info',
'pattern': r'SKU:\s*([A-Z0-9-]+)'
},
'specifications': {
'selector': 'table.specs',
'type': 'table'
},
'product_details': {
'selector': 'table.details',
'type': 'tableTranspose'
},
'price': {
'type': 'regex',
'selector': '.price-text',
'pattern': r'\$([0-9,.]+)'
}
}
result = ujeebu.scrape_with_rules(
url='https://shop.example.com/product/123',
extract_rules=rules
)
print(result)package main
import (
"fmt"
"github.com/ujeebu/ujeebu-go"
)
func main() {
client, _ := ujeebu.NewClient("YOUR-API-KEY")
rules := map[string]interface{}{
"product_name": map[string]string{
"selector": "h1.product-title",
"type": "text",
},
"sku": map[string]string{
"type": "regex",
"selector": ".product-info",
"pattern": `SKU:\s*([A-Z0-9-]+)`,
},
"specifications": map[string]string{
"selector": "table.specs",
"type": "table",
},
"product_details": map[string]string{
"selector": "table.details",
"type": "tableTranspose",
},
"price": map[string]string{
"type": "regex",
"selector": ".price-text",
"pattern": `\$([0-9,.]+)`,
},
}
response, credits, _ := client.Scrape(ujeebu.ScrapeParams{
URL: "https://shop.example.com/product/123",
ExtractRules: rules,
})
fmt.Printf("Credits used: %d\n", credits)
fmt.Println(response)
}Expected Response:
{
"success": true,
"result": {
"product_name": "Premium Wireless Headphones",
"sku": "HD-2024-BLK-256",
"price": "199.99",
"specifications": [
{"feature": "Battery Life", "value": "40 hours"},
{"feature": "Bluetooth", "value": "5.3"},
{"feature": "Weight", "value": "250g"},
{"feature": "Charging", "value": "USB-C"}
],
"product_details": {
"brand": "AudioTech",
"model": "HD-2024",
"color": "Black",
"warranty": "2 years"
}
}
}
TIP — Table and Regex Use Cases
- Table extraction: Product specifications, pricing tables, feature comparisons, data tables
- Regex extraction: SKUs, prices, phone numbers, email addresses, dates, order numbers, tracking IDs
- Combine both: Extract structured tabular data alongside pattern-based text extraction for comprehensive scraping
Advanced: E-commerce Product with Transformations
Extract and transform product data with clean, structured output:
{
"products": {
"selector": ".product-card",
"type": "obj",
"multiple": true,
"children": {
"name": {
"selector": ".product-name",
"type": "text",
"transform": ["trim", "normalizeWhitespace"]
},
"price": {
"selector": ".price",
"type": "text",
"transform": ["currency", ["round", 2]]
},
"original_price": {
"selector": ".original-price",
"type": "text",
"transform": ["currency", ["default", null]]
},
"discount_percent": {
"selector": ".discount",
"type": "text",
"transform": [["match", "(\\d+)%", 1], "integer"]
},
"rating": {
"selector": ".rating",
"type": "text",
"transform": ["number", ["round", 1], ["default", 0]]
},
"reviews_count": {
"selector": ".reviews",
"type": "text",
"transform": [["match", "(\\d+)", 1], "integer"]
},
"tags": {
"selector": ".tags",
"type": "text",
"transform": [["split", ","], "compact", "unique", "lowercase"]
},
"in_stock": {
"selector": ".stock-status",
"type": "text",
"transform": ["lowercase", "boolean"]
},
"images": {
"selector": ".product-images img",
"type": "image",
"multiple": true,
"transform": ["compact", "unique"]
}
}
}
}import { UjeebuClient } from '@ujeebu-org/ujeebu-sdk';
const client = new UjeebuClient(process.env.UJEEBU_API_KEY);
const rules = {
products: {
selector: '.product-card',
type: 'obj',
multiple: true,
children: {
name: {
selector: '.product-name',
type: 'text',
transform: ['trim', 'normalizeWhitespace']
},
price: {
selector: '.price',
type: 'text',
transform: ['currency', ['round', 2]]
},
original_price: {
selector: '.original-price',
type: 'text',
transform: ['currency', ['default', null]]
},
discount_percent: {
selector: '.discount',
type: 'text',
transform: [['match', '(\\d+)%', 1], 'integer']
},
rating: {
selector: '.rating',
type: 'text',
transform: ['number', ['round', 1], ['default', 0]]
},
reviews_count: {
selector: '.reviews',
type: 'text',
transform: [['match', '(\\d+)', 1], 'integer']
},
tags: {
selector: '.tags',
type: 'text',
transform: [['split', ','], 'compact', 'unique', 'lowercase']
},
in_stock: {
selector: '.stock-status',
type: 'text',
transform: ['lowercase', 'boolean']
},
images: {
selector: '.product-images img',
type: 'image',
multiple: true,
transform: ['compact', 'unique']
}
}
}
};
const result = await client.scrapeWithRules(
'https://shop.example.com',
rules
);
console.log(result);from ujeebu_python import UjeebuClient
ujeebu = UjeebuClient(api_key="YOUR_API_KEY")
rules = {
'products': {
'selector': '.product-card',
'type': 'obj',
'multiple': True,
'children': {
'name': {
'selector': '.product-name',
'type': 'text',
'transform': ['trim', 'normalizeWhitespace']
},
'price': {
'selector': '.price',
'type': 'text',
'transform': ['currency', ['round', 2]]
},
'original_price': {
'selector': '.original-price',
'type': 'text',
'transform': ['currency', ['default', None]]
},
'discount_percent': {
'selector': '.discount',
'type': 'text',
'transform': [['match', r'(\d+)%', 1], 'integer']
},
'rating': {
'selector': '.rating',
'type': 'text',
'transform': ['number', ['round', 1], ['default', 0]]
},
'reviews_count': {
'selector': '.reviews',
'type': 'text',
'transform': [['match', r'(\d+)', 1], 'integer']
},
'tags': {
'selector': '.tags',
'type': 'text',
'transform': [['split', ','], 'compact', 'unique', 'lowercase']
},
'in_stock': {
'selector': '.stock-status',
'type': 'text',
'transform': ['lowercase', 'boolean']
},
'images': {
'selector': '.product-images img',
'type': 'image',
'multiple': True,
'transform': ['compact', 'unique']
}
}
}
}
result = ujeebu.scrape_with_rules(
url='https://shop.example.com',
extract_rules=rules
)
print(result)package main
import (
"fmt"
"github.com/ujeebu/ujeebu-go"
)
func main() {
client, _ := ujeebu.NewClient("YOUR-API-KEY")
rules := map[string]interface{}{
"products": map[string]interface{}{
"selector": ".product-card",
"type": "obj",
"multiple": true,
"children": map[string]interface{}{
"name": map[string]interface{}{
"selector": ".product-name",
"type": "text",
"transform": []string{"trim", "normalizeWhitespace"},
},
"price": map[string]interface{}{
"selector": ".price",
"type": "text",
"transform": []interface{}{"currency", []interface{}{"round", 2}},
},
"rating": map[string]interface{}{
"selector": ".rating",
"type": "text",
"transform": []interface{}{"number", []interface{}{"round", 1}, []interface{}{"default", 0}},
},
"tags": map[string]interface{}{
"selector": ".tags",
"type": "text",
"transform": []interface{}{[]string{"split", ","}, "compact", "unique", "lowercase"},
},
"in_stock": map[string]interface{}{
"selector": ".stock-status",
"type": "text",
"transform": []string{"lowercase", "boolean"},
},
},
},
}
response, credits, _ := client.Scrape(ujeebu.ScrapeParams{
URL: "https://shop.example.com",
ExtractRules: rules,
})
fmt.Printf("Credits used: %d\n", credits)
fmt.Println(response)
}Expected Response:
{
"success": true,
"result": {
"products": [
{
"name": "Wireless Headphones Premium",
"price": 199.99,
"original_price": 249.99,
"discount_percent": 20,
"rating": 4.5,
"reviews_count": 1234,
"tags": ["electronics", "audio", "wireless"],
"in_stock": true,
"images": [
"https://cdn.example.com/img1.jpg",
"https://cdn.example.com/img2.jpg"
]
},
{
"name": "Smart Watch Pro",
"price": 299.99,
"original_price": null,
"discount_percent": null,
"rating": 4.8,
"reviews_count": 567,
"tags": ["electronics", "wearable", "fitness"],
"in_stock": false,
"images": [
"https://cdn.example.com/watch1.jpg"
]
}
]
}
}
What This Example Demonstrates:
- Price cleaning: Convert "$199.99" to 199.99 (number)
- Discount extraction: Extract 20 from "Save 20%!"
- Rating normalization: Round 4.567 to 4.6
- Review count extraction: Extract 1234 from "1234 reviews"
- Tag processing: Split, clean, deduplicate, and lowercase tags
- Boolean conversion: Convert "in stock" to true
- Array deduplication: Remove duplicate images
- Fallback values: Use null when original price is missing
SUCCESS — Transformation Best Practices
When building extraction rules with transformations:
- Chain transforms logically: Start with text cleaning (trim), then extraction (match, split), then type conversion (number, boolean)
- Use defaults for optional fields: Supply a fallback value with ["default", 0], or use ["default", null] to return an explicit null when the element is missing
- Clean arrays: Use compact to remove empty values and unique to deduplicate
- Parse numbers correctly: Use currency for prices, number for decimals, integer for whole numbers
- Extract before converting: Use match to extract patterns before converting types
- Test incrementally: Build your transform pipeline step by step to catch errors early
Parameters
| Parameter | Type | Required | Default | Description |
|---|---|---|---|---|
url |
string |
Yes | - |
The URL to scrape. |
extract_rules |
json-string |
Yes | - |
JSON object defining extraction rules. |
js |
boolean |
No | false |
Enable JavaScript rendering before extraction. |
wait_for |
string or number |
No | null |
Delay in milliseconds (number) or CSS selector to wait for (string) before extraction. |
timeout |
number |
No | 60 |
Maximum seconds before request timeout. |
proxy_type |
string |
No | rotating |
Proxy type: 'rotating', 'premium', 'residential', 'custom'. |
Response Format
The API returns a JSON response with the extracted data in the result field:
{
"success": true,
"result": {
"key_name": "extracted_value",
"another_key": ["array", "of", "values"]
}
}
If extraction fails for a selector, the value will be null, or an empty array when the rule sets multiple to true.
Spin up an API key in 60 seconds
Free tier: 5,000 credits, no card, full access to every endpoint on this page.