Extract Rules

Extract structured data from any web page using CSS selectors. Define extraction rules to scrape specific elements and get clean JSON output.

Overview

Extract rules allow you to scrape structured data from any web page using CSS selectors. Add the extract_rules parameter to your API call with a JSON object defining what data to extract.

GET https://api.ujeebu.com/scrape?extract_rules={...}

TIP - Powerful Data Extraction

Extract rules are perfect for extracting product information, article content, lists, tables, or any repeating data patterns from web pages.

Basic Format

The simplest way to use extract rules:

{
  "key_name": {
    "selector": "css_selector",
    "type": "rule_type"
  }
}

curl -X GET 'https://api.ujeebu.com/scrape?url=https://example.com&extract_rules={"title":{"selector":"h1","type":"text"}}' \
  -H "ApiKey: YOUR_API_KEY"

const extractRules = {
  title: { selector: 'h1', type: 'text' },
  description: { selector: 'meta[name=description]', type: 'attr', attribute: 'content' }
};

const response = await fetch(
  `https://api.ujeebu.com/scrape?url=https://example.com&extract_rules=${encodeURIComponent(JSON.stringify(extractRules))}`,
  { headers: { 'ApiKey': 'YOUR_API_KEY' } }
);

const data = await response.json();
console.log(data.result);

import { UjeebuClient } from '@ujeebu-org/ujeebu-sdk';

const client = new UjeebuClient(process.env.UJEEBU_API_KEY);

const extractRules = {
  title: { selector: 'h1', type: 'text' },
  description: { selector: 'meta[name=description]', type: 'attr', attribute: 'content' }
};

const result = await client.scrapeWithRules(
  'https://example.com',
  extractRules
);

console.log(result);

import requests
import json

extract_rules = {
    'title': {'selector': 'h1', 'type': 'text'},
    'description': {'selector': 'meta[name=description]', 'type': 'attr', 'attribute': 'content'}
}

response = requests.get(
    'https://api.ujeebu.com/scrape',
    params={
        'url': 'https://example.com',
        'extract_rules': json.dumps(extract_rules)
    },
    headers={'ApiKey': 'YOUR_API_KEY'}
)

print(response.json()['result'])

from ujeebu_python import UjeebuClient

ujeebu = UjeebuClient(api_key="YOUR_API_KEY")

extract_rules = {
    'title': {'selector': 'h1', 'type': 'text'},
    'description': {'selector': 'meta[name=description]', 'type': 'attr', 'attribute': 'content'}
}

result = ujeebu.scrape_with_rules(
    url='https://example.com',
    extract_rules=extract_rules
)

print(result)

import okhttp3.*;
import org.json.*;

OkHttpClient client = new OkHttpClient();

JSONObject extractRules = new JSONObject()
    .put("title", new JSONObject().put("selector", "h1").put("type", "text"))
    .put("description", new JSONObject()
        .put("selector", "meta[name=description]")
        .put("type", "attr")
        .put("attribute", "content"));

String url = "https://api.ujeebu.com/scrape?url=" + 
    URLEncoder.encode("https://example.com", "UTF-8") +
    "&extract_rules=" + URLEncoder.encode(extractRules.toString(), "UTF-8");

Request request = new Request.Builder()
    .url(url)
    .addHeader("ApiKey", "YOUR_API_KEY")
    .build();

Response response = client.newCall(request).execute();
System.out.println(response.body().string());

<?php

$extractRules = [
    'title' => ['selector' => 'h1', 'type' => 'text'],
    'description' => [
        'selector' => 'meta[name=description]',
        'type' => 'attr',
        'attribute' => 'content'
    ]
];

$url = 'https://api.ujeebu.com/scrape?' . http_build_query([
    'url' => 'https://example.com',
    'extract_rules' => json_encode($extractRules)
]);

$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_HTTPHEADER, [
    'ApiKey: YOUR_API_KEY'
]);

$response = curl_exec($ch);
curl_close($ch);

$data = json_decode($response, true);
print_r($data['result']);

package main

import (
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"net/url"
)

func main() {
	extractRules := map[string]interface{}{
		"title": map[string]string{"selector": "h1", "type": "text"},
		"description": map[string]string{
			"selector":  "meta[name=description]",
			"type":      "attr",
			"attribute": "content",
		},
	}

	rulesJSON, _ := json.Marshal(extractRules)
	apiURL := fmt.Sprintf("https://api.ujeebu.com/scrape?url=%s&extract_rules=%s",
		url.QueryEscape("https://example.com"),
		url.QueryEscape(string(rulesJSON)))

	req, _ := http.NewRequest("GET", apiURL, nil)
	req.Header.Set("ApiKey", "YOUR_API_KEY")

	client := &http.Client{}
	resp, _ := client.Do(req)
	defer resp.Body.Close()

	body, _ := io.ReadAll(resp.Body)
	fmt.Println(string(body))
}

package main

import (
	"fmt"
	"github.com/ujeebu/ujeebu-go"
)

func main() {
	client, err := ujeebu.NewClient("YOUR-API-KEY")
	if err != nil {
		panic(err)
	}

	extractRules := map[string]interface{}{
		"title": map[string]string{"selector": "h1", "type": "text"},
		"description": map[string]string{
			"selector":  "meta[name=description]",
			"type":      "attr",
			"attribute": "content",
		},
	}

	response, credits, err := client.Scrape(ujeebu.ScrapeParams{
		URL:          "https://example.com",
		ExtractRules: extractRules,
	})
	if err != nil {
		panic(err)
	}

	fmt.Printf("Credits used: %d\n", credits)
	fmt.Println(response)
}

Rule Types

There are 9 types of extraction rules:

Parameter	Type	Required	Description
`text`	`rule`	No	Returns the text content of the matched element.
`link`	`rule`	No	Returns the href attribute if the element is an `<a>` tag.
`image`	`rule`	No	Returns the src attribute if the element is an `<img>` tag.
`attr`	`rule`	No	Returns a specified attribute value. Requires 'attribute' property.
`obj`	`rule`	No	Returns an object with nested rules defined in 'children'.
`fn`	`rule`	No	Executes custom JavaScript code to extract data. Access window object, DOM API, and parent elements.
`table`	`rule`	No	Extracts HTML tables as arrays of objects with automatic header detection.
`tableTranspose`	`rule`	No	Extracts transposed tables (key-value format) where headers are in the first column.
`regex`	`rule`	No	Extracts data using regular expressions directly from page content.

Text Rule

Extract the text content of an element:

{
  "product_name": {
    "selector": ".product-title",
    "type": "text"
  }
}

Link Rule

Extract the href from anchor tags:

{
  "article_url": {
    "selector": "a.article-link",
    "type": "link"
  }
}

Image Rule

Extract the src from image tags:

{
  "product_image": {
    "selector": "img.product-photo",
    "type": "image"
  }
}

Attribute Rule

Extract any attribute value:

{
  "meta_description": {
    "selector": "meta[name=description]",
    "type": "attr",
    "attribute": "content"
  }
}

Object Rule

Extract nested data structures:

{
  "article_card": {
    "selector": "article.card-item",
    "type": "obj",
    "children": {
      "title": { "selector": "h1", "type": "text" },
      "link": { "selector": "a", "type": "link" }
    }
  }
}

Function Rule

Execute custom JavaScript code to extract data. This is the most powerful extraction type, allowing you to:

Access global JavaScript variables (window object)
Execute custom computations
Access browser storage (localStorage, sessionStorage)
Parse JSON from script tags
Use the $parent variable to access parent elements in nested extractions
Perform async operations with await

Basic Syntax:

{
  "config": {
    "type": "fn",
    "fn": "return window._app_config_;"
  }
}

Access Window Variables:

{
  "appState": {
    "type": "fn",
    "fn": "return window.__INITIAL_STATE__;"
  }
}

Using $parent in Nested Objects:

When used within nested object extraction, the special $parent variable references the parent element:

{
  "products": {
    "selector": ".product",
    "type": "obj",
    "multiple": true,
    "children": {
      "name": {
        "selector": ".name",
        "type": "text"
      },
      "productId": {
        "type": "fn",
        "fn": "return $parent.getAttribute('data-id');"
      },
      "category": {
        "type": "fn",
        "fn": "return $parent.dataset.category;"
      },
      "itemCount": {
        "type": "fn",
        "fn": "return $parent.querySelectorAll('.item').length;"
      }
    }
  }
}

Parse JSON from Script Tags:

{
  "productData": {
    "type": "fn",
    "fn": "const script = document.querySelector('script[type=\"application/ld+json\"]'); return script ? JSON.parse(script.textContent) : null;"
  }
}

Compute Values:

{
  "totalPrice": {
    "type": "fn",
    "fn": "const prices = Array.from(document.querySelectorAll('.price')).map(el => parseFloat(el.textContent.replace('$', ''))); return prices.reduce((sum, p) => sum + p, 0);"
  }
}

Access Browser Storage:

{
  "userData": {
    "type": "fn",
    "fn": "return JSON.parse(localStorage.getItem('user_data') || '{}');"
  }
}

Conditional Logic:

{
  "availability": {
    "type": "fn",
    "fn": "return document.querySelector('.in-stock') ? 'available' : 'out of stock';"
  }
}

INFO - Best Practices

Use optional chaining (?.) to safely access nested properties

Provide default values with nullish coalescing (??)

Keep functions simple and focused

Use $parent for scoped queries in nested objects

Reserve fn type for complex scenarios; use CSS selectors for simple cases

WARNING - Error Handling

If the function throws an error, the extracted value for that rule will be null. Always use safe access patterns with optional chaining (?.) and nullish coalescing (??) operators.

Safe Access Example:

{
  "safeValue": {
    "type": "fn",
    "fn": "return window.config?.data?.value ?? 'default';"
  }
}

Table Extraction

Extract HTML tables automatically with header detection. Perfect for extracting tabular data like product specifications, pricing tables, or feature comparisons.

Basic Table Example:

{
  "specifications": {
    "selector": "table.product-specs",
    "type": "table"
  }
}

Given this HTML:

<table class="product-specs">
  <thead>
    <tr><th>Feature</th><th>Value</th></tr>
  </thead>
  <tbody>
    <tr><td>Weight</td><td>2.5 kg</td></tr>
    <tr><td>Dimensions</td><td>30x20x10 cm</td></tr>
    <tr><td>Color</td><td>Black</td></tr>
  </tbody>
</table>

Result:

{
  "specifications": [
    {"feature": "Weight", "value": "2.5 kg"},
    {"feature": "Dimensions", "value": "30x20x10 cm"},
    {"feature": "Color", "value": "Black"}
  ]
}

Advanced Table Options:

{
  "pricing": {
    "selector": "table.pricing",
    "type": "table",
    "headers": ["plan", "price", "features"],
    "rowSelector": "tbody tr",
    "normalizeHeaders": true,
    "asArray": false
  }
}

Table Properties:

headers (array): Custom header names to use instead of auto-detection
headerSelector (string): Custom CSS selector for header cells
rowSelector (string): Custom CSS selector for data rows (default: "tbody tr, tr:not(:first-child)")
normalizeHeaders (boolean): Convert headers to snake_case (default: true)
asArray (boolean): Return rows as arrays instead of objects (default: false)

Transposed Table

For tables where headers are in the first column (key-value format). Common in product detail pages and specification sheets.

Example:

{
  "product_details": {
    "selector": "table.specs",
    "type": "tableTranspose"
  }
}

Given this HTML:

<table class="specs">
  <tr><th>Brand</th><td>Apple</td></tr>
  <tr><th>Model</th><td>iPhone 15</td></tr>
  <tr><th>Storage</th><td>256GB</td></tr>
  <tr><th>Color</th><td>Blue</td></tr>
</table>

Result:

{
  "product_details": {
    "brand": "Apple",
    "model": "iPhone 15",
    "storage": "256GB",
    "color": "Blue"
  }
}

Regex Extraction

Extract data using regular expressions directly from page content. Useful for extracting structured data embedded in text, such as SKUs, phone numbers, emails, or prices.

Basic Regex Example:

{
  "sku": {
    "type": "regex",
    "selector": ".product-info",
    "pattern": "SKU:\\s*([A-Z0-9-]+)"
  }
}

Given this HTML:

<div class="product-info">
  Product details here. SKU: ABC-12345. More info...
</div>

Result:

{
  "sku": "ABC-12345"
}

Extract from Entire Page:

{
  "phone_numbers": {
    "type": "regex",
    "pattern": "\\d{3}-\\d{3}-\\d{4}",
    "flags": "g"
  }
}

Multiple Capture Groups:

{
  "date_parts": {
    "type": "regex",
    "selector": ".publish-date",
    "pattern": "(\\d{4})-(\\d{2})-(\\d{2})",
    "allGroups": true
  }
}

Result: {"date_parts": ["2024", "12", "31"]}

Regex Properties:

pattern (string, required): Regular expression pattern
selector (string, optional): CSS selector to scope the search (defaults to entire page)
source (string): "text" (default) or "html" - search in text content or HTML
flags (string): Regex flags (default: "i" for case-insensitive)
group (number): Specific capture group to return (0 = full match)
allGroups (boolean): Return all capture groups as array
default (any): Default value if no match found

Extract Price from Text:

{
  "price": {
    "type": "regex",
    "selector": ".price-text",
    "pattern": "\\$([0-9,.]+)",
    "default": "0.00"
  }
}

Data Transformation Pipeline

Apply post-processing transformations to extracted values using the transform property. Transformations are applied in order (chained), allowing you to clean, convert, and reshape data after extraction.

Basic Transformation:

{
  "price": {
    "selector": ".price",
    "type": "text",
    "transform": ["number"]
  }
}

Input: "$49.99" → Output: 49.99

Chained Transformations:

{
  "price": {
    "selector": ".price",
    "type": "text",
    "transform": [
      "trim",
      ["replace", "\\$", ""],
      ["replace", "USD", ""],
      "trim",
      "number"
    ]
  }
}

Input: " $1,234.56 USD " → Output: 1234.56

Available Transformations:

Transform	Arguments	Description	Example
String Transforms
`trim`	-	Remove whitespace	`" text " → "text"`
`lowercase`	-	Convert to lowercase	`"TEXT" → "text"`
`uppercase`	-	Convert to uppercase	`"text" → "TEXT"`
`capitalize`	-	Capitalize first letter	`"hello" → "Hello"`
`replace`	pattern, replacement	Regex replace	`["replace", "\\$", ""]`
`split`	delimiter	Split into array	`["split", ","]`
`substring`	start, end	Get substring	`["substring", 0, 5]`
`prefix`	text	Add prefix	`["prefix", "$"]`
`suffix`	text	Add suffix	`["suffix", " USD"]`
Number Transforms
`number`	-	Parse as float	`"$49.99" → 49.99`
`integer`	-	Parse as integer	`"42.7" → 42`
`currency`	-	Parse currency	`"$1,234.56" → 1234.56`
`round`	decimals	Round to decimals	`["round", 2]`
`floor`	-	Round down	`4.9 → 4`
`ceil`	-	Round up	`4.1 → 5`
`abs`	-	Absolute value	`-5 → 5`
Array Transforms
`first`	-	Get first element	`[1,2,3] → 1`
`last`	-	Get last element	`[1,2,3] → 3`
`nth`	index	Get nth element	`["nth", 1]`
`join`	delimiter	Join to string	`["join", ", "]`
`unique`	-	Remove duplicates	`[1,1,2] → [1,2]`
`compact`	-	Remove null/empty	`[1,null,"",2] → [1,2]`
`flatten`	-	Flatten nested arrays	`[[1,2],[3]] → [1,2,3]`
`reverse`	-	Reverse array/string	`[1,2,3] → [3,2,1]`
`sort`	order	Sort (asc/desc)	`["sort", "desc"]`
`length`	-	Get length	`"text" → 4`
Regex Transforms
`match`	pattern, group	Extract with regex	`["match", "\\d+", 0]`
`matchAll`	pattern	All matches	`["matchAll", "\\d+"]`
Type Conversion
`boolean`	-	Parse as boolean	`"true" → true`
`string`	-	Convert to string	`42 → "42"`
`json`	-	Parse JSON string	`'{"a":1}' → {a:1}`
`stringify`	-	Convert to JSON	`{a:1} → '{"a":1}'`
Date Transforms
`date`	format	Parse date	`["date", "iso"]`
HTML/Text Cleaning
`stripHtml`	-	Remove HTML tags	`"<b>text</b>" → "text"`
`normalizeWhitespace`	-	Collapse spaces	`"a    b" → "a b"`
`removeNewlines`	-	Remove line breaks	Multi-line → single line
`decode`	-	Decode HTML entities	`"&amp;" → "&"`
Default/Fallback
`default`	value	Fallback if null	`["default", "N/A"]`
`nullIf`	value	Null if equals	`["nullIf", ""]`
`emptyToNull`	-	Convert empty to null	`"" → null`

Practical Examples:

1. Clean and Parse Price:

{
  "price_numeric": {
    "selector": ".price",
    "type": "text",
    "transform": ["currency"]
  }
}

Input: "Price: $1,234.56" → Output: 1234.56

2. Split Tags into Array:

{
  "tags": {
    "selector": ".product-tags",
    "type": "text",
    "transform": [
      "trim",
      ["split", ","],
      "compact",
      "unique"
    ]
  }
}

Input: "electronics, gadgets, , electronics" → Output: ["electronics", "gadgets"]

3. Extract and Round Rating:

{
  "rating": {
    "selector": ".rating-value",
    "type": "text",
    "transform": [
      "number",
      ["round", 1]
    ]
  }
}

Input: "4.567 stars" → Output: 4.6

4. Get First Available Image:

{
  "main_image": {
    "selector": "img.product-image",
    "type": "image",
    "multiple": true,
    "transform": ["first"]
  }
}

5. Parse JSON from Data Attribute:

{
  "config": {
    "selector": "[data-config]",
    "type": "attr",
    "attribute": "data-config",
    "transform": ["json"]
  }
}

Input: '{"theme":"dark","lang":"en"}' → Output: {"theme":"dark","lang":"en"}

6. Boolean Conversion:

{
  "in_stock": {
    "selector": ".stock-status",
    "type": "text",
    "transform": ["boolean"]
  }
}

Input: "true" or "yes" or "1" → Output: true

TIP - Transform Tips

Transformations are applied sequentially from left to right

Use array syntax for transforms with arguments: ["replace", "pattern", "replacement"]

Combine multiple transforms to build complex pipelines

The default transform provides fallback values for missing data

Use compact to clean arrays of null/empty values

Simple Extraction

Extract a single value from a page. This example extracts the user agent from whatsmyuseragent.org:

{
  "user-agent": {
    "selector": ".user-agent .intro-text",
    "type": "text"
  }
}

Response:

{
  "success": true,
  "result": {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.0 Safari/537.36"
  }
}

Multiple Items

Extract multiple matching elements by adding "multiple": true. This example extracts all quotes from a quotes page:

{
  "quote": {
    "selector": ".quote-card .description",
    "type": "text",
    "multiple": true
  }
}

Response:

{
  "success": true,
  "result": {
    "quote": [
      "“The world as we have created it is a process of our thinking...”",
      "“It is our choices, Harry, that show what we truly are...”",
      "“There are only two ways to live your life...”"
    ]
  }
}

Nested Items

Extract complex nested data using type: "obj" with children. This example extracts quotes with their authors and tags:

{
  "quotes": {
    "selector": ".quote-card",
    "type": "obj",
    "multiple": true,
    "children": {
      "text": {
        "selector": ".description",
        "type": "text"
      },
      "author": {
        "selector": ".author",
        "type": "text"
      },
      "tags": {
        "selector": ".tags .tag",
        "type": "text",
        "multiple": true
      }
    }
  }
}

Response:

{
  "success": true,
  "result": {
    "quotes": [
      {
        "text": "“The world as we have created it...”",
        "author": "Albert Einstein",
        "tags": ["change", "deep-thoughts", "thinking", "world"]
      },
      {
        "text": "“It is our choices, Harry...”",
        "author": "J.K. Rowling",
        "tags": ["abilities", "choices"]
      }
    ]
  }
}

Real-World Examples

E-commerce Product Scraping

Extract product data from an e-commerce page:

{
  "products": {
    "selector": ".product-card",
    "type": "obj",
    "multiple": true,
    "children": {
      "name": { "selector": ".product-name", "type": "text" },
      "price": { "selector": ".price", "type": "text" },
      "image": { "selector": "img", "type": "image" },
      "url": { "selector": "a", "type": "link" },
      "rating": { "selector": ".rating", "type": "attr", "attribute": "data-rating" }
    }
  }
}

curl -G 'https://api.ujeebu.com/scrape' \
  --data-urlencode 'url=https://shop.example.com/products' \
  --data-urlencode 'extract_rules={"products":{"selector":".product-card","type":"obj","multiple":true,"children":{"name":{"selector":".product-name","type":"text"},"price":{"selector":".price","type":"text"}}}}' \
  -H "ApiKey: YOUR_API_KEY"

const rules = {
  products: {
    selector: '.product-card',
    type: 'obj',
    multiple: true,
    children: {
      name: { selector: '.product-name', type: 'text' },
      price: { selector: '.price', type: 'text' },
      image: { selector: 'img', type: 'image' },
      url: { selector: 'a', type: 'link' },
      rating: { selector: '.rating', type: 'attr', attribute: 'data-rating' }
    }
  }
};

const response = await fetch(
  `https://api.ujeebu.com/scrape?url=${encodeURIComponent('https://shop.example.com/products')}&extract_rules=${encodeURIComponent(JSON.stringify(rules))}`,
  { headers: { 'ApiKey': 'YOUR_API_KEY' } }
);

const data = await response.json();
console.log(data.result.products);

import { UjeebuClient } from '@ujeebu-org/ujeebu-sdk';

const client = new UjeebuClient(process.env.UJEEBU_API_KEY);

const rules = {
  products: {
    selector: '.product-card',
    type: 'obj',
    multiple: true,
    children: {
      name: { selector: '.product-name', type: 'text' },
      price: { selector: '.price', type: 'text' },
      image: { selector: 'img', type: 'image' },
      url: { selector: 'a', type: 'link' },
      rating: { selector: '.rating', type: 'attr', attribute: 'data-rating' }
    }
  }
};

const result = await client.scrapeWithRules(
  'https://shop.example.com/products',
  rules
);

console.log(result.products);

import requests
import json

rules = {
    'products': {
        'selector': '.product-card',
        'type': 'obj',
        'multiple': True,
        'children': {
            'name': {'selector': '.product-name', 'type': 'text'},
            'price': {'selector': '.price', 'type': 'text'},
            'image': {'selector': 'img', 'type': 'image'},
            'url': {'selector': 'a', 'type': 'link'},
            'rating': {'selector': '.rating', 'type': 'attr', 'attribute': 'data-rating'}
        }
    }
}

response = requests.get(
    'https://api.ujeebu.com/scrape',
    params={
        'url': 'https://shop.example.com/products',
        'extract_rules': json.dumps(rules)
    },
    headers={'ApiKey': 'YOUR_API_KEY'}
)

print(response.json()['result']['products'])

from ujeebu_python import UjeebuClient

ujeebu = UjeebuClient(api_key="YOUR_API_KEY")

rules = {
    'products': {
        'selector': '.product-card',
        'type': 'obj',
        'multiple': True,
        'children': {
            'name': {'selector': '.product-name', 'type': 'text'},
            'price': {'selector': '.price', 'type': 'text'},
            'image': {'selector': 'img', 'type': 'image'},
            'url': {'selector': 'a', 'type': 'link'},
            'rating': {'selector': '.rating', 'type': 'attr', 'attribute': 'data-rating'}
        }
    }
}

result = ujeebu.scrape_with_rules(
    url='https://shop.example.com/products',
    extract_rules=rules
)

print(result['products'])

import okhttp3.*;
import org.json.*;
import java.net.URLEncoder;

OkHttpClient client = new OkHttpClient();

JSONObject rules = new JSONObject()
    .put("products", new JSONObject()
        .put("selector", ".product-card")
        .put("type", "obj")
        .put("multiple", true)
        .put("children", new JSONObject()
            .put("name", new JSONObject().put("selector", ".product-name").put("type", "text"))
            .put("price", new JSONObject().put("selector", ".price").put("type", "text"))
            .put("image", new JSONObject().put("selector", "img").put("type", "image"))
            .put("url", new JSONObject().put("selector", "a").put("type", "link"))
            .put("rating", new JSONObject()
                .put("selector", ".rating")
                .put("type", "attr")
                .put("attribute", "data-rating"))));

String url = "https://api.ujeebu.com/scrape?url=" +
    URLEncoder.encode("https://shop.example.com/products", "UTF-8") +
    "&extract_rules=" + URLEncoder.encode(rules.toString(), "UTF-8");

Request request = new Request.Builder()
    .url(url)
    .addHeader("ApiKey", "YOUR_API_KEY")
    .build();

Response response = client.newCall(request).execute();
System.out.println(response.body().string());

<?php

$rules = [
    'products' => [
        'selector' => '.product-card',
        'type' => 'obj',
        'multiple' => true,
        'children' => [
            'name' => ['selector' => '.product-name', 'type' => 'text'],
            'price' => ['selector' => '.price', 'type' => 'text'],
            'image' => ['selector' => 'img', 'type' => 'image'],
            'url' => ['selector' => 'a', 'type' => 'link'],
            'rating' => ['selector' => '.rating', 'type' => 'attr', 'attribute' => 'data-rating']
        ]
    ]
];

$url = 'https://api.ujeebu.com/scrape?' . http_build_query([
    'url' => 'https://shop.example.com/products',
    'extract_rules' => json_encode($rules)
]);

$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_HTTPHEADER, ['ApiKey: YOUR_API_KEY']);

$response = curl_exec($ch);
curl_close($ch);

$data = json_decode($response, true);
print_r($data['result']['products']);

package main

import (
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"net/url"
)

func main() {
	rules := map[string]interface{}{
		"products": map[string]interface{}{
			"selector": ".product-card",
			"type":     "obj",
			"multiple": true,
			"children": map[string]interface{}{
				"name":   map[string]string{"selector": ".product-name", "type": "text"},
				"price":  map[string]string{"selector": ".price", "type": "text"},
				"image":  map[string]string{"selector": "img", "type": "image"},
				"url":    map[string]string{"selector": "a", "type": "link"},
				"rating": map[string]string{"selector": ".rating", "type": "attr", "attribute": "data-rating"},
			},
		},
	}

	rulesJSON, _ := json.Marshal(rules)
	apiURL := fmt.Sprintf("https://api.ujeebu.com/scrape?url=%s&extract_rules=%s",
		url.QueryEscape("https://shop.example.com/products"),
		url.QueryEscape(string(rulesJSON)))

	req, _ := http.NewRequest("GET", apiURL, nil)
	req.Header.Set("ApiKey", "YOUR_API_KEY")

	client := &http.Client{}
	resp, _ := client.Do(req)
	defer resp.Body.Close()

	body, _ := io.ReadAll(resp.Body)
	fmt.Println(string(body))
}

package main

import (
	"fmt"
	"github.com/ujeebu/ujeebu-go"
)

func main() {
	client, _ := ujeebu.NewClient("YOUR-API-KEY")

	rules := map[string]interface{}{
		"products": map[string]interface{}{
			"selector": ".product-card",
			"type":     "obj",
			"multiple": true,
			"children": map[string]interface{}{
				"name":   map[string]string{"selector": ".product-name", "type": "text"},
				"price":  map[string]string{"selector": ".price", "type": "text"},
				"image":  map[string]string{"selector": "img", "type": "image"},
				"url":    map[string]string{"selector": "a", "type": "link"},
				"rating": map[string]string{"selector": ".rating", "type": "attr", "attribute": "data-rating"},
			},
		},
	}

	response, credits, _ := client.Scrape(ujeebu.ScrapeParams{
		URL:          "https://shop.example.com/products",
		ExtractRules: rules,
	})

	fmt.Printf("Credits used: %d\n", credits)
	fmt.Println(response)
}

News Article Scraping

Extract article data from a news site:

{
  "articles": {
    "selector": "article",
    "type": "obj",
    "multiple": true,
    "children": {
      "headline": { "selector": "h2", "type": "text" },
      "summary": { "selector": ".summary", "type": "text" },
      "author": { "selector": ".author", "type": "text" },
      "date": { "selector": "time", "type": "attr", "attribute": "datetime" },
      "link": { "selector": "a", "type": "link" },
      "thumbnail": { "selector": "img", "type": "image" }
    }
  }
}

curl -G 'https://api.ujeebu.com/scrape' \
  --data-urlencode 'url=https://news.example.com' \
  --data-urlencode 'extract_rules={"articles":{"selector":"article","type":"obj","multiple":true,"children":{"headline":{"selector":"h2","type":"text"},"summary":{"selector":".summary","type":"text"}}}}' \
  -H "ApiKey: YOUR_API_KEY"

const rules = {
  articles: {
    selector: 'article',
    type: 'obj',
    multiple: true,
    children: {
      headline: { selector: 'h2', type: 'text' },
      summary: { selector: '.summary', type: 'text' },
      author: { selector: '.author', type: 'text' },
      date: { selector: 'time', type: 'attr', attribute: 'datetime' },
      link: { selector: 'a', type: 'link' },
      thumbnail: { selector: 'img', type: 'image' }
    }
  }
};

const response = await fetch(
  `https://api.ujeebu.com/scrape?url=${encodeURIComponent('https://news.example.com')}&extract_rules=${encodeURIComponent(JSON.stringify(rules))}`,
  { headers: { 'ApiKey': 'YOUR_API_KEY' } }
);

const data = await response.json();
console.log(data.result.articles);

import { UjeebuClient } from '@ujeebu-org/ujeebu-sdk';

const client = new UjeebuClient(process.env.UJEEBU_API_KEY);

const rules = {
  articles: {
    selector: 'article',
    type: 'obj',
    multiple: true,
    children: {
      headline: { selector: 'h2', type: 'text' },
      summary: { selector: '.summary', type: 'text' },
      author: { selector: '.author', type: 'text' },
      date: { selector: 'time', type: 'attr', attribute: 'datetime' },
      link: { selector: 'a', type: 'link' },
      thumbnail: { selector: 'img', type: 'image' }
    }
  }
};

const result = await client.scrapeWithRules(
  'https://news.example.com',
  rules
);

console.log(result.articles);

import requests
import json

rules = {
    'articles': {
        'selector': 'article',
        'type': 'obj',
        'multiple': True,
        'children': {
            'headline': {'selector': 'h2', 'type': 'text'},
            'summary': {'selector': '.summary', 'type': 'text'},
            'author': {'selector': '.author', 'type': 'text'},
            'date': {'selector': 'time', 'type': 'attr', 'attribute': 'datetime'},
            'link': {'selector': 'a', 'type': 'link'},
            'thumbnail': {'selector': 'img', 'type': 'image'}
        }
    }
}

response = requests.get(
    'https://api.ujeebu.com/scrape',
    params={
        'url': 'https://news.example.com',
        'extract_rules': json.dumps(rules)
    },
    headers={'ApiKey': 'YOUR_API_KEY'}
)

print(response.json()['result']['articles'])

from ujeebu_python import UjeebuClient

ujeebu = UjeebuClient(api_key="YOUR_API_KEY")

rules = {
    'articles': {
        'selector': 'article',
        'type': 'obj',
        'multiple': True,
        'children': {
            'headline': {'selector': 'h2', 'type': 'text'},
            'summary': {'selector': '.summary', 'type': 'text'},
            'author': {'selector': '.author', 'type': 'text'},
            'date': {'selector': 'time', 'type': 'attr', 'attribute': 'datetime'},
            'link': {'selector': 'a', 'type': 'link'},
            'thumbnail': {'selector': 'img', 'type': 'image'}
        }
    }
}

result = ujeebu.scrape_with_rules(
    url='https://news.example.com',
    extract_rules=rules
)

print(result['articles'])

import okhttp3.*;
import org.json.*;
import java.net.URLEncoder;

OkHttpClient client = new OkHttpClient();

JSONObject rules = new JSONObject()
    .put("articles", new JSONObject()
        .put("selector", "article")
        .put("type", "obj")
        .put("multiple", true)
        .put("children", new JSONObject()
            .put("headline", new JSONObject().put("selector", "h2").put("type", "text"))
            .put("summary", new JSONObject().put("selector", ".summary").put("type", "text"))
            .put("author", new JSONObject().put("selector", ".author").put("type", "text"))
            .put("date", new JSONObject().put("selector", "time").put("type", "attr").put("attribute", "datetime"))
            .put("link", new JSONObject().put("selector", "a").put("type", "link"))
            .put("thumbnail", new JSONObject().put("selector", "img").put("type", "image"))));

String url = "https://api.ujeebu.com/scrape?url=" +
    URLEncoder.encode("https://news.example.com", "UTF-8") +
    "&extract_rules=" + URLEncoder.encode(rules.toString(), "UTF-8");

Request request = new Request.Builder()
    .url(url)
    .addHeader("ApiKey", "YOUR_API_KEY")
    .build();

Response response = client.newCall(request).execute();
System.out.println(response.body().string());

<?php

$rules = [
    'articles' => [
        'selector' => 'article',
        'type' => 'obj',
        'multiple' => true,
        'children' => [
            'headline' => ['selector' => 'h2', 'type' => 'text'],
            'summary' => ['selector' => '.summary', 'type' => 'text'],
            'author' => ['selector' => '.author', 'type' => 'text'],
            'date' => ['selector' => 'time', 'type' => 'attr', 'attribute' => 'datetime'],
            'link' => ['selector' => 'a', 'type' => 'link'],
            'thumbnail' => ['selector' => 'img', 'type' => 'image']
        ]
    ]
];

$url = 'https://api.ujeebu.com/scrape?' . http_build_query([
    'url' => 'https://news.example.com',
    'extract_rules' => json_encode($rules)
]);

$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_HTTPHEADER, ['ApiKey: YOUR_API_KEY']);

$response = curl_exec($ch);
curl_close($ch);

$data = json_decode($response, true);
print_r($data['result']['articles']);

package main

import (
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"net/url"
)

// main builds the article extraction rules, sends them to the Ujeebu Scrape
// API as a URL-encoded JSON query parameter, and prints the raw response body.
// Every fallible step is checked so a failed request is reported instead of
// causing a nil-pointer panic on resp.
func main() {
	rules := map[string]interface{}{
		"articles": map[string]interface{}{
			"selector": "article",
			"type":     "obj",
			"multiple": true,
			"children": map[string]interface{}{
				"headline":  map[string]string{"selector": "h2", "type": "text"},
				"summary":   map[string]string{"selector": ".summary", "type": "text"},
				"author":    map[string]string{"selector": ".author", "type": "text"},
				"date":      map[string]string{"selector": "time", "type": "attr", "attribute": "datetime"},
				"link":      map[string]string{"selector": "a", "type": "link"},
				"thumbnail": map[string]string{"selector": "img", "type": "image"},
			},
		},
	}

	rulesJSON, err := json.Marshal(rules)
	if err != nil {
		fmt.Println("failed to encode extract rules:", err)
		return
	}
	apiURL := fmt.Sprintf("https://api.ujeebu.com/scrape?url=%s&extract_rules=%s",
		url.QueryEscape("https://news.example.com"),
		url.QueryEscape(string(rulesJSON)))

	req, err := http.NewRequest("GET", apiURL, nil)
	if err != nil {
		fmt.Println("failed to build request:", err)
		return
	}
	req.Header.Set("ApiKey", "YOUR_API_KEY")

	client := &http.Client{}
	resp, err := client.Do(req)
	if err != nil {
		// resp is nil here, so it must not be touched.
		fmt.Println("request failed:", err)
		return
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		fmt.Println("failed to read response body:", err)
		return
	}
	fmt.Println(string(body))
}

package main

import (
	"fmt"
	"github.com/ujeebu/ujeebu-go"
)

// main scrapes the news page through the Ujeebu Go SDK using the article
// extraction rules, then prints the credits used and the response.
// Errors from client creation and from the scrape call are reported instead
// of being discarded.
func main() {
	client, err := ujeebu.NewClient("YOUR-API-KEY")
	if err != nil {
		fmt.Println("failed to create client:", err)
		return
	}

	rules := map[string]interface{}{
		"articles": map[string]interface{}{
			"selector": "article",
			"type":     "obj",
			"multiple": true,
			"children": map[string]interface{}{
				"headline":  map[string]string{"selector": "h2", "type": "text"},
				"summary":   map[string]string{"selector": ".summary", "type": "text"},
				"author":    map[string]string{"selector": ".author", "type": "text"},
				"date":      map[string]string{"selector": "time", "type": "attr", "attribute": "datetime"},
				"link":      map[string]string{"selector": "a", "type": "link"},
				"thumbnail": map[string]string{"selector": "img", "type": "image"},
			},
		},
	}

	response, credits, err := client.Scrape(ujeebu.ScrapeParams{
		URL:          "https://news.example.com",
		ExtractRules: rules,
	})
	if err != nil {
		fmt.Println("scrape failed:", err)
		return
	}

	fmt.Printf("Credits used: %d\n", credits)
	fmt.Println(response)
}

Table Data Extraction

Extract data from HTML tables:

{
  "rows": {
    "selector": "table tbody tr",
    "type": "obj",
    "multiple": true,
    "children": {
      "column1": { "selector": "td:nth-child(1)", "type": "text" },
      "column2": { "selector": "td:nth-child(2)", "type": "text" },
      "column3": { "selector": "td:nth-child(3)", "type": "text" }
    }
  }
}

# Sends the same three-column row rules as the JSON definition above;
# -G with --data-urlencode puts each parameter, URL-encoded, in the query string.
curl -G 'https://api.ujeebu.com/scrape' \
  --data-urlencode 'url=https://data.example.com/table' \
  --data-urlencode 'extract_rules={"rows":{"selector":"table tbody tr","type":"obj","multiple":true,"children":{"column1":{"selector":"td:nth-child(1)","type":"text"},"column2":{"selector":"td:nth-child(2)","type":"text"},"column3":{"selector":"td:nth-child(3)","type":"text"}}}}' \
  -H "ApiKey: YOUR_API_KEY"

const rules = {
  rows: {
    selector: 'table tbody tr',
    type: 'obj',
    multiple: true,
    children: {
      column1: { selector: 'td:nth-child(1)', type: 'text' },
      column2: { selector: 'td:nth-child(2)', type: 'text' },
      column3: { selector: 'td:nth-child(3)', type: 'text' }
    }
  }
};

const response = await fetch(
  `https://api.ujeebu.com/scrape?url=${encodeURIComponent('https://data.example.com/table')}&extract_rules=${encodeURIComponent(JSON.stringify(rules))}`,
  { headers: { 'ApiKey': 'YOUR_API_KEY' } }
);

const data = await response.json();
console.log(data.result.rows);

import { UjeebuClient } from '@ujeebu-org/ujeebu-sdk';

const client = new UjeebuClient(process.env.UJEEBU_API_KEY);

const rules = {
  rows: {
    selector: 'table tbody tr',
    type: 'obj',
    multiple: true,
    children: {
      column1: { selector: 'td:nth-child(1)', type: 'text' },
      column2: { selector: 'td:nth-child(2)', type: 'text' },
      column3: { selector: 'td:nth-child(3)', type: 'text' }
    }
  }
};

const result = await client.scrapeWithRules(
  'https://data.example.com/table',
  rules
);

console.log(result.rows);

import requests
import json

rules = {
    'rows': {
        'selector': 'table tbody tr',
        'type': 'obj',
        'multiple': True,
        'children': {
            'column1': {'selector': 'td:nth-child(1)', 'type': 'text'},
            'column2': {'selector': 'td:nth-child(2)', 'type': 'text'},
            'column3': {'selector': 'td:nth-child(3)', 'type': 'text'}
        }
    }
}

response = requests.get(
    'https://api.ujeebu.com/scrape',
    params={
        'url': 'https://data.example.com/table',
        'extract_rules': json.dumps(rules)
    },
    headers={'ApiKey': 'YOUR_API_KEY'}
)

print(response.json()['result']['rows'])

from ujeebu_python import UjeebuClient

ujeebu = UjeebuClient(api_key="YOUR_API_KEY")

rules = {
    'rows': {
        'selector': 'table tbody tr',
        'type': 'obj',
        'multiple': True,
        'children': {
            'column1': {'selector': 'td:nth-child(1)', 'type': 'text'},
            'column2': {'selector': 'td:nth-child(2)', 'type': 'text'},
            'column3': {'selector': 'td:nth-child(3)', 'type': 'text'}
        }
    }
}

result = ujeebu.scrape_with_rules(
    url='https://data.example.com/table',
    extract_rules=rules
)

print(result['rows'])

import okhttp3.*;
import org.json.*;
import java.net.URLEncoder;

OkHttpClient client = new OkHttpClient();

JSONObject rules = new JSONObject()
    .put("rows", new JSONObject()
        .put("selector", "table tbody tr")
        .put("type", "obj")
        .put("multiple", true)
        .put("children", new JSONObject()
            .put("column1", new JSONObject().put("selector", "td:nth-child(1)").put("type", "text"))
            .put("column2", new JSONObject().put("selector", "td:nth-child(2)").put("type", "text"))
            .put("column3", new JSONObject().put("selector", "td:nth-child(3)").put("type", "text"))));

String url = "https://api.ujeebu.com/scrape?url=" +
    URLEncoder.encode("https://data.example.com/table", "UTF-8") +
    "&extract_rules=" + URLEncoder.encode(rules.toString(), "UTF-8");

Request request = new Request.Builder()
    .url(url)
    .addHeader("ApiKey", "YOUR_API_KEY")
    .build();

Response response = client.newCall(request).execute();
System.out.println(response.body().string());

<?php

$rules = [
    'rows' => [
        'selector' => 'table tbody tr',
        'type' => 'obj',
        'multiple' => true,
        'children' => [
            'column1' => ['selector' => 'td:nth-child(1)', 'type' => 'text'],
            'column2' => ['selector' => 'td:nth-child(2)', 'type' => 'text'],
            'column3' => ['selector' => 'td:nth-child(3)', 'type' => 'text']
        ]
    ]
];

$url = 'https://api.ujeebu.com/scrape?' . http_build_query([
    'url' => 'https://data.example.com/table',
    'extract_rules' => json_encode($rules)
]);

$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_HTTPHEADER, ['ApiKey: YOUR_API_KEY']);

$response = curl_exec($ch);
curl_close($ch);

$data = json_decode($response, true);
print_r($data['result']['rows']);

package main

import (
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"net/url"
)

// main builds the table-row extraction rules, sends them to the Ujeebu Scrape
// API as a URL-encoded JSON query parameter, and prints the raw response body.
// Every fallible step is checked so a failed request is reported instead of
// causing a nil-pointer panic on resp.
func main() {
	rules := map[string]interface{}{
		"rows": map[string]interface{}{
			"selector": "table tbody tr",
			"type":     "obj",
			"multiple": true,
			"children": map[string]interface{}{
				"column1": map[string]string{"selector": "td:nth-child(1)", "type": "text"},
				"column2": map[string]string{"selector": "td:nth-child(2)", "type": "text"},
				"column3": map[string]string{"selector": "td:nth-child(3)", "type": "text"},
			},
		},
	}

	rulesJSON, err := json.Marshal(rules)
	if err != nil {
		fmt.Println("failed to encode extract rules:", err)
		return
	}
	apiURL := fmt.Sprintf("https://api.ujeebu.com/scrape?url=%s&extract_rules=%s",
		url.QueryEscape("https://data.example.com/table"),
		url.QueryEscape(string(rulesJSON)))

	req, err := http.NewRequest("GET", apiURL, nil)
	if err != nil {
		fmt.Println("failed to build request:", err)
		return
	}
	req.Header.Set("ApiKey", "YOUR_API_KEY")

	client := &http.Client{}
	resp, err := client.Do(req)
	if err != nil {
		// resp is nil here, so it must not be touched.
		fmt.Println("request failed:", err)
		return
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		fmt.Println("failed to read response body:", err)
		return
	}
	fmt.Println(string(body))
}

package main

import (
	"fmt"
	"github.com/ujeebu/ujeebu-go"
)

// main scrapes the table page through the Ujeebu Go SDK using the row
// extraction rules, then prints the credits used and the response.
// Errors from client creation and from the scrape call are reported instead
// of being discarded.
func main() {
	client, err := ujeebu.NewClient("YOUR-API-KEY")
	if err != nil {
		fmt.Println("failed to create client:", err)
		return
	}

	rules := map[string]interface{}{
		"rows": map[string]interface{}{
			"selector": "table tbody tr",
			"type":     "obj",
			"multiple": true,
			"children": map[string]interface{}{
				"column1": map[string]string{"selector": "td:nth-child(1)", "type": "text"},
				"column2": map[string]string{"selector": "td:nth-child(2)", "type": "text"},
				"column3": map[string]string{"selector": "td:nth-child(3)", "type": "text"},
			},
		},
	}

	response, credits, err := client.Scrape(ujeebu.ScrapeParams{
		URL:          "https://data.example.com/table",
		ExtractRules: rules,
	})
	if err != nil {
		fmt.Println("scrape failed:", err)
		return
	}

	fmt.Printf("Credits used: %d\n", credits)
	fmt.Println(response)
}

Advanced: Using Function Type

Extract data using custom JavaScript to access window variables, compute values, and use $parent:

{
  "products": {
    "selector": ".product-item",
    "type": "obj",
    "multiple": true,
    "children": {
      "name": { "selector": ".product-name", "type": "text" },
      "price": { "selector": ".price", "type": "text" },
      "sku": {
        "type": "fn",
        "fn": "return $parent.getAttribute('data-sku');"
      },
      "inStock": {
        "type": "fn",
        "fn": "return $parent.classList.contains('in-stock');"
      },
      "rating": {
        "type": "fn",
        "fn": "const stars = $parent.querySelectorAll('.star.filled').length; return stars;"
      }
    }
  },
  "pageConfig": {
    "type": "fn",
    "fn": "return window._pageConfig_ || {};"
  },
  "totalItems": {
    "type": "fn",
    "fn": "return document.querySelectorAll('.product-item').length;"
  }
}

# The whole JSON document is wrapped in single quotes for the shell, so the
# JavaScript inside each "fn" uses JSON-escaped double quotes (\") for its
# string literals; a bare single quote would end the shell argument early.
curl -G 'https://api.ujeebu.com/scrape' \
  --data-urlencode 'url=https://shop.example.com' \
  --data-urlencode 'js=true' \
  --data-urlencode 'extract_rules={"products":{"selector":".product-item","type":"obj","multiple":true,"children":{"name":{"selector":".product-name","type":"text"},"price":{"selector":".price","type":"text"},"sku":{"type":"fn","fn":"return $parent.getAttribute(\"data-sku\");"},"inStock":{"type":"fn","fn":"return $parent.classList.contains(\"in-stock\");"},"rating":{"type":"fn","fn":"const stars = $parent.querySelectorAll(\".star.filled\").length; return stars;"}}},"pageConfig":{"type":"fn","fn":"return window._pageConfig_ || {};"},"totalItems":{"type":"fn","fn":"return document.querySelectorAll(\".product-item\").length;"}}' \
  -H "ApiKey: YOUR_API_KEY"

import { UjeebuClient } from '@ujeebu-org/ujeebu-sdk';

const client = new UjeebuClient(process.env.UJEEBU_API_KEY);

const rules = {
  products: {
    selector: '.product-item',
    type: 'obj',
    multiple: true,
    children: {
      name: { selector: '.product-name', type: 'text' },
      price: { selector: '.price', type: 'text' },
      sku: {
        type: 'fn',
        fn: "return $parent.getAttribute('data-sku');"
      },
      inStock: {
        type: 'fn',
        fn: "return $parent.classList.contains('in-stock');"
      },
      rating: {
        type: 'fn',
        fn: "const stars = $parent.querySelectorAll('.star.filled').length; return stars;"
      }
    }
  },
  pageConfig: {
    type: 'fn',
    fn: "return window._pageConfig_ || {};"
  },
  totalItems: {
    type: 'fn',
    fn: "return document.querySelectorAll('.product-item').length;"
  }
};

const result = await client.scrapeWithRules(
  'https://shop.example.com',
  rules,
  { js: true }
);

console.log(result);

from ujeebu_python import UjeebuClient

ujeebu = UjeebuClient(api_key="YOUR_API_KEY")

rules = {
    'products': {
        'selector': '.product-item',
        'type': 'obj',
        'multiple': True,
        'children': {
            'name': {'selector': '.product-name', 'type': 'text'},
            'price': {'selector': '.price', 'type': 'text'},
            'sku': {
                'type': 'fn',
                'fn': "return $parent.getAttribute('data-sku');"
            },
            'inStock': {
                'type': 'fn',
                'fn': "return $parent.classList.contains('in-stock');"
            },
            'rating': {
                'type': 'fn',
                'fn': "const stars = $parent.querySelectorAll('.star.filled').length; return stars;"
            }
        }
    },
    'pageConfig': {
        'type': 'fn',
        'fn': "return window._pageConfig_ || {};"
    },
    'totalItems': {
        'type': 'fn',
        'fn': "return document.querySelectorAll('.product-item').length;"
    }
}

result = ujeebu.scrape_with_rules(
    url='https://shop.example.com',
    extract_rules=rules,
    params={'js': True}
)

print(result)

package main

import (
	"fmt"
	"github.com/ujeebu/ujeebu-go"
)

// main scrapes the shop page with JavaScript rendering enabled, using "fn"
// rules that read attributes and state from $parent and from the page itself,
// then prints the credits used and the response.
// Errors from client creation and from the scrape call are reported instead
// of being discarded.
func main() {
	client, err := ujeebu.NewClient("YOUR-API-KEY")
	if err != nil {
		fmt.Println("failed to create client:", err)
		return
	}

	rules := map[string]interface{}{
		"products": map[string]interface{}{
			"selector": ".product-item",
			"type":     "obj",
			"multiple": true,
			"children": map[string]interface{}{
				"name":  map[string]string{"selector": ".product-name", "type": "text"},
				"price": map[string]string{"selector": ".price", "type": "text"},
				"sku": map[string]string{
					"type": "fn",
					"fn":   "return $parent.getAttribute('data-sku');",
				},
				"inStock": map[string]string{
					"type": "fn",
					"fn":   "return $parent.classList.contains('in-stock');",
				},
				// Counts the filled star elements inside each product item.
				"rating": map[string]string{
					"type": "fn",
					"fn":   "const stars = $parent.querySelectorAll('.star.filled').length; return stars;",
				},
			},
		},
		"pageConfig": map[string]string{
			"type": "fn",
			"fn":   "return window._pageConfig_ || {};",
		},
		"totalItems": map[string]string{
			"type": "fn",
			"fn":   "return document.querySelectorAll('.product-item').length;",
		},
	}

	response, credits, err := client.Scrape(ujeebu.ScrapeParams{
		URL:          "https://shop.example.com",
		JS:           true,
		ExtractRules: rules,
	})
	if err != nil {
		fmt.Println("scrape failed:", err)
		return
	}

	fmt.Printf("Credits used: %d\n", credits)
	fmt.Println(response)
}

Expected Response:

{
  "success": true,
  "result": {
    "products": [
      {
        "name": "Premium Headphones",
        "price": "$199.99",
        "sku": "HD-2024-BLK",
        "inStock": true,
        "rating": 5
      },
      {
        "name": "Wireless Mouse",
        "price": "$49.99",
        "sku": "MS-2024-GRY",
        "inStock": false,
        "rating": 4
      }
    ],
    "pageConfig": {
      "currency": "USD",
      "locale": "en-US",
      "version": "2.1.0"
    },
    "totalItems": 24
  }
}

TIP - Function Type Use Cases

The fn type is perfect for:

Extracting data from JavaScript variables (e.g., window.__NEXT_DATA__)

Accessing parent element attributes with $parent

Computing derived values (totals, averages, etc.)

Parsing JSON from script tags

Checking element states (classes, visibility)

Accessing localStorage/sessionStorage

Complex conditional logic

Advanced: Table and Regex Extraction

Extract product specifications from a table and SKU from text using table and regex types:

{
  "product_name": {
    "selector": "h1.product-title",
    "type": "text"
  },
  "sku": {
    "type": "regex",
    "selector": ".product-info",
    "pattern": "SKU:\\s*([A-Z0-9-]+)"
  },
  "specifications": {
    "selector": "table.specs",
    "type": "table"
  },
  "product_details": {
    "selector": "table.details",
    "type": "tableTranspose"
  },
  "price": {
    "type": "regex",
    "selector": ".price-text",
    "pattern": "\\$([0-9,.]+)"
  }
}

# Inside single quotes the shell passes backslashes through unchanged, so the
# regex patterns need JSON-level escaping only: \\s and \\$ in the JSON text
# decode to the regex tokens \s and \$.
curl -G 'https://api.ujeebu.com/scrape' \
  --data-urlencode 'url=https://shop.example.com/product/123' \
  --data-urlencode 'extract_rules={"product_name":{"selector":"h1.product-title","type":"text"},"sku":{"type":"regex","selector":".product-info","pattern":"SKU:\\s*([A-Z0-9-]+)"},"specifications":{"selector":"table.specs","type":"table"},"product_details":{"selector":"table.details","type":"tableTranspose"},"price":{"type":"regex","selector":".price-text","pattern":"\\$([0-9,.]+)"}}' \
  -H "ApiKey: YOUR_API_KEY"

import { UjeebuClient } from '@ujeebu-org/ujeebu-sdk';

const client = new UjeebuClient(process.env.UJEEBU_API_KEY);

// Extraction rules for a product page: title text, regex-captured SKU and
// price, and two table layouts.
// The patterns are ordinary JS string literals, so each regex backslash is
// escaped exactly once ('\\s' is the regex token \s, '\\$' a literal dollar).
const rules = {
  product_name: {
    selector: 'h1.product-title',
    type: 'text'
  },
  sku: {
    type: 'regex',
    selector: '.product-info',
    pattern: 'SKU:\\s*([A-Z0-9-]+)'
  },
  specifications: {
    selector: 'table.specs',
    type: 'table'
  },
  product_details: {
    selector: 'table.details',
    type: 'tableTranspose'
  },
  price: {
    type: 'regex',
    selector: '.price-text',
    pattern: '\\$([0-9,.]+)'
  }
};

const result = await client.scrapeWithRules(
  'https://shop.example.com/product/123',
  rules
);

console.log(result);

from ujeebu_python import UjeebuClient

ujeebu = UjeebuClient(api_key="YOUR_API_KEY")

rules = {
    'product_name': {
        'selector': 'h1.product-title',
        'type': 'text'
    },
    'sku': {
        'type': 'regex',
        'selector': '.product-info',
        'pattern': r'SKU:\s*([A-Z0-9-]+)'
    },
    'specifications': {
        'selector': 'table.specs',
        'type': 'table'
    },
    'product_details': {
        'selector': 'table.details',
        'type': 'tableTranspose'
    },
    'price': {
        'type': 'regex',
        'selector': '.price-text',
        'pattern': r'\$([0-9,.]+)'
    }
}

result = ujeebu.scrape_with_rules(
    url='https://shop.example.com/product/123',
    extract_rules=rules
)

print(result)

package main

import (
	"fmt"
	"github.com/ujeebu/ujeebu-go"
)

// main scrapes a product page through the Ujeebu Go SDK using text, regex,
// table and tableTranspose rules, then prints the credits used and the
// response. Regex patterns are raw (backtick) strings, so they need no extra
// escaping. Errors from client creation and from the scrape call are reported
// instead of being discarded.
func main() {
	client, err := ujeebu.NewClient("YOUR-API-KEY")
	if err != nil {
		fmt.Println("failed to create client:", err)
		return
	}

	rules := map[string]interface{}{
		"product_name": map[string]string{
			"selector": "h1.product-title",
			"type":     "text",
		},
		"sku": map[string]string{
			"type":     "regex",
			"selector": ".product-info",
			"pattern":  `SKU:\s*([A-Z0-9-]+)`,
		},
		"specifications": map[string]string{
			"selector": "table.specs",
			"type":     "table",
		},
		"product_details": map[string]string{
			"selector": "table.details",
			"type":     "tableTranspose",
		},
		"price": map[string]string{
			"type":     "regex",
			"selector": ".price-text",
			"pattern":  `\$([0-9,.]+)`,
		},
	}

	response, credits, err := client.Scrape(ujeebu.ScrapeParams{
		URL:          "https://shop.example.com/product/123",
		ExtractRules: rules,
	})
	if err != nil {
		fmt.Println("scrape failed:", err)
		return
	}

	fmt.Printf("Credits used: %d\n", credits)
	fmt.Println(response)
}

Expected Response:

{
  "success": true,
  "result": {
    "product_name": "Premium Wireless Headphones",
    "sku": "HD-2024-BLK-256",
    "price": "199.99",
    "specifications": [
      {"feature": "Battery Life", "value": "40 hours"},
      {"feature": "Bluetooth", "value": "5.3"},
      {"feature": "Weight", "value": "250g"},
      {"feature": "Charging", "value": "USB-C"}
    ],
    "product_details": {
      "brand": "AudioTech",
      "model": "HD-2024",
      "color": "Black",
      "warranty": "2 years"
    }
  }
}

TIP - Table and Regex Use Cases

Table extraction: Product specifications, pricing tables, feature comparisons, data tables

Regex extraction: SKUs, prices, phone numbers, email addresses, dates, order numbers, tracking IDs

Combine both: Extract structured tabular data alongside pattern-based text extraction for comprehensive scraping

Advanced: E-commerce Product with Transformations

Extract and transform product data with clean, structured output:

{
  "products": {
    "selector": ".product-card",
    "type": "obj",
    "multiple": true,
    "children": {
      "name": {
        "selector": ".product-name",
        "type": "text",
        "transform": ["trim", "normalizeWhitespace"]
      },
      "price": {
        "selector": ".price",
        "type": "text",
        "transform": ["currency", ["round", 2]]
      },
      "original_price": {
        "selector": ".original-price",
        "type": "text",
        "transform": ["currency", ["default", null]]
      },
      "discount_percent": {
        "selector": ".discount",
        "type": "text",
        "transform": [["match", "(\\d+)%", 1], "integer"]
      },
      "rating": {
        "selector": ".rating",
        "type": "text",
        "transform": ["number", ["round", 1], ["default", 0]]
      },
      "reviews_count": {
        "selector": ".reviews",
        "type": "text",
        "transform": [["match", "(\\d+)", 1], "integer"]
      },
      "tags": {
        "selector": ".tags",
        "type": "text",
        "transform": [["split", ","], "compact", "unique", "lowercase"]
      },
      "in_stock": {
        "selector": ".stock-status",
        "type": "text",
        "transform": ["lowercase", "boolean"]
      },
      "images": {
        "selector": ".product-images img",
        "type": "image",
        "multiple": true,
        "transform": ["compact", "unique"]
      }
    }
  }
}

import { UjeebuClient } from '@ujeebu-org/ujeebu-sdk';

const client = new UjeebuClient(process.env.UJEEBU_API_KEY);

// Extraction rules for product cards, each field followed by a transform
// pipeline that cleans the text and converts it to its final type.
// The `match` patterns are ordinary JS string literals, so the regex token \d
// is written with a single escaped backslash ('\\d').
const rules = {
  products: {
    selector: '.product-card',
    type: 'obj',
    multiple: true,
    children: {
      name: {
        selector: '.product-name',
        type: 'text',
        transform: ['trim', 'normalizeWhitespace']
      },
      price: {
        selector: '.price',
        type: 'text',
        transform: ['currency', ['round', 2]]
      },
      original_price: {
        selector: '.original-price',
        type: 'text',
        transform: ['currency', ['default', null]]
      },
      discount_percent: {
        selector: '.discount',
        type: 'text',
        transform: [['match', '(\\d+)%', 1], 'integer']
      },
      rating: {
        selector: '.rating',
        type: 'text',
        transform: ['number', ['round', 1], ['default', 0]]
      },
      reviews_count: {
        selector: '.reviews',
        type: 'text',
        transform: [['match', '(\\d+)', 1], 'integer']
      },
      tags: {
        selector: '.tags',
        type: 'text',
        transform: [['split', ','], 'compact', 'unique', 'lowercase']
      },
      in_stock: {
        selector: '.stock-status',
        type: 'text',
        transform: ['lowercase', 'boolean']
      },
      images: {
        selector: '.product-images img',
        type: 'image',
        multiple: true,
        transform: ['compact', 'unique']
      }
    }
  }
};

const result = await client.scrapeWithRules(
  'https://shop.example.com',
  rules
);

console.log(result);

from ujeebu_python import UjeebuClient

ujeebu = UjeebuClient(api_key="YOUR_API_KEY")

rules = {
    'products': {
        'selector': '.product-card',
        'type': 'obj',
        'multiple': True,
        'children': {
            'name': {
                'selector': '.product-name',
                'type': 'text',
                'transform': ['trim', 'normalizeWhitespace']
            },
            'price': {
                'selector': '.price',
                'type': 'text',
                'transform': ['currency', ['round', 2]]
            },
            'original_price': {
                'selector': '.original-price',
                'type': 'text',
                'transform': ['currency', ['default', None]]
            },
            'discount_percent': {
                'selector': '.discount',
                'type': 'text',
                'transform': [['match', r'(\d+)%', 1], 'integer']
            },
            'rating': {
                'selector': '.rating',
                'type': 'text',
                'transform': ['number', ['round', 1], ['default', 0]]
            },
            'reviews_count': {
                'selector': '.reviews',
                'type': 'text',
                'transform': [['match', r'(\d+)', 1], 'integer']
            },
            'tags': {
                'selector': '.tags',
                'type': 'text',
                'transform': [['split', ','], 'compact', 'unique', 'lowercase']
            },
            'in_stock': {
                'selector': '.stock-status',
                'type': 'text',
                'transform': ['lowercase', 'boolean']
            },
            'images': {
                'selector': '.product-images img',
                'type': 'image',
                'multiple': True,
                'transform': ['compact', 'unique']
            }
        }
    }
}

result = ujeebu.scrape_with_rules(
    url='https://shop.example.com',
    extract_rules=rules
)

print(result)

package main

import (
	"fmt"
	"github.com/ujeebu/ujeebu-go"
)

// main scrapes product cards through the Ujeebu Go SDK, applying a transform
// pipeline to each field, then prints the credits used and the response.
// The rule set mirrors the JSON definition field for field; nil encodes as
// JSON null. Errors from client creation and from the scrape call are
// reported instead of being discarded.
func main() {
	client, err := ujeebu.NewClient("YOUR-API-KEY")
	if err != nil {
		fmt.Println("failed to create client:", err)
		return
	}

	rules := map[string]interface{}{
		"products": map[string]interface{}{
			"selector": ".product-card",
			"type":     "obj",
			"multiple": true,
			"children": map[string]interface{}{
				"name": map[string]interface{}{
					"selector":  ".product-name",
					"type":      "text",
					"transform": []string{"trim", "normalizeWhitespace"},
				},
				"price": map[string]interface{}{
					"selector":  ".price",
					"type":      "text",
					"transform": []interface{}{"currency", []interface{}{"round", 2}},
				},
				"original_price": map[string]interface{}{
					"selector":  ".original-price",
					"type":      "text",
					"transform": []interface{}{"currency", []interface{}{"default", nil}},
				},
				"discount_percent": map[string]interface{}{
					"selector":  ".discount",
					"type":      "text",
					"transform": []interface{}{[]interface{}{"match", `(\d+)%`, 1}, "integer"},
				},
				"rating": map[string]interface{}{
					"selector":  ".rating",
					"type":      "text",
					"transform": []interface{}{"number", []interface{}{"round", 1}, []interface{}{"default", 0}},
				},
				"reviews_count": map[string]interface{}{
					"selector":  ".reviews",
					"type":      "text",
					"transform": []interface{}{[]interface{}{"match", `(\d+)`, 1}, "integer"},
				},
				"tags": map[string]interface{}{
					"selector":  ".tags",
					"type":      "text",
					"transform": []interface{}{[]string{"split", ","}, "compact", "unique", "lowercase"},
				},
				"in_stock": map[string]interface{}{
					"selector":  ".stock-status",
					"type":      "text",
					"transform": []string{"lowercase", "boolean"},
				},
				"images": map[string]interface{}{
					"selector":  ".product-images img",
					"type":      "image",
					"multiple":  true,
					"transform": []string{"compact", "unique"},
				},
			},
		},
	}

	response, credits, err := client.Scrape(ujeebu.ScrapeParams{
		URL:          "https://shop.example.com",
		ExtractRules: rules,
	})
	if err != nil {
		fmt.Println("scrape failed:", err)
		return
	}

	fmt.Printf("Credits used: %d\n", credits)
	fmt.Println(response)
}

Expected Response:

{
  "success": true,
  "result": {
    "products": [
      {
        "name": "Wireless Headphones Premium",
        "price": 199.99,
        "original_price": 249.99,
        "discount_percent": 20,
        "rating": 4.5,
        "reviews_count": 1234,
        "tags": ["electronics", "audio", "wireless"],
        "in_stock": true,
        "images": [
          "https://cdn.example.com/img1.jpg",
          "https://cdn.example.com/img2.jpg"
        ]
      },
      {
        "name": "Smart Watch Pro",
        "price": 299.99,
        "original_price": null,
        "discount_percent": null,
        "rating": 4.8,
        "reviews_count": 567,
        "tags": ["electronics", "wearable", "fitness"],
        "in_stock": false,
        "images": [
          "https://cdn.example.com/watch1.jpg"
        ]
      }
    ]
  }
}

What This Example Demonstrates:

Price cleaning: Convert "$199.99" to 199.99 (number)
Discount extraction: Extract 20 from "Save 20%!"
Rating normalization: Round 4.567 to 4.6
Review count extraction: Extract 1234 from "1234 reviews"
Tag processing: Split, clean, deduplicate, and lowercase tags
Boolean conversion: Convert "in stock" to true
Array deduplication: Remove duplicate images
Fallback values: Use null when original price is missing

SUCCESS - Transformation Best Practices

When building extraction rules with transformations:

Chain transforms logically: Start with text cleaning (trim), then extraction (match, split), then type conversion (number, boolean)

Use defaults for optional fields: Give missing values an explicit fallback with ["default", 0], or keep them as an explicit null with ["default", null]

Clean arrays: Use compact to remove empty values and unique to deduplicate

Parse numbers correctly: Use currency for prices, number for decimals, integer for whole numbers

Extract before converting: Use match to extract patterns before converting types

Test incrementally: Build your transform pipeline step by step to catch errors early

Parameters

Parameter	Type	Required	Default	Description
`url`	`string`	Yes	`-`	The URL to scrape.
`extract_rules`	`json-string`	Yes	`-`	JSON object defining extraction rules.
`js`	`boolean`	No	`false`	Enable JavaScript rendering before extraction.
`wait_for`	`string | number`	No	`null`	CSS selector or delay to wait for before extraction.
`timeout`	`number`	No	`60`	Maximum seconds before request timeout.
`proxy_type`	`string`	No	`rotating`	Proxy type: 'rotating', 'premium', 'residential', 'custom'.

Response Format

The API returns a JSON response with the extracted data in the result field:

{
  "success": true,
  "result": {
    "key_name": "extracted_value",
    "another_key": ["array", "of", "values"]
  }
}

If extraction fails for a selector, the value will be null, or an empty array when the rule sets multiple to true.

Ready to build?

Spin up an API key in 60 seconds

Free tier: 5,000 credits, no card, full access to every endpoint on this page.

Get free API key or try the playground →

← Previous

Screenshot