Playground Sign in Start free

Extract Rules

Extract structured data from any web page using CSS selectors. Define extraction rules to scrape specific elements and get clean JSON output.

Overview

Extract rules allow you to scrape structured data from any web page using CSS selectors. Add the extract_rules parameter to your API call with a URL-encoded JSON object defining what data to extract.

GET https://api.ujeebu.com/scrape?extract_rules={...}

TIP — Powerful Data Extraction

Extract rules are perfect for extracting product information, article content, lists, tables, or any repeating data patterns from web pages.

Basic Format

The simplest way to use extract rules:

{
  "key_name": {
    "selector": "css_selector",
    "type": "rule_type"
  }
}
# -G sends the --data-urlencode values as the query string of a GET request.
# Encoding the rules also keeps curl from treating the JSON braces as URL globs.
curl -G 'https://api.ujeebu.com/scrape' \
  --data-urlencode 'url=https://example.com' \
  --data-urlencode 'extract_rules={"title":{"selector":"h1","type":"text"}}' \
  -H "ApiKey: YOUR_API_KEY"
// Rules: the page title text plus the content attribute of the description meta tag.
const extractRules = {
  title: { selector: 'h1', type: 'text' },
  description: { selector: 'meta[name=description]', type: 'attr', attribute: 'content' }
};

// Build the query string through URL.searchParams so that both the target URL
// and the JSON-encoded rules are percent-encoded.
const endpoint = new URL('https://api.ujeebu.com/scrape');
endpoint.searchParams.set('url', 'https://example.com');
endpoint.searchParams.set('extract_rules', JSON.stringify(extractRules));

const response = await fetch(endpoint, { headers: { 'ApiKey': 'YOUR_API_KEY' } });

// The extracted values are under the "result" key of the JSON response.
const data = await response.json();
console.log(data.result);
import { UjeebuClient } from '@ujeebu-org/ujeebu-sdk';

const client = new UjeebuClient(process.env.UJEEBU_API_KEY);

const extractRules = {
  title: { selector: 'h1', type: 'text' },
  description: { selector: 'meta[name=description]', type: 'attr', attribute: 'content' }
};

const result = await client.scrapeWithRules(
  'https://example.com',
  extractRules
);

console.log(result);
import requests
import json

extract_rules = {
    'title': {'selector': 'h1', 'type': 'text'},
    'description': {'selector': 'meta[name=description]', 'type': 'attr', 'attribute': 'content'}
}

response = requests.get(
    'https://api.ujeebu.com/scrape',
    params={
        'url': 'https://example.com',
        'extract_rules': json.dumps(extract_rules)
    },
    headers={'ApiKey': 'YOUR_API_KEY'}
)

print(response.json()['result'])
from ujeebu_python import UjeebuClient

ujeebu = UjeebuClient(api_key="YOUR_API_KEY")

extract_rules = {
    'title': {'selector': 'h1', 'type': 'text'},
    'description': {'selector': 'meta[name=description]', 'type': 'attr', 'attribute': 'content'}
}

result = ujeebu.scrape_with_rules(
    url='https://example.com',
    extract_rules=extract_rules
)

print(result)
import okhttp3.*;
import org.json.*;
import java.net.URLEncoder; // required by the URLEncoder.encode(...) calls below

OkHttpClient client = new OkHttpClient();

// Rules: the page title text plus the content attribute of the description meta tag.
JSONObject extractRules = new JSONObject()
    .put("title", new JSONObject().put("selector", "h1").put("type", "text"))
    .put("description", new JSONObject()
        .put("selector", "meta[name=description]")
        .put("type", "attr")
        .put("attribute", "content"));

// Both query parameters are URL-encoded before being appended to the endpoint.
String url = "https://api.ujeebu.com/scrape?url=" + 
    URLEncoder.encode("https://example.com", "UTF-8") +
    "&extract_rules=" + URLEncoder.encode(extractRules.toString(), "UTF-8");

Request request = new Request.Builder()
    .url(url)
    .addHeader("ApiKey", "YOUR_API_KEY")
    .build();

// Execute the request synchronously and print the raw response body.
Response response = client.newCall(request).execute();
System.out.println(response.body().string());
<?php

$extractRules = [
    'title' => ['selector' => 'h1', 'type' => 'text'],
    'description' => [
        'selector' => 'meta[name=description]',
        'type' => 'attr',
        'attribute' => 'content'
    ]
];

$url = 'https://api.ujeebu.com/scrape?' . http_build_query([
    'url' => 'https://example.com',
    'extract_rules' => json_encode($extractRules)
]);

$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_HTTPHEADER, [
    'ApiKey: YOUR_API_KEY'
]);

$response = curl_exec($ch);
curl_close($ch);

$data = json_decode($response, true);
print_r($data['result']);
package main

import (
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"net/url"
)

// main sends the extract rules to the scrape endpoint with net/http and prints
// the raw response body. Any error aborts the program via panic.
func main() {
	extractRules := map[string]interface{}{
		"title": map[string]string{"selector": "h1", "type": "text"},
		"description": map[string]string{
			"selector":  "meta[name=description]",
			"type":      "attr",
			"attribute": "content",
		},
	}

	// The rules travel as a JSON string in the extract_rules query parameter.
	rulesJSON, err := json.Marshal(extractRules)
	if err != nil {
		panic(err)
	}
	apiURL := fmt.Sprintf("https://api.ujeebu.com/scrape?url=%s&extract_rules=%s",
		url.QueryEscape("https://example.com"),
		url.QueryEscape(string(rulesJSON)))

	req, err := http.NewRequest("GET", apiURL, nil)
	if err != nil {
		panic(err)
	}
	req.Header.Set("ApiKey", "YOUR_API_KEY")

	client := &http.Client{}
	resp, err := client.Do(req)
	if err != nil {
		// resp is nil when Do fails, so stop before touching resp.Body.
		panic(err)
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		panic(err)
	}
	fmt.Println(string(body))
}
package main

import (
	"fmt"
	"github.com/ujeebu/ujeebu-go"
)

// main runs the basic extraction through the ujeebu-go client, then prints the
// credits value and the response returned by Scrape.
func main() {
	// Rules: the page title text plus the content attribute of the description meta tag.
	rules := map[string]interface{}{
		"title": map[string]string{"selector": "h1", "type": "text"},
		"description": map[string]string{
			"selector":  "meta[name=description]",
			"type":      "attr",
			"attribute": "content",
		},
	}

	api, clientErr := ujeebu.NewClient("YOUR-API-KEY")
	if clientErr != nil {
		panic(clientErr)
	}

	params := ujeebu.ScrapeParams{
		URL:          "https://example.com",
		ExtractRules: rules,
	}
	result, used, scrapeErr := api.Scrape(params)
	if scrapeErr != nil {
		panic(scrapeErr)
	}

	fmt.Printf("Credits used: %d\n", used)
	fmt.Println(result)
}

Rule Types

There are 9 types of extraction rules:

Parameter Type Required Default Description
text rule No Returns the text content of the matched element.
link rule No Returns the href attribute if the element is an <a> tag.
image rule No Returns the src attribute if the element is an <img> tag.
attr rule No Returns a specified attribute value. Requires 'attribute' property.
obj rule No Returns an object with nested rules defined in 'children'.
fn rule No Executes custom JavaScript code to extract data. Access window object, DOM API, and parent elements.
table rule No Extracts HTML tables as arrays of objects with automatic header detection.
tableTranspose rule No Extracts transposed tables (key-value format) where headers are in the first column.
regex rule No Extracts data using regular expressions directly from page content.

Text Rule

Extract the text content of an element:

{
  "product_name": {
    "selector": ".product-title",
    "type": "text"
  }
}

Link Rule

Extract the href from anchor tags:

{
  "article_url": {
    "selector": "a.article-link",
    "type": "link"
  }
}

Image Rule

Extract the src from image tags:

{
  "product_image": {
    "selector": "img.product-photo",
    "type": "image"
  }
}

Attribute Rule

Extract any attribute value:

{
  "meta_description": {
    "selector": "meta[name=description]",
    "type": "attr",
    "attribute": "content"
  }
}

Object Rule

Extract nested data structures:

{
  "article_card": {
    "selector": "article.card-item",
    "type": "obj",
    "children": {
      "title": { "selector": "h1", "type": "text" },
      "link": { "selector": "a", "type": "link" }
    }
  }
}

Function Rule

Execute custom JavaScript code to extract data. This is the most powerful extraction type, allowing you to:

  • Access global JavaScript variables (window object)
  • Execute custom computations
  • Access browser storage (localStorage, sessionStorage)
  • Parse JSON from script tags
  • Use the $parent variable to access parent elements in nested extractions
  • Perform async operations with await

Basic Syntax:

{
  "config": {
    "type": "fn",
    "fn": "return window._app_config_;"
  }
}

Access Window Variables:

{
  "appState": {
    "type": "fn",
    "fn": "return window.__INITIAL_STATE__;"
  }
}

Using $parent in Nested Objects:

When used within nested object extraction, the special $parent variable references the parent element:

{
  "products": {
    "selector": ".product",
    "type": "obj",
    "multiple": true,
    "children": {
      "name": {
        "selector": ".name",
        "type": "text"
      },
      "productId": {
        "type": "fn",
        "fn": "return $parent.getAttribute('data-id');"
      },
      "category": {
        "type": "fn",
        "fn": "return $parent.dataset.category;"
      },
      "itemCount": {
        "type": "fn",
        "fn": "return $parent.querySelectorAll('.item').length;"
      }
    }
  }
}

Parse JSON from Script Tags:

{
  "productData": {
    "type": "fn",
    "fn": "const script = document.querySelector('script[type=\"application/ld+json\"]'); return script ? JSON.parse(script.textContent) : null;"
  }
}

Compute Values:

{
  "totalPrice": {
    "type": "fn",
    "fn": "const prices = Array.from(document.querySelectorAll('.price')).map(el => parseFloat(el.textContent.replace('$', ''))); return prices.reduce((sum, p) => sum + p, 0);"
  }
}

Access Browser Storage:

{
  "userData": {
    "type": "fn",
    "fn": "return JSON.parse(localStorage.getItem('user_data') || '{}');"
  }
}

Conditional Logic:

{
  "availability": {
    "type": "fn",
    "fn": "return document.querySelector('.in-stock') ? 'available' : 'out of stock';"
  }
}

INFO — Best Practices

  • Use optional chaining (?.) to safely access nested properties
  • Provide default values with nullish coalescing (??)
  • Keep functions simple and focused
  • Use $parent for scoped queries in nested objects
  • Reserve fn type for complex scenarios; use CSS selectors for simple cases

WARNING — Error Handling

If the function throws an error, the rule returns null. Always use safe access patterns with the optional chaining (?.) and nullish coalescing (??) operators.

Safe Access Example:

{
  "safeValue": {
    "type": "fn",
    "fn": "return window.config?.data?.value ?? 'default';"
  }
}

Table Extraction

Extract HTML tables automatically with header detection. Perfect for extracting tabular data like product specifications, pricing tables, or feature comparisons.

Basic Table Example:

{
  "specifications": {
    "selector": "table.product-specs",
    "type": "table"
  }
}

Given this HTML:

<table class="product-specs">
  <thead>
    <tr><th>Feature</th><th>Value</th></tr>
  </thead>
  <tbody>
    <tr><td>Weight</td><td>2.5 kg</td></tr>
    <tr><td>Dimensions</td><td>30x20x10 cm</td></tr>
    <tr><td>Color</td><td>Black</td></tr>
  </tbody>
</table>

Result:

{
  "specifications": [
    {"feature": "Weight", "value": "2.5 kg"},
    {"feature": "Dimensions", "value": "30x20x10 cm"},
    {"feature": "Color", "value": "Black"}
  ]
}

Advanced Table Options:

{
  "pricing": {
    "selector": "table.pricing",
    "type": "table",
    "headers": ["plan", "price", "features"],
    "rowSelector": "tbody tr",
    "normalizeHeaders": true,
    "asArray": false
  }
}

Table Properties:

  • headers (array): Custom header names to use instead of auto-detection
  • headerSelector (string): Custom CSS selector for header cells
  • rowSelector (string): Custom CSS selector for data rows (default: "tbody tr, tr:not(:first-child)")
  • normalizeHeaders (boolean): Convert headers to snake_case (default: true)
  • asArray (boolean): Return rows as arrays instead of objects (default: false)

Transposed Table

For tables where headers are in the first column (key-value format). Common in product detail pages and specification sheets.

Example:

{
  "product_details": {
    "selector": "table.specs",
    "type": "tableTranspose"
  }
}

Given this HTML:

<table class="specs">
  <tr><th>Brand</th><td>Apple</td></tr>
  <tr><th>Model</th><td>iPhone 15</td></tr>
  <tr><th>Storage</th><td>256GB</td></tr>
  <tr><th>Color</th><td>Blue</td></tr>
</table>

Result:

{
  "product_details": {
    "brand": "Apple",
    "model": "iPhone 15",
    "storage": "256GB",
    "color": "Blue"
  }
}

Regex Extraction

Extract data using regular expressions directly from page content. Useful for extracting structured data embedded in text, such as SKUs, phone numbers, emails, or prices.

Basic Regex Example:

{
  "sku": {
    "type": "regex",
    "selector": ".product-info",
    "pattern": "SKU:\\s*([A-Z0-9-]+)"
  }
}

Given this HTML:

<div class="product-info">
  Product details here. SKU: ABC-12345. More info...
</div>

Result:

{
  "sku": "ABC-12345"
}

Extract from Entire Page:

{
  "phone_numbers": {
    "type": "regex",
    "pattern": "\\d{3}-\\d{3}-\\d{4}",
    "flags": "g"
  }
}

Multiple Capture Groups:

{
  "date_parts": {
    "type": "regex",
    "selector": ".publish-date",
    "pattern": "(\\d{4})-(\\d{2})-(\\d{2})",
    "allGroups": true
  }
}

Result: {"date_parts": ["2024", "12", "31"]}

Regex Properties:

  • pattern (string, required): Regular expression pattern
  • selector (string, optional): CSS selector to scope the search (defaults to entire page)
  • source (string): "text" (default) or "html" - search in text content or HTML
  • flags (string): Regex flags (default: "i" for case-insensitive)
  • group (number): Specific capture group to return (0 = full match)
  • allGroups (boolean): Return all capture groups as array
  • default (any): Default value if no match found

Extract Price from Text:

{
  "price": {
    "type": "regex",
    "selector": ".price-text",
    "pattern": "\\$([0-9,.]+)",
    "default": "0.00"
  }
}

Data Transformation Pipeline

Apply post-processing transformations to extracted values using the transform property. Transformations are applied in order (chained), allowing you to clean, convert, and reshape data after extraction.

Basic Transformation:

{
  "price": {
    "selector": ".price",
    "type": "text",
    "transform": ["number"]
  }
}

Input: "$49.99" → Output: 49.99

Chained Transformations:

{
  "price": {
    "selector": ".price",
    "type": "text",
    "transform": [
      "trim",
      ["replace", "\\$", ""],
      ["replace", "USD", ""],
      "trim",
      "number"
    ]
  }
}

Input: " $1,234.56 USD " → Output: 1234.56

Available Transformations:

Transform Arguments Description Example
String Transforms
trim - Remove whitespace " text " → "text"
lowercase - Convert to lowercase "TEXT" → "text"
uppercase - Convert to uppercase "text" → "TEXT"
capitalize - Capitalize first letter "hello" → "Hello"
replace pattern, replacement Regex replace ["replace", "\\$", ""]
split delimiter Split into array ["split", ","]
substring start, end Get substring ["substring", 0, 5]
prefix text Add prefix ["prefix", "$"]
suffix text Add suffix ["suffix", " USD"]
Number Transforms
number - Parse as float "$49.99" → 49.99
integer - Parse as integer "42.7" → 42
currency - Parse currency "$1,234.56" → 1234.56
round decimals Round to decimals ["round", 2]
floor - Round down 4.9 → 4
ceil - Round up 4.1 → 5
abs - Absolute value -5 → 5
Array Transforms
first - Get first element [1,2,3] → 1
last - Get last element [1,2,3] → 3
nth index Get nth element ["nth", 1]
join delimiter Join to string ["join", ", "]
unique - Remove duplicates [1,1,2] → [1,2]
compact - Remove null/empty [1,null,"",2] → [1,2]
flatten - Flatten nested arrays [[1,2],[3]] → [1,2,3]
reverse - Reverse array/string [1,2,3] → [3,2,1]
sort order Sort (asc/desc) ["sort", "desc"]
length - Get length "text" → 4
Regex Transforms
match pattern, group Extract with regex ["match", "\\d+", 0]
matchAll pattern All matches ["matchAll", "\\d+"]
Type Conversion
boolean - Parse as boolean "true" → true
string - Convert to string 42 → "42"
json - Parse JSON string '{"a":1}' → {a:1}
stringify - Convert to JSON {a:1} → '{"a":1}'
Date Transforms
date format Parse date ["date", "iso"]
HTML/Text Cleaning
stripHtml - Remove HTML tags "<b>text</b>" → "text"
normalizeWhitespace - Collapse spaces "a    b" → "a b"
removeNewlines - Remove line breaks Multi-line → single line
decode - Decode HTML entities "&amp;" → "&"
Default/Fallback
default value Fallback if null ["default", "N/A"]
nullIf value Null if equals ["nullIf", ""]
emptyToNull - Convert empty to null "" → null

Practical Examples:

1. Clean and Parse Price:

{
  "price_numeric": {
    "selector": ".price",
    "type": "text",
    "transform": ["currency"]
  }
}

Input: "Price: $1,234.56" → Output: 1234.56

2. Split Tags into Array:

{
  "tags": {
    "selector": ".product-tags",
    "type": "text",
    "transform": [
      "trim",
      ["split", ","],
      "compact",
      "unique"
    ]
  }
}

Input: "electronics, gadgets, , electronics" → Output: ["electronics", "gadgets"]

3. Extract and Round Rating:

{
  "rating": {
    "selector": ".rating-value",
    "type": "text",
    "transform": [
      "number",
      ["round", 1]
    ]
  }
}

Input: "4.567 stars" → Output: 4.6

4. Get First Available Image:

{
  "main_image": {
    "selector": "img.product-image",
    "type": "image",
    "multiple": true,
    "transform": ["first"]
  }
}

5. Parse JSON from Data Attribute:

{
  "config": {
    "selector": "[data-config]",
    "type": "attr",
    "attribute": "data-config",
    "transform": ["json"]
  }
}

Input: '{"theme":"dark","lang":"en"}' → Output: {"theme":"dark","lang":"en"}

6. Boolean Conversion:

{
  "in_stock": {
    "selector": ".stock-status",
    "type": "text",
    "transform": ["boolean"]
  }
}

Input: "true" or "yes" or "1" → Output: true

TIP — Transform Tips

  • Transformations are applied sequentially from left to right
  • Use array syntax for transforms with arguments: ["replace", "pattern", "replacement"]
  • Combine multiple transforms to build complex pipelines
  • The default transform provides fallback values for missing data
  • Use compact to clean arrays of null/empty values

Simple Extraction

Extract a single value from a page. This example extracts the user agent from whatsmyuseragent.org:

{
  "user-agent": {
    "selector": ".user-agent .intro-text",
    "type": "text"
  }
}

Response:

{
  "success": true,
  "result": {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.0 Safari/537.36"
  }
}

Multiple Items

Extract multiple matching elements by adding "multiple": true. This example extracts all quotes from a quotes page:

{
  "quote": {
    "selector": ".quote-card .description",
    "type": "text",
    "multiple": true
  }
}

Response:

{
  "success": true,
  "result": {
    "quote": [
      "“The world as we have created it is a process of our thinking...”",
      "“It is our choices, Harry, that show what we truly are...”",
      "“There are only two ways to live your life...”"
    ]
  }
}

Nested Items

Extract complex nested data using type: "obj" with children. This example extracts quotes with their authors and tags:

{
  "quotes": {
    "selector": ".quote-card",
    "type": "obj",
    "multiple": true,
    "children": {
      "text": {
        "selector": ".description",
        "type": "text"
      },
      "author": {
        "selector": ".author",
        "type": "text"
      },
      "tags": {
        "selector": ".tags .tag",
        "type": "text",
        "multiple": true
      }
    }
  }
}

Response:

{
  "success": true,
  "result": {
    "quotes": [
      {
        "text": "“The world as we have created it...”",
        "author": "Albert Einstein",
        "tags": ["change", "deep-thoughts", "thinking", "world"]
      },
      {
        "text": "“It is our choices, Harry...”",
        "author": "J.K. Rowling",
        "tags": ["abilities", "choices"]
      }
    ]
  }
}

Real-World Examples

E-commerce Product Scraping

Extract product data from an e-commerce page:

{
  "products": {
    "selector": ".product-card",
    "type": "obj",
    "multiple": true,
    "children": {
      "name": { "selector": ".product-name", "type": "text" },
      "price": { "selector": ".price", "type": "text" },
      "image": { "selector": "img", "type": "image" },
      "url": { "selector": "a", "type": "link" },
      "rating": { "selector": ".rating", "type": "attr", "attribute": "data-rating" }
    }
  }
}
curl -G 'https://api.ujeebu.com/scrape' \
  --data-urlencode 'url=https://shop.example.com/products' \
  --data-urlencode 'extract_rules={"products":{"selector":".product-card","type":"obj","multiple":true,"children":{"name":{"selector":".product-name","type":"text"},"price":{"selector":".price","type":"text"}}}}' \
  -H "ApiKey: YOUR_API_KEY"
const rules = {
  products: {
    selector: '.product-card',
    type: 'obj',
    multiple: true,
    children: {
      name: { selector: '.product-name', type: 'text' },
      price: { selector: '.price', type: 'text' },
      image: { selector: 'img', type: 'image' },
      url: { selector: 'a', type: 'link' },
      rating: { selector: '.rating', type: 'attr', attribute: 'data-rating' }
    }
  }
};

const response = await fetch(
  `https://api.ujeebu.com/scrape?url=${encodeURIComponent('https://shop.example.com/products')}&extract_rules=${encodeURIComponent(JSON.stringify(rules))}`,
  { headers: { 'ApiKey': 'YOUR_API_KEY' } }
);

const data = await response.json();
console.log(data.result.products);
import { UjeebuClient } from '@ujeebu-org/ujeebu-sdk';

const client = new UjeebuClient(process.env.UJEEBU_API_KEY);

const rules = {
  products: {
    selector: '.product-card',
    type: 'obj',
    multiple: true,
    children: {
      name: { selector: '.product-name', type: 'text' },
      price: { selector: '.price', type: 'text' },
      image: { selector: 'img', type: 'image' },
      url: { selector: 'a', type: 'link' },
      rating: { selector: '.rating', type: 'attr', attribute: 'data-rating' }
    }
  }
};

const result = await client.scrapeWithRules(
  'https://shop.example.com/products',
  rules
);

console.log(result.products);
import requests
import json

rules = {
    'products': {
        'selector': '.product-card',
        'type': 'obj',
        'multiple': True,
        'children': {
            'name': {'selector': '.product-name', 'type': 'text'},
            'price': {'selector': '.price', 'type': 'text'},
            'image': {'selector': 'img', 'type': 'image'},
            'url': {'selector': 'a', 'type': 'link'},
            'rating': {'selector': '.rating', 'type': 'attr', 'attribute': 'data-rating'}
        }
    }
}

response = requests.get(
    'https://api.ujeebu.com/scrape',
    params={
        'url': 'https://shop.example.com/products',
        'extract_rules': json.dumps(rules)
    },
    headers={'ApiKey': 'YOUR_API_KEY'}
)

print(response.json()['result']['products'])
from ujeebu_python import UjeebuClient

ujeebu = UjeebuClient(api_key="YOUR_API_KEY")

rules = {
    'products': {
        'selector': '.product-card',
        'type': 'obj',
        'multiple': True,
        'children': {
            'name': {'selector': '.product-name', 'type': 'text'},
            'price': {'selector': '.price', 'type': 'text'},
            'image': {'selector': 'img', 'type': 'image'},
            'url': {'selector': 'a', 'type': 'link'},
            'rating': {'selector': '.rating', 'type': 'attr', 'attribute': 'data-rating'}
        }
    }
}

result = ujeebu.scrape_with_rules(
    url='https://shop.example.com/products',
    extract_rules=rules
)

print(result['products'])
import okhttp3.*;
import org.json.*;
import java.net.URLEncoder;

OkHttpClient client = new OkHttpClient();

JSONObject rules = new JSONObject()
    .put("products", new JSONObject()
        .put("selector", ".product-card")
        .put("type", "obj")
        .put("multiple", true)
        .put("children", new JSONObject()
            .put("name", new JSONObject().put("selector", ".product-name").put("type", "text"))
            .put("price", new JSONObject().put("selector", ".price").put("type", "text"))
            .put("image", new JSONObject().put("selector", "img").put("type", "image"))
            .put("url", new JSONObject().put("selector", "a").put("type", "link"))
            .put("rating", new JSONObject()
                .put("selector", ".rating")
                .put("type", "attr")
                .put("attribute", "data-rating"))));

String url = "https://api.ujeebu.com/scrape?url=" +
    URLEncoder.encode("https://shop.example.com/products", "UTF-8") +
    "&extract_rules=" + URLEncoder.encode(rules.toString(), "UTF-8");

Request request = new Request.Builder()
    .url(url)
    .addHeader("ApiKey", "YOUR_API_KEY")
    .build();

Response response = client.newCall(request).execute();
System.out.println(response.body().string());
<?php

$rules = [
    'products' => [
        'selector' => '.product-card',
        'type' => 'obj',
        'multiple' => true,
        'children' => [
            'name' => ['selector' => '.product-name', 'type' => 'text'],
            'price' => ['selector' => '.price', 'type' => 'text'],
            'image' => ['selector' => 'img', 'type' => 'image'],
            'url' => ['selector' => 'a', 'type' => 'link'],
            'rating' => ['selector' => '.rating', 'type' => 'attr', 'attribute' => 'data-rating']
        ]
    ]
];

$url = 'https://api.ujeebu.com/scrape?' . http_build_query([
    'url' => 'https://shop.example.com/products',
    'extract_rules' => json_encode($rules)
]);

$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_HTTPHEADER, ['ApiKey: YOUR_API_KEY']);

$response = curl_exec($ch);
curl_close($ch);

$data = json_decode($response, true);
print_r($data['result']['products']);
package main

import (
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"net/url"
)

// main sends the product extraction rules to the scrape endpoint with net/http
// and prints the raw response body. Any error aborts the program via panic.
func main() {
	rules := map[string]interface{}{
		"products": map[string]interface{}{
			"selector": ".product-card",
			"type":     "obj",
			"multiple": true,
			"children": map[string]interface{}{
				"name":   map[string]string{"selector": ".product-name", "type": "text"},
				"price":  map[string]string{"selector": ".price", "type": "text"},
				"image":  map[string]string{"selector": "img", "type": "image"},
				"url":    map[string]string{"selector": "a", "type": "link"},
				"rating": map[string]string{"selector": ".rating", "type": "attr", "attribute": "data-rating"},
			},
		},
	}

	// The rules travel as a JSON string in the extract_rules query parameter.
	rulesJSON, err := json.Marshal(rules)
	if err != nil {
		panic(err)
	}
	apiURL := fmt.Sprintf("https://api.ujeebu.com/scrape?url=%s&extract_rules=%s",
		url.QueryEscape("https://shop.example.com/products"),
		url.QueryEscape(string(rulesJSON)))

	req, err := http.NewRequest("GET", apiURL, nil)
	if err != nil {
		panic(err)
	}
	req.Header.Set("ApiKey", "YOUR_API_KEY")

	client := &http.Client{}
	resp, err := client.Do(req)
	if err != nil {
		// resp is nil when Do fails, so stop before touching resp.Body.
		panic(err)
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		panic(err)
	}
	fmt.Println(string(body))
}
package main

import (
	"fmt"
	"github.com/ujeebu/ujeebu-go"
)

// main runs the product extraction through the ujeebu-go client, then prints
// the credits value and the response returned by Scrape. Errors from the
// client constructor and from Scrape abort the program via panic.
func main() {
	client, err := ujeebu.NewClient("YOUR-API-KEY")
	if err != nil {
		panic(err)
	}

	rules := map[string]interface{}{
		"products": map[string]interface{}{
			"selector": ".product-card",
			"type":     "obj",
			"multiple": true,
			"children": map[string]interface{}{
				"name":   map[string]string{"selector": ".product-name", "type": "text"},
				"price":  map[string]string{"selector": ".price", "type": "text"},
				"image":  map[string]string{"selector": "img", "type": "image"},
				"url":    map[string]string{"selector": "a", "type": "link"},
				"rating": map[string]string{"selector": ".rating", "type": "attr", "attribute": "data-rating"},
			},
		},
	}

	response, credits, err := client.Scrape(ujeebu.ScrapeParams{
		URL:          "https://shop.example.com/products",
		ExtractRules: rules,
	})
	if err != nil {
		panic(err)
	}

	fmt.Printf("Credits used: %d\n", credits)
	fmt.Println(response)
}

News Article Scraping

Extract article data from a news site:

{
  "articles": {
    "selector": "article",
    "type": "obj",
    "multiple": true,
    "children": {
      "headline": { "selector": "h2", "type": "text" },
      "summary": { "selector": ".summary", "type": "text" },
      "author": { "selector": ".author", "type": "text" },
      "date": { "selector": "time", "type": "attr", "attribute": "datetime" },
      "link": { "selector": "a", "type": "link" },
      "thumbnail": { "selector": "img", "type": "image" }
    }
  }
}
curl -G 'https://api.ujeebu.com/scrape' \
  --data-urlencode 'url=https://news.example.com' \
  --data-urlencode 'extract_rules={"articles":{"selector":"article","type":"obj","multiple":true,"children":{"headline":{"selector":"h2","type":"text"},"summary":{"selector":".summary","type":"text"}}}}' \
  -H "ApiKey: YOUR_API_KEY"
const rules = {
  articles: {
    selector: 'article',
    type: 'obj',
    multiple: true,
    children: {
      headline: { selector: 'h2', type: 'text' },
      summary: { selector: '.summary', type: 'text' },
      author: { selector: '.author', type: 'text' },
      date: { selector: 'time', type: 'attr', attribute: 'datetime' },
      link: { selector: 'a', type: 'link' },
      thumbnail: { selector: 'img', type: 'image' }
    }
  }
};

const response = await fetch(
  `https://api.ujeebu.com/scrape?url=${encodeURIComponent('https://news.example.com')}&extract_rules=${encodeURIComponent(JSON.stringify(rules))}`,
  { headers: { 'ApiKey': 'YOUR_API_KEY' } }
);

const data = await response.json();
console.log(data.result.articles);
import { UjeebuClient } from '@ujeebu-org/ujeebu-sdk';

const client = new UjeebuClient(process.env.UJEEBU_API_KEY);

const rules = {
  articles: {
    selector: 'article',
    type: 'obj',
    multiple: true,
    children: {
      headline: { selector: 'h2', type: 'text' },
      summary: { selector: '.summary', type: 'text' },
      author: { selector: '.author', type: 'text' },
      date: { selector: 'time', type: 'attr', attribute: 'datetime' },
      link: { selector: 'a', type: 'link' },
      thumbnail: { selector: 'img', type: 'image' }
    }
  }
};

const result = await client.scrapeWithRules(
  'https://news.example.com',
  rules
);

console.log(result.articles);
import requests
import json

rules = {
    'articles': {
        'selector': 'article',
        'type': 'obj',
        'multiple': True,
        'children': {
            'headline': {'selector': 'h2', 'type': 'text'},
            'summary': {'selector': '.summary', 'type': 'text'},
            'author': {'selector': '.author', 'type': 'text'},
            'date': {'selector': 'time', 'type': 'attr', 'attribute': 'datetime'},
            'link': {'selector': 'a', 'type': 'link'},
            'thumbnail': {'selector': 'img', 'type': 'image'}
        }
    }
}

response = requests.get(
    'https://api.ujeebu.com/scrape',
    params={
        'url': 'https://news.example.com',
        'extract_rules': json.dumps(rules)
    },
    headers={'ApiKey': 'YOUR_API_KEY'}
)

print(response.json()['result']['articles'])
from ujeebu_python import UjeebuClient

ujeebu = UjeebuClient(api_key="YOUR_API_KEY")

rules = {
    'articles': {
        'selector': 'article',
        'type': 'obj',
        'multiple': True,
        'children': {
            'headline': {'selector': 'h2', 'type': 'text'},
            'summary': {'selector': '.summary', 'type': 'text'},
            'author': {'selector': '.author', 'type': 'text'},
            'date': {'selector': 'time', 'type': 'attr', 'attribute': 'datetime'},
            'link': {'selector': 'a', 'type': 'link'},
            'thumbnail': {'selector': 'img', 'type': 'image'}
        }
    }
}

result = ujeebu.scrape_with_rules(
    url='https://news.example.com',
    extract_rules=rules
)

print(result['articles'])
import okhttp3.*;
import org.json.*;
import java.net.URLEncoder;

OkHttpClient client = new OkHttpClient();

JSONObject rules = new JSONObject()
    .put("articles", new JSONObject()
        .put("selector", "article")
        .put("type", "obj")
        .put("multiple", true)
        .put("children", new JSONObject()
            .put("headline", new JSONObject().put("selector", "h2").put("type", "text"))
            .put("summary", new JSONObject().put("selector", ".summary").put("type", "text"))
            .put("author", new JSONObject().put("selector", ".author").put("type", "text"))
            .put("date", new JSONObject().put("selector", "time").put("type", "attr").put("attribute", "datetime"))
            .put("link", new JSONObject().put("selector", "a").put("type", "link"))
            .put("thumbnail", new JSONObject().put("selector", "img").put("type", "image"))));

String url = "https://api.ujeebu.com/scrape?url=" +
    URLEncoder.encode("https://news.example.com", "UTF-8") +
    "&extract_rules=" + URLEncoder.encode(rules.toString(), "UTF-8");

Request request = new Request.Builder()
    .url(url)
    .addHeader("ApiKey", "YOUR_API_KEY")
    .build();

Response response = client.newCall(request).execute();
System.out.println(response.body().string());
<?php

$rules = [
    'articles' => [
        'selector' => 'article',
        'type' => 'obj',
        'multiple' => true,
        'children' => [
            'headline' => ['selector' => 'h2', 'type' => 'text'],
            'summary' => ['selector' => '.summary', 'type' => 'text'],
            'author' => ['selector' => '.author', 'type' => 'text'],
            'date' => ['selector' => 'time', 'type' => 'attr', 'attribute' => 'datetime'],
            'link' => ['selector' => 'a', 'type' => 'link'],
            'thumbnail' => ['selector' => 'img', 'type' => 'image']
        ]
    ]
];

$url = 'https://api.ujeebu.com/scrape?' . http_build_query([
    'url' => 'https://news.example.com',
    'extract_rules' => json_encode($rules)
]);

$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_HTTPHEADER, ['ApiKey: YOUR_API_KEY']);

$response = curl_exec($ch);
curl_close($ch);

$data = json_decode($response, true);
print_r($data['result']['articles']);
package main

import (
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"net/url"
)

// main sends the article extraction rules to the scrape endpoint as a
// URL-encoded JSON query parameter and prints the raw response body.
func main() {
	rules := map[string]interface{}{
		"articles": map[string]interface{}{
			"selector": "article",
			"type":     "obj",
			"multiple": true,
			"children": map[string]interface{}{
				"headline":  map[string]string{"selector": "h2", "type": "text"},
				"summary":   map[string]string{"selector": ".summary", "type": "text"},
				"author":    map[string]string{"selector": ".author", "type": "text"},
				"date":      map[string]string{"selector": "time", "type": "attr", "attribute": "datetime"},
				"link":      map[string]string{"selector": "a", "type": "link"},
				"thumbnail": map[string]string{"selector": "img", "type": "image"},
			},
		},
	}

	rulesJSON, err := json.Marshal(rules)
	if err != nil {
		fmt.Println("encoding extract rules:", err)
		return
	}
	apiURL := fmt.Sprintf("https://api.ujeebu.com/scrape?url=%s&extract_rules=%s",
		url.QueryEscape("https://news.example.com"),
		url.QueryEscape(string(rulesJSON)))

	req, err := http.NewRequest("GET", apiURL, nil)
	if err != nil {
		fmt.Println("building request:", err)
		return
	}
	req.Header.Set("ApiKey", "YOUR_API_KEY")

	client := &http.Client{}
	resp, err := client.Do(req)
	if err != nil {
		// resp is nil when Do fails, so return before touching resp.Body.
		fmt.Println("request failed:", err)
		return
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		fmt.Println("reading response:", err)
		return
	}
	fmt.Println(string(body))
}
package main

import (
	"fmt"
	"github.com/ujeebu/ujeebu-go"
)

// main extracts the article list through the Ujeebu Go SDK and prints the
// response together with the credits reported by the call.
func main() {
	// NOTE(review): assumes the last return value of NewClient and Scrape is
	// an error — confirm against the SDK.
	client, err := ujeebu.NewClient("YOUR-API-KEY")
	if err != nil {
		fmt.Println("creating client:", err)
		return
	}

	rules := map[string]interface{}{
		"articles": map[string]interface{}{
			"selector": "article",
			"type":     "obj",
			"multiple": true,
			"children": map[string]interface{}{
				"headline":  map[string]string{"selector": "h2", "type": "text"},
				"summary":   map[string]string{"selector": ".summary", "type": "text"},
				"author":    map[string]string{"selector": ".author", "type": "text"},
				"date":      map[string]string{"selector": "time", "type": "attr", "attribute": "datetime"},
				"link":      map[string]string{"selector": "a", "type": "link"},
				"thumbnail": map[string]string{"selector": "img", "type": "image"},
			},
		},
	}

	response, credits, err := client.Scrape(ujeebu.ScrapeParams{
		URL:          "https://news.example.com",
		ExtractRules: rules,
	})
	if err != nil {
		fmt.Println("scrape failed:", err)
		return
	}

	fmt.Printf("Credits used: %d\n", credits)
	fmt.Println(response)
}

Table Data Extraction

Extract data from HTML tables:

{
  "rows": {
    "selector": "table tbody tr",
    "type": "obj",
    "multiple": true,
    "children": {
      "column1": { "selector": "td:nth-child(1)", "type": "text" },
      "column2": { "selector": "td:nth-child(2)", "type": "text" },
      "column3": { "selector": "td:nth-child(3)", "type": "text" }
    }
  }
}
# Extract the first three columns of every table body row.
curl -G 'https://api.ujeebu.com/scrape' \
  --data-urlencode 'url=https://data.example.com/table' \
  --data-urlencode 'extract_rules={"rows":{"selector":"table tbody tr","type":"obj","multiple":true,"children":{"column1":{"selector":"td:nth-child(1)","type":"text"},"column2":{"selector":"td:nth-child(2)","type":"text"},"column3":{"selector":"td:nth-child(3)","type":"text"}}}}' \
  -H "ApiKey: YOUR_API_KEY"
const rules = {
  rows: {
    selector: 'table tbody tr',
    type: 'obj',
    multiple: true,
    children: {
      column1: { selector: 'td:nth-child(1)', type: 'text' },
      column2: { selector: 'td:nth-child(2)', type: 'text' },
      column3: { selector: 'td:nth-child(3)', type: 'text' }
    }
  }
};

const response = await fetch(
  `https://api.ujeebu.com/scrape?url=${encodeURIComponent('https://data.example.com/table')}&extract_rules=${encodeURIComponent(JSON.stringify(rules))}`,
  { headers: { 'ApiKey': 'YOUR_API_KEY' } }
);

const data = await response.json();
console.log(data.result.rows);
import { UjeebuClient } from '@ujeebu-org/ujeebu-sdk';

const client = new UjeebuClient(process.env.UJEEBU_API_KEY);

const rules = {
  rows: {
    selector: 'table tbody tr',
    type: 'obj',
    multiple: true,
    children: {
      column1: { selector: 'td:nth-child(1)', type: 'text' },
      column2: { selector: 'td:nth-child(2)', type: 'text' },
      column3: { selector: 'td:nth-child(3)', type: 'text' }
    }
  }
};

const result = await client.scrapeWithRules(
  'https://data.example.com/table',
  rules
);

console.log(result.rows);
import requests
import json

rules = {
    'rows': {
        'selector': 'table tbody tr',
        'type': 'obj',
        'multiple': True,
        'children': {
            'column1': {'selector': 'td:nth-child(1)', 'type': 'text'},
            'column2': {'selector': 'td:nth-child(2)', 'type': 'text'},
            'column3': {'selector': 'td:nth-child(3)', 'type': 'text'}
        }
    }
}

response = requests.get(
    'https://api.ujeebu.com/scrape',
    params={
        'url': 'https://data.example.com/table',
        'extract_rules': json.dumps(rules)
    },
    headers={'ApiKey': 'YOUR_API_KEY'}
)

print(response.json()['result']['rows'])
from ujeebu_python import UjeebuClient

ujeebu = UjeebuClient(api_key="YOUR_API_KEY")

rules = {
    'rows': {
        'selector': 'table tbody tr',
        'type': 'obj',
        'multiple': True,
        'children': {
            'column1': {'selector': 'td:nth-child(1)', 'type': 'text'},
            'column2': {'selector': 'td:nth-child(2)', 'type': 'text'},
            'column3': {'selector': 'td:nth-child(3)', 'type': 'text'}
        }
    }
}

result = ujeebu.scrape_with_rules(
    url='https://data.example.com/table',
    extract_rules=rules
)

print(result['rows'])
import okhttp3.*;
import org.json.*;
import java.net.URLEncoder;

OkHttpClient client = new OkHttpClient();

JSONObject rules = new JSONObject()
    .put("rows", new JSONObject()
        .put("selector", "table tbody tr")
        .put("type", "obj")
        .put("multiple", true)
        .put("children", new JSONObject()
            .put("column1", new JSONObject().put("selector", "td:nth-child(1)").put("type", "text"))
            .put("column2", new JSONObject().put("selector", "td:nth-child(2)").put("type", "text"))
            .put("column3", new JSONObject().put("selector", "td:nth-child(3)").put("type", "text"))));

String url = "https://api.ujeebu.com/scrape?url=" +
    URLEncoder.encode("https://data.example.com/table", "UTF-8") +
    "&extract_rules=" + URLEncoder.encode(rules.toString(), "UTF-8");

Request request = new Request.Builder()
    .url(url)
    .addHeader("ApiKey", "YOUR_API_KEY")
    .build();

Response response = client.newCall(request).execute();
System.out.println(response.body().string());
<?php

$rules = [
    'rows' => [
        'selector' => 'table tbody tr',
        'type' => 'obj',
        'multiple' => true,
        'children' => [
            'column1' => ['selector' => 'td:nth-child(1)', 'type' => 'text'],
            'column2' => ['selector' => 'td:nth-child(2)', 'type' => 'text'],
            'column3' => ['selector' => 'td:nth-child(3)', 'type' => 'text']
        ]
    ]
];

$url = 'https://api.ujeebu.com/scrape?' . http_build_query([
    'url' => 'https://data.example.com/table',
    'extract_rules' => json_encode($rules)
]);

$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_HTTPHEADER, ['ApiKey: YOUR_API_KEY']);

$response = curl_exec($ch);
curl_close($ch);

$data = json_decode($response, true);
print_r($data['result']['rows']);
package main

import (
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"net/url"
)

// main sends the table-row extraction rules to the scrape endpoint as a
// URL-encoded JSON query parameter and prints the raw response body.
func main() {
	rules := map[string]interface{}{
		"rows": map[string]interface{}{
			"selector": "table tbody tr",
			"type":     "obj",
			"multiple": true,
			"children": map[string]interface{}{
				"column1": map[string]string{"selector": "td:nth-child(1)", "type": "text"},
				"column2": map[string]string{"selector": "td:nth-child(2)", "type": "text"},
				"column3": map[string]string{"selector": "td:nth-child(3)", "type": "text"},
			},
		},
	}

	rulesJSON, err := json.Marshal(rules)
	if err != nil {
		fmt.Println("encoding extract rules:", err)
		return
	}
	apiURL := fmt.Sprintf("https://api.ujeebu.com/scrape?url=%s&extract_rules=%s",
		url.QueryEscape("https://data.example.com/table"),
		url.QueryEscape(string(rulesJSON)))

	req, err := http.NewRequest("GET", apiURL, nil)
	if err != nil {
		fmt.Println("building request:", err)
		return
	}
	req.Header.Set("ApiKey", "YOUR_API_KEY")

	client := &http.Client{}
	resp, err := client.Do(req)
	if err != nil {
		// resp is nil when Do fails, so return before touching resp.Body.
		fmt.Println("request failed:", err)
		return
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		fmt.Println("reading response:", err)
		return
	}
	fmt.Println(string(body))
}
package main

import (
	"fmt"
	"github.com/ujeebu/ujeebu-go"
)

// main extracts table rows through the Ujeebu Go SDK and prints the response
// together with the credits reported by the call.
func main() {
	// NOTE(review): assumes the last return value of NewClient and Scrape is
	// an error — confirm against the SDK.
	client, err := ujeebu.NewClient("YOUR-API-KEY")
	if err != nil {
		fmt.Println("creating client:", err)
		return
	}

	rules := map[string]interface{}{
		"rows": map[string]interface{}{
			"selector": "table tbody tr",
			"type":     "obj",
			"multiple": true,
			"children": map[string]interface{}{
				"column1": map[string]string{"selector": "td:nth-child(1)", "type": "text"},
				"column2": map[string]string{"selector": "td:nth-child(2)", "type": "text"},
				"column3": map[string]string{"selector": "td:nth-child(3)", "type": "text"},
			},
		},
	}

	response, credits, err := client.Scrape(ujeebu.ScrapeParams{
		URL:          "https://data.example.com/table",
		ExtractRules: rules,
	})
	if err != nil {
		fmt.Println("scrape failed:", err)
		return
	}

	fmt.Printf("Credits used: %d\n", credits)
	fmt.Println(response)
}

Advanced: Using Function Type

Extract data using custom JavaScript to access window variables, compute values, and use $parent:

{
  "products": {
    "selector": ".product-item",
    "type": "obj",
    "multiple": true,
    "children": {
      "name": { "selector": ".product-name", "type": "text" },
      "price": { "selector": ".price", "type": "text" },
      "sku": {
        "type": "fn",
        "fn": "return $parent.getAttribute('data-sku');"
      },
      "inStock": {
        "type": "fn",
        "fn": "return $parent.classList.contains('in-stock');"
      },
      "rating": {
        "type": "fn",
        "fn": "const stars = $parent.querySelectorAll('.star.filled').length; return stars;"
      }
    }
  },
  "pageConfig": {
    "type": "fn",
    "fn": "return window._pageConfig_ || {};"
  },
  "totalItems": {
    "type": "fn",
    "fn": "return document.querySelectorAll('.product-item').length;"
  }
}
# The JS snippets use \" for their string literals: a bare single quote would
# end the surrounding shell quoting and strip the quotes from the JS.
curl -G 'https://api.ujeebu.com/scrape' \
  --data-urlencode 'url=https://shop.example.com' \
  --data-urlencode 'js=true' \
  --data-urlencode 'extract_rules={"products":{"selector":".product-item","type":"obj","multiple":true,"children":{"name":{"selector":".product-name","type":"text"},"price":{"selector":".price","type":"text"},"sku":{"type":"fn","fn":"return $parent.getAttribute(\"data-sku\");"},"inStock":{"type":"fn","fn":"return $parent.classList.contains(\"in-stock\");"},"rating":{"type":"fn","fn":"const stars = $parent.querySelectorAll(\".star.filled\").length; return stars;"}}},"pageConfig":{"type":"fn","fn":"return window._pageConfig_ || {};"},"totalItems":{"type":"fn","fn":"return document.querySelectorAll(\".product-item\").length;"}}' \
  -H "ApiKey: YOUR_API_KEY"
import { UjeebuClient } from '@ujeebu-org/ujeebu-sdk';

const client = new UjeebuClient(process.env.UJEEBU_API_KEY);

const rules = {
  products: {
    selector: '.product-item',
    type: 'obj',
    multiple: true,
    children: {
      name: { selector: '.product-name', type: 'text' },
      price: { selector: '.price', type: 'text' },
      sku: {
        type: 'fn',
        fn: "return $parent.getAttribute('data-sku');"
      },
      inStock: {
        type: 'fn',
        fn: "return $parent.classList.contains('in-stock');"
      },
      rating: {
        type: 'fn',
        fn: "const stars = $parent.querySelectorAll('.star.filled').length; return stars;"
      }
    }
  },
  pageConfig: {
    type: 'fn',
    fn: "return window._pageConfig_ || {};"
  },
  totalItems: {
    type: 'fn',
    fn: "return document.querySelectorAll('.product-item').length;"
  }
};

const result = await client.scrapeWithRules(
  'https://shop.example.com',
  rules,
  { js: true }
);

console.log(result);
from ujeebu_python import UjeebuClient

ujeebu = UjeebuClient(api_key="YOUR_API_KEY")

rules = {
    'products': {
        'selector': '.product-item',
        'type': 'obj',
        'multiple': True,
        'children': {
            'name': {'selector': '.product-name', 'type': 'text'},
            'price': {'selector': '.price', 'type': 'text'},
            'sku': {
                'type': 'fn',
                'fn': "return $parent.getAttribute('data-sku');"
            },
            'inStock': {
                'type': 'fn',
                'fn': "return $parent.classList.contains('in-stock');"
            },
            'rating': {
                'type': 'fn',
                'fn': "const stars = $parent.querySelectorAll('.star.filled').length; return stars;"
            }
        }
    },
    'pageConfig': {
        'type': 'fn',
        'fn': "return window._pageConfig_ || {};"
    },
    'totalItems': {
        'type': 'fn',
        'fn': "return document.querySelectorAll('.product-item').length;"
    }
}

result = ujeebu.scrape_with_rules(
    url='https://shop.example.com',
    extract_rules=rules,
    params={'js': True}
)

print(result)
package main

import (
	"fmt"
	"github.com/ujeebu/ujeebu-go"
)

// main runs the fn-type extraction rules with JavaScript rendering enabled
// through the Ujeebu Go SDK and prints the response and the credits reported.
func main() {
	// NOTE(review): assumes the last return value of NewClient and Scrape is
	// an error — confirm against the SDK.
	client, err := ujeebu.NewClient("YOUR-API-KEY")
	if err != nil {
		fmt.Println("creating client:", err)
		return
	}

	rules := map[string]interface{}{
		"products": map[string]interface{}{
			"selector": ".product-item",
			"type":     "obj",
			"multiple": true,
			"children": map[string]interface{}{
				"name":  map[string]string{"selector": ".product-name", "type": "text"},
				"price": map[string]string{"selector": ".price", "type": "text"},
				"sku": map[string]string{
					"type": "fn",
					"fn":   "return $parent.getAttribute('data-sku');",
				},
				"inStock": map[string]string{
					"type": "fn",
					"fn":   "return $parent.classList.contains('in-stock');",
				},
				// Counts the filled star elements inside each product item.
				"rating": map[string]string{
					"type": "fn",
					"fn":   "const stars = $parent.querySelectorAll('.star.filled').length; return stars;",
				},
			},
		},
		"pageConfig": map[string]string{
			"type": "fn",
			"fn":   "return window._pageConfig_ || {};",
		},
		"totalItems": map[string]string{
			"type": "fn",
			"fn":   "return document.querySelectorAll('.product-item').length;",
		},
	}

	response, credits, err := client.Scrape(ujeebu.ScrapeParams{
		URL:          "https://shop.example.com",
		JS:           true,
		ExtractRules: rules,
	})
	if err != nil {
		fmt.Println("scrape failed:", err)
		return
	}

	fmt.Printf("Credits used: %d\n", credits)
	fmt.Println(response)
}

Expected Response:

{
  "success": true,
  "result": {
    "products": [
      {
        "name": "Premium Headphones",
        "price": "$199.99",
        "sku": "HD-2024-BLK",
        "inStock": true,
        "rating": 5
      },
      {
        "name": "Wireless Mouse",
        "price": "$49.99",
        "sku": "MS-2024-GRY",
        "inStock": false,
        "rating": 4
      }
    ],
    "pageConfig": {
      "currency": "USD",
      "locale": "en-US",
      "version": "2.1.0"
    },
    "totalItems": 24
  }
}

TIP — Function Type Use Cases

The fn type is perfect for:

  • Extracting data from JavaScript variables (e.g., window.__NEXT_DATA__)
  • Accessing parent element attributes with $parent
  • Computing derived values (totals, averages, etc.)
  • Parsing JSON from script tags
  • Checking element states (classes, visibility)
  • Accessing localStorage/sessionStorage
  • Complex conditional logic

Advanced: Table and Regex Extraction

Extract product specifications from a table and SKU from text using table and regex types:

{
  "product_name": {
    "selector": "h1.product-title",
    "type": "text"
  },
  "sku": {
    "type": "regex",
    "selector": ".product-info",
    "pattern": "SKU:\\s*([A-Z0-9-]+)"
  },
  "specifications": {
    "selector": "table.specs",
    "type": "table"
  },
  "product_details": {
    "selector": "table.details",
    "type": "tableTranspose"
  },
  "price": {
    "type": "regex",
    "selector": ".price-text",
    "pattern": "\\$([0-9,.]+)"
  }
}
# Inside single quotes the shell keeps backslashes literally, so the JSON needs
# exactly two backslashes (\\s, \\$) to produce the regex tokens \s and \$.
curl -G 'https://api.ujeebu.com/scrape' \
  --data-urlencode 'url=https://shop.example.com/product/123' \
  --data-urlencode 'extract_rules={"product_name":{"selector":"h1.product-title","type":"text"},"sku":{"type":"regex","selector":".product-info","pattern":"SKU:\\s*([A-Z0-9-]+)"},"specifications":{"selector":"table.specs","type":"table"},"product_details":{"selector":"table.details","type":"tableTranspose"},"price":{"type":"regex","selector":".price-text","pattern":"\\$([0-9,.]+)"}}' \
  -H "ApiKey: YOUR_API_KEY"
import { UjeebuClient } from '@ujeebu-org/ujeebu-sdk';

const client = new UjeebuClient(process.env.UJEEBU_API_KEY);

// Extraction rules combining text, regex and table types.
// Regex patterns are plain strings, so one escaped backslash ('\\s', '\\$')
// yields the regex tokens \s and \$ once the rules are serialized.
const rules = {
  product_name: {
    selector: 'h1.product-title',
    type: 'text'
  },
  sku: {
    type: 'regex',
    selector: '.product-info',
    pattern: 'SKU:\\s*([A-Z0-9-]+)'
  },
  specifications: {
    selector: 'table.specs',
    type: 'table'
  },
  product_details: {
    selector: 'table.details',
    type: 'tableTranspose'
  },
  price: {
    type: 'regex',
    selector: '.price-text',
    pattern: '\\$([0-9,.]+)'
  }
};

const result = await client.scrapeWithRules(
  'https://shop.example.com/product/123',
  rules
);

console.log(result);
from ujeebu_python import UjeebuClient

ujeebu = UjeebuClient(api_key="YOUR_API_KEY")

rules = {
    'product_name': {
        'selector': 'h1.product-title',
        'type': 'text'
    },
    'sku': {
        'type': 'regex',
        'selector': '.product-info',
        'pattern': r'SKU:\s*([A-Z0-9-]+)'
    },
    'specifications': {
        'selector': 'table.specs',
        'type': 'table'
    },
    'product_details': {
        'selector': 'table.details',
        'type': 'tableTranspose'
    },
    'price': {
        'type': 'regex',
        'selector': '.price-text',
        'pattern': r'\$([0-9,.]+)'
    }
}

result = ujeebu.scrape_with_rules(
    url='https://shop.example.com/product/123',
    extract_rules=rules
)

print(result)
package main

import (
	"fmt"
	"github.com/ujeebu/ujeebu-go"
)

// main runs the text, regex and table extraction rules through the Ujeebu Go
// SDK and prints the response together with the credits reported by the call.
func main() {
	// NOTE(review): assumes the last return value of NewClient and Scrape is
	// an error — confirm against the SDK.
	client, err := ujeebu.NewClient("YOUR-API-KEY")
	if err != nil {
		fmt.Println("creating client:", err)
		return
	}

	// Raw string literals keep the regex backslashes as written.
	rules := map[string]interface{}{
		"product_name": map[string]string{
			"selector": "h1.product-title",
			"type":     "text",
		},
		"sku": map[string]string{
			"type":     "regex",
			"selector": ".product-info",
			"pattern":  `SKU:\s*([A-Z0-9-]+)`,
		},
		"specifications": map[string]string{
			"selector": "table.specs",
			"type":     "table",
		},
		"product_details": map[string]string{
			"selector": "table.details",
			"type":     "tableTranspose",
		},
		"price": map[string]string{
			"type":     "regex",
			"selector": ".price-text",
			"pattern":  `\$([0-9,.]+)`,
		},
	}

	response, credits, err := client.Scrape(ujeebu.ScrapeParams{
		URL:          "https://shop.example.com/product/123",
		ExtractRules: rules,
	})
	if err != nil {
		fmt.Println("scrape failed:", err)
		return
	}

	fmt.Printf("Credits used: %d\n", credits)
	fmt.Println(response)
}

Expected Response:

{
  "success": true,
  "result": {
    "product_name": "Premium Wireless Headphones",
    "sku": "HD-2024-BLK-256",
    "price": "199.99",
    "specifications": [
      {"feature": "Battery Life", "value": "40 hours"},
      {"feature": "Bluetooth", "value": "5.3"},
      {"feature": "Weight", "value": "250g"},
      {"feature": "Charging", "value": "USB-C"}
    ],
    "product_details": {
      "brand": "AudioTech",
      "model": "HD-2024",
      "color": "Black",
      "warranty": "2 years"
    }
  }
}

TIP — Table and Regex Use Cases

  • Table extraction: Product specifications, pricing tables, feature comparisons, data tables
  • Regex extraction: SKUs, prices, phone numbers, email addresses, dates, order numbers, tracking IDs
  • Combine both: Extract structured tabular data alongside pattern-based text extraction for comprehensive scraping

Advanced: E-commerce Product with Transformations

Extract and transform product data with clean, structured output:

{
  "products": {
    "selector": ".product-card",
    "type": "obj",
    "multiple": true,
    "children": {
      "name": {
        "selector": ".product-name",
        "type": "text",
        "transform": ["trim", "normalizeWhitespace"]
      },
      "price": {
        "selector": ".price",
        "type": "text",
        "transform": ["currency", ["round", 2]]
      },
      "original_price": {
        "selector": ".original-price",
        "type": "text",
        "transform": ["currency", ["default", null]]
      },
      "discount_percent": {
        "selector": ".discount",
        "type": "text",
        "transform": [["match", "(\\d+)%", 1], "integer"]
      },
      "rating": {
        "selector": ".rating",
        "type": "text",
        "transform": ["number", ["round", 1], ["default", 0]]
      },
      "reviews_count": {
        "selector": ".reviews",
        "type": "text",
        "transform": [["match", "(\\d+)", 1], "integer"]
      },
      "tags": {
        "selector": ".tags",
        "type": "text",
        "transform": [["split", ","], "compact", "unique", "lowercase"]
      },
      "in_stock": {
        "selector": ".stock-status",
        "type": "text",
        "transform": ["lowercase", "boolean"]
      },
      "images": {
        "selector": ".product-images img",
        "type": "image",
        "multiple": true,
        "transform": ["compact", "unique"]
      }
    }
  }
}
import { UjeebuClient } from '@ujeebu-org/ujeebu-sdk';

const client = new UjeebuClient(process.env.UJEEBU_API_KEY);

// Product-card extraction rules with transform pipelines.
// The match patterns are plain strings, so one escaped backslash ('\\d')
// yields the regex token \d once the rules are serialized.
const rules = {
  products: {
    selector: '.product-card',
    type: 'obj',
    multiple: true,
    children: {
      name: {
        selector: '.product-name',
        type: 'text',
        transform: ['trim', 'normalizeWhitespace']
      },
      price: {
        selector: '.price',
        type: 'text',
        transform: ['currency', ['round', 2]]
      },
      original_price: {
        selector: '.original-price',
        type: 'text',
        transform: ['currency', ['default', null]]
      },
      discount_percent: {
        selector: '.discount',
        type: 'text',
        transform: [['match', '(\\d+)%', 1], 'integer']
      },
      rating: {
        selector: '.rating',
        type: 'text',
        transform: ['number', ['round', 1], ['default', 0]]
      },
      reviews_count: {
        selector: '.reviews',
        type: 'text',
        transform: [['match', '(\\d+)', 1], 'integer']
      },
      tags: {
        selector: '.tags',
        type: 'text',
        transform: [['split', ','], 'compact', 'unique', 'lowercase']
      },
      in_stock: {
        selector: '.stock-status',
        type: 'text',
        transform: ['lowercase', 'boolean']
      },
      images: {
        selector: '.product-images img',
        type: 'image',
        multiple: true,
        transform: ['compact', 'unique']
      }
    }
  }
};

const result = await client.scrapeWithRules(
  'https://shop.example.com',
  rules
);

console.log(result);
from ujeebu_python import UjeebuClient

ujeebu = UjeebuClient(api_key="YOUR_API_KEY")

rules = {
    'products': {
        'selector': '.product-card',
        'type': 'obj',
        'multiple': True,
        'children': {
            'name': {
                'selector': '.product-name',
                'type': 'text',
                'transform': ['trim', 'normalizeWhitespace']
            },
            'price': {
                'selector': '.price',
                'type': 'text',
                'transform': ['currency', ['round', 2]]
            },
            'original_price': {
                'selector': '.original-price',
                'type': 'text',
                'transform': ['currency', ['default', None]]
            },
            'discount_percent': {
                'selector': '.discount',
                'type': 'text',
                'transform': [['match', r'(\d+)%', 1], 'integer']
            },
            'rating': {
                'selector': '.rating',
                'type': 'text',
                'transform': ['number', ['round', 1], ['default', 0]]
            },
            'reviews_count': {
                'selector': '.reviews',
                'type': 'text',
                'transform': [['match', r'(\d+)', 1], 'integer']
            },
            'tags': {
                'selector': '.tags',
                'type': 'text',
                'transform': [['split', ','], 'compact', 'unique', 'lowercase']
            },
            'in_stock': {
                'selector': '.stock-status',
                'type': 'text',
                'transform': ['lowercase', 'boolean']
            },
            'images': {
                'selector': '.product-images img',
                'type': 'image',
                'multiple': True,
                'transform': ['compact', 'unique']
            }
        }
    }
}

result = ujeebu.scrape_with_rules(
    url='https://shop.example.com',
    extract_rules=rules
)

print(result)
package main

import (
	"fmt"
	"github.com/ujeebu/ujeebu-go"
)

// main runs the product-card rules with transform pipelines through the
// Ujeebu Go SDK and prints the response and the credits reported by the call.
func main() {
	// NOTE(review): assumes the last return value of NewClient and Scrape is
	// an error — confirm against the SDK.
	client, err := ujeebu.NewClient("YOUR-API-KEY")
	if err != nil {
		fmt.Println("creating client:", err)
		return
	}

	rules := map[string]interface{}{
		"products": map[string]interface{}{
			"selector": ".product-card",
			"type":     "obj",
			"multiple": true,
			"children": map[string]interface{}{
				"name": map[string]interface{}{
					"selector":  ".product-name",
					"type":      "text",
					"transform": []string{"trim", "normalizeWhitespace"},
				},
				"price": map[string]interface{}{
					"selector":  ".price",
					"type":      "text",
					"transform": []interface{}{"currency", []interface{}{"round", 2}},
				},
				"original_price": map[string]interface{}{
					"selector":  ".original-price",
					"type":      "text",
					"transform": []interface{}{"currency", []interface{}{"default", nil}},
				},
				// Raw string literals keep the regex backslashes as written.
				"discount_percent": map[string]interface{}{
					"selector":  ".discount",
					"type":      "text",
					"transform": []interface{}{[]interface{}{"match", `(\d+)%`, 1}, "integer"},
				},
				"rating": map[string]interface{}{
					"selector":  ".rating",
					"type":      "text",
					"transform": []interface{}{"number", []interface{}{"round", 1}, []interface{}{"default", 0}},
				},
				"reviews_count": map[string]interface{}{
					"selector":  ".reviews",
					"type":      "text",
					"transform": []interface{}{[]interface{}{"match", `(\d+)`, 1}, "integer"},
				},
				"tags": map[string]interface{}{
					"selector":  ".tags",
					"type":      "text",
					"transform": []interface{}{[]string{"split", ","}, "compact", "unique", "lowercase"},
				},
				"in_stock": map[string]interface{}{
					"selector":  ".stock-status",
					"type":      "text",
					"transform": []string{"lowercase", "boolean"},
				},
				"images": map[string]interface{}{
					"selector":  ".product-images img",
					"type":      "image",
					"multiple":  true,
					"transform": []string{"compact", "unique"},
				},
			},
		},
	}

	response, credits, err := client.Scrape(ujeebu.ScrapeParams{
		URL:          "https://shop.example.com",
		ExtractRules: rules,
	})
	if err != nil {
		fmt.Println("scrape failed:", err)
		return
	}

	fmt.Printf("Credits used: %d\n", credits)
	fmt.Println(response)
}

Expected Response:

{
  "success": true,
  "result": {
    "products": [
      {
        "name": "Wireless Headphones Premium",
        "price": 199.99,
        "original_price": 249.99,
        "discount_percent": 20,
        "rating": 4.5,
        "reviews_count": 1234,
        "tags": ["electronics", "audio", "wireless"],
        "in_stock": true,
        "images": [
          "https://cdn.example.com/img1.jpg",
          "https://cdn.example.com/img2.jpg"
        ]
      },
      {
        "name": "Smart Watch Pro",
        "price": 299.99,
        "original_price": null,
        "discount_percent": null,
        "rating": 4.8,
        "reviews_count": 567,
        "tags": ["electronics", "wearable", "fitness"],
        "in_stock": false,
        "images": [
          "https://cdn.example.com/watch1.jpg"
        ]
      }
    ]
  }
}

What This Example Demonstrates:

  • Price cleaning: Convert "$199.99" to 199.99 (number)
  • Discount extraction: Extract 20 from "Save 20%!"
  • Rating normalization: Round 4.567 to 4.6
  • Review count extraction: Extract 1234 from "1234 reviews" (the (\d+) pattern stops at the first non-digit, so "1,234 reviews" would yield 1)
  • Tag processing: Split, clean, deduplicate, and lowercase tags
  • Boolean conversion: Convert "in stock" to true
  • Array deduplication: Remove duplicate images
  • Fallback values: Use null when original price is missing

SUCCESS — Transformation Best Practices

When building extraction rules with transformations:

  • Chain transforms logically: Start with text cleaning (trim), then extraction (match, split), then type conversion (number, boolean)
  • Use defaults for optional fields: Supply a fallback such as ["default", 0], or use ["default", null] to make a missing value explicit
  • Clean arrays: Use compact to remove empty values and unique to deduplicate
  • Parse numbers correctly: Use currency for prices, number for decimals, integer for whole numbers
  • Extract before converting: Use match to extract patterns before converting types
  • Test incrementally: Build your transform pipeline step by step to catch errors early

Parameters

Parameter Type Required Default Description
url string Yes - The URL to scrape.
extract_rules json-string Yes - JSON object defining extraction rules.
js boolean No false Enable JavaScript rendering before extraction.
wait_for string | number No null Delay (number) or CSS selector (string) to wait for before extraction.
timeout number No 60 Maximum seconds before request timeout.
proxy_type string No rotating Proxy type: 'rotating', 'premium', 'residential', 'custom'.

Response Format

The API returns a JSON response with the extracted data in the result field:

{
  "success": true,
  "result": {
    "key_name": "extracted_value",
    "another_key": ["array", "of", "values"]
  }
}

If extraction fails for a selector, the value will be null, or an empty array when the rule sets multiple to true.

Ready to build?

Spin up an API key in 60 seconds

Free tier: 5,000 credits, no card, full access to every endpoint on this page.