Sharing: A blog post indexing Python script for RediSearch v1.2

I was working on getting RediSearch indexing of my blog posts and came up with this script, which works with the default Debian 12 RediSearch version, 1.2. It will not work with RediSearch version 2 or later, as indexing is handled differently in version 2.

  • You need a working copy of RediSearch v1.2 loaded into your Redis cache. See the following link to enable RediSearch in Frappe: https://docs.frappe.io/erpnext/user/manual/en/installing_redisearch_to_enable_super_fast_e_commerce_search
    NB: if you are using Debian 12, just sudo apt install redisearch and point your frappe config/redis_cache.conf at the installed redisearch.so as the shared object file (see the example line just below).
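
    For reference, the loadmodule line in config/redis_cache.conf ends up looking something like this. The module path below is an assumption; confirm the real one with dpkg -L redisearch.

# config/redis_cache.conf
# Example module path only; verify yours with: dpkg -L redisearch | grep '\.so'
loadmodule /usr/lib/redis/modules/redisearch.so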

  • Create a file in your app and paste the code below. I placed it in a folder called api and named the file blog_post_search.py

    e.g. /yourappname/yourappname/api/blog_post_search.py

import frappe
import datetime
from redis import Redis
from redis.commands.search.query import Query
from frappe.utils import cstr, strip_html_tags
import logging

# Enable detailed logging
logging.basicConfig(level=logging.DEBUG)

# Parse host and port out of the redis_cache URL, e.g. "redis://localhost:13000"
redis_host = frappe.conf.redis_cache.split("//")[1].split(":")[0]
redis_port = int(frappe.conf.redis_cache.split(":")[-1])
client = Redis(host=redis_host, port=redis_port, decode_responses=True)


def build_blog_posts_index():
    # The index schema is defined inline in the raw FT.CREATE call below
    # (hash-based, which is what this RediSearch 1.2 setup expects).

    # Function to preprocess documents (convert datetime to string)
    def preprocess_document(doc):
        if isinstance(doc.get("published_on"), datetime.date):
            doc["published_on"] = doc["published_on"].isoformat()
        # Ensure content is a string and not empty
        doc["content"] = strip_html_tags(cstr(doc.get("content", "")))
        if not doc["content"]:
            logging.warning(
                f"Empty content after preprocessing for doc: {doc.get('title')}"
            )
        return doc

    # Fetch blog posts from frappe backend
    try:
        response = frappe.get_all(
            "Blog Post",
            fields=[
                "title",
                "blog_intro",
                "content",
                "blogger",
                "published_on",
                "read_time",
                "meta_image",
                "route",
            ],
        )
        blog_posts = response
    except Exception as e:
        logging.error(f"Failed to fetch blog posts: {e}")
        return

    # Create index and clear existing keys
    try:
        indexes = client.execute_command("FT._LIST")
        if "idx:blog-posts" in indexes:
            logging.info("Deleting Index")
            client.execute_command("FT.DROP", "idx:blog-posts")
            for key in client.keys("blog:*"):
                client.delete(key)
        else:
            logging.info("Index 'idx:blog-posts' does not exist.")

        # Create hash-based index
        client.execute_command(
            "FT.CREATE",
            "idx:blog-posts",
            "ON",
            "HASH",
            "PREFIX",
            1,
            "blog:",
            "SCHEMA",
            "title",
            "TEXT",
            "SORTABLE",
            "blog_intro",
            "TEXT",
            "SORTABLE",
            "content",
            "TEXT",
            "WEIGHT",
            1.0,  # Match title weight
            "blogger",
            "TEXT",
            "SORTABLE",
            "published_on",
            "TEXT",
            "SORTABLE",
            "read_time",
            "NUMERIC",
            "SORTABLE",
            "meta_image",
            "TEXT",
            "NOINDEX",
            "SORTABLE",
            "route",
            "TEXT",
        )
        logging.info("Blog Posts Index created successfully")
    except Exception as e:
        logging.error(f"Blog Posts Index creation failed: {e}")
        return

    # Index documents as hashes
    for i, doc in enumerate(blog_posts):
        key = f"blog:{i + 1}"  # assign the key first so the except block can reference it
        try:
            doc = preprocess_document(doc)
            client.hset(key, mapping=doc)
            # logging.debug(f"Indexed document {key}: {doc}")
        except Exception as e:
            logging.error(f"Error indexing document {key}: {e}")

    client.close()


@frappe.whitelist(allow_guest=True)
def search_posts(
    query: str = "", sort_by: str = "", sort_order: str = "False", field: str = ""
):
    logging.info(
        f"Search query: {query}, field: {field}, sort_by: {sort_by}, sort_order: {sort_order}"
    )

    try:
        # Validate field
        valid_fields = [
            "title",
            "blog_intro",
            "content",
            "blogger",
            "published_on",
            "route",
        ]
        if field and field not in valid_fields:
            logging.error(f"Invalid field: {field}. Must be one of {valid_fields}")
            return {"success": False, "error": f"Invalid field: {field}"}

        # Convert sort_order string to boolean
        sort_order_bool = sort_order.lower() == "true" if sort_by else False

        # Build query: field-specific or global
        search_query_str = (
            f"@{field}:{query}" if field and query else f"{query}*" if query else "*"
        )
        search_query = Query(search_query_str).paging(0, 10)
        if sort_by:
            search_query = search_query.sort_by(sort_by, asc=sort_order_bool)

        logging.debug(f"RediSearch query: {search_query.query_string()}")

        # Execute search
        results = client.ft("idx:blog-posts").search(search_query)
        hits = [doc.__dict__ for doc in results.docs]

        logging.info(f"Search returned {len(hits)} hits")
        return {"success": True, "hits": hits}
    except Exception as e:
        logging.error(f"Search failed: {str(e)}")
        return {"success": False, "error": str(e)}
    finally:
        client.close()
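
Before wiring up the scheduler, you can smoke-test the whitelisted endpoint over HTTP. Here is a minimal sketch, assuming the site answers on localhost:8000 and that the v2 API wraps the method's return value in a "data" key; the URL and app name are placeholders to adjust:

import requests

# Hypothetical URL: substitute your site's host/port and your real app name
url = "http://localhost:8000/api/v2/method/yourappname.api.blog_post_search.search_posts"
resp = requests.post(url, json={"query": "mega"})
resp.raise_for_status()
print(resp.json())  # e.g. {"data": {"success": True, "hits": [...]}}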

  • Add a cron task to your hooks.py as below. This runs the indexing function every 15 minutes.
# Scheduled Tasks
# ---------------
scheduler_events = {
    "cron": {
        "*/15 * * * *": ["yourappname.api.blog_post_search.build_blog_posts_index"],
    },
}
  • Run the command bench execute yourappname.api.blog_post_search.build_blog_posts_index from your bench folder to build an index. Watch the terminal output for any errors.

  • Run the command bench execute --kwargs "{'query': 'enter_your_text_to_search_for'}" yourappname.api.blog_post_search.search_posts
    It should return something like this:

INFO:root:Search query: mega, field: , sort_by: , sort_order: False
DEBUG:root:RediSearch query: mega*
INFO:root:Search returned 2 hits
{"success": true, "hits": [{"id": "blog:2", "payload": null, "meta_image": "/files/bread_board5fbf34.jpg", "content": "Lorem ipsum odor amet, consectetuer adipiscing elit.Felis integer euismod at ac; sollicitudin quisque. Libero torquent ipsum fames maximus mi. Proin sodales dapibus faucibus efficitur litora. Proin ex consequat non nostra malesuada mollis dis neque lectus. Porta mus venenatis amet platea penatibus sed nascetur. Elementum primis adipiscing magna ante nam.Vehicula placerat tempus semper aptent nullam aptent velit quis purus.Tincidunt vivamus egestas eros a ultricies. Ex nam suspendisse mus dolor in est natoque pretium adipiscing. At blandit eros, duis mattis et feugiat congue ipsum. Eu proin dolor viverra arcu sapien. Cursus magnis montes rutrum nullam neque turpis eget nulla. Augue ultricies etiam cubilia vulputate aliquam mi. Inceptos parturient habitant varius amet in ex eu.Nascetur maecenas turpis, netus fermentum est euismod urna porta? Diam netus tempor tortor imperdiet sociosqu. Hendrerit dignissim dis sollicitudin primis adipiscing lectus. Blandit nunc viverra felis penatibus scelerisque magnis habitant rhoncus facilisi. Sociosqu at velit pretium nullam consequat etiam a. Ut pharetra aliquam venenatis libero ultrices leo eleifend donec mollis. Nulla nisl imperdiet dictum eget consequat nibh morbi gravida.Semper vehicula penatibus sollicitudin, et magnis nullam. Etiam sapien proin massa imperdiet mi cubilia ante. Ante finibus duis, at ultricies euismod auctor metus. Luctus odio efficitur duis consequat, egestas fames maximus.Aliquet volutpat sit volutpat nisi montes, fringilla vivamus. Iaculis inceptos tempus suscipit purus potenti non aliquam.Aliquam arcu dignissim fermentum inceptos, ac aptent consectetur. Sit congue finibus morbi curabitur ullamcorper. Sem himenaeos proin malesuada vehicula adipiscing tempor eleifend. Justo risus blandit tempor dis conubia sociosqu nisi scelerisque. Luctus pretium per, dapibus metus magna senectus ante. Nascetur varius ridiculus ullamcorper eros adipiscing. Ridiculus rhoncus enim aenean non lobortis suscipit id aenean metus.", "title": "A shop featured blog post", "published_on": "2025-01-04", "route": "blog/testing/a-shop-featured-blog-post", "blog_intro": "This shop featured blog post will be displayed in the shop mega menu.", "read_time": "2", "blogger": "Christopher Robert Nuss"}, {"id": "blog:4", "payload": null, "meta_image": "/files/jSlWVMN.jpg", "content": "FishLorem ipsum odor amet, consectetuer adipiscing elit. Felis integer euismod at ac; sollicitudin quisque. Libero torquent ipsum fames maximus mi. Proin sodales dapibus faucibus efficitur litora. Proin ex consequat non nostra malesuada mollis dis neque lectus. Porta mus venenatis amet platea penatibus sed nascetur. Elementum primis adipiscing magna ante nam.Vehicula placerat tempus semper aptent nullam aptent velit quis purus.Tincidunt vivamus egestas eros a ultricies. Ex nam suspendisse mus dolor in est natoque pretium adipiscing. At blandit eros, duis mattis et feugiat congue ipsum. Eu proin dolor viverra arcu sapien. Cursus magnis montes rutrum nullam neque turpis eget nulla. Augue ultricies etiam cubilia vulputate aliquam mi. Inceptos parturient habitant varius amet in ex eu.Nascetur maecenas turpis, netus fermentum est euismod urna porta? Diam netus tempor tortor imperdiet sociosqu. Hendrerit dignissim dis sollicitudin primis adipiscing lectus. Blandit nunc viverra felis penatibus scelerisque magnis habitant rhoncus facilisi. 
Sociosqu at velit pretium nullam consequat etiam a. Ut pharetra aliquam venenatis libero ultrices leo eleifend donec mollis. Nulla nisl imperdiet dictum eget consequat nibh morbi gravida.Semper vehicula penatibus sollicitudin, et magnis nullam. Etiam sapien proin massa imperdiet mi cubilia ante. Ante finibus duis, at ultricies euismod auctor metus. Luctus odio efficitur duis consequat, egestas fames maximus.Aliquet volutpat sit volutpat nisi montes, fringilla vivamus. Iaculis inceptos tempus suscipit purus potenti non aliquam.Aliquam arcu dignissim fermentum inceptos, ac aptent consectetur. Sit congue finibus morbi curabitur ullamcorper. Sem himenaeos proin malesuada vehicula adipiscing tempor eleifend. Justo risus blandit tempor dis conubia sociosqu nisi scelerisque. Luctus pretium per, dapibus metus magna senectus ante. Nascetur varius ridiculus ullamcorper eros adipiscing. Ridiculus rhoncus enim aenean non lobortis suscipit id aenean metus.", "title": "A Featured Wattle Seed Blog Post", "published_on": "2025-01-04", "route": "blog/testing/a-featured-wattle-seed-blog-post", "blog_intro": "This blog post will be featured in the wattle seed mega menu.", "read_time": "2", "blogger": "Christopher Robert Nuss"}]}
  • If you're running a production server, or you have enabled supervisor on a dev server you are testing this on, run sudo supervisorctl reread and then sudo supervisorctl restart all to restart the Redis server.
  • Run bench restart on a dev server, or bench restart --supervisor on a production server.
  • Run bench --site yoursitename migrate to pick up the change in your hooks.py.

Check the Scheduled Job Log for errors! If it works you can comment out the logging calls.

Ohh, you’ll have to create a web page or some such that passes a query to the API and deals with the returned data. Something like this bit of Nuxt script setup code that I use:

// API base URL from environment variable
const { apiBase } = useRuntimeConfig().public;
const apiEndpoint = `${apiBase}/api/v2/method/littlebunyip.api.blog_post_search.search_posts`

// Reactive state referenced below (declared here so the snippet is self-contained)
const searchQuery = ref('')
const sortBy = ref('')
const posts = ref([])
const loading = ref(false)
const error = ref(null)
const showSearchResults = ref(false)
const showNoSearchResults = ref(false)

// Search function
const search = async () => {
  loading.value = true
  error.value = null
  posts.value = []
  showNoSearchResults.value = false;

  try {
    const { data: response } = await $fetch(apiEndpoint, {
      method: 'POST',
      body: {
        query: searchQuery.value,
        sort_by: sortBy.value,
        sort_order: 'True', // Default to ascending, adjust as needed
      },
    })

    if (response.success && response.hits.length) {
      showSearchResults.value = true;
      posts.value = response.hits
    } else {
      showNoSearchResults.value = true;
    }
  } catch (err) {
    error.value = `Error: ${err.message}`
  } finally {
    loading.value = false
  }
}

Good luck and I hope someone finds it useful.