Source: tools/webPageScraper.js

const Agent = require('../agent.js').Agent
const BaseTool = require('./baseToolClass.js')
const Summarizer = require('../utils/summarizer.js')
const countTokens = require('../utils/countTokens.js')
const { saveTextToIndexedDB } = require('../utils/indexedDB.js')

class WebPageScraper extends BaseTool {
  static identifier = 'WebPageScraper'
  /**
   * @param {Agent} agent
   */
  constructor(agent) {
    super(WebPageScraper.identifier)
    this.agent = agent
  }

  /**
   * Dictionary of arguments for the tool.
   * @type {Object.<string, string>}
   */
  get args() {
    return {
      url: 'The URL of the web page to scrape',
      question: 'The question for summarization',
    }
  }

  /**
   * Response format of the tool.
   * @type {Object.<string, any>}
   */
  get resp() {
    return {
      text: 'The summarized text from the web page',
      links: 'The extracted links from the web page',
    }
  }

  /**
   * Scrapes a web page using the provided URL.
   * @param {string} url - The URL of the web page.
   * @returns {Promise<{ text: string | null, links: Array<string|null> }>} The scraped data from the web page.
   */
  async scrapeWebPage(url) {
    let text = null
    /**
     * @type {Array<string|null>}
     */
    let links = []

    try {
      const response = await fetch(url)
      const html = await response.text()

      const parser = new DOMParser()
      const doc = parser.parseFromString(html, 'text/html')

      // Remove script and style tags so they don't pollute the extracted text
      doc
        .querySelectorAll('script, style')
        .forEach((element) => element.remove())

      // Extract links from the web page
      const linkElements = doc.querySelectorAll('a')
      links = Array.from(linkElements)
        .map((element) => element.getAttribute('href'))
        .slice(0, 5)

      // `textContent` on the Document node itself is always null, so read the
      // visible text from the root element instead
      text = doc.documentElement.textContent

      return { links, text }
    } catch (error) {
      console.error('An error occurred while scraping the web page:', error)
      return { links, text }
    }
  }
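
  // Note: scrapeWebPage() parses the page locally, so it assumes a browser-like
  // environment where `fetch` and `DOMParser` are available globally (e.g. the
  // browser, or Node with a DOM shim such as jsdom). run() below relies on the
  // remote scraper API via scrapeWebPageAPI() instead.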

  /**
   * @param {string} url
   * @param {string} selector
   */
  async scrapeWebPageAPI(url, selector) {
    try {
      const apiUrl = `https://web.scraper.workers.dev?url=${encodeURIComponent(
        url
      )}&selector=${encodeURIComponent(selector)}&scrape=text`
      const response = await fetch(apiUrl)
      const data = await response.json()
      const { result } = data
      const text = result ? result[selector] : 'No data'

      return text
    } catch (error) {
      console.error('An error occurred while scraping the web page:', error)
      return null
    }
  }
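
  // Illustrative request/response for scrapeWebPageAPI (the response shape is
  // inferred from the destructuring above; the worker's actual contract may differ):
  //   GET https://web.scraper.workers.dev?url=https%3A%2F%2Fexample.com&selector=title&scrape=text
  //   -> { "result": { "title": "Example Domain" } }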

  /**
   * Executes the web page scraping and summarization.
   * @param {Array<{url: string, question: string}>|{url: string, question: string}} args
   * @returns {Promise<{ text: string | null, links: Array<string|null> } | string>} The scraped data and summarized text.
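   * @example
   * // Illustrative call (assumes an initialized Agent and network access); the
   * // result is an error string for bad arguments, otherwise { text, links }.
   * const result = await new WebPageScraper(agent).run({
   *   url: 'https://example.com',
   *   question: 'What is this page about?',
   * })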
   */
  async run(args) {
    if (!args) {
      return 'Error: args is missing'
    }

    if (Array.isArray(args) && args.length === 0) {
      return 'Error: args array is empty'
    }

    // Accept either a single { url, question } object or an array whose first
    // element carries the arguments
    const { url: _url, question: _question } = Array.isArray(args)
      ? args[0]
      : args

    if (!_url && !_question) {
      return 'Error: both url and question arguments are missing'
    } else if (!_url) {
      return 'Error: url argument is missing'
    } else if (!_question) {
      return 'Error: question argument is missing'
    }

    let text = ''

    const context = {
      title: '',
      url: _url,
      question: _question,
    }
    /**
     * @type {Array<string|null>}
     */
    let links = []

    // Save the text for later summarization
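    // saveTextToIndexedDB (see ../utils/indexedDB.js) is expected to resolve with
    // the key under which the record was stored; that key is surfaced in the tool
    // output so the saved text can be summarized in a later step.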
    const saveForLater = async () => {
      const storageKey = await saveTextToIndexedDB(
        'unsummarized_texts',
        context,
        text
      )
      text = `Information saved for later summarization, key: '${storageKey}'`
    }

    try {
      const pagetitle = await this.scrapeWebPageAPI(_url, 'title')
      const results = await this.scrapeWebPageAPI(_url, 'body')

      if (results) {
        context.title = JSON.stringify(pagetitle)
        text = JSON.stringify(results)

        const tokenCount = await countTokens(text)
        const maxTokensThreshold = 800 // Threshold for immediate summarization
        const rateLimitThreshold = 5000 // Maximum tokens to process within the rate limit
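        // Routing by size: pages at or below maxTokensThreshold are summarized in
        // a single completion, pages up to rateLimitThreshold are chunked and
        // summarized in parallel, and anything larger is saved for later.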
        // @ts-ignore -- `openaiApiKey` is expected to be set on the tool instance
        const summarizer = new Summarizer(this.openaiApiKey)

        if (tokenCount > maxTokensThreshold) {
          if (tokenCount <= rateLimitThreshold) {
            // Split into chunks of at most maxTokensThreshold tokens and
            // summarize the chunks in parallel
            const parallelProcesses = Math.ceil(tokenCount / maxTokensThreshold)
            try {
              text = await summarizer.parallelizeSummarization(
                context,
                text,
                maxTokensThreshold,
                parallelProcesses
              )

              console.debug({ tokenCount, parallelProcesses, text })
            } catch (error) {
              console.error({
                error,
                context,
                text,
                maxTokensThreshold,
                parallelProcesses,
              })
              throw new Error('Error occurred during parallel summarization')
            }
          } else {
            await saveForLater()
          }
        } else {
          // Summarize the text immediately
          const response = await summarizer.openAIComplete(
            summarizer.summarizePrompt({ text, title: pagetitle })
          )
          text = response.choices[0].message.content
        }

        // TODO: use another tool to grab links, currently only pushing in page url
        links.push(_url)
        // Save the summarized text in IndexedDB
        const indexKey = await saveTextToIndexedDB(
          'web_page_scraper_results',
          context,
          text
        )
        await this._addToMemory(pagetitle, { text, url: _url, indexKey })
      }
    } catch (apiError) {
      console.error(
        'An error occurred while scraping the web page using the API method:',
        apiError
      )

      throw new Error('Critical error, threads should be ended')
    }

    return { text, links }
  }

  /**
   * @param {any} pagetitle
   * @param {{ text: string, url: string, indexKey: string }} results
   */
  async _addToMemory(pagetitle, results) {
    if (this.agent.memory) {
      let entry = `Summary for ${pagetitle}:\n`
      entry += `\t${results.text}: {${results.url}} -- ${results.indexKey}\n`
      entry += '\n'

      // Avoid storing duplicate entries in the agent's memory
      const memoryEntries = this.agent.memory.docs.filter(
        (/** @type {string} */ doc) => doc === entry
      )
      if (memoryEntries.length === 0) {
        await this.agent.memory.add(entry)
      }
    }
  }
}

module.exports = WebPageScraper