import * as puppeteerExtra from 'puppeteer-extra';
import { DEFAULT_INTERCEPT_RESOLUTION_PRIORITY } from 'puppeteer';
import AdblockerPlugin from 'puppeteer-extra-plugin-adblocker';
import { TidyURL } from 'tidy-url';
import path from 'node:path';
import * as cheerio from 'cheerio';
import * as tools from './tools.js';
import * as chat from './chat.js';
import * as u from './utils.js';

const puppeteer = puppeteerExtra.default;

// Keep tidy-url quiet: replace its logger with a no-op and enable silent mode.
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- `log` is not part of the typed surface used here
( TidyURL as any ).log = () => {};
TidyURL.config.silent = true;

/** Maximum number of characters of page text returned by a WEB request. */
export let LEN_MAX = 10000;
/** Configuration flag set through `setConfig`; it is not read anywhere in this file. */
export let NO_VERIFY = false;

/** True once a WEB request has been issued; cleared by `resetUseCount`. */
let didWget = false;
/** Shared browser instance; stays `null` until `launchBrowser` has completed. */
let browser: Awaited< ReturnType< typeof puppeteer.launch > > | null = null;

/** Shape of the message `run` hands back for a WEB tool call. */
export interface ToolMessage {
    timestamp: ReturnType< typeof u.getTimestamp >;
    role: "tool";
    content: string;
}

// Cheerio selections are passed around untyped in this file.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
type CheerioSelection = any;

/** Attributes that survive `removeIllegalAttributes`. */
const ALLOWED_ATTRIBUTES = [ 'src', 'href', 'alt', 'title', 'label' ];

/**
 * Applies optional settings to this module.
 *
 * Both the camelCase (`lenMax`, `noVerify`) and snake_case (`len_max`,
 * `no_verify`) spellings are accepted; previously one spelling was tested and
 * the other assigned, which could set the exported values to `undefined`.
 *
 * @param config - Settings object; unknown keys are ignored.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- kept loose so existing callers keep compiling
export function setConfig( config: any ) {
    if( !config ) return;

    const lenMax = Number( config.lenMax ?? config.len_max );
    if( Number.isFinite( lenMax ) && lenMax > 0 ) LEN_MAX = lenMax;

    const noVerify = config.noVerify ?? config.no_verify;
    if( noVerify != null ) NO_VERIFY = Boolean( noVerify );
}

/**
 * Launches the shared headless browser with the adblocker plugin registered
 * and the unpacked extension found at the relative path `nocookies` loaded.
 */
export async function launchBrowser() {
    const extensionPath = path.join( './', 'nocookies' );
    const plugin = AdblockerPlugin( {
        interceptResolutionPriority: DEFAULT_INTERCEPT_RESOLUTION_PRIORITY,
        blockTrackers: true
    } );
    puppeteer.use( plugin );
    browser = await puppeteer.launch( {
        headless: true,
        args: [
            '--disable-translate',
            '--lang=en-US,en',
            '--no-sandbox',
            `--disable-extensions-except=${extensionPath}`,
            `--load-extension=${extensionPath}`,
        ]
    } );
}

/** Clears the "a WEB request was already made" flag so `run` may fetch again. */
export async function resetUseCount() {
    didWget = false;
}

/** Wraps `content` in a tool message stamped with the current timestamp. */
function toolMessage( content: string ) : ToolMessage {
    return { timestamp: u.getTimestamp(), role: "tool", content };
}

/**
 * Executes a WEB tool call: fetches `parameters.url` and returns the trimmed
 * page text as a tool message.
 *
 * @param toolCall - Tool call whose `parameters.url` names the page to fetch.
 * @returns `null` when no usable URL was supplied; otherwise a tool message
 *          holding the page text, the fetch error message, or a refusal when
 *          a WEB request was already made since the last `resetUseCount`.
 */
export async function run( toolCall: tools.Call ) : Promise< ToolMessage | null > {
    if( !toolCall.parameters ) return null;

    const rawUrl: unknown = toolCall.parameters.url;
    if( typeof rawUrl !== "string" || rawUrl === "" ) return null;
    const url = rawUrl.startsWith( "http" ) ? rawUrl : "http://" + rawUrl;

    if( didWget )
        return toolMessage( "WEB: system error - you cannot issue multiple WEB requests in a row." );

    // Set before fetching, so a failed fetch also counts as the one allowed request.
    didWget = true;

    try {
        return toolMessage( await getWebContents( url ) );
    }
    catch( e: unknown ) {
        console.log( e );
        return toolMessage( e instanceof Error ? e.message : String( e ) );
    }
}

/**
 * Loads `url` in a fresh browser page and returns its content, trimmed when it
 * looks like HTML, cut to `LEN_MAX` characters and followed by an instruction
 * footer.
 *
 * @returns The processed page text, or `""` when no browser has been launched.
 * @throws Error when navigation yields no response or a non-OK status other than 304.
 */
async function getWebContents( url: string ) : Promise< string > {
    if( !browser ) return "";

    console.log( "\n\x1b[33mfetching " + url + "...\x1b[0m" );

    const page = await browser.newPage();
    let response: string;
    try {
        const res = await page.goto( url, { waitUntil: 'networkidle0' } );
        if( !res ) throw new Error( "failed to fetch " + url );
        if( !res.ok() && res.status() !== 304 )
            throw new Error( "failed to fetch " + url + "\nstatus code: " + res.status() );
        response = await page.content();
    }
    finally {
        // Close the page on success and on failure; a close error must not hide the fetch error.
        await page.close().catch( ( e: unknown ) => console.log( "failed to close page: ", e ) );
    }

    let text = response;
    const origLen = text.length;
    if( isHtml( text ) ) text = trimHtml( text, url );
    const trimmedLen = text.length; // measured before the LEN_MAX cut below
    if( text.length > LEN_MAX ) text = text.slice( 0, LEN_MAX );

    console.log( `\x1b[32moriginal length: ${origLen}, trimmed: ${trimmedLen}. thinking...\x1b[0m` );

    // NOTE(review): "please make not to" reads like a typo for "please make sure not to";
    // the text is left unchanged here because it is part of the model-facing prompt.
    text += "\nthe above is the result of your WEB request. please make not to make anything up and that the information is accurate to the result of your WEB request. make sure your response is not repetitive. do not include the html source code unless prompted. do not attempt another WEB request.";
    return text;
}

/** Returns true when `html` contains something tag-like (`<...>`) or an entity (`&name;`, `&#123;`). */
function isHtml( html: string ) {
    // A non-global regex is enough for a yes/no test.
    return /<[^>]*>|&\w+;|&#?\d{1,8};/.test( html );
}

/** Returns true when any class name on `el` contains "cookie" or "consent". */
function isCookieClass( el: CheerioSelection ) {
    const classes: string | undefined = el.attr( "class" );
    if( !classes ) return false;
    return classes.split( ' ' ).some( ( name ) => name.includes( 'cookie' ) || name.includes( 'consent' ) );
}

/** Returns true when the inline style of `el` gives a width or height whose leading number is below 5. */
function isTooSmall( el: CheerioSelection ) {
    const width = el.css( 'width' );
    const height = el.css( 'height' );
    // parseInt yields NaN for values such as "auto"; NaN < 5 is false, so those are kept.
    if( width && parseInt( width, 10 ) < 5 ) return true;
    if( height && parseInt( height, 10 ) < 5 ) return true;
    return false;
}

/** Removes every attribute of the first node in `el` that is not listed in `ALLOWED_ATTRIBUTES`. */
function removeIllegalAttributes( el: CheerioSelection ) {
    const node = el.get( 0 );
    if( !node || !node.attribs ) return;
    // Iterate over a snapshot of the keys because removeAttr mutates `attribs`.
    for( const key of Object.keys( node.attribs ) ) {
        if( !ALLOWED_ATTRIBUTES.includes( key ) ) el.removeAttr( key );
    }
}

/**
 * Cleans `url` with TidyURL.
 *
 * A URL that does not start with "http" is prefixed with `baseurl` before
 * cleaning and the prefix is sliced off again afterwards.
 * NOTE(review): the slice assumes cleaning leaves the `baseurl` prefix
 * untouched - confirm against tidy-url behaviour.
 *
 * @returns The cleaned URL, or `""` when the cleaned URL exceeds 1000 characters.
 */
function trimArgUrl( url: string, baseurl: string ) {
    const isRelative = !url.startsWith( "http" );
    const tidy = TidyURL.clean( isRelative ? baseurl + url : url );
    if( tidy.url.length > 1000 ) return "";
    return isRelative ? tidy.url.slice( baseurl.length ) : tidy.url;
}

/**
 * Shortens the `src` attribute of `el`. An `<img>` with a `data:` source loses
 * its `src`, and is removed entirely when its `alt` text is missing or shorter
 * than two characters. If cleaning the URL throws, `src` is dropped.
 */
function shortenSrc( el: CheerioSelection, baseurl: string ) {
    const url: string | undefined = el.attr( "src" );
    if( !url ) return;

    if( el.is( "img" ) && url.startsWith( "data:" ) ) {
        el.removeAttr( "src" );
        const alt: string | undefined = el.attr( "alt" );
        if( !alt || alt.length < 2 ) el.remove();
        return;
    }

    try {
        el.attr( "src", trimArgUrl( url, baseurl ) );
    }
    catch( e: unknown ) {
        console.log( "error shortening url: " + url );
        el.removeAttr( "src" ); // probably not a valid url anyway
    }
}

/**
 * Shortens the `href` attribute of `el`. `mailto:` and `javascript:` links lose
 * their `href`; if cleaning the URL throws, `href` is dropped as well.
 */
function shortenHref( el: CheerioSelection, baseurl: string ) {
    const href: string | undefined = el.attr( 'href' );
    if( !href ) return;

    if( href.startsWith( "mailto:" ) || href.startsWith( "javascript:" ) ) {
        el.removeAttr( "href" );
        return;
    }

    try {
        el.attr( "href", trimArgUrl( href, baseurl ) );
    }
    catch( e: unknown ) {
        console.log( "failed to shorten url: " + href );
        el.removeAttr( "href" );
    }
}

/**
 * Reduces an HTML document to a compact form: drops non-content elements,
 * hidden / fixed / cookie-banner / tiny elements, iframes and comments, strips
 * all attributes except a short allow-list, shortens URLs and unwraps purely
 * structural tags.
 *
 * @param html - Source markup.
 * @param baseurl - URL of the page, used when cleaning relative links.
 * @returns The reduced markup.
 */
function trimHtml( html: string, baseurl: string ) {
    const $ = cheerio.load( html );
    $( 'meta, head, link, svg, script, style, noscript, input' ).remove();

    // `$( '*' )` only matches elements, so comment nodes are removed explicitly.
    const isComment = ( _i: number, node: { type: string } ) => node.type === 'comment';
    $.root().contents().filter( isComment ).remove();
    $( '*' ).contents().filter( isComment ).remove();

    $( '*' ).each( ( _i, node ) => {
        const el = $( node );

        // The removal checks read `class` and `style`, so they must run before
        // removeIllegalAttributes strips those attributes. <html> and <body> are
        // exempt so that a class on the document root cannot delete the whole page.
        if( !el.is( 'html, body' ) ) {
            const hidden = el.css( 'display' ) === 'none' || el.hasClass( 'hidden' );
            const fixedDiv = el.is( 'div' ) && el.css( 'position' ) === 'fixed';
            if( hidden || fixedDiv || isCookieClass( el ) || isTooSmall( el ) ) {
                el.remove();
                return;
            }
        }

        if( el.is( 'iframe' ) ) {
            // An iframe is removed together with a <div> that directly wraps it.
            const parent = el.parent();
            el.remove();
            if( parent.is( 'div' ) ) parent.remove();
            return;
        }

        removeIllegalAttributes( el ); // also drops class, id and style
        shortenSrc( el, baseurl );
        shortenHref( el, baseurl );
    } );

    // Replace purely structural / formatting elements with their contents.
    $( "p, strong, underline, center, div, span, s, strike, del, td, tr, em, html, body, footer, h1, h2, h3, h4, h5, h6" ).each( ( _i, node ) => {
        const el = $( node );
        el.replaceWith( el.contents() );
    } );

    // Global flag: strip every leftover opening and closing tag, not only the first match.
    return $.html()
        .replace( /<\/?html>/gi, '' )
        .replace( /<\/?body>/gi, '' )
        .replace( /<\/?span>/gi, '' );
}