// utilities for filtering page-search results

//import helpers:
import stringHandler from "@/plugins/StringHandler.js";
//import objectHandler from "@/plugins/ObjectHandler.js";

/**
 * URL chunks identifying pages that must be left out of the search results.
 *
 * isSearchableUrl tests every entry against the beginning of the page URL via
 * removePatternFromStringStart: the entry is escaped with
 * stringHandler.escapeRegExp, matched case-insensitively, may be preceded by
 * one leading "/", and has to be followed by "/" or by the end of the URL.
 * An entry therefore only matches whole leading path segments
 * (for example "sw" matches "sw/page" but not "swap").
 * @constant
 * @type {Array.<string>}
 * @default
 */
const unwantedUrls = [
  "teszt",
  "tesztkereso",
  "sw",
  "oldal-cimkek",
  "oldal-cimkek-2018",
  "oldal-cimkek-2019",
  "oldal-cimkek-2020",
  "letoltheto",
  "torolve",
  "letoltes",
  "udv/atiranyitasok",
  "atvaltas",
  "bek",
  "FB-Brandbook",
  "gazdim-a-tukorkepem",
  "go-atiranyitas",
  "utas/loyal",
  "otthon24",
  "privatemed_next_egeszsegbiztositas",
  "next",
  "nyar",
  "halloween/vagj-visssza",
  "halozat/plakat_vadaszat",
  "ado",
  "socialpartner-regisztracio",
  "ugyfelportal",
  "udv/biztonsagi-masolatok",
  "udv/koronavirus-blog",
  "blogteszt",
  "udv/regi-elemek-gyujtooldala",
  "regi-elemek-gyujtooldala",
  "udv/teszt-oldalak"
];

/**
 * Expressions describing the string chunks that have to be removed from page titles.
 *
 * filterTitle applies the expressions in list order and deletes whatever each
 * of them matches.
 * @constant
 * @type {Array.<RegExp>}
 * @default
 */
const unwantedTitles = [
  /^Dok-/, // finds "Dok-" at the start of the string
  /- landing/gi, // finds every "- landing" anywhere in the string (case-insensitive)
  /^Backup ?-/i, // finds "Backup -" or "Backup-" at the start of the string (case-insensitive)
  /.*c(i|í)mk(e|é)k.*/i, // finds "címkék" (with or without accents) together with the rest of its line
  /.*l(a|á)bl(e|é)c.*/i, // finds "lábléc" (with or without accents) together with the rest of its line
  /.*fejl(e|é)c.*/i // finds "fejléc" (with or without accent) together with the rest of its line
];

/** @constant
 *  @type {Array.<RegExp>} for filtering string chunks from page content
 *  @default
 *  string chunks found buy these expressions have to be removed from the page content
 */
const unwantedContent = [
  // finds HTML tags and other html chunks:
  /(<(<b>)?style(<\/b>)?[^<]*<\/(<b>)?style(<\/b>)?>)/gi, // finds style tags
  /(^[^<]*<\/(<b>)?style(<\/b>)?>)/gi, // finds unopened style tags
  /(<(<b>)?style(<\/b>)?[^<]*$)/gi, // finds unclosed style tags

  /(<(<b>)?script(<\/b>)?[^<]*<\/(<b>)?script(<\/b>)?>)/gi, // finds script tags
  /(^[^<]*<\/(<b>)?script(<\/b>)?>)/gi, // finds unopened script tags
  /(<(<b>)?script(<\/b>)?[^<]*$)/gi, // finds unclosed script tags

  /(^[^<]*-->)|(<!--[^<]*-->)|(<!--[^<]*$)/gi, // finds comment tags
  /(^[^<]*(<b>)?.*(<\/b>)?.*-->)/gi, // finds ex.: '--ADDITIONAL GTAG CODE and <b>SCRIPT</b>-->'
  /<\/?(?!b>)\w*\b[^>]*>/gi, // finds HTML opening or closing tags, exception: <b> and </b>
  /.*=("|').*("|').*/gi, // finds HTML attributes (ex.: style="display:none")
  /.*("| |.)(<b>)?label(<\/b>)?.*/gi, // find label chunks
  /.*(<b>)?html(<\/b>)?.*/gi, // find html chunks

  // finds JS code chunks without any SCRIPT tags:
  /.*( |;|<b>)var( |<\/b>).+=.*/gi, // finds JS var condition chunks
  /.*(\.|: )(<b>)?log(<\/b>)?( |\)|\().*/gi, // finds JS log condition chunks ('log()' or ': log)' )
  /.*(<b>)?return(<\/b>)?.*/gi, // find JS return chunks
  /.*(<b>)?this(<\/b>)?.*/gi, // find JS this chunks
  /.*(<b>)?self(<\/b>)?.*/gi, // find JS self chunks
  /.*(<b>)?function(<\/b>)?( |\().*/gi, // find JS function chunks
  /.*(<b>)?javascript(<\/b>)?.*/gi, // find JS javascript chunks
  /.*(<b>)?jquery(<\/b>)?.*/gi, // find JS jquery chunks
  /.*(<b>)?void(<\/b>)?.*/gi, // find JS void chunks
  /.*(<b>)?krevo(<\/b>)?.*/gi, // find JS kRevo chunks
  /.*(<b>)?if(<\/b>)? ?\(.*/gi, // finds JS if chunks
  /.*\.(<b>)?prototype(<\/b>)?\..*/gi, // finds JS .prototype. chunks
  /.*\.(<b>)?call(<\/b>)?\(.*/gi, // finds JS .call() chunks
  /.*\.trigger\(".*/gi, // finds JS .trigger() chunks
  /.*}\);.*/gi, // finds }); JS code chunks
  /.*\/\.\*.*/gi, // finds /.* JS code(regexp) chunks

  // finds CSS code chunks without any STYLE tags:
  /.* *[A-z-</>]{3,}: ([A-z-"!</> (),]|[0-9.]){1,};.*/gi, // finds CSS  chunks
  /.*#(<b>)?uins(<\/b>)?.*/gi, // finds CSS #uins chunks

  /SmartCube/gi // finds unexpected strings
];

/**
 * Expression finding newline characters.
 * filterContent replaces every match with a single space in the page content.
 * The "i" and "m" flags have no effect on this pattern (it contains no letters and no anchors).
 * @constant
 * @type {RegExp}
 * @default
 */
const newLines = /(\r\n|\r|\n)/gim; // finds new line characters (\r\n or \r or \n)

/**
 * Expression finding runs of two or more space characters.
 * filterContent replaces every match with a single space in the page content.
 * The "i" and "m" flags have no effect on this pattern (it contains no letters and no anchors).
 * @constant
 * @type {RegExp}
 * @default
 */
const spaces = / {2,}/gim; // finds two or more space characters ("  " or "   " or more)

/**
 * Deletes from a string everything that a regular expression matches.
 *
 * The parameter order (accumulated string first, pattern second) allows the
 * function to be passed straight to Array.prototype.reduce over a list of patterns.
 * @param {string} str value to clean; it is passed through stringHandler.forceString first
 * @param {RegExp} reg expression describing the parts to delete
 * @return {string} the forced string without the matched parts, or the forced
 * string as it is when reg is not a RegExp instance
 */
function removePatternFromString(str, reg) {
  const text = stringHandler.forceString(str);
  // anything that is not a real RegExp is ignored instead of being used as a literal search text
  if (!(reg instanceof RegExp)) {
    return text;
  }
  return text.replace(reg, "");
}

/**
 * Removes a URL chunk from the beginning of a string.
 *
 * searchText is escaped with stringHandler.escapeRegExp and matched
 * case-insensitively at the very beginning of str. It may be preceded by one
 * "/" and has to be followed by "/" or by the end of the string, so only whole
 * leading path segments are removed (together with the surrounding slashes).
 *
 * The parameter order (accumulated string first, chunk second) allows the
 * function to be passed straight to Array.prototype.reduce over a list of chunks.
 * @param {string} str value to remove the chunk from; it is passed through stringHandler.forceString
 * @param {string} searchText chunk searched for at the beginning of str
 * @return {string} the forced string without the leading chunk; the forced
 * string as it is when the chunk is not found or when there is nothing to search for
 */
function removePatternFromStringStart(str, searchText) {
  const checkedStr = stringHandler.forceString(str);
  const escapedText = stringHandler.escapeRegExp(searchText);
  // An empty chunk can not match anything: hand the string back untouched.
  // (Returning a boolean here would corrupt the accumulator of a reduce() call.)
  if (escapedText === "") return checkedStr;
  const reg = new RegExp(`^/?${escapedText}(/|$)`, "i");
  return checkedStr.replace(reg, "");
}

export default {
  /**
   * Tells whether a page URL may appear in the search results.
   * Every entry of unwantedUrls is stripped from the beginning of the URL in
   * turn; the URL is allowed only if none of them changed it.
   * @param {string} pageUrl the URL to check
   * @return {boolean} true if the stripped value is strictly equal to pageUrl, false otherwise
   */
  isSearchableUrl(pageUrl) {
    let remainder = pageUrl;
    for (const chunk of unwantedUrls) {
      remainder = removePatternFromStringStart(remainder, chunk);
    }
    return remainder === pageUrl;
  },

  /**
   * Strips the unwanted string parts from a page title.
   * The expressions of unwantedTitles are applied one after the other.
   * @param {string} titleString the title string
   * @return {string} titleString without the unwanted parts
   */
  filterTitle(titleString) {
    let title = titleString;
    for (const pattern of unwantedTitles) {
      title = removePatternFromString(title, pattern);
    }
    return title;
  },

  /**
   * Strips the unwanted string parts from page content.
   * Line breaks are replaced with spaces before anything else, because "."
   * in a regular expression does not match them; afterwards the expressions
   * of unwantedContent are applied one after the other and runs of spaces
   * are collapsed into a single space.
   * @param {string} contentString the content string
   * @return {string} contentString without the unwanted parts; an empty
   * string when only a single space is left
   */
  filterContent(contentString) {
    const singleLine = stringHandler
      .forceString(contentString)
      .replace(newLines, " ");
    const cleaned = unwantedContent.reduce(
      (text, pattern) => removePatternFromString(text, pattern),
      singleLine
    );
    const collapsed = cleaned.replace(spaces, " ");
    if (collapsed === " ") {
      return "";
    }
    return collapsed;
  }
};
