// Lookup table of special character sequences handled by this module.
// Key: the sequence as it is searched for in the serialized markup.
// Value: the plain text that replaces it before the content is split into words.
const specialCharacters = {
  "&nbsp;": " ",
};

// Inline formatting elements whose opening and closing tags are kept as part of
// the word values. `tagName` is compared against `node.nodeName`; `endTag` is
// the closing tag text looked up in the element's serialized markup.
const tagsThatNeedToBeIncluded = ["EM", "B", "I", "SUB", "STRONG"].map(
  (tagName) => ({
    tagName,
    endTag: `</${tagName.toLowerCase()}>`,
  })
);

// `nodeType` value of DOM text nodes (Node.TEXT_NODE); also used as the `type`
// of text records in the words tree.
const textNode = 3;

/**
 * Finds every non-overlapping occurrence of `word` inside `content`.
 *
 * @param {string} content - Text to search in.
 * @param {string} word - Substring to look for.
 * @returns {number[]} Start offsets of the occurrences in ascending order;
 *   empty when either argument is falsy or nothing matches.
 */
const getMatches = (content, word) => {
  if (!word || !content) {
    return [];
  }

  const offsets = [];
  const step = word.length;
  for (
    let found = content.indexOf(word, 0);
    found !== -1;
    found = content.indexOf(word, found + step)
  ) {
    offsets.push(found);
  }
  return offsets;
};

/**
 * Locates every occurrence of every key of `specialCharacters` in `content`.
 *
 * @param {string} content - Markup to scan.
 * @returns {Array<{length: number, startIndex: number, endIndex: number, value: string}>}
 *   One record per occurrence, ordered by ascending `startIndex`. `endIndex`
 *   is exclusive (`startIndex + length`) and `value` is the matched key.
 */
const getSpecialCharactersIndexes = (content) => {
  const orderedOccurrences = [];

  for (const entity of Object.keys(specialCharacters)) {
    const entityLength = entity.length;

    for (const occurrenceStart of getMatches(content, entity)) {
      // Insert in front of the first record that does not start before this
      // occurrence, so the list stays ordered while it is being built.
      const firstNotBefore = orderedOccurrences.findIndex(
        (known) => !(known.startIndex < occurrenceStart)
      );
      const insertAt =
        firstNotBefore === -1 ? orderedOccurrences.length : firstNotBefore;

      orderedOccurrences.splice(insertAt, 0, {
        length: entityLength,
        startIndex: occurrenceStart,
        endIndex: occurrenceStart + entityLength,
        value: entity,
      });
    }
  }

  return orderedOccurrences;
};

/**
 * Replaces every occurrence of each key of `specialCharacters` in `content`
 * with its mapped value.
 *
 * Keys and values are treated as literal text: a key is never interpreted as
 * a regular expression and a value is never expanded as a replacement
 * pattern (`$&`, `$1`, ...), so any entry can be added to the table safely.
 *
 * @param {string} content - Markup to process.
 * @returns {string} The content with all special characters substituted.
 */
const replaceSpecialCharactersInContent = (content) =>
  Object.keys(specialCharacters).reduce(
    (updatedContent, specialCharacter) =>
      // split/join performs a literal, global, left-to-right replacement.
      updatedContent
        .split(specialCharacter)
        .join(specialCharacters[specialCharacter]),
    content
  );

/**
 * For elements listed in `tagsThatNeedToBeIncluded`, folds the element's
 * opening tag into the value of its first child record and its closing tag
 * into the value of its last child record. The first child's `startIndex` is
 * moved back by the length of the opening tag so it covers the tag as well.
 *
 * `info` and its `children` array are left untouched; a new record is
 * returned whenever something changes.
 *
 * @param {{children: Array<object>}} info - Tree record built for `node`.
 * @param {Element} node - Element the record was built from; `nodeName`,
 *   `outerHTML` and `innerHTML` are read.
 * @returns {object} `info` itself when the element is not a listed tag or has
 *   no child records, otherwise a copy of `info` with updated children.
 */
const getTreeInfoAfterIncludingTagInStartAndEndWordsIfNeeded = (info, node) => {
  const tagInfo = tagsThatNeedToBeIncluded.find(
    ({ tagName }) => tagName === node.nodeName
  );
  if (!tagInfo) return info;

  const { children } = info;
  // An empty formatting element (e.g. `<b></b>`) has no child record that the
  // tags could be attached to, so there is nothing to rewrite.
  if (!children.length) return info;

  const { outerHTML, innerHTML } = node;
  // The opening tag is everything that precedes the inner markup, which itself
  // sits directly in front of the closing tag.
  const indexOfEndTag = outerHTML.lastIndexOf(tagInfo.endTag);
  const startTag = outerHTML.substring(0, indexOfEndTag - innerHTML.length);

  const firstChild = children[0];
  const firstWithStartTag = {
    ...firstChild,
    value: startTag + firstChild.value,
    startIndex: firstChild.startIndex - startTag.length,
  };

  if (children.length === 1) {
    // A single child carries both the opening and the closing tag.
    return {
      ...info,
      children: [
        {
          ...firstWithStartTag,
          value: firstWithStartTag.value + tagInfo.endTag,
        },
      ],
    };
  }

  const lastChild = children[children.length - 1];

  return {
    ...info,
    children: [
      firstWithStartTag,
      ...children.slice(1, -1),
      { ...lastChild, value: lastChild.value + tagInfo.endTag },
    ],
  };
};

/**
 * Recursively converts a DOM node into a tree of records that carry the
 * offset of each node inside the serialized markup.
 *
 * @param {Node} node - Node to convert.
 * @param {number} startIndex - Offset at which the node's own markup starts.
 * @param {boolean} shouldCheckNodePositionInParent - When true, `startIndex`
 *   points at the node's opening tag, so the offset of its content is
 *   `startIndex` plus the opening tag length. When false, the content is
 *   taken to start at `startIndex` itself.
 * @returns {Array<object>} A one-element array: for a text node
 *   `{type, className, value, startIndex}`; for any other node the same
 *   fields plus `children`, post-processed by
 *   `getTreeInfoAfterIncludingTagInStartAndEndWordsIfNeeded`.
 */
function getWordsTree(node, startIndex, shouldCheckNodePositionInParent) {
  if (node.nodeType === textNode) {
    return [
      {
        type: node.nodeType,
        className: node.className || "",
        value: node.nodeValue,
        startIndex,
      },
    ];
  }

  // `nodeType` of comment nodes (Node.COMMENT_NODE).
  const commentNodeType = 8;

  let validStartIndex = startIndex;
  if (shouldCheckNodePositionInParent) {
    const { outerHTML, innerHTML } = node;
    // The inner markup sits directly in front of the closing tag, so the
    // opening tag length is derived from the end of the serialization.
    // Searching for innerHTML from the start is wrong whenever the same text
    // also occurs inside the opening tag (e.g. `<b>b</b>`).
    const closingTag = outerHTML.match(/<\/[^<>]*>$/);
    const closingTagLength = closingTag ? closingTag[0].length : 0;
    validStartIndex += outerHTML.length - closingTagLength - innerHTML.length;
  }

  const childrenTree = [];
  [...node.childNodes].forEach((child) => {
    if (child.nodeType === commentNodeType) {
      // Comments contribute no words and expose no outerHTML; only skip over
      // their serialized form `<!--data-->`.
      validStartIndex += "<!--".length + child.nodeValue.length + "-->".length;
      return;
    }

    childrenTree.push(...getWordsTree(child, validStartIndex, true));
    // NOTE(review): text is advanced by nodeValue.length, which assumes the
    // text serializes one-to-one (no escaped `&`, `<`, `>`) - confirm.
    validStartIndex +=
      child.nodeType === textNode
        ? child.nodeValue.length
        : child.outerHTML.length;
  });

  const info = {
    type: node.nodeType,
    className: node.className || "",
    value: node.nodeValue,
    startIndex,
    children: childrenTree,
  };
  return [getTreeInfoAfterIncludingTagInStartAndEndWordsIfNeeded(info, node)];
}

/**
 * Flattens a words tree into the list of its text records, in document order.
 *
 * @param {Array<object>} wordsTree - Records at the current tree level.
 * @param {string} className - Class name of the record that owns this level.
 * @returns {Array<object>} Copies of the text records; each copy's
 *   `className` is its own class name, a space, and the owner's class name.
 */
const getTextGroups = (wordsTree, className) => {
  const flattened = [];

  wordsTree.forEach((entry) => {
    if (entry.type === textNode) {
      flattened.push({
        ...entry,
        className: `${entry.className} ${className}`,
      });
      return;
    }

    // Descend into the record's children, handing down its own class name.
    getTextGroups(entry.children, entry.className).forEach((textGroup) => {
      flattened.push(textGroup);
    });
  });

  return flattened;
};

/**
 * Splits the text of a group on whitespace and describes every word.
 *
 * @param {{value: string, startIndex: number, className?: string}} group -
 *   Text and the offset at which that text starts.
 * @returns {Array<{value: string, startIndex: number, endIndex: number, length: number, className: (string|undefined)}>}
 *   One record per word in order of appearance; `startIndex` is
 *   `group.startIndex` plus the word's position in the text and `endIndex` is
 *   exclusive.
 */
const getWordsInfo = (group) => {
  const { className, value: text, startIndex: groupOffset } = group;

  const tokens = text
    .split(/[\s]+/)
    .map((token) => token.trim())
    .filter((token) => token.length);

  // Position in `text` right after the previously located word; searching from
  // here keeps repeated words from resolving to an earlier occurrence.
  let cursor = 0;
  const wordsInfo = [];

  for (const token of tokens) {
    const positionInText = text.indexOf(token, cursor);
    const wordStart = groupOffset + positionInText;
    cursor = positionInText + token.length;

    wordsInfo.push({
      value: token,
      startIndex: wordStart,
      endIndex: wordStart + token.length,
      length: token.length,
      className,
    });
  }

  return wordsInfo;
};

/**
 * Aligns word records cut from the markup with the words of the plain text.
 *
 * Each plain-text word consumes the next markup record; while the rendered
 * text of that record differs from the plain-text word, the following markup
 * records are appended to it (value concatenated, `endIndex` taken over,
 * `length` summed).
 *
 * The records of `splitWords` are modified in place and returned.
 *
 * @param {Array<object>} splitWords - Word records whose values may contain tags.
 * @param {Array<object>} words - Word records built from the text content.
 * @returns {Array<object>} One (possibly merged) record of `splitWords` per
 *   entry of `words`.
 */
function matchWordsInfoWithWordsFromTextContent(splitWords, words) {
  // Renders an HTML snippet and returns the text it produces.
  const renderAsText = (html) => {
    const scratch = document.createElement("div");
    scratch.innerHTML = html;
    return scratch.textContent;
  };

  let cursor = 0;

  return words.map(({ value: expectedText }) => {
    const merged = splitWords[cursor];
    cursor += 1;

    while (expectedText !== renderAsText(merged.value)) {
      const fragment = splitWords[cursor];
      cursor += 1;

      merged.value += fragment.value;
      merged.endIndex = fragment.endIndex;
      merged.length += fragment.length;
    }

    return merged;
  });
}

/**
 * Prepends a special character to a word when the character ends exactly at
 * the word's start or one position before it.
 *
 * @param {{value: string, startIndex: number, endIndex: number}} wordInfo -
 *   Word record; not modified.
 * @param {{value: string, endIndex: number}} specialCharacterInfo - Special
 *   character record.
 * @returns {object|null} A copy of `wordInfo` with the character (plus a
 *   single space when there is a one-position gap) in front of its value and
 *   `startIndex` recomputed from `endIndex`, or `null` when the two are not
 *   next to each other.
 */
function addSpecialCharactersAtTheBeginningOfWord(
  wordInfo,
  specialCharacterInfo
) {
  const { startIndex: wordStart } = wordInfo;
  const { endIndex: charEnd } = specialCharacterInfo;

  const touches = charEnd === wordStart;
  const oneApart = charEnd === wordStart - 1;
  if (!touches && !oneApart) {
    return null;
  }

  const separator = touches ? "" : " ";
  const value = `${specialCharacterInfo.value}${separator}${wordInfo.value}`;

  return {
    ...wordInfo,
    value,
    startIndex: wordInfo.endIndex - value.length,
  };
}

/**
 * Appends a special character to a word when the character starts exactly at
 * the word's end or one position after it.
 *
 * @param {{value: string, startIndex: number, endIndex: number}} wordInfo -
 *   Word record; not modified.
 * @param {{value: string, startIndex: number}} specialCharacterInfo - Special
 *   character record.
 * @returns {object|null} A copy of `wordInfo` with the character (preceded by
 *   a single space when there is a one-position gap) appended to its value
 *   and `endIndex` recomputed from `startIndex`, or `null` when the two are
 *   not next to each other.
 */
const addSpecialCharactersAtTheEndOfWord = (wordInfo, specialCharacterInfo) => {
  const { endIndex: wordEnd } = wordInfo;
  const { startIndex: charStart, value: charText } = specialCharacterInfo;

  const touches = wordEnd === charStart;
  const oneApart = wordEnd === charStart - 1;
  if (!touches && !oneApart) {
    return null;
  }

  const separator = touches ? "" : " ";
  const value = `${wordInfo.value}${separator}${charText}`;

  return { ...wordInfo, value, endIndex: wordInfo.startIndex + value.length };
};

/**
 * Collapses runs of neighbouring special characters into single records.
 *
 * A character is folded into the previous record when
 * `addSpecialCharactersAtTheEndOfWord` accepts the pair, i.e. when it starts
 * at the previous record's end or one position after it. The folded record
 * replaces the previous one, so longer runs keep growing the same record.
 *
 * The input array is not modified.
 *
 * @param {Array<{length: number, startIndex: number, endIndex: number, value: string}>} specialCharactersIndexes -
 *   Special character records ordered by `startIndex`.
 * @returns {Array<object>} Merged records in the same order; `length` of a
 *   merged record is the length of its combined value.
 */
function getMergedSpecialCharacters(specialCharactersIndexes) {
  return specialCharactersIndexes.reduce((mergedChars, currentChar) => {
    const lastIndex = mergedChars.length - 1;
    const combined =
      lastIndex >= 0
        ? addSpecialCharactersAtTheEndOfWord(
            mergedChars[lastIndex],
            currentChar
          )
        : null;

    if (combined) {
      // Store the combined record; discarding it would silently drop the
      // current character and leave the previous record's end unchanged.
      mergedChars[lastIndex] = { ...combined, length: combined.value.length };
    } else {
      mergedChars.push(currentChar);
    }
    return mergedChars;
  }, []);
}

/**
 * Shifts word offsets to account for special characters that were replaced
 * by a single character before the words were measured.
 *
 * @param {Array<object>} rawWordsData - Word records with `startIndex` and
 *   `endIndex`; the records are shallow-copied, not modified.
 * @param {Array<object>} specialCharactersIndexes - Special character records
 *   (`startIndex`, `length`, ...) in ascending `startIndex` order
 *   (presumably the output of `getSpecialCharactersIndexes` - confirm).
 * @returns {Array<object>} The copied word records with shifted
 *   `startIndex` / `endIndex`.
 */
const adjustWordsIndexAfterSpecialCharactersProcessing = (
  rawWordsData,
  specialCharactersIndexes
) => {
  // Work on shallow copies so the shifts below do not touch rawWordsData.
  const words = rawWordsData.map((wordData) => ({ ...wordData }));

  // Index of the first special character that has not been applied yet.
  let processedSpecialCharIndex = 0;

  words.forEach((word, wordIndex) => {
    // Current (already shifted) start of the word being examined.
    let processedWordIndexInText = word.startIndex;

    for (
      let specialCharactersIndex = processedSpecialCharIndex;
      specialCharactersIndex < specialCharactersIndexes.length;
      specialCharactersIndex += 1
    ) {
      const specialCharacterInfo =
        specialCharactersIndexes[specialCharactersIndex];
      if (specialCharacterInfo.startIndex < processedWordIndexInText) {
        // Subtract one: the special character already counted as a single
        // character when the word indexes were computed.
        const offset = specialCharacterInfo.length - 1;
        processedSpecialCharIndex = specialCharactersIndex + 1;

        // The character precedes this word, so this word and every later one
        // move forward by the same amount.
        for (
          let processingWordsIndex = wordIndex;
          processingWordsIndex < words.length;
          processingWordsIndex += 1
        ) {
          words[processingWordsIndex].startIndex += offset;
          words[processingWordsIndex].endIndex += offset;
        }
        // Re-read the start: the shift may place further characters in front
        // of this word.
        processedWordIndexInText = word.startIndex;
      } else {
        // This character starts at or after the word; it is examined again for
        // the next word.
        processedSpecialCharIndex = specialCharactersIndex;
        break;
      }
    }
  });

  const mergedSpecialCharacters = getMergedSpecialCharacters(
    specialCharactersIndexes
  );
  // Index of the word at which the search for the next merged character resumes.
  let processingWordsIndex = 0;
  // NOTE(review): `updatedWord` below is computed but never written back into
  // `words`, so this pass does not change the returned records. Confirm
  // whether the special characters were meant to be attached to the words.
  mergedSpecialCharacters.forEach((specialCharacterInfo) => {
    for (
      let wordsIndex = processingWordsIndex;
      wordsIndex < words.length;
      wordsIndex += 1
    ) {
      // First try to attach the character after the word ...
      let updatedWord = addSpecialCharactersAtTheEndOfWord(
        words[wordsIndex],
        specialCharacterInfo
      );

      if (updatedWord) {
        processingWordsIndex = wordsIndex;
        break;
      } else {
        // ... otherwise try to attach it in front of the word.
        updatedWord = addSpecialCharactersAtTheBeginningOfWord(
          words[wordsIndex],
          specialCharacterInfo
        );
        if (updatedWord) {
          processingWordsIndex = wordsIndex;
          break;
        }
      }
    }
  });
  return words;
};

/**
 * Splits HTML content into word records.
 *
 * The content is loaded into a detached `div`, special characters are located
 * and then replaced, the DOM is walked to cut the text nodes into words, and
 * those words are aligned with the words of the plain text content before
 * their offsets are corrected for the replaced special characters.
 *
 * @param {string} content - HTML markup to parse.
 * @returns {Array<{value: string, startIndex: number, endIndex: number, length: number, className: string}>}
 *   One record per word of the text content, in order of appearance.
 */
function parse(content) {
  const container = document.createElement("div");
  container.innerHTML = content;

  // Positions are taken from the browser's serialization, before the special
  // characters are substituted.
  const specialCharactersIndexes = getSpecialCharactersIndexes(
    container.innerHTML
  );

  container.innerHTML = replaceSpecialCharactersInContent(container.innerHTML);
  // NOTE(review): writes the processed markup to a global on every call;
  // looks like a debugging aid - confirm nothing reads `window.content`.
  window.content = container.innerHTML;

  const textGroups = getTextGroups(getWordsTree(container, 0, false), "");

  // Words cut from the individual text nodes (values may carry inline tags).
  const markupWords = [];
  textGroups.forEach((textGroup) => {
    getWordsInfo(textGroup).forEach((wordInfo) => {
      markupWords.push(wordInfo);
    });
  });

  // Words of the rendered text, used as the reference segmentation.
  const plainTextWords = getWordsInfo({
    startIndex: 0,
    value: container.textContent,
  });

  return adjustWordsIndexAfterSpecialCharactersProcessing(
    matchWordsInfoWithWordsFromTextContent(markupWords, plainTextWords),
    specialCharactersIndexes
  );
}

// Public API of the module: an object exposing `parse` only.
export default { parse };
