name : SequenceMatcher.php
<?php

declare(strict_types=1);

namespace Jfcherng\Diff;

/**
 * A longest sequence matcher.
 *
 * The logic is primarily based on the Python difflib package.
 *
 * @see https://docs.python.org/3/library/difflib.html
 */
final class SequenceMatcher
{
    /** @var int 0, opcode: no operation */
    const OP_NOP = 0;

    /** @var int 1, opcode: equal */
    const OP_EQ = 1 << 0;

    /** @var int 2, opcode: delete */
    const OP_DEL = 1 << 1;

    /** @var int 4, opcode: insert */
    const OP_INS = 1 << 2;

    /** @var int 8, opcode: replace */
    const OP_REP = 1 << 3;

    const OP_INT_TO_STR_MAP = [
        self::OP_NOP => 'nop',
        self::OP_EQ => 'eq',
        self::OP_DEL => 'del',
        self::OP_INS => 'ins',
        self::OP_REP => 'rep',
    ];

    const OP_STR_TO_INT_MAP = [
        'nop' => self::OP_NOP,
        'eq' => self::OP_EQ,
        'del' => self::OP_DEL,
        'ins' => self::OP_INS,
        'rep' => self::OP_REP,
    ];

    /**
     * The helper line which may be used to append to the source inputs to help
     * it easier to handle EOL at EOF problem. This line shouldn't be counted into diff.
     *
     * @var string
     */
    const APPENDED_HELPER_LINE = "\u{fcf28}\u{fc232}";

    /**
     * @var null|callable either a string or an array containing a callback function to determine if a line is "junk" or not
     */
    private $junkCallback;

    /**
     * @var array the first sequence to compare against
     */
    private $a = [];

    /**
     * @var array the second sequence
     */
    private $b = [];

    /**
     * @var array the first sequence to compare against (transformed)
     */
    private $at = [];

    /**
     * @var array the second sequence (transformed)
     */
    private $bt = [];

    /**
     * @var array array of characters that are considered junk from the second sequence. Characters are the array key.
     */
    private $junkDict = [];

    /**
     * @var array array of indices that do not contain junk elements
     */
    private $b2j = [];

    /**
     * @var array
     */
    private $options = [];

    /**
     * @var array
     */
    private static $defaultOptions = [
        'ignoreCase' => false,
        'ignoreLineEnding' => false,
        'ignoreWhitespace' => false,
        'lengthLimit' => 2000,
    ];

    /**
     * @var array
     */
    private $matchingBlocks = [];

    /**
     * @var array generated opcodes which manipulates seq1 to seq2
     */
    private $opcodes = [];

    /**
     * The constructor. With the sequences being passed, they'll be set
     * for the sequence matcher and it will perform a basic cleanup &
     * calculate junk elements.
     *
     * @param string[]      $a            an array containing the lines to compare against
     * @param string[]      $b            an array containing the lines to compare
     * @param null|callable $junkCallback either an array or string that references a callback function (if there is one) to determine 'junk' characters
     * @param array         $options      the options
     */
    public function __construct(array $a, array $b, ?callable $junkCallback = null, array $options = [])
    {
        $this->junkCallback = $junkCallback;
        $this->setOptions($options);
        $this->setSequences($a, $b);
    }

    /**
     * Set the options.
     *
     * @param array $options The options
     */
    public function setOptions(array $options): self
    {
        $needRerunChainB = $this->isAnyOptionChanged(
            $this->options,
            $options,
            ['ignoreCase', 'ignoreLineEnding', 'ignoreWhitespace', 'lengthLimit']
        );

        $this->options = $options + self::$defaultOptions;

        if ($needRerunChainB) {
            $this->chainB();
        }

        $this->resetCachedResults();

        return $this;
    }

    /**
     * Get the options.
     */
    public function getOptions(): array
    {
        return $this->options;
    }

    /**
     * Reset cached results.
     */
    public function resetCachedResults(): self
    {
        $this->matchingBlocks = [];
        $this->opcodes = [];

        return $this;
    }

    /**
     * Set the first and second sequences to use with the sequence matcher.
     *
     * This method is more effecient than "->setSeq1($old)->setSeq2($new)"
     * because it only run the routine once.
     *
     * @param string[] $a an array containing the lines to compare against
     * @param string[] $b an array containing the lines to compare
     */
    public function setSequences(array $a, array $b): self
    {
        $need_routine = false;

        if ($this->a !== $a) {
            $need_routine = true;
            $this->a = $a;
        }

        if ($this->b !== $b) {
            $need_routine = true;
            $this->b = $b;
        }

        if ($need_routine) {
            $this->chainB();
            $this->resetCachedResults();
        }

        return $this;
    }

    /**
     * Set the first sequence ($a) and reset any internal caches to indicate that
     * when calling the calculation methods, we need to recalculate them.
     *
     * @param string[] $a the sequence to set as the first sequence
     */
    public function setSeq1(array $a): self
    {
        if ($this->a !== $a) {
            $this->a = $a;
            $this->chainB();
            $this->resetCachedResults();
        }

        return $this;
    }

    /**
     * Set the second sequence ($b) and reset any internal caches to indicate that
     * when calling the calculation methods, we need to recalculate them.
     *
     * @param string[] $b the sequence to set as the second sequence
     */
    public function setSeq2(array $b): self
    {
        if ($this->b !== $b) {
            $this->b = $b;
            $this->chainB();
            $this->resetCachedResults();
        }

        return $this;
    }

    /**
     * Find the longest matching block in the two sequences, as defined by the
     * lower and upper constraints for each sequence. (for the first sequence,
     * $alo - $ahi and for the second sequence, $blo - $bhi).
     *
     * Essentially, of all of the maximal matching blocks, return the one that
     * startest earliest in $a, and all of those maximal matching blocks that
     * start earliest in $a, return the one that starts earliest in $b.
     *
     * If the junk callback is defined, do the above but with the restriction
     * that the junk element appears in the block. Extend it as far as possible
     * by matching only junk elements in both $a and $b.
     *
     * @param int $alo the lower constraint for the first sequence
     * @param int $ahi the upper constraint for the first sequence
     * @param int $blo the lower constraint for the second sequence
     * @param int $bhi the upper constraint for the second sequence
     *
     * @return int[] an array containing the longest match that includes the starting position in $a, start in $b and the length/size
     */
    public function findLongestMatch(int $alo, int $ahi, int $blo, int $bhi): array
    {
        $bestI = $alo;
        $bestJ = $blo;
        $bestSize = 0;

        $j2Len = [];

        for ($i = $alo; $i < $ahi; ++$i) {
            if (null === ($element = $this->at[$i] ?? null)) {
                continue;
            }

            $newJ2Len = [];
            $jDict = $this->b2j[$element] ?? [];

            foreach ($jDict as $j) {
                if ($j < $blo) {
                    continue;
                }

                if ($j >= $bhi) {
                    break;
                }

                $k = ($j2Len[$j - 1] ?? 0) + 1;
                $newJ2Len[$j] = $k;

                if ($k > $bestSize) {
                    $bestI = $i - $k + 1;
                    $bestJ = $j - $k + 1;
                    $bestSize = $k;
                }
            }

            $j2Len = $newJ2Len;
        }

        while (
            $bestI > $alo
            && $bestJ > $blo
            && $this->at[$bestI - 1] === $this->bt[$bestJ - 1]
            && !$this->isBJunk($this->bt[$bestJ - 1])
        ) {
            --$bestI;
            --$bestJ;
            ++$bestSize;
        }

        while (
            $bestI + $bestSize < $ahi
            && $bestJ + $bestSize < $bhi
            && $this->at[$bestI + $bestSize] === $this->bt[$bestJ + $bestSize]
            && !$this->isBJunk($this->bt[$bestJ + $bestSize])
        ) {
            ++$bestSize;
        }

        while (
            $bestI > $alo
            && $bestJ > $blo
            && $this->at[$bestI - 1] === $this->bt[$bestJ - 1]
            && $this->isBJunk($this->bt[$bestJ - 1])
        ) {
            --$bestI;
            --$bestJ;
            ++$bestSize;
        }

        while (
            $bestI + $bestSize < $ahi
            && $bestJ + $bestSize < $bhi
            && $this->at[$bestI + $bestSize] === $this->bt[$bestJ + $bestSize]
            && $this->isBJunk($this->bt[$bestJ + $bestSize])
        ) {
            ++$bestSize;
        }

        return [$bestI, $bestJ, $bestSize];
    }

    /**
     * Return a nested set of arrays for all of the matching sub-sequences
     * in the strings $a and $b.
     *
     * Each block contains the lower constraint of the block in $a, the lower
     * constraint of the block in $b and finally the number of lines that the
     * block continues for.
     *
     * @return int[][] a nested array of the matching blocks, as described by the function
     */
    public function getMatchingBlocks(): array
    {
        if (!empty($this->matchingBlocks)) {
            return $this->matchingBlocks;
        }

        $aCount = \count($this->a);
        $bCount = \count($this->b);

        $queue = [
            [0, $aCount, 0, $bCount],
        ];

        $matchingBlocks = [];
        while (!empty($queue)) {
            [$alo, $ahi, $blo, $bhi] = \array_pop($queue);
            [$i, $j, $k] = $x = $this->findLongestMatch($alo, $ahi, $blo, $bhi);

            if ($k) {
                $matchingBlocks[] = $x;

                if ($alo < $i && $blo < $j) {
                    $queue[] = [$alo, $i, $blo, $j];
                }

                if ($i + $k < $ahi && $j + $k < $bhi) {
                    $queue[] = [$i + $k, $ahi, $j + $k, $bhi];
                }
            }
        }

        \usort($matchingBlocks, function (array $a, array $b): int {
            $aCount = \count($a);
            $bCount = \count($b);
            $min = \min($aCount, $bCount);

            for ($i = 0; $i < $min; ++$i) {
                if ($a[$i] !== $b[$i]) {
                    return $a[$i] <=> $b[$i];
                }
            }

            return $aCount <=> $bCount;
        });

        $i1 = $j1 = $k1 = 0;
        $nonAdjacent = [];
        foreach ($matchingBlocks as [$i2, $j2, $k2]) {
            if ($i1 + $k1 === $i2 && $j1 + $k1 === $j2) {
                $k1 += $k2;

                continue;
            }

            if ($k1) {
                $nonAdjacent[] = [$i1, $j1, $k1];
            }

            $i1 = $i2;
            $j1 = $j2;
            $k1 = $k2;
        }

        if ($k1) {
            $nonAdjacent[] = [$i1, $j1, $k1];
        }

        $nonAdjacent[] = [$aCount, $bCount, 0];

        $this->matchingBlocks = $nonAdjacent;

        return $this->matchingBlocks;
    }

    /**
     * Return a list of all of the opcodes for the differences between the
     * two strings.
     *
     * The nested array returned contains an array describing the opcode
     * which includes:
     * 0 - The type of tag (as described below) for the opcode.
     * 1 - The beginning line in the first sequence.
     * 2 - The end line in the first sequence.
     * 3 - The beginning line in the second sequence.
     * 4 - The end line in the second sequence.
     *
     * The different types of tags include:
     * replace - The string from $i1 to $i2 in $a should be replaced by
     *           the string in $b from $j1 to $j2.
     * delete -  The string in $a from $i1 to $j2 should be deleted.
     * insert -  The string in $b from $j1 to $j2 should be inserted at
     *           $i1 in $a.
     * equal  -  The two strings with the specified ranges are equal.
     *
     * @return int[][] array of the opcodes describing the differences between the strings
     */
    public function getOpcodes(): array
    {
        if (!empty($this->opcodes)) {
            return $this->opcodes;
        }

        $i = $j = 0;
        $this->opcodes = [];

        foreach ($this->getMatchingBlocks() as [$ai, $bj, $size]) {
            if ($i < $ai && $j < $bj) {
                $tag = self::OP_REP;
            } elseif ($i < $ai) {
                $tag = self::OP_DEL;
            } elseif ($j < $bj) {
                $tag = self::OP_INS;
            } else {
                $tag = self::OP_NOP;
            }

            if ($tag) {
                $this->opcodes[] = [$tag, $i, $ai, $j, $bj];
            }

            $i = $ai + $size;
            $j = $bj + $size;

            if ($size) {
                $this->opcodes[] = [self::OP_EQ, $ai, $i, $bj, $j];
            }
        }

        return $this->opcodes;
    }

    /**
     * Return a series of nested arrays containing different groups of generated
     * opcodes for the differences between the strings with up to $context lines
     * of surrounding content.
     *
     * Essentially what happens here is any big equal blocks of strings are stripped
     * out, the smaller subsets of changes are then arranged in to their groups.
     * This means that the sequence matcher and diffs do not need to include the full
     * content of the different files but can still provide context as to where the
     * changes are.
     *
     * @param int $context the number of lines of context to provide around the groups
     *
     * @return int[][][] nested array of all of the grouped opcodes
     */
    public function getGroupedOpcodes(int $context = 3): array
    {
        $opcodes = $this->getOpcodes();

        if (empty($opcodes)) {
            $opcodes = [
                [self::OP_EQ, 0, 1, 0, 1],
            ];
        }

        if ($opcodes[0][0] === self::OP_EQ) {
            // fix the leading sequence which is out of context.
            $opcodes[0] = [
                $opcodes[0][0],
                \max($opcodes[0][1], $opcodes[0][2] - $context),
                $opcodes[0][2],
                \max($opcodes[0][3], $opcodes[0][4] - $context),
                $opcodes[0][4],
            ];
        }

        $lastItem = \count($opcodes) - 1;
        if ($opcodes[$lastItem][0] === self::OP_EQ) {
            [$tag, $i1, $i2, $j1, $j2] = $opcodes[$lastItem];
            // fix the trailing sequence which is out of context.
            $opcodes[$lastItem] = [
                $tag,
                $i1,
                \min($i2, $i1 + $context),
                $j1,
                \min($j2, $j1 + $context),
            ];
        }

        $maxRange = $context << 1;
        $groups = $group = [];
        foreach ($opcodes as [$tag, $i1, $i2, $j1, $j2]) {
            if ($tag === self::OP_EQ && $i2 - $i1 > $maxRange) {
                $group[] = [
                    $tag,
                    $i1,
                    \min($i2, $i1 + $context),
                    $j1,
                    \min($j2, $j1 + $context),
                ];
                $groups[] = $group;
                $group = [];
                $i1 = \max($i1, $i2 - $context);
                $j1 = \max($j1, $j2 - $context);
            }

            $group[] = [$tag, $i1, $i2, $j1, $j2];
        }

        if (
            !empty($group)
            && (
                \count($group) !== 1
                || $group[0][0] !== self::OP_EQ
            )
        ) {
            $groups[] = $group;
        }

        // there will be at least leading/trailing OP_EQ blocks
        // if we want really zero-context, we keep only non-equal blocks
        if ($context <= 0) {
            $groupsNew = [];

            foreach ($groups as $group) {
                $groupNew = [];

                foreach ($group as $block) {
                    if ($block[0] !== self::OP_EQ) {
                        $groupNew[] = $block;
                    }
                }

                if (!empty($groupNew)) {
                    $groupsNew[] = $groupNew;
                }
            }

            return $groupsNew;
        }

        return $groups;
    }

    /**
     * Convert an operation code from int into its string form.
     *
     * @param int $op the operation code
     *
     * @throws \InvalidArgumentException
     *
     * @return string the string representation of the operation code
     */
    public static function opIntToStr(int $op): string
    {
        if (!isset(self::OP_INT_TO_STR_MAP[$op])) {
            throw new \InvalidArgumentException("Invalid OP: {$op}");
        }

        return self::OP_INT_TO_STR_MAP[$op];
    }

    /**
     * Convert an operation code from string into its int form.
     *
     * @param string $op the operation code
     *
     * @throws \InvalidArgumentException
     *
     * @return int the int representation of the operation code
     */
    public static function opStrToInt(string $op): int
    {
        if (!isset(self::OP_STR_TO_INT_MAP[$op])) {
            throw new \InvalidArgumentException("Invalid OP: {$op}");
        }

        return self::OP_STR_TO_INT_MAP[$op];
    }

    /**
     * Determine if any option under test changed.
     *
     * @param array $old  the old options
     * @param array $new  the new options
     * @param array $keys the option keys under test
     */
    private function isAnyOptionChanged(array $old, array $new, array $keys): bool
    {
        foreach ($keys as $key) {
            if (isset($new[$key]) && $new[$key] !== $old[$key]) {
                return true;
            }
        }

        return false;
    }

    /**
     * Get the processed line with the initialized options.
     *
     * @param string $line the line
     *
     * @return string the line after being processed
     */
    private function processLineWithOptions(string $line): string
    {
        if ($this->options['ignoreWhitespace']) {
            static $whitespaces = [' ', "\t", "\r", "\n"];

            $line = \str_replace($whitespaces, '', $line);
        }

        if ($this->options['ignoreCase']) {
            $line = \strtolower($line);
        }

        if ($this->options['ignoreLineEnding']) {
            $line = \rtrim($line, "\r\n");
        }

        return $line;
    }

    /**
     * Generate the internal arrays containing the list of junk and non-junk
     * characters for the second ($b) sequence.
     */
    private function chainB(): self
    {
        $this->at = \array_map([$this, 'processLineWithOptions'], $this->a);
        $this->bt = \array_map([$this, 'processLineWithOptions'], $this->b);

        $length = \count($this->bt);
        $this->b2j = [];
        $popularDict = [];

        for ($i = 0; $i < $length; ++$i) {
            $char = $this->bt[$i];
            $this->b2j[$char] = $this->b2j[$char] ?? [];

            if (
                $length >= $this->options['lengthLimit']
                && \count($this->b2j[$char]) * 100 > $length
                && $char !== self::APPENDED_HELPER_LINE
            ) {
                $popularDict[$char] = 1;

                unset($this->b2j[$char]);
            } else {
                $this->b2j[$char][] = $i;
            }
        }

        // remove leftovers
        foreach (\array_keys($popularDict) as $char) {
            unset($this->b2j[$char]);
        }

        $this->junkDict = [];
        if (\is_callable($this->junkCallback)) {
            foreach (\array_keys($popularDict) as $char) {
                if (($this->junkCallback)($char)) {
                    $this->junkDict[$char] = 1;
                    unset($popularDict[$char]);
                }
            }

            foreach (\array_keys($this->b2j) as $char) {
                if (($this->junkCallback)($char)) {
                    $this->junkDict[$char] = 1;
                    unset($this->b2j[$char]);
                }
            }
        }

        return $this;
    }

    /**
     * Checks if a particular character is in the junk dictionary
     * for the list of junk characters.
     *
     * @return bool $b True if the character is considered junk. False if not.
     */
    private function isBJunk(string $b): bool
    {
        return isset($this->junkDict[$b]);
    }
}

© 2025 Cubjrnet7