/*
 * Decompiled with CFR 0.152.
 */
package org.elasticsearch.xpack.inference.chunking;

import com.ibm.icu.text.BreakIterator;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.elasticsearch.common.Strings;
import org.elasticsearch.inference.ChunkingSettings;
import org.elasticsearch.xpack.inference.chunking.Chunker;
import org.elasticsearch.xpack.inference.chunking.ChunkerUtils;
import org.elasticsearch.xpack.inference.chunking.RecursiveChunkingSettings;
import org.elasticsearch.xpack.inference.chunking.SentenceBoundaryChunker;
import org.elasticsearch.xpack.inference.chunking.SentenceBoundaryChunkingSettings;

/**
 * Chunker that splits text by trying a list of separator regular expressions in order.
 *
 * <p>The text is split on the first separator and adjacent pieces are merged back together
 * as long as the merged piece stays within the maximum word count. Any piece that is still
 * too large is split again using the next separator. When no separators are left, the
 * oversized piece is handed to a {@link SentenceBoundaryChunker}.
 *
 * <p>NOTE(review): the word iterator is an instance field that is re-targeted with
 * {@code setText} on every word count, so a single instance presumably must not be used
 * from several threads at once - confirm against callers.
 */
public class RecursiveChunker
implements Chunker {
    private final BreakIterator wordIterator = BreakIterator.getWordInstance();

    /**
     * Chunks {@code input} according to the given recursive chunking settings.
     *
     * @param input            the text to chunk
     * @param chunkingSettings must be a {@link RecursiveChunkingSettings}
     * @return offsets into {@code input}, one per chunk
     * @throws IllegalArgumentException if {@code chunkingSettings} is of another settings type
     */
    @Override
    public List<Chunker.ChunkOffset> chunk(String input, ChunkingSettings chunkingSettings) {
        if (chunkingSettings instanceof RecursiveChunkingSettings recursiveChunkingSettings) {
            return chunk(
                input,
                new Chunker.ChunkOffset(0, input.length()),
                recursiveChunkingSettings.getSeparators(),
                recursiveChunkingSettings.maxChunkSize(),
                0
            );
        }
        throw new IllegalArgumentException(
            Strings.format("RecursiveChunker can't use ChunkingSettings with strategy [%s]", chunkingSettings.getChunkingStrategy())
        );
    }

    /**
     * Recursively chunks the region {@code offset} of {@code input}.
     *
     * @param input          the full text; all offsets are relative to it
     * @param offset         the region to chunk
     * @param separators     separator regexes, tried in list order
     * @param maxChunkSize   maximum number of words per chunk
     * @param separatorIndex index of the separator to use at this recursion level
     * @return offsets of the resulting chunks, in text order
     */
    private List<Chunker.ChunkOffset> chunk(String input, Chunker.ChunkOffset offset, List<String> separators, int maxChunkSize, int separatorIndex) {
        // The emptiness check comes first so an empty region is never passed to the word counter.
        if (offset.start() == offset.end() || isChunkWithinMaxSize(buildChunkOffsetAndCount(input, offset), maxChunkSize)) {
            return List.of(offset);
        }
        // Every separator has been tried and the region is still too large.
        if (separatorIndex > separators.size() - 1) {
            return chunkWithBackupChunker(input, offset, maxChunkSize);
        }

        List<ChunkOffsetAndCount> potentialChunks = mergeChunkOffsetsUpToMaxChunkSize(
            splitTextBySeparatorRegex(input, offset, separators.get(separatorIndex)),
            maxChunkSize
        );
        List<Chunker.ChunkOffset> actualChunks = new ArrayList<>();
        for (ChunkOffsetAndCount potentialChunk : potentialChunks) {
            if (isChunkWithinMaxSize(potentialChunk, maxChunkSize)) {
                actualChunks.add(potentialChunk.chunkOffset());
            } else {
                // Still too large after splitting on this separator: retry with the next one.
                actualChunks.addAll(chunk(input, potentialChunk.chunkOffset(), separators, maxChunkSize, separatorIndex + 1));
            }
        }
        return actualChunks;
    }

    /** Returns whether the chunk's word count does not exceed {@code maxChunkSize}. */
    private boolean isChunkWithinMaxSize(ChunkOffsetAndCount chunkOffsetAndCount, int maxChunkSize) {
        return chunkOffsetAndCount.wordCount() <= maxChunkSize;
    }

    /**
     * Pairs {@code offset} with the word count reported by {@link ChunkerUtils#countWords}
     * for that region of {@code fullText}.
     */
    private ChunkOffsetAndCount buildChunkOffsetAndCount(String fullText, Chunker.ChunkOffset offset) {
        wordIterator.setText(fullText);
        return new ChunkOffsetAndCount(offset, ChunkerUtils.countWords(offset.start(), offset.end(), wordIterator));
    }

    /**
     * Splits the region {@code offset} of {@code input} at every match of {@code separatorRegex}.
     *
     * <p>Each piece ends where a match starts, so the matched separator text stays at the
     * beginning of the following piece. Empty pieces are dropped.
     *
     * @return the non-empty pieces with their word counts, in text order
     */
    private List<ChunkOffsetAndCount> splitTextBySeparatorRegex(String input, Chunker.ChunkOffset offset, String separatorRegex) {
        // MULTILINE lets ^ and $ in a separator match at line boundaries, not only at the region ends.
        Pattern pattern = Pattern.compile(separatorRegex, Pattern.MULTILINE);
        Matcher matcher = pattern.matcher(input).region(offset.start(), offset.end());

        List<ChunkOffsetAndCount> chunkOffsets = new ArrayList<>();
        int chunkStart = offset.start();
        while (matcher.find()) {
            int chunkEnd = matcher.start();
            if (chunkStart < chunkEnd) {
                chunkOffsets.add(buildChunkOffsetAndCount(input, new Chunker.ChunkOffset(chunkStart, chunkEnd)));
            }
            chunkStart = chunkEnd;
        }
        // Whatever follows the last match (or the whole region when nothing matched).
        if (chunkStart < offset.end()) {
            chunkOffsets.add(buildChunkOffsetAndCount(input, new Chunker.ChunkOffset(chunkStart, offset.end())));
        }
        return chunkOffsets;
    }

    /**
     * Greedily merges consecutive pieces while the summed word count stays within
     * {@code maxChunkSize}.
     *
     * <p>A piece that is too large on its own is emitted unmerged; the caller is responsible
     * for splitting it further. Lists with fewer than two entries are returned as-is.
     */
    private List<ChunkOffsetAndCount> mergeChunkOffsetsUpToMaxChunkSize(List<ChunkOffsetAndCount> chunkOffsets, int maxChunkSize) {
        if (chunkOffsets.size() < 2) {
            return chunkOffsets;
        }

        List<ChunkOffsetAndCount> mergedOffsetsAndCounts = new ArrayList<>();
        ChunkOffsetAndCount mergedChunk = chunkOffsets.getFirst();
        for (int i = 1; i < chunkOffsets.size(); i++) {
            ChunkOffsetAndCount chunkToMerge = chunkOffsets.get(i);
            ChunkOffsetAndCount potentialMergedChunk = new ChunkOffsetAndCount(
                new Chunker.ChunkOffset(mergedChunk.chunkOffset().start(), chunkToMerge.chunkOffset().end()),
                mergedChunk.wordCount() + chunkToMerge.wordCount()
            );
            if (isChunkWithinMaxSize(potentialMergedChunk, maxChunkSize)) {
                mergedChunk = potentialMergedChunk;
            } else {
                mergedOffsetsAndCounts.add(mergedChunk);
                mergedChunk = chunkToMerge;
            }
        }
        // Flush the chunk that was still being accumulated when the input ran out.
        mergedOffsetsAndCounts.add(mergedChunk);
        return mergedOffsetsAndCounts;
    }

    /**
     * Chunks the region with a {@link SentenceBoundaryChunker} once no separators remain.
     *
     * <p>The backup chunker only sees the substring, so its offsets are shifted by
     * {@code offset.start()} to make them relative to the full {@code input} again.
     */
    private List<Chunker.ChunkOffset> chunkWithBackupChunker(String input, Chunker.ChunkOffset offset, int maxChunkSize) {
        // NOTE(review): the second settings argument (0) is presumably the sentence overlap - confirm.
        List<Chunker.ChunkOffset> chunks = new SentenceBoundaryChunker().chunk(
            input.substring(offset.start(), offset.end()),
            new SentenceBoundaryChunkingSettings(maxChunkSize, 0)
        );
        List<Chunker.ChunkOffset> chunksWithOffsets = new ArrayList<>(chunks.size());
        for (Chunker.ChunkOffset chunk : chunks) {
            chunksWithOffsets.add(new Chunker.ChunkOffset(chunk.start() + offset.start(), chunk.end() + offset.start()));
        }
        return chunksWithOffsets;
    }

    /** A region of the input text together with its word count. */
    private record ChunkOffsetAndCount(Chunker.ChunkOffset chunkOffset, int wordCount) {
    }
}

