java.lang.Object
org.elasticsearch.xpack.core.ml.inference.trainedmodel.Tokenization
All Implemented Interfaces:
NamedWriteable, Writeable, org.elasticsearch.xcontent.ToXContent, org.elasticsearch.xcontent.ToXContentObject, NamedXContentObject
Direct Known Subclasses:
BertJapaneseTokenization, BertTokenization, DebertaV2Tokenization, MPNetTokenization, RobertaTokenization, XLMRobertaTokenization

public abstract class Tokenization extends Object implements NamedXContentObject, NamedWriteable
  • Field Details

    • DO_LOWER_CASE

      public static final org.elasticsearch.xcontent.ParseField DO_LOWER_CASE
    • WITH_SPECIAL_TOKENS

      public static final org.elasticsearch.xcontent.ParseField WITH_SPECIAL_TOKENS
    • MAX_SEQUENCE_LENGTH

      public static final org.elasticsearch.xcontent.ParseField MAX_SEQUENCE_LENGTH
    • TRUNCATE

      public static final org.elasticsearch.xcontent.ParseField TRUNCATE
    • SPAN

      public static final org.elasticsearch.xcontent.ParseField SPAN
    • DEFAULT_MAX_SEQUENCE_LENGTH

      public static final int DEFAULT_MAX_SEQUENCE_LENGTH
      See Also:
        • Constant Field Values
    • UNSET_SPAN_VALUE

      public static final int UNSET_SPAN_VALUE
      See Also:
        • Constant Field Values
    • doLowerCase

      protected final boolean doLowerCase
    • withSpecialTokens

      protected final boolean withSpecialTokens
    • maxSequenceLength

      protected final int maxSequenceLength
    • truncate

      protected final Tokenization.Truncate truncate
    • span

      protected final int span
  • Constructor Details

  • Method Details

    • createDefault

      public static BertTokenization createDefault()
    • updateWindowSettings

      public Tokenization updateWindowSettings(Tokenization.SpanSettings update)
      Returns a copy of this tokenization with the tokenizer span settings updated.
      Parameters:
      update - The new span settings to apply
      Returns:
      An updated Tokenization
    • writeTo

      public void writeTo(StreamOutput out) throws IOException
      Specified by:
      writeTo in interface Writeable
      Throws:
      IOException
    • getMaskToken

      public abstract String getMaskToken()
    • toXContent

      public org.elasticsearch.xcontent.XContentBuilder toXContent(org.elasticsearch.xcontent.XContentBuilder builder, org.elasticsearch.xcontent.ToXContent.Params params) throws IOException
      Specified by:
      toXContent in interface org.elasticsearch.xcontent.ToXContent
      Throws:
      IOException
    • validateSpanAndMaxSequenceLength

      public static void validateSpanAndMaxSequenceLength(int maxSequenceLength, int span)
    • validateSpanAndTruncate

      public static void validateSpanAndTruncate(@Nullable Tokenization.Truncate truncate, @Nullable Integer span)
    • equals

      public boolean equals(Object o)
      Overrides:
      equals in class Object
    • hashCode

      public int hashCode()
      Overrides:
      hashCode in class Object
    • doLowerCase

      public boolean doLowerCase()
    • withSpecialTokens

      public boolean withSpecialTokens()
    • maxSequenceLength

      public int maxSequenceLength()
    • getTruncate

      public Tokenization.Truncate getTruncate()
    • getSpan

      public int getSpan()
    • getMaxSequenceLength

      public int getMaxSequenceLength()
    • validateVocabulary

      public void validateVocabulary(PutTrainedModelVocabularyAction.Request request)