163 lines
4.4 KiB
Go
163 lines
4.4 KiB
Go
|
package chroma
|
||
|
|
||
|
import (
|
||
|
"fmt"
|
||
|
"strings"
|
||
|
)
|
||
|
|
||
|
var (
|
||
|
defaultOptions = &TokeniseOptions{
|
||
|
State: "root",
|
||
|
EnsureLF: true,
|
||
|
}
|
||
|
)
|
||
|
|
||
|
// Config for a lexer.
|
||
|
type Config struct {
|
||
|
// Name of the lexer.
|
||
|
Name string `xml:"name,omitempty"`
|
||
|
|
||
|
// Shortcuts for the lexer
|
||
|
Aliases []string `xml:"alias,omitempty"`
|
||
|
|
||
|
// File name globs
|
||
|
Filenames []string `xml:"filename,omitempty"`
|
||
|
|
||
|
// Secondary file name globs
|
||
|
AliasFilenames []string `xml:"alias_filename,omitempty"`
|
||
|
|
||
|
// MIME types
|
||
|
MimeTypes []string `xml:"mime_type,omitempty"`
|
||
|
|
||
|
// Regex matching is case-insensitive.
|
||
|
CaseInsensitive bool `xml:"case_insensitive,omitempty"`
|
||
|
|
||
|
// Regex matches all characters.
|
||
|
DotAll bool `xml:"dot_all,omitempty"`
|
||
|
|
||
|
// Regex does not match across lines ($ matches EOL).
|
||
|
//
|
||
|
// Defaults to multiline.
|
||
|
NotMultiline bool `xml:"not_multiline,omitempty"`
|
||
|
|
||
|
// Don't strip leading and trailing newlines from the input.
|
||
|
// DontStripNL bool
|
||
|
|
||
|
// Strip all leading and trailing whitespace from the input
|
||
|
// StripAll bool
|
||
|
|
||
|
// Make sure that the input ends with a newline. This
|
||
|
// is required for some lexers that consume input linewise.
|
||
|
EnsureNL bool `xml:"ensure_nl,omitempty"`
|
||
|
|
||
|
// If given and greater than 0, expand tabs in the input.
|
||
|
// TabSize int
|
||
|
|
||
|
// Priority of lexer.
|
||
|
//
|
||
|
// If this is 0 it will be treated as a default of 1.
|
||
|
Priority float32 `xml:"priority,omitempty"`
|
||
|
|
||
|
// Analyse is a list of regexes to match against the input.
|
||
|
//
|
||
|
// If a match is found, the score is returned if single attribute is set to true,
|
||
|
// otherwise the sum of all the score of matching patterns will be
|
||
|
// used as the final score.
|
||
|
Analyse *AnalyseConfig `xml:"analyse,omitempty"`
|
||
|
}
|
||
|
|
||
|
// AnalyseConfig defines the list of regexes analysers.
|
||
|
type AnalyseConfig struct {
|
||
|
Regexes []RegexConfig `xml:"regex,omitempty"`
|
||
|
// If true, the first matching score is returned.
|
||
|
First bool `xml:"first,attr"`
|
||
|
}
|
||
|
|
||
|
// RegexConfig defines a single regex pattern and its score in case of match.
type RegexConfig struct {
	// Pattern is the regular expression, read from the "pattern" XML attribute.
	Pattern string `xml:"pattern,attr"`

	// Score is the score for a match, read from the "score" XML attribute.
	Score float32 `xml:"score,attr"`
}
|
||
|
|
||
|
// Token output to formatter.
|
||
|
type Token struct {
|
||
|
Type TokenType `json:"type"`
|
||
|
Value string `json:"value"`
|
||
|
}
|
||
|
|
||
|
func (t *Token) String() string { return t.Value }
|
||
|
func (t *Token) GoString() string { return fmt.Sprintf("&Token{%s, %q}", t.Type, t.Value) }
|
||
|
|
||
|
// Clone returns a clone of the Token.
|
||
|
func (t *Token) Clone() Token {
|
||
|
return *t
|
||
|
}
|
||
|
|
||
|
// EOF is returned by lexers at the end of input.
|
||
|
var EOF Token
|
||
|
|
||
|
// TokeniseOptions contains options for tokenisers.
type TokeniseOptions struct {
	// State to start tokenisation in. Defaults to "root".
	State string

	// Nested tokenisation.
	Nested bool

	// If true, all EOLs are converted into LF
	// by replacing CRLF and CR
	EnsureLF bool
}
|
||
|
|
||
|
// A Lexer for tokenising source code.
|
||
|
type Lexer interface {
|
||
|
// Config describing the features of the Lexer.
|
||
|
Config() *Config
|
||
|
// Tokenise returns an Iterator over tokens in text.
|
||
|
Tokenise(options *TokeniseOptions, text string) (Iterator, error)
|
||
|
// SetRegistry sets the registry this Lexer is associated with.
|
||
|
//
|
||
|
// The registry should be used by the Lexer if it needs to look up other
|
||
|
// lexers.
|
||
|
SetRegistry(registry *LexerRegistry) Lexer
|
||
|
// SetAnalyser sets a function the Lexer should use for scoring how
|
||
|
// likely a fragment of text is to match this lexer, between 0.0 and 1.0.
|
||
|
// A value of 1 indicates high confidence.
|
||
|
//
|
||
|
// Lexers may ignore this if they implement their own analysers.
|
||
|
SetAnalyser(analyser func(text string) float32) Lexer
|
||
|
// AnalyseText scores how likely a fragment of text is to match
|
||
|
// this lexer, between 0.0 and 1.0. A value of 1 indicates high confidence.
|
||
|
AnalyseText(text string) float32
|
||
|
}
|
||
|
|
||
|
// Lexers is a slice of lexers sortable by name.
|
||
|
type Lexers []Lexer
|
||
|
|
||
|
func (l Lexers) Len() int { return len(l) }
|
||
|
func (l Lexers) Swap(i, j int) { l[i], l[j] = l[j], l[i] }
|
||
|
func (l Lexers) Less(i, j int) bool {
|
||
|
return strings.ToLower(l[i].Config().Name) < strings.ToLower(l[j].Config().Name)
|
||
|
}
|
||
|
|
||
|
// PrioritisedLexers is a slice of lexers sortable by priority.
|
||
|
type PrioritisedLexers []Lexer
|
||
|
|
||
|
func (l PrioritisedLexers) Len() int { return len(l) }
|
||
|
func (l PrioritisedLexers) Swap(i, j int) { l[i], l[j] = l[j], l[i] }
|
||
|
func (l PrioritisedLexers) Less(i, j int) bool {
|
||
|
ip := l[i].Config().Priority
|
||
|
if ip == 0 {
|
||
|
ip = 1
|
||
|
}
|
||
|
jp := l[j].Config().Priority
|
||
|
if jp == 0 {
|
||
|
jp = 1
|
||
|
}
|
||
|
return ip > jp
|
||
|
}
|
||
|
|
||
|
// Analyser determines how appropriate this lexer is for the given text.
type Analyser interface {
	// AnalyseText returns a score for text.
	AnalyseText(text string) float32
}
|