croissant

import "github.com/b13rg/croissant-go/pkg/croissant"

Croissant spec filetypes and relations.

Croissant spec filetypes and relations.

Croissant spec filetypes and relations.

Croissant spec filetypes and relations.

Croissant spec filetypes and relations.

Index

Variables

The suggested context to use in a Croissant JSON-LD file.

// SuggestedContext is the suggested context to use in a Croissant JSON-LD file.
// Most entries map a short term to an IRI or prefixed-IRI string; "data",
// "dataType" and "examples" are nested maps that additionally set a JSON-LD
// "@type" for the term.
var SuggestedContext = map[string]interface{}{
    "@language":  "en",
    "@vocab":     "https://schema.org/",
    "sc":         "https://schema.org/",
    "cr":         "http://mlcommons.org/croissant/",
    "rai":        "http://mlcommons.org/croissant/RAI/",
    "dct":        "http://purl.org/dc/terms/",
    "citeAs":     "cr:citeAs",
    "column":     "cr:column",
    "conformsTo": "dct:conformsTo",
    "data": map[string]interface{}{
        "@id":   "cr:data",
        "@type": "@json",
    },
    "dataType": map[string]interface{}{
        "@id":   "cr:dataType",
        "@type": "@vocab",
    },
    "examples": map[string]interface{}{
        "@id":   "cr:examples",
        "@type": "@json",
    },
    "extract":       "cr:extract",
    "field":         "cr:field",
    "fileProperty":  "cr:fileProperty",
    "fileObject":    "cr:fileObject",
    "fileSet":       "cr:fileSet",
    "format":        "cr:format",
    "includes":      "cr:includes",
    "isLiveDataset": "cr:isLiveDataset",
    "jsonPath":      "cr:jsonPath",
    "key":           "cr:key",
    "md5":           "cr:md5",
    "parentField":   "cr:parentField",
    "path":          "cr:path",
    "recordSet":     "cr:recordSet",
    "references":    "cr:references",
    "regex":         "cr:regex",
    "repeated":      "cr:repeated",
    "replace":       "cr:replace",
    "separator":     "cr:separator",
    "source":        "cr:source",
    "subField":      "cr:subField",
    "transform":     "cr:transform",
}

type ClassRefItem

// ClassRefItem identifies a resource by its ID, stored under the "@id" JSON key.
type ClassRefItem struct {
    // ID of the resource. Omitted from the JSON output when empty.
    ID string `json:"@id,omitempty"`
}

type ClassRefList

// ClassRefList is a list of ClassRefItem values. It defines its own
// MarshalJSON and UnmarshalJSON methods.
type ClassRefList []ClassRefItem

func (ClassRefList) MarshalJSON

func (ref ClassRefList) MarshalJSON() ([]byte, error)

func (*ClassRefList) UnmarshalJSON

func (ref *ClassRefList) UnmarshalJSON(data []byte) error

type ContentExtractionEnumeration

// ContentExtractionEnumeration holds one string per kind of content that can
// be taken from a file.
// NOTE(review): presumably these are the values accepted by
// Extract.FileProperty — confirm against NewContentExtractionEnumeration.
type ContentExtractionEnumeration struct {
    // Full path to the file, from the Croissant extraction or download folders.
    FullPath string
    // Name of the file (no path).
    Filename string
    // Byte content of the file.
    Content string
    // Byte content of each line of the file.
    Lines string
    // The number of each line in the file.
    LineNumbers string
}

func NewContentExtractionEnumeration

func NewContentExtractionEnumeration() *ContentExtractionEnumeration

type DataSet

[Dataset Class](https://docs.mlcommons.org/croissant/docs/croissant-spec.html#dataset-level-information). Based on https://docs.mlcommons.org/croissant/docs/croissant-spec.html#schemaorgdataset.

// DataSet holds the dataset-level information of a Croissant file, together
// with its distribution (file resources) and its record sets.
type DataSet struct {

    // Context alias definitions to make the rest of the document shorter.
    Context map[string]interface{} `json:"@context"`
    // Must be `schema.org/Dataset`.
    Type string `json:"@type"`
    // The name of the dataset.
    Name string `json:"name"`
    // Description of the dataset.
    Description string `json:"description"`
    // Schema version the Croissant file conforms to.
    ConformsTo string `json:"conformsTo"`
    // A citation to the dataset itself.
    CiteAs string `json:"citeAs,omitempty"`
    // Licenses of the dataset.
    // The spec suggests using references from https://spdx.org/licenses/.
    License types.StringOrSlice `json:"license"`
    // URL of the dataset, usually a webpage.
    URL string `json:"url"`
    // One or more Persons or Organizations that created the dataset.
    Creator []string `json:"creator,omitempty"`
    // The date the dataset was published.
    DatePublished string `json:"datePublished,omitempty"`

    // Keywords associated with the dataset.
    Keywords []string `json:"keywords,omitempty"`
    // Publisher of the dataset, sometimes distinct from the creator.
    Publisher []string `json:"publisher,omitempty"`
    // Version of the dataset.
    // Either a single int, or a MAJOR.MINOR.PATCH semantic version string.
    // [Semantic Versioning 2.0.0](https://semver.org/spec/v2.0.0.html)
    Version string `json:"version,omitempty"`
    // Date the dataset was initially created.
    DateCreated string `json:"dateCreated,omitempty"`
    // Date the dataset was last modified.
    DateModified string `json:"dateModified,omitempty"`
    // List of URLs that represent the same dataset as this one.
    SameAs []string `json:"sameAs,omitempty"`
    // License that applies to the Croissant metadata.
    SdLicense []string `json:"sdLicense,omitempty"`
    // Language of the content of the dataset.
    InLanguage []string `json:"inLanguage,omitempty"`

    // Whether the dataset is a live dataset (in the process of being updated).
    // Because of omitempty, a false value is left out of the JSON output.
    IsLiveDataset bool `json:"isLiveDataset,omitempty"`

    // List of FileObjects and FileSets associated with the dataset.
    // Modified from schema.org/Dataset.
    // Required.
    // NOTE(review): documented as required but tagged omitempty, so an empty
    // Distribution is dropped from the JSON output — confirm this is intended.
    Distribution Distribution `json:"distribution,omitempty"`

    // List of RecordSets associated with the dataset.
    RecordSets []RecordSet `json:"recordSet"`
}

func NewDataSet

func NewDataSet() *DataSet

func NewDataSetFromPath

func NewDataSetFromPath(filePath string) (*DataSet, error)

func NewFileSet

func NewFileSet() *DataSet

func (*DataSet) Validate

func (ds *DataSet) Validate() ([]types.CroissantWarning, []types.CroissantError)

func (*DataSet) ValidateRecommendedProps

func (ds *DataSet) ValidateRecommendedProps() ([]types.CroissantWarning, []types.CroissantError)

func (*DataSet) ValidateRequiredProps

func (ds *DataSet) ValidateRequiredProps() []types.CroissantError

func (*DataSet) WriteToFile

func (ds *DataSet) WriteToFile(path string) error

type DataSource

// DataSource describes where the data comes from (a FileObject, FileSet or
// RecordSet reference) and how it is extracted, transformed and parsed.
// Every field except the embedded ClassRefItem is an optional pointer that
// is left out of the JSON output when nil.
type DataSource struct {
    // Must be DataSource.
    NType *string `json:"@type,omitempty"`
    // Node ID.
    ClassRefItem
    // The name of the referenced FileObject source of the data.
    FileObject *ClassRefItem `json:"fileObject,omitempty"`
    // The name of the referenced FileSet source.
    FileSet *ClassRefItem `json:"fileSet,omitempty"`
    // The name of the referenced RecordSet source.
    RecordSet *ClassRefItem `json:"recordSet,omitempty"`
    // The extraction method from the provided source.
    Extract *Extract `json:"extract,omitempty"`
    // Transformations to apply to the data after extraction.
    Transform *Transform `json:"transform,omitempty"`
    // A format to parse data values from text.
    Format *Format `json:"format,omitempty"`
}

func NewDataSource

func NewDataSource() *DataSource

type DataType

// DataType wraps a single MIME type string.
type DataType struct {
    // MIME type.
    DataType string
}

func NewDataType

func NewDataType(mimeType string) *DataType

type Distribution

Type used to group data resource objects together.

// Distribution groups data resource objects (DistributionItem values)
// together. It defines its own UnmarshalJSON method.
type Distribution []DistributionItem

func (*Distribution) UnmarshalJSON

func (d *Distribution) UnmarshalJSON(data []byte) error

type DistributionItem

// DistributionItem is any element that can be stored in a Distribution.
// Implementations only need a Validate method.
type DistributionItem interface {
    // Validate returns the warnings and the errors found on the item.
    Validate() ([]types.CroissantWarning, []types.CroissantError)
}

type Extract

// Extract describes how values are obtained from a source. All fields are
// optional and are left out of the JSON output when empty.
type Extract struct {
    // Extraction method.
    FileProperty string `json:"fileProperty,omitempty"`
    // Name of the column (field) that contains the values.
    Column string `json:"column,omitempty"`
    // A JSON path expression that obtains the values.
    JsonPath string `json:"jsonPath,omitempty"`
}

func NewExtract

func NewExtract() *Extract

type Field

// Field describes one data element of a RecordSet. Fields can nest through
// SubField and can point at other fields through References and ParentField.
type Field struct {
    // Must be field.
    Type string `json:"@type"`
    // Node ID.
    ClassRefItem
    // Name of the Field.
    Name string `json:"name"`
    // Description of the Field.
    Description string `json:"description"`
    // The data types that correspond to the Field.
    DataType types.StringOrSlice `json:"dataType,omitempty"`
    // The source of data for the Field.
    Source *DataSource `json:"source,omitempty"`
    // If true, the Field is a list of DataType values.
    Repeated bool `json:"repeated,omitempty"`
    // A property URL that is equivalent to this Field.
    EquivalentProperty string `json:"equivalentProperty,omitempty"`
    // References to other Fields in a different RecordSet.
    References FieldRefSlice `json:"references,omitempty"`
    // List of Fields nested inside this one.
    SubField []Field `json:"subField,omitempty"`
    // References to other Fields in the same RecordSet.
    ParentField FieldRefSlice `json:"parentField,omitempty"`
}

func NewField

func NewField() *Field

func (*Field) Validate

func (obj *Field) Validate() ([]types.CroissantWarning, []types.CroissantError)

type FieldRef

// FieldRef is a reference to a Field, serialized as an object with a single
// "field" key.
type FieldRef struct {
    // ID reference of the target Field.
    Field ClassRefItem `json:"field"`
}

type FieldRefSlice

// FieldRefSlice is a list of FieldRef values. It defines its own MarshalJSON
// and UnmarshalJSON methods.
type FieldRefSlice []FieldRef

func (FieldRefSlice) MarshalJSON

func (ref FieldRefSlice) MarshalJSON() ([]byte, error)

func (*FieldRefSlice) UnmarshalJSON

func (ref *FieldRefSlice) UnmarshalJSON(data []byte) error

type FileObject

// FileObject describes a single file resource of a dataset.
type FileObject struct {
    // Must be FileObject.
    Type string `json:"@type"`
    // Node ID.
    ClassRefItem
    // The name of the file.
    Name string `json:"name"`
    // Description of the file.
    Description string `json:"description"`
    // URL to the actual bytes of the file object.
    ContentURL string `json:"contentUrl"`
    // File size in [mega/kilo/...]bytes.
    // Defaults to bytes if no unit is specified.
    ContentSize string `json:"contentSize,omitempty"`
    // Format of the file, given as a MIME type.
    EncodingFormat string `json:"encodingFormat"`
    // Checksum of the file contents.
    Sha256 string `json:"sha256,omitempty"`
    // Another FileObject or FileSet this resource is contained in.
    ContainedIn ClassRefList `json:"containedIn,omitempty"`
}

func NewFileObject

func NewFileObject() *FileObject

func (*FileObject) Update

func (*FileObject) Update() error

Update FileObject struct from resource.

func (*FileObject) Validate

func (obj *FileObject) Validate() ([]types.CroissantWarning, []types.CroissantError)

func (*FileObject) ValidateHash

func (*FileObject) ValidateHash() error

type FileSet

// FileSet describes a set of files selected by glob patterns from within
// another FileSet or FileObject.
type FileSet struct {
    // Must be FileSet.
    Type string `json:"@type"`
    // Node ID.
    ClassRefItem
    // Name of the FileSet.
    Name string `json:"name"`
    // Description of the FileSet.
    Description string `json:"description"`
    // The FileSet or FileObject the resource is contained in.
    ContainedIn ClassRefList `json:"containedIn"`
    // MIME type.
    EncodingFormat string `json:"encodingFormat"`
    // A glob pattern of files to include.
    Includes string `json:"includes"`
    // A glob pattern of files to exclude.
    Excludes string `json:"excludes,omitempty"`
}

func (*FileSet) Update

func (*FileSet) Update() error

Update FileSet struct from resource.

func (*FileSet) Validate

func (obj *FileSet) Validate() ([]types.CroissantWarning, []types.CroissantError)

type Format

// Format has no fields; DataSource.Format points to it.
type Format struct{}

type RecordSet

// RecordSet describes a set of records: the Fields they are made of, an
// optional key, and optional inline data or example records.
type RecordSet struct {
    // Must be RecordSet.
    Type string `json:"@type"`
    // Node ID.
    ClassRefItem
    // Name of the RecordSet.
    Name string `json:"name"`
    // Description of the RecordSet.
    Description string `json:"description"`
    // The data types that correspond to all Fields in the RecordSet.
    DataType types.StringOrSlice `json:"dataType"`
    // One or more Fields that uniquely identify records in the RecordSet.
    Key ClassRefList `json:"key,omitempty"`
    // List of data element Fields that appear in the RecordSet.
    Field []Field `json:"field"`
    // One or more records that constitute the data of the RecordSet.
    Data []interface{} `json:"data,omitempty"`
    // One or more records provided as an example of the RecordSet.
    Examples []interface{} `json:"examples,omitempty"`
}

func NewRecordSet

func NewRecordSet() *RecordSet

func (*RecordSet) Validate

func (obj *RecordSet) Validate() ([]types.CroissantWarning, []types.CroissantError)

type Split

// Split holds one string each for the train, test and validation splits.
// NOTE(review): what the strings contain (names, URLs, ...) is not visible
// here — confirm against NewSplit and its callers.
type Split struct {
    TrainSplit      string
    TestSplit       string
    ValidationSplit string
}

func NewSplit

func NewSplit(trainSplit string, testSplit string, validationSplit string) *Split

type Transform

// Transform describes the transformations applied to data after extraction.
// All fields are optional and are left out of the JSON output when empty.
type Transform struct {
    // Split the data source string on this character.
    // NOTE(review): SuggestedContext declares the term "separator", while
    // this field is serialized as "delimiter" — confirm which key is intended.
    Delimiter string `json:"delimiter,omitempty"`
    // Apply this regex to the data source.
    Regex string `json:"regex,omitempty"`
    // The path to extract JSON from.
    JsonPath string `json:"jsonPath,omitempty"`
    // A JSON query to evaluate against the data source.
    // NOTE(review): the tag is all-lowercase "jsonquery", unlike the
    // camelCase "jsonPath" above — confirm the casing is intentional.
    JsonQuery string `json:"jsonquery,omitempty"`
}

func NewTransform

func NewTransform() *Transform