diff options
Diffstat (limited to 'datamaps/reader.go')
-rw-r--r-- | datamaps/reader.go | 293 |
1 files changed, 293 insertions, 0 deletions
diff --git a/datamaps/reader.go b/datamaps/reader.go new file mode 100644 index 0000000..25e6c99 --- /dev/null +++ b/datamaps/reader.go @@ -0,0 +1,293 @@ +/* datmamaps packages handles datamap files and populated spreadsheets. + */ + +package datamaps + +import ( + "database/sql" + "encoding/csv" + "errors" + "fmt" + "io" + "io/ioutil" + "log" + "path/filepath" + "strings" + + // Required for the sqlite3 driver + _ "github.com/mattn/go-sqlite3" + + "github.com/tealeg/xlsx/v3" +) + +type ( + // sheetData is the data from the sheet. + sheetData map[string]extractedCell + + // FileData is the data from the file. + FileData map[string]sheetData + + // ExtractedData is the Extracted data from the file, filtered by a Datamap. + ExtractedData map[string]map[string]xlsx.Cell +) + +//datamapLine - a line from the datamap. +type datamapLine struct { + Key string + Sheet string + Cellref string +} + +//extractedCell is data pulled from a cell. +type extractedCell struct { + Cell *xlsx.Cell + Col string + Row int + Value string +} + +var ( + inner = make(sheetData) +) + +// ExtractedDatamapFile is a slice of datamapLine structs, each of which encodes a single line +// in the datamap file/database table. +type ExtractedDatamapFile []datamapLine + +//sheetInSlice is a helper which returns true +// if a string is in a slice of strings. +func sheetInSlice(list []string, key string) bool { + for _, x := range list { + if x == key { + return true + } + } + + return false +} + +//getSheetNames returns the number of Sheet field entries +// in a slice of datamapLine structs. +func getSheetNames(dmls ExtractedDatamapFile) []string { + var sheetNames []string + + for _, dml := range dmls { + if !sheetInSlice(sheetNames, dml.Sheet) { + sheetNames = append(sheetNames, dml.Sheet) + } + } + + return sheetNames +} + +// ReadDML returns a slice of datamapLine structs given a +// path to a datamap file. +func ReadDML(path string) (ExtractedDatamapFile, error) { + var s ExtractedDatamapFile + + data, err := ioutil.ReadFile(path) + + if err != nil { + return s, fmt.Errorf("Cannot find file: %s", path) + } + + r := csv.NewReader(strings.NewReader(string(data))) + + for { + record, err := r.Read() + if err == io.EOF { + break + } + + if err != nil { + return s, errors.New("Cannot read line %s") + } + + if record[0] == "cell_key" { + // this must be the header + continue + } + + dml := datamapLine{ + Key: strings.Trim(record[0], " "), + Sheet: strings.Trim(record[1], " "), + Cellref: strings.Trim(record[2], " ")} + s = append(s, dml) + } + + return s, nil +} + +// cellVisitor is used by datamaps.rowVisitor() and is called +// on every cell in the target xlsx file in order to extract +// the data. +func cellVisitor(c *xlsx.Cell) error { + x, y := c.GetCoordinates() + cellref := xlsx.GetCellIDStringFromCoords(x, y) + + // TODO: we need to store the c.NumFmt value here in the + // database so we can reply it again when we write the values + // to the master or elsewhere. We should keep the value itself + // in its unformatted state - i.e a date being something like 488594. + + ex := extractedCell{ + Cell: c, + Value: c.Value, + } + + inner[cellref] = ex + + return nil +} + +// rowVisitor is used as a callback by xlsx.sheet.ForEachRow(). It wraps +// a call to xlsx.Row.ForEachCell() which actually extracts the data. +func rowVisitor(r *xlsx.Row) error { + if err := r.ForEachCell(cellVisitor, xlsx.SkipEmptyCells); err != nil { + return err + } + return nil +} + +// ReadXLSX returns a file at path's data as a map, +// keyed on sheet name. All values are returned as strings. +// Paths to a datamap and the spreadsheet file required. +func ReadXLSX(path string) FileData { + wb, err := xlsx.OpenFile(path) + if err != nil { + fmt.Errorf("cannot open file at %s - %v", path, err) + } + + outer := make(FileData, 1) + + // get the data + for _, sheet := range wb.Sheets { + + if err := sheet.ForEachRow(rowVisitor); err != nil { + fmt.Errorf("cannot call ForEachRow() in sheet %s - %v", sheet.Name, err) + } + outer[sheet.Name] = inner + inner = make(sheetData) + } + + return outer +} + +// DatamapFromDB creates an ExtractedDatamapFile from the database given +// the name of a datamap. Of course, in this instance, the data is not +// coming from a datamap file (such as datamap.csv) but from datamap data +// previous stored in the database by DatamapToDB or similar. +func DatamapFromDB(name string, db *sql.DB) (ExtractedDatamapFile, error) { + + var out ExtractedDatamapFile + + query := ` + select + key, sheet, cellref + from datamap_line + join datamap on datamap_line.dm_id = datamap.id where datamap.name = ?; + ` + rows, err := db.Query(query, name) + if err != nil { + erstr := fmt.Sprintf("cannot query for datamap key, sheet and cellref - %v", err) + return nil, errors.New(erstr) + } + defer rows.Close() + + for rows.Next() { + var ( + key string + sheet string + cellref string + ) + if err := rows.Scan(&key, &sheet, &cellref); err != nil { + return nil, err + } + + out = append(out, datamapLine{Key: key, Sheet: sheet, Cellref: cellref}) + } + + return out, nil +} + +// ExtractDBDatamap uses a datamap named from the database db to extract values +// from the populated spreadsheet file file. +func ExtractDBDatamap(name string, file string, db *sql.DB) (ExtractedData, error) { + ddata, err := DatamapFromDB(name, db) // this will need to return an ExtractedDatamapFile + if err != nil { + erstr := fmt.Sprintf("cannot call DatamapFromDB() - %v", err) + return nil, errors.New(erstr) + } + if len(ddata) == 0 { + return nil, fmt.Errorf("there is no datamap in the database matching name '%s'. Try running 'datamaps datamap --import...'", name) + } + xdata := ReadXLSX(file) + + names := getSheetNames(ddata) + outer := make(ExtractedData, len(names)) + // var inner map[string]xlsx.Cell + + for _, s := range names { + outer[s] = make(map[string]xlsx.Cell) + } + + for _, i := range ddata { + sheet := i.Sheet + cellref := i.Cellref + + if val, ok := xdata[sheet][cellref]; ok { + outer[sheet][cellref] = *val.Cell + } + } + + return outer, nil +} + +// extract returns the file at path's data as a map, +// using the datamap as a filter, keyed on sheet name. All values +// are returned as strings. (Currently deprecated in favour of +// ExtractDBDatamap. +func extract(dm string, path string) ExtractedData { + xdata := ReadXLSX(path) + ddata, err := ReadDML(dm) + + if err != nil { + log.Fatal(err) + } + + names := getSheetNames(ddata) + outer := make(ExtractedData, len(names)) + inner := make(map[string]xlsx.Cell) + + for _, i := range ddata { + sheet := i.Sheet + cellref := i.Cellref + + if val, ok := xdata[sheet][cellref]; ok { + inner[cellref] = *val.Cell + outer[sheet] = inner + } + } + + return outer +} + +//getTargetFiles finds all xlsx and xlsm files in directory. +func getTargetFiles(path string) ([]string, error) { + if lastchar := path[len(path)-1:]; lastchar != string(filepath.Separator) { + return nil, fmt.Errorf("path must end in a %s character", string(filepath.Separator)) + } + + fullpath := strings.Join([]string{path, "*.xls[xm]"}, "") + output, err := filepath.Glob(fullpath) + + if err != nil { + return nil, err + } + + if output == nil { + return nil, fmt.Errorf("cannot find any xlsx files in %s", path) + } + + return output, nil +} |