#!/usr/bin/env python3
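"""Convert an ARFF-style benchmark file into C and JSON data tables.

Reads BENCHMARK_FILENAME, min-max normalizes every column into [0, 1],
and writes the result both as a C source file (OUTPUT_FILENAME) and as
a JSON dump (OUTPUT_TABU).
"""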

import json
import sys

BENCHMARK_FILENAME = "benchmark-data.txt"
OUTPUT_FILENAME = "benchmark-data.c"
OUTPUT_TABU = "benchmark-data.json"
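
# The input is expected to look like this (a hypothetical ARFF-style
# example; the attribute names are illustrative only):
#
#   @attribute duration numeric
#   @attribute protocol {tcp,udp,icmp}
#   @data
#   0.5,tcp
#   1.2,udp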

def parse_attribute_line(line: str) -> dict:
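    """Parse an "@attribute" header line into a description dict.

    Numeric attributes get value-types == "numeric"; enum attributes
    ("{a,b,c}") additionally get a name-to-index mapping and the list
    of indices pre-normalized into [0, 1].
    """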
    words = line.split()
    assert words[0] == "@attribute"
    attribute = {"name": words[1]}
    if words[2] == "numeric":
        attribute["value-types"] = words[2]
    elif words[2].startswith('{'):
        # Re-join the remaining words so an enum list written with spaces
        # after the commas ("{a, b, c}") still parses; the values
        # themselves are assumed to contain no spaces.
        values = "".join(words[2:]).lstrip('{').rstrip('}').split(',')
        attribute["value-types"] = "enum"
        attribute["values"] = list(enumerate(values))
        attribute["namedict"] = {name: number
                                 for number, name in attribute["values"]}
        # Spread the enum indices over [0, 1]; guard against a one-value
        # enum, which would otherwise divide by zero.
        x_max = max(len(attribute["values"]) - 1, 1)
        attribute["normalized-values"] = [value / x_max
                                          for value, _ in attribute["values"]]
    return attribute

def parse_data(attributes: list, line: str) -> list[float]:
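    """Parse one comma-separated data line into a list of floats.

    Numeric fields are copied verbatim (normalization happens later, in
    normalize()); enum fields are replaced by the pre-normalized value
    computed in parse_attribute_line().
    """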
    parsed_data = []

    for fieldnum, field in enumerate(line.split(',')):
        attr = attributes[fieldnum]

        if attr["value-types"] == "numeric":
            # Numeric field: copy it as is; normalization happens later.
            parsed_data.append(float(field))

        elif attr["value-types"] == "enum":
            # Look up the pre-normalized value for the symbolic field.
            numeric_value = attr["namedict"].get(field)
            assert numeric_value is not None
            parsed_data.append(attr["normalized-values"][numeric_value])

        else:
            print("Unknown value type at field {} ({}). Line: {}"
                  .format(fieldnum, field, line), file=sys.stderr)
            print("attr: ", json.dumps(attr), file=sys.stderr)

    return parsed_data

def update_min_max(min_max: list[list[float | None]],
                   parsed_line: list[float]) -> None:
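    """Fold one parsed line into the running per-column minima/maxima."""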
    for fieldnum, field in enumerate(parsed_line):
        oldmin = min_max[0][fieldnum]
        oldmax = min_max[1][fieldnum]
        min_max[0][fieldnum] = field if oldmin is None or field < oldmin else oldmin
        min_max[1][fieldnum] = field if oldmax is None or field > oldmax else oldmax

def normalize(data: list[list[float]],
              min_max: list[list[float | None]],
              attributes: list) -> list[list[float]]:
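    """Min-max scale every numeric column of data into [0, 1].

    Enum columns are passed through unchanged: they were already
    normalized when the attribute header was parsed.
    """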
    normalized_data: list[list[float]] = []

    for line in data:
        normalized_line = []

        for fieldnum, field in enumerate(line):

            # Fields of type enum are already normalized, so pass them
            # through unchanged.
            if attributes[fieldnum]["value-types"] == "enum":
                normalized_line.append(field)
                continue

            x_min = min_max[0][fieldnum]
            x_max = min_max[1][fieldnum]

            if x_min == x_max:
                # A constant column cannot be rescaled; keep the value
                # only if it already lies inside [0, 1].
                if 0 <= field <= 1:
                    normalized_line.append(field)
                else:
                    print("Problem with field {} ({}). Line: {}"
                          .format(fieldnum, field, line), file=sys.stderr)
                    print("attr: ", json.dumps(attributes[fieldnum]),
                          file=sys.stderr)
                    # 'ERROR' deliberately poisons the generated C file so
                    # the bad value cannot slip through unnoticed.
                    normalized_line.append('ERROR')
                continue

            normalized_value = (field - x_min) / (x_max - x_min)
            normalized_line.append(normalized_value)

        normalized_data.append(normalized_line)

    return normalized_data

def main():
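    """Read the benchmark file, normalize it, and emit the C and JSON files."""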
    attributes = []
    data = []
    min_max = [None, None]

    with open(BENCHMARK_FILENAME, 'r', encoding="utf-8") as benchmark_file:
        data_started = False

        for line in benchmark_file:
            # Skip blank lines.
            if not line.rstrip():
                continue

            if not data_started:
                # Header section.
                if line.startswith("@attribute "):
                    attributes.append(parse_attribute_line(line))
                elif line.rstrip() == "@data":
                    # Start the per-column minima/maxima as None so the
                    # first data value always replaces them.
                    min_max[0] = [None] * len(attributes)
                    min_max[1] = [None] * len(attributes)
                    data_started = True
                else:
                    # Any other header line; should not happen.
                    print("Unexpected header line:", line.rstrip(),
                          file=sys.stderr)
            else:
                # Data section.
                parsed_line = parse_data(attributes, line.rstrip())
                data.append(parsed_line)
                update_min_max(min_max, parsed_line)

    data = normalize(data, min_max, attributes)

    # Debug dump: attributes, normalized data, and per-attribute ranges.
    print(json.dumps(attributes, indent=4))
    print('\n' + ('-' * 76))
    print(json.dumps(data, indent=4))
    print('\n' + ('-' * 76))
    print(json.dumps(list(zip(min_max[0], min_max[1],
                              [attr["name"] for attr in attributes])), indent=4))
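
    # The generated C file relies on a companion benchmark-data.h. A
    # minimal sketch of what it presumably declares (the element type
    # "double" is an assumption):
    #
    #   typedef double *vector;
    #   extern vector *benchmark_data;
    #   extern const size_t N_VECTORS;
    #   extern const size_t VECTOR_SIZE;
    #   void init_benchmark_data(void);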
    with open(OUTPUT_FILENAME, 'w', encoding="utf-8") as out:
        print('#include "benchmark-data.h"\n#include <stdlib.h>\n', file=out)

        print("vector *benchmark_data;", file=out)
        print("const size_t N_VECTORS = {};".format(len(data)), file=out)
        print("const size_t VECTOR_SIZE = {};\n".format(len(attributes)),
              file=out)

        print("void init_benchmark_data(void)\n{", file=out)
        print("\tsize_t i;\n", file=out)

        print("\tbenchmark_data = calloc(N_VECTORS, sizeof(*benchmark_data));\n",
              file=out)

        print("\tfor (i = 0; i < N_VECTORS; i++)", file=out)
        print("\t\tbenchmark_data[i] = calloc(VECTOR_SIZE, "
              "sizeof(benchmark_data[i][0]));\n", file=out)

        # One C assignment per matrix element.
        for nv, vector in enumerate(data):
            for na, value in enumerate(vector):
                print("\tbenchmark_data[{}][{}] = {};".format(nv, na, value),
                      file=out)

        print("}", file=out)

    with open(OUTPUT_TABU, 'w', encoding="utf-8") as out:
        print(json.dumps(data), file=out)

if __name__ == "__main__":
    main()