#!/usr/bin/python3
"""Convert a benchmark data set into a normalized C array.

The input file uses a header of ``@attribute <name> <type>`` lines followed
by ``@data`` and comma-separated rows (the syntax resembles Weka's ARFF
format).  Every value is scaled into the range [0, 1] and the result is
written out as a ``const double`` two-dimensional C array.
"""

import json
import sys
from typing import Optional

BENCHMARK_FILENAME = "benchmark-data.txt"
OUTPUT_FILENAME = "benchmark-data.c"


def parse_attribute_line(line: str) -> dict:
    """Parse one ``@attribute <name> <type>`` header line.

    Args:
        line: The raw header line; surrounding whitespace is ignored.

    Returns:
        A dict with the keys ``"name"`` and ``"value-types"``.  For numeric
        attributes ``"value-types"`` is ``"numeric"``.  For ``{a,b,c}``
        attributes it is ``"enum"`` and the dict additionally holds
        ``"values"`` (list of ``(index, name)`` pairs), ``"namedict"``
        (name -> index) and ``"normalized-values"`` (index -> value in
        [0, 1], evenly spaced).

    Raises:
        ValueError: If the line is not a well-formed attribute line or its
            type is neither ``numeric`` nor a ``{...}`` list.
    """
    # Split off only the keyword and the name, so that a value list written
    # with spaces after the commas ("{a, b, c}") stays in one piece.
    words = line.split(None, 2)
    if len(words) < 3 or words[0] != "@attribute":
        raise ValueError("Malformed attribute line: {!r}".format(line))

    type_spec = words[2].strip()
    attribute: dict = {"name": words[1]}

    if type_spec.split(None, 1)[0] == "numeric":
        attribute["value-types"] = "numeric"
    elif type_spec.startswith("{"):
        closing = type_spec.find("}")
        inner = type_spec[1:closing] if closing != -1 else type_spec[1:]
        names = [name.strip() for name in inner.split(",")]
        last_index = len(names) - 1

        attribute["value-types"] = "enum"
        attribute["values"] = list(enumerate(names))
        attribute["namedict"] = {
            name: number for number, name in attribute["values"]
        }
        # A single-valued enum has nothing to spread over [0, 1]; map it to
        # 0.0 instead of dividing by zero.
        attribute["normalized-values"] = [
            number / last_index if last_index else 0.0
            for number in range(len(names))
        ]
    else:
        raise ValueError(
            "Unsupported type {!r} for attribute {!r}".format(
                type_spec, words[1]))

    return attribute


def parse_data(attributes: list, line: str) -> list[float]:
    """Convert one comma-separated data row into a list of floats.

    Numeric fields are converted with ``float`` and normalized later by
    ``normalize``; enum fields are replaced right away by their
    pre-computed normalized value.

    Args:
        attributes: Attribute dicts as returned by ``parse_attribute_line``,
            in column order.
        line: The data row, without the trailing newline.

    Returns:
        One float per field, in column order.

    Raises:
        ValueError: If the number of fields differs from the number of
            attributes, a numeric field is not a number, an enum field
            holds a name that was not declared, or an attribute has an
            unknown value type.
    """
    fields = line.split(",")
    if len(fields) != len(attributes):
        raise ValueError(
            "Expected {} fields but found {}. Line: {}".format(
                len(attributes), len(fields), line))

    parsed_data = []
    for fieldnum, raw_field in enumerate(fields):
        field = raw_field.strip()
        attr = attributes[fieldnum]
        value_type = attr.get("value-types")

        if value_type == "numeric":
            # Just copy it as is; the normalization happens later.
            parsed_data.append(float(field))
        elif value_type == "enum":
            index = attr["namedict"].get(field)
            if index is None:
                raise ValueError(
                    "Unknown value {!r} for attribute {!r}. Line: {}".format(
                        field, attr["name"], line))
            parsed_data.append(attr["normalized-values"][index])
        else:
            # Skipping the field would shift every later column, so fail.
            raise ValueError(
                "Unknown value type at field {} ({}).\nLine: {}\nattr: {}"
                .format(fieldnum, field, line, json.dumps(attr)))

    return parsed_data


def update_min_max(min_max: list[list[Optional[float]]],
                   parsed_line: list[float]) -> None:
    """Fold one parsed row into the running per-column minimum and maximum.

    Args:
        min_max: Two lists; ``min_max[0]`` holds the column minima and
            ``min_max[1]`` the column maxima.  ``None`` marks a column that
            has not seen a value yet.  Updated in place.
        parsed_line: The row to take into account.
    """
    lows, highs = min_max[0], min_max[1]
    for fieldnum, field in enumerate(parsed_line):
        if lows[fieldnum] is None or field < lows[fieldnum]:
            lows[fieldnum] = field
        if highs[fieldnum] is None or field > highs[fieldnum]:
            highs[fieldnum] = field


def normalize(data: list[list[float]],
              min_max: list[list[float]],
              attributes: list) -> list[list[float]]:
    """Min-max scale every numeric column of ``data`` into [0, 1].

    Enum columns are copied unchanged because ``parse_data`` already stored
    their normalized value.  A numeric column whose minimum equals its
    maximum cannot be scaled: its value is kept when it already lies in
    [0, 1]; otherwise a diagnostic is written to stderr and the string
    ``'ERROR'`` is stored in place of the number.

    Args:
        data: Parsed rows.
        min_max: Column minima (``min_max[0]``) and maxima (``min_max[1]``).
        attributes: Attribute dicts, in column order.

    Returns:
        A new list of rows; the input rows are not modified.
    """
    normalized_data: list[list[float]] = []
    for line in data:
        normalized_line = []
        for fieldnum, field in enumerate(line):
            if attributes[fieldnum]["value-types"] == "enum":
                normalized_line.append(field)
                continue

            x_min = min_max[0][fieldnum]
            x_max = min_max[1][fieldnum]
            if x_min != x_max:
                normalized_line.append((field - x_min) / (x_max - x_min))
            elif 0 <= field <= 1:
                normalized_line.append(field)
            else:
                print("Problem with field {} ({}).\nLine: {}"
                      .format(fieldnum, field, line), file=sys.stderr)
                print("attr: ", json.dumps(attributes[fieldnum]),
                      file=sys.stderr)
                normalized_line.append('ERROR')
        normalized_data.append(normalized_line)
    return normalized_data


def main(benchmark_filename: str = BENCHMARK_FILENAME,
         output_filename: str = OUTPUT_FILENAME) -> None:
    """Read the benchmark file, normalize it and write the C source file.

    The parsed attributes, the normalized data and the per-column
    minimum/maximum are also dumped to stdout as JSON.

    Args:
        benchmark_filename: Path of the input data set.
        output_filename: Path of the C file to create.
    """
    attributes = []
    data = []
    # Start with empty lists so the summary below also works when the file
    # never reaches an "@data" line.
    min_max: list[list[Optional[float]]] = [[], []]

    with open(benchmark_filename, 'r', encoding="utf-8") as benchmark_file:
        data_started = False
        for line in benchmark_file:
            stripped = line.rstrip()
            if not stripped:
                continue

            if data_started:
                parsed_line = parse_data(attributes, stripped)
                data.append(parsed_line)
                update_min_max(min_max, parsed_line)
            elif not line.startswith('@'):
                # Should not happen
                print("What the hell happened here?", file=sys.stderr)
            elif line.startswith("@attribute "):
                attributes.append(parse_attribute_line(line))
            elif stripped == "@data":
                min_max = [[None] * len(attributes),
                           [None] * len(attributes)]
                data_started = True

    data = normalize(data, min_max, attributes)

    separator = '\n' + ('-' * 76)
    print(json.dumps(attributes, indent=4))
    print(separator)
    print(json.dumps(data, indent=4))
    print(separator)
    names = [attr["name"] for attr in attributes]
    print(json.dumps(list(zip(min_max[0], min_max[1], names)), indent=4))

    with open(output_filename, 'w', encoding="utf-8") as out:
        print("const double benchmark_data[{}][{}] = {}"
              .format(len(data), len(attributes), '{'), file=out)
        for row in data:
            print("\t{" + ", ".join(str(field) for field in row) + "},",
                  file=out)
        print("};", file=out)


if __name__ == "__main__":
    main()