Blame - tools/verify_assembler_traces.py - arm/vixl.git

blob: 1162a85feff892787401665083576d61bbc154a3 [file] [log] [blame]

Pierre Langlois	88c46b8	2016-06-02 18:15:32 +0100	[diff] [blame^]	1	#!/usr/bin/env python3
				2
				3	# Copyright 2016, ARM Limited
				4	# All rights reserved.
				5	#
				6	# Redistribution and use in source and binary forms, with or without
				7	# modification, are permitted provided that the following conditions are met:
				8	#
				9	# * Redistributions of source code must retain the above copyright notice,
				10	# this list of conditions and the following disclaimer.
				11	# * Redistributions in binary form must reproduce the above copyright notice,
				12	# this list of conditions and the following disclaimer in the documentation
				13	# and/or other materials provided with the distribution.
				14	# * Neither the name of ARM Limited nor the names of its contributors may be
				15	# used to endorse or promote products derived from this software without
				16	# specific prior written permission.
				17	#
				18	# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS CONTRIBUTORS "AS IS" AND
				19	# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
				20	# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
				21	# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
				22	# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
				23	# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
				24	# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
				25	# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
				26	# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
				27	# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
				28
				29	"""
				30	Verify generated AArch32 assembler traces against `llvm-mc`.
				31
				32	This script will find all files in `test/a32/traces/` with names starting with
				33	`assembler`, and check them against `llvm-mc`. It checks our assembler is
				34	correct by looking up what instruction we meant to assemble, assemble it with
				35	`llvm` and check the result is bit identical to what our assembler generated.
				36
				37	You may run the script with no arguments from VIXL's top-level directory as long
				38	as `llvm-mc` is in your PATH. You may provide a different `llvm-mc` path with
				39	the `--llvm-mc` option. This script relies on version 3.8 or higher of
				40	LLVM. Previous versions refuse to assemble some instructions that ARMv8 allows,
				41	but ARMv7 did not.
				42
				43	For example, let's say we have the following assembler trace for CLZ
				44	(the real trace is a lot bigger):
				45
				46	~~~
				47	static const byte kInstruction_Clz_eq_r0_r0[] = {
				48	0x10, 0x0f, 0x6f, 0x01 // Clz eq r0 r0
				49	};
				50	static const byte kInstruction_Clz_eq_r0_r1[] = {
				51	0x11, 0x0f, 0x6f, 0x01 // Clz eq r0 r1
				52	};
				53	static const byte kInstruction_Clz_eq_r0_r2[] = {
				54	0x12, 0x0f, 0x6f, 0x01 // Clz eq r0 r2
				55	};
				56	static const TestResult kReferenceClz[] = {
				57	{
				58	ARRAY_SIZE(kInstruction_Clz_eq_r0_r0),
				59	kInstruction_Clz_eq_r0_r0,
				60	},
				61	{
				62	ARRAY_SIZE(kInstruction_Clz_eq_r0_r1),
				63	kInstruction_Clz_eq_r0_r1,
				64	},
				65	{
				66	ARRAY_SIZE(kInstruction_Clz_eq_r0_r2),
				67	kInstruction_Clz_eq_r0_r2,
				68	},
				69	};
				70	~~~
				71
				72	The traces contain both the list of bytes that were encoded as well as a comment
				73	with a description of the instruction this is. This script searches for these
				74	lines and checks them.
				75
				76	With our example, the script will find the following:
				77
				78	[
				79	("Clz eq r0 r0", ["0x10", "0x0f", "0x6f", "0x01"]),
				80	("Clz eq r0 r1", ["0x11", "0x0f", "0x6f", "0x01"]),
				81	("Clz eq r0 r2", ["0x12", "0x0f", "0x6f", "0x01"])
				82	]
				83
				84	Then the tricky part is to convert the description of the instruction into the
				85	following valid assembly syntax:
				86
				87	clzeq r0, r0
				88	clzeq r0, r1
				89	clzeq r0, r2
				90
				91	Our example is easy, but it gets more complicated with load and store
				92	instructions for example. We can feed this as input to `llvm-mc`:
				93
				94	$ echo "
				95	clzeq r0, r0
				96	clzeq r0, r1
				97	clzeq r0, r2
				98	" | llvm-mc -assemble -arch=arm -mattr=v8,crc -show-encoding
				99
				100	And we will get the following output:
				101
				102	.text
				103	clzeq r0, r0 @ encoding: [0x10,0x0f,0x6f,0x01]
				104	clzeq r0, r1 @ encoding: [0x11,0x0f,0x6f,0x01]
				105	clzeq r0, r2 @ encoding: [0x12,0x0f,0x6f,0x01]
				106
				107	The script will finally extract the encoding and compare it to what VIXL
				108	generated.
				109	"""
				110
				111	import argparse
				112	import subprocess
				113	import os
				114	import re
				115	import itertools
				116	import types
				117
				118	def BuildOptions():
				119	result = argparse.ArgumentParser(
				120	description = 'Use `llvm-mc` to check the assembler traces are correct.',
				121	formatter_class = argparse.ArgumentDefaultsHelpFormatter)
				122	result.add_argument('--llvm-mc', default='llvm-mc', help='Path to llvm-mc')
				123	result.add_argument('--verbose', '-v', action='store_true')
				124	return result.parse_args()
				125
				126
				127	def CheckLLVMVersion(llvm_mc):
				128	version = subprocess.check_output([llvm_mc, '-version'])
				129	m = re.search("^ LLVM version (\d)\.(\d)\.\d$", version.decode(), re.M)
				130	major, minor = m.groups()
				131	if int(major) < 3 or (int(major) == 3 and int(minor) < 8):
				132	raise Exception("This script requires LLVM version 3.8 or higher.")
				133
				134
				135	def ConvertToLLVMFormat(vixl_instruction, triple):
				136	"""
				137	Take an string representing an instruction and convert it to assembly syntax
				138	for LLVM. VIXL's test generation framework will print instruction
				139	representations as a space seperated list. The first element is the mnemonic
				140	and the following elements are operands.
				141	"""
				142
				143	# Dictionnary of patterns. The key is an identifier used in
				144	# `llvm_mc_instruction_converters` below. The value needs to be a capturing
				145	# regular expression.
				146	pattern_matchers = {
				147	"mnemonic": "(\w+)",
				148	"condition":
				149	"(al\|eq\|ne\|cs\|cc\|mi\|pl\|vs\|vc\|hi\|ls\|ge\|lt\|gt\|le)",
				150	"register":
				151	"(r0\|r1\|r2\|r3\|r4\|r5\|r6\|r7\|r8\|r9\|r10\|r11\|r12\|r13\|r14\|r15\|pc\|sp\|lr)",
				152	"immediate": "(0x[0-9a-f]+\|[0-9]+)",
				153	"shift": "(lsl\|lsr\|asr\|ror)",
				154	}
				155
				156	# List of converters. Each of them represents an instruction form and what to
				157	# convert it to. This list needs to be complete; an exception is raised if we
				158	# couldn't find a converter for the instruction.
				159	#
				160	# The first part of each tuple is a pattern to match. It's simply a regular
				161	# expression. Additionally, each identifier in curly braces is replaced by the
				162	# corresponding pattern from `pattern_matchers`.
				163	#
				164	# The second part of the tuple is a string that describes what the result will
				165	# look like. Empty curly braces are replaced by matches, in order.
				166	llvm_mc_instruction_converters = [
				167	("it {condition}", "it {}"),
				168	("{mnemonic} {condition} {register} {immediate}",
				169	"{}{} {}, #{}"),
				170	("{mnemonic} {condition} {register} {register} {immediate}",
				171	"{}{} {}, {}, #{}"),
				172	("{mnemonic} {condition} {register} {register}",
				173	"{}{} {}, {}"),
				174	("{mnemonic} {condition} {register} {register} {register}",
				175	"{}{} {}, {}, {}"),
				176	("{mnemonic} {register} {register} {register}",
				177	"{} {}, {}, {}"),
				178	("{mnemonic} {condition} {register} {register} {immediate}",
				179	"{}{} {}, {}, #{}"),
				180	("{mnemonic} {condition} {register} {register} {register} {shift} "
				181	"{immediate}",
				182	"{}{} {}, {}, {}, {} #{}"),
				183	("{mnemonic} {condition} {register} {register} {register} {shift} "
				184	"{register}",
				185	"{}{} {}, {}, {}, {} {}"),
				186	("{mnemonic} {condition} {register} {register} {shift} {immediate}",
				187	"{}{} {}, {}, {} #{}"),
				188	("{mnemonic} {condition} {register} {register} {shift} {register}",
				189	"{}{} {}, {}, {} {}"),
				190	("{mnemonic} {condition} {register} {register} plus {immediate} offset",
				191	"{}{} {}, [{}, #{}]"),
				192	("{mnemonic} {condition} {register} {register} minus {immediate} offset",
				193	"{}{} {}, [{}, #-{}]"),
				194	("{mnemonic} {condition} {register} {register} plus {immediate} postindex",
				195	"{}{} {}, [{}], #{}"),
				196	("{mnemonic} {condition} {register} {register} minus {immediate} "
				197	"postindex",
				198	"{}{} {}, [{}], #-{}"),
				199	("{mnemonic} {condition} {register} {register} plus {immediate} preindex",
				200	"{}{} {}, [{}, #{}]!"),
				201	("{mnemonic} {condition} {register} {register} minus {immediate} "
				202	"preindex",
				203	"{}{} {}, [{}, #-{}]!"),
				204	("{mnemonic} {condition} {register} {register} plus {register} offset",
				205	"{}{} {}, [{}, {}]"),
				206	("{mnemonic} {condition} {register} {register} minus {register} offset",
				207	"{}{} {}, [{}, -{}]"),
				208	("{mnemonic} {condition} {register} {register} plus {register} postindex",
				209	"{}{} {}, [{}], {}"),
				210	("{mnemonic} {condition} {register} {register} minus {register} "
				211	"postindex",
				212	"{}{} {}, [{}], -{}"),
				213	("{mnemonic} {condition} {register} {register} plus {register} preindex",
				214	"{}{} {}, [{}, {}]!"),
				215	("{mnemonic} {condition} {register} {register} minus {register} preindex",
				216	"{}{} {}, [{}, -{}]!"),
				217	("{mnemonic} {condition} {register} {register} plus {register} {shift} "
				218	"{immediate} offset",
				219	"{}{} {}, [{}, {}, {} #{}]"),
				220	("{mnemonic} {condition} {register} {register} minus {register} {shift} "
				221	"{immediate} offset",
				222	"{}{} {}, [{}, -{}, {} #{}]"),
				223	("{mnemonic} {condition} {register} {register} plus {register} {shift} "
				224	"{immediate} postindex",
				225	"{}{} {}, [{}], {}, {} #{}"),
				226	("{mnemonic} {condition} {register} {register} minus {register} {shift} "
				227	"{immediate} postindex",
				228	"{}{} {}, [{}], -{}, {} #{}"),
				229	("{mnemonic} {condition} {register} {register} plus {register} {shift} "
				230	"{immediate} preindex",
				231	"{}{} {}, [{}, {}, {} #{}]!"),
				232	("{mnemonic} {condition} {register} {register} minus {register} {shift} "
				233	"{immediate} preindex",
				234	"{}{} {}, [{}, -{}, {} #{}]!"),
				235	]
				236
				237	# Work around issues in LLVM 3.8.
				238	if triple == "thumbv8":
				239	def ConvertMovRdImm(matches):
				240	"""
				241	LLVM chooses the T3 encoding for `mov <rd>, #<immediate>` when the
				242	immediate fits both into a modified immediate (T2 encoding) and 16
				243	bits (T3 encoding). Adding the `.W` modifier forces the T2 encoding to
				244	be used.
				245	"""
				246	# The immediate is the second capture in "mov al {register} {immediate}".
				247	imm = int(matches[1], 16)
				248	if imm <= 0xffff:
				249	lsb = imm & -imm
				250	if (imm >> 8) < lsb:
				251	return "mov.w {}, #{}".format(*matches)
				252	# Fall back to a LLVM making the right decision.
				253	return "mov {}, #{}".format(*matches)
				254	llvm_mc_instruction_converters[:0] = [
				255	# The ARM ARM specifies that if <Rn> is PC in either an ADD or SUB
				256	# instruction with an immediate, the assembler should use the ADR
				257	# encoding. LLVM does not know about this subtlety. We get around this
				258	# by manually translating the instruction to their ADR form.
				259	("add al {register} pc {immediate}", "adr {}, #{}"),
				260	("sub al {register} pc {immediate}", "adr {}, #-{}"),
				261
				262	# LLVM is (rightfully) being helpful by swapping register operands so
				263	# that the 16 bit encoding of the following instructions is used.
				264	# However, VIXL does not do this. These rules specifically add the `.w`
				265	# modifier to force LLVM to use the 32 bit encoding if the last register
				266	# is identical to first one. But at the same time, we should still use
				267	# the narrow encoding if all registers are the same.
				268	("adcs al {register} (\\1) (\\1)", "adcs.n {}, {}, {}"),
				269	("adcs al {register} {register} (\\1)", "adcs.w {}, {}, {}"),
				270	("orrs al {register} (\\1) (\\1)", "orrs.n {}, {}, {}"),
				271	("orrs al {register} {register} (\\1)", "orrs.w {}, {}, {}"),
				272	("eors al {register} (\\1) (\\1)", "eors.n {}, {}, {}"),
				273	("eors al {register} {register} (\\1)", "eors.w {}, {}, {}"),
				274	("ands al {register} (\\1) (\\1)", "ands.n {}, {}, {}"),
				275	("ands al {register} {register} (\\1)", "ands.w {}, {}, {}"),
				276	# Solve the same issue as for the previous rules, however, we need to
				277	# take into account that ADD instructions with the stack pointer have
				278	# additional 16 bit forms.
				279	("add al {register} (\\1) (\\1)", "add.n {}, {}, {}"),
				280	("add al {register} (\\1) r13", "add.w {}, {}, sp"),
				281	("add al {register} r13 (\\1)", "add.n {}, sp, {}"),
				282	("add al {register} {register} (\\1)", "add.w {}, {}, {}"),
				283	("mov al {register} {immediate}", ConvertMovRdImm)
				284	]
				285
				286	# Our test generator framework uses mnemonics starting with a capital letters.
				287	# We need everythin to be lower case for LLVM.
				288	vixl_instruction = vixl_instruction.lower()
				289
				290	llvm_instruction = []
				291
				292	# VIXL may have generated more than one instruction seperated by ';'
				293	# (an IT instruction for example).
				294	for instruction in vixl_instruction.split(';'):
				295	# Strip out extra white spaces.
				296	instruction = instruction.strip()
				297	# Try all converters in the list.
				298	for pattern, result in llvm_mc_instruction_converters:
				299	# Build the regular expression for this converter.
				300	instruction_matcher = "^" + pattern.format(**pattern_matchers) + "$"
				301	match = re.match(instruction_matcher, instruction)
				302	if match:
				303	# If we have a match, the object will contain a tuple of substrings.
				304	if isinstance(result, types.FunctionType):
				305	# `result` is a function, call it produce the instruction.
				306	llvm_instruction.append(result(match.groups()))
				307	else:
				308	# `result` is a string, use it as the format string.
				309	assert(isinstance(result, str))
				310	llvm_instruction.append(result.format(*match.groups()))
				311	break
				312
				313	if llvm_instruction:
				314	return "\n".join(llvm_instruction)
				315
				316	# No converters worked so raise an exception.
				317	raise Exception("Unsupported instruction {}.".format(instruction))
				318
				319
				320	def ReadTrace(trace):
				321	"""
				322	Receive the content of an assembler trace, extract the relevant information
				323	and return it as a list of tuples. The first part of each typle is a string
				324	representing the instruction. The second part is a list of bytes representing
				325	the encoding.
				326
				327	For example:
				328
				329	[
				330	("Clz eq r0 r0", ["0x10", "0x0f", "0x6f", "0x01"]),
				331	("Clz eq r0 r1", ["0x11", "0x0f", "0x6f", "0x01"]),
				332	("Clz eq r0 r2", ["0x12", "0x0f", "0x6f", "0x01"])
				333	]
				334	"""
				335
				336	pattern = re.compile(
				337	"^ (?P<encoding>(:?0x[0-9a-f]{2}, )+0x[0-9a-f]{2}) // (?P<instruction>.*)$",
				338	re.M)
				339	return [
				340	(m.group('instruction'), m.group('encoding').replace(" ", "").split(","))
				341	for m in re.finditer(pattern, trace)
				342	]
				343
				344
				345	def VerifyInstructionsWithLLVMMC(llvm_mc, f, triple):
				346	"""
				347	Extract all instructions from `f`, feed them to `llvm-mc` and make sure it's
				348	encoded them the same way as VIXL. `triple` allows us to specify either
				349	"thumbv8" or "armv8".
				350	"""
				351
				352	vixl_reference = ReadTrace(f.read())
				353	vixl_instructions, vixl_encodings = zip(*vixl_reference)
				354	instructions = [
				355	ConvertToLLVMFormat(instruction, triple)
				356	for instruction in vixl_instructions
				357	]
				358	llvm_mc_proc = subprocess.Popen(
				359	[llvm_mc, '-assemble', '-triple={}'.format(triple), '-mattr=v8,crc',
				360	# LLVM fails to recognize some instructions as valid T32 when we do not
				361	# set `-mcpu`.
				362	'-mcpu=cortex-a53', '-show-encoding'],
				363	stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
				364	out, err = llvm_mc_proc.communicate("\n".join(instructions).encode())
				365	# If `llvm-mc` printed something to stderr then stop.
				366	if err:
				367	print(err.decode())
				368	return
				369
				370	# Extract list of bytes from `llvm-mc` output. It's in the following form:
				371	#
				372	# clzeq r0, r0 @ encoding: [0x10,0x0f,0x6f,0x01]
				373	# ^^^^ ^^^^ ^^^^ ^^^^
				374	llvm_encodings = [
				375	match_object.group('encoding').replace(" ", "").split(",")
				376	for match_object in re.finditer(".@ encoding: \[(?P<encoding>.)\]",
				377	out.decode())
				378	]
				379
				380	# If LLVM has generated exactly twice as much instructions, we assume this is
				381	# due to IT instructions preceding every instruction under test. VIXL's
				382	# assembly reference files will contain a single array of 4 bytes encoding
				383	# both the IT and the following instruction. While LLVM will have decoded them
				384	# into two seperate 2 bytes arrays.
				385	if len(llvm_encodings) == 2 * len(vixl_encodings):
				386	llvm_encodings = [
				387	llvm_encodings[i * 2] + llvm_encodings[(i * 2) + 1]
				388	for i in range(0, len(vixl_encodings))
				389	]
				390
				391	# Check the encodings from LLVM are identical to VIXL's.
				392	if len(llvm_encodings) != len(vixl_encodings):
				393	print("""Error: llvm-mc generated {} instructions than there are in the
				394	generated trace.
				395	""".format("fewer" if len(llvm_encodings) < len(vixl_encodings) else "more"))
				396	else:
				397	for i in range(0, len(vixl_encodings)):
				398	if llvm_encodings[i] != vixl_encodings[i]:
				399	print("""Error: llvm-mc disagrees on the encoding of \"{instruction}\":
				400	LLVM-MC: {llvm}
				401	VIXL: {vixl}
				402	""".format(instruction=vixl_instructions[i].replace("\n", "; "),
				403	llvm=llvm_encodings[i],
				404	vixl=vixl_encodings[i]))
				405
				406
				407	if __name__ == "__main__":
				408	args = BuildOptions()
				409
				410	CheckLLVMVersion(args.llvm_mc)
				411
				412	trace_dir = 'test/a32/traces/'
				413	trace_files = [
				414	trace_file
				415	for trace_file in os.listdir(trace_dir)
				416	if trace_file.startswith("assembler-")
				417	]
				418	trace_files.sort()
				419	for trace_file in trace_files:
				420	if args.verbose:
				421	print("Verifying \"" + trace_file + "\".")
				422	with open(os.path.join(trace_dir, trace_file), "r") as f:
				423	if "t32" in trace_file:
				424	VerifyInstructionsWithLLVMMC(args.llvm_mc, f, "thumbv8")
				425	elif "a32" in trace_file:
				426	VerifyInstructionsWithLLVMMC(args.llvm_mc, f, "armv8")
				427	else:
				428	raise Exception("Failed to recognize the ISA in \"" + trace_file + "\".")