blob: d78d17c2ffa4b4482a053aef319ac284c4bc0fb1 [file] [log] [blame]
Pierre Langlois88c46b82016-06-02 18:15:32 +01001#!/usr/bin/env python3
2
Alexandre Ramesb78f1392016-07-01 14:22:22 +01003# Copyright 2016, VIXL authors
Pierre Langlois88c46b82016-06-02 18:15:32 +01004# All rights reserved.
5#
6# Redistribution and use in source and binary forms, with or without
7# modification, are permitted provided that the following conditions are met:
8#
9# * Redistributions of source code must retain the above copyright notice,
10# this list of conditions and the following disclaimer.
11# * Redistributions in binary form must reproduce the above copyright notice,
12# this list of conditions and the following disclaimer in the documentation
13# and/or other materials provided with the distribution.
14# * Neither the name of ARM Limited nor the names of its contributors may be
15# used to endorse or promote products derived from this software without
16# specific prior written permission.
17#
18# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS CONTRIBUTORS "AS IS" AND
19# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
22# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28
29"""
30Verify generated AArch32 assembler traces against `llvm-mc`.
31
This script will find all files in `test/aarch32/traces/` with names starting
with `assembler`, and check them against `llvm-mc`. It checks our assembler is
Josh Sorefb43d6ef2022-08-03 12:47:14 -040034correct by looking up what instruction we meant to assemble, assemble it with
Pierre Langlois88c46b82016-06-02 18:15:32 +010035`llvm` and check the result is bit identical to what our assembler generated.
36
37You may run the script with no arguments from VIXL's top-level directory as long
38as `llvm-mc` is in your PATH. You may provide a different `llvm-mc` path with
39the `--llvm-mc` option. This script relies on version 3.8 or higher of
40LLVM. Previous versions refuse to assemble some instructions that ARMv8 allows,
41but ARMv7 did not.
42
43For example, let's say we have the following assembler trace for CLZ
44(the real trace is a lot bigger):
45
46~~~
47static const byte kInstruction_Clz_eq_r0_r0[] = {
48 0x10, 0x0f, 0x6f, 0x01 // Clz eq r0 r0
49};
50static const byte kInstruction_Clz_eq_r0_r1[] = {
51 0x11, 0x0f, 0x6f, 0x01 // Clz eq r0 r1
52};
53static const byte kInstruction_Clz_eq_r0_r2[] = {
54 0x12, 0x0f, 0x6f, 0x01 // Clz eq r0 r2
55};
56static const TestResult kReferenceClz[] = {
57 {
58 ARRAY_SIZE(kInstruction_Clz_eq_r0_r0),
59 kInstruction_Clz_eq_r0_r0,
60 },
61 {
62 ARRAY_SIZE(kInstruction_Clz_eq_r0_r1),
63 kInstruction_Clz_eq_r0_r1,
64 },
65 {
66 ARRAY_SIZE(kInstruction_Clz_eq_r0_r2),
67 kInstruction_Clz_eq_r0_r2,
68 },
69};
70~~~
71
72The traces contain both the list of bytes that were encoded as well as a comment
73with a description of the instruction this is. This script searches for these
74lines and checks them.
75
76With our example, the script will find the following:
77
78 [
79 ("Clz eq r0 r0", ["0x10", "0x0f", "0x6f", "0x01"]),
80 ("Clz eq r0 r1", ["0x11", "0x0f", "0x6f", "0x01"]),
81 ("Clz eq r0 r2", ["0x12", "0x0f", "0x6f", "0x01"])
82 ]
83
84Then the tricky part is to convert the description of the instruction into the
85following valid assembly syntax:
86
87 clzeq r0, r0
88 clzeq r0, r1
89 clzeq r0, r2
90
91Our example is easy, but it gets more complicated with load and store
92instructions for example. We can feed this as input to `llvm-mc`:
93
94 $ echo "
95 clzeq r0, r0
96 clzeq r0, r1
97 clzeq r0, r2
98 " | llvm-mc -assemble -arch=arm -mattr=v8,crc -show-encoding
99
100And we will get the following output:
101
102 .text
103 clzeq r0, r0 @ encoding: [0x10,0x0f,0x6f,0x01]
104 clzeq r0, r1 @ encoding: [0x11,0x0f,0x6f,0x01]
105 clzeq r0, r2 @ encoding: [0x12,0x0f,0x6f,0x01]
106
107The script will finally extract the encoding and compare it to what VIXL
108generated.
109"""
110
111import argparse
112import subprocess
113import os
114import re
115import itertools
116import types
117
def BuildOptions():
  """
  Parse the command line and return the resulting `argparse` namespace.

  Two options are recognised: `--llvm-mc`, the path to the `llvm-mc` binary
  (defaults to `llvm-mc`), and the `--verbose` / `-v` flag.
  """
  parser = argparse.ArgumentParser(
      formatter_class = argparse.ArgumentDefaultsHelpFormatter,
      description = 'Use `llvm-mc` to check the assembler traces are correct.')
  parser.add_argument('--llvm-mc', help='Path to llvm-mc', default='llvm-mc')
  parser.add_argument('--verbose', '-v', action='store_true')
  options = parser.parse_args()
  return options
125
126
def CheckLLVMVersion(llvm_mc):
  """
  Make sure the given `llvm-mc` binary is recent enough for this script.

  Runs `<llvm_mc> -version` and looks for a line of the form
  `LLVM version <major>.<minor>...` in its output.

  Arguments:
    llvm_mc: path to (or name of) the `llvm-mc` executable.

  Raises:
    Exception: if the version cannot be found in the output, or if it is older
      than 3.8.
  """
  version = subprocess.check_output([llvm_mc, '-version']).decode()
  # Accept multi-digit components ("14.0.6") and trailing suffixes
  # ("10.0.0-4ubuntu1"); only the major and minor numbers matter here.
  m = re.search(r"^\s*LLVM version (\d+)\.(\d+)", version, re.M)
  if m is None:
    raise Exception(
        "Failed to find the LLVM version in the output of `{} -version`."
        .format(llvm_mc))
  major, minor = (int(component) for component in m.groups())
  if (major, minor) < (3, 8):
    raise Exception("This script requires LLVM version 3.8 or higher.")
133
134
def ConvertToLLVMFormat(vixl_instruction, triple):
  """
  Take a string representing an instruction and convert it to assembly syntax
  for LLVM.

  VIXL's test generation framework prints instruction representations as a
  space separated list. The first element is the mnemonic and the following
  elements are operands. Several instructions may be separated by ';' (an IT
  instruction followed by the instruction under test, for example).

  Arguments:
    vixl_instruction: the description found in a trace comment, for example
      "Clz eq r0 r1" or "It eq; Clz eq r0 r1".
    triple: the target triple. When it is "thumbv8", extra converters working
      around LLVM's encoding choices take precedence over the generic ones.

  Returns:
    The assembly to feed to `llvm-mc`, one instruction per line.

  Raises:
    Exception: if one of the instructions is not matched by any converter, or
      if the description does not contain any instruction at all.
  """

  def DtUntypedToLLVM(matches):
    # LLVM spells untyped data types with their size only, e.g. "untyped8"
    # becomes "8".
    mnemonic, dt_untyped, rd, rn, rm = matches
    sizes = {"untyped8": "8", "untyped16": "16", "untyped32": "32"}
    if dt_untyped not in sizes:
      raise Exception("Unsupported untyped data type {}.".format(dt_untyped))
    return "{}.{} {}, {}, {}".format(mnemonic, sizes[dt_untyped], rd, rn, rm)

  # Dictionary of patterns. The key is an identifier used in
  # `llvm_mc_instruction_converters` below. The value needs to be a capturing
  # regular expression.
  pattern_matchers = {
    # Allow an optional underscore in case this is an "and" instruction.
    "mnemonic": r"(\w+?)_?",
    "condition":
        r"(al|eq|ne|cs|cc|mi|pl|vs|vc|hi|ls|ge|lt|gt|le)",
    "register":
        r"(r0|r1|r2|r3|r4|r5|r6|r7|r8|r9|r10|r11|r12|r13|r14|r15|pc|sp|lr)",
    "immediate": r"(0x[0-9a-f]+|[0-9]+)",
    "shift": r"(lsl|lsr|asr|ror)",
    "dregister": r"(d[0-9]|d[12][0-9]|d3[01])",
    "dt": r"(s8|s16|s32|s64|u8|u16|u32|u64|f16|f32|f64|i8|i16|i32|i64|p8|p64)",
    "dt_untyped": r"(untyped8|untyped16|untyped32)"
  }

  # List of converters. Each of them represents an instruction form and what to
  # convert it to. This list needs to be complete; an exception is raised if we
  # couldn't find a converter for the instruction.
  #
  # The first part of each tuple is a pattern to match. It's simply a regular
  # expression. Additionally, each identifier in curly braces is replaced by the
  # corresponding pattern from `pattern_matchers`.
  #
  # The second part of the tuple is either a string that describes what the
  # result will look like (empty curly braces are replaced by matches, in
  # order), or a function that receives the tuple of matches and returns the
  # result.
  #
  # Converters are tried in order and the first match wins.
  llvm_mc_instruction_converters = [
    ("it {condition}", "it {}"),
    ("{mnemonic} {condition} {register} {immediate}",
     "{}{} {}, #{}"),
    ("{mnemonic} {condition} {register} {register} {immediate}",
     "{}{} {}, {}, #{}"),
    ("{mnemonic} {condition} {register} {register}",
     "{}{} {}, {}"),
    ("{mnemonic} {condition} {register} {register} {register}",
     "{}{} {}, {}, {}"),
    ("{mnemonic} {register} {register} {register}",
     "{} {}, {}, {}"),
    ("{mnemonic} {condition} {register} {register} {register} {shift} "
     "{immediate}",
     "{}{} {}, {}, {}, {} #{}"),
    ("{mnemonic} {condition} {register} {register} {register} {shift} "
     "{register}",
     "{}{} {}, {}, {}, {} {}"),
    ("{mnemonic} {condition} {register} {register} {shift} {immediate}",
     "{}{} {}, {}, {} #{}"),
    ("{mnemonic} {condition} {register} {register} {shift} {register}",
     "{}{} {}, {}, {} {}"),
    ("{mnemonic} {condition} {register} {register} plus {immediate} offset",
     "{}{} {}, [{}, #{}]"),
    ("{mnemonic} {condition} {register} {register} minus {immediate} offset",
     "{}{} {}, [{}, #-{}]"),
    ("{mnemonic} {condition} {register} {register} plus {immediate} postindex",
     "{}{} {}, [{}], #{}"),
    ("{mnemonic} {condition} {register} {register} minus {immediate} "
     "postindex",
     "{}{} {}, [{}], #-{}"),
    ("{mnemonic} {condition} {register} {register} plus {immediate} preindex",
     "{}{} {}, [{}, #{}]!"),
    ("{mnemonic} {condition} {register} {register} minus {immediate} "
     "preindex",
     "{}{} {}, [{}, #-{}]!"),
    ("{mnemonic} {condition} {register} {register} plus {register} offset",
     "{}{} {}, [{}, {}]"),
    ("{mnemonic} {condition} {register} {register} minus {register} offset",
     "{}{} {}, [{}, -{}]"),
    ("{mnemonic} {condition} {register} {register} plus {register} postindex",
     "{}{} {}, [{}], {}"),
    ("{mnemonic} {condition} {register} {register} minus {register} "
     "postindex",
     "{}{} {}, [{}], -{}"),
    ("{mnemonic} {condition} {register} {register} plus {register} preindex",
     "{}{} {}, [{}, {}]!"),
    ("{mnemonic} {condition} {register} {register} minus {register} preindex",
     "{}{} {}, [{}, -{}]!"),
    ("{mnemonic} {condition} {register} {register} plus {register} {shift} "
     "{immediate} offset",
     "{}{} {}, [{}, {}, {} #{}]"),
    ("{mnemonic} {condition} {register} {register} minus {register} {shift} "
     "{immediate} offset",
     "{}{} {}, [{}, -{}, {} #{}]"),
    ("{mnemonic} {condition} {register} {register} plus {register} {shift} "
     "{immediate} postindex",
     "{}{} {}, [{}], {}, {} #{}"),
    ("{mnemonic} {condition} {register} {register} minus {register} {shift} "
     "{immediate} postindex",
     "{}{} {}, [{}], -{}, {} #{}"),
    ("{mnemonic} {condition} {register} {register} plus {register} {shift} "
     "{immediate} preindex",
     "{}{} {}, [{}, {}, {} #{}]!"),
    ("{mnemonic} {condition} {register} {register} minus {register} {shift} "
     "{immediate} preindex",
     "{}{} {}, [{}, -{}, {} #{}]!"),
    ("{mnemonic} {dt} {dregister} {dregister} {dregister}",
     "{}.{} {}, {}, {}"),
    ("{mnemonic} {dt_untyped} {dregister} {dregister} {dregister}",
     DtUntypedToLLVM)
  ]

  # Work around issues in LLVM 3.8.
  if triple == "thumbv8":
    def ConvertMovRdImm(matches):
      """
      LLVM chooses the T3 encoding for `mov <rd>, #<immediate>` when the
      immediate fits both into a modified immediate (T2 encoding) and 16
      bits (T3 encoding). Adding the `.W` modifier forces the T2 encoding to
      be used.
      """
      # The captures of "mov al {register} {immediate}" are the destination
      # register and the immediate, in that order.
      rd, immediate = matches
      # The immediate pattern accepts both "0x..." and plain decimal numbers;
      # parse each with the right base.
      if immediate.startswith("0x"):
        imm = int(immediate, 16)
      else:
        imm = int(immediate, 10)
      if imm <= 0xffff:
        # Isolate the lowest set bit, then check that every set bit of the
        # immediate lies within the 8 bits starting from it.
        lsb = imm & -imm
        if (imm >> 8) < lsb:
          return "mov.w {}, #{}".format(rd, immediate)
      # Fall back to LLVM making the right decision.
      return "mov {}, #{}".format(rd, immediate)
    # These converters are inserted at the front so that they take precedence
    # over the generic ones.
    llvm_mc_instruction_converters[:0] = [
      # The ARM ARM specifies that if <Rn> is PC in either an ADD or SUB
      # instruction with an immediate, the assembler should use the ADR
      # encoding. LLVM does not know about this subtlety. We get around this
      # by manually translating the instruction to their ADR form.
      ("add al {register} pc {immediate}", "adr {}, #{}"),
      ("sub al {register} pc {immediate}", "adr {}, #-{}"),

      # LLVM is (rightfully) being helpful by swapping register operands so
      # that the 16 bit encoding of the following instructions is used.
      # However, VIXL does not do this. These rules specifically add the `.w`
      # modifier to force LLVM to use the 32 bit encoding if the last register
      # is identical to first one. But at the same time, we should still use
      # the narrow encoding if all registers are the same.
      (r"adcs al {register} (\1) (\1)", "adcs.n {}, {}, {}"),
      (r"adcs al {register} {register} (\1)", "adcs.w {}, {}, {}"),
      (r"orrs al {register} (\1) (\1)", "orrs.n {}, {}, {}"),
      (r"orrs al {register} {register} (\1)", "orrs.w {}, {}, {}"),
      (r"eors al {register} (\1) (\1)", "eors.n {}, {}, {}"),
      (r"eors al {register} {register} (\1)", "eors.w {}, {}, {}"),
      (r"ands al {register} (\1) (\1)", "ands.n {}, {}, {}"),
      (r"ands al {register} {register} (\1)", "ands.w {}, {}, {}"),
      # Solve the same issue as for the previous rules, however, we need to
      # take into account that ADD instructions with the stack pointer have
      # additional 16 bit forms.
      (r"add al {register} (\1) (\1)", "add.n {}, {}, {}"),
      (r"add al {register} (\1) r13", "add.w {}, {}, sp"),
      (r"add al {register} r13 (\1)", "add.n {}, sp, {}"),
      (r"add al {register} {register} (\1)", "add.w {}, {}, {}"),
      ("mov al {register} {immediate}", ConvertMovRdImm)
    ]

  # Our test generator framework uses mnemonics starting with a capital
  # letter. We need everything to be lower case for LLVM.
  vixl_instruction = vixl_instruction.lower()

  llvm_instruction = []

  # VIXL may have generated more than one instruction separated by ';'
  # (an IT instruction for example).
  for instruction in vixl_instruction.split(';'):
    # Strip out extra white spaces and ignore empty fragments (a trailing ';'
    # for example).
    instruction = instruction.strip()
    if not instruction:
      continue
    # Try all converters in the list.
    for pattern, result in llvm_mc_instruction_converters:
      # Build the regular expression for this converter.
      instruction_matcher = "^" + pattern.format(**pattern_matchers) + "$"
      match = re.match(instruction_matcher, instruction)
      if not match:
        continue
      # We have a match; `match.groups()` is the tuple of captured substrings.
      if callable(result):
        # `result` is a function, call it to produce the instruction.
        llvm_instruction.append(result(match.groups()))
      else:
        # `result` is a string, use it as the format string.
        llvm_instruction.append(result.format(*match.groups()))
      break
    else:
      # No converter worked for this instruction. Raise instead of dropping it
      # silently, even if other instructions in the description converted.
      raise Exception("Unsupported instruction {}.".format(instruction))

  if not llvm_instruction:
    # The description did not contain any instruction at all.
    raise Exception("Unsupported instruction {}.".format(vixl_instruction))

  return "\n".join(llvm_instruction)
338
339
def ReadTrace(trace):
  """
  Receive the content of an assembler trace, extract the relevant information
  and return it as a list of tuples. The first part of each tuple is a string
  representing the instruction. The second part is a list of bytes representing
  the encoding.

  Only lines made of an optional indentation, at least two comma separated
  `0x..` bytes and a ` // ` comment are considered; everything else is ignored.

  For example:

      [
        ("Clz eq r0 r0", ["0x10", "0x0f", "0x6f", "0x01"]),
        ("Clz eq r0 r1", ["0x11", "0x0f", "0x6f", "0x01"]),
        ("Clz eq r0 r2", ["0x12", "0x0f", "0x6f", "0x01"])
      ]

  An empty list is returned if the trace does not contain any such line.
  """

  # The indentation is matched loosely so the exact number of leading spaces
  # or tabs does not matter. The inner group is non-capturing: only the named
  # groups are used.
  pattern = re.compile(
      r"^[ \t]*(?P<encoding>(?:0x[0-9a-f]{2}, )+0x[0-9a-f]{2})"
      r" // (?P<instruction>.*)$",
      re.M)
  return [
    (m.group('instruction'), m.group('encoding').replace(" ", "").split(","))
    for m in pattern.finditer(trace)
  ]
363
364
def VerifyInstructionsWithLLVMMC(llvm_mc, f, triple):
  """
  Extract all instructions from `f`, feed them to `llvm-mc` and make sure it's
  encoded them the same way as VIXL.

  Arguments:
    llvm_mc: path to (or name of) the `llvm-mc` executable.
    f: readable text file object holding an assembler trace.
    triple: allows us to specify either "thumbv8" or "armv8".

  Nothing is returned; every problem found is reported with `print`. If
  `llvm-mc` writes anything to stderr, that output is printed and the
  verification of this trace stops.

  Raises:
    Exception: propagated from `ConvertToLLVMFormat` when an instruction of the
      trace is not supported.
  """

  vixl_reference = ReadTrace(f.read())
  if not vixl_reference:
    # `zip(*[])` would leave nothing to unpack below and fail with an
    # unhelpful `ValueError`; report the empty trace instead.
    print("Warning: no instructions found in the trace, nothing to verify.")
    return
  vixl_instructions, vixl_encodings = zip(*vixl_reference)
  instructions = [
    ConvertToLLVMFormat(instruction, triple)
    for instruction in vixl_instructions
  ]
  llvm_mc_proc = subprocess.Popen(
      [llvm_mc, '-assemble', '-triple={}'.format(triple), '-mattr=v8,crc',
       # LLVM fails to recognize some instructions as valid T32 when we do not
       # set `-mcpu`.
       '-mcpu=cortex-a53', '-show-encoding'],
      stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
  out, err = llvm_mc_proc.communicate("\n".join(instructions).encode())
  # If `llvm-mc` printed something to stderr then stop.
  if err:
    print(err.decode())
    return

  # Extract list of bytes from `llvm-mc` output. It's in the following form:
  #
  # clzeq r0, r0 @ encoding: [0x10,0x0f,0x6f,0x01]
  #                           ^^^^ ^^^^ ^^^^ ^^^^
  llvm_encodings = [
    match_object.group('encoding').replace(" ", "").split(",")
    for match_object in re.finditer(r".*@ encoding: \[(?P<encoding>.*)\]",
                                    out.decode())
  ]

  # If LLVM has generated exactly twice as many instructions, we assume this is
  # due to IT instructions preceding every instruction under test. VIXL's
  # assembly reference files will contain a single array of 4 bytes encoding
  # both the IT and the following instruction. While LLVM will have decoded them
  # into two separate 2 bytes arrays.
  if len(llvm_encodings) == 2 * len(vixl_encodings):
    llvm_encodings = [
      it_bytes + instruction_bytes
      for it_bytes, instruction_bytes in zip(llvm_encodings[0::2],
                                             llvm_encodings[1::2])
    ]

  # Check the encodings from LLVM are identical to VIXL's.
  if len(llvm_encodings) != len(vixl_encodings):
    print("""Error: llvm-mc generated {} instructions than there are in the
generated trace.
 """.format("fewer" if len(llvm_encodings) < len(vixl_encodings) else "more"))
    return

  for instruction, llvm, vixl in zip(vixl_instructions, llvm_encodings,
                                     vixl_encodings):
    if llvm != vixl:
      print("""Error: llvm-mc disagrees on the encoding of \"{instruction}\":
 LLVM-MC: {llvm}
 VIXL: {vixl}
 """.format(instruction=instruction.replace("\n", "; "),
            llvm=llvm,
            vixl=vixl))
424 vixl=vixl_encodings[i]))
425
426
if __name__ == "__main__":
  # Script entry point: verify every assembler trace found in the trace
  # directory, in sorted order.
  args = BuildOptions()

  CheckLLVMVersion(args.llvm_mc)

  trace_dir = 'test/aarch32/traces/'
  trace_files = sorted(
      name for name in os.listdir(trace_dir) if name.startswith("assembler-"))
  for trace_file in trace_files:
    if args.verbose:
      print("Verifying \"" + trace_file + "\".")
    with open(os.path.join(trace_dir, trace_file), "r") as f:
      # The file name tells which instruction set the trace was generated for.
      if "t32" in trace_file:
        triple = "thumbv8"
      elif "a32" in trace_file:
        triple = "armv8"
      else:
        raise Exception("Failed to recognize the ISA in \"" + trace_file + "\".")
      VerifyInstructionsWithLLVMMC(args.llvm_mc, f, triple)