#!/usr/bin/env python3
2
3# Copyright 2016, ARM Limited
4# All rights reserved.
5#
6# Redistribution and use in source and binary forms, with or without
7# modification, are permitted provided that the following conditions are met:
8#
9# * Redistributions of source code must retain the above copyright notice,
10# this list of conditions and the following disclaimer.
11# * Redistributions in binary form must reproduce the above copyright notice,
12# this list of conditions and the following disclaimer in the documentation
13# and/or other materials provided with the distribution.
14# * Neither the name of ARM Limited nor the names of its contributors may be
15# used to endorse or promote products derived from this software without
16# specific prior written permission.
17#
18# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS CONTRIBUTORS "AS IS" AND
19# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
22# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28
29"""
30Verify generated AArch32 assembler traces against `llvm-mc`.
31
This script will find all files in `test/a32/traces/` with names starting with
`assembler`, and check them against `llvm-mc`. It checks our assembler is
correct by looking up what instruction we meant to assemble, assembling it with
`llvm-mc` and checking the result is bit identical to what our assembler
generated.
36
37You may run the script with no arguments from VIXL's top-level directory as long
38as `llvm-mc` is in your PATH. You may provide a different `llvm-mc` path with
39the `--llvm-mc` option. This script relies on version 3.8 or higher of
40LLVM. Previous versions refuse to assemble some instructions that ARMv8 allows,
41but ARMv7 did not.
42
43For example, let's say we have the following assembler trace for CLZ
44(the real trace is a lot bigger):
45
46~~~
47static const byte kInstruction_Clz_eq_r0_r0[] = {
48 0x10, 0x0f, 0x6f, 0x01 // Clz eq r0 r0
49};
50static const byte kInstruction_Clz_eq_r0_r1[] = {
51 0x11, 0x0f, 0x6f, 0x01 // Clz eq r0 r1
52};
53static const byte kInstruction_Clz_eq_r0_r2[] = {
54 0x12, 0x0f, 0x6f, 0x01 // Clz eq r0 r2
55};
56static const TestResult kReferenceClz[] = {
57 {
58 ARRAY_SIZE(kInstruction_Clz_eq_r0_r0),
59 kInstruction_Clz_eq_r0_r0,
60 },
61 {
62 ARRAY_SIZE(kInstruction_Clz_eq_r0_r1),
63 kInstruction_Clz_eq_r0_r1,
64 },
65 {
66 ARRAY_SIZE(kInstruction_Clz_eq_r0_r2),
67 kInstruction_Clz_eq_r0_r2,
68 },
69};
70~~~
71
72The traces contain both the list of bytes that were encoded as well as a comment
73with a description of the instruction this is. This script searches for these
74lines and checks them.
75
76With our example, the script will find the following:
77
78 [
79 ("Clz eq r0 r0", ["0x10", "0x0f", "0x6f", "0x01"]),
80 ("Clz eq r0 r1", ["0x11", "0x0f", "0x6f", "0x01"]),
81 ("Clz eq r0 r2", ["0x12", "0x0f", "0x6f", "0x01"])
82 ]
83
84Then the tricky part is to convert the description of the instruction into the
85following valid assembly syntax:
86
87 clzeq r0, r0
88 clzeq r0, r1
89 clzeq r0, r2
90
91Our example is easy, but it gets more complicated with load and store
92instructions for example. We can feed this as input to `llvm-mc`:
93
94 $ echo "
95 clzeq r0, r0
96 clzeq r0, r1
97 clzeq r0, r2
98 " | llvm-mc -assemble -arch=arm -mattr=v8,crc -show-encoding
99
100And we will get the following output:
101
102 .text
103 clzeq r0, r0 @ encoding: [0x10,0x0f,0x6f,0x01]
104 clzeq r0, r1 @ encoding: [0x11,0x0f,0x6f,0x01]
105 clzeq r0, r2 @ encoding: [0x12,0x0f,0x6f,0x01]
106
107The script will finally extract the encoding and compare it to what VIXL
108generated.
109"""
110
111import argparse
112import subprocess
113import os
114import re
115import itertools
116import types
117
def BuildOptions():
  """
  Parse the command line and return the resulting `argparse` namespace.

  Two options are understood: `--llvm-mc`, the `llvm-mc` executable to run
  (`llvm-mc` by default), and the `--verbose`/`-v` flag.
  """
  parser = argparse.ArgumentParser(
      formatter_class=argparse.ArgumentDefaultsHelpFormatter,
      description='Use `llvm-mc` to check the assembler traces are correct.')
  parser.add_argument('--llvm-mc', help='Path to llvm-mc', default='llvm-mc')
  parser.add_argument('--verbose', '-v', action='store_true')
  options = parser.parse_args()
  return options
125
126
def CheckLLVMVersion(llvm_mc):
  """
  Make sure the `llvm-mc` executable `llvm_mc` comes from LLVM 3.8 or newer.

  Runs `<llvm_mc> -version` and looks for the "LLVM version <major>.<minor>"
  line in its output. Raises an `Exception` if no such line can be found or if
  the reported version is older than 3.8.
  """
  output = subprocess.check_output([llvm_mc, '-version']).decode()
  # Accept multi-digit components ("10.0.1", "3.10.0") and vendor suffixes
  # ("3.9.0svn"): only the major and minor numbers matter here.
  m = re.search(r"^[ \t]*LLVM version (\d+)\.(\d+)", output, re.M)
  if m is None:
    raise Exception(
        "Could not find the LLVM version in the output of `{} -version`."
        .format(llvm_mc))
  major, minor = (int(component) for component in m.groups())
  if (major, minor) < (3, 8):
    raise Exception("This script requires LLVM version 3.8 or higher.")
133
134
def ConvertToLLVMFormat(vixl_instruction, triple):
  """
  Take a string representing an instruction and convert it to assembly syntax
  for LLVM. VIXL's test generation framework will print instruction
  representations as a space separated list. The first element is the mnemonic
  and the following elements are operands. Several instructions may be given
  at once, separated by ';'.

  `triple` is the target passed to `llvm-mc`; when it is "thumbv8", extra
  converters working around LLVM 3.8 behaviour take precedence.

  Returns the converted instructions, one per line. Raises an `Exception` if
  any of the instructions has no matching converter.
  """

  # Dictionary of patterns. The key is an identifier used in
  # `llvm_mc_instruction_converters` below. The value needs to be a capturing
  # regular expression.
  pattern_matchers = {
    "mnemonic": r"(\w+)",
    "condition":
        r"(al|eq|ne|cs|cc|mi|pl|vs|vc|hi|ls|ge|lt|gt|le)",
    "register":
        r"(r0|r1|r2|r3|r4|r5|r6|r7|r8|r9|r10|r11|r12|r13|r14|r15|pc|sp|lr)",
    "immediate": r"(0x[0-9a-f]+|[0-9]+)",
    "shift": r"(lsl|lsr|asr|ror)",
  }

  # List of converters. Each of them represents an instruction form and what to
  # convert it to. This list needs to be complete; an exception is raised if we
  # couldn't find a converter for the instruction.
  #
  # The first part of each tuple is a pattern to match. It's simply a regular
  # expression. Additionally, each identifier in curly braces is replaced by the
  # corresponding pattern from `pattern_matchers`.
  #
  # The second part of the tuple is a string that describes what the result will
  # look like. Empty curly braces are replaced by matches, in order.
  llvm_mc_instruction_converters = [
    ("it {condition}", "it {}"),
    ("{mnemonic} {condition} {register} {immediate}",
     "{}{} {}, #{}"),
    ("{mnemonic} {condition} {register} {register} {immediate}",
     "{}{} {}, {}, #{}"),
    ("{mnemonic} {condition} {register} {register}",
     "{}{} {}, {}"),
    ("{mnemonic} {condition} {register} {register} {register}",
     "{}{} {}, {}, {}"),
    ("{mnemonic} {register} {register} {register}",
     "{} {}, {}, {}"),
    ("{mnemonic} {condition} {register} {register} {register} {shift} "
     "{immediate}",
     "{}{} {}, {}, {}, {} #{}"),
    ("{mnemonic} {condition} {register} {register} {register} {shift} "
     "{register}",
     "{}{} {}, {}, {}, {} {}"),
    ("{mnemonic} {condition} {register} {register} {shift} {immediate}",
     "{}{} {}, {}, {} #{}"),
    ("{mnemonic} {condition} {register} {register} {shift} {register}",
     "{}{} {}, {}, {} {}"),
    ("{mnemonic} {condition} {register} {register} plus {immediate} offset",
     "{}{} {}, [{}, #{}]"),
    ("{mnemonic} {condition} {register} {register} minus {immediate} offset",
     "{}{} {}, [{}, #-{}]"),
    ("{mnemonic} {condition} {register} {register} plus {immediate} postindex",
     "{}{} {}, [{}], #{}"),
    ("{mnemonic} {condition} {register} {register} minus {immediate} "
     "postindex",
     "{}{} {}, [{}], #-{}"),
    ("{mnemonic} {condition} {register} {register} plus {immediate} preindex",
     "{}{} {}, [{}, #{}]!"),
    ("{mnemonic} {condition} {register} {register} minus {immediate} "
     "preindex",
     "{}{} {}, [{}, #-{}]!"),
    ("{mnemonic} {condition} {register} {register} plus {register} offset",
     "{}{} {}, [{}, {}]"),
    ("{mnemonic} {condition} {register} {register} minus {register} offset",
     "{}{} {}, [{}, -{}]"),
    ("{mnemonic} {condition} {register} {register} plus {register} postindex",
     "{}{} {}, [{}], {}"),
    ("{mnemonic} {condition} {register} {register} minus {register} "
     "postindex",
     "{}{} {}, [{}], -{}"),
    ("{mnemonic} {condition} {register} {register} plus {register} preindex",
     "{}{} {}, [{}, {}]!"),
    ("{mnemonic} {condition} {register} {register} minus {register} preindex",
     "{}{} {}, [{}, -{}]!"),
    ("{mnemonic} {condition} {register} {register} plus {register} {shift} "
     "{immediate} offset",
     "{}{} {}, [{}, {}, {} #{}]"),
    ("{mnemonic} {condition} {register} {register} minus {register} {shift} "
     "{immediate} offset",
     "{}{} {}, [{}, -{}, {} #{}]"),
    ("{mnemonic} {condition} {register} {register} plus {register} {shift} "
     "{immediate} postindex",
     "{}{} {}, [{}], {}, {} #{}"),
    ("{mnemonic} {condition} {register} {register} minus {register} {shift} "
     "{immediate} postindex",
     "{}{} {}, [{}], -{}, {} #{}"),
    ("{mnemonic} {condition} {register} {register} plus {register} {shift} "
     "{immediate} preindex",
     "{}{} {}, [{}, {}, {} #{}]!"),
    ("{mnemonic} {condition} {register} {register} minus {register} {shift} "
     "{immediate} preindex",
     "{}{} {}, [{}, -{}, {} #{}]!"),
  ]

  # Work around issues in LLVM 3.8.
  if triple == "thumbv8":
    def ConvertMovRdImm(matches):
      """
      LLVM chooses the T3 encoding for `mov <rd>, #<immediate>` when the
      immediate fits both into a modified immediate (T2 encoding) and 16
      bits (T3 encoding). Adding the `.W` modifier forces the T2 encoding to
      be used.
      """
      # The immediate is the second capture in "mov al {register} {immediate}".
      # It is either hexadecimal with a "0x" prefix or plain decimal.
      text = matches[1]
      imm = int(text, 16) if text.startswith("0x") else int(text, 10)
      if imm <= 0xffff:
        # Lowest set bit of the immediate (0 when the immediate is 0).
        lsb = imm & -imm
        if (imm >> 8) < lsb:
          return "mov.w {}, #{}".format(*matches)
      # Fall back to LLVM making the right decision.
      return "mov {}, #{}".format(*matches)
    # These converters are inserted at the front so they take precedence over
    # the generic ones.
    llvm_mc_instruction_converters[:0] = [
      # The ARM ARM specifies that if <Rn> is PC in either an ADD or SUB
      # instruction with an immediate, the assembler should use the ADR
      # encoding. LLVM does not know about this subtlety. We get around this
      # by manually translating the instruction to their ADR form.
      ("add al {register} pc {immediate}", "adr {}, #{}"),
      ("sub al {register} pc {immediate}", "adr {}, #-{}"),

      # LLVM is (rightfully) being helpful by swapping register operands so
      # that the 16 bit encoding of the following instructions is used.
      # However, VIXL does not do this. These rules specifically add the `.w`
      # modifier to force LLVM to use the 32 bit encoding if the last register
      # is identical to first one. But at the same time, we should still use
      # the narrow encoding if all registers are the same.
      (r"adcs al {register} (\1) (\1)", "adcs.n {}, {}, {}"),
      (r"adcs al {register} {register} (\1)", "adcs.w {}, {}, {}"),
      (r"orrs al {register} (\1) (\1)", "orrs.n {}, {}, {}"),
      (r"orrs al {register} {register} (\1)", "orrs.w {}, {}, {}"),
      (r"eors al {register} (\1) (\1)", "eors.n {}, {}, {}"),
      (r"eors al {register} {register} (\1)", "eors.w {}, {}, {}"),
      (r"ands al {register} (\1) (\1)", "ands.n {}, {}, {}"),
      (r"ands al {register} {register} (\1)", "ands.w {}, {}, {}"),
      # Solve the same issue as for the previous rules, however, we need to
      # take into account that ADD instructions with the stack pointer have
      # additional 16 bit forms.
      (r"add al {register} (\1) (\1)", "add.n {}, {}, {}"),
      (r"add al {register} (\1) r13", "add.w {}, {}, sp"),
      (r"add al {register} r13 (\1)", "add.n {}, sp, {}"),
      (r"add al {register} {register} (\1)", "add.w {}, {}, {}"),
      ("mov al {register} {immediate}", ConvertMovRdImm)
    ]

  # Our test generator framework uses mnemonics starting with a capital letter.
  # We need everything to be lower case for LLVM.
  vixl_instruction = vixl_instruction.lower()

  llvm_instruction = []

  # VIXL may have generated more than one instruction separated by ';'
  # (an IT instruction for example).
  for instruction in vixl_instruction.split(';'):
    # Strip out extra white spaces and ignore empty segments.
    instruction = instruction.strip()
    if not instruction:
      continue
    # Try all converters in the list; the first one that matches wins.
    for pattern, result in llvm_mc_instruction_converters:
      # Build the regular expression for this converter.
      instruction_matcher = "^" + pattern.format(**pattern_matchers) + "$"
      match = re.match(instruction_matcher, instruction)
      if match is None:
        continue
      if callable(result):
        # `result` is a function, call it to produce the instruction.
        llvm_instruction.append(result(match.groups()))
      else:
        # `result` is a string, use it as the format string.
        llvm_instruction.append(result.format(*match.groups()))
      break
    else:
      # No converter worked for this instruction. Silently dropping it would
      # make LLVM assemble something different from what VIXL generated.
      raise Exception("Unsupported instruction {}.".format(instruction))

  if not llvm_instruction:
    # The input did not contain any instruction at all.
    raise Exception("Unsupported instruction {}.".format(vixl_instruction))

  return "\n".join(llvm_instruction)
318
319
def ReadTrace(trace):
  """
  Receive the content of an assembler trace, extract the relevant information
  and return it as a list of tuples. The first part of each tuple is a string
  representing the instruction. The second part is a list of bytes representing
  the encoding.

  Only lines made of at least two comma separated `0x..` bytes followed by a
  `//` comment are considered; the text of the comment is the instruction.

  For example:

      [
        ("Clz eq r0 r0", ["0x10", "0x0f", "0x6f", "0x01"]),
        ("Clz eq r0 r1", ["0x11", "0x0f", "0x6f", "0x01"]),
        ("Clz eq r0 r2", ["0x12", "0x0f", "0x6f", "0x01"])
      ]
  """

  # Horizontal whitespace is matched with `[ \t]*` rather than a fixed number
  # of spaces so the indentation of the trace does not matter. `\s` is avoided
  # because it would let a match span several lines.
  line_matcher = re.compile(
      r"^[ \t]*(?P<encoding>(?:0x[0-9a-f]{2},[ \t]*)+0x[0-9a-f]{2})"
      r"[ \t]*//[ \t]*(?P<instruction>.*)$",
      re.M)
  byte_matcher = re.compile(r"0x[0-9a-f]{2}")
  return [
    (m.group('instruction'), byte_matcher.findall(m.group('encoding')))
    for m in line_matcher.finditer(trace)
  ]
343
344
def VerifyInstructionsWithLLVMMC(llvm_mc, f, triple):
  """
  Extract all instructions from `f`, feed them to `llvm-mc` and make sure it's
  encoded them the same way as VIXL. `triple` allows us to specify either
  "thumbv8" or "armv8".

  `llvm_mc` is the `llvm-mc` executable to run and `f` is an open trace file.
  Nothing is returned: disagreements, and anything `llvm-mc` wrote to stderr,
  are printed.
  """

  vixl_reference = ReadTrace(f.read())
  if not vixl_reference:
    # `zip(*[])` yields nothing to unpack, so bail out early instead of
    # failing with an obscure ValueError.
    print("Warning: no instructions found in the trace, nothing to verify.")
    return
  vixl_instructions, vixl_encodings = zip(*vixl_reference)
  llvm_input = "\n".join(
      ConvertToLLVMFormat(instruction, triple)
      for instruction in vixl_instructions)
  llvm_mc_proc = subprocess.Popen(
      [llvm_mc, '-assemble', '-triple={}'.format(triple), '-mattr=v8,crc',
       # LLVM fails to recognize some instructions as valid T32 when we do not
       # set `-mcpu`.
       '-mcpu=cortex-a53', '-show-encoding'],
      stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
  out, err = llvm_mc_proc.communicate(llvm_input.encode())
  # If `llvm-mc` printed something to stderr then stop.
  if err:
    print(err.decode())
    return

  # Extract list of bytes from `llvm-mc` output. It's in the following form:
  #
  # clzeq r0, r0 @ encoding: [0x10,0x0f,0x6f,0x01]
  #                           ^^^^ ^^^^ ^^^^ ^^^^
  llvm_encodings = [
    match_object.group('encoding').replace(" ", "").split(",")
    for match_object in re.finditer(r".*@ encoding: \[(?P<encoding>.*)\]",
                                    out.decode())
  ]

  # If LLVM has generated exactly twice as many instructions, we assume this is
  # due to IT instructions preceding every instruction under test. VIXL's
  # assembly reference files will contain a single array of 4 bytes encoding
  # both the IT and the following instruction. While LLVM will have encoded
  # them into two separate 2 bytes arrays.
  if len(llvm_encodings) == 2 * len(vixl_encodings):
    llvm_encodings = [
      it_bytes + instruction_bytes
      for it_bytes, instruction_bytes in zip(llvm_encodings[0::2],
                                             llvm_encodings[1::2])
    ]

  # Check the encodings from LLVM are identical to VIXL's.
  if len(llvm_encodings) != len(vixl_encodings):
    print("Error: llvm-mc generated {} instructions than there are in the\n"
          "generated trace.\n ".format(
              "fewer" if len(llvm_encodings) < len(vixl_encodings) else "more"))
    return

  for instruction, llvm, vixl in zip(vixl_instructions, llvm_encodings,
                                     vixl_encodings):
    if llvm != vixl:
      print("Error: llvm-mc disagrees on the encoding of \"{instruction}\":\n"
            " LLVM-MC: {llvm}\n"
            " VIXL: {vixl}\n ".format(
                instruction=instruction.replace("\n", "; "),
                llvm=llvm,
                vixl=vixl))
405
406
if __name__ == "__main__":
  # Script entry point: check `llvm-mc` is recent enough, then verify every
  # `assembler-*` trace of the trace directory, in sorted order.
  options = BuildOptions()

  CheckLLVMVersion(options.llvm_mc)

  trace_dir = 'test/a32/traces/'
  candidates = sorted(
      name for name in os.listdir(trace_dir) if name.startswith("assembler-"))
  for name in candidates:
    if options.verbose:
      print("Verifying \"" + name + "\".")
    with open(os.path.join(trace_dir, name), "r") as f:
      # The ISA, and so the triple given to `llvm-mc`, is deduced from the
      # name of the trace file.
      if "t32" in name:
        triple = "thumbv8"
      elif "a32" in name:
        triple = "armv8"
      else:
        raise Exception("Failed to recognize the ISA in \"" + name + "\".")
      VerifyInstructionsWithLLVMMC(options.llvm_mc, f, triple)
428 raise Exception("Failed to recognize the ISA in \"" + trace_file + "\".")