If you use a compiled language, you should periodically look at Godbolt and see what your code is doing and what changes to your code will do in the compiled output.

In this case a positively insane way of calculating squares and cubes generates 311 lines of ARM assembly output that will swallow your memory. With even something as simple as -O1 on the command line, the square and the cube are reduced to one and two multiplications respectively. With -fwhole-program the compiler removes the functions entirely and inlines them into the loop in main().

Know your tools. It makes huge differences!

  • ttmrichter@lemmy.worldOP
    link
    fedilink
    arrow-up
    1
    arrow-down
    1
    ·
    6 months ago

    I mean it could hurt:

    @ cube -- unoptimized compiler output that returns n * n * n.
    @
    @ In:   r0 = the argument (called n in the comments below)
    @ Out:  r0 = low 32 bits of n * n * n
    @ Saves and restores r4-r10 and fp. r7 is the frame pointer for a 112-byte
    @ frame; ip remembers sp across the dynamic stack allocation.
    @
    @ Most of the body is dead work. Five 64-bit products (n*n three times,
    @ n*n*n twice) are assembled from 32-bit pieces, and four 64-bit "<< 3"
    @ results are computed and then overwritten before anything reads them.
    @ None of those values reach the return value or the allocation size.
    @ Only the two 32-bit multiply pairs near the end matter: the first sizes
    @ a stack allocation, the second is the value returned.
    @ NOTE(review): this looks like -O0 bookkeeping for a three-dimensional
    @ variable-length array; the C source is not shown here -- confirm.
    cube:
            @ Prologue: save r4-r10 and fp, reserve the 112-byte frame, r7 = frame base.
            push    {r4, r5, r6, r7, r8, r9, r10, fp}
            sub     sp, sp, #112
            add     r7, sp, #0
            @ Spill the argument to the frame.
            str     r0, [r7, #92]
            @ ip = sp before the dynamic allocation; copied back into sp near the end.
            mov     r3, sp
            mov     ip, r3
            @ Reload n three times: r1, r0 and r6 all hold n and are not modified
            @ again until r0 receives the result.
            ldr     r1, [r7, #92]
            ldr     r0, [r7, #92]
            ldr     r6, [r7, #92]
            @ [r7, #108] = n - 1 (stored here, never read back in this function).
            subs    r3, r1, #1
            str     r3, [r7, #108]
            @ r5:r4 = n zero-extended to 64 bits.
            mov     r2, r1
            movs    r3, #0
            mov     r4, r2
            mov     r5, r3
            @ r3:r2 = (r5:r4) << 3 as a 64-bit shift. Dead: r3 and r2 are both
            @ overwritten by the next few instructions before being read.
            mov     r2, #0
            mov     r3, #0
            lsls    r3, r5, #3
            orr     r3, r3, r4, lsr #29
            lsls    r2, r4, #3
            @ [r7, #104] = n - 1 (second copy, also never read back).
            subs    r3, r0, #1
            str     r3, [r7, #104]
            @ Spill two zero-extended 64-bit copies of n: a at [r7, #80], b at [r7, #64].
            mov     r2, r1
            movs    r3, #0
            str     r2, [r7, #80]
            str     r3, [r7, #84]
            mov     r2, r0
            movs    r3, #0
            str     r2, [r7, #64]
            str     r3, [r7, #68]
            @ 64-bit product a * b built from 32-bit pieces:
            @   r3    = b.lo * a.hi + a.lo * b.hi   (cross terms)
            @   r9:r8 = a.lo * b.lo, then r9 += r3
            @ Result: r9:r8 = n * n.
            ldrd    r4, [r7, #80]
            mov     r3, r5
            ldr     r2, [r7, #64]
            mul     r2, r2, r3
            ldr     r3, [r7, #68]
            strd    r4, [r7, #80]
            ldr     r4, [r7, #80]
            mul     r3, r4, r3
            add     r3, r3, r2
            ldr     r2, [r7, #80]
            ldr     r4, [r7, #64]
            umull   r8, r9, r2, r4
            add     r3, r3, r9
            mov     r9, r3
            @ r3:r2 = (r9:r8) << 3. Dead: both registers are overwritten below
            @ before being read, and r9:r8 is reloaded before its next use.
            mov     r2, #0
            mov     r3, #0
            lsl     r3, r9, #3
            orr     r3, r3, r8, lsr #29
            lsl     r2, r8, #3
            @ [r7, #100] = n - 1 (third copy, also never read back).
            subs    r3, r6, #1
            str     r3, [r7, #100]
            @ Spill two more zero-extended copies of n: a at [r7, #32], b at [r7, #72].
            mov     r2, r1
            movs    r3, #0
            str     r2, [r7, #32]
            str     r3, [r7, #36]
            mov     r2, r0
            movs    r3, #0
            str     r2, [r7, #72]
            str     r3, [r7, #76]
            @ Same 64-bit multiply pattern again; result fp:r10 = n * n.
            ldrd    r4, [r7, #32]
            mov     r3, r5
            ldrd    r8, [r7, #72]
            mov     r2, r8
            mul     r2, r2, r3
            strd    r8, [r7, #72]
            ldr     r3, [r7, #76]
            mov     r8, r4
            mov     r9, r5
            mov     r4, r8
            mul     r3, r4, r3
            add     r3, r3, r2
            mov     r2, r8
            ldr     r4, [r7, #72]
            umull   r10, fp, r2, r4
            add     r3, r3, fp
            mov     fp, r3
            @ Spill a third zero-extended copy of n: c at [r7, #24].
            mov     r2, r6
            movs    r3, #0
            str     r2, [r7, #24]
            str     r3, [r7, #28]
            @ 64-bit product (fp:r10) * c, i.e. n * n * n truncated to 64 bits.
            @ Low word goes to [r7, #56], high word (plus cross terms) to [r7, #60].
            ldrd    r4, [r7, #24]
            mov     r3, r4
            mul     r2, r3, fp
            mov     r3, r5
            mul     r3, r10, r3
            add     r3, r3, r2
            mov     r2, r4
            umull   r4, r2, r10, r2
            str     r2, [r7, #60]
            mov     r2, r4
            str     r2, [r7, #56]
            ldr     r2, [r7, #60]
            add     r3, r3, r2
            str     r3, [r7, #60]
            @ Reload that product and compute r3:r2 = product << 3. Dead: r2 and
            @ r3 are overwritten immediately afterwards.
            mov     r2, #0
            mov     r3, #0
            ldrd    r8, [r7, #56]
            mov     r4, r9
            lsls    r3, r4, #3
            mov     r4, r8
            orr     r3, r3, r4, lsr #29
            mov     r4, r8
            lsls    r2, r4, #3
            @ Second round of the same work. Spill a at [r7, #16] and b at [r7, #8].
            mov     r2, r1
            movs    r3, #0
            str     r2, [r7, #16]
            str     r3, [r7, #20]
            mov     r2, r0
            movs    r3, #0
            str     r2, [r7, #8]
            str     r3, [r7, #12]
            @ 64-bit a * b = n * n, stored at [r7, #48] (low) and [r7, #52] (high).
            ldrd    r8, [r7, #16]
            mov     r3, r9
            ldrd    r10, [r7, #8]
            mov     r2, r10
            mul     r2, r2, r3
            mov     r3, fp
            mov     r4, r8
            mul     r3, r4, r3
            add     r3, r3, r2
            mov     r2, r8
            mov     r4, r10
            umull   r4, r2, r2, r4
            str     r2, [r7, #52]
            mov     r2, r4
            str     r2, [r7, #48]
            ldr     r2, [r7, #52]
            add     r3, r3, r2
            str     r3, [r7, #52]
            @ Spill c at [r7] (offset 0).
            mov     r2, r6
            movs    r3, #0
            str     r2, [r7]
            str     r3, [r7, #4]
            @ 64-bit (n * n) * c = n * n * n, stored at [r7, #40] (low) and
            @ [r7, #44] (high).
            ldrd    r8, [r7, #48]
            mov     r3, r9
            ldrd    r10, [r7]
            mov     r2, r10
            mul     r2, r2, r3
            mov     r3, fp
            mov     r4, r8
            mul     r3, r4, r3
            add     r3, r3, r2
            mov     r2, r8
            mov     r4, r10
            umull   r4, r2, r2, r4
            str     r2, [r7, #44]
            mov     r2, r4
            str     r2, [r7, #40]
            ldr     r2, [r7, #44]
            add     r3, r3, r2
            str     r3, [r7, #44]
            @ Reload that product and compute r3:r2 = product << 3. Dead again:
            @ r3 and r2 are overwritten by the next two instructions.
            mov     r2, #0
            mov     r3, #0
            ldrd    r8, [r7, #40]
            mov     r4, r9
            lsls    r3, r4, #3
            mov     r4, r8
            orr     r3, r3, r4, lsr #29
            mov     r4, r8
            lsls    r2, r4, #3
            @ r3 = n * n * n using plain 32-bit multiplies.
            mov     r3, r1
            mov     r2, r0
            mul     r3, r2, r3
            mov     r2, r6
            mul     r3, r2, r3
            @ Round r3 up to a multiple of 8 and move sp down by that many bytes.
            adds    r3, r3, #7
            lsrs    r3, r3, #3
            lsls    r3, r3, #3
            sub     sp, sp, r3
            @ Record the new sp at [r7, #96]; nothing in this function reads it
            @ back or touches the memory just reserved.
            mov     r3, sp
            str     r3, [r7, #96]
            @ Recompute r3 = n * n * n (32-bit); this is the value returned.
            mov     r3, r1
            mov     r2, r0
            mul     r3, r2, r3
            mov     r2, r6
            mul     r3, r2, r3
            @ Undo the dynamic allocation by restoring the sp saved in ip.
            mov     sp, ip
            @ Return value.
            mov     r0, r3
            @ Epilogue: drop the 112-byte frame, restore saved registers, return.
            adds    r7, r7, #112
            mov     sp, r7
            pop     {r4, r5, r6, r7, r8, r9, r10, fp}
            bx      lr