Chromium Code Reviews
| Index: src/ia32/macro-assembler-ia32.cc |
| =================================================================== |
| --- src/ia32/macro-assembler-ia32.cc (revision 6301) |
| +++ src/ia32/macro-assembler-ia32.cc (working copy) |
| @@ -877,55 +877,51 @@ |
| Immediate(Factory::cons_ascii_string_map())); |
| } |
| -// All registers must be distinct. Only current_string needs valid contents |
| -// on entry. All registers may be invalid on exit. result_operand is |
| -// unchanged, padding_chars is updated correctly. |
| -void MacroAssembler::AppendStringToTopOfNewSpace( |
| - Register current_string, // Tagged pointer to string to copy. |
| - Register current_string_length, |
| - Register result_pos, |
| - Register scratch, |
| - Register new_padding_chars, |
| - Operand operand_result, |
| - Operand operand_padding_chars, |
| - Label* bailout) { |
| - mov(current_string_length, |
| - FieldOperand(current_string, String::kLengthOffset)); |
| - shr(current_string_length, 1); |
| - sub(current_string_length, operand_padding_chars); |
| - mov(new_padding_chars, current_string_length); |
| - add(Operand(current_string_length), Immediate(kObjectAlignmentMask)); |
| - and_(Operand(current_string_length), Immediate(~kObjectAlignmentMask)); |
| - sub(new_padding_chars, Operand(current_string_length)); |
| - neg(new_padding_chars); |
| - // We need an allocation even if current_string_length is 0, to fetch |
| - // result_pos. Consider using a faster fetch of result_pos in that case. |
| - AllocateInNewSpace(current_string_length, result_pos, scratch, no_reg, |
| - bailout, NO_ALLOCATION_FLAGS); |
| - sub(result_pos, operand_padding_chars); |
| - mov(operand_padding_chars, new_padding_chars); |
| - Register scratch_2 = new_padding_chars; // Used to compute total length. |
| - // Copy string to the end of result. |
| - mov(current_string_length, |
| - FieldOperand(current_string, String::kLengthOffset)); |
| - mov(scratch, operand_result); |
| - mov(scratch_2, current_string_length); |
| - add(scratch_2, FieldOperand(scratch, String::kLengthOffset)); |
| - mov(FieldOperand(scratch, String::kLengthOffset), scratch_2); |
| - shr(current_string_length, 1); |
| - lea(current_string, |
| - FieldOperand(current_string, SeqAsciiString::kHeaderSize)); |
| - // Loop condition: while (--current_string_length >= 0). |
| - Label copy_loop; |
| - Label copy_loop_entry; |
| - jmp(&copy_loop_entry); |
| - bind(&copy_loop); |
| - mov_b(scratch, Operand(current_string, current_string_length, times_1, 0)); |
| - mov_b(Operand(result_pos, current_string_length, times_1, 0), scratch); |
| - bind(&copy_loop_entry); |
| - sub(Operand(current_string_length), Immediate(1)); |
| - j(greater_equal, &copy_loop); |
| +// Copy memory, byte-by-byte, from source to destination. Not optimized for |
| +// long or aligned copies. The contents of scratch and length are destroyed. |
| +// Source and destination are incremented by length. |
| +// Many variants of movsb, loop unrolling, word moves, and indexed operands |
| +// have been tried here already, and this is fastest. |
| +// A simpler loop is faster on small copies, but 30% slower on large ones. |
| +// The cld() instruction must have been emitted, to clear the direction flag, |
| +// before calling this function. |
| +void MacroAssembler::CopyBytes(Register source, |
| + Register destination, |
| + Register length, |
| + Register scratch) { |
| + Label loop, done, short_string, short_loop; |
| + // Experimentation shows that the short string loop is faster if length < 10. |
| + cmp(Operand(length), Immediate(10)); |
| + j(less_equal, &short_string); |
| + |
| + ASSERT(source.is(esi)); |
| + ASSERT(destination.is(edi)); |
| + ASSERT(length.is(ecx)); |
| + |
| + // Because destination is 4-byte aligned, we keep it aligned for movs. |
|
Lasse Reichstein
2011/01/14 10:47:37
How do we know that destination is 4-byte aligned?
William Hesse
2011/01/14 10:54:44
In our uses, source is 4-byte aligned.
Changed comment.
|
| + mov(scratch, Operand(source, length, times_1, -4)); |
| + mov(Operand(destination, length, times_1, -4), scratch); |
| + mov(scratch, ecx); |
| + shr(ecx, 2); |
|
Lasse Reichstein
2011/01/14 10:47:37
If length was divisible by four, you will copy the last four bytes twice.
William Hesse
2011/01/14 10:54:44
Long rep.movs averages much less than a cycle per byte.
|
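For context on this exchange: the long path copies the trailing four bytes first and then rep-moves length/4 dwords from the front. A minimal C++ model of that scheme (hypothetical names, assuming non-overlapping buffers, and not part of the patch) shows why a length divisible by four only rewrites the last dword with identical data:

  #include <cstddef>
  #include <cstdint>
  #include <cstring>

  // Hypothetical model of the emitted long-copy path (length > 10).
  void LongCopyModel(const uint8_t* src, uint8_t* dst, size_t length) {
    std::memcpy(dst + length - 4, src + length - 4, 4);  // trailing dword first
    std::memcpy(dst, src, (length / 4) * 4);             // the rep_movs portion
    // If length % 4 == 0, the second copy rewrites the last dword with the
    // same bytes; otherwise the 1-3 tail bytes come only from the first copy.
    // Either way, every byte of dst[0 .. length) gets written correctly.
  }
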
| + rep_movs(); |
| + and_(Operand(scratch), Immediate(0x3)); |
| + add(destination, Operand(scratch)); |
| + jmp(&done); |
| + |
| + bind(&short_string); |
| + test(length, Operand(length)); |
| + j(zero, &done); |
| + |
| + bind(&short_loop); |
| + mov_b(scratch, Operand(source, 0)); |
| + mov_b(Operand(destination, 0), scratch); |
| + inc(source); |
| + inc(destination); |
| + dec(length); |
|
Lasse Reichstein
2011/01/14 10:47:37
This won't be faster if you do:
add(source, length), ...
William Hesse
2011/01/14 10:54:44
I tried that - it was slower.
|
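The alternative being proposed above is cut off, so the following is only a guess at its shape, sketched in C++ rather than assembler: advance both pointers once up front and drive the copy with a single negative index, instead of bumping two pointers and a counter on every iteration. Per the reply, a variant along these lines was measured and turned out slower here.

  #include <cstddef>
  #include <cstdint>

  // The patch's short loop, modelled in C++: two pointer bumps plus a
  // counter decrement per byte copied.
  void ShortLoopAsWritten(const uint8_t* src, uint8_t* dst, size_t n) {
    while (n != 0) {
      *dst++ = *src++;
      --n;
    }
  }

  // One plausible reading of the truncated suggestion: index backwards from
  // the advanced pointers so only one register changes per iteration.
  void ShortLoopIndexed(const uint8_t* src, uint8_t* dst, ptrdiff_t n) {
    src += n;
    dst += n;
    for (ptrdiff_t i = -n; i != 0; ++i) dst[i] = src[i];
  }
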
| + j(not_zero, &short_loop); |
| + |
| + bind(&done); |
| } |
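Putting the two paths together, here is a small, self-contained C++ sketch of the routine's observable effect on the destination buffer, with a quick check over a few lengths. It is a model of the generated code, not V8 source; the 10-byte threshold comes from the diff above, everything else is illustrative.

  #include <cassert>
  #include <cstddef>
  #include <cstdint>
  #include <cstring>

  // Illustrative model of what the emitted CopyBytes leaves in dst.
  void CopyBytesModel(const uint8_t* src, uint8_t* dst, size_t length) {
    if (length <= 10) {
      for (size_t i = 0; i < length; ++i) dst[i] = src[i];  // short byte loop
      return;
    }
    std::memcpy(dst + length - 4, src + length - 4, 4);  // trailing dword
    std::memcpy(dst, src, (length / 4) * 4);             // rep movsd portion
  }

  int main() {
    uint8_t src[64], dst[64];
    for (size_t i = 0; i < sizeof(src); ++i) src[i] = static_cast<uint8_t>(i);
    const size_t lengths[] = {0, 3, 10, 11, 16, 33, 64};
    for (size_t n : lengths) {
      std::memset(dst, 0xAA, sizeof(dst));
      CopyBytesModel(src, dst, n);
      assert(std::memcmp(src, dst, n) == 0);
    }
    return 0;
  }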