Fixing task system off-by-one that was preventing proper distribution. (#6482)

The next tile range was reserved at the top of the loop, before the
current range had been processed. This caused a single shard to execute
more tiles than it should in a single iteration, leading to unbalanced
distribution when the tile count was low.
Fixes #5568.
diff --git a/iree/task/task.c b/iree/task/task.c
index 160b6c1..51fc703 100644
--- a/iree/task/task.c
+++ b/iree/task/task.c
@@ -824,10 +824,6 @@
                                                    tiles_per_reservation,
                                                    iree_memory_order_relaxed);
   while (tile_base < tile_count) {
-    const uint32_t next_tile_base = iree_atomic_fetch_add_int32(
-        &shared_state->tile_index, tiles_per_reservation,
-        iree_memory_order_relaxed);
-
     const uint32_t tile_range =
         iree_min(tile_base + tiles_per_reservation, tile_count);
     for (uint32_t tile_index = tile_base; tile_index < tile_range;
@@ -865,7 +861,9 @@
       }
     }
 
-    tile_base = next_tile_base;
+    tile_base = iree_atomic_fetch_add_int32(&shared_state->tile_index,
+                                            tiles_per_reservation,
+                                            iree_memory_order_relaxed);
   }
 
   // Push aggregate statistics up to the dispatch.